都是按照包的manual进行学习(链接都采用谷歌打开,或者直接搜索IRanges包)
老版本安装方法
# source("http://www.bioconductor.org/biocLite.R")
# biocLite(c("IRanges"))
# 新版本安装方法
if (!require("BiocManager"))
install.packages("BiocManager")
BiocManager::install("IRanges")
library(IRanges)
# 使用IRanges包产生以start为开始坐标,宽度为width的区间
# width区域长度为 end - start + 1
ir1 <- IRanges(start = 1:10, width = 10:1)
# 以start为开始, 终止位置都为11的不同宽同终止位置的区间
ir2 <- IRanges(start = 1:10, end = 11)
# 固定终止位置,可变宽度
ir3 <- IRanges(end = 11, width = 10:1)
# identical: Check whether x and y are identical objects.
identical(ir1, ir2) && identical(ir1, ir3)
ir <- IRanges(c(1, 8, 14, 15, 19, 34, 40),
width=c(12, 6, 6, 15, 6, 2, 7))
## 以上都是使用不同的`start`, `end`, `width` 组合方式创建类似的IRanges对象
## 可以通过`start`, `end`, `width`三个函数提取对应的数据
# 提取起始坐标
start(ir)
# 提取终止坐标
end(ir)
#提取坐标宽度
width(ir)
# 通过数值和逻辑值可以提取IRanges的子集
ir[1:4] # 提取1至4行
ir[start(ir) <= 15] # 提取起始位置小于等于15的行
# 可视化区域
plotRanges = function(x, xlim = x, main = deparse(substitute(x)), col = "black", sep=0.5){
height = 1
if (is(xlim, "Ranges"))
xlim = c(min(start(xlim)), max(end(xlim)))
bins = disjointBins(IRanges(start(x), end(x)+1))
plot.new()
plot.window(xlim, c(0, max(bins) * (height + sep)))
ybottom = bins * (sep + height) - height
rect(start(x) - 0.5, ybottom, end(x) + 0.5, ybottom + height, col = col)
title(main)
axis(1)
invisible(ybottom)
}
plotRanges(ir, col = "blue")
# 插播 逐行解释
#############################################################################################
height <- 1
sep = 0.5
main = deparse(substitute(ir))
xlim <- c(min(start(ir)), max(end(ir))) ## 得到ir的整个区域起始的最小值和最大值
bins <- disjointBins(IRanges(start(ir), end(ir) + 1))
# ?disjointBins
# disjointBins segregates x into a set of bins so that the ranges in each bin are disjoint. Lower-indexed bins are filled first.
# The method returns an integer vector indicating the bin index for each range.
# 我的理解应该是在所有的区间中,首先从不相交的区间开始排列建立index为1,然后考虑剩下的bin不相交的建立index为2,以此类推。
# https://stackoverflow.com/questions/25130462/get-disjoint-sets-from-a-list-in-r
plot.new()
plot.window(xlim, c(0, max(bins)*(height + sep)))
# xlim给定x轴的范围, 根据bin中index的最大数目,来查看图片所需要叠加的行,即画多少行,然后为了不显的拥挤,将y轴扩大1.5倍
ybottom <- bins * (sep + height) - height
# 指定rect画图时候y轴方向下面那条现的起始位置
rect(start(ir)-0.5, ybottom, end(ir)+0.5, ybottom + height, col = "black")
# rect(xleft = 1, ybottom = 1, xright = 5, ytop = 5) 对应的值为此代码中的字面意思,即x左,y下,x右, y上,四个值构成一个矩形。
title(deparse(substitute(ir))) # 添加图标标题
axis(1)
# axis为基础的绘图函数,里面参数有side = 1,2,3,4分别代表刻度尺为放在最下面,最左边,最上面,最右边,
# axis还有一个参数at表示指定轴的标签如 at = c(0, 2, 4, 6))表示轴的标签为0,2,4,6
# 相对应的还有一个label参数,对应at里面的刻度,可以自定义为你想要展示的字符,如:
# axis(side = 1, at = c(0, 2, 4, 6), labels = paste(c(0, 2, 4, 6), "A", sep = ""))
# 更多的需要另外学习基础画图函数
#############################################################################################
## 2.1 Normality
## reduce函数
reduce(ir) # 合并有交集的区域
plotRanges(reduce(ir))
## 2.2 Lists of IRanges objects
## IRangesList
## 将多个IRanges文件合并为list形式。
rl <- IRangesList(ir, rev(ir))
# rev函数表示反向排列输出
start(rl) ## 结果将会输出两个IRanges的起始位置
# 2.3 Vector Extraction
set.seed(0) # 保证随机数据别人可重复
lambda <- c(rep(0.001, 4500), seq(0.001, 10, length=500),
seq(10, 0.001, length=500))
## seq函数表示随机生成数据,seq(0.001, 10, length=500) 表示0.001~10随机生成500个数值
xVector <- rpois(1e7, lambda)
## 表示Poisson分布生成λ为lambda的1e7个数据
# 泊松分布的参数λ是单位时间(或单位面积)内随机事件的平均发生率。 泊松分布适合于描述单位时间内随机事件发生的次数。
yVector <- rpois(1e7, lambda[c(251:length(lambda), 1:250)])
xRle <- Rle(xVector) # 可以明显看到将数据从38.0Mb变为了11.5Mb
# BioC的IRanges包为这些数据提供了一种简便可行的信息压缩方式,即Rle(Run Length Encoding)
yRle <- Rle(yVector)
irextract <- IRanges(start=c(4501, 4901) , width=100)
xRle[irextract]
## 2.4 Finding Overlapping Ranges
## 区域里面查找两个IRanges的交集
ol <- findOverlaps(ir, reduce(ir))
as.matrix(ol) ## 将结果转换为矩阵
# 2.5 Counting Overlapping Ranges
# 连接2.4查找两个IRanges的交集的个数
# coverage counts the number of ranges over each position
cov <- coverage(ir) ## 计算按照是否相交排序分类归于哪一类的问题,以及每个位点对应的个数
plotRanges(ir) ## 可视化后明显分为三类,
cov <- as.vector(cov)
mat <- cbind(seq_along(cov)-0.5, cov)
d <- diff(cov) != 0
# diff函数的使用,
# 参考 https://cloud.tencent.com/developer/ask/190364
# 它计算连续元素对之间的差异。即没有参数指定的时候即为后一个数与前一个数的差
# 假设temp是对某些变量的观察,例如一小时的温度读数。然后diff(temp)会告诉你每小时温度变化了多少。
# 相反的diff()是cumsum()(累积总和):
mat <- rbind(cbind(mat[d,1]+1, mat[d,2]), mat)
mat <- mat[order(mat[,1]),]
lines(mat, col="red", lwd=4)
axis(2)
# 2.6 Finding Neighboring Ranges
# nearest 查找最邻近的区域
# 2.7 Transforming Ranges
# 2.7.1 Adjusting starts, ends and widths 调整开始、终止、宽度
# 图文并茂的参考的教程
# https://blog.csdn.net/u014801157/article/details/24372479
# 为了形象化展示区别,这里采用上面网站中修改后的plotRanges画图函数
plotRanges <- function(x, xlim = x, main = deparse(substitute(x)), col = "black",
add = FALSE, ybottom = NULL, ...) {
require(scales)
col <- alpha(col, 0.5)
height <- 1
sep <- 0.5
if (is(xlim, "Ranges")) {
xlim <- c(min(start(xlim)), max(end(xlim)) * 1.2)
}
if (!add) {
bins <- disjointBins(IRanges(start(x), end(x) + 1))
ybottom <- bins * (sep + height) - height
par(mar = c(3, 0.5, 2.5, 0.5), mgp = c(1.5, 0.5, 0))
plot.new()
plot.window(xlim, c(0, max(bins) * (height + sep)))
}
rect(start(x) - 0.5, ybottom, end(x) + 0.5, ybottom + height, col = col,
...)
text((start(x) + end(x))/2, ybottom + height/2, 1:length(x), col = "white",
xpd = TRUE)
title(main)
axis(1)
invisible(ybottom)
}
# shift(x, shift=0L, use.names=TRUE)
ir <- IRanges(c(3000, 2500), width = c(300, 850))
ir.trans <- shift(ir, 500) # 表示该区域整体向前平移十个单位比如[1, 10] → [11, 20]
xlim <- c(0, max(end(ir, ir.trans)) * 1.3)
ybottom <- plotRanges(ir, xlim = xlim, main = "shift", col = "blue")
plotRanges(ir.trans, add = TRUE, ybottom = ybottom, main = "", col = "red")
# 同样的还有函数 narrow, resize, flank, reflect, restrict, and threebands
# narrow(x, start=NA, end=NA, width=NA, use.names=TRUE)
narrow(ir, start=1:5, width=2) # 将原来的开始位置按照1到5逐级增加,保持宽度为2,比如原来的c(1, 2, 3, 4, 5, 6) → c(1, 3, 5, 7, 9, 6)
# resize(x, width, fix="start", use.names=TRUE, ...)
resize(ir, width = 2, fix = "start") # 保持start不变,将宽度调整为2
resize(ir, width = 2, fix = "end") # 保持end不变,将宽度调整为2
# flank(x, width, start=TRUE, both=FALSE, use.names=TRUE, ...)
ir.trans <- flank(ir, width = 2, both = T) # 表示以start为中心区域向两侧扩展2个单位,即1变为c(-1,2)
xlim <- c(0, max(end(ir, ir.trans)) * 1.3)
ybottom <- plotRanges(ir, col = "red")
plotRanges(ir.trans, col = "blue", add = TRUE, ybottom = ybottom)
# promoters(x, upstream=2000, downstream=200, use.names=TRUE, ...)
promoters(ir, upstream = 3, downstream = 0) # 表示以start为中心取上游3个单位
promoters(ir, upstream = 0, downstream = 3) # 表示以start为中心取下游3个单位的区间
# reflect(x, bounds, use.names=TRUE)
bounds <- IRanges(c(2000, 3000), width = 500)
ir.trans <- reflect(ir, bounds = bounds) # 不懂
xlim <- c(0, max(end(ir, ir.trans)) * 1.3)
ybottom <- plotRanges(ir, col = "red")
plotRanges(ir.trans, col = "blue", add = TRUE, ybottom = ybottom)
# restrict(x, start=NA, end=NA, keep.all.ranges=FALSE, use.names=TRUE)
ir.trans <- restrict(ir, start = 2000, end = 3000) # 将之前的区间都落在此区间进行重新定义
ybottom <- plotRanges(ir, col = "red")
plotRanges(ir.trans, col = "blue", add = TRUE, ybottom = ybottom)
# threebands(x, start=NA, end=NA, width=NA)
# 延伸了narrow函数的功能,返回3个相关范围,分别对应到"left","middle" 和 "right",其中"middle"范围对应的就是narrow函数返回的范围:
ir + seq_len(length(ir))
ir * -2 # 将区域范围扩大两倍
ir * 2 # 将区域需缩小
# 2.7.2 Making ranges disjoint
ir <- IRanges(c(1, 8, 14, 15, 19, 34, 40),
width=c(12, 6, 6, 15, 6, 2, 7))
disjoin(ir) # 将整个区域变成没有交集的最小区间,
plotRanges(disjoin(ir))
plotRanges(ir)
disjointBins(ir) # 表示按照不重叠原则,将区间分类,可以通过plotRanges()画出来一目了然
# 2.7.3 Other transformations
trans_ir <- reflect(ir, IRanges(start(ir), width=width(ir)*2))
plotRanges(trans_ir)
flank(ir, width = seq_len(length(ir)))
# 2.8 Set Operations
# gap找间隙
gaps(ir, start=1, end=50) # 查找1到50这个区间,ir没有覆盖到的区域即gap
plotRanges(gaps(ir, start=1, end=50), c(1,50))
# intersect求交集区域;setdiff求差异区域;union求并集:
ir1 <- IRanges(c(200, 1000, 3000, 2500), width = c(600, 1000, 300, 850))
ir2 <- IRanges(c(100, 1500, 2000, 3500), width = c(600, 800, 1000, 550))
xlim <- c(0, max(end(ir1, ir2)) * 1.3)
ybottom <- plotRanges(reduce(ir1), xlim = xlim, col = "blue", main = "original")
plotRanges(ir1)
plotRanges(ir2)
plotRanges(reduce(ir2), xlim = xlim, col = "blue", main = "", add = TRUE, ybottom = ybottom)
plotRanges(intersect(ir1, ir2), xlim = xlim, col = "red")
# intersect两个区间的交集
plotRanges(setdiff(ir1, ir2), xlim = xlim, col = "red")
# setdiff两个区间没有交集的区间
plotRanges(union(ir1, ir2), xlim = xlim, col = "red")
# union两个区间的并集
# 3 Vector Views
# 3.1 Creating Views
set.seed(0) # 保证随机数据别人可重复
lambda <- c(rep(0.001, 4500), seq(0.001, 10, length=500),
seq(10, 0.001, length=500))
## seq函数表示随机生成数据,seq(0.001, 10, length=500) 表示0.001~10随机生成500个数值
xVector <- rpois(1e7, lambda)
xRle <- Rle(xVector)
xViews <- Views(xRle, xRle >= 1)
xViews <- slice(xRle, 1)
xRleList <- RleList(xRle, 2L * rev(xRle))
xViewsList <- slice(xRleList, 1)
# 3.2 Aggregating Views
# 4 Lists of Atomic Vectors
学习过程中很nice的一个参考资源, 图文并茂的解释其中一些不容易靠文字理解的东西
R/BioC序列处理之五:Rle和Ranges
image.png
网友评论