01-Bioconductor包学习之IRanges文档学习

作者: 热衷组培的二货潜 | 来源:发表于2018-11-29 14:06 被阅读19次

01-Bioconductor包学习之IRanges文档学习
46.《Bioinformatics Data Skills》之
40.《Bioinformatics Data Skills》之
生信地基系列--IRanges包
02-Bioconductor包学习之GRanges文档学习
45. 《Bioinformatics Data Skills》
javascript中的闭包
Swift5.x-枚举（中文文档）
【生信训练营-4】GenomicRanges入门
swift闭包学习

都是按照包的manual进行学习(链接都采用谷歌打开，或者直接搜索IRanges包)

IRanges包manual
image.png

老版本安装方法

# source("http://www.bioconductor.org/biocLite.R")
# biocLite(c("IRanges"))

# 新版本安装方法
if (!require("BiocManager"))
  install.packages("BiocManager")
BiocManager::install("IRanges")

library(IRanges)
# 使用IRanges包产生以start为开始坐标，宽度为width的区间 
# width区域长度为 end - start + 1
ir1 <- IRanges(start = 1:10, width = 10:1)

# 以start为开始, 终止位置都为11的不同宽同终止位置的区间
ir2 <- IRanges(start = 1:10, end = 11)

# 固定终止位置，可变宽度
ir3 <- IRanges(end = 11, width = 10:1)

# identical: Check whether x and y are identical objects.
identical(ir1, ir2) && identical(ir1, ir3)

ir <- IRanges(c(1, 8, 14, 15, 19, 34, 40),
              width=c(12, 6, 6, 15, 6, 2, 7))
## 以上都是使用不同的`start`, `end`, `width` 组合方式创建类似的IRanges对象
## 可以通过`start`, `end`, `width`三个函数提取对应的数据

# 提取起始坐标
start(ir)

# 提取终止坐标
end(ir)

#提取坐标宽度
width(ir)

# 通过数值和逻辑值可以提取IRanges的子集
ir[1:4] # 提取1至4行
ir[start(ir) <= 15] # 提取起始位置小于等于15的行

# 可视化区域
plotRanges = function(x, xlim = x, main = deparse(substitute(x)), col = "black", sep=0.5){
  height = 1
  if (is(xlim, "Ranges"))
    xlim = c(min(start(xlim)), max(end(xlim)))
  bins = disjointBins(IRanges(start(x), end(x)+1))  
  
  plot.new()
  plot.window(xlim, c(0, max(bins) * (height + sep)))
  ybottom = bins * (sep + height) - height
  rect(start(x) - 0.5, ybottom, end(x) + 0.5, ybottom + height, col = col)
  title(main)
  axis(1)
  invisible(ybottom)
}

plotRanges(ir, col = "blue")

# 插播 逐行解释
#############################################################################################
height <- 1
sep = 0.5
main = deparse(substitute(ir))
xlim <- c(min(start(ir)), max(end(ir))) ## 得到ir的整个区域起始的最小值和最大值
bins <- disjointBins(IRanges(start(ir), end(ir) + 1))
# ?disjointBins
# disjointBins segregates x into a set of bins so that the ranges in each bin are disjoint. Lower-indexed bins are filled first. 
# The method returns an integer vector indicating the bin index for each range.
# 我的理解应该是在所有的区间中，首先从不相交的区间开始排列建立index为1，然后考虑剩下的bin不相交的建立index为2，以此类推。
# https://stackoverflow.com/questions/25130462/get-disjoint-sets-from-a-list-in-r
plot.new()
plot.window(xlim, c(0, max(bins)*(height + sep)))
# xlim给定x轴的范围， 根据bin中index的最大数目，来查看图片所需要叠加的行，即画多少行，然后为了不显的拥挤，将y轴扩大1.5倍
ybottom <- bins * (sep + height) - height 
# 指定rect画图时候y轴方向下面那条现的起始位置
rect(start(ir)-0.5, ybottom, end(ir)+0.5, ybottom + height, col = "black")
# rect(xleft = 1, ybottom = 1, xright = 5, ytop = 5) 对应的值为此代码中的字面意思，即x左，y下，x右, y上，四个值构成一个矩形。
title(deparse(substitute(ir))) # 添加图标标题
axis(1) 
# axis为基础的绘图函数，里面参数有side = 1，2，3，4分别代表刻度尺为放在最下面，最左边，最上面，最右边，
# axis还有一个参数at表示指定轴的标签如 at = c(0, 2, 4, 6))表示轴的标签为0，2，4，6
# 相对应的还有一个label参数，对应at里面的刻度，可以自定义为你想要展示的字符，如：
# axis(side = 1, at = c(0, 2, 4, 6), labels = paste(c(0, 2, 4, 6), "A", sep = ""))
# 更多的需要另外学习基础画图函数
#############################################################################################

## 2.1 Normality
## reduce函数
reduce(ir) # 合并有交集的区域
plotRanges(reduce(ir))

## 2.2 Lists of IRanges objects
## IRangesList
## 将多个IRanges文件合并为list形式。
rl <- IRangesList(ir, rev(ir))
# rev函数表示反向排列输出
start(rl) ## 结果将会输出两个IRanges的起始位置

# 2.3 Vector Extraction
set.seed(0) # 保证随机数据别人可重复
lambda <- c(rep(0.001, 4500), seq(0.001, 10, length=500),
            seq(10, 0.001, length=500))
## seq函数表示随机生成数据，seq(0.001, 10, length=500) 表示0.001~10随机生成500个数值
xVector <- rpois(1e7, lambda)
## 表示Poisson分布生成λ为lambda的1e7个数据
# 泊松分布的参数λ是单位时间(或单位面积)内随机事件的平均发生率。 泊松分布适合于描述单位时间内随机事件发生的次数。
yVector <- rpois(1e7, lambda[c(251:length(lambda), 1:250)])
xRle <- Rle(xVector) # 可以明显看到将数据从38.0Mb变为了11.5Mb
# BioC的IRanges包为这些数据提供了一种简便可行的信息压缩方式，即Rle(Run Length Encoding)
yRle <- Rle(yVector)
irextract <- IRanges(start=c(4501, 4901) , width=100)
xRle[irextract]

## 2.4 Finding Overlapping Ranges
## 区域里面查找两个IRanges的交集
ol <- findOverlaps(ir, reduce(ir))
as.matrix(ol) ## 将结果转换为矩阵

# 2.5 Counting Overlapping Ranges
# 连接2.4查找两个IRanges的交集的个数
#  coverage counts the number of ranges over each position
cov <- coverage(ir) ## 计算按照是否相交排序分类归于哪一类的问题，以及每个位点对应的个数
plotRanges(ir) ## 可视化后明显分为三类，
cov <- as.vector(cov)
mat <- cbind(seq_along(cov)-0.5, cov)
d <- diff(cov) != 0
# diff函数的使用，
# 参考 https://cloud.tencent.com/developer/ask/190364
# 它计算连续元素对之间的差异。即没有参数指定的时候即为后一个数与前一个数的差
# 假设temp是对某些变量的观察，例如一小时的温度读数。然后diff(temp)会告诉你每小时温度变化了多少。
# 相反的diff()是cumsum()（累积总和）：
mat <- rbind(cbind(mat[d,1]+1, mat[d,2]), mat)
mat <- mat[order(mat[,1]),]
lines(mat, col="red", lwd=4)
axis(2)

# 2.6 Finding Neighboring Ranges
# nearest 查找最邻近的区域

# 2.7 Transforming Ranges
# 2.7.1 Adjusting starts, ends and widths 调整开始、终止、宽度
# 图文并茂的参考的教程
# https://blog.csdn.net/u014801157/article/details/24372479

# 为了形象化展示区别，这里采用上面网站中修改后的plotRanges画图函数
plotRanges <- function(x, xlim = x, main = deparse(substitute(x)), col = "black",
                       add = FALSE, ybottom = NULL, ...) {
  require(scales)
  col <- alpha(col, 0.5)
  height <- 1
  sep <- 0.5
  if (is(xlim, "Ranges")) {
    xlim <- c(min(start(xlim)), max(end(xlim)) * 1.2)
  }
  if (!add) {
    bins <- disjointBins(IRanges(start(x), end(x) + 1))
    ybottom <- bins * (sep + height) - height
    par(mar = c(3, 0.5, 2.5, 0.5), mgp = c(1.5, 0.5, 0))
    plot.new()
    plot.window(xlim, c(0, max(bins) * (height + sep)))
  }
  rect(start(x) - 0.5, ybottom, end(x) + 0.5, ybottom + height, col = col,
       ...)
  text((start(x) + end(x))/2, ybottom + height/2, 1:length(x), col = "white",
       xpd = TRUE)
  title(main)
  axis(1)
  invisible(ybottom)
}

# shift(x, shift=0L, use.names=TRUE)
ir <- IRanges(c(3000, 2500), width = c(300, 850))
ir.trans <- shift(ir, 500) # 表示该区域整体向前平移十个单位比如[1, 10] → [11, 20]
xlim <- c(0, max(end(ir, ir.trans)) * 1.3)
ybottom <- plotRanges(ir, xlim = xlim, main = "shift", col = "blue")
plotRanges(ir.trans, add = TRUE, ybottom = ybottom, main = "", col = "red")

# 同样的还有函数 narrow, resize, flank, reflect, restrict, and threebands
# narrow(x, start=NA, end=NA, width=NA, use.names=TRUE)
narrow(ir, start=1:5, width=2)  # 将原来的开始位置按照1到5逐级增加，保持宽度为2，比如原来的c(1, 2, 3, 4, 5, 6) → c(1, 3, 5, 7, 9, 6)

# resize(x, width, fix="start", use.names=TRUE, ...)
resize(ir, width = 2, fix = "start") # 保持start不变，将宽度调整为2
resize(ir, width = 2, fix = "end") # 保持end不变，将宽度调整为2

# flank(x, width, start=TRUE, both=FALSE, use.names=TRUE, ...)
ir.trans <- flank(ir, width = 2, both = T) # 表示以start为中心区域向两侧扩展2个单位，即1变为c(-1,2)
xlim <- c(0, max(end(ir, ir.trans)) * 1.3)
ybottom <- plotRanges(ir, col = "red")
plotRanges(ir.trans, col = "blue", add = TRUE, ybottom = ybottom)

# promoters(x, upstream=2000, downstream=200, use.names=TRUE, ...)
promoters(ir, upstream = 3, downstream = 0) # 表示以start为中心取上游3个单位
promoters(ir, upstream = 0, downstream = 3) # 表示以start为中心取下游3个单位的区间

# reflect(x, bounds, use.names=TRUE)
bounds <- IRanges(c(2000, 3000), width = 500)
ir.trans <- reflect(ir, bounds = bounds) # 不懂
xlim <- c(0, max(end(ir, ir.trans)) * 1.3)
ybottom <- plotRanges(ir, col = "red")
plotRanges(ir.trans, col = "blue", add = TRUE, ybottom = ybottom)

# restrict(x, start=NA, end=NA, keep.all.ranges=FALSE, use.names=TRUE)
ir.trans <- restrict(ir, start = 2000, end = 3000) # 将之前的区间都落在此区间进行重新定义
ybottom <- plotRanges(ir, col = "red")
plotRanges(ir.trans, col = "blue", add = TRUE, ybottom = ybottom)

# threebands(x, start=NA, end=NA, width=NA)
# 延伸了narrow函数的功能，返回3个相关范围，分别对应到"left","middle" 和 "right"，其中"middle"范围对应的就是narrow函数返回的范围：
ir + seq_len(length(ir)) 
ir * -2 # 将区域范围扩大两倍
ir * 2 # 将区域需缩小

# 2.7.2 Making ranges disjoint
ir <- IRanges(c(1, 8, 14, 15, 19, 34, 40),
              width=c(12, 6, 6, 15, 6, 2, 7))
disjoin(ir) # 将整个区域变成没有交集的最小区间，
plotRanges(disjoin(ir)) 
plotRanges(ir)

disjointBins(ir) # 表示按照不重叠原则，将区间分类，可以通过plotRanges()画出来一目了然

# 2.7.3 Other transformations
trans_ir <- reflect(ir, IRanges(start(ir), width=width(ir)*2))
plotRanges(trans_ir)

flank(ir, width = seq_len(length(ir)))

# 2.8 Set Operations
# gap找间隙
gaps(ir, start=1, end=50) # 查找1到50这个区间，ir没有覆盖到的区域即gap
plotRanges(gaps(ir, start=1, end=50), c(1,50))

# intersect求交集区域；setdiff求差异区域；union求并集：
ir1 <- IRanges(c(200, 1000, 3000, 2500), width = c(600, 1000, 300, 850))
ir2 <- IRanges(c(100, 1500, 2000, 3500), width = c(600, 800, 1000, 550))

xlim <- c(0, max(end(ir1, ir2)) * 1.3)
ybottom <- plotRanges(reduce(ir1), xlim = xlim, col = "blue", main = "original")
plotRanges(ir1)
plotRanges(ir2)
plotRanges(reduce(ir2), xlim = xlim, col = "blue", main = "", add = TRUE, ybottom = ybottom)
plotRanges(intersect(ir1, ir2), xlim = xlim, col = "red") 
# intersect两个区间的交集
plotRanges(setdiff(ir1, ir2), xlim = xlim, col = "red")
# setdiff两个区间没有交集的区间
plotRanges(union(ir1, ir2), xlim = xlim, col = "red")
# union两个区间的并集

# 3 Vector Views
# 3.1 Creating Views
set.seed(0) # 保证随机数据别人可重复
lambda <- c(rep(0.001, 4500), seq(0.001, 10, length=500),
            seq(10, 0.001, length=500))
## seq函数表示随机生成数据，seq(0.001, 10, length=500) 表示0.001~10随机生成500个数值
xVector <- rpois(1e7, lambda)
xRle <- Rle(xVector)
xViews <- Views(xRle, xRle >= 1)
xViews <- slice(xRle, 1)

xRleList <- RleList(xRle, 2L * rev(xRle))
xViewsList <- slice(xRleList, 1)

# 3.2 Aggregating Views
# 4 Lists of Atomic Vectors

学习过程中很nice的一个参考资源, 图文并茂的解释其中一些不容易靠文字理解的东西

R/BioC序列处理之五：Rle和Ranges

image.png

01-Bioconductor包学习之IRanges文档学习
都是按照包的manual进行学习(链接都采用谷歌打开，或者直接搜索IRanges包) IRanges包manual...
46.《Bioinformatics Data Skills》之
1. 创建GRanges对象 1.1. 基本信息前面我们学习了IRanges对象，GenomicRanges包引...
40.《Bioinformatics Data Skills》之
《Bioinformatics Data Skills》之使用IRanges包存储基因组范围 R有一系列的包可以帮...
生信地基系列--IRanges包
IRanges包 IRanges 包旨在表示序列、表示沿这些序列的索引的范围以及与这些范围相关的数据。 IRang...
02-Bioconductor包学习之GRanges文档学习
Week 1_08_GenomicRanges - GRanges http://kasperdanielhans...
45. 《Bioinformatics Data Skills》
今天完成IRanges包最后一部分的学习。 slice函数我们之前简单模拟了基因组测序并计算了各个位点的覆盖度：...
javascript中的闭包
该文章是MDN闭包文档学习笔记，方便日后查阅。如要查阅源文档，请移步闭包文档传送门闭包闭包是函数和声明该函数的...
Swift5.x-枚举（中文文档）
引言继续学习Swift文档，从上一章节：闭包，我们学习了Swift闭包相关的内容，如闭包的定义和使用、闭包的简写...
【生信训练营-4】GenomicRanges入门
今天介绍的GenomicRanges包是建立在IRanges包的基础之上，能够为BioC中很多其他的软件包提供支持...
swift闭包学习
闭包作为参数参考 Swift学习之闭包