美文网首页R数据读取 清理Bioconductor学习
02-Bioconductor包学习之GRanges文档学习

02-Bioconductor包学习之GRanges文档学习

作者: 热衷组培的二货潜 | 来源:发表于2018-11-30 22:22 被阅读21次

    Week 1_08_GenomicRanges - GRanges

    # source("http://www.bioconductor.org/biocLite.R")
    # biocLite(c("GenomicRanges"))
    
    library(GenomicRanges)
    gr <- GRanges(seqnames = "chr1",
                  strand = c("+", "-", "+"),
                  ranges = IRanges(start = c(1, 3, 5), width = 3))
    ## 获得下游区域
    flank(gr, 2, start = FALSE)
    ## 获得上游2bp区域
    flank(gr, 2, start = T)
    
    ## seqinfo
    seqinfo(gr)
    seqlengths(gr) <- c("chr1" = 10)
    seqinfo(gr)
    seqlevels(gr)
    # gaps 获取基因上没有倍GRanges覆盖到的区域
    gaps(gr)
    
    seqlevels(gr) <- c("chr1", "chr2")
    seqnames(gr) <- c("chr1", "chr2", "chr1")
    seqinfo(gr)
    sort(gr)
    
    seqlevels(gr) <- c("chr2", "chr1") # 定义染色体的水平顺序
    sort(gr) # 排序后会按照指定的染色体顺序来排序
    
    genome(gr) <- "hg19"  ## 注明基因组的版本
    gr
    
    gr2 <- gr
    genome(gr2) <- "hg18"
    findOverlaps(gr, gr2)  ## 由于基因组版本号不同所以结果会报错
    

    GRanges 的参考说明书学习

    # 新版本包安装方法
    if (!require("BiocManager"))
      install.packages("BiocManager")
    BiocManager::install("GenomicRanges")
    
    library(GenomicRanges)
    gr <- GRanges(seqnames = Rle(c("chr1", "chr2", "chr1", "ch3"), c(1, 3, 2, 4)), # 目前理解Rle这里就相当于rep函数
                  ranges = IRanges(101:100, end = 111:120, names = head(letters, 10)), # 指定区域坐标,以及每个区间的名称
                  strand = Rle(strand(c("-", "+", "*", "+", "-")), c(1, 2, 2, 3, 2)),
                  score = 1:10,
                  GC = seq(1, 0, length = 10)
                  )
    seqnames(gr)
    ranges(gr)
    strand(gr) ## 对于重复数据比较多的情况使用Rle能后节省空间
    granges(gr)
    gr
    mcols(gr) # 提取metadata列
    mcols(gr)$score
    
    seqlengths(gr) <- c(249250621, 243199373, 198022430) ## 指定每条染色体的长度
    seqlengths(gr)
    names(gr)
    length(gr)
    
    # 2.1 分割和合并GRanges
    sp <- split(gr, rep(1:2, each = 5)) # 分割
    sp
    c(sp[[1]], sp[[2]]) # 合并
    
    # 2.2 筛选GRanges
    gr[2:3]
    gr[2:3, "GC"]
    
    singles <- split(gr, names(gr))
    singles
    grMod <- gr
    grMod[2] <- singles[[1]]
    head(grMod, n = 3)
    ## 重复、反转、选择
    rep(singles[[2]], times = 3)
    rev(gr)
    head(gr, n = 2)
    tail(gr, n = 2)
    window(gr, start = 2, end = 4)
    gr[IRanges(start = c(2, 7), end = c(3, 9))] ## 结合IRanges来筛选目标行
    
    # 2.3 Basic interval operations for GRanges objects
    g <- gr[1:3]
    g <- append(g, singles[[10]])  ## append函数表示在末尾添加信息
    start(g)
    end(g)
    width(g)
    range(g)
    flank(g, 10) # 获取下游downstream区域10
    flank(g, 10, start = F) ## 获取上游Updtream区域10
    shift(g, 10) ## 表示整体向前移动十个单位
    resize(g ,10) ## 将区间重新定义,保持起始位置不变,将其变为宽度为10的区间(考虑了正负方向)
    reduce(g) ## 将交集区域合并为一个区域
    gaps(g) ## 取基因组上没有被覆盖的区域
    disjoin(g) ## 取已有区域没有交集的区域坐标
    coverage(g) ## 将所有区域按照交集程度划分为不同等级
    
    # 2.4 Interval set operations for GRanges objects
    g2 <- head(gr, n = 2)
    union(g, g2) ## 取两个GRanges的并集
    intersect(g, g2) # 取交集区域
    setdiff(g, g2) # 取没有交集的区域
    
    g3 <- g[1:2]
    ranges(g3[1]) <- IRanges(start = 105, end = 112) ## 修改区域
    punion(g2, g3) ## 当有相同的metdata时候,可以使用p开头的来进行操作 
    pintersect(g2, g3)
    psetdiff(g2, g3)
    methods(class = "GRanges") # 查看GRanges里面所有函数
    
    # 3GRangesList: Groups of Genomic Ranges
    gr1 <- GRanges(
      seqnames = "chr2",
      ranges = IRanges(103, 106),
      strand = "+",
      score = 5L, GC = 0.45)
    gr2 <- GRanges(
      seqnames = c("chr1", "chr1"),
      ranges = IRanges(c(107, 113), width = 3),
      strand = c("+", "-"),
      score = 3:4, GC = c(0.3, 0.5))
    grl <- GRangesList("txA" = gr1, "txB" = gr2)
    # 3.1 Basic GRangesList accessors
    seqnames(grl)
    ranges(grl)
    strand(grl)
    length(grl)
    names(grl)
    seqnames(grl)
    
    elementNROWS(grl) # 统计每一个小GRanges的行数
    isEmpty(grl)
    mcols(grl) <- c("Transcript A", "Transcript B")
    mcols(unlist(grl))
    
    # 3.2 Combining GRangesList objects
    ul <- unlist(grl)
    ul
    
    grl1 <- GRangesList(
      gr1 = GRanges("chr2", IRanges(3, 6)),
      gr2 = GRanges("chr1", IRanges(c(7,13), width = 3)))
    grl2 <- GRangesList(
      gr1 = GRanges("chr2", IRanges(9, 12)),
      gr2 = GRanges("chr1", IRanges(c(25,38), width = 3)))
    gr1
    gr2
    pc(grl1, grl2) ## 合并两个GRanges
    grl3 <- c(grl1, grl2)
    regroup(grl3, names(grl3)) ## 这两步等同于pc()
    
    # 3.3 Basic interval operations for GRangesList objects
    start(grl)
    end(grl)
    width(grl)
    sum(width(grl))
    shift(grl, 20)
    coverage(grl)
    
    # 3.4 Subsetting GRangesList objects
    grl[1]
    grl[[1]]
    grl["txA"]
    grl$txB
    
    grl[1, "score"]
    grl["txB", "GC"]
    
    rep(grl[[1]], times = 3)
    rev(grl)
    head(grl, n = 1)
    tail(grl, n = 1)
    window(grl, start = 1, end = 1)
    grl[IRanges(start = 2, end = 2)]
    
    # 3.5 Looping over GRangesList objects
    lapply(grl, length)
    sapply(grl, length)
    
    grl2 <- shift(grl, 10)
    names(grl2) <- c("shiftTxA", "shiftTxB")
    mapply(c, grl, grl2)
    Map(c, grl, grl2)
    
    endoapply(grl, rev) 
    mendoapply(c, grl, grl2)
    Reduce(c, grl)
    
    gr <- unlist(grl)
    gr$log_score <- log(gr$score)
    grl <- relist(gr, grl)
    
    # 更多资料
    ?GRangesList
    methods(class="GRangesList")   # _partial_ list
    
    # 4 Interval overlaps involving GRanges and GRangesList objects
    findOverlaps(gr, grl)
    countOverlaps(gr, grl)
    subsetByOverlaps(gr,grl)
    findOverlaps(gr, grl, select="first")
    findOverlaps(grl, gr, select="first")
    
    

    相关文章

      网友评论

        本文标题:02-Bioconductor包学习之GRanges文档学习

        本文链接:https://www.haomeiwen.com/subject/rrvkcqtx.html