美文网首页基因组坐标区间操作
跟着官方文档来看看GRanges这个对象

跟着官方文档来看看GRanges这个对象

作者: 刘小泽 | 来源:发表于2020-08-05 11:32 被阅读0次

    刘小泽写于2020.8.5
    今天跟着官方文档来看看GRanges这个对象

    1 前言

    内容来自:https://bioconductor.org/packages/release/bioc/vignettes/GenomicRanges/inst/doc/GenomicRangesIntroduction.html

    GenomicRanges是Bioconductor各个项目都在使用的基因组坐标的存储方式,它基于IRanges 建立,目前为BSgenome、Rsamtools、ShortRead 、rtracklayer、GenomicFeatures、 GenomicAlignments、VariantAnnotation 等提供支持

    安装加载

    if (!require("BiocManager"))install.packages("BiocManager")
    if (!require("GenomicRanges"))BiocManager::install("GenomicRanges")
    
    library(GenomicRanges)
    

    2 GRanges: Genomic Ranges

    存储了一系列的基因组起始、终止坐标,可以为连续的结合位点、转录本、外显子等提供支持

    gr <- GRanges(
        seqnames = Rle(c("chr1", "chr2", "chr1", "chr3"), c(1, 3, 2, 4)),
        ranges = IRanges(101:110, end = 111:120, names = head(letters, 10)),
        strand = Rle(strand(c("-", "+", "*", "+", "-")), c(1, 2, 2, 3, 2)),
        score = 1:10,
        GC = seq(1, 0, length=10))
    gr
    #> GRanges object with 10 ranges and 2 metadata columns:
    #>     seqnames    ranges strand |     score                GC
    #>        <Rle> <IRanges>  <Rle> | <integer>         <numeric>
    #>   a     chr1   101-111      - |         1                 1
    #>   b     chr2   102-112      + |         2 0.888888888888889
    #>   c     chr2   103-113      + |         3 0.777777777777778
    #>   d     chr2   104-114      * |         4 0.666666666666667
    #>   e     chr1   105-115      * |         5 0.555555555555556
    #>   f     chr1   106-116      + |         6 0.444444444444444
    #>   g     chr3   107-117      + |         7 0.333333333333333
    #>   h     chr3   108-118      + |         8 0.222222222222222
    #>   i     chr3   109-119      - |         9 0.111111111111111
    #>   j     chr3   110-120      - |        10                 0
    #>   -------
    #>   seqinfo: 3 sequences from an unspecified genome; no seqlengths
    

    左边是基因组坐标(染色体名称、范围、正负链),右边是meta信息(score、GC等等)

    2.1 认识这个数据

    其中包含多少行
    names(gr)
    #>  [1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j"
    # 或者
    length(gr)
    #> [1] 10
    
    分别提取seqnames、ranges、strand
    seqnames(gr)
    #> factor-Rle of length 10 with 4 runs
    #>   Lengths:    1    3    2    4
    #>   Values : chr1 chr2 chr1 chr3
    #> Levels(3): chr1 chr2 chr3
    
    ranges(gr)
    #> IRanges object with 10 ranges and 0 metadata columns:
    #>         start       end     width
    #>     <integer> <integer> <integer>
    #>   a       101       111        11
    #>   b       102       112        11
    #>   c       103       113        11
    #>   d       104       114        11
    #>   e       105       115        11
    #>   f       106       116        11
    #>   g       107       117        11
    #>   h       108       118        11
    #>   i       109       119        11
    #>   j       110       120        11
    
    strand(gr)
    #> factor-Rle of length 10 with 5 runs
    #>   Lengths: 1 2 2 3 2
    #>   Values : - + * + -
    #> Levels(3): + - *
    
    一次性提取seqnames、ranges、strand
    granges(gr)
    #> GRanges object with 10 ranges and 0 metadata columns:
    #>     seqnames    ranges strand
    #>        <Rle> <IRanges>  <Rle>
    #>   a     chr1   101-111      -
    #>   b     chr2   102-112      +
    #>   c     chr2   103-113      +
    #>   d     chr2   104-114      *
    #>   e     chr1   105-115      *
    #>   f     chr1   106-116      +
    #>   g     chr3   107-117      +
    #>   h     chr3   108-118      +
    #>   i     chr3   109-119      -
    #>   j     chr3   110-120      -
    #>   -------
    #>   seqinfo: 3 sequences from an unspecified genome; no seqlengths
    
    一次性提取meta信息

    得到一个数据框

    mcols(gr)
    #> DataFrame with 10 rows and 2 columns
    #>       score                GC
    #>   <integer>         <numeric>
    #> a         1                 1
    #> b         2 0.888888888888889
    #> c         3 0.777777777777778
    #> d         4 0.666666666666667
    #> e         5 0.555555555555556
    #> f         6 0.444444444444444
    #> g         7 0.333333333333333
    #> h         8 0.222222222222222
    #> i         9 0.111111111111111
    #> j        10                 0
    

    如果只想提取其中的某部分

    mcols(gr)$score
    #>  [1]  1  2  3  4  5  6  7  8  9 10
    
    还可以加入染色体长度
    seqlengths(gr) <- c(249250621, 243199373, 198022430)
    # 提取
    seqlengths(gr)
    #>      chr1      chr2      chr3 
    #> 249250621 243199373 198022430
    

    2.2 拆分+取子集

    拆分

    利用split会得到一个GRangesList对象

    sp <- split(gr, rep(1:2, each=5))
    sp
    #> GRangesList object of length 2:
    #> $`1`
    #> GRanges object with 5 ranges and 2 metadata columns:
    #>     seqnames    ranges strand |     score                GC
    #>        <Rle> <IRanges>  <Rle> | <integer>         <numeric>
    #>   a     chr1   101-111      - |         1                 1
    #>   b     chr2   102-112      + |         2 0.888888888888889
    #>   c     chr2   103-113      + |         3 0.777777777777778
    #>   d     chr2   104-114      * |         4 0.666666666666667
    #>   e     chr1   105-115      * |         5 0.555555555555556
    #>   -------
    #>   seqinfo: 3 sequences from an unspecified genome
    #> 
    #> $`2`
    #> GRanges object with 5 ranges and 2 metadata columns:
    #>     seqnames    ranges strand |     score                GC
    #>        <Rle> <IRanges>  <Rle> | <integer>         <numeric>
    #>   f     chr1   106-116      + |         6 0.444444444444444
    #>   g     chr3   107-117      + |         7 0.333333333333333
    #>   h     chr3   108-118      + |         8 0.222222222222222
    #>   i     chr3   109-119      - |         9 0.111111111111111
    #>   j     chr3   110-120      - |        10                 0
    #>   -------
    #>   seqinfo: 3 sequences from an unspecified genome
    

    如果要再连起来

    c(sp[[1]], sp[[2]])
    #> GRanges object with 10 ranges and 2 metadata columns:
    #>     seqnames    ranges strand |     score                GC
    #>        <Rle> <IRanges>  <Rle> | <integer>         <numeric>
    #>   a     chr1   101-111      - |         1                 1
    #>   b     chr2   102-112      + |         2 0.888888888888889
    #>   c     chr2   103-113      + |         3 0.777777777777778
    #>   d     chr2   104-114      * |         4 0.666666666666667
    #>   e     chr1   105-115      * |         5 0.555555555555556
    #>   f     chr1   106-116      + |         6 0.444444444444444
    #>   g     chr3   107-117      + |         7 0.333333333333333
    #>   h     chr3   108-118      + |         8 0.222222222222222
    #>   i     chr3   109-119      - |         9 0.111111111111111
    #>   j     chr3   110-120      - |        10                 0
    #>   -------
    #>   seqinfo: 3 sequences from an unspecified genome
    
    简单取子集

    操作和向量的操作类似

    gr[2:3]
    #> GRanges object with 2 ranges and 2 metadata columns:
    #>     seqnames    ranges strand |     score                GC
    #>        <Rle> <IRanges>  <Rle> | <integer>         <numeric>
    #>   b     chr2   102-112      + |         2 0.888888888888889
    #>   c     chr2   103-113      + |         3 0.777777777777778
    #>   -------
    #>   seqinfo: 3 sequences from an unspecified genome
    

    如果要再选取一些meta信息,操作又和数据框类似

    gr[2:3, "GC"]
    #> GRanges object with 2 ranges and 1 metadata column:
    #>     seqnames    ranges strand |                GC
    #>        <Rle> <IRanges>  <Rle> |         <numeric>
    #>   b     chr2   102-112      + | 0.888888888888889
    #>   c     chr2   103-113      + | 0.777777777777778
    #>   -------
    #>   seqinfo: 3 sequences from an unspecified genome
    
    对子集重新赋值
    # 先把全部的行拆成一个列表,然后把第一行赋值给第二行
    singles <- split(gr, names(gr))
    grMod <- gr
    grMod[2] <- singles[[1]]
    head(grMod, n=3)
    #> GRanges object with 3 ranges and 2 metadata columns:
    #>     seqnames    ranges strand |     score                GC
    #>        <Rle> <IRanges>  <Rle> | <integer>         <numeric>
    #>   a     chr1   101-111      - |         1                 1
    #>   b     chr1   101-111      - |         1                 1
    #>   c     chr2   103-113      + |         3 0.777777777777778
    #>   -------
    #>   seqinfo: 3 sequences from an unspecified genome
    
    还有重复、翻转、取特定区域等操作
    # 重复
    rep(singles[[2]], times = 3)
    #> GRanges object with 3 ranges and 2 metadata columns:
    #>     seqnames    ranges strand |     score                GC
    #>        <Rle> <IRanges>  <Rle> | <integer>         <numeric>
    #>   b     chr2   102-112      + |         2 0.888888888888889
    #>   b     chr2   102-112      + |         2 0.888888888888889
    #>   b     chr2   102-112      + |         2 0.888888888888889
    #>   -------
    #>   seqinfo: 3 sequences from an unspecified genome
    # 翻转
    rev(gr)
    #> GRanges object with 10 ranges and 2 metadata columns:
    #>     seqnames    ranges strand |     score                GC
    #>        <Rle> <IRanges>  <Rle> | <integer>         <numeric>
    #>   j     chr3   110-120      - |        10                 0
    #>   i     chr3   109-119      - |         9 0.111111111111111
    #>   h     chr3   108-118      + |         8 0.222222222222222
    #>   g     chr3   107-117      + |         7 0.333333333333333
    #>   f     chr1   106-116      + |         6 0.444444444444444
    #>   e     chr1   105-115      * |         5 0.555555555555556
    #>   d     chr2   104-114      * |         4 0.666666666666667
    #>   c     chr2   103-113      + |         3 0.777777777777778
    #>   b     chr2   102-112      + |         2 0.888888888888889
    #>   a     chr1   101-111      - |         1                 1
    #>   -------
    #>   seqinfo: 3 sequences from an unspecified genome
    # 前后两行
    head(gr,n=2)
    #> GRanges object with 2 ranges and 2 metadata columns:
    #>     seqnames    ranges strand |     score                GC
    #>        <Rle> <IRanges>  <Rle> | <integer>         <numeric>
    #>   a     chr1   101-111      - |         1                 1
    #>   b     chr2   102-112      + |         2 0.888888888888889
    #>   -------
    #>   seqinfo: 3 sequences from an unspecified genome
    tail(gr,n=2)
    #> GRanges object with 2 ranges and 2 metadata columns:
    #>     seqnames    ranges strand |     score                GC
    #>        <Rle> <IRanges>  <Rle> | <integer>         <numeric>
    #>   i     chr3   109-119      - |         9 0.111111111111111
    #>   j     chr3   110-120      - |        10                 0
    #>   -------
    #>   seqinfo: 3 sequences from an unspecified genome
    # 取第2-4行
    window(gr, start=2,end=4)
    #> GRanges object with 3 ranges and 2 metadata columns:
    #>     seqnames    ranges strand |     score                GC
    #>        <Rle> <IRanges>  <Rle> | <integer>         <numeric>
    #>   b     chr2   102-112      + |         2 0.888888888888889
    #>   c     chr2   103-113      + |         3 0.777777777777778
    #>   d     chr2   104-114      * |         4 0.666666666666667
    #>   -------
    #>   seqinfo: 3 sequences from an unspecified genome
    # 取2-3行 + 7-9行
    gr[IRanges(start=c(2,7), end=c(3,9))]
    #> GRanges object with 5 ranges and 2 metadata columns:
    #>     seqnames    ranges strand |     score                GC
    #>        <Rle> <IRanges>  <Rle> | <integer>         <numeric>
    #>   b     chr2   102-112      + |         2 0.888888888888889
    #>   c     chr2   103-113      + |         3 0.777777777777778
    #>   g     chr3   107-117      + |         7 0.333333333333333
    #>   h     chr3   108-118      + |         8 0.222222222222222
    #>   i     chr3   109-119      - |         9 0.111111111111111
    #>   -------
    #>   seqinfo: 3 sequences from an unspecified genome
    

    3 对区间操作

    3.1 intra-range(同一个GRange中;行内部调整)

    获取数据
    g <- gr[1:3]
    g <- append(g, singles[[10]])
    start(g)
    #> [1] 101 102 103 110
    end(g)
    #> [1] 111 112 113 120
    width(g)
    #> [1] 11 11 11 11
    # 如果存在两行包含的关系,range就会合并显示
    range(g)
    #> GRanges object with 3 ranges and 0 metadata columns:
    #>       seqnames    ranges strand
    #>          <Rle> <IRanges>  <Rle>
    #>   [1]     chr1   101-111      -
    #>   [2]     chr2   102-113      +
    #>   [3]     chr3   110-120      -
    #>   -------
    #>   seqinfo: 3 sequences from an unspecified genome
    g
    #> GRanges object with 4 ranges and 2 metadata columns:
    #>     seqnames    ranges strand |     score                GC
    #>        <Rle> <IRanges>  <Rle> | <integer>         <numeric>
    #>   a     chr1   101-111      - |         1                 1
    #>   b     chr2   102-112      + |         2 0.888888888888889
    #>   c     chr2   103-113      + |         3 0.777777777777778
    #>   j     chr3   110-120      - |        10                 0
    #>   -------
    #>   seqinfo: 3 sequences from an unspecified genome
    
    对侧翼操作
    # 取上游10bp(区分正负链)
    flank(g, 10)
    #> GRanges object with 4 ranges and 2 metadata columns:
    #>     seqnames    ranges strand |     score                GC
    #>        <Rle> <IRanges>  <Rle> | <integer>         <numeric>
    #>   a     chr1   112-121      - |         1                 1
    #>   b     chr2    92-101      + |         2 0.888888888888889
    #>   c     chr2    93-102      + |         3 0.777777777777778
    #>   j     chr3   121-130      - |        10                 0
    #>   -------
    #>   seqinfo: 3 sequences from an unspecified genome
    # 取下游10bp(区分正负链)
    flank(g, 10, start=FALSE)
    #> GRanges object with 4 ranges and 2 metadata columns:
    #>     seqnames    ranges strand |     score                GC
    #>        <Rle> <IRanges>  <Rle> | <integer>         <numeric>
    #>   a     chr1    91-100      - |         1                 1
    #>   b     chr2   113-122      + |         2 0.888888888888889
    #>   c     chr2   114-123      + |         3 0.777777777777778
    #>   j     chr3   100-109      - |        10                 0
    #>   -------
    #>   seqinfo: 3 sequences from an unspecified genome
    
    整体移动
    # 全部移动5bp(不分正负链)
    shift(g, 5)
    #> GRanges object with 4 ranges and 2 metadata columns:
    #>     seqnames    ranges strand |     score                GC
    #>        <Rle> <IRanges>  <Rle> | <integer>         <numeric>
    #>   a     chr1   106-116      - |         1                 1
    #>   b     chr2   107-117      + |         2 0.888888888888889
    #>   c     chr2   108-118      + |         3 0.777777777777778
    #>   j     chr3   115-125      - |        10                 0
    #>   -------
    #>   seqinfo: 3 sequences from an unspecified genome
    
    直接修改区间范围
    # 全部修改为30bp(区分正负链;默认固定start)
    resize(g, 30,fix="start")
    #> GRanges object with 4 ranges and 2 metadata columns:
    #>     seqnames    ranges strand |     score                GC
    #>        <Rle> <IRanges>  <Rle> | <integer>         <numeric>
    #>   a     chr1    82-111      - |         1                 1
    #>   b     chr2   102-131      + |         2 0.888888888888889
    #>   c     chr2   103-132      + |         3 0.777777777777778
    #>   j     chr3    91-120      - |        10                 0
    #>   -------
    #>   seqinfo: 3 sequences from an unspecified genome
    

    3.2 inter-range (同一个GRange中;行与行之间调整)

    g
    #> GRanges object with 4 ranges and 2 metadata columns:
    #>     seqnames    ranges strand |     score                GC
    #>        <Rle> <IRanges>  <Rle> | <integer>         <numeric>
    #>   a     chr1   101-111      - |         1                 1
    #>   b     chr2   102-112      + |         2 0.888888888888889
    #>   c     chr2   103-113      + |         3 0.777777777777778
    #>   j     chr3   110-120      - |        10                 0
    #>   -------
    #>   seqinfo: 3 sequences from an unspecified genome
    
    将重叠区域合并
    reduce(g)
    #> GRanges object with 3 ranges and 0 metadata columns:
    #>       seqnames    ranges strand
    #>          <Rle> <IRanges>  <Rle>
    #>   [1]     chr1   101-111      -
    #>   [2]     chr2   102-113      +
    #>   [3]     chr3   110-120      -
    #>   -------
    #>   seqinfo: 3 sequences from an unspecified genome
    
    得到没有任何重叠的结果
    # 如果有重叠,则单独列出来
    disjoin(g)
    #> GRanges object with 5 ranges and 0 metadata columns:
    #>       seqnames    ranges strand
    #>          <Rle> <IRanges>  <Rle>
    #>   [1]     chr1   101-111      -
    #>   [2]     chr2       102      +
    #>   [3]     chr2   103-112      +
    #>   [4]     chr2       113      +
    #>   [5]     chr3   110-120      -
    #>   -------
    #>   seqinfo: 3 sequences from an unspecified genome
    
    对没有任何重叠的结果数量进行统计
    coverage(g)
    #> RleList of length 3
    #> $chr1
    #> integer-Rle of length 249250621 with 3 runs
    #>   Lengths:       100        11 249250510
    #>   Values :         0         1         0
    #> 
    #> $chr2
    #> integer-Rle of length 243199373 with 5 runs
    #>   Lengths:       101         1        10         1 243199260
    #>   Values :         0         1         2         1         0
    #> 
    #> $chr3
    #> integer-Rle of length 198022430 with 3 runs
    #>   Lengths:       109        11 198022310
    #>   Values :         0         1         0
    
    看GRange对象以外的区域
    # 也就是每个染色体长度除去了g包含的坐标
    gaps(g)
    #> GRanges object with 12 ranges and 0 metadata columns:
    #>        seqnames        ranges strand
    #>           <Rle>     <IRanges>  <Rle>
    #>    [1]     chr1   1-249250621      +
    #>    [2]     chr1         1-100      -
    #>    [3]     chr1 112-249250621      -
    #>    [4]     chr1   1-249250621      *
    #>    [5]     chr2         1-101      +
    #>    ...      ...           ...    ...
    #>    [8]     chr2   1-243199373      *
    #>    [9]     chr3   1-198022430      +
    #>   [10]     chr3         1-109      -
    #>   [11]     chr3 121-198022430      -
    #>   [12]     chr3   1-198022430      *
    #>   -------
    #>   seqinfo: 3 sequences from an unspecified genome
    

    3.3 between-range (两个GRange对象之间)

    重点操作是:findOverlapsunionintersectsetdiff

    数据准备
    g2 <- head(gr, n=2)
    g
    #> GRanges object with 4 ranges and 2 metadata columns:
    #>     seqnames    ranges strand |     score                GC
    #>        <Rle> <IRanges>  <Rle> | <integer>         <numeric>
    #>   a     chr1   101-111      - |         1                 1
    #>   b     chr2   102-112      + |         2 0.888888888888889
    #>   c     chr2   103-113      + |         3 0.777777777777778
    #>   j     chr3   110-120      - |        10                 0
    #>   -------
    #>   seqinfo: 3 sequences from an unspecified genome
    g2
    #> GRanges object with 2 ranges and 2 metadata columns:
    #>     seqnames    ranges strand |     score                GC
    #>        <Rle> <IRanges>  <Rle> | <integer>         <numeric>
    #>   a     chr1   101-111      - |         1                 1
    #>   b     chr2   102-112      + |         2 0.888888888888889
    #>   -------
    #>   seqinfo: 3 sequences from an unspecified genome
    
    并集
    union(g, g2)
    #> GRanges object with 3 ranges and 0 metadata columns:
    #>       seqnames    ranges strand
    #>          <Rle> <IRanges>  <Rle>
    #>   [1]     chr1   101-111      -
    #>   [2]     chr2   102-113      +
    #>   [3]     chr3   110-120      -
    #>   -------
    #>   seqinfo: 3 sequences from an unspecified genome
    
    交集
    intersect(g, g2)
    #> GRanges object with 2 ranges and 0 metadata columns:
    #>       seqnames    ranges strand
    #>          <Rle> <IRanges>  <Rle>
    #>   [1]     chr1   101-111      -
    #>   [2]     chr2   102-112      +
    #>   -------
    #>   seqinfo: 3 sequences from an unspecified genome
    
    补集
    setdiff(g, g2)
    #> GRanges object with 2 ranges and 0 metadata columns:
    #>       seqnames    ranges strand
    #>          <Rle> <IRanges>  <Rle>
    #>   [1]     chr2       113      +
    #>   [2]     chr3   110-120      -
    #>   -------
    #>   seqinfo: 3 sequences from an unspecified genome
    
    findOverlaps

    数据一:https://raw.githubusercontent.com/Bioconductor/BiocWorkshops/master/100_Morgan_RBiocForAll/CpGislands.Hsapiens.hg38.UCSC.bed

    suppressMessages(library("rtracklayer"))
    library(stringr)
    fname='CpGislands.Hsapiens.hg38.UCSC.bed.txt'
    # 重命名,不需要后缀.txt
    aft_name <- gsub('.txt','',fname)
    file.rename(fname,aft_name)
    #> Warning in file.rename(fname, aft_name): cannot rename file 'CpGislands.Hsapiens.hg38.UCSC.bed.txt' to
    #> 'CpGislands.Hsapiens.hg38.UCSC.bed', reason 'No such file or directory'
    #> [1] FALSE
    cpg <- import(aft_name)
    # 原本BED文件是0-based,半开半闭区间,比如[0,10) = 0..9十个数
    # 而import()函数自动将BED的类型转成Bioconductor统一类型:1-based、全闭区间,也就是将[0,10)变成了[1,10],还是原来的十个数值
    head(cpg)
    #> GRanges object with 6 ranges and 1 metadata column:
    #>       seqnames              ranges strand |        name
    #>          <Rle>           <IRanges>  <Rle> | <character>
    #>   [1]     chr1 155188537-155192004      * |    CpG:_361
    #>   [2]     chr1     2226774-2229734      * |    CpG:_366
    #>   [3]     chr1   36306230-36307408      * |    CpG:_110
    #>   [4]     chr1   47708823-47710847      * |    CpG:_164
    #>   [5]     chr1   53737730-53739637      * |    CpG:_221
    #>   [6]     chr1 144179072-144179313      * |     CpG:_20
    #>   -------
    #>   seqinfo: 254 sequences from an unspecified genome; no seqlengths
    seqnames(cpg)
    #> factor-Rle of length 30477 with 254 runs
    #>   Lengths:                    2535                    1682 ...                       1                       2
    #>   Values :                    chr1                    chr2 ... chr22_KI270735v1_random chr22_KI270738v1_random
    #> Levels(254): chr1 chr2 chr3 chr4 ... chr22_KI270734v1_random chr22_KI270735v1_random chr22_KI270738v1_random
    # 只对标准的染色体1-22+X+Y进行处理
    cpg <- keepStandardChromosomes(cpg, pruning.mode = "coarse")
    head( start(cpg) )
    #> [1] 155188537   2226774  36306230  47708823  53737730 144179072
    hist(log10(width(cpg)))
    
    # 取子集
    subset(cpg, seqnames %in% c("chr1", "chr2"))
    #> GRanges object with 4217 ranges and 1 metadata column:
    #>          seqnames              ranges strand |        name
    #>             <Rle>           <IRanges>  <Rle> | <character>
    #>      [1]     chr1 155188537-155192004      * |    CpG:_361
    #>      [2]     chr1     2226774-2229734      * |    CpG:_366
    #>      [3]     chr1   36306230-36307408      * |    CpG:_110
    #>      [4]     chr1   47708823-47710847      * |    CpG:_164
    #>      [5]     chr1   53737730-53739637      * |    CpG:_221
    #>      ...      ...                 ...    ... .         ...
    #>   [4213]     chr2 242003256-242004412      * |     CpG:_79
    #>   [4214]     chr2 242006590-242010686      * |    CpG:_263
    #>   [4215]     chr2 242045491-242045723      * |     CpG:_16
    #>   [4216]     chr2 242046615-242047706      * |    CpG:_170
    #>   [4217]     chr2 242088150-242089411      * |    CpG:_149
    #>   -------
    #>   seqinfo: 24 sequences from an unspecified genome; no seqlengths
    

    数据二:TxDb.Hsapiens.UCSC.hg38.knownGene

    library("TxDb.Hsapiens.UCSC.hg38.knownGene")
    tx <- transcripts(TxDb.Hsapiens.UCSC.hg38.knownGene)
    # 如果取外显子坐标:exons(TxDb.Hsapiens.UCSC.hg38.knownGene)
    # 如果取基因坐标:genes(TxDb.Hsapiens.UCSC.hg38.knownGene)
    tx
    #> GRanges object with 247541 ranges and 2 metadata columns:
    #>                    seqnames        ranges strand |     tx_id           tx_name
    #>                       <Rle>     <IRanges>  <Rle> | <integer>       <character>
    #>        [1]             chr1   11869-14409      + |         1 ENST00000456328.2
    #>        [2]             chr1   12010-13670      + |         2 ENST00000450305.2
    #>        [3]             chr1   29554-31097      + |         3 ENST00000473358.1
    #>        [4]             chr1   30267-31109      + |         4 ENST00000469289.1
    #>        [5]             chr1   30366-30503      + |         5 ENST00000607096.1
    #>        ...              ...           ...    ... .       ...               ...
    #>   [247537] chrUn_GL000220v1 155997-156149      + |    247537 ENST00000619779.1
    #>   [247538] chrUn_KI270442v1 380608-380726      + |    247538 ENST00000620265.1
    #>   [247539] chrUn_KI270442v1 217250-217401      - |    247539 ENST00000611690.1
    #>   [247540] chrUn_KI270744v1   51009-51114      - |    247540 ENST00000616830.1
    #>   [247541] chrUn_KI270750v1 148668-148843      + |    247541 ENST00000612925.1
    #>   -------
    #>   seqinfo: 595 sequences (1 circular) from hg38 genome
    
    # 同样也是保留标准染色体信息
    tx <- keepStandardChromosomes(tx, pruning.mode="coarse")
    seqnames(tx)
    #> factor-Rle of length 227462 with 25 runs
    #>   Lengths: 19915 16586 14251  9364 10786 10427 10798  9307 ...  4437 13623  5403  2925  4920  6970   978    37
    #>   Values :  chr1  chr2  chr3  chr4  chr5  chr6  chr7  chr8 ... chr18 chr19 chr20 chr21 chr22  chrX  chrY  chrM
    #> Levels(25): chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 ... chr17 chr18 chr19 chr20 chr21 chr22 chrX chrY chrM
    

    开始取交集

    findOverlaps(tx,cpg)
    #> Hits object with 140077 hits and 0 metadata columns:
    #>            queryHits subjectHits
    #>            <integer>   <integer>
    #>        [1]         3          10
    #>        [2]        32          18
    #>        [3]        33          18
    #>        [4]        34          18
    #>        [5]        35          18
    #>        ...       ...         ...
    #>   [140073]    227385       13824
    #>   [140074]    227386       13823
    #>   [140075]    227386       13824
    #>   [140076]    227419       13829
    #>   [140077]    227424       13831
    #>   -------
    #>   queryLength: 227462 / subjectLength: 27949
    
    # 统计每个转录本上有多少CpG位点
    olaps <- countOverlaps(tx, cpg)
    length(olaps)
    #> [1] 227462
    table(olaps)
    #> olaps
    #>      0      1      2      3      4      5      6      7      8      9     10     11     12     13     14     15 
    #> 126181  81273  12621   3804   1557    713    427    266    175     96     68     42     43     33     25     22 
    #>     16     17     18     19     20     21     22     23     24     25     26     27     28     29     30     31 
    #>     20      7     14      7      7      3      7      6      3      1      4      1      3      1      6      5 
    #>     32     33     34     35     36     37     38     63     65     71 
    #>      3      3      1      2      3      2      1      4      1      1
    
    # 添加结果到GRanges
    tx$cpgOverlaps <- olaps
    tx
    #> GRanges object with 227462 ranges and 3 metadata columns:
    #>            seqnames      ranges strand |     tx_id           tx_name cpgOverlaps
    #>               <Rle>   <IRanges>  <Rle> | <integer>       <character>   <integer>
    #>        [1]     chr1 11869-14409      + |         1 ENST00000456328.2           0
    #>        [2]     chr1 12010-13670      + |         2 ENST00000450305.2           0
    #>        [3]     chr1 29554-31097      + |         3 ENST00000473358.1           1
    #>        [4]     chr1 30267-31109      + |         4 ENST00000469289.1           0
    #>        [5]     chr1 30366-30503      + |         5 ENST00000607096.1           0
    #>        ...      ...         ...    ... .       ...               ...         ...
    #>   [227458]     chrM   5826-5891      - |    227458 ENST00000387409.1           0
    #>   [227459]     chrM   7446-7514      - |    227459 ENST00000387416.2           0
    #>   [227460]     chrM 14149-14673      - |    227460 ENST00000361681.2           0
    #>   [227461]     chrM 14674-14742      - |    227461 ENST00000387459.1           0
    #>   [227462]     chrM 15956-16023      - |    227462 ENST00000387461.2           0
    #>   -------
    #>   seqinfo: 25 sequences (1 circular) from hg38 genome
    # 根据这个结果取子集
    subset(tx, cpgOverlaps > 10)
    #> GRanges object with 281 ranges and 3 metadata columns:
    #>         seqnames            ranges strand |     tx_id           tx_name cpgOverlaps
    #>            <Rle>         <IRanges>  <Rle> | <integer>       <character>   <integer>
    #>     [1]     chr1   2050411-2185395      + |       292 ENST00000378567.8          15
    #>     [2]     chr1   2050485-2146108      + |       293 ENST00000468310.5          11
    #>     [3]     chr1   2073462-2185390      + |       298 ENST00000400921.6          13
    #>     [4]     chr1   2073986-2185190      + |       300 ENST00000461106.6          13
    #>     [5]     chr1   3069168-3434342      + |       382 ENST00000511072.5          32
    #>     ...      ...               ...    ... .       ...               ...         ...
    #>   [277]     chrX 40051246-40177329      - |    223668 ENST00000378455.8          11
    #>   [278]     chrX 40051248-40177329      - |    223669 ENST00000342274.8          11
    #>   [279]     chrX 40062955-40177083      - |    223675 ENST00000406200.3          11
    #>   [280]     chrY     333933-386907      - |    226968 ENST00000390665.9          14
    #>   [281]     chrY     344896-386955      - |    226974 ENST00000445792.7          12
    #>   -------
    #>   seqinfo: 25 sequences (1 circular) from hg38 genome
    

    4 更复杂的GRangesList

    多个GRanges对象放到一个列表组成了 GRangesList

    4.1 构建

    gr1 <- GRanges(
        seqnames = "chr2",
        ranges = IRanges(103, 106),
        strand = "+",
        score = 5L, GC = 0.45)
    gr2 <- GRanges(
        seqnames = c("chr1", "chr1"),
        ranges = IRanges(c(107, 113), width = 3),
        strand = c("+", "-"),
        score = 3:4, GC = c(0.3, 0.5))
    grl <- GRangesList("txA" = gr1, "txB" = gr2)
    grl
    #> GRangesList object of length 2:
    #> $txA
    #> GRanges object with 1 range and 2 metadata columns:
    #>       seqnames    ranges strand |     score        GC
    #>          <Rle> <IRanges>  <Rle> | <integer> <numeric>
    #>   [1]     chr2   103-106      + |         5      0.45
    #>   -------
    #>   seqinfo: 2 sequences from an unspecified genome; no seqlengths
    #> 
    #> $txB
    #> GRanges object with 2 ranges and 2 metadata columns:
    #>       seqnames    ranges strand |     score        GC
    #>          <Rle> <IRanges>  <Rle> | <integer> <numeric>
    #>   [1]     chr1   107-109      + |         3       0.3
    #>   [2]     chr1   113-115      - |         4       0.5
    #>   -------
    #>   seqinfo: 2 sequences from an unspecified genome; no seqlengths
    

    4.2 基本操作

    seqnames(grl)
    #> RleList of length 2
    #> $txA
    #> factor-Rle of length 1 with 1 run
    #>   Lengths:    1
    #>   Values : chr2
    #> Levels(2): chr2 chr1
    #> 
    #> $txB
    #> factor-Rle of length 2 with 1 run
    #>   Lengths:    2
    #>   Values : chr1
    #> Levels(2): chr2 chr1
    ranges(grl)
    #> IRangesList object of length 2:
    #> $txA
    #> IRanges object with 1 range and 0 metadata columns:
    #>           start       end     width
    #>       <integer> <integer> <integer>
    #>   [1]       103       106         4
    #> 
    #> $txB
    #> IRanges object with 2 ranges and 0 metadata columns:
    #>           start       end     width
    #>       <integer> <integer> <integer>
    #>   [1]       107       109         3
    #>   [2]       113       115         3
    strand(grl)
    #> RleList of length 2
    #> $txA
    #> factor-Rle of length 1 with 1 run
    #>   Lengths: 1
    #>   Values : +
    #> Levels(3): + - *
    #> 
    #> $txB
    #> factor-Rle of length 2 with 2 runs
    #>   Lengths: 1 1
    #>   Values : + -
    #> Levels(3): + - *
    length(grl)
    #> [1] 2
    names(grl)
    #> [1] "txA" "txB"
    seqlengths(grl)
    #> chr2 chr1 
    #>   NA   NA
    isEmpty(grl)
    #> [1] FALSE
    
    # 提取meta信息,需要先unlist
    mcols(grl) #没结果
    #> DataFrame with 2 rows and 0 columns
    mcols(unlist(grl))
    #> DataFrame with 3 rows and 2 columns
    #>         score        GC
    #>     <integer> <numeric>
    #> txA         5      0.45
    #> txB         3       0.3
    #> txB         4       0.5
    

    4.3 列表的循环操作

    lapply(grl, length)
    #> $txA
    #> [1] 1
    #> 
    #> $txB
    #> [1] 2
    sapply(grl, length)
    #> txA txB 
    #>   1   2
    

    还可以先unlist,计算完重新list

    gr <- unlist(grl)
    gr$log_score <- log(gr$score)
    grl <- relist(gr, grl)
    grl
    #> GRangesList object of length 2:
    #> $txA
    #> GRanges object with 1 range and 3 metadata columns:
    #>       seqnames    ranges strand |     score        GC       log_score
    #>          <Rle> <IRanges>  <Rle> | <integer> <numeric>       <numeric>
    #>   txA     chr2   103-106      + |         5      0.45 1.6094379124341
    #>   -------
    #>   seqinfo: 2 sequences from an unspecified genome; no seqlengths
    #> 
    #> $txB
    #> GRanges object with 2 ranges and 3 metadata columns:
    #>       seqnames    ranges strand |     score        GC        log_score
    #>          <Rle> <IRanges>  <Rle> | <integer> <numeric>        <numeric>
    #>   txB     chr1   107-109      + |         3       0.3 1.09861228866811
    #>   txB     chr1   113-115      - |         4       0.5 1.38629436111989
    #>   -------
    #>   seqinfo: 2 sequences from an unspecified genome; no seqlengths
    

    欢迎关注我们的公众号~_~  
    我们是两个农转生信的小硕,打造生信星球,想让它成为一个不拽术语、通俗易懂的生信知识平台。需要帮助或提出意见请后台留言或发送邮件到jieandze1314@gmail.com

    Welcome to our bioinfoplanet!

    相关文章

      网友评论

        本文标题:跟着官方文档来看看GRanges这个对象

        本文链接:https://www.haomeiwen.com/subject/wapnrktx.html