美文网首页多组学知识基础
count normalization的代码

count normalization的代码

作者: 城管大队哈队长 | 来源:发表于2020-05-02 15:40 被阅读0次

    在前面的count normalization的方法部分,我初步介绍了一些常见的count normalization的方法,这部分我会写出对应的代码。用的数据集是我自己的一个数据。

    欢迎大家用自己数据集检查代码是否有错误

    # 总共是37336个基因
    > head(data)
              WT_0h_R1 WT_0h_R2 WT_4h_R1 WT_4h_R2
    AT1G01010       15       15      152      106
    AT1G01020      353      461      255      279
    AT1G03987        1        0        3        0
    AT1G01030       23       39       74       56
    AT1G01040      695      746     1831     1611
    AT1G03993        0        0        0        0
    
    > str(data)
    'data.frame':   37336 obs. of  4 variables:
     $ WT_0h_R1: int  15 353 1 23 695 0 412 0 64 13 ...
     $ WT_0h_R2: int  15 461 0 39 746 0 541 0 118 7 ...
     $ WT_4h_R1: int  152 255 3 74 1831 0 517 0 349 0 ...
     $ WT_4h_R2: int  106 279 0 56 1611 0 512 0 629 0 ...
    

    CPM

    公式
    CPM_i=\frac{X_i}{N}10^6

    CPM其实是最简单的,就是你对应基因的count除以你样本的总count,从而得到你对应基因在对应样本里面的百分比。

    data_CPM <- (sweep(data, 2, colSums(data), FUN = "/"))*1e6
    
    > head(data_CPM)
                 WT_0h_R1   WT_0h_R2   WT_4h_R1  WT_4h_R2
    AT1G01010  0.36330819  0.3272056  7.0490344  5.413802
    AT1G01020  8.54985270 10.0561188 11.8256828 14.249536
    AT1G03987  0.02422055  0.0000000  0.1391257  0.000000
    AT1G01030  0.55707256  0.8507346  3.4317668  2.860122
    AT1G01040 16.83327940 16.2730252 84.9130397 82.279578
    AT1G03993  0.00000000  0.0000000  0.0000000  0.000000
    

    我们可以来检查下

    > colSums(data_CPM)
    WT_0h_R1 WT_0h_R2 WT_4h_R1 WT_4h_R2 
       1e+06    1e+06    1e+06    1e+06 
    
    > data[2,2] / sum(data[, 2]) * 1e6
    [1] 10.05612
    

    FPKM

    公式:
    FPKM_i=\frac{\frac{X_i}{l_i}}{N}10^9FPKM_i=\frac{\frac{X_i}{l_i}}{N}10^9
    这里就要涉及到基因长度的问题了,我在之前的方法部分也提到过,不同的算法对于长度的界定是不一样的,我这里用的基因长度是直接把外显子overlap,然后去掉冗余部分,得到累加之后的长度。因为我count是利用featureCount得到的,用的GTF是Araport11的GTF,所以我直接用GenomicFeatures包来得到外显子去冗余累加之后的长度。

    # 先得到外显子长度
    library(GenomicFeatures)
    Tx_At <- makeTxDbFromGFF("~/reference/annoation/Athaliana/Araport11/Araport11_GFF3_genes_transposons.201606.gtf",
                             chrominfo = Seqinfo(seqnames = paste0("Chr",c(1:5,"C","M")),
                                                 seqlengths = c(30427671,19698289,23459830,18585056,26975502,154478,366924)))
    # 会报一些warning
    Import genomic features from the file as a GRanges object ... OK
    Prepare the 'metadata' data frame ... OK
    Make the TxDb object ... OK
    Warning messages:
    1: In .get_cds_IDX(mcols0$type, mcols0$phase) :
      The "phase" metadata column contains non-NA values for features of
      type stop_codon, exon. This information was ignored.
    2: In .reject_transcripts(bad_tx, because) :
      The following transcripts were rejected because they have stop
      codons that cannot be mapped to an exon: AT1G07320.4, AT1G18180.2,
      AT1G30230.1, AT1G36380.1, AT1G52415.1, AT1G52940.1, AT2G18110.1,
      AT2G35130.1, AT2G35130.3, AT2G39050.1, AT3G13445.1, AT3G17660.1,
      AT3G17660.3, AT3G52070.2, AT3G54680.1, AT3G59450.2, AT4G13730.2,
      AT4G17730.1, AT4G39620.2, AT5G01520.2, AT5G22794.1, AT5G27710.2,
      AT5G45240.2
    

    # 然后发现少6个基因
    > genes(Tx_At)
    GRanges object with 37330 ranges and 1 metadata column:
                seqnames        ranges strand |     gene_id
                   <Rle>     <IRanges>  <Rle> | <character>
      AT1G01010     Chr1     3631-5899      + |   AT1G01010
      AT1G01020     Chr1     6788-9130      - |   AT1G01020
      AT1G01030     Chr1   11649-13714      - |   AT1G01030
      AT1G01040     Chr1   23121-31227      + |   AT1G01040
      AT1G01050     Chr1   31170-33171      - |   AT1G01050
            ...      ...           ...    ... .         ...
      ATMG09730     ChrM 181268-181347      + |   ATMG09730
      ATMG09740     ChrM 191954-192025      - |   ATMG09740
      ATMG09950     ChrM 333651-333725      - |   ATMG09950
      ATMG09960     ChrM 347266-347348      + |   ATMG09960
      ATMG09980     ChrM 359666-359739      - |   ATMG09980
      -------
      seqinfo: 7 sequences from an unspecified genome
    
    # 我们可以看下缺失的6个基因,刚好就是报warning的那几个基因
    > rownames(data)[!rownames(data) %in% names(genes(Tx_At)) ]
    [1] "AT1G36380" "AT1G52415" "AT1G52940" "AT2G18110" "AT2G39050" "AT3G54680"
    > gene_notin <- rownames(data)[!rownames(data) %in% names(genes(Tx_At)) ]
    > data[gene_notin, ]
              WT_0h_R1 WT_0h_R2 WT_4h_R1 WT_4h_R2
    AT1G36380       75       82      140      179
    AT1G52415        0        0        0        0
    AT1G52940        0        0        0        0
    AT2G18110      522      488      622      686
    AT2G39050       26       31      133      182
    AT3G54680     1236     1777      727      728
    

    之所以会不包括这6个基因,应该是这6个基因的stop condon部分并不在exon里面的缘故……

    后来我发现gff并不会报这个错

    Tx_At <- makeTxDbFromGFF("~/reference/annoation/Athaliana/Araport11/Araport11_GFF3_genes_transposons.201606.gff",
                             chrominfo = Seqinfo(seqnames = paste0("Chr",c(1:5,"C","M")),
                                                 seqlengths = c(30427671,19698289,23459830,18585056,26975502,154478,366924)))
    
    Import genomic features from the file as a GRanges object ... OK
    Prepare the 'metadata' data frame ... OK
    Make the TxDb object ... OK
    Warning message:
    In .get_cds_IDX(mcols0$type, mcols0$phase) :
      The "phase" metadata column contains non-NA values for features of type exon. This
      information was ignored.
    
    > genes(Tx_At)
    GRanges object with 38194 ranges and 1 metadata column:
                seqnames            ranges strand |     gene_id
                   <Rle>         <IRanges>  <Rle> | <character>
      AT1G01010     Chr1         3631-5899      + |   AT1G01010
      AT1G01020     Chr1         6788-9130      - |   AT1G01020
      AT1G01030     Chr1       11649-13714      - |   AT1G01030
      AT1G01040     Chr1       23121-31227      + |   AT1G01040
      AT1G01046     Chr1       28500-28706      + |   AT1G01046
            ...      ...               ...    ... .         ...
        MIR854c     Chr5 11838096-11838316      + |     MIR854c
        MIR854d     Chr5 11689861-11690081      - |     MIR854d
        MIR854e     Chr5 11942467-11942687      + |     MIR854e
         MIR855     Chr2   4674427-4674698      + |      MIR855
        MIR858b     Chr1 26772633-26772935      + |     MIR858b
      -------
      seqinfo: 7 sequences from an unspecified genome
    
    > rownames(data)[!rownames(data) %in% names(genes(Tx_At)) ]
    character(0)
    

    然后我们就可以得到对应基因的外显子信息

    > exons_gene <- exonsBy(Tx_At, by = "gene")
    > exons_gene
    GRangesList object of length 38194:
    $AT1G01010
    GRanges object with 6 ranges and 2 metadata columns:
          seqnames    ranges strand |   exon_id     exon_name
             <Rle> <IRanges>  <Rle> | <integer>   <character>
      [1]     Chr1 3631-3913      + |         1 NAC001:exon:1
      [2]     Chr1 3996-4276      + |         2 NAC001:exon:2
      [3]     Chr1 4486-4605      + |         3 NAC001:exon:3
      [4]     Chr1 4706-5095      + |         4 NAC001:exon:4
      [5]     Chr1 5174-5326      + |         5 NAC001:exon:5
      [6]     Chr1 5439-5899      + |         6 NAC001:exon:6
      -------
      seqinfo: 7 sequences from an unspecified genome
    
    $AT1G01020
    GRanges object with 14 ranges and 2 metadata columns:
           seqnames    ranges strand |   exon_id    exon_name
              <Rle> <IRanges>  <Rle> | <integer>  <character>
       [1]     Chr1 6788-7069      - |     26363 ARV1:exon:14
       [2]     Chr1 7157-7232      - |     26364 ARV1:exon:13
       [3]     Chr1 7157-7450      - |     26365 ARV1:exon:12
       [4]     Chr1 7384-7450      - |     26366 ARV1:exon:11
       [5]     Chr1 7564-7649      - |     26367 ARV1:exon:10
       ...      ...       ...    ... .       ...          ...
      [10]     Chr1 8417-8464      - |     26372  ARV1:exon:5
      [11]     Chr1 8571-8737      - |     26373  ARV1:exon:4
      [12]     Chr1 8571-9130      - |     26374  ARV1:exon:3
      [13]     Chr1 8594-8737      - |     26375  ARV1:exon:2
      [14]     Chr1 8594-9130      - |     26376  ARV1:exon:1
      -------
      seqinfo: 7 sequences from an unspecified genome
    
    ...
    <38192 more elements>
    
    # 我们只用到37336个基因的外显子信息
    > exons_gen_expr <- exons_gene[rownames(data)]
    > exons_gen_expr
    GRangesList object of length 37336:
    $AT1G01010
    GRanges object with 6 ranges and 2 metadata columns:
          seqnames    ranges strand |   exon_id     exon_name
             <Rle> <IRanges>  <Rle> | <integer>   <character>
      [1]     Chr1 3631-3913      + |         1 NAC001:exon:1
      [2]     Chr1 3996-4276      + |         2 NAC001:exon:2
      [3]     Chr1 4486-4605      + |         3 NAC001:exon:3
      [4]     Chr1 4706-5095      + |         4 NAC001:exon:4
      [5]     Chr1 5174-5326      + |         5 NAC001:exon:5
      [6]     Chr1 5439-5899      + |         6 NAC001:exon:6
      -------
      seqinfo: 7 sequences from an unspecified genome
    
    $AT1G01020
    GRanges object with 14 ranges and 2 metadata columns:
           seqnames    ranges strand |   exon_id    exon_name
              <Rle> <IRanges>  <Rle> | <integer>  <character>
       [1]     Chr1 6788-7069      - |     26363 ARV1:exon:14
       [2]     Chr1 7157-7232      - |     26364 ARV1:exon:13
       [3]     Chr1 7157-7450      - |     26365 ARV1:exon:12
       [4]     Chr1 7384-7450      - |     26366 ARV1:exon:11
       [5]     Chr1 7564-7649      - |     26367 ARV1:exon:10
       ...      ...       ...    ... .       ...          ...
      [10]     Chr1 8417-8464      - |     26372  ARV1:exon:5
      [11]     Chr1 8571-8737      - |     26373  ARV1:exon:4
      [12]     Chr1 8571-9130      - |     26374  ARV1:exon:3
      [13]     Chr1 8594-8737      - |     26375  ARV1:exon:2
      [14]     Chr1 8594-9130      - |     26376  ARV1:exon:1
      -------
      seqinfo: 7 sequences from an unspecified genome
    
    ...
    <37334 more elements>
    

    然后我们做去冗余,统计长度等操作

    # reduce去冗余
    > reduce(exons_gen_expr)
    GRangesList object of length 37336:
    $AT1G01010
    GRanges object with 6 ranges and 0 metadata columns:
          seqnames    ranges strand
             <Rle> <IRanges>  <Rle>
      [1]     Chr1 3631-3913      +
      [2]     Chr1 3996-4276      +
      [3]     Chr1 4486-4605      +
      [4]     Chr1 4706-5095      +
      [5]     Chr1 5174-5326      +
      [6]     Chr1 5439-5899      +
      -------
      seqinfo: 7 sequences from an unspecified genome
    
    $AT1G01020
    GRanges object with 7 ranges and 0 metadata columns:
          seqnames    ranges strand
             <Rle> <IRanges>  <Rle>
      [1]     Chr1 6788-7069      -
      [2]     Chr1 7157-7450      -
      [3]     Chr1 7564-7649      -
      [4]     Chr1 7762-7835      -
      [5]     Chr1 7942-7987      -
      [6]     Chr1 8236-8464      -
      [7]     Chr1 8571-9130      -
      -------
      seqinfo: 7 sequences from an unspecified genome
    
    $AT1G03987
    GRanges object with 1 range and 0 metadata columns:
          seqnames      ranges strand
             <Rle>   <IRanges>  <Rle>
      [1]     Chr1 11101-11372      +
      -------
      seqinfo: 7 sequences from an unspecified genome
    
    ...
    <37333 more elements>
    
    # width统计长度
    > width(reduce(exons_gen_expr))
    IntegerList of length 37336
    [["AT1G01010"]] 283 281 120 390 153 461
    [["AT1G01020"]] 282 294 86 74 46 229 560
    [["AT1G03987"]] 272
    [["AT1G01030"]] 1525 380
    [["AT1G01040"]] 1331 114 211 395 220 173 123 161 234 151 183 165 96 629 98 191 906 165 407 326
    [["AT1G03993"]] 788
    [["AT1G01050"]] 255 82 121 66 108 66 29 124 171 143
    [["AT1G03997"]] 283
    [["AT1G01060"]] 225 666 1074 81 270 82 62 112 849
    [["AT1G01070"]] 611 152 406 117 545
    ...
    <37326 more elements>
    
    # sum下
    > gene_exon_length <- sum(width(reduce(exons_gen_expr)))
    > head(gene_exon_length)
    AT1G01010 AT1G01020 AT1G03987 AT1G01030 AT1G01040 AT1G03993 
         1688      1571       272      1905      6279       788 
    
    # 然后再看看基因的顺序和我们data上的基因顺序是不是一致的
    # 是一样的
    > mean(names(gene_exon_length) == rownames(data))
    [1] 1
    

    参考文章

    这样我们就可以计算了

    # 先除以长度
    > data_count_length <- sweep(data, 2, gene_exon_length , FUN = "/")
    > head(data_count_length)
                 WT_0h_R1    WT_0h_R2   WT_4h_R1   WT_4h_R2
    AT1G01010 0.008886256 0.008886256 0.09004739 0.06279621
    AT1G01020 0.224697645 0.293443666 0.16231700 0.17759389
    AT1G03987 0.003676471 0.000000000 0.01102941 0.00000000
    AT1G01030 0.012073491 0.020472441 0.03884514 0.02939633
    AT1G01040 0.110686415 0.118808728 0.29160694 0.25656952
    AT1G03993 0.000000000 0.000000000 0.00000000 0.00000000
    
    # 再除以文库的总count
    > data_FRPKM <- sweep(data_count_length, 2, colSums(data), FUN = "/")*1e9
    > head(data_FRPKM)
                WT_0h_R1  WT_0h_R2   WT_4h_R1  WT_4h_R2
    AT1G01010 0.21522997 0.1938422  4.1759683  3.207229
    AT1G01020 5.44229962 6.4010941  7.5274874  9.070360
    AT1G03987 0.08904612 0.0000000  0.5114915  0.000000
    AT1G01030 0.29242654 0.4465798  1.8014524  1.501376
    AT1G01040 2.68088540 2.5916587 13.5233381 13.103930
    AT1G03993 0.00000000 0.0000000  0.0000000  0.000000
    

    检查下

    # 我们会发现各个文库之间的FPKM总和是不一样的,这也是FPKM受到人质疑的原因
    > colSums(data_FRPKM)
    WT_0h_R1 WT_0h_R2 WT_4h_R1 WT_4h_R2 
    752371.2 716130.4 616267.3 608064.9
    
    # 我们检查一个基因
    > rownames(data_FRPKM)[10]
    [1] "AT1G01070"
    > data_FRPKM[rownames(data_FRPKM)[10],]
      WT_0h_R1   WT_0h_R2   WT_4h_R1   WT_4h_R2 
    0.17196455 0.08339484 0.00000000 0.00000000 
    
    # 手动算一个基因的FPKM值
    > gene_exon_length["AT1G01070"]
    AT1G01070 
         1831 
    > (data["AT1G01070",1] / 1831) / sum(data[, 1]) * 1e9
    [1] 0.1719646
    
    > (data["AT1G01070", ] / 1831) / colSums(data) * 1e9
               WT_0h_R1   WT_0h_R2 WT_4h_R1 WT_4h_R2
    AT1G01070 0.1719646 0.08339484        0        0
    

    TPM

    公式:
    TPM_i=\frac{\frac{X_i}{l_i}}{\sum_j\frac{X_j}{l_j}}10^6

    > data_TPM <- sweep(data_count_length, 2, colSums(data_count_length), FUN = "/") * 1e6
    > head(data_TPM)
               WT_0h_R1  WT_0h_R2   WT_4h_R1  WT_4h_R2
    AT1G01010 0.2860689 0.2706800  6.7762290  5.274484
    AT1G01020 7.2335303 8.9384480 12.2146472 14.916763
    AT1G03987 0.1183540 0.0000000  0.8299832  0.000000
    AT1G01030 0.3886732 0.6236013  2.9231673  2.469105
    AT1G01040 3.5632485 3.6189762 21.9439494 21.550216
    AT1G03993 0.0000000 0.0000000  0.0000000  0.000000
    
    > colSums(data_TPM)
    WT_0h_R1 WT_0h_R2 WT_4h_R1 WT_4h_R2 
       1e+06    1e+06    1e+06    1e+06 
    

    DESeq2的norm count

    library(DESeq2)
    
    coldata <- data.frame(row.names = colnames(data), 
                          type = gsub("_R\\d+", "", colnames(data)))
    
    > coldata
              type
    WT_0h_R1 WT_0h
    WT_0h_R2 WT_0h
    WT_4h_R1 WT_4h
    WT_4h_R2 WT_4h
    
    
    dds <- DESeqDataSetFromMatrix(countData = data,
                                  colData = coldata,
                                  design= ~ type)
    
    > dds <- DESeq(dds)
    estimating size factors
    estimating dispersions
    gene-wise dispersion estimates
    mean-dispersion relationship
    final dispersion estimates
    fitting model and testing
    
    
    > resultsNames(dds)
    [1] "Intercept"           "type_WT_4h_vs_WT_0h"
    
    > res <- lfcShrink(dds, coef=2, type="apeglm")
    using 'apeglm' for LFC shrinkage. If used in published research, please cite:
        Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
        sequence count data: removing the noise and preserving large differences.
        Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
    
    
    > data_DESeq2_norm <- DESeq2::counts(dds, normalized = TRUE)
    > head(data_DESeq2_norm)
                WT_0h_R1  WT_0h_R2    WT_4h_R1   WT_4h_R2
    AT1G01010  16.519753  14.86971  143.445091  100.86482
    AT1G01020 388.764862 456.99580  240.648014  265.48381
    AT1G03987   1.101317   0.00000    2.831153    0.00000
    AT1G01030  25.330288  38.66125   69.835110   53.28707
    AT1G01040 765.415238 739.52032 1727.947115 1532.95488
    AT1G03993   0.000000   0.00000    0.000000    0.00000
    

    还有个我没用到的链接
    https://gist.github.com/slowkow/c6ab0348747f86e2748b

    edgeR的TMM

    不太会……不敢说……懒得写

    下一部分我会探究不同的normaliztion的差异

    相关文章

      网友评论

        本文标题:count normalization的代码

        本文链接:https://www.haomeiwen.com/subject/kkvkghtx.html