美文网首页注释和富集
2021-06-27 count_TPM_FPKM转换

2021-06-27 count_TPM_FPKM转换

作者: 学习生信的小兔子 | 来源:发表于2021-06-27 08:55 被阅读0次

    基因长度的计算

    #计算基因长度
    #思路1:计算基因在染色体的起始和结束之差
    #思路2:计算每个基因的最长转录本或所有外显子之和
    ##############################################
    
    #### 方法1 简单把基因在染色体上的起始位置和结束位置之差用作标准化的长度。
    
    if (!requireNamespace("BiocManager", quietly = TRUE))
      install.packages("BiocManager")
    BiocManager::install("biomaRt")
    
    library(biomaRt)
    
    #查看基因组参数
    mart = useMart('ensembl')
    listDatasets(mart)
    
    #以人类为例 获取基因组信息
    bmart <- biomaRt::useMart(biomart = "ENSEMBL_MART_ENSEMBL", 
                              dataset = "hsapiens_gene_ensembl",
                              host = "www.ensembl.org")
    
    # 从输入数据里提取基因名
    feature_ids <- rownames(expMatrix)
    
    attributes = c(
      "ensembl_gene_id",
      #"hgnc_symbol",
      "chromosome_name",
      "start_position",
      "end_position"
    )
    filters = "ensembl_gene_id"
    
    feature_info <- biomaRt::getBM(attributes = attributes, 
                                   filters = filters, 
                                   values = feature_ids, mart = bmart)
    mm <- match(feature_ids, feature_info[[filters]])
    feature_info_full <- feature_info[mm, ]
    rownames(feature_info_full) <- feature_ids
    
    # 计算基因的有效长度eff_length
    eff_length <- abs(feature_info_full$end_position - feature_info_full$start_position)
    names(eff_length) <- feature_info_full$ensembl_gene_id
    write.csv(eff_length, "gene_length_1.csv", row.names = TRUE)
    
    ##############################################
    #方法2 计算每个基因的最长转录本或所有外显子之和
    #source("https://bioconductor.org/biocLite.R")
    #biocLite("GenomicFeatures")
    library(GenomicFeatures)
    #导入GTF 或者GFF3文件,ensembl或者gencode网站GTF注释皆可
    #下载链接ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_22/gencode.v22.annotation.gtf.gz
    txdb <- makeTxDbFromGFF("gencode.v22.annotation.gtf",format="gtf")
    #获取外显子
    exons_gene <- exonsBy(txdb, by = "gene")
    ##并行计算
    #install.packages('parallel')
    library(parallel)
    #检测核心数
    cores<-detectCores(logical = F)
    #设定个核心
    cl <- makeCluster(cores)
    #对外显子重叠部分通过reduce 去冗余,并计算总长度
    results <- parLapply(cl,exons_gene,function(x){sum(width(reduce(x)))})
    stopCluster(cl)#停止
    gene_length2_test <- do.call(rbind,lapply(results, data.frame))
    #或者plyr得到结果
    #install.packages('plyr')
    library (plyr)
    gene_length2_test<- ldply(results, data.frame)
    colnames(gene_length2_test)<-c('gene_id','eff_length')
    
    write.csv(gene_length2, "gene_length_2.csv", row.names = TRUE)
    

    counTOFPKM

    ###读取Count表达矩阵
    expMatrix <- read.table("raw_count.txt",
                                        row.names = 1, header = TRUE, sep="\t")
    #查看前三个基因的read count
    expMatrix[1:3,]
                     sample1 sample2 sample3 sample4 sample5 sample6
    ENSG00000000003    2250    1450    1850    2136    5321    3729
    ENSG00000000005       4       1       3       3       1       2
    ENSG00000000419     512     889     701     569    1144     857
    
    ### read count转FPKM
    #要保证表达矩阵的行名和存放基因长度向量的名字一致, 这一步非常重要。
    eff_length <-read.csv("gene_length_2.csv", row.names = 1, header = T)
    rownames(eff_length)<-eff_length$gene_id 
    colnames(eff_length)<-c("gene_id","eff_length")
    rownames(eff_length) <- do.call(rbind,strsplit(as.character(eff_length$gene_id),'\\.'))[,1]
    
    # 从输入数据里提取基因名
    feature_ids <- rownames(expMatrix)
    
    # 检查gtf eff_length文件和表达量输入文件里基因名的一致性
    if (! all(feature_ids %in% rownames(eff_length))){
      tbl <- table(feature_ids %in% rownames(eff_length))
      msg1 <- sprintf("%i gene is shared, %i gene is specified", tbl[[2]],tbl[[1]])
      warning(msg1)
    } 
    
    if (! identical(feature_ids, rownames(eff_length))){
      msg2 <- sprintf("Given GTF file only contain %i gene, but experssion matrix has %i gene", nrow(eff_length), nrow(expMatrix))
      warning(msg2)
    }
    
    # trim the expression matrix and effetive gene length
    expMatrix <- expMatrix[feature_ids %in% rownames(eff_length),]
    mm <- match(rownames(expMatrix), rownames(eff_length))
    eff_length <- eff_length[mm, ]
    
    if (identical(rownames(eff_length), rownames(expMatrix))){
      print("GTF and expression matix now have the same gene and gene in same order")
    }
    #如果上面代码运行时有警告,主要是因为GTF里面的基因数少于表达矩阵,请换一个更新版本的GTF文件。为了让二者基因数量一致,会删减表达矩阵的行数(基因数)。
    
    #countToFpkm函数
    countToFpkm <- function(counts, effLen){
      N <- sum(counts)
      exp( log(counts) + log(1e9) - log(effLen) - log(N) )
    }
    
    #最后执行下面的代码,从read count转成FPKM:
    fpkms <- apply(expMatrix, 2, countToFpkm, effLen = eff_length$eff_length)
    fpkms.m<-data.frame(fpkms)
    colnames(fpkms.m)<-colnames(expMatrix)
    dim(fpkms.m)
     56830     6
    #查看前三个基因的FPKM值
    fpkms.m[1:3,]
       sample1     sample2     sample3     sample4
    ENSG00000000003 16.69071412 12.89858359 14.96212407 10.90373371
    ENSG00000000005  0.08358028  0.02505679  0.06834302  0.04313667
    ENSG00000000419 14.27027633 29.71295208 21.30146940 10.91330458
                        sample5     sample6
    ENSG00000000003 17.90399004 15.12281791
    ENSG00000000005  0.00947781  0.02284661
    ENSG00000000419 14.46280779 13.05843654
    
    #把算好的FPKM保存到本地
    write.table(fpkms.m, "/Users/apple/Desktop/output_count2fpkm.txt", sep="\t", quote=F, row.names=T)
    
    
    fpkms.m

    counTOTPM

    ##读取Count表达矩阵
    expMatrix <- read.table("raw_count.txt",
                            row.names = 1, header = TRUE, sep="\t")
    expMatrix[1:3,]
    sample1 sample2 sample3 sample4 sample5 sample6
    ENSG00000000003    2250    1450    1850    2136    5321    3729
    ENSG00000000005       4       1       3       3       1       2
    ENSG00000000419     512     889     701     569    1144     857
    
    
    ### read count转TPM
    #首先要保证表达矩阵的行名和存放基因长度向量的名字一致, 这一步非常重要
    
    eff_length <-read.csv("gene_length_1.csv", row.names = 1, header = T)
    eff_length$gene_id <- rownames(eff_length)
    rownames(eff_length) <- do.call(rbind,strsplit(eff_length$gene_id,'\\.'))[,1]
    
    feature_ids <- rownames(expMatrix)
    
    if (! all(feature_ids %in% rownames(eff_length))){
      tbl <- table(feature_ids %in% rownames(eff_length))
      msg1 <- sprintf("%i gene is shared, %i gene is specified", tbl[[2]],tbl[[1]])
      warning(msg1)
      
    } 
    
    if (! identical(feature_ids, rownames(eff_length))){
      msg2 <- sprintf("Given GTF file only contain %i gene, but experssion matrix has %i gene", nrow(eff_length), nrow(expMatrix))
      warning(msg2)
    }
    
    # trim the expression matrix and effetive gene length
    expMatrix <- expMatrix[feature_ids %in% rownames(eff_length),]
    mm <- match(rownames(expMatrix), rownames(eff_length))
    eff_length <- eff_length[mm, ]
    
    if (identical(rownames(eff_length), rownames(expMatrix))){
      print("GTF and expression matix now have the same gene and gene in same order")
    }
    
    #计算TPM
    x <- expMatrix / eff_length$eff_length
    expMatrix_tpm <- t( t(x) / colSums(x) ) * 1e6 
    #检查一下,是不是每列的总和都是1
    colSums(expMatrix_tpm)
    sample1 sample2 sample3 sample4 sample5 sample6 
      1e+06   1e+06   1e+06   1e+06   1e+06   1e+06 
    
    #把算好的TPM保存到本地
    write.table(expMatrix_tpm, "/Users/apple/Desktop/output_count2tpm.txt", sep="\t", quote=F, row.names=T)
    
    expMatrix_tpm

    FPKMToTPM

    rm(list=ls())
    ##读取FPKM表达矩阵
    expMatrix<-read.table("./output_count2fpkm.txt",header = T,row.names = 1)
    #查看前三个基因的FPKM值
    expMatrix[1:3,]
    
                        sample1     sample2     sample3     sample4
    ENSG00000000003 16.69071412 12.89858359 14.96212407 10.90373371
    ENSG00000000005  0.08358028  0.02505679  0.06834302  0.04313667
    ENSG00000000419 14.27027633 29.71295208 21.30146940 10.91330458
                        sample5     sample6
    ENSG00000000003 17.90399004 15.12281791
    ENSG00000000005  0.00947781  0.02284661
    ENSG00000000419 14.46280779 13.05843654
    
    ### FPKM转TPM
    fpkmToTpm <- function(fpkm)
    {
      exp(log(fpkm) - log(sum(fpkm)) + log(1e6))
    }
    
    #计算TPM值,保存到tpms里
    tpms <- apply(expMatrix,2,fpkmToTpm)
    
    #查看前三个基因的TPM值
    tpms[1:3,](可看出和前面计算的TPM一致)
    sample1     sample2    sample3    sample4
    ENSG00000000003 54.7479721 38.98519367 47.2173751 30.6182682
    ENSG00000000005  0.2741555  0.07573264  0.2156765  0.1211301
    ENSG00000000419 46.8085838 89.80561184 67.2230403 30.6451437
                       sample5     sample6
    ENSG00000000003 55.3028651 44.91547454
    ENSG00000000005  0.0292756  0.06785549
    ENSG00000000419 44.6735452 38.78416559
    
    #检查一下,是不是每列的总和都是1
    colSums(tpms)
    sample1 sample2 sample3 sample4 sample5 sample6 
      1e+06   1e+06   1e+06   1e+06   1e+06   1e+06 
    
    #把TPM值保存到文件
    write.table(tpms,"./FPKM2TPM.genes.txt",sep="\t", quote=F, row.names=T)
    

    相关文章

      网友评论

        本文标题:2021-06-27 count_TPM_FPKM转换

        本文链接:https://www.haomeiwen.com/subject/uowtultx.html