转录组分析--FPKM与TPM

作者: 千万别加香菜 | 来源:发表于2022-06-26 08:38 被阅读0次
    读取文件(featureCounts 产生的 raw count 文件)
    # Load the raw count table into `a1`.
    # NOTE: the original `rm(list = ls())` was removed -- clearing the caller's
    # global environment is a side effect a script should not impose.
    options(stringsAsFactors = FALSE)  # only matters on R < 4.0
    library(tidyverse)
    # attaches ggplot2, stringr, dplyr, tidyr, readr, purrr, tibble, forcats
    library(data.table)  # provides fread()
    # header = TRUE: the first row supplies the column names.
    # data.table = FALSE: return a plain data.frame instead of a data.table.
    a1 <- fread("all.featurecounts.txt", header = TRUE, data.table = FALSE)
    

    counts矩阵的构建

    # Build the counts table and the matching vector of gene lengths.
    # Everything from column 7 onward is treated as per-sample counts.
    stopifnot(ncol(a1) >= 7)  # otherwise 7:ncol(a1) would count backwards
    # drop = FALSE keeps a data.frame even when there is only one sample column
    counts <- a1[, 7:ncol(a1), drop = FALSE]
    rownames(counts) <- a1$Geneid  # gene IDs become the row names
    # Extract Geneid and Length from the loaded table; Length is used as the
    # effective length in the FPKM/TPM calculations.
    geneid_efflen <- a1[, c("Geneid", "Length")]
    colnames(geneid_efflen) <- c("geneid", "efflen")
    geneid_efflen_fc <- geneid_efflen  # copy kept for later comparison
    dim(geneid_efflen)
    # Look up the length of every gene in `counts`, in the row order of `counts`
    efflen <- geneid_efflen[match(rownames(counts), geneid_efflen$geneid), "efflen"]
    

    FPKM/RPKM (Fragments/Reads Per Kilobase of transcript per Million mapped fragments/reads):每千碱基转录本长度、每百万比对 fragments/reads 中的 Fragments/reads 数

    # RPKM and FPKM refer to single-end and paired-end sequencing respectively;
    # the formula is the same.
    #
    # Convert one sample's raw counts to FPKM.
    #   count:     numeric vector of raw counts, one entry per gene.
    #   efflength: gene lengths in bases; either one value per entry of `count`
    #              or a single value. Defaults to the global `efflen`.
    # Returns a numeric vector of FPKM values in the same order as `count`.
    counts2FPKM <- function(count, efflength = efflen) {
      # `count` deliberately has no default: the original `count = count`
      # referred to itself and could never be evaluated.
      # Reject mismatched lengths instead of letting R recycle silently.
      stopifnot(length(efflength) == length(count) || length(efflength) == 1L)
      per_million <- sum(count) / 1e6   # "per million" scaling factor (depth normalisation)
      fpm <- count / per_million        # fragments/reads per million
      fpm / (efflength / 1000)          # length normalisation: per kilobase
    }
    # Compute FPKM sample by sample (apply over the columns of `counts`),
    # passing the gene lengths explicitly rather than via the default argument.
    FPKM <- as.data.frame(apply(counts, 2, counts2FPKM, efflength = efflen))
    # Rename the sample columns.
    # NOTE(review): the TPM step in this file labels the same `counts` columns
    # with ten Zebu/Holstein names; only one label set can match the data -- confirm.
    sample_names <- c("Simmental_1", "Simmental_2", "Simmental_3",
                      "Wagyu_1", "Wagyu_2", "Wagyu_3")
    # Fail loudly instead of silently producing NA column names
    stopifnot(length(sample_names) == ncol(FPKM))
    colnames(FPKM) <- sample_names
    # Keep genes (rows) whose FPKM summed over all samples is at least 1.
    # NOTE(review): the original comment said "remove all-zero", which would be
    # `> 0` as in the TPM step; the `>= 1` threshold is kept unchanged -- confirm intent.
    FPKM <- FPKM[rowSums(FPKM) >= 1, , drop = FALSE]
    colSums(FPKM)
    

    当前推荐使用 TPM (Transcripts Per Million) 进行相关性分析、PCA分析等:先按基因长度(每千碱基)做标准化,再按每百万缩放,因此每个样本的 TPM 总和均为 1e6

    # Convert one sample's raw counts to TPM (transcripts per million).
    #   count:     numeric vector of raw counts, one entry per gene.
    #   efflength: gene lengths in bases; either one value per entry of `count`
    #              or a single value. Defaults to the global `efflen`.
    # Returns a numeric vector of TPM values in the same order as `count`;
    # the values sum to 1e6 whenever the length-normalised total is non-zero.
    counts2TPM <- function(count, efflength = efflen) {
      # `count` deliberately has no default: the original `count = count`
      # referred to itself and could never be evaluated.
      # Reject mismatched lengths instead of letting R recycle silently.
      stopifnot(length(efflength) == length(count) || length(efflength) == 1L)
      rpk <- count / (efflength / 1000)   # reads per kilobase (length normalisation)
      per_million <- sum(rpk) / 1e6       # "per million" scaling factor of the RPK values
      rpk / per_million                   # depth normalisation
    }
    # Compute TPM sample by sample (apply over the columns of `counts`),
    # passing the gene lengths explicitly rather than via the default argument.
    TPM <- as.data.frame(apply(counts, 2, counts2TPM, efflength = efflen))
    # Rename the sample columns.
    # NOTE(review): the FPKM step in this file labels the same `counts` columns
    # with six Simmental/Wagyu names; only one label set can match the data -- confirm.
    sample_names <- c("Zebu_1", "Zebu_2", "Zebu_3", "Zebu_4", "Zebu_5",
                      "Holstein_1", "Holstein_2", "Holstein_3", "Holstein_4",
                      "Holstein_5")
    # Fail with a clear message when the number of labels does not match
    stopifnot(length(sample_names) == ncol(TPM))
    colnames(TPM) <- sample_names
    # Drop genes (rows) whose TPM is zero in every sample
    TPM <- TPM[rowSums(TPM) > 0, , drop = FALSE]
    colSums(TPM)
    

    相关文章

      网友评论

        本文标题:转录组分析--FPKM与TPM

        本文链接:https://www.haomeiwen.com/subject/qkwivrtx.html