# 读取文件(featureCounts 产生的 raw count 文件)
# Start from an empty workspace.
# NOTE(review): rm(list = ls()) deletes every object in the calling session;
# it is kept here only to preserve the script's existing behaviour.
rm(list = ls())
options(stringsAsFactors = FALSE)
library(tidyverse)
# tidyverse attaches ggplot2, stringr, dplyr, tidyr, readr, purrr, tibble, forcats
library(data.table) # fread() can read files using multiple threads
# Load the featureCounts table. header = TRUE takes the column names from the
# first row of the file; data.table = FALSE returns a plain data.frame.
a1 <- fread("all.featurecounts.txt", header = TRUE, data.table = FALSE)
# counts矩阵的构建
# Build the counts matrix: columns 7 onward of the featureCounts table are
# taken as the per-sample read counts.
# NOTE(review): assumes the usual featureCounts layout, where the first six
# columns are annotation (Geneid ... Length) — confirm against the input file.
stopifnot(ncol(a1) >= 7)
# drop = FALSE keeps a data frame even when there is only one sample column;
# without it the result collapses to a vector and rownames<- fails.
counts <- a1[, 7:ncol(a1), drop = FALSE]
rownames(counts) <- a1$Geneid # gene IDs become the row names

# Extract Geneid and Length from the featureCounts table; Length is used
# below as the effective gene length for normalisation.
geneid_efflen <- subset(a1, select = c("Geneid", "Length"))
colnames(geneid_efflen) <- c("geneid", "efflen")
geneid_efflen_fc <- geneid_efflen # copy kept for later comparison

# Look up the length of every gene, in the same order as the rows of `counts`.
dim(geneid_efflen)
efflen <- geneid_efflen[match(rownames(counts), geneid_efflen$geneid), "efflen"]
# FPKM/RPKM (Fragments/Reads Per Kilobase of transcript per Million mapped reads):每千个碱基的转录本、每百万映射读取中的 Fragments/reads 数
# RPKM and FPKM refer to single-end and paired-end sequencing respectively;
# the formula is the same for both.
#
# Convert the raw counts of one sample to FPKM.
#
# Args:
#   count:     numeric vector of raw counts, one value per gene. Required: the
#              former default `count = count` referred to itself and could
#              never be evaluated, so it has been removed.
#   efflength: gene lengths in bases, in the same order as `count`. Defaults
#              to the `efflen` object visible where this function is defined.
#
# Returns:
#   Numeric vector of FPKM values, the same length as `count`.
counts2FPKM <- function(count, efflength = efflen) {
  # "Per million" scaling factor derived from the sample's total count.
  PMSC_counts <- sum(count) / 1e6
  # Depth normalisation: reads/fragments per million.
  FPM <- count / PMSC_counts
  # Length normalisation: divide by the gene length in kilobases.
  FPM / (efflength / 1000)
}
# Compute FPKM for every sample (one column of `counts` per sample).
FPKM <- as.data.frame(apply(counts, 2, counts2FPKM))

# Rename the columns to the sample labels. A name vector shorter than the
# number of columns would be padded with NA names without any error, so the
# lengths are checked explicitly first.
# NOTE(review): six labels are hard-coded here — confirm they match the
# samples actually present in `counts`.
fpkm_sample_names <- c(
  "Simmental_1", "Simmental_2", "Simmental_3",
  "Wagyu_1", "Wagyu_2", "Wagyu_3"
)
if (length(fpkm_sample_names) != ncol(FPKM)) {
  stop(
    "FPKM has ", ncol(FPKM), " sample columns but ",
    length(fpkm_sample_names), " sample names were supplied",
    call. = FALSE
  )
}
colnames(FPKM) <- fpkm_sample_names

# Keep only genes (rows) whose FPKM summed over all samples is at least 1.
# drop = FALSE keeps the result a data frame in every case.
FPKM <- FPKM[rowSums(FPKM) >= 1, , drop = FALSE]
colSums(FPKM)
# 当前推荐使用 TPM 进行相关性分析、PCA分析等。TPM (Transcripts Per Million):每百万转录本中来自该基因的转录本数(先做长度标准化,再做深度标准化)
# Convert the raw counts of one sample to TPM.
#
# Args:
#   count:     numeric vector of raw counts, one value per gene. Required: the
#              former default `count = count` referred to itself and could
#              never be evaluated, so it has been removed.
#   efflength: gene lengths in bases, in the same order as `count`. Defaults
#              to the `efflen` object visible where this function is defined.
#
# Returns:
#   Numeric vector of TPM values, the same length as `count`; by construction
#   the values sum to 1e6.
counts2TPM <- function(count, efflength = efflen) {
  # Length normalisation first: reads per kilobase.
  RPK <- count / (efflength / 1000)
  # "Per million" scaling factor computed from the RPK total.
  PMSC_rpk <- sum(RPK) / 1e6
  # Depth normalisation.
  RPK / PMSC_rpk
}
# Compute TPM for every sample (one column of `counts` per sample).
TPM <- as.data.frame(apply(counts, 2, counts2TPM))

# Rename the columns to the sample labels. A name vector shorter than the
# number of columns would be padded with NA names without any error, so the
# lengths are checked explicitly first.
# NOTE(review): ten labels are hard-coded here — confirm they match the
# samples actually present in `counts`.
tpm_sample_names <- c(
  "Zebu_1", "Zebu_2", "Zebu_3", "Zebu_4", "Zebu_5",
  "Holstein_1", "Holstein_2", "Holstein_3", "Holstein_4", "Holstein_5"
)
if (length(tpm_sample_names) != ncol(TPM)) {
  stop(
    "TPM has ", ncol(TPM), " sample columns but ",
    length(tpm_sample_names), " sample names were supplied",
    call. = FALSE
  )
}
colnames(TPM) <- tpm_sample_names

# Drop genes (rows) whose TPM is zero in every sample.
# drop = FALSE keeps the result a data frame in every case.
TPM <- TPM[rowSums(TPM) > 0, , drop = FALSE]
colSums(TPM)
网友评论