2021-06-27 count_TPM_FPKM转换

作者: 学习生信的小兔子 | 来源:发表于2021-06-27 08:55 被阅读0次

2021-06-27 count_TPM_FPKM转换
210627承传公园
2022-07-03 思考成长周复盘（第106/130周）
2021-06-27 思考成长周复盘
南湖之行
20210703《梦圆大地：袁隆平传》
我一定是把老公气急了
【餐饮100问】28.美团禁止二选一？假的！
#Dairy235 ?!
把一件事做好就行

基因长度的计算

#计算基因长度
#思路1：计算基因在染色体的起始和结束之差
#思路2：计算每个基因的最长转录本或所有外显子之和
##############################################

#### 方法1 简单把基因在染色体上的起始位置和结束位置之差用作标准化的长度。

# Install the Bioconductor tooling only when it is missing, so that re-running
# the script does not reinstall biomaRt on every execution.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}
if (!requireNamespace("biomaRt", quietly = TRUE)) {
  BiocManager::install("biomaRt")
}

library(biomaRt)

# List the datasets offered by the Ensembl mart (exploratory step).
mart <- useMart("ensembl")
listDatasets(mart)

# Human is used as the example: connect to the human gene dataset.
bmart <- biomaRt::useMart(
  biomart = "ENSEMBL_MART_ENSEMBL",
  dataset = "hsapiens_gene_ensembl",
  host = "www.ensembl.org"
)

# Take the gene IDs from the count matrix. The matrix must already be loaded
# when this section runs (the script reads it from "raw_count.txt" later on),
# so fail with a clear message instead of "object 'expMatrix' not found".
if (!exists("expMatrix")) {
  stop("expMatrix is not defined; read the count matrix before computing gene lengths",
       call. = FALSE)
}
feature_ids <- rownames(expMatrix)

attributes <- c(
  "ensembl_gene_id",
  #"hgnc_symbol",
  "chromosome_name",
  "start_position",
  "end_position"
)
filters <- "ensembl_gene_id"

feature_info <- biomaRt::getBM(attributes = attributes,
                               filters = filters,
                               values = feature_ids, mart = bmart)

# Re-order the annotation to follow the expression matrix; genes that the
# query did not return become all-NA rows.
mm <- match(feature_ids, feature_info[[filters]])
feature_info_full <- feature_info[mm, ]
rownames(feature_info_full) <- feature_ids

# Gene length = span on the chromosome. Ensembl coordinates are 1-based and
# inclusive, hence the + 1.
eff_length <- abs(feature_info_full$end_position - feature_info_full$start_position) + 1
# Name by the requested IDs: the annotation column is NA for unmatched genes,
# and NA/duplicated names would be dropped when the vector is written out.
names(eff_length) <- feature_ids

found <- !is.na(eff_length)
if (!all(found)) {
  warning(sprintf("%i gene IDs were not returned by Ensembl and are left out of gene_length_1.csv",
                  sum(!found)))
}

# Write a data frame with an explicit `eff_length` column; writing the bare
# vector would name the column "x", which the TPM section cannot find.
gene_length_1 <- data.frame(eff_length = eff_length[found],
                            row.names = feature_ids[found])
write.csv(gene_length_1, "gene_length_1.csv", row.names = TRUE)

##############################################
#方法2 计算每个基因的最长转录本或所有外显子之和
#source("https://bioconductor.org/biocLite.R")
#biocLite("GenomicFeatures")
library(GenomicFeatures)
# Import a GTF or GFF3 annotation; files from either Ensembl or GENCODE work.
# Download link: ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_22/gencode.v22.annotation.gtf.gz
txdb <- makeTxDbFromGFF("gencode.v22.annotation.gtf", format = "gtf")
# Exons grouped by gene.
exons_gene <- exonsBy(txdb, by = "gene")

# Merge overlapping exons within each gene with reduce() and add up the
# widths. reduce()/width()/sum() work on the whole per-gene list at once, so
# no worker cluster is needed (workers would not have the Bioconductor
# packages loaded, and width()/reduce() would not be found there).
results <- sum(width(reduce(exons_gene)))

# One row per gene: the gene ID (names of the list) and its non-redundant
# exonic length.
gene_length2_test <- data.frame(
  gene_id = names(results),
  eff_length = as.vector(results),
  stringsAsFactors = FALSE
)

# Save the table that was just built (the original referenced an undefined
# object `gene_length2`). Row numbers are kept as the first CSV column because
# the FPKM section reads this file with row.names = 1.
write.csv(gene_length2_test, "gene_length_2.csv", row.names = TRUE)

counTOFPKM

### Read the raw count expression matrix
expMatrix <- read.table("raw_count.txt",
                        row.names = 1, header = TRUE, sep = "\t")
# Show the read counts of the first three genes
expMatrix[1:3, ]
# Example output (kept as a comment so the script stays parseable):
#                  sample1 sample2 sample3 sample4 sample5 sample6
# ENSG00000000003    2250    1450    1850    2136    5321    3729
# ENSG00000000005       4       1       3       3       1       2
# ENSG00000000419     512     889     701     569    1144     857

### Convert read counts to FPKM
# The row names of the expression matrix must match the gene IDs of the
# gene-length table; this step is essential.
eff_length <- read.csv("gene_length_2.csv", row.names = 1, header = TRUE)
# Name the columns first so that `gene_id` exists before it is used below.
colnames(eff_length) <- c("gene_id", "eff_length")
# Drop the version suffix (everything from the first dot on) so the IDs match
# the unversioned IDs used as row names of the expression matrix.
rownames(eff_length) <- sub("\\..*$", "", as.character(eff_length$gene_id))

# Gene IDs of the input expression matrix
feature_ids <- rownames(expMatrix)

# Check that the gene IDs of the length table and the expression matrix agree
in_annotation <- feature_ids %in% rownames(eff_length)
n_shared <- sum(in_annotation)
if (n_shared == 0L) {
  # Indexing a one-entry table with [[2]] would fail here with an obscure
  # "subscript out of bounds"; report the real problem instead.
  stop("No gene ID of the expression matrix is present in the gene length table",
       call. = FALSE)
}
if (!all(in_annotation)) {
  msg1 <- sprintf("%i gene is shared, %i gene is specified", n_shared, sum(!in_annotation))
  warning(msg1)
}

if (!identical(feature_ids, rownames(eff_length))) {
  msg2 <- sprintf("Given GTF file only contain %i gene, but experssion matrix has %i gene", nrow(eff_length), nrow(expMatrix))
  warning(msg2)
}

# Trim the expression matrix to the annotated genes and put the length table
# in the same row order.
expMatrix <- expMatrix[in_annotation, , drop = FALSE]
mm <- match(rownames(expMatrix), rownames(eff_length))
eff_length <- eff_length[mm, ]

if (identical(rownames(eff_length), rownames(expMatrix))) {
  print("GTF and expression matix now have the same gene and gene in same order")
}
#如果上面代码运行时有警告，主要是因为GTF里面的基因数少于表达矩阵，请换一个更新版本的GTF文件。为了让二者基因数量一致，会删减表达矩阵的行数（基因数）。

#countToFpkm函数
countToFpkm <- function(counts, effLen) {
  # Convert the read counts of one sample to FPKM.
  #   counts: numeric vector of read counts, one entry per gene
  #   effLen: gene lengths, in the same order as `counts`
  # Returns a numeric vector of FPKM values of the same length as `counts`.
  # The arithmetic is done in log space and exponentiated at the end.
  total_reads <- sum(counts)
  log_fpkm <- log(counts) + log(1e9) - log(effLen) - log(total_reads)
  exp(log_fpkm)
}

# Finally, run the code below to convert the read counts to FPKM.
# apply() runs countToFpkm() on every column (sample) of the count matrix.
fpkms <- apply(expMatrix, 2, countToFpkm, effLen = eff_length$eff_length)
fpkms.m <- data.frame(fpkms)
# data.frame() may alter column names; restore the original sample names.
colnames(fpkms.m) <- colnames(expMatrix)
dim(fpkms.m)
# Example output (kept as a comment so the script stays parseable):
#  56830     6

# Show the FPKM values of the first three genes
fpkms.m[1:3, ]
# Example output:
#                     sample1     sample2     sample3     sample4
# ENSG00000000003 16.69071412 12.89858359 14.96212407 10.90373371
# ENSG00000000005  0.08358028  0.02505679  0.06834302  0.04313667
# ENSG00000000419 14.27027633 29.71295208 21.30146940 10.91330458
#                     sample5     sample6
# ENSG00000000003 17.90399004 15.12281791
# ENSG00000000005  0.00947781  0.02284661
# ENSG00000000419 14.46280779 13.05843654

# Save the FPKM table. It is written to the working directory because the
# FPKM-to-TPM section reads it back from "./output_count2fpkm.txt"; the
# previous user-specific absolute path only matched that on one machine.
write.table(fpkms.m, "./output_count2fpkm.txt", sep = "\t", quote = FALSE, row.names = TRUE)

fpkms.m

counTOTPM

## Read the raw count expression matrix
expMatrix <- read.table("raw_count.txt",
                        row.names = 1, header = TRUE, sep = "\t")
expMatrix[1:3, ]
# Example output (kept as a comment so the script stays parseable):
#                 sample1 sample2 sample3 sample4 sample5 sample6
# ENSG00000000003    2250    1450    1850    2136    5321    3729
# ENSG00000000005       4       1       3       3       1       2
# ENSG00000000419     512     889     701     569    1144     857


### Convert read counts to TPM
# The row names of the expression matrix must match the gene IDs of the
# gene-length table; this step is essential.

eff_length <- read.csv("gene_length_1.csv", row.names = 1, header = TRUE)
# When gene_length_1.csv was produced by writing a bare named vector, its
# single value column is called "x". The TPM computation reads
# `eff_length$eff_length`, so give that column the expected name.
if (!"eff_length" %in% colnames(eff_length) && ncol(eff_length) == 1L) {
  colnames(eff_length) <- "eff_length"
}
eff_length$gene_id <- rownames(eff_length)
# Drop any version suffix (everything from the first dot on) from the IDs.
rownames(eff_length) <- sub("\\..*$", "", eff_length$gene_id)

feature_ids <- rownames(expMatrix)

# Check that the gene IDs of the length table and the expression matrix agree
in_annotation <- feature_ids %in% rownames(eff_length)
n_shared <- sum(in_annotation)
if (n_shared == 0L) {
  # Indexing a one-entry table with [[2]] would fail here with an obscure
  # "subscript out of bounds"; report the real problem instead.
  stop("No gene ID of the expression matrix is present in the gene length table",
       call. = FALSE)
}
if (!all(in_annotation)) {
  msg1 <- sprintf("%i gene is shared, %i gene is specified", n_shared, sum(!in_annotation))
  warning(msg1)
}

if (!identical(feature_ids, rownames(eff_length))) {
  msg2 <- sprintf("Given GTF file only contain %i gene, but experssion matrix has %i gene", nrow(eff_length), nrow(expMatrix))
  warning(msg2)
}

# Trim the expression matrix to the annotated genes and put the length table
# in the same row order.
expMatrix <- expMatrix[in_annotation, , drop = FALSE]
mm <- match(rownames(expMatrix), rownames(eff_length))
eff_length <- eff_length[mm, ]

if (identical(rownames(eff_length), rownames(expMatrix))) {
  print("GTF and expression matix now have the same gene and gene in same order")
}

# Compute TPM
# Guard against a length table without a usable `eff_length` column; dividing
# by NULL would otherwise fail far from the real cause.
stopifnot(is.numeric(eff_length$eff_length),
          length(eff_length$eff_length) == nrow(expMatrix))
# Length-normalise every gene (rows), then scale each sample (column) so that
# it sums to one million.
x <- expMatrix / eff_length$eff_length
expMatrix_tpm <- t(t(x) / colSums(x)) * 1e6
# Sanity check: every column should sum to 1e6
colSums(expMatrix_tpm)
# Example output (kept as a comment so the script stays parseable):
# sample1 sample2 sample3 sample4 sample5 sample6
#   1e+06   1e+06   1e+06   1e+06   1e+06   1e+06

# Save the TPM table
# NOTE(review): this is a user-specific absolute path; confirm it before
# running on another machine.
write.table(expMatrix_tpm, "/Users/apple/Desktop/output_count2tpm.txt", sep = "\t", quote = FALSE, row.names = TRUE)

expMatrix_tpm

FPKMToTPM

# Clear the workspace before starting this section. The original line used
# full-width parentheses, which R cannot parse.
rm(list = ls())
## Read the FPKM expression matrix
expMatrix <- read.table("./output_count2fpkm.txt", header = TRUE, row.names = 1)
# Show the FPKM values of the first three genes
expMatrix[1:3, ]
# Example output (kept as a comment so the script stays parseable):
#                     sample1     sample2     sample3     sample4
# ENSG00000000003 16.69071412 12.89858359 14.96212407 10.90373371
# ENSG00000000005  0.08358028  0.02505679  0.06834302  0.04313667
# ENSG00000000419 14.27027633 29.71295208 21.30146940 10.91330458
#                     sample5     sample6
# ENSG00000000003 17.90399004 15.12281791
# ENSG00000000005  0.00947781  0.02284661
# ENSG00000000419 14.46280779 13.05843654

### FPKM转TPM
fpkmToTpm <- function(fpkm) {
  # Rescale the FPKM values of one sample to TPM.
  #   fpkm: numeric vector of FPKM values, one entry per gene
  # Returns a numeric vector of the same length: each value divided by the
  # sample total and multiplied by one million, computed in log space.
  log_total <- log(sum(fpkm))
  log_tpm <- log(fpkm) - log_total + log(1e6)
  exp(log_tpm)
}

# Compute the TPM values column by column (one column per sample) and keep
# them in `tpms`
tpms <- apply(expMatrix, 2, fpkmToTpm)

# Show the TPM values of the first three genes (the article notes that they
# agree with the TPM values computed from the counts earlier). The remark was
# fused onto the statement in the original, which made the line unparseable.
tpms[1:3, ]
# Example output (kept as a comment so the script stays parseable):
#                    sample1     sample2    sample3    sample4
# ENSG00000000003 54.7479721 38.98519367 47.2173751 30.6182682
# ENSG00000000005  0.2741555  0.07573264  0.2156765  0.1211301
# ENSG00000000419 46.8085838 89.80561184 67.2230403 30.6451437
#                    sample5     sample6
# ENSG00000000003 55.3028651 44.91547454
# ENSG00000000005  0.0292756  0.06785549
# ENSG00000000419 44.6735452 38.78416559

# Sanity check: every column should sum to 1e6
colSums(tpms)
# Example output:
# sample1 sample2 sample3 sample4 sample5 sample6
#   1e+06   1e+06   1e+06   1e+06   1e+06   1e+06

# Save the TPM values to a file
write.table(tpms, "./FPKM2TPM.genes.txt", sep = "\t", quote = FALSE, row.names = TRUE)

2021-06-27 count_TPM_FPKM转换
基因长度的计算 counTOFPKM counTOTPM FPKMToTPM
210627承传公园
2021-06-27 大洋边城承传公园（Heritage Park, Oceanside, California）...
2022-07-03 思考成长周复盘（第106/130周）
一时间 2021-06-27 ~ 2022-07-03 二行动回顾与反思 1 生活作息（2022年度早起目标：...
2021-06-27 思考成长周复盘
一时间 2021-06-22 ~ 2021-06-27 二围绕精力提升所做的事情简书日更写作低脂纯素饮食，...
南湖之行
2021-06-27 周日晴辛丑农历五月十八日，应好友相邀，移步南湖。烈日炎炎，非阳伞能挡。然湖中...
20210703《梦圆大地：袁隆平传》
缘起 2021-06-27从江宁图书馆借到的，第1个番茄钟1-18/202，全书计划10个。 2021-07-03...
我一定是把老公气急了
幸福日志2021-06-27 周日阴雨我一定是把老公气急了，他这么理智的人都能说出，“我不跟你过了。” 虽然我是...
【餐饮100问】28.美团禁止二选一？假的！
Day239 2021-06-27 休息日，在家陪伴家人。今天想来吐槽一下美团外卖。前段时间给商家更新了新的合同...
#Dairy235 ?!
2021-06-27 晴周日没干啥，睡了两觉。九点挣扎起来和AIESEC市场部顾问，互相认识一下。一个二十岁...
把一件事做好就行
日记850篇 2021-06-27 今天听小娥说，有个做家政公司的女老板，之前做了5年月嫂，后来出来自己做公司，短...

2021-06-27 count_TPM_FPKM转换

基因长度的计算

counTOFPKM

counTOTPM

FPKMToTPM

相关文章

2021-06-27 count_TPM_FPKM转换

210627承传公园

2022-07-03 思考成长周复盘（第106/130周）

2021-06-27 思考成长周复盘

南湖之行

20210703《梦圆大地：袁隆平传》

我一定是把老公气急了

【餐饮100问】28.美团禁止二选一？假的！

#Dairy235 ?!

把一件事做好就行

网友评论

延伸阅读

深度阅读

栏目导航

热点阅读

注释和富集