基因长度的计算
#计算基因长度
#思路1:计算基因在染色体的起始和结束之差
#思路2:计算每个基因的最长转录本或所有外显子之和
##############################################
#### 方法1 简单把基因在染色体上的起始位置和结束位置之差用作标准化的长度。
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("biomaRt")
library(biomaRt)
#查看基因组参数
mart = useMart('ensembl')
listDatasets(mart)
#以人类为例 获取基因组信息
bmart <- biomaRt::useMart(biomart = "ENSEMBL_MART_ENSEMBL",
dataset = "hsapiens_gene_ensembl",
host = "www.ensembl.org")
# 从输入数据里提取基因名
feature_ids <- rownames(expMatrix)
attributes = c(
"ensembl_gene_id",
#"hgnc_symbol",
"chromosome_name",
"start_position",
"end_position"
)
filters = "ensembl_gene_id"
feature_info <- biomaRt::getBM(attributes = attributes,
filters = filters,
values = feature_ids, mart = bmart)
mm <- match(feature_ids, feature_info[[filters]])
feature_info_full <- feature_info[mm, ]
rownames(feature_info_full) <- feature_ids
# 计算基因的有效长度eff_length
eff_length <- abs(feature_info_full$end_position - feature_info_full$start_position)
names(eff_length) <- feature_info_full$ensembl_gene_id
write.csv(eff_length, "gene_length_1.csv", row.names = TRUE)
##############################################
#方法2 计算每个基因的最长转录本或所有外显子之和
#source("https://bioconductor.org/biocLite.R")
#biocLite("GenomicFeatures")
library(GenomicFeatures)
#导入GTF 或者GFF3文件,ensembl或者gencode网站GTF注释皆可
#下载链接ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_22/gencode.v22.annotation.gtf.gz
txdb <- makeTxDbFromGFF("gencode.v22.annotation.gtf",format="gtf")
#获取外显子
exons_gene <- exonsBy(txdb, by = "gene")
##并行计算
#install.packages('parallel')
library(parallel)
#检测核心数
cores<-detectCores(logical = F)
#设定个核心
cl <- makeCluster(cores)
#对外显子重叠部分通过reduce 去冗余,并计算总长度
results <- parLapply(cl,exons_gene,function(x){sum(width(reduce(x)))})
stopCluster(cl)#停止
gene_length2_test <- do.call(rbind,lapply(results, data.frame))
#或者plyr得到结果
#install.packages('plyr')
library (plyr)
gene_length2_test<- ldply(results, data.frame)
colnames(gene_length2_test)<-c('gene_id','eff_length')
write.csv(gene_length2, "gene_length_2.csv", row.names = TRUE)
counTOFPKM
###读取Count表达矩阵
expMatrix <- read.table("raw_count.txt",
row.names = 1, header = TRUE, sep="\t")
#查看前三个基因的read count
expMatrix[1:3,]
sample1 sample2 sample3 sample4 sample5 sample6
ENSG00000000003 2250 1450 1850 2136 5321 3729
ENSG00000000005 4 1 3 3 1 2
ENSG00000000419 512 889 701 569 1144 857
### read count转FPKM
#要保证表达矩阵的行名和存放基因长度向量的名字一致, 这一步非常重要。
eff_length <-read.csv("gene_length_2.csv", row.names = 1, header = T)
rownames(eff_length)<-eff_length$gene_id
colnames(eff_length)<-c("gene_id","eff_length")
rownames(eff_length) <- do.call(rbind,strsplit(as.character(eff_length$gene_id),'\\.'))[,1]
# 从输入数据里提取基因名
feature_ids <- rownames(expMatrix)
# 检查gtf eff_length文件和表达量输入文件里基因名的一致性
if (! all(feature_ids %in% rownames(eff_length))){
tbl <- table(feature_ids %in% rownames(eff_length))
msg1 <- sprintf("%i gene is shared, %i gene is specified", tbl[[2]],tbl[[1]])
warning(msg1)
}
if (! identical(feature_ids, rownames(eff_length))){
msg2 <- sprintf("Given GTF file only contain %i gene, but experssion matrix has %i gene", nrow(eff_length), nrow(expMatrix))
warning(msg2)
}
# trim the expression matrix and effetive gene length
expMatrix <- expMatrix[feature_ids %in% rownames(eff_length),]
mm <- match(rownames(expMatrix), rownames(eff_length))
eff_length <- eff_length[mm, ]
if (identical(rownames(eff_length), rownames(expMatrix))){
print("GTF and expression matix now have the same gene and gene in same order")
}
#如果上面代码运行时有警告,主要是因为GTF里面的基因数少于表达矩阵,请换一个更新版本的GTF文件。为了让二者基因数量一致,会删减表达矩阵的行数(基因数)。
#countToFpkm函数
countToFpkm <- function(counts, effLen){
N <- sum(counts)
exp( log(counts) + log(1e9) - log(effLen) - log(N) )
}
#最后执行下面的代码,从read count转成FPKM:
fpkms <- apply(expMatrix, 2, countToFpkm, effLen = eff_length$eff_length)
fpkms.m<-data.frame(fpkms)
colnames(fpkms.m)<-colnames(expMatrix)
dim(fpkms.m)
56830 6
#查看前三个基因的FPKM值
fpkms.m[1:3,]
sample1 sample2 sample3 sample4
ENSG00000000003 16.69071412 12.89858359 14.96212407 10.90373371
ENSG00000000005 0.08358028 0.02505679 0.06834302 0.04313667
ENSG00000000419 14.27027633 29.71295208 21.30146940 10.91330458
sample5 sample6
ENSG00000000003 17.90399004 15.12281791
ENSG00000000005 0.00947781 0.02284661
ENSG00000000419 14.46280779 13.05843654
#把算好的FPKM保存到本地
write.table(fpkms.m, "/Users/apple/Desktop/output_count2fpkm.txt", sep="\t", quote=F, row.names=T)
fpkms.m
counTOTPM
##读取Count表达矩阵
expMatrix <- read.table("raw_count.txt",
row.names = 1, header = TRUE, sep="\t")
expMatrix[1:3,]
sample1 sample2 sample3 sample4 sample5 sample6
ENSG00000000003 2250 1450 1850 2136 5321 3729
ENSG00000000005 4 1 3 3 1 2
ENSG00000000419 512 889 701 569 1144 857
### read count转TPM
#首先要保证表达矩阵的行名和存放基因长度向量的名字一致, 这一步非常重要
eff_length <-read.csv("gene_length_1.csv", row.names = 1, header = T)
eff_length$gene_id <- rownames(eff_length)
rownames(eff_length) <- do.call(rbind,strsplit(eff_length$gene_id,'\\.'))[,1]
feature_ids <- rownames(expMatrix)
if (! all(feature_ids %in% rownames(eff_length))){
tbl <- table(feature_ids %in% rownames(eff_length))
msg1 <- sprintf("%i gene is shared, %i gene is specified", tbl[[2]],tbl[[1]])
warning(msg1)
}
if (! identical(feature_ids, rownames(eff_length))){
msg2 <- sprintf("Given GTF file only contain %i gene, but experssion matrix has %i gene", nrow(eff_length), nrow(expMatrix))
warning(msg2)
}
# trim the expression matrix and effetive gene length
expMatrix <- expMatrix[feature_ids %in% rownames(eff_length),]
mm <- match(rownames(expMatrix), rownames(eff_length))
eff_length <- eff_length[mm, ]
if (identical(rownames(eff_length), rownames(expMatrix))){
print("GTF and expression matix now have the same gene and gene in same order")
}
#计算TPM
x <- expMatrix / eff_length$eff_length
expMatrix_tpm <- t( t(x) / colSums(x) ) * 1e6
#检查一下,是不是每列的总和都是1
colSums(expMatrix_tpm)
sample1 sample2 sample3 sample4 sample5 sample6
1e+06 1e+06 1e+06 1e+06 1e+06 1e+06
#把算好的TPM保存到本地
write.table(expMatrix_tpm, "/Users/apple/Desktop/output_count2tpm.txt", sep="\t", quote=F, row.names=T)
expMatrix_tpm
FPKMToTPM
rm(list=ls())
##读取FPKM表达矩阵
expMatrix<-read.table("./output_count2fpkm.txt",header = T,row.names = 1)
#查看前三个基因的FPKM值
expMatrix[1:3,]
sample1 sample2 sample3 sample4
ENSG00000000003 16.69071412 12.89858359 14.96212407 10.90373371
ENSG00000000005 0.08358028 0.02505679 0.06834302 0.04313667
ENSG00000000419 14.27027633 29.71295208 21.30146940 10.91330458
sample5 sample6
ENSG00000000003 17.90399004 15.12281791
ENSG00000000005 0.00947781 0.02284661
ENSG00000000419 14.46280779 13.05843654
### FPKM转TPM
fpkmToTpm <- function(fpkm)
{
exp(log(fpkm) - log(sum(fpkm)) + log(1e6))
}
#计算TPM值,保存到tpms里
tpms <- apply(expMatrix,2,fpkmToTpm)
#查看前三个基因的TPM值
tpms[1:3,](可看出和前面计算的TPM一致)
sample1 sample2 sample3 sample4
ENSG00000000003 54.7479721 38.98519367 47.2173751 30.6182682
ENSG00000000005 0.2741555 0.07573264 0.2156765 0.1211301
ENSG00000000419 46.8085838 89.80561184 67.2230403 30.6451437
sample5 sample6
ENSG00000000003 55.3028651 44.91547454
ENSG00000000005 0.0292756 0.06785549
ENSG00000000419 44.6735452 38.78416559
#检查一下,是不是每列的总和都是1
colSums(tpms)
sample1 sample2 sample3 sample4 sample5 sample6
1e+06 1e+06 1e+06 1e+06 1e+06 1e+06
#把TPM值保存到文件
write.table(tpms,"./FPKM2TPM.genes.txt",sep="\t", quote=F, row.names=T)
网友评论