###计算RPKM值
##读入gff3文件
genecode<-read.table(file = "C:\\Users\\zhouwenqing789\\Desktop\\乳腺癌TCGA数据库20191226\\gencode.v22.long_noncoding_RNAs.gff3" )
g_len<-genecode[genecode[,3]=="gene",]
annogene<-data.frame(matrix(NA,nrow(g_len),3))
colnames(annogene)<-c("ENSG","symbol","gene_length")
lncname<-str_split(g_len[,9],";")
library(stringr)
dim(annogene)
tail(annogene)
class(annogene)
head(annogene)
for(j in 1:length(lncname)){
annogene[j,1]=substring(lncname[[j]][1],4,)
annogene[j,2]=substring(lncname[[j]][5],11,)
}
##注意每个基因id不一样,而且其中每个转录本的id又不一样,但每个转录本下的外显子id一样,给了我们提取外显子的思路
##首先选出一个基因id下的所有外显子和转录本(共一个基因id),再根据转录本id,选取相应的一个转录本和外显子,再删除转录本。
ci<-1
for(len in 1:as.numeric(length(annogene[,1]))){
ll<-str_detect(genecode[,9],annogene[len,1])
llen<-genecode[ll,]
llen<-llen[llen[,3]!="gene",]
lnc_n<-str_split(llen[1,9],";")
t_name<-substring(lnc_n[[1]][1],4,)
befor_len<-genecode[str_detect(genecode[,9],t_name),][-1,c(4:5)]
g_length<-sum(befor_len[,2]-befor_len[,1])
annogene[len,3]<-g_length
if(ci%%100==0){
print(paste("获得了",ci,"个lncrna",sep=""))
}
ci=ci+1
}
write.csv(annogene,file = "C:\\Users\\zhouwenqing789\\Desktop\\乳腺癌TCGA数据库20191226\\annogenelncrna.csv",row.names = F, quote = F)
annogene$ensg<-substr(annogene$ENSG,1,15)
annogene<-annogene[,-1]
rawcountlncrna1<-rawcountlncrna*10^9
dim(sample_merge20[annogene$ensg%in%row.names(sample_merge20),])
##真正的开始计算RPKM值
sampl_mergelnc<-sample_merge20[annogene$ensg%in%row.names(sample_merge20),]
shendu<-apply(sample_merge20,2,sum)
rawcountlncrna<-t(rawcountlncrna)
annogene1<-annogene[annogene$ensg%in%row.names(rawcountlncrna),]
dim(annogene1)
annogene1<-annogene1[order(annogene1$ensg),]
rawcountlncrna1<-cbind(rawcountlncrna,annogene1$gene_length)
rpkm<-rawcountlncrna1
for(len in 1:ncol(rpkm)){
rpkm[,len]<-rpkm[,len]*10^9/shendu[len]/rpkm[,ncol(rpkm)]
}
网友评论