TranscriptID转GeneID

作者: 裁尘的人儿 | 来源:发表于2019-10-14 20:37 被阅读0次

x=data.table::fread("Transcripts.readscount.xls",header=T,sep="\t")
y=as.matrix(x,rownames=1)#带rownames参数值的as.matrix() 转换后不会将dataframe里的数值变成character

#将行名转换成第一列
y1=y
rownames(y1)=NULL
y1=data.frame(transcript_id=rownames(y),y1)

map=read.table("selected_probemap.xls",header=T)

#selected_transcript_type=df[df %in% c("IG_C_gene", "IG_D_gene", "IG_J_gene", "IG_LV_gene", 
#                                      +              "IG_M_gene", "IG_V_gene", "IG_Z_gene", 
#                                     +              "nonsense_mediated_decay", "nontranslating_CDS", 
#                                     +              "non_stop_decay", "polymorphic_pseudogene", 
#                                     +              "protein_coding", "TR_C_gene", "TR_D_gene", "TR_gene", 
#                                     +              "TR_J_gene", "TR_V_gene")]

#selected_transcript_type=df[df %in% c("IG_C_gene", "IG_D_gene", "IG_J_gene", "IG_LV_gene","IG_M_gene", "IG_V_gene", "IG_Z_gene","nonsense_mediated_decay", "nontranslating_CDS","non_stop_decay", "polymorphic_pseudogene", "protein_coding", "TR_C_gene", "TR_D_gene", "TR_gene", "TR_J_gene", "TR_V_gene")]



#从当前文件的转录本类型中挑选出免疫相关的转录本类型
df=unique(map$transcript_type)
#length(df)
#22
immune_transcript_type=c("IG_C_gene", "IG_D_gene", "IG_J_gene", "IG_LV_gene","IG_M_gene", "IG_V_gene", "IG_Z_gene","nonsense_mediated_decay", "nontranslating_CDS","non_stop_decay", "polymorphic_pseudogene", "protein_coding", "TR_C_gene", "TR_D_gene", "TR_gene", "TR_J_gene", "TR_V_gene")
selected_transcript_type=df[df %in%  immune_transcript_type]
#length(selected_transcript_type)
#[1] 12

#[1] protein_coding          nonsense_mediated_decay non_stop_decay         
#[4] polymorphic_pseudogene  IG_V_gene               IG_C_gene              
#[7] IG_J_gene               TR_C_gene               TR_J_gene              
#[10] TR_V_gene               TR_D_gene               IG_D_gene 


#把这些免疫转录本类型ID挑出来
map=map[map$transcript_type %in% selected_transcript_type,]
length(unique(map1$transcript_type))
#12


table(rownames(y) %in% map$transcript_id) 
#FALSE   TRUE 
#56607 152085 

#在挑出免疫相关的转录本ID(即成为参考转录本id库)后,统计表达谱里有多少个转录本id不在参考转录本id库里
#发现有56607个转录本ID不在ID库里,要将这些ID过滤删除掉
exprset=y[rownames(y) %in% map$transcript_id,]
dim(exprset)
#[1] 152085   7028


#接下来,ID与symbol匹配
map=map[match(rownames(exprset),map$transcript_id),]#match(a,b):返回a中元素在b中的位置
tmp=by(exprset,map$gene_name,function(x) {colSums(x)})
dim(tmp)
#20721
length(tmp[[1]])
#7028
names(tmp)#symbol names
names(tmp[[2]])#sample id

#tmp is list,need to be changed into matrix
eset=matrix(0,20721,7028)
rownames(eset)=names(tmp)
colnames(eset)=names(tmp[[2]])
for(i in 1:nrow(eset)){
  eset[i,]=tmp[[i]]
}
write.csv(eset,"transcript_reads_count.txt")

网友评论

本文标题：TranscriptID转GeneID

本文链接：https://www.haomeiwen.com/subject/jxsdmctx.html

延伸阅读

深度阅读

您也可以注册成为美文阅读网的作者，发表您的原创作品、分享您的心情！

TranscriptID转GeneID

相关文章

网友评论

延伸阅读

深度阅读

栏目导航

热点阅读