x=data.table::fread("Transcripts.readscount.xls",header=T,sep="\t")
y=as.matrix(x,rownames=1)#带rownames参数值的as.matrix() 转换后不会将dataframe里的数值变成character
#将行名转换成第一列
y1=y
rownames(y1)=NULL
y1=data.frame(transcript_id=rownames(y),y1)
map=read.table("selected_probemap.xls",header=T)
#selected_transcript_type=df[df %in% c("IG_C_gene", "IG_D_gene", "IG_J_gene", "IG_LV_gene",
# + "IG_M_gene", "IG_V_gene", "IG_Z_gene",
# + "nonsense_mediated_decay", "nontranslating_CDS",
# + "non_stop_decay", "polymorphic_pseudogene",
# + "protein_coding", "TR_C_gene", "TR_D_gene", "TR_gene",
# + "TR_J_gene", "TR_V_gene")]
#selected_transcript_type=df[df %in% c("IG_C_gene", "IG_D_gene", "IG_J_gene", "IG_LV_gene","IG_M_gene", "IG_V_gene", "IG_Z_gene","nonsense_mediated_decay", "nontranslating_CDS","non_stop_decay", "polymorphic_pseudogene", "protein_coding", "TR_C_gene", "TR_D_gene", "TR_gene", "TR_J_gene", "TR_V_gene")]
#从当前文件的转录本类型中挑选出免疫相关的转录本类型
df=unique(map$transcript_type)
#length(df)
#22
immune_transcript_type=c("IG_C_gene", "IG_D_gene", "IG_J_gene", "IG_LV_gene","IG_M_gene", "IG_V_gene", "IG_Z_gene","nonsense_mediated_decay", "nontranslating_CDS","non_stop_decay", "polymorphic_pseudogene", "protein_coding", "TR_C_gene", "TR_D_gene", "TR_gene", "TR_J_gene", "TR_V_gene")
selected_transcript_type=df[df %in% immune_transcript_type]
#length(selected_transcript_type)
#[1] 12
#[1] protein_coding nonsense_mediated_decay non_stop_decay
#[4] polymorphic_pseudogene IG_V_gene IG_C_gene
#[7] IG_J_gene TR_C_gene TR_J_gene
#[10] TR_V_gene TR_D_gene IG_D_gene
#把这些免疫转录本类型ID挑出来
map=map[map$transcript_type %in% selected_transcript_type,]
length(unique(map1$transcript_type))
#12
table(rownames(y) %in% map$transcript_id)
#FALSE TRUE
#56607 152085
#在挑出免疫相关的转录本ID(即成为参考转录本id库)后,统计表达谱里有多少个转录本id不在参考转录本id库里
#发现有56607个转录本ID不在ID库里,要将这些ID过滤删除掉
exprset=y[rownames(y) %in% map$transcript_id,]
dim(exprset)
#[1] 152085 7028
#接下来,ID与symbol匹配
map=map[match(rownames(exprset),map$transcript_id),]#match(a,b):返回a中元素在b中的位置
tmp=by(exprset,map$gene_name,function(x) {colSums(x)})
dim(tmp)
#20721
length(tmp[[1]])
#7028
names(tmp)#symbol names
names(tmp[[2]])#sample id
#tmp is list,need to be changed into matrix
eset=matrix(0,20721,7028)
rownames(eset)=names(tmp)
colnames(eset)=names(tmp[[2]])
for(i in 1:nrow(eset)){
eset[i,]=tmp[[i]]
}
write.csv(eset,"transcript_reads_count.txt")
网友评论