美文网首页
TranscriptID转GeneID

TranscriptID转GeneID

作者: 裁尘的人儿 | 来源:发表于2019-10-14 20:37 被阅读0次
    x=data.table::fread("Transcripts.readscount.xls",header=T,sep="\t")
    y=as.matrix(x,rownames=1)#带rownames参数值的as.matrix() 转换后不会将dataframe里的数值变成character
    
    #将行名转换成第一列
    y1=y
    rownames(y1)=NULL
    y1=data.frame(transcript_id=rownames(y),y1)
    
    map=read.table("selected_probemap.xls",header=T)
    
    #selected_transcript_type=df[df %in% c("IG_C_gene", "IG_D_gene", "IG_J_gene", "IG_LV_gene", 
    #                                      +              "IG_M_gene", "IG_V_gene", "IG_Z_gene", 
    #                                     +              "nonsense_mediated_decay", "nontranslating_CDS", 
    #                                     +              "non_stop_decay", "polymorphic_pseudogene", 
    #                                     +              "protein_coding", "TR_C_gene", "TR_D_gene", "TR_gene", 
    #                                     +              "TR_J_gene", "TR_V_gene")]
    
    #selected_transcript_type=df[df %in% c("IG_C_gene", "IG_D_gene", "IG_J_gene", "IG_LV_gene","IG_M_gene", "IG_V_gene", "IG_Z_gene","nonsense_mediated_decay", "nontranslating_CDS","non_stop_decay", "polymorphic_pseudogene", "protein_coding", "TR_C_gene", "TR_D_gene", "TR_gene", "TR_J_gene", "TR_V_gene")]
    
    
    
    #从当前文件的转录本类型中挑选出免疫相关的转录本类型
    df=unique(map$transcript_type)
    #length(df)
    #22
    immune_transcript_type=c("IG_C_gene", "IG_D_gene", "IG_J_gene", "IG_LV_gene","IG_M_gene", "IG_V_gene", "IG_Z_gene","nonsense_mediated_decay", "nontranslating_CDS","non_stop_decay", "polymorphic_pseudogene", "protein_coding", "TR_C_gene", "TR_D_gene", "TR_gene", "TR_J_gene", "TR_V_gene")
    selected_transcript_type=df[df %in%  immune_transcript_type]
    #length(selected_transcript_type)
    #[1] 12
    
    #[1] protein_coding          nonsense_mediated_decay non_stop_decay         
    #[4] polymorphic_pseudogene  IG_V_gene               IG_C_gene              
    #[7] IG_J_gene               TR_C_gene               TR_J_gene              
    #[10] TR_V_gene               TR_D_gene               IG_D_gene 
    
    
    #把这些免疫转录本类型ID挑出来
    map=map[map$transcript_type %in% selected_transcript_type,]
    length(unique(map1$transcript_type))
    #12
    
    
    table(rownames(y) %in% map$transcript_id) 
    #FALSE   TRUE 
    #56607 152085 
    
    #在挑出免疫相关的转录本ID(即成为参考转录本id库)后,统计表达谱里有多少个转录本id不在参考转录本id库里
    #发现有56607个转录本ID不在ID库里,要将这些ID过滤删除掉
    exprset=y[rownames(y) %in% map$transcript_id,]
    dim(exprset)
    #[1] 152085   7028
    
    
    #接下来,ID与symbol匹配
    map=map[match(rownames(exprset),map$transcript_id),]#match(a,b):返回a中元素在b中的位置
    tmp=by(exprset,map$gene_name,function(x) {colSums(x)})
    dim(tmp)
    #20721
    length(tmp[[1]])
    #7028
    names(tmp)#symbol names
    names(tmp[[2]])#sample id
    
    #tmp is list,need to be changed into matrix
    eset=matrix(0,20721,7028)
    rownames(eset)=names(tmp)
    colnames(eset)=names(tmp[[2]])
    for(i in 1:nrow(eset)){
      eset[i,]=tmp[[i]]
    }
    write.csv(eset,"transcript_reads_count.txt")
    

    相关文章

      网友评论

          本文标题:TranscriptID转GeneID

          本文链接:https://www.haomeiwen.com/subject/jxsdmctx.html