美文网首页
转录组下游分析

转录组下游分析

作者: 略略咯 | 来源:发表于2022-11-08 10:54 被阅读0次

    (linux)上游分析得到count矩阵,进行下游分析(R)
    流程:各组count合并矩阵--去除Ensemble ID版本号(.)--去重过滤--ID转换(1、AnnoProbe包annoGene函数转换 2clusterProfiler包bitr函数转换)--

    # 清空环境
    rm(list = ls())
    ## 报错设置为英文
    Sys.setenv(LANGUAGE = "en")
    
    ######2.1. 软件准备
    if (!requireNamespace("BiocManager", quietly = TRUE))install.packages("BiocManager")
    if (!requireNamespace("stringr", quietly = TRUE))install.packages("stringr")
    if (!requireNamespace("biomaRt", quietly = TRUE))install.packages("biomaRt")
    if (!requireNamespace("filelock", quietly = TRUE))install.packages("filelock")
    if (!requireNamespace("AnnotationDbi", quietly = TRUE))install.packages("AnnotationDbi")
    if (!requireNamespace("clusterProfiler", quietly = TRUE))install.packages("clusterProfiler")
    if (!requireNamespace("org.Hs.eg.db", quietly = TRUE))install.packages("org.Hs.eg.db")
    if (!requireNamespace("AnnoProbe", quietly = TRUE))install.packages("AnnoProbe")
    
    ######2.2.多个count矩阵的合并(merge)
    setwd("D:\\fsdownload\\学校服务器\\psj\\SW620 LOVO 测序数据\\SW620\\bam文件")
    v1<- read.table("SW620-V-1.txt",header = TRUE,sep = "\t")
    v2<- read.table("SW620-V-2.txt",header = TRUE,sep = "\t")
    v3<- read.table("SW620-V-3.txt",header = TRUE,sep = "\t")
    Con_count<- merge(merge(v1,v2,by="Geneid"),v3,by="Geneid")
    x1<- read.table("SW620-82-1.txt",header = TRUE,sep = "\t")
    x2<- read.table("SW620-82-2.txt",header = TRUE,sep = "\t")
    x3<- read.table("SW620-82-3.txt",header = TRUE,sep = "\t")
    UL82_count<- merge(merge(x1,x2,by="Geneid"),x3,by="Geneid")
    counts <- merge(Con_count,UL82_count,by="Geneid")#merge函数通过相同的Geneid列合并矩阵
    names(counts)<-c("ENSEMBL","Con_1","Con_2","Con_3","UL82_1","UL82_2","UL82_3")#数据框name()即列名
    ##简化
    #counts <- merge(merge(merge(v1,v2,by="Geneid"),v3,by="Geneid"),merge(merge(x1,x2,by="Geneid"),x3,by="Geneid"),by="Geneid")
    #names(counts)<-c("ENSEMBL","Con_1","Con_2","Con_3","UL82_1","UL82_2","UL82_3")
    ##
    
    ######2.3. 去除Ensemble ID版本号
    counts$ENSEMBL=unlist(str_split(counts$ENSEMBL,"[.]",simplify=T))[,1]###!!!
    #counts$ENSEMBL列根据字符串的点号进行分割。
    #a = str_split(a,"\\.",simplify = T)[,1] #不带\\直接写"."是正则表达式任意字符的意思,需要\\来转义。
    
    ######2.4.去重:保留平均值大的重复行
    dim(counts)
    counts <- na.omit(counts)#删除NA值  
    counts<- counts[apply(counts,1,max)>=10, ]#剔除count值小于10的行
    dup_gene<-data.frame(gene=counts$ENSEMBL,mean=apply(counts[,2:7],1,mean))#创建矩阵,计算每行的平均值(不能包括第一列)
    counts1 <-counts[order(dup_gene$mean,decreasing = T),]#按平均值降序排列
    counts1<- counts1[!duplicated(counts1 $ENSEMBL),]#保留平均值大的重复行
    dim(counts1)
    counts1 <- counts1[order(counts1 $ENSEMBL),]
    setwd("D:\\fsdownload\\学校服务器\\psj\\SW620 LOVO 测序数据\\SW620")
    write.table(counts1,file=".\\sw620-counts.txt" , sep ="\t", row.names =F,col.names =T,quote = FALSE)
    write.csv(counts1,file=".\\sw620-counts.csv",row.names = F)
    
    #######2.5 ID转换(如果提取蛋白编码基因,建议用AnnoProbe包)
    ###2.5.1. 基因类型注释:AnnoProbe包annoGene函数提取protein_coding基因,利用自带gene_symbol完成ID转换
    library(AnnoProbe)
    IDs <- counts1$ENSEMBL
    ID_type = "ENSEMBL"
    type <- annoGene(IDs, ID_type = "ENSEMBL",out_file ='ID_biotypes.csv')
    
    
    
    ![cdbf2c0d47d1641dd7ca250404e5035.png](https://img.haomeiwen.com/i28015006/c48c3b201c17b871.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
    
    #提取protein_coding基因
    library(dplyr)
    pro_coding<- filter(type,biotypes=="protein_coding")#提取type中biotypes=="protein_coding"的行
    pro_coding <- pro_coding[!duplicated(pro_coding$ENSEMBL), ]#去除重复蛋白基因注释
    protein_coding <- merge(counts1, pro_coding, by = c("ENSEMBL", "ENSEMBL"))#通过ENSEMBL合并数据框
    protein_ENSEMBL <- protein_coding[,1:7]#保留"ENSEMBL","Con_1","Con_2","Con_3","UL82_1","UL82_2","UL82_3"列
    write.csv(protein_ENSEMBL, file = "./procode_ENSEMBL.csv",row.names = F)
    #x <- protein_coding[duplicated(protein_coding$SYMBOL),] 报错原因:多个ENSEMBL对应一个SYMBLE;或SYMBL未标出亚家族
    ##ENSEMBL转为SYMBOL
    #保留"Con_1","Con_2","Con_3","UL82_1","UL82_2","UL82_3","SYMBOL"列,并将SYMBOL列作为第一列
    protein_SYMBOL <- protein_coding %>% select(8,2,3,4,5,6,7)
    #简化 protein_SYMBOL <- protein_coding[,c(8,2,3,4,5,6,7)]
    write.csv(protein_SYMBOL, file = "./protein_SYMBOL_AnnoProbe.csv",row.names = F)
    
    ###2.5.2. clusterProfiler包bitr()函数转换ID
    library(clusterProfiler)
    library(org.Hs.eg.db)
    table(duplicated(protein_ENSEMBL$ENSEMBL)) # 检查是否有重复的ensemble ID
    dim(protein_ENSEMBL)
    gene_id <- bitr(protein_ENSEMBL$ENSEMBL, 
                fromType = "ENSEMBL",
                toType = "SYMBOL",
                OrgDb = org.Hs.eg.db)#将ENSEMBL转为SYMBOL
    dim(gene_id)
    protein_symbol <- merge(gene_id, protein_ENSEMBL, by = c("ENSEMBL", "ENSEMBL"))
    write.csv(protein_symbol, file = "./protein_symbol_bitr.csv",row.names = F)
    dim(protein_SYMBOL)
    dim(protein_symbol)
    

    相关文章

      网友评论

          本文标题:转录组下游分析

          本文链接:https://www.haomeiwen.com/subject/pvfntdtx.html