RNA-seq分析至尊套餐

作者: 落寞的橙子 | 来源:发表于2019-05-09 11:13 被阅读6次

    本分析从HISAT2比对的counts开始,先整合所有的counts数据,再获取再半数以上样本中CPM>1的基因,再进行差异分析,pathway分析,并并制作GSEA的input文件

    suppressMessages(library(DESeq2))
    suppressMessages(library(humanid))#参考Jimmy教程,安装这个包
    suppressMessages(library(limma))
    suppressMessages(library(ggplot2))
    suppressMessages(library(pathview))
    suppressMessages(library(topGO))
    suppressMessages(library(clusterProfiler))
    suppressMessages(library(enrichplot))
    suppressMessages(library(DOSE))
    suppressMessages(library(org.Hs.eg.db))
    setwd("~/Fcounts")
    
    ##########Combind data############
    prefix<-"Your file names"
    
    
    mytsvfile = list.files(pattern="*.tsv")   
    list2env(
     lapply(setNames(mytsvfile, make.names(gsub("*.tsv$", "", mytsvfile))),
            read.table,header=T,row.names=1,check.names=FALSE,skip = 1), envir = .GlobalEnv)
    files<-unlist(lapply(mytsvfile, FUN = function(x) {return(strsplit(x, split = ".tsv")[[1]][1])}))
    files2<-unlist(lapply(mytsvfile, FUN = function(x) {return(strsplit(x, split = ".trimmed_R.tsv")[[1]][1])}))
    data<-as.matrix(get(files[1])[,6])
    rownames(data)<-row.names(get(files[1]))
    colnames(data)<-files[1]
    info<-get(files[1])[,1:5]
    
    ##########get gene whose CPM>1 in more than half samples##########
    for (i in files[-1]){
     tmp<-as.matrix(get(i)[,6])
     rownames(tmp)<-row.names(get(i))
     colnames(tmp)<-i
     data<-cbind(data,tmp)
    }
    data<-cbind(info,data)
    meta_data<-data[,1:5]
    counts<-data[,6:ncol(data)]
    colnames(counts)<-files2
    data<-cbind(meta_data,counts)
    write.csv(data,"BC_featureCounts_counts.csv")
    
    rt <- as.matrix(t(t(counts)/colSums(counts) * 1000000))
    write.csv(rt,"BC_CPM.csv")
    
    MyFunction<-function(x){
     ifelse(x>1,1,0)
    }
    #设置一个计数函数
    keep_genes<-vector()#设置一个空的变量储存基因
    row_names<-row.names(rt)
    for (i in row_names){
     cpm<-rt[i,]
     count<-apply(as.matrix(cpm),2,MyFunction)
     a<-sum(count)
     if (a>=ncol(rt)/2)
       keep_genes<-c(keep_genes,i)
    }
    new_counts<-counts[keep_genes,]
    new_cpm<-rt[keep_genes,]
    
    #-----TPM Calculation------
    
    avg_cpm <- data.frame(avg_cpm=rowMeans(new_cpm))
    kb <- meta_data$Length / 1000
    rpk <- new_counts / kb
    tpm <- t(t(rpk)/colSums(rpk) * 1000000)
    avg_tpm <- data.frame(avg_tpm=rowMeans(tpm))
    
    write.csv(new_counts,paste0(prefix,"_new_counts_CPM1.csv"))
    write.csv(new_cpm,paste0(prefix,"_cpm_CPM1.csv"))
    write.csv(avg_tpm,paste0(prefix,"_avg_tpm_CPM1.csv"))
    write.csv(avg_cpm,paste0(prefix,"_avg_cpm_CPM1.csv"))
    write.csv(tpm,paste0(prefix,"_tpm_CPM1.csv"))
    write.csv(cpm,paste0(prefix,"_cpm_CPM1.csv"))
    
    ###########DEG analysis############
    group_list<-c(rep("NC",3),rep("OE",3))
    colData <- data.frame(row.names=colnames(new_counts), group_list=group_list)
    dds1 <- DESeqDataSetFromMatrix(countData = new_counts,
                                  colData = colData,
                                  design = ~ group_list)
    suppressMessages(dds <- DESeq(dds1))
    res <-results(dds, contrast=c("group_list","OE","NC"))
    
    resOrdered <- res[order(res$padj),]
    resOrdered=as.data.frame(resOrdered)
    sigoutTab1<-resOrdered[abs(resOrdered$log2FoldChange)>1 & resOrdered$padj<0.05,]
    sigoutTab2<-resOrdered[abs(resOrdered$log2FoldChange)>0.5849625 & resOrdered$padj<0.05,]
    write.csv(resOrdered,paste0(prefix,"_DESeq2_results.csv"))
    write.csv(sigoutTab1,paste0(prefix,"_FoldChange_2_DESeq2_results.csv"))
    write.csv(sigoutTab2,paste0(prefix,"_FoldChange_1.5_DESeq2_results.csv"))
    
    
    ##########Pathway#########
    genes1<-unlist(lapply(row.names(sigoutTab1), FUN = function(x) {return(strsplit(x, split = ".",fixed = T)[[1]][1])}))
    gene_list1<-select(org.Hs.eg.db, keys=as.character(genes1), columns=c("SYMBOL","ENTREZID"), keytype="ENSEMBL") #keytype是你输入基因编号类型,columns是你输出对基因编号类型,基因怎么导入不再赘述。
    genes2<-unlist(lapply(row.names(sigoutTab2), FUN = function(x) {return(strsplit(x, split = ".",fixed = T)[[1]][1])}))
    gene_list2<-select(org.Hs.eg.db, keys=as.character(genes2), columns=c("SYMBOL","ENTREZID"), keytype="ENSEMBL") #keytype是你输入基因编号类型,columns是你输出对基因编号类型,基因怎么导入不再赘述。
    entrezIDs1 <- as.character(gene_list1[,3])
    entrezIDs2 <- as.character(gene_list2[,3])
    
    go1 <- enrichGO(entrezIDs1, OrgDb = "org.Hs.eg.db", ont="all")
    write.csv(as.data.frame(go1@result), file=paste0(prefix,"_go_all_all_FoldChange_2.csv"))
    
    kk1 <- enrichKEGG(gene = entrezIDs1,
                    organism="human",
                    pvalueCutoff = 0.05,
                    qvalueCutoff = 0.05,
                    minGSSize = 1,
                    #readable = TRUE ,
                    use_internal_data =FALSE)
    write.csv(as.data.frame(kk1@result), file=paste0(prefix,"_kegg_all_FoldChange_2.csv"))
    
    tiffFile<-paste0(prefix,"_FoldChange_2_go_all_dotplot.tiff")
    p1<-dotplot(go1, split="ONTOLOGY") + facet_grid(ONTOLOGY~., scale="free")
    tiff(file=tiffFile,width = 1000,height = 600)
    plot(p1)
    dev.off()
    tiff(file=paste0(prefix,"_FoldChange_2_KEGG_barplot.tiff"),width = 20,height = 20,units ="cm",compression="lzw",bg="white",res=300)
    g1<-barplot(kk1, drop = TRUE, showCategory = 12)
    plot(g1)
    dev.off()
    #??ͼ
    tiff(file=paste0(prefix,"_FoldChange_2_KEGG_dotplot.tiff"),width = 20,height = 20,units ="cm",compression="lzw",bg="white",res=300)
    gg1<-dotplot(kk1)
    plot(gg1)
    dev.off()
    
    
    kk2 <- enrichKEGG(gene = entrezIDs2,
                     organism="human",
                     pvalueCutoff = 0.05,
                     qvalueCutoff = 0.05,
                     minGSSize = 1,
                     #readable = TRUE ,
                     use_internal_data =FALSE)
    write.csv(as.data.frame(kk2@result), paste0(prefix,"_kegg_all_FoldChange_1.5.csv"))
    
    tiffFile<-paste0(prefix,"_FoldChange_1.5_go_all_dotplot.tiff")
    p2<-dotplot(go2, split="ONTOLOGY") + facet_grid(ONTOLOGY~., scale="free")
    tiff(file=tiffFile,width = 1000,height = 600)
    plot(p2)
    dev.off()
    
    tiff(file=paste0(prefix,"_FoldChange_1.5_KEGG_barplot.tiff"),width = 20,height = 20,units ="cm",compression="lzw",bg="white",res=300)
    g2<-barplot(kk2, drop = TRUE, showCategory = 12)
    plot(g2)
    dev.off()
    #??ͼ
    tiff(file=paste0(prefix,"_FoldChange_1.5_KEGG_dotplot.tiff"),width = 20,height = 20,units ="cm",compression="lzw",bg="white",res=300)
    gg2<-dotplot(kk2)
    plot(gg2)
    dev.off()
    
    
    ############get the annotation form gtf files#######
    #creat .csv from gtf files very easy得到human_v0.0.3.csv
    #参考这个https://www.jianshu.com/p/e1f3e5abb44f
    ann<-read.csv("~/human_v0.0.3.csv",header = T,row.names = 1,check.names = F)
    ann_tmp<-ann[,c("gene_name","gene_type")]
    tpm_ann<-cbind(ann_tmp[row.names(tpm),],tpm)
    tpm_ann<-as.matrix(tpm_ann[!duplicated(tpm_ann$gene_name),])
    row.names(tpm_ann)<-as.character(tpm_ann[,1])
    new_tpm<-tpm_ann[,3:ncol(tpm_ann)]
    write.csv(new_tpm,paste0(prefix,"_tpm_CPM1_with_annotation.csv"))  
    createGSEAinput(prefix = prefix, exprSet=new_tpm,group_list=group_list)
    ##########
    

    相关文章

      网友评论

        本文标题:RNA-seq分析至尊套餐

        本文链接:https://www.haomeiwen.com/subject/rfaioqtx.html