美文网首页生物信息学转录组
转录组从下机数据到GO、kegg、GSEA

转录组从下机数据到GO、kegg、GSEA

作者: wo_monic | 来源:发表于2019-07-19 20:02 被阅读1次

    所有的命令粘贴于此,用于快速完成分析任务。各软件的具体参数说明,请参见对应软件的官方文档。

    #!/bin/bash
    # Copy the sequencer output into a separate working folder ./data so the
    # original files under ~/disk/lyb/ are never touched.
    # Create the folder first: if ./data does not exist, `cp FILE ./data`
    # writes every file to a plain file named "data", each overwriting the last.
    mkdir -p ./data
    # `-exec ... {} +` hands the paths to cp directly, so names containing
    # spaces or quotes survive (the deprecated `xargs -i` form splits on them).
    find ./Cleandata -name '*fq.gz' -exec cp -t ./data {} +

    # Everything below is meant to be run from ~/disk/lyb/data/
    # 1. Quality control

    fastqc *.fq.gz -t 8
    
     # File-name suffixes shared by the raw R1/R2 FASTQ files of every sample.
     bg1='RNA_R1.fq.gz'
     bg2='RNA_R2.fq.gz'
     # Sample prefixes; the later alignment/sort/assembly loops index this array too.
     bef=(NS-1 NS-2 NS-3 WT-1 WT-2 WT-3)
     # Trim adapters and low-quality bases with Trimmomatic in paired-end mode,
     # one run per sample. The loop bound is taken from the array length so that
     # adding or removing a sample only requires editing `bef`.
     for ((i=0;i<${#bef[@]};i++));
     do
         inA1="${bef[$i]}$bg1"
         inA2="${bef[$i]}$bg2"
         out1="${bef[$i]}paired-R1.fq.gz"
         out2="${bef[$i]}paired-R2.fq.gz"
         unpaired1="${bef[$i]}unpaired-R1.fq.gz"
         unpaired2="${bef[$i]}unpaired-R2.fq.gz"
         # Output order expected by PE mode: paired-1, unpaired-1, paired-2, unpaired-2.
         java -jar /home/guo/tool/Trimmomatic-0.38/trimmomatic-0.38.jar PE -threads 12 -phred33 "$inA1" "$inA2" "$out1" "$unpaired1" "$out2" "$unpaired2" ILLUMINACLIP:TruSeq3-PE.fa:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36
         # Log the file names handled in this iteration.
         echo "$inA1,$inA2,$out1,$out2,$unpaired1,$unpaired2"
     done
    
    
    
    # Reference preparation; run directory is /disks/backup/chaim/maize/
    # Extract exons and splice sites from the annotation for a transcript-aware index.
    /home/chaim/disk/soft/hisat2/extract_exons.py Zea_mays.B73_RefGen_v4.42.gtf > genome.exon
    /home/chaim/disk/soft/hisat2/extract_splice_sites.py Zea_mays.B73_RefGen_v4.42.gtf > genome.ss
    # NOTE(review): genome.snp is not passed to hisat2-build below. Also, the HISAT2
    # manual appears to describe this script as taking a genome FASTA, the VCF file(s)
    # and an output base name (writing <base>.snp itself) - confirm this invocation.
    /home/chaim/disk/soft/hisat2/hisat2_extract_snps_haplotypes_VCF.py zea_mays.vcf> genome.snp

    # 2.1 Build the index
    # Runs in the foreground on purpose: the alignment loop below reads the
    # genome_tran index, so it must not start before the build has finished.
    hisat2-build -p 8 Zea_mays.B73_RefGen_v4.42.fa --ss genome.ss --exon genome.exon genome_tran
    # 2.2 Align the trimmed paired reads of every sample
    for ((i=0;i<${#bef[@]};i++));
    do
        out1="${bef[$i]}paired-R1.fq.gz"
        out2="${bef[$i]}paired-R2.fq.gz"
        # Writes <sample>.map.sam plus the novel splice sites found in <sample>.nsplice
        hisat2 -x /disks/backup/chaim/maize/genome_tran -p 16 -1 "$out1" -2 "$out2" -S "${bef[$i]}.map.sam" --dta-cufflinks --novel-splicesite-outfile "${bef[$i]}.nsplice"
    done
            
    # Step 3: samtools - sort each sample's SAM alignment and write it out as BAM (6 in total).
    # stderr of every run is captured in <sample>samtool_out.
    i=0
    while (( i < 6 ))
    do
        sample=${bef[$i]}
        samtools sort -@ 8 -o "${sample}.map.bam" "${sample}.map.sam" 2>"${sample}samtool_out"
        i=$(( i + 1 ))
    done
    
    
    # Step 4: assembly with StringTie (three rounds)
    # Round 1: assemble transcripts for each sample, guided by the reference annotation.
    # The per-sample jobs run in parallel in the background.
    for ((i=0;i<${#bef[@]};i++));
    do
        stringtie "${bef[$i]}.map.bam" -G /disks/backup/chaim/maize/Zea_mays.B73_RefGen_v4.42.gtf -p 8 -o "${bef[$i]}.gtf" &
    done
    # Block until every background assembly has finished: the merge reads their GTFs.
    wait
    # Round 2: merge the six per-sample assemblies into one annotation.
    # Runs in the foreground because round 3 needs the finished merged.gtf.
    stringtie --merge -G /disks/backup/chaim/maize/Zea_mays.B73_RefGen_v4.42.gtf -p 8 -o merged.gtf NS-1.gtf NS-2.gtf NS-3.gtf WT-1.gtf WT-2.gtf WT-3.gtf 2>stringtie_merge
    # Round 3: estimate expression abundance per sample against merged.gtf.
    # This has to be inside a loop: after the previous loop $i is past the end
    # of the array, so ${bef[$i]} would expand to an empty string.
    for ((i=0;i<${#bef[@]};i++));
    do
        stringtie "${bef[$i]}.map.bam" -G merged.gtf -p 8 -b "${bef[$i]}_out" -e -o "${bef[$i]}-st.gtf" &
    done
    # Wait for all abundance estimates before building the count matrix.
    wait
    # Step 5: generate the CSV count files
    # python path
    # NOTE(review): nothing above creates "gtf2"; prepDE.py presumably expects a
    # directory of per-sample sub-folders or a sample list file here - confirm
    # that the <sample>-st.gtf files are arranged accordingly.
    python2.7 /disks/backup/chaim/soft/prepDE.py -i gtf2
    #第6步 deseq2进行定量分析
    
    
    # Install DESeq2 through BiocManager, the installer the rest of this script
    # already uses, instead of sourcing the retired biocLite.R script.
    if (!requireNamespace("BiocManager", quietly = TRUE)) {
      install.packages("BiocManager")
    }
    # Skip the download when DESeq2 is already installed.
    if (!requireNamespace("DESeq2", quietly = TRUE)) {
      BiocManager::install("DESeq2")
    }
    
    
    # Load packages and import the count data
    library(tidyverse)
    library(DESeq2)
    library(ggplot2)
    # Working directory that holds gene_count_matrix.csv
    # (alternative locations kept for reference)
    #setwd("/home/chaim/disk/lyb/data/")
    #setwd("/mnt/d/RNA-seq/")
    setwd("D:/RNA-seq/")

    # Gene-level counts: one row per gene_id, one column per sample.
    count_table <- read.csv("gene_count_matrix.csv", row.names = "gene_id")
    countData <- as.matrix(count_table)

    # Three NS columns followed by three WT columns; NS is the first factor level.
    # NOTE(review): assumes the CSV columns are ordered NS-1..3, WT-1..3 - confirm.
    condition <- factor(rep(c("NS", "WT"), each = 3), levels = c("NS", "WT"))
    colData <- data.frame(row.names = colnames(countData), condition)
    dds <- DESeqDataSetFromMatrix(
      countData = countData,
      colData = colData,
      design = ~ condition
    )
    dds <- DESeq(dds)

    # Overall results, ordered by raw p-value
    res <- results(dds)
    res <- res[order(res$pvalue), ]

    summary(res)
    write.csv(res, file = "All_results.csv")
    # Count of genes above / below the 0.05 adjusted p-value threshold
    table(res$padj < 0.05)
    
    
    
    
    
    
    # Extract differentially expressed genes (DEGs):
    # adjusted p-value below 0.05 and absolute log2 fold change above 1.
    # (No gene-symbol annotation is performed in this block.)
    padj_cutoff <- 0.05
    lfc_cutoff <- 1
    diff_gene_deseq2 <- subset(
      res,
      padj < padj_cutoff & abs(log2FoldChange) > lfc_cutoff
    )
    dim(diff_gene_deseq2)
    write.csv(diff_gene_deseq2, file = "DEG_treat_vs_control.csv")
    
    
    
    
     # resdata <- res
     # threshold <- as.factor(ifelse(resdata$padj < 0.001 & abs(resdata$log2FoldChange) >= 2 ,ifelse(resdata$log2FoldChange >= 2 ,'Up','Down'),'Not'))
     # ggplot(resdata,aes(x=log2FoldChange,y=-log10(padj),colour=threshold)) + xlab("log2(Fold Change)")+ylab("-log10(qvalue)") + geom_point(size = 0.5,alpha=1) + ylim(0,200) + xlim(-12,12) + scale_color_manual(values=c("green","grey", "red"))
    
    
    
    
    # #安装biomaRt包
    # source("http://bioconductor.org/biocLite.R")
    # biocLite("biomaRt")
    # install.packages('DT')
    # #用bioMart对差异表达基因进行注释
    # library("biomaRt")
    # listMarts()
    # 
    # ensembl=useMart("ENSEMBL_MART_ENSEMBL")
    # all_datasets <- listDatasets(ensembl)
    # library(DT)
    # datatable(all_datasets,options = list(searching=FALSE,pageLength=5,lengthMenu=c(5,10,15,20)))
    
    
    
    # Install clusterProfiler (GO/KEGG enrichment and GSEA) and DOSE.
    # Uses BiocManager, like the other installs in this script, instead of the
    # retired biocLite.R script.
    if (!requireNamespace("BiocManager", quietly = TRUE)) {
      install.packages("BiocManager")
    }
    for (pkg in c("clusterProfiler", "DOSE")) {
      # Only download packages that are not installed yet.
      if (!requireNamespace(pkg, quietly = TRUE)) {
        BiocManager::install(pkg)
      }
    }
    # library() stops with an error when a package is missing, whereas
    # require() merely returns FALSE and lets the script fail later.
    library(DOSE)
    library(DO.db)
    library(clusterProfiler)
    
    
     
     
     
     
     
     
     
     # Make sure BiocManager is available, then install S4Vectors from the
     # Bioconductor 3.8 release.
     if (!requireNamespace("BiocManager", quietly = TRUE)) {
       install.packages("BiocManager")
     }
     BiocManager::install(pkgs = "S4Vectors", version = "3.8")
     
     
     
    
    # Install AnnotationHub (Bioconductor 3.8), attach it, and search the hub
    # for records matching "zea mays".
    if (!requireNamespace("BiocManager", quietly = TRUE)) {
      install.packages("BiocManager")
    }
    BiocManager::install(pkgs = "AnnotationHub", version = "3.8")

    # A single library() call is enough to attach the package.
    library(AnnotationHub)
    hub <- AnnotationHub()
    query(hub, "zea mays")
    
    # Fetch hub record AH66225 and keep it as the maize annotation object.
    maize <- hub[["AH66225"]]
    # Number of keys in the annotation object
    length(keys(maize))

    # Column types that can be queried from the annotation object
    columns(maize)

    # library() errors immediately if clusterProfiler is missing.
    library(clusterProfiler)

    # Sanity check: translate the first key from GID to ACCNUM / ENTREZID / UNIGENE.
    bitr(keys(maize)[1], "GID", c("ACCNUM", "ENTREZID", "UNIGENE"), maize)

    # Leftover note listing further column names. A bare comma-separated list
    # of strings is a parse error in R, so it is kept as a comment.
    # "ALIAS","EVIDENCE","EVIDENCELL",
    
    # GO enrichment analysis with enrichGO

    # Every key of the maize annotation object
    sample_genes <- keys(maize)

    # Enrichment over all keys with both cutoffs at 1.
    # Stored under its own name: assigning it to `res` would overwrite the
    # DESeq2 results table created earlier in this script.
    ego_all <- enrichGO(
      sample_genes,
      OrgDb = maize,
      pvalueCutoff = 1,
      qvalueCutoff = 1
    )

    # Enrichment of the differentially expressed genes, ont = "MF".
    # NOTE(review): assumes the row names of diff_gene_deseq2 match the
    # GENENAME key type of this annotation object - confirm.
    ego <- enrichGO(
      gene = row.names(diff_gene_deseq2),
      OrgDb = maize,
      keyType = "GENENAME",
      ont = "MF"
    )

    # Spot-check the SYMBOL and GO annotation of three gene IDs.
    ensids <- c("Zm00001d011037", "Zm00001d035600", "Zm00001d035599")
    cols <- c("SYMBOL", "GO")
    # Namespaced so that dplyr::select (attached through tidyverse) can never
    # be dispatched instead of the annotation-database method.
    AnnotationDbi::select(maize, keys = ensids, columns = cols, keytype = "GENENAME")

    # Dot plot
    dotplot(ego, font.size = 5)
    # Network plot
    # NOTE(review): enrichMap may no longer exist in the installed release
    # (newer versions provide emapplot instead) - confirm before running.
    enrichMap(ego, vertex.label.cex = 1.2, layout = igraph::layout.kamada.kawai)
    # Extra packages needed for the GO graph, installed through BiocManager
    # instead of the retired biocLite().
    if (!requireNamespace("BiocManager", quietly = TRUE)) {
      install.packages("BiocManager")
    }
    for (pkg in c("topGO", "Rgraphviz")) {
      if (!requireNamespace(pkg, quietly = TRUE)) {
        BiocManager::install(pkg)
      }
    }
    plotGOgraph(ego)
    
    
    
    #gseGO进行GSEA分析
    
    
    
    
    
    
    
    # For every line of genome_table.txt whose first field matches /gene/,
    # write fields 3 and 12 to the file gene_id.
    awk '$1 ~/gene/ {print $3,$12}' genome_table.txt > gene_id
    

    相关文章

      网友评论

        本文标题:转录组从下机数据到GO、kegg、GSEA

        本文链接:https://www.haomeiwen.com/subject/xkqelctx.html