美文网首页生信小教程收藏
Hisat2+Stringtie+DESeq2 workflow

Hisat2+Stringtie+DESeq2 workflow

作者: 547可是贼帅的547 | 来源:发表于2020-01-18 14:41 被阅读0次

    前言

    因为以前的分析流程要在Shell 和 R 之间切换,然后可控性还差,有点烦,正好最近有点数据要分析,干脆就重新建立一个 all in one 的流程,方便以后使用。

    运行环境

    Ubuntu 18.04
    R 3.6.1

    用到的软件

    fastp
    bowtie2
    hisat2
    stringtie
    DESeq2

    Refference data

    rRNA index:用bowtie2去除fastq文件中的rRNA
    tRNA index:http://gtrnadb.ucsc.edu/genomes/eukaryota/Hsapi38/hg38-tRNAs.fa
    Hisat2 index:https://cloud.biohpc.swmed.edu/index.php/s/grch38_tran/download
    GRCh38.96.gtf:ftp://ftp.ensembl.org/pub/release-96/gtf/homo_sapiens/Homo_sapiens.GRCh38.96.gtf.gz
    Homo_sapiens.GRCh38.dna.primary_assembly.fa:ftp://ftp.ensembl.org/pub/release-96/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna_sm.primary_assembly.fa.gz

    代码

    ############## Functions #####################
    #Hisat2 参数
    grch38_tran_index = "/home/zhou/RNASEQ/index/grch38_tran/genome_tran"
    rRNA_index = "/home/zhou/RNASEQ/index/rRNA_index/human_rRNA"
    tRNA_index = "/home/zhou/RNASEQ/index/tRNA_index_hg38/hg38_tRNA"
    GRCh38.96.gtf = "/home/zhou/RNASEQ/index/grch38_tran/Homo_sapiens.GRCh38.96.gtf"
    fasta = "/home/zhou/RNASEQ/index/grch38_tran/Homo_sapiens.GRCh38.dna.primary_assembly.fa"
    
    #fastp preprocess
    ##Single version
    fastp_SE <- function(input_fastq = input_fastq) {
      fastp_PATH = "/home/zhou/anaconda2/bin/fastp"
      output_fastq = paste0(gsub(pattern = "\\.fastq\\.gz",replacement = "",x = input_fastq),"_fastpedited.fastq.gz")
      fastp_cmd = paste(fastp_PATH, "-i", input_fastq, "-o", output_fastq, "-w 16", "-z 9")
      system(fastp_cmd,wait = T)
      return(output_fastq)
    }
    ##Paired version
    fastp_PE <- function(input_fastq_F = input_fastq_F, input_fastq_R = input_fastq_R) {
      fastp_PATH = "/home/zhou/anaconda2/bin/fastp"
      output_fastq_F = paste0(gsub(pattern = "\\.fq\\.gz",replacement = "",x = input_fastq_F),"_fastpedited.fastq.gz")
      output_fastq_R = paste0(gsub(pattern = "\\.fq\\.gz",replacement = "",x = input_fastq_R),"_fastpedited.fastq.gz")
      fastp_cmd = paste(fastp_PATH, "-i", input_fastq_F, "-o", output_fastq_F, "-I", input_fastq_R, "-O", output_fastq_R, "-w 16", "-z 9")
      system(fastp_cmd,wait = T)
      return(c(output_fastq_F,output_fastq_R))
    }
    
    #rRNA remove
    ##Single version
    remove_rRNA_SE <- function(input_fastq = input_fastq, index = index) {
      bowtie2_PATH = "/home/zhou/anaconda2/bin/bowtie2"
      output_fastq = paste0(gsub(pattern = "\\.fastq\\.gz",replacement = "",x = input_fastq),"_rmrRNA.fastq.gz")
      bowtie2_cmd = paste(bowtie2_PATH, "-x", index, "--un-gz", output_fastq, "-U", input_fastq, "-p 20 -S tmp.sam")
      system(bowtie2_cmd,wait = T)
      system("rm tmp.sam")
      return(output_fastq)
    }
    
    ##Paired version
    remove_rRNA_PE <- function(input_fastq_F = input_fastq_F, input_fastq_R = input_fastq_R, index = index) {
      bowtie2_PATH = "/home/zhou/anaconda2/bin/bowtie2"
      output_fastq = paste0(gsub(pattern = "\\.fastq\\.gz",replacement = "",x = input_fastq_F),"_rmrRNA.fastq.gz")
      output_fastq_F = gsub(pattern = "\\.fastq\\.gz",replacement = "\\.fastq\\.1\\.gz",x = output_fastq)
      output_fastq_R = gsub(pattern = "\\.fastq\\.gz",replacement = "\\.fastq\\.2\\.gz",x = output_fastq)
      output_fastq_F_reset = paste0(gsub(pattern = "\\.fastq\\.gz",replacement = "",x = input_fastq_F),"_rmrRNA.fastq.gz")
      output_fastq_R_reset = paste0(gsub(pattern = "\\.fastq\\.gz",replacement = "",x = input_fastq_R),"_rmrRNA.fastq.gz")
      bowtie2_cmd = paste(bowtie2_PATH, "-x", index, "--un-conc-gz", output_fastq, "-1", input_fastq_F, "-2", input_fastq_R, "-p 20 -S tmp.sam")
      rename_cmd_F = paste("mv", output_fastq_F, output_fastq_F_reset)
      rename_cmd_R = paste("mv", output_fastq_R, output_fastq_R_reset)
      system(bowtie2_cmd,wait = T)
      system("rm tmp.sam",wait = T)
      system(rename_cmd_F,wait = T)
      system(rename_cmd_R,wait = T)
      return(c(output_fastq_F_reset,output_fastq_R_reset))
    }
    #tRNA remove
    ##Single version
    remove_tRNA_SE <- function(input_fastq = input_fastq, index = index) {
      bowtie2_PATH = "/home/zhou/anaconda2/bin/bowtie2"
      output_fastq = paste0(gsub(pattern = "\\.fastq\\.gz",replacement = "",x = input_fastq),"_rmtRNA.fastq.gz")
      bowtie2_cmd = paste(bowtie2_PATH, "-x", index, "--un-gz", output_fastq, "-U", input_fastq, "-p 20 -S tmp.sam")
      system(bowtie2_cmd,wait = T)
      system("rm tmp.sam")
      return(output_fastq)
    }
    
    ##Paired version
    remove_tRNA_PE <- function(input_fastq_F = input_fastq_F, input_fastq_R = input_fastq_R, index = index) {
      bowtie2_PATH = "/home/zhou/anaconda2/bin/bowtie2"
      output_fastq = paste0(gsub(pattern = "\\.fastq\\.gz",replacement = "",x = input_fastq_F),"_rmtRNA.fastq.gz")
      output_fastq_F = gsub(pattern = "\\.fastq\\.gz",replacement = "\\.fastq\\.1\\.gz",x = output_fastq)
      output_fastq_R = gsub(pattern = "\\.fastq\\.gz",replacement = "\\.fastq\\.2\\.gz",x = output_fastq)
      output_fastq_F_reset = paste0(gsub(pattern = "\\.fastq\\.gz",replacement = "",x = input_fastq_F),"_rmtRNA.fastq.gz")
      output_fastq_R_reset = paste0(gsub(pattern = "\\.fastq\\.gz",replacement = "",x = input_fastq_R),"_rmtRNA.fastq.gz")
      bowtie2_cmd = paste(bowtie2_PATH, "-x", index, "--un-conc-gz", output_fastq, "-1", input_fastq_F, "-2", input_fastq_R, "-p 20 -S tmp.sam")
      rename_cmd_F = paste("mv", output_fastq_F, output_fastq_F_reset)
      rename_cmd_R = paste("mv", output_fastq_R, output_fastq_R_reset)
      system(bowtie2_cmd,wait = T)
      system("rm tmp.sam",wait = T)
      system(rename_cmd_F,wait = T)
      system(rename_cmd_R,wait = T)
      return(c(output_fastq_F_reset,output_fastq_R_reset))
    }
    #hisat2 
    ##Single version
    hisat2_SE <- function(index = index, input_fastq = input_fastq, output_prefix = output_prefix) {
      bam_filename = paste0(output_prefix,".bam")
      hisat2_PATH = "/home/zhou/bin/hisat2"
      hisat2_cmd = paste(hisat2_PATH, "--dta -x",index, "-p 20 -U", input_fastq, " | samtools sort -@ 20 -o", bam_filename)
      system(hisat2_cmd,wait = T)
      return(bam_filename)
    }
    
    ##Paired version
    hisat2_PE <- function(index = index, input_fastq_F = input_fastq_F,input_fastq_R = input_fastq_R, output_prefix = output_prefix) {
      bam_filename = paste0(output_prefix,".bam")
      hisat2_PATH = "/home/zhou/bin/hisat2"
      hisat2_cmd = paste(hisat2_PATH, "--dta -x",index, "-p 20 -1", input_fastq_F, "-2", input_fastq_R, " | samtools sort -@ 20 -o", bam_filename)
      system(hisat2_cmd,wait = T)
      return(bam_filename)
    }
    
    #stringtie
    stringtie <- function(input_bam = input_bam, gtf = gtf) {
      output_prefix = gsub(pattern = "\\.bam",replacement = "",x = input_bam)
      tab_filename = paste0(output_prefix,".tab")
      gtf_filename = paste0(output_prefix,".gtf")
      stringtie_PATH = "/home/zhou/bin/stringtie"
      stringtie_cmd = paste(stringtie_PATH,"-p 20 -G", gtf, "-o", gtf_filename, "-e -l", output_prefix, input_bam)
      system(stringtie_cmd,wait = T)
      return(gtf_filename)
    }
    
    #generate gene\transcript count matrix
    run.prepDE.py <- function(inputfile = inputfile) {
      run.cmd = paste("python ~/Software/prepDE.py -i",inputfile)
      system(run.cmd,wait = T)
    }
    ################ test data ############################
    setwd("~/ribosome/mTOR_Liver_cell/testdata/")
    sample_pattern = unique(gsub(x = list.files(path = "./"), pattern = "_R1_test.fq.gz|_R2_test.fq.gz",replacement = ""))
    
    for (i in sample_pattern) {
      print(i)
      ### fastp preprocess
      fastp.fastq = fastp_PE(input_fastq_F = paste0(i,"_R1_test.fq.gz"),input_fastq_R = paste0(i,"_R2_test.fq.gz"))
      ### remove rRNA
      rmrRNA.fastq = remove_rRNA_PE(input_fastq_F = fastp.fastq[1],input_fastq_R = fastp.fastq[2],index = rRNA_index)
      ### remove tRNA
      rmtRNA.fastq = remove_tRNA_PE(input_fastq_F = rmrRNA.fastq[1],input_fastq_R = rmrRNA.fastq[2],index = tRNA_index)
      ### hisat2 mapping
      hisat2.bam = hisat2_PE(index = grch38_tran_index,input_fastq_F = rmtRNA.fastq[1],input_fastq_R = rmtRNA.fastq[2],output_prefix = i )
      ### stringtie assemble
      stringtie(input_bam = hisat2.bam,gtf = GRCh38.96.gtf)
    }
    
    
    ################ real data ############################
    #rnaseq
    setwd("~/ribosome/mTOR_Liver_cell/RNA-Rawdata/")
    sample_pattern = unique(gsub(x = list.files(path = "./"), pattern = "_R1.fq.gz|_R2.fq.gz",replacement = ""))
    
    for (i in sample_pattern) {
      print(i)
      ### fastp preprocess
      fastp.fastq = fastp_PE(input_fastq_F = paste0(i,"_R1.fq.gz"),input_fastq_R = paste0(i,"_R2.fq.gz"))
      ### remove rRNA
      rmrRNA.fastq = remove_rRNA_PE(input_fastq_F = fastp.fastq[1],input_fastq_R = fastp.fastq[2],index = rRNA_index)
      ### remove tRNA
      rmtRNA.fastq = remove_tRNA_PE(input_fastq_F = rmrRNA.fastq[1],input_fastq_R = rmrRNA.fastq[2],index = tRNA_index)
      ### hisat2 mapping
      hisat2.bam = hisat2_PE(index = grch38_tran_index,input_fastq_F = rmtRNA.fastq[1],input_fastq_R = rmtRNA.fastq[2],output_prefix = i )
      ### stringtie assemble
      stringtie(input_bam = hisat2.bam,gtf = GRCh38.96.gtf)
    }
    
    ## generate counts matrix
    mergelist = data.frame(sample_pattern,paste0("./",sample_pattern,".gtf"))
    write.table(mergelist,file = "mergelist", sep = "\t",row.names = F,col.names = F,quote = F)
    run.prepDE.py(inputfile = "mergelist")
    
    ## DEA
    ### load count_matrix
    gene_matrix = read.csv(file = "gene_count_matrix.csv",header = T,row.names = 1)
    
    library(DESeq2)
    library(BiocParallel)
    register(MulticoreParam(24))
    
    database = gene_matrix
    condition <- factor(c(rep("T",3),rep("UT",3)))
    coldata <- data.frame(row.names = colnames(database), condition)
    dds <- DESeqDataSetFromMatrix(countData=database, colData=coldata, design=~condition)
    dds <- dds[ rowSums(counts(dds)) > 1, ]
    nrow(dds)
    dds <- DESeq(dds,parallel = T)
    res <- results(dds)
    summary(res) 
    table(res$padj<0.1)
    res <- res[order(res$padj),]
    resdata <- merge(as.data.frame(res), as.data.frame(counts(dds, normalized=TRUE)),by="row.names",sort=FALSE)
    total.RNA.signresdata<-na.omit(resdata[resdata$padj<0.1 & abs(resdata$log2FoldChange) > 1,])
    
    #riboseq
    setwd("~/ribosome/mTOR_Liver_cell/RIBO-Rawdata/")
    sample_pattern = unique(gsub(x = list.files(path = "./"), pattern = "_R1.fq.gz|_R2.fq.gz",replacement = ""))
    
    for (i in sample_pattern) {
      print(i)
      ### fastp preprocess
      fastp.fastq = fastp_PE(input_fastq_F = paste0(i,"_R1.fq.gz"),input_fastq_R = paste0(i,"_R2.fq.gz"))
      ### remove rRNA
      rmrRNA.fastq = remove_rRNA_PE(input_fastq_F = fastp.fastq[1],input_fastq_R = fastp.fastq[2],index = rRNA_index)
      ### remove tRNA
      rmtRNA.fastq = remove_tRNA_PE(input_fastq_F = rmrRNA.fastq[1],input_fastq_R = rmrRNA.fastq[2],index = tRNA_index)
      ### hisat2 mapping
      hisat2.bam = hisat2_PE(index = grch38_tran_index,input_fastq_F = rmtRNA.fastq[1],input_fastq_R = rmtRNA.fastq[2],output_prefix = i )
      ### stringtie assemble
      stringtie(input_bam = hisat2.bam,gtf = GRCh38.96.gtf)
    }
    
    ## generate counts matrix
    mergelist = data.frame(sample_pattern,paste0("./",sample_pattern,".gtf"))
    write.table(mergelist,file = "mergelist", sep = "\t",row.names = F,col.names = F,quote = F)
    run.prepDE.py(inputfile = "mergelist")
    
    ## DEA
    ### load count_matrix
    gene_matrix = read.csv(file = "gene_count_matrix.csv",header = T,row.names = 1)
    
    library(DESeq2)
    library(BiocParallel)
    register(MulticoreParam(24))
    
    database = gene_matrix
    condition <- factor(c(rep("T",3),rep("UT",3)))
    coldata <- data.frame(row.names = colnames(database), condition)
    dds <- DESeqDataSetFromMatrix(countData=database, colData=coldata, design=~condition)
    dds <- dds[ rowSums(counts(dds)) > 1, ]
    nrow(dds)
    dds <- DESeq(dds,parallel = T)
    res <- results(dds)
    summary(res) 
    table(res$padj<0.1)
    res <- res[order(res$padj),]
    resdata <- merge(as.data.frame(res), as.data.frame(counts(dds, normalized=TRUE)),by="row.names",sort=FALSE)
    Ribo.RNA.signresdata<-na.omit(resdata[resdata$padj<0.1 & abs(resdata$log2FoldChange) > 1,])
    
    

    结语

    把所有功能都模块化了,真舒服啊!
    此外,有些function可以增加参数来进一步增加自由度,比如说线程数,我懒得写了,其实可以慢慢完善的,以后看心情把,毕业季实在太忙了。。。

    相关文章

      网友评论

        本文标题:Hisat2+Stringtie+DESeq2 workflow

        本文链接:https://www.haomeiwen.com/subject/hppczctx.html