Transcript-level expression anal

作者: 宗肃書 | 来源:发表于2021-03-10 11:12 被阅读0次
    image.png
    今天要分享的一篇文章是转录组分析的一篇方法文章
    介绍:TopHat2和Cufflinks是之前常用来作转录组分析的两个工具,它们能够完成这几个任务:
    (i) alignment of the reads to the genome;
    (ii) assembly of the alignments into full-length transcripts;
    (iii) quantification of the expression levels of each gene and transcript;
    (iv) calculation of the differences in expression for all genes among the different experimental conditions.
    但是最近新推出的软件StringTie和HISAT在完成这些任务的同时可以以更快的速度运行。并可以估算所有基因和转录本的表达水平。流程图如下:
    image.png

    这里注意的要点是:定量时使用的注释文件为stringtie --merge 合并所有样本转录本后的文件
    为什么要用stringtie merge 命令?
    This step is required because transcripts in some of the samples might be only partially covered by reads, and as a consequence only partial versions of them will be assembled in the initial StringTie run. The merge step creates a set of transcripts that is consistent across all samples, so that the transcripts can be compared in subsequent steps.

    image.png

    该流程的局限性:

    1. 不能处理三代测序数据;
    2. 除了stringtie、cufflinks、RESM三个软件,其他组装方法输出的结果与Ballgown不兼容,Ballgown的样本量至少为三个。
    • Install the Ballgown packages and other dependencies in R as follows:
    >install.packages(″devtools″,repos=″http://cran.us.r-project.org″)
    
    >source(″http://www.bioconductor.org/biocLite.R″)
    
    >biocLite(c(″alyssafrazee/RSkittleBrewer″,″ballgown″, ″genefilter″,″dplyr″,″devtools″))
    #!/usr/bin/env Rscript
    # run this in the output directory for rnaseq_pipeline.sh
    # passing the pheno data csv file as the only argument 
    args = commandArgs(trailingOnly=TRUE)
    if (length(args)==0) {
    # assume no output directory argument was given to rnaseq_pipeline.sh
      pheno_data_file <- paste0(getwd(), "/chrX_data/geuvadis_phenodata.csv")
    } else {
      pheno_data_file <- args[1]
    }
    
    library(ballgown)
    library(RSkittleBrewer)
    library(genefilter)
    library(dplyr)
    library(devtools)
    
    ## Read phenotype sample data
    pheno_data <- read.csv(pheno_data_file)
    
    ## Read in expression data
    bg_chrX <- ballgown(dataDir = "ballgown", samplePattern="ERR", pData=pheno_data)
    
    ## Filter low abundance genes
    bg_chrX_filt <- subset(bg_chrX, "rowVars(texpr(bg_chrX)) > 1", genomesubset=TRUE)
    
    ## DE by transcript
    results_transcripts <-  stattest(bg_chrX_filt, feature='transcript', covariate='sex', 
             adjustvars=c('population'), getFC=TRUE, meas='FPKM')
    
    ## DE by gene
    results_genes <-  stattest(bg_chrX_filt, feature='gene', covariate='sex', 
             adjustvars=c('population'), getFC=TRUE, meas='FPKM')
    
    ## Add gene name
    results_transcripts <- data.frame(geneNames=ballgown::geneNames(bg_chrX_filt),
              geneIDs=ballgown::geneIDs(bg_chrX_filt), results_transcripts)
    
    ## Sort results from smallest p-value
    results_transcripts <- arrange(results_transcripts, pval)
    results_genes <-  arrange(results_genes, pval)
    
    ## Write results to CSV
    write.csv(results_transcripts, "chrX_transcripts_results.csv", row.names=FALSE)
    write.csv(results_genes, "chrX_genes_results.csv", row.names=FALSE)
    
    ## Filter for genes with q-val <0.05
    subset(results_transcripts, results_transcripts$qval <=0.05)
    subset(results_genes, results_genes$qval <=0.05)
    
    ## Plotting setup
    #tropical <- c('darkorange', 'dodgerblue', 'hotpink', 'limegreen', 'yellow')
    #palette(tropical)
    
    ## Plotting gene abundance distribution
    #fpkm <- texpr(bg_chrX, meas='FPKM')
    #fpkm <- log2(fpkm +1)
    #boxplot(fpkm, col=as.numeric(pheno_data$sex), las=2,ylab='log2(FPKM+1)')
    
    ## Plot individual transcripts
    #ballgown::transcriptNames(bg_chrX)[12]
    #plot(fpkm[12,] ~ pheno_data$sex, border=c(1,2),
    #     main=paste(ballgown::geneNames(bg_chrX)[12], ' : ',ballgown::transcriptNames(bg_chrX)[12]),
    #     pch=19, xlab="Sex", ylab='log2(FPKM+1)')
    #points(fpkm[12,] ~ jitter(as.numeric(pheno_data$sex)), col=as.numeric(pheno_data$sex))
    
    ## Plot gene of transcript 1729
    #plotTranscripts(ballgown::geneIDs(bg_chrX)[1729], bg_chrX,
    #                main=c('Gene XIST in sample ERR188234'), sample=c('ERR188234'))
    
    ## Plot average expression
    #plotMeans(ballgown::geneIDs(bg_chrX)[203], bg_chrX_filt, groupvar="sex", legend=FALSE)
    
    
    

    其余后续流程包含详细的命令运行参数,可自行查看

    #!/usr/bin/env bash
    usage() {
        NAME=$(basename $0)
        cat <<EOF
    Usage:
      ${NAME} [output_dir]
    Wrapper script for HISAT2/StringTie RNA-Seq analysis protocol.
    In order to configure the pipeline options (input/output files etc.)
    please copy and edit a file rnaseq_pipeline.config.sh which must be
    placed in the current (working) directory where this script is being launched.
    
    Output directories "hisat2" and "ballgown" will be created in the 
    current working directory or, if provided, in the given <output_dir>
    (which will be created if it does not exist).
    
    EOF
    }
    
    OUTDIR="."
    if [[ "$1" ]]; then
     if [[ "$1" == "-h" || "$1" == "--help" ]]; then
      usage
      exit 1
     fi
     OUTDIR=$1
    fi
    
    ## load variables
    if [[ ! -f ./rnaseq_pipeline.config.sh ]]; then
     usage
     echo "Error: configuration file (rnaseq_pipeline.config.sh) missing!"
     exit 1
    fi
    
    source ./rnaseq_pipeline.config.sh
    WRKDIR=$(pwd -P)
    errprog=""
    if [[ ! -x $SAMTOOLS ]]; then
        errprog="samtools"
    fi
    if [[ ! -x $HISAT2 ]]; then
        errprog="hisat2"
    fi
    if [[ ! -x $STRINGTIE ]]; then
        errprog="stringtie"
    fi
    
    if [[ "$errprog" ]]; then    
      echo "ERROR: $errprog program not found, please edit the configuration script."
      exit 1
    fi
    
    if [[ ! -f rnaseq_ballgown.R ]]; then
       echo "ERROR: R script rnaseq_ballgown.R not found in current directory!"
       exit 1
    fi
    
    #determine samtools version
    newsamtools=$( ($SAMTOOLS 2>&1) | grep 'Version: 1\.')
    
    set -e
    #set -x
    
    if [[ $OUTDIR != "." ]]; then
      mkdir -p $OUTDIR
      cd $OUTDIR
    fi
    
    SCRIPTARGS="$@"
    ALIGNLOC=./hisat2
    BALLGOWNLOC=./ballgown
    
    LOGFILE=./run.log
    
    for d in "$TEMPLOC" "$ALIGNLOC" "$BALLGOWNLOC" ; do
     if [ ! -d $d ]; then
        mkdir -p $d
     fi
    done
    
    # main script block
    pipeline() {
    
    echo [`date +"%Y-%m-%d %H:%M:%S"`] "#> START: " $0 $SCRIPTARGS
    
    for ((i=0; i<=${#reads1[@]}-1; i++ )); do
        sample="${reads1[$i]%%.*}"
        sample="${sample%_*}"
        stime=`date +"%Y-%m-%d %H:%M:%S"`
        echo "[$stime] Processing sample: $sample"
        echo [$stime] "   * Alignment of reads to genome (HISAT2)"
        $HISAT2 -p $NUMCPUS --dta -x ${GENOMEIDX} \
         -1 ${FASTQLOC}/${reads1[$i]} \
         -2 ${FASTQLOC}/${reads2[$i]} \
         -S ${TEMPLOC}/${sample}.sam 2>${ALIGNLOC}/${sample}.alnstats
        echo [`date +"%Y-%m-%d %H:%M:%S"`] "   * Alignments conversion (SAMTools)"
        if [[ "$newsamtools" ]]; then
         $SAMTOOLS view -S -b ${TEMPLOC}/${sample}.sam | \
          $SAMTOOLS sort -@ $NUMCPUS -o ${ALIGNLOC}/${sample}.bam -
        else
         $SAMTOOLS view -S -b ${TEMPLOC}/${sample}.sam | \
          $SAMTOOLS sort -@ $NUMCPUS - ${ALIGNLOC}/${sample}
        fi
        #$SAMTOOLS index ${ALIGNLOC}/${sample}.bam
        #$SAMTOOLS flagstat ${ALIGNLOC}/${sample}.bam
        
        #echo "..removing intermediate files"
        rm ${TEMPLOC}/${sample}.sam
        #rm ${TEMPLOC}/${sample}.unsorted.bam
    
        echo [`date +"%Y-%m-%d %H:%M:%S"`] "   * Assemble transcripts (StringTie)"
        $STRINGTIE -p $NUMCPUS -G ${GTFFILE} -o ${ALIGNLOC}/${sample}.gtf \
         -l ${sample} ${ALIGNLOC}/${sample}.bam
    done
    
    ## merge transcript file
    echo [`date +"%Y-%m-%d %H:%M:%S"`] "#> Merge all transcripts (StringTie)"
    ls -1 ${ALIGNLOC}/*.gtf > ${ALIGNLOC}/mergelist.txt
    
    $STRINGTIE --merge -p $NUMCPUS -G  ${GTFFILE} \
        -o ${BALLGOWNLOC}/stringtie_merged.gtf ${ALIGNLOC}/mergelist.txt
    

    相关文章

      网友评论

        本文标题:Transcript-level expression anal

        本文链接:https://www.haomeiwen.com/subject/rvxwqltx.html