美文网首页生信
Bulk RNAseq上游比对2:下载数据、质控

Bulk RNAseq上游比对2:下载数据、质控

作者: 小贝学生信 | 来源:发表于2021-09-21 00:35 被阅读0次

    Bulk RNAseq上游比对1:大致流程与conda环境 - 简书 (jianshu.com)
    Bulk RNAseq上游比对2:下载数据、质控 - 简书 (jianshu.com)
    Bulk RNAseq上游比对3:比对mapping - 简书 (jianshu.com)

    Step1:下载数据

    1.1 下载公共数据库的测序数据

    # Write the SRA run accessions to process, one per line.
    # printf with a redirect is used instead of an interactive `cat >` so the
    # snippet can be pasted or run as a script without blocking on stdin.
    printf '%s\n' \
      SRR12720999 \
      SRR12721000 \
      SRR12721001 \
      SRR12721002 \
      SRR12721003 \
      SRR12721004 \
      > SraAccList.txt

    # Activate the conda environment named "download".
    conda activate download
    

    方式1:aspera

    # Generate the Aspera source list in bulk: two entries (_1 and _2) per run.
    # Example entry:
    # era-fasp@fasp.sra.ebi.ac.uk:/vol1/fastq/SRR166/009/SRR1663609/SRR1663609_1.fastq.gz

    # Print the ENA fastq directory (no leading slash) for run accession $1.
    # The directory is vol1/fastq/<first 6 chars>/[<sub>/]<accession>, where
    # <sub> is whatever follows the 9th character, left-padded with zeros to
    # 3 digits; accessions of 9 characters or fewer have no <sub> level.
    #   SRR12720999 -> vol1/fastq/SRR127/099/SRR12720999
    #   SRR1663609  -> vol1/fastq/SRR166/009/SRR1663609
    ena_fastq_dir() {
      local acc=$1
      local extra=${acc:9}
      local dir="vol1/fastq/${acc:0:6}"
      if [[ -n "$extra" ]]; then
        # Pad as a string (not with %d) so values like "09" are never
        # parsed as octal numbers.
        while (( ${#extra} < 3 )); do
          extra="0${extra}"
        done
        dir+="/${extra}"
      fi
      printf '%s\n' "${dir}/${acc}"
    }

    # The output redirection truncates ascp.link, so re-running the snippet
    # does not append duplicate entries. Blank lines in the list are skipped.
    while IFS= read -r id || [[ -n "$id" ]]; do
      [[ -n "$id" ]] || continue
      dir=$(ena_fastq_dir "$id")
      printf 'era-fasp@fasp.sra.ebi.ac.uk:/%s/%s_1.fastq.gz\n' "$dir" "$id"
      printf 'era-fasp@fasp.sra.ebi.ac.uk:/%s/%s_2.fastq.gz\n' "$dir" "$id"
    done < SraAccList.txt > ascp.link
    
    # High-speed download with ascp: fetch every source listed in ascp.link
    # into the current directory (.).
    # The list is read on file descriptor 3, so ascp keeps the shell's own
    # stdin and cannot consume the remaining entries of the list.
    while IFS= read -r sample <&3 || [[ -n "$sample" ]]; do
      [[ -n "$sample" ]] || continue
      # Report a failed transfer on stderr and carry on with the next entry.
      ascp -QT -l 300m -P33001 \
        -i ~/miniconda3/envs/download/etc/asperaweb_id_dsa.openssh \
        "$sample" . \
        || printf 'ascp failed for %s\n' "$sample" >&2
    done 3< ascp.link
    

    方式2:wget ftp

    # Generate the FTP download URLs in bulk: two entries (_1 and _2) per run.
    # Example entry:
    # ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR127/099/SRR12720999/SRR12720999_1.fastq.gz

    # Print the ENA fastq directory (no leading slash) for run accession $1.
    # The directory is vol1/fastq/<first 6 chars>/[<sub>/]<accession>, where
    # <sub> is whatever follows the 9th character, left-padded with zeros to
    # 3 digits; accessions of 9 characters or fewer have no <sub> level.
    #   SRR12720999 -> vol1/fastq/SRR127/099/SRR12720999
    #   SRR1663609  -> vol1/fastq/SRR166/009/SRR1663609
    ena_fastq_dir() {
      local acc=$1
      local extra=${acc:9}
      local dir="vol1/fastq/${acc:0:6}"
      if [[ -n "$extra" ]]; then
        # Pad as a string (not with %d) so values like "09" are never
        # parsed as octal numbers.
        while (( ${#extra} < 3 )); do
          extra="0${extra}"
        done
        dir+="/${extra}"
      fi
      printf '%s\n' "${dir}/${acc}"
    }

    # The output redirection truncates ftp.link, so re-running the snippet
    # does not append duplicate entries. Blank lines in the list are skipped.
    while IFS= read -r id || [[ -n "$id" ]]; do
      [[ -n "$id" ]] || continue
      dir=$(ena_fastq_dir "$id")
      printf 'ftp://ftp.sra.ebi.ac.uk/%s/%s_1.fastq.gz\n' "$dir" "$id"
      printf 'ftp://ftp.sra.ebi.ac.uk/%s/%s_2.fastq.gz\n' "$dir" "$id"
    done < SraAccList.txt > ftp.link
    
    # Download every URL listed in ftp.link. -i makes wget read the URL list
    # itself, so no shell loop (and no unquoted expansion) is needed; -c
    # continues partially downloaded files.
    wget -c -i ftp.link
    

    方式3:prefetch下载sra文件,再转为fastq.gz

    # Download a single run
    prefetch SRR12720999
    # Batch download of every run in the SRR list
    prefetch --option-file SraAccList.txt

    # sra -> fastq, one run at a time; blank lines in the list are skipped.
    while IFS= read -r id || [[ -n "$id" ]]; do
      [[ -n "$id" ]] || continue
      printf '%s\n' "$id"
      fasterq-dump --split-files "$id"
    done < SraAccList.txt
    # fastq -> fastq.gz; the ./ prefix keeps a file name that starts with
    # "-" from being read as a gzip option.
    gzip ./*fastq
    
    • 然后查看下载得到的fastq.gz质量如何
    #### fastqc
    # pare_dir must already point at the project directory. Abort with a
    # message when it is unset or empty, instead of silently expanding the
    # paths below to /raw/ebi/...
    : "${pare_dir:?set pare_dir to the project directory first}"
    # Let the shell expand the glob directly rather than parsing ls output;
    # quoting pare_dir keeps the directory part from being split or globbed.
    fastqc "${pare_dir}"/raw/ebi/*gz -o "${pare_dir}/raw/ebi/" -t 10
    #### Merged QC report
    # NOTE(review): multiqc is pointed at ./, so presumably this is run from
    # the directory that holds the FastQC output - confirm.
    multiqc ./ -n rawfq_multiqc_report.html
    

    1.2 下载参考基因组及相关数据

    # Every refgenie call below uses the same genome configuration file.
    cfg=~/refgenie/genome_config.yaml

    # Initialise the config, then run the two listr queries
    # (unrestricted, then restricted to hg38 via -g).
    refgenie init -c "$cfg"
    refgenie listr -c "$cfg"
    refgenie listr -g hg38 -c "$cfg"

    # Pull, in the original order: reference genomes (fasta), annotation
    # (gencode_gtf), then the index files for the alignment tools.
    for asset in \
      hg38/fasta hg38_cdna/fasta \
      hg38/gencode_gtf \
      hg38/bowtie2_index hg38/bwa_index hg38/star_index hg38/hisat2_index \
      hg38_cdna/salmon_index
    do
      refgenie pull "$asset" -c "$cfg"
    done

    # List the data that has already been downloaded locally
    refgenie list -c "$cfg"
    

    Step2:质控

    • trim_galore
    # Worked example on a single sample.
    # NOTE(review): '****' looks like a redacted placeholder - replace it with
    # the real directory. It is quoted here, and at every use below, so the
    # shell never treats it as a glob pattern.
    pare_dir='/home/data/****/mapping'
    fq1="${pare_dir}/raw/ebi/SRR12720999_1.fastq.gz"
    fq2="${pare_dir}/raw/ebi/SRR12720999_2.fastq.gz"

    # --paired is the option spelling given in the Trim Galore documentation.
    # Trimmed output is written to ${pare_dir}/trim.
    trim_galore -j 8 -q 25 --phred33 --length 36 \
      --paired -o "${pare_dir}/trim" \
      "$fq1" "$fq2"
    
    # Batch mode: trim the paired files of every run listed in
    # ${pare_dir}/SraAccList.txt. pare_dir must already be set.
    # Blank lines in the list are skipped; all paths are quoted so they are
    # never word-split or glob-expanded.
    while IFS= read -r id || [[ -n "$id" ]]; do
      [[ -n "$id" ]] || continue
      printf '%s\n' "$id"
      # --paired is the option spelling given in the Trim Galore documentation.
      trim_galore -j 8 -q 25 --phred33 --length 36 \
        --paired -o "${pare_dir}/trim" \
        "${pare_dir}/raw/ebi/${id}_1.fastq.gz" \
        "${pare_dir}/raw/ebi/${id}_2.fastq.gz"
    done < "${pare_dir}/SraAccList.txt"
    
    • 查看质控之后的fastq.gz质量
    #### fastqc
    # Run FastQC on every *gz file in ${pare_dir}/trim and write the reports
    # to the same directory. The shell expands the glob directly (no ls
    # parsing); quoting pare_dir keeps the directory part from being split
    # or globbed. pare_dir must already be set.
    fastqc "${pare_dir}"/trim/*gz -o "${pare_dir}/trim/" -t 10
    #### QC report
    # NOTE(review): multiqc is pointed at ./, so presumably this is run from
    # the directory that holds the FastQC output - confirm.
    multiqc ./ -n trim_multiqc_report.html
    

    相关文章

      网友评论

        本文标题:Bulk RNAseq上游比对2:下载数据、质控

        本文链接:https://www.haomeiwen.com/subject/hkzvgltx.html