FG12组装注释全过程

作者: 马连洼小法师 | 来源:发表于2021-07-21 17:13 被阅读0次

    测序:

    RNA-seq:100X
    nanopore:80X
    resequence:30X

    质控:

    使用fastp对RNAseq和Resequence测序数据进行质控
    Resequence Q20,Q30 皆为100%
    RNAseq Q20,90% Q30,89%

    基因组Survey

    FG12

    jellyfish.sh

    pre=Kmer_19
    
    ls  ../resequence/FG12C/1.Cleandata/FG12C.IS300-400bp_Clean.*.fq.gz | awk  '{print "gzip -dc "$0 }' > generate.file
    /ifs1/Software/bin/jellyfish count -t 4 -C -m 19 -s 1G  -g generate.file -G 2  -o $pre 
    /ifs1/Software/bin/jellyfish histo -v -o $pre.histo $pre -t 4 -h 10000
    /ifs1/Software/bin/jellyfish stats $pre -o $pre.stat
    
    
    image.png image.png
    /ifs1/Grp8/haozhigang/training20200405/genomics_training/41.Assembly/01.kmer_analysis/gce-1.0.2/gce -f Kmer_19.histo -c 26  -g 1140936461  -M 10000 >gce.table 2>gce.log
    
    
    gce.log
    Rscript ~/training20200405/genomics_training/41.Assembly/01.kmer_analysis/genomescope-master/genomescope.R Kmer_19.histo 19 150 ./ 100000
    
    
    summary.txt

    FG7

    pre=Kmer_19
    
    ls  ../resequence/FG7C/1.Cleandata/FG7C.IS300-400bp_Clean.*.fq.gz | awk  '{print "gzip -dc "$0 }' > generate.file
    /ifs1/Software/bin/jellyfish count -t 4 -C -m 19 -s 1G  -g generate.file -G 2  -o $pre 
    /ifs1/Software/bin/jellyfish histo -v -o $pre.histo $pre -t 4 -h 10000
    /ifs1/Software/bin/jellyfish stats $pre -o $pre.stat
    
    
    image.png image.png
    /ifs1/Grp8/haozhigang/training20200405/genomics_training/41.Assembly/01.kmer_analysis/gce-1.0.2/gce -f Kmer_19.histo -c 26  -g 1141618012 -M 10000 >gce.table 2>gce.log
    
    
    gce.log
    Rscript ~/training20200405/genomics_training/41.Assembly/01.kmer_analysis/genomescope-master/genomescope.R Kmer_19.histo 19 150 ./ 100000
    
    
    summary.txt image.png

    基因组拼接

    使用wtdbg2

    ## 组装
    /ifs1/Software/bin/wtdbg2 \
        -t 6 -x ont -g 37M -L 500 -l 500 -e 2  \
        -i /ifs1/Grp8/haozhigang/nanopore_data/FG11C/1.Cleandata/FG11C.filtered_reads.fq.gz \
        -o FG11_ont
    
    ## 得到一致性序列
    ###  wtdbg-cns 2分钟
    /ifs1/Software/biosoft/wtdbg2/wtdbg-cns \
        -t 6 \
        -i FG11_ont.ctg.lay.gz \
        -f \
        -o FG11_ont.wtdbg-cns.fa
    
    ### wtpoa-cns 稍慢 12 分钟
    /ifs1/Software/biosoft/wtdbg2/wtpoa-cns\
        -t 6 \
        -i FG11_ont.ctg.lay.gz \
        -f \
        -o FG11_ont.wtpoa-cns.fa
    
    

    基因组pilon

    draft_genome=/ifs1/Grp8/haozhigang/training20200405/genomics_training/41.Assembly/06.assembly_wtdbg2/FG12_ont.wtpoa-cns.fa
     fq1=/ifs1/Grp8/haozhigang/RNAseq/Filter_SOAPnuke/Clean/FG12B/FG12B_clean_1.fq.gz
     fq2=/ifs1/Grp8/haozhigang/RNAseq/Filter_SOAPnuke/Clean/FG12B/FG12B_clean_2.fq.gz
    
    
     
    
     ln -s $draft_genome ./genome_FG12.fa
     bwa index genome_FG12.fa
    
    
     bwa mem -t 6  genome_FG12.fa  $fq1 $fq2  | /ifs1/Software/bin/samtools sort - -o reads_FG12.bam
     samtools index reads_FG12.bam
    
     java -Xmx80G -jar /ifs1/Software/biosoft/pilon/pilon-1.23.jar \
     --genome $draft_genome \
     --frags reads_FG12.bam  \
     --changes --vcf --diploid --threads 6 \
     --outdir ./pilon_out_FG12 --output genome_pilon_FG12
    
    
    image.png

    gc-depth

    minimap2 -ax map-ont genome_pilon_FG12.fasta FG12C.filtered_reads.fq.gz| /pub/software/samtools/samtools sort - -o aln_sort.bam
    
    genome=genome_pilon_FG12.fasta
    bam=aln_sort.bam
    prefix=gcdep
    window=500
    step=250
    
    # 计算序列长度
    seqtk comp  $genome  | awk '{print $1"\t"$2}' > $prefix.len
    
    # 划分窗口 生成bed文件
    bedtools  makewindows -w $window -s $step -g $prefix.len > $prefix.window.bed
    
    # 按窗口提取序列并计算gc含量
    seqtk subseq $genome  $prefix.window.bed  > $prefix.window.fasta
    seqtk comp  $prefix.window.fasta |awk '{print $1 "\t" ($4+$5)/($3+$4+$5+$6) } ' > $prefix.window.gc
    
    # 按窗口计算平均深度
    bedtools  coverage -b aln_sort.bam -a gcdep.window.bed -mean  | awk '{print $1":"$2+1"-"$3"\t"$4}' >  $prefix.window.depth
    
    # 绘图
    Rscript  run_gcdep.R $prefix.window.gc $prefix.window.depth $prefix.pdf 0 0.8 0 500
    
    
    
    gc-depth

    由图可知,存在一部分低GC含量的片段,推测可能是污染所以,一方面比对NT库,另一方面截取低GC区的片段,重新进行比对。

    NT库比对命令及结果

    blastn -db nt -query genome_pilon_FG12.fasta -outfmt 10 -num_alignments 5 -max_hsps 1 -out blast-output.m6
    
    
    NT库比对结果

    低GC区片段

    #寻找小于30%片段
    seqtk comp  gcdep.window.fasta |awk '{print $1 "\t" ($4+$5)/($3+$4+$5+$6) } ' |awk '$2<0.3'|less -S
    
    seqtk subseq ../genome_pilon_FG12.fasta sample.txt>1.txt
    nucmer -c 200 -g 200 -p out 1.txt PH-1.fa
    show-coords -c -d -l -I 95 -L 10000 -r out.delta > out.show
    mummerplot -f -l -p out -s large -t png out.delta
    /pub/software/gnuplot4.6.7/bin/gnuplot out.gp
    
    
    低GC片段比对PH-1

    由图可知,低GC区域片段全部比对到PH-1上面,不是污染

    MUMER作图

    nucmer -c 200 -g 200 -p out genome_pilon_FG12.fasta PH-1.fa
    show-coords -c -d -l -I 95 -L 10000 -r out.delta > out.show
    mummerplot -f -l -p out -s large -t png out.delta
    /pub/software/gnuplot4.6.7/bin/gnuplot out.gp
    
    mummer

    由图可知,ctg2和ctg5为一条序列,所以加一个中间序列100N,cat命令。最后得到4条序列和一条染色体。

    image.png

    Repeat注释

    # 构建数据库
    /pub/software/RepeatModeler-2.0.1/BuildDatabase -name sesame test_data/genome.fasta
    
    # 运行 RepeatModeler
    /pub/software/RepeatModeler-2.0.1/RepeatModeler -database sesame -pa 20 -LTRStruct
    
    # 运行 RepeatMasker
    /pub/software/RepeatMasker/RepeatMasker -pa 20 -qq -lib sesame-families.fa test_data/genome.fasta >repeatmasker.log 2>&1
    
    # 生成RepeatLandscape
    /pub/software/RepeatMasker/util/calcDivergenceFromAlign.pl -s sesame.divsum test_data/genome.fasta.cat
    
    perl /pub/software/RepeatMasker/util/createRepeatLandscape.pl -div sesame.divsum -g 18577337 > sesame.html
    
    

    相关文章

      网友评论

        本文标题:FG12组装注释全过程

        本文链接:https://www.haomeiwen.com/subject/iowsbktx.html