美文网首页泛基因组组学
vg: 染色体结构变异SV检测

vg: 染色体结构变异SV检测

作者: 胡童远 | 来源:发表于2021-12-17 17:48 被阅读0次

    文章

    标题:Genotyping structural variants in pangenome graphs using the vg toolkit
    中文:使用vg对泛基因组结构变异进行基因分型
    杂志:Genome Biol.
    时间:2020
    被引:66(谷歌学术2021.12.17)

    Github: https://github.com/vgteam/vg
    Bioconda: https://anaconda.org/bioconda/vg

    conda安装

    conda create -n SV
    conda activate SV
    conda install -c bioconda vg
    # version v1.36.0
    

    conda安装中没找到测试数据,因此重新下载Linux同版vg v1.36.0
    地址:https://github.com/vgteam/vg/releases/download/v1.36.0/vg-v1.36.0.tar.gz

    主要piplines

    main mapping and calling pipeline:
      -- autoindex     mapping tool-oriented index construction from interchange formats
      -- construct     graph construction
      -- rna           construct splicing graphs and pantranscriptomes
      -- index         index graphs or alignments for random access or mapping
      -- map           MEM-based read alignment
      -- giraffe       fast haplotype-aware short read alignment
      -- mpmap         splice-aware multipath alignment of short reads
      -- augment       augment a graph from an alignment
      -- pack          convert alignments to a compact coverage index
      -- call          call or genotype VCF variants
      -- help          show all subcommands
    

    1 Variation graph construction

    # 工作地址
    #/hwfssz1/ST_HEALTH/P18Z10200N0423/hutongyuan/softwares/vg-v1.36.0/test
    cd /hwfssz1/ST_HEALTH/P18Z10200N0423/hutongyuan/softwares/vg-v1.36.0/test/
    mkdir small_out
    vg construct \
    -r small/x.fa \
    -v small/x.vcf.gz \
    > small_out/x.vg
    # x.vg是一个二进制文件
    

    参数
    x.fa 基因组
    x.vcf.gz 基因组VCF
    -r, --reference FILE
    -v, --vcf FILE

    2 Viewing, conversion

    # GFA output
    vg view small_out/x.vg > small_out/x.gfa
    
    # dot output suitable for graphviz
    vg view -d small_out/x.vg > small_out/x.dot
    
    # And if you have a GAM file
    cp small/x-s1337-n1.gam small_out/x.gam
    
    # json version of binary alignments
    vg view -a small_out/x.gam > small_out/x.json
    

    3 Mapping

    # construct the graph (paths below assume running from `vg/test` directory)
    vg construct -r small/x.fa -v small/x.vcf.gz > small_out/x.vg
    
    # store the graph in the xg/gcsa index pair
    vg index \
    -x small_out/x.xg \
    -g small_out/x.gcsa \
    -k 16 small_out/x.vg
    
    # align a read to the indexed version of the graph
    # note that the graph file is not opened, but x.vg.index is assumed
    vg map -s CTACTGACAGCAGAAGTTTGCTGTGAAGATTAAATTAGGTGATGCTTG \
    -x small_out/x.xg \
    -g small_out/x.gcsa \
    > small_out/read.gam
    
    # simulate a bunch of 150bp reads from the graph, one per line
    vg sim \
    -n 1000 -l 150 \
    -x small_out/x.xg \
    > small_out/x.sim.txt
    
    # now map these reads against the graph to get a GAM
    vg map \
    -T small_out/x.sim.txt \
    -x small_out/x.xg \
    -g small_out/x.gcsa \
    > small_out/aln.gam
    
    # surject the alignments back into the reference space of sequence "x", yielding a BAM file
    vg surject \
    -x small_out/x.xg \
    -b small_out/aln.gam \
    > small_out/aln.bam
    
    # or alternatively, surject them to BAM in the call to map
    vg sim -n 1000 -l 150 \
    -x small_out/x.xg \
    > small_out/x.sim.txt
    
    vg map \
    -T small_out/x.sim.txt \
    -x small_out/x.xg \
    -g small_out/x.gcsa --surject-to bam \
    > small_out/aln.bam
    

    4 Augmentation

    vg augment \
    small_out/x.vg \
    small_out/aln.gam \
    -A small_out/aug.gam \
    > small_out/aug.vg
    
    vg augment \
    small_out/x.vg \
    small_out/aln.gam -i -S \
    > small_out/aug_with_paths.vg
    

    5 Variant Calling

    Calling variants using read support

    # Compute the read support from the gam
    # -Q 5: ignore mapping and base qualitiy < 5
    # -s 5: ignore first and last 5bp from each read
    vg pack \
    -x small_out/x.xg \
    -g small_out/aln.gam \
    -Q 5 -s 5 \
    -o small_out/aln.pack
    
    # Generate a VCF from the support.  
    vg call small_out/x.xg \
    -k small_out/aln.pack \
    > small_out/graph_calls.vcf
    
    ############################################
    vg call \
    small_out/x.xg \
    -k small_out/aln.pack -a \
    > small_out/snarl_genotypes.vcf
    
    ############################################
    # Index our augmented graph
    vg index \
    small_out/aug.vg \
    -x small_out/aug.xg
    
    # Compute the read support from the augmented gam (ignoring qualitiy < 5, and 1st and last 5bp of each read)
    vg pack \
    -x small_out/aug.xg \
    -g small_out/aug.gam \
    -Q 5 -s 5 \
    -o small_out/aln_aug.pack
    
    # Generate a VCF from the support
    vg call small_out/aug.xg \
    -k small_out/aln_aug.pack \
    > small_out/calls.vcf
    
    #########################################
    # Re-construct the same graph as before but with `-a`
    vg construct \
    -r small/x.fa \
    -v small/x.vcf.gz -a \
    > small_out/xa.vg
    
    # Index the graph with `-L' to preserve alt paths in the xg
    vg index small_out/xa.vg -x small_out/xa.xg -L
    
    # Compute the support (we could also reuse aln.pack from above)
    vg pack \
    -x small_out/xa.xg \
    -g small_out/aln.gam \
    -o small_out/aln.pack
    
    # Genotype the VCF (use -v)
    vg \
    call small_out/xa.xg \
    -k small_out/aln.pack \
    -v small/x.vcf.gz \
    > small_out/genotypes.vcf
    
    ##################################################
    # filter secondary and ambiguous read mappings out of the gam
    vg filter \
    small_out/aln.gam \
    -r 0.90 -fu -m 1 -q 15 -D 999 \
    -x small_out/x.xg \
    > small_out/aln.filtered.gam
    
    # then compute the support from aln.filtered.gam instead of aln.gam in above etc.
    vg pack \
    -x small_out/xa.xg \
    -g small_out/aln.filtered.gam \
    -o small_out/aln.pack
    
    vg call \
    small_out/xa.xg \
    -k small_out/aln.pack \
    -v small/x.vcf.gz \
    > small_out/genotypes.vcf
    
    vg snarls small_out/x.xg \
    > small_out/x.snarls
    
    # load snarls from a file instead of computing on the fly
    vg call \
    small_out/x.xg \
    -k small_out/aln.pack \
    -r small_out/x.snarls \
    > small_out/calls.vcf
    

    Calling variants from paths in the graph

    mkdir small_hsp
    
    # create a graph from a multiple alignment of HLA haplotypes (from vg/test directory)
    vg msga \
    -f GRCh38_alts/FASTA/HLA/V-352962.fa \
    -t 1 -k 16 | vg mod -U 10 - | vg mod -c - \
    > small_hsp/hla.vg
    
    # index it
    vg index \
    small_hsp/hla.vg \
    -x small_hsp/hla.xg
    
    # generate a VCF using gi|568815592:29791752-29792749 as the reference contig.  The other paths will be considered as haploid samples
    vg deconstruct \
    small_hsp/hla.xg \
    -e -p "gi|568815592:29791752-29792749" \
    > small_hsp/hla_variants.vcf
    
    

    相关文章

      网友评论

        本文标题:vg: 染色体结构变异SV检测

        本文链接:https://www.haomeiwen.com/subject/gmxifrtx.html