生信格式VCF

作者: 泥人吴 | 来源:发表于2018-12-15 14:29 被阅读332次

    资料推荐

    VCF格式文件的shell小练习

    1.把突变记录的vcf文件区分成 INDEL和SNP条目
    # SNP:
    vip39@VM-0-15-ubuntu:~/test/bowtie2-2.3.4.3-linux-x86_64/example/reads$ cat tmp.vcf | grep -v '^#'|less -S|awk '{if (length($4)==1 && length($5)==1) print}'
    gi|9626243|ref|NC_001416.1| 1104    .   C   A   225 .   DP=43;VDB=0.162843;SGB=-0.693079;MQSB=0.981133;MQ0F=0;AC=2;AN=2;DP4=0,0,8,21;MQ=41  GT:PL   1/1:255,84,0
    gi|9626243|ref|NC_001416.1| 1344    .   G   T   225 .   DP=37;VDB=0.273288;SGB=-0.690438;MQSB=1;MQ0F=0;AC=2;AN=2;DP4=0,0,9,8;MQ=42  GT:PL   1/1:255,51,0
    gi|9626243|ref|NC_001416.1| 2143    .   C   G   225 .   DP=46;VDB=0.902087;SGB=-0.692831;MQSB=1;MQ0F=0;AC=2;AN=2;DP4=0,0,10,14;MQ=42    GT:PL   1/1:255,72,0
    gi|9626243|ref|NC_001416.1| 3316    .   T   C   225 .   DP=59;VDB=0.712644;SGB=-0.69311;MQSB=0.899452;MQ0F=0;AC=2;AN=2;DP4=0,0,18,13;MQ=41  GT:PL   1/1:255,93,0
    gi|9626243|ref|NC_001416.1| 3406    .   G   T   218 .   DP=40;VDB=0.0470228;SGB=-0.69168;MQSB=0.920044;MQ0F=0;AC=2;AN=2;DP4=0,0,10,9;MQ=41  GT:PL   1/1:248,54,0
    gi|9626243|ref|NC_001416.1| 5812    .   A   C   24.4299 .   DP=13;VDB=0.0618664;SGB=-0.511536;MQSB=1;MQ0F=0;AC=2;AN=2;DP4=0,0,2,1;MQ=30 GT:PL   1/1:54,9,0
    gi|9626243|ref|NC_001416.1| 7089    .   A   C   208 .   DP=26;VDB=0.135432;SGB=-0.688148;MQSB=1;MQ0F=0;AC=2;AN=2;DP4=0,0,3,12;MQ=42 GT:PL   1/1:238,45,0
    gi|9626243|ref|NC_001416.1| 9632    .   T   A   97  .   DP=17;VDB=0.677364;SGB=-0.636426;MQSB=1.01283;MQ0F=0;AC=2;AN=2;DP4=0,0,3,4;MQ=42    GT:PL   1/1:154,44,29
    gi|9626243|ref|NC_001416.1| 9642    .   T   A   132 .   DP=20;VDB=0.959419;SGB=-0.670168;MQSB=1.00775;MQ0F=0;AC=2;AN=2;DP4=0,0,4,6;MQ=42    GT:PL   1/1:162,30,0
    gi|9626243|ref|NC_001416.1| 12512   .   G   A   225 .   DP=46;VDB=0.828249;SGB=-0.692562;MQSB=0.261423;MQ0F=0;AC=2;AN=2;DP4=0,0,14,8;MQ=41  GT:PL   1/1:255,66,0
    gi|9626243|ref|NC_001416.1| 14897   .   G   C   225 .   DP=38;VDB=0.449535;SGB=-0.689466;MQSB=0.976745;MQ0F=0;AC=2;AN=2;DP4=0,0,10,6;MQ=41  GT:PL   1/1:255,48,0
    ...
    
    # indel:
    vip39@VM-0-15-ubuntu:~/test/bowtie2-2.3.4.3-linux-x86_64/example/reads$ cat tmp.vcf | grep -v '^#'|less -S|awk '{if (length($4)!=1 || length($5)!=1) print}'
    gi|9626243|ref|NC_001416.1| 2   .   GGCG    GGCGCGGGGGCG    9.81282 .   INDEL;IDV=1;IMF=0.5;DP=2;VDB=0.02;SGB=-0.379885;MQ0F=0;AC=2;AN=2;DP4=1,0,1,0;MQ=33  GT:PL   1/1:36,1,0
    gi|9626243|ref|NC_001416.1| 245 .   ATT AT  157 .   INDEL;IDV=33;IMF=0.942857;DP=35;VDB=0.518706;SGB=-0.692562;MQSB=0.121547;MQ0F=0;ICB=1;HOB=0.5;AC=1;AN=2;DP4=6,7,13,9;MQ=36  GT:PL   0/1:192,0,23
    gi|9626243|ref|NC_001416.1| 351 .   ATGCTGAAATT A   108 .   INDEL;IDV=1;IMF=0.0285714;DP=35;VDB=0.511365;SGB=-0.688148;MQSB=0.406871;MQ0F=0;ICB=1;HOB=0.5;AC=1;AN=2;DP4=8,12,8,7;MQ=28  GT:PL   0/1:141,0,110
    gi|9626243|ref|NC_001416.1| 353 .   GCTGAAATTGA G   215 .   INDEL;IDV=24;IMF=0.685714;DP=35;VDB=0.58815;SGB=-0.692976;MQSB=0.711476;MQ0F=0;AC=2;AN=2;DP4=4,5,13,13;MQ=28    GT:PL   1/1:244,0,3
    gi|9626243|ref|NC_001416.1| 2817    .   GA  G   175 .   INDEL;IDV=41;IMF=0.911111;DP=45;VDB=0.367939;SGB=-0.693136;MQSB=0.45851;MQ0F=0;ICB=1;HOB=0.5;AC=1;AN=2;DP4=3,7,20,15;MQ=41  GT:PL   0/1:210,0,28
    gi|9626243|ref|NC_001416.1| 2951    .   ACCC    A   159 .   INDEL;IDV=1;IMF=0.0322581;DP=31;VDB=0.195299;SGB=-0.691153;MQSB=0.340099;MQ0F=0;ICB=1;HOB=0.5;AC=1;AN=2;DP4=5,8,9,9;MQ=38   GT:PL   0/1:193,0,16
    gi|9626243|ref|NC_001416.1| 2952    .   CCCACC  CCC 228 .   INDEL;IDV=28;IMF=0.903226;DP=31;VDB=0.210377;SGB=-0.692831;MQSB=0.340099;MQ0F=0;AC=2;AN=2;DP4=2,5,12,12;MQ=38 GT:PL 1/1:255,27,0
    gi|9626243|ref|NC_001416.1| 3262    .   GCC GC  152 .   INDEL;IDV=1;IMF=0.0217391;DP=46;VDB=0.483903;SGB=-0.69311;MQSB=0.887966;MQ0F=0;ICB=1;HOB=0.5;AC=1;AN=2;DP4=7,8,13,18;MQ=41  GT:PL   0/1:187,0,17
    gi|9626243|ref|NC_001416.1| 3264    .   CA  C   150 .   INDEL;IDV=41;IMF=0.803922;DP=51;VDB=0.874867;SGB=-0.69312;MQSB=0.94394;MQ0F=0;ICB=1;HOB=0.5;AC=1;AN=2;DP4=10,9,14,18;MQ=41  GT:PL   0/1:184,0,26
    gi|9626243|ref|NC_001416.1| 3634    .   ACGC    AC  228 .   INDEL;IDV=25;IMF=0.892857;DP=28;VDB=0.621512;SGB=-0.692067;MQSB=1;MQ0F=0;AC=2;AN=2;DP4=3,5,15,5;MQ=41   GT:PL 1/1:255,19,0
    gi|9626243|ref|NC_001416.1| 6290    .   GT  G   167 .   INDEL;IDV=38;IMF=0.926829;DP=41;VDB=0.910269;SGB=-0.693097;MQSB=0.903761;MQ0F=0;ICB=1;HOB=0.5;AC=1;AN=2;DP4=7,4
    ... 
    
    2.统计INDEL和SNP条目的各自的平均测序深度
    3.把INDEL条目再区分成insertion和deletion情况
    # insertion
    vip39@VM-0-15-ubuntu:~/test/bowtie2-2.3.4.3-linux-x86_64/example/reads$ cat tmp.vcf | grep -v '^#'|less -S|awk '{if (length($4)<length($5)) print}'
    gi|9626243|ref|NC_001416.1| 2   .   GGCG    GGCGCGGGGGCG    9.81282 .   INDEL;IDV=1;IMF=0.5;DP=2;VDB=0.02;SGB=-0.379885;MQ0F=0;AC=2;AN=2;DP4=1,0,1,0;MQ=33  GT:PL1/1:36,1,0
    
    # deletion
    vip39@VM-0-15-ubuntu:~/test/bowtie2-2.3.4.3-linux-x86_64/example/reads$ cat tmp.vcf | grep -v '^#'|less -S|awk '{if (length($4)>length($5)) print}'
    gi|9626243|ref|NC_001416.1| 245 .   ATT AT  157 .   INDEL;IDV=33;IMF=0.942857;DP=35;VDB=0.518706;SGB=-0.692562;MQSB=0.121547;MQ0F=0;ICB=1;HOB=0.5;AC=1;AN=2;DP4=6,7,13,9;MQ=36  GT:PL   0/1:192,0,23
    gi|9626243|ref|NC_001416.1| 351 .   ATGCTGAAATT A   108 .   INDEL;IDV=1;IMF=0.0285714;DP=35;VDB=0.511365;SGB=-0.688148;MQSB=0.406871;MQ0F=0;ICB=1;HOB=0.5;AC=1;AN=2;DP4=8,12,8,7;MQ=28  GT:PL   0/1:141,0,110
    gi|9626243|ref|NC_001416.1| 353 .   GCTGAAATTGA G   215 .   INDEL;IDV=24;IMF=0.685714;DP=35;VDB=0.58815;SGB=-0.692976;MQSB=0.711476;MQ0F=0;AC=2;AN=2;DP4=4,5,13,13;MQ=28    GT:PL   1/1:244,0,3
    gi|9626243|ref|NC_001416.1| 2817    .   GA  G   175 .   INDEL;IDV=41;IMF=0.911111;DP=45;VDB=0.367939;SGB=-0.693136;MQSB=0.45851;MQ0F=0;ICB=1;HOB=0.5;AC=1;AN=2;DP4=3,7,20,15;MQ=41  GT:PL   0/1:210,0,28
    gi|9626243|ref|NC_001416.1| 2951    .   ACCC    A   159 .   INDEL;IDV=1;IMF=0.0322581;DP=31;VDB=0.195299;SGB=-0.691153;MQSB=0.340099;MQ0F=0;ICB=1;HOB=0.5;AC=1;AN=2;DP4=5,8,9,9;MQ=38   GT:PL   0/1:193,0,16
    gi|9626243|ref|NC_001416.1| 2952    .   CCCACC  CCC 228 .   INDEL;IDV=28;IMF=0.903226;DP=31;VDB=0.210377;SGB=-0.692831;MQSB=0.340099;MQ0F=0;AC=2;AN=2;DP4=2,5,12,12;MQ=38   GT:PL   1/1:255,27,0
    gi|9626243|ref|NC_001416.1| 3262    .   GCC GC  152 .   INDEL;IDV=1;IMF=0.0217391;DP=46;VDB=0.483903;SGB=-0.69311;MQSB=0.887966;MQ0F=0;ICB=1;HOB=0.5;AC=1;AN=2;DP4=7,8,13,18;MQ=41  GT:PL   0/1:187,0,17
    gi|9626243|ref|NC_001416.1| 3264    .   CA  C   150 .   INDEL;IDV=41;IMF=0.803922;DP=51;VDB=0.874867;SGB=-0.69312;MQSB=0.94394;MQ0F=0;ICB=1;HOB=0.5;AC=1;AN=2
    ...
    
    4.统计SNP条目的突变组合分布频率
    vip39@VM-0-15-ubuntu:~/test/bowtie2-2.3.4.3-linux-x86_64/example/reads$ cat tmp.vcf |grep -v '^#'|awk '{if (length($4)==1 && length($5)==1) print}'|cut -f4,5|sort|uniq -c
          7 A   C
          1 A   G
          4 A   T
          2 C   A
          4 C   G
          3 G   A
          2 G   C
          4 G   T
          6 T   A
          1 T   C
          2 T   G
    
    5.找到基因型不是 1/1 的条目,个数
    vip39@VM-0-15-ubuntu:~/test/bowtie2-2.3.4.3-linux-x86_64/example/reads$ cat tmp.vcf |grep -v '^#'|awk '{ print $10}'|grep -v '^1/1'
    0/1:192,0,23
    0/1:141,0,110
    0/1:210,0,28
    0/1:193,0,16
    0/1:187,0,17
    0/1:184,0,26
    0/1:202,0,22
    0/1:116,0,41
    0/1:254,0,30
    0/1:180,0,38
    0/1:184,0,41
    0/1:167,0,21
    0/1:197,0,115
    0/1:192,0,37
    0/1:45,0,87
    0/1:47,0,128
    0/1:44,0,157
    0/1:255,0,13
    0/1:239,0,13
    0/1:198,0,20
    0/1:236,0,93
    0/1:194,0,94
    0/1:195,0,56
    0/1:67,0,168
    0/1:255,0,20
    vip39@VM-0-15-ubuntu:~/test/bowtie2-2.3.4.3-linux-x86_64/example/reads$ cat tmp.vcf |grep -v '^#'|awk '{ print $10}'|grep -v '^1/1'|wc
         25      25     326
    
    6.筛选测序深度大于20的条目
    7.筛选变异位点质量值大于30的条目
    vip39@VM-0-15-ubuntu:~/test/bowtie2-2.3.4.3-linux-x86_64/example/reads$ cat tmp.vcf |awk '{if ($6>30) print}'|grep -v '^#'|less -S|head
    gi|9626243|ref|NC_001416.1| 245 .   ATT AT  157 .   INDEL;IDV=33;IMF=0.942857;DP=35;VDB=0.518706;SGB=-0.692562;MQSB=0.121547;MQ0F=0;ICB=1;HOB=0.5;AC=1;AN=2;DP4=6,7,13,9;MQ=36  GT:PL   0/1:192,0,23
    gi|9626243|ref|NC_001416.1| 351 .   ATGCTGAAATT A   108 .   INDEL;IDV=1;IMF=0.0285714;DP=35;VDB=0.511365;SGB=-0.688148;MQSB=0.406871;MQ0F=0;ICB=1;HOB=0.5;AC=1;AN=2;DP4=8,12,8,7;MQ=28  GT:PL   0/1:141,0,110
    gi|9626243|ref|NC_001416.1| 353 .   GCTGAAATTGA G   215 .   INDEL;IDV=24;IMF=0.685714;DP=35;VDB=0.58815;SGB=-0.692976;MQSB=0.711476;MQ0F=0;AC=2;AN=2;DP4=4,5,13,13;MQ=28    GT:PL   1/1:244,0,3
    gi|9626243|ref|NC_001416.1| 1104    .   C   A   225 .   DP=43;VDB=0.162843;SGB=-0.693079;MQSB=0.981133;MQ0F=0;AC=2;AN=2;DP4=0,0,8,21;MQ=41  GT:PL   1/1:255,84,0
    gi|9626243|ref|NC_001416.1| 1344    .   G   T   225 .   DP=37;VDB=0.273288;SGB=-0.690438;MQSB=1;MQ0F=0;AC=2;AN=2;DP4=0,0,9,8;MQ=42  GT:PL   1/1:255,51,0
    gi|9626243|ref|NC_001416.1| 2143    .   C   G   225 .   DP=46;VDB=0.902087;SGB=-0.692831;MQSB=1;MQ0F=0;AC=2;AN=2;DP4=0,0,10,14;MQ=42    GT:PL   1/1:255,72,0
    gi|9626243|ref|NC_001416.1| 2817    .   GA  G   175 .   INDEL;IDV=41;IMF=0.911111;DP=45;VDB=0.367939;SGB=-0.693136;MQSB=0.45851;MQ0F=0;ICB=1;HOB=0.5;AC=1;AN=2;DP4=3,7,20,15;MQ=41  GT:PL   0/1:210,0,28
    gi|9626243|ref|NC_001416.1| 2951    .   ACCC    A   159 .   INDEL;IDV=1;IMF=0.0322581;DP=31;VDB=0.195299;SGB=-0.691153;MQSB=0.340099;MQ0F=0;ICB=1;HOB=0.5;AC=1;AN=2;DP4=5,8,9,9;MQ=38   GT:PL   0/1:193,0,16
    gi|9626243|ref|NC_001416.1| 2952    .   CCCACC  CCC 228 .   INDEL;IDV=28;IMF=0.903226;DP=31;VDB=0.210377;SGB=-0.692831;MQSB=0.340099;MQ0F=0;AC=2;AN=2;DP4=2,5,12,12;MQ=38   GT:PL   1/1:255,27,0
    gi|9626243|ref|NC_001416.1| 3262    .   GCC GC  152 .   INDEL;IDV=1;IMF=0.0217391;DP=46;VDB=0.483903;SGB=-0.69311;MQSB=0.887966;MQ0F=0;ICB=1;HOB=0.5;AC=1;AN=2;DP4=7,8,13,18;MQ=41  GT:PL   0/1:187,0,17
    
    8.组合筛选变异位点质量值大于30并且深度大于20的条目
    9. 理解DP4=4,7,11,18 这样的字段,就是 Number of high-quality ref-forward , ref-reverse, alt-forward and alt-reverse bases 计算每个变异位点的 AF
    10.在前面步骤的bam文件里面找到这个vcf文件的某一个突变位点的测序深度表明的那些reads,并且在IGV里面可视化bam和vcf定位到该变异位点。


    相关文章

      网友评论

        本文标题:生信格式VCF

        本文链接:https://www.haomeiwen.com/subject/ucpnhqtx.html