美文网首页population遗传学基因组组装
用snpEff产出的vcf提取4DTv位点,构建进化树

用snpEff产出的vcf提取4DTv位点,构建进化树

作者: wo_monic | 来源:发表于2020-12-04 20:06 被阅读0次

    用snpeff产出的vcf提取4DTv位点,用于构建进化树 转载自https://blog.csdn.net/u012110870/article/details/105507476

    徐州更提到:2019年发表于 Nature Genetics(NG)的414份西瓜重测序研究,
    在其分析方法里,是从重测序得到的SNP中挑选4DTv(四重简并)位点来构建进化树的。
    以下代码来源于徐州更,运行方法:
    python3 calc_4dTv_in_eff_vcf.py input.vcf output.vcf ref.fa

    从snpEff注释的vcf文件中提取4DTv位点的vcf.

    calc_4dTv_in_eff_vcf.py代码如下所示:

    #!/usr/bin/env python3
     
    from sys import argv
    from pysam import VariantFile
    from pysam import FastaFile
     
    # Command-line arguments: snpEff-annotated input VCF, output VCF, reference FASTA
    file_in = argv[1]
    file_out = argv[2]
    fafile = argv[3]

    # First two codon bases of the codon families in which any third base encodes the
    # same amino acid (TCN, CTN, CCN, CGN, ACN, GTN, GCN, GGN)
    codon = set(["TC", "CT", "CC", "CG", "AC", "GT", "GC", "GG"])
    # Base complements, used to read the codon on the reverse strand
    rev_dict = dict(A='T',T='A', C='G', G='C')

    bcf_in = VariantFile(file_in)
    bcf_out = VariantFile(file_out, "w", header = bcf_in.header)
    fa_in = FastaFile(fafile)

    for rec in bcf_in.fetch():
        # Only the first annotation of the record is inspected
        info = rec.info['ANN'][0].split('|')
        # only use synonymous variants
        if info[1] != "synonymous_variant":
            continue
        # info[9] is expected to look like "c.123A>G" (presumably the HGVS.c field of the
        # snpEff ANN annotation -- confirm against the snpEff version in use); the slice
        # keeps the coding position. Only the 3rd codon position can be 4dTv.
        if int(info[9][2:-3]) % 3 != 0:
            continue

        # Determine the strand by comparing the REF column with the reference base of the
        # annotation: if they agree the two bases before the site are the codon prefix,
        # otherwise the prefix is the reverse complement of the two bases after the site.
        # The fetched sequence is upper-cased so that soft-masked (lowercase) references
        # are handled; the original called tmp.upper() but discarded the result.
        if rec.ref == info[9][-3]:
            pre = fa_in.fetch(rec.chrom, rec.pos-3, rec.pos-1).upper()
        else:
            tmp = fa_in.fetch(rec.chrom, rec.pos, rec.pos+2).upper()
            # Skip sites whose context is truncated or contains non-ACGT bases (e.g. N)
            # instead of failing with a KeyError/IndexError
            if len(tmp) < 2 or tmp[0] not in rev_dict or tmp[1] not in rev_dict:
                continue
            pre = rev_dict[tmp[1]] + rev_dict[tmp[0]]
        if pre not in codon:
            continue
        bcf_out.write(rec)

    # Close the files explicitly so the output VCF is completely flushed to disk
    bcf_out.close()
    bcf_in.close()
    fa_in.close()
    

    把4DTV的vcf转换为phylip软件需要的phy格式 vcf2phylip地址

    vcf2phylip.py代码如下:

    #!/usr/bin/env python3
    # -*- coding: utf-8 -*-
    
    
    """
    The script converts a collection of SNPs in VCF format into a PHYLIP, FASTA, 
    NEXUS, or binary NEXUS file for phylogenetic analysis. The code is optimized
    to process VCF files with sizes >1GB. For small VCF files the algorithm slows
    down as the number of taxa increases (but is still fast).
    
    Any ploidy is allowed, but binary NEXUS is produced only for diploid VCFs.
    """
    
    
    __author__      = "Edgardo M. Ortiz"
    __credits__     = "Juan D. Palacio-Mejía"
    __version__     = "2.4"
    __email__       = "e.ortiz.v@gmail.com"
    __date__        = "2020-10-04"
    
    
    import argparse
    import gzip
    import os
    import random
    import sys
    
    
    # Dictionary of IUPAC ambiguities for nucleotides
    # '*' is a deletion in GATK, deletions are ignored in consensus, lowercase consensus is used when an
    # 'N' or '*' is part of the genotype. Capitalization is used by some software but ignored by Geneious
    # for example
    # Each key is the set of distinct alleles of one genotype, sorted and concatenated into a string
    # (this is how get_matrix_column builds its lookup key); each value is the single character written
    # to the alignment for that genotype.
    ambiguities = {"*"    :"-", "A"    :"A", "C"    :"C", "G"    :"G", "N"    :"N", "T"     :"T",
                   "*A"   :"a", "*C"   :"c", "*G"   :"g", "*N"   :"n", "*T"   :"t",
                   "AC"   :"M", "AG"   :"R", "AN"   :"a", "AT"   :"W", "CG"   :"S",
                   "CN"   :"c", "CT"   :"Y", "GN"   :"g", "GT"   :"K", "NT"   :"t",
                   "*AC"  :"m", "*AG"  :"r", "*AN"  :"a", "*AT"  :"w", "*CG"  :"s",
                   "*CN"  :"c", "*CT"  :"y", "*GN"  :"g", "*GT"  :"k", "*NT"  :"t",
                   "ACG"  :"V", "ACN"  :"m", "ACT"  :"H", "AGN"  :"r", "AGT"  :"D",
                   "ANT"  :"w", "CGN"  :"s", "CGT"  :"B", "CNT"  :"y", "GNT"  :"k",
                   "*ACG" :"v", "*ACN" :"m", "*ACT" :"h", "*AGN" :"r", "*AGT" :"d",
                   "*ANT" :"w", "*CGN" :"s", "*CGT" :"b", "*CNT" :"y", "*GNT" :"k",
                   "ACGN" :"v", "ACGT" :"N", "ACNT" :"h", "AGNT" :"d", "CGNT" :"b",
                   "*ACGN":"v", "*ACGT":"N", "*ACNT":"h", "*AGNT":"d", "*CGNT":"b", "*ACGNT":"N"}
    
    
    # Lookup table that recodes diploid, biallelic genotypes for SNAPP (binary NEXUS):
    #   "?" missing genotype
    #   "0" homozygous reference
    #   "1" heterozygous
    #   "2" homozygous alternative
    # Every genotype is registered with both the unphased ("/") and the phased ("|") separator.
    gen_bin = {
        first + separator + second: code
        for first, second, code in (
            (".", ".", "?"),
            ("0", "0", "0"),
            ("0", "1", "1"),
            ("1", "0", "1"),
            ("1", "1", "2"),
        )
        for separator in ("/", "|")
    }
    
    
    def extract_sample_names(vcf_file):
        """
        Return the sample names listed on the '#CHROM' header line of a VCF file

        The file is read as gzip-compressed text when its name ends in '.gz', as plain text otherwise.
        Sample names are the columns from the tenth onwards, with every './' removed. An empty list
        is returned when the file has no '#CHROM' line.
        """
        open_vcf = gzip.open if vcf_file.endswith(".gz") else open
        with open_vcf(vcf_file, "rt") as handle:
            for raw_line in handle:
                header = raw_line.strip("\n")
                if not header.startswith("#CHROM"):
                    continue
                # Only the first '#CHROM' line is used
                columns = header.split("\t")
                return [name.replace("./", "") for name in columns[9:]]
        return []
    
    
    def is_anomalous(record, num_samples):
        """
        Tell whether a split VCF record has an unexpected number of columns

        A well-formed record has 9 fixed columns followed by one column per sample named in the
        '#CHROM' line; True is returned for any other length.
        """
        expected_columns = 9 + num_samples
        return len(record) != expected_columns
    
    
    def is_snp(record):
        """
        Tell whether a split VCF record describes a SNP rather than an MNP (multinucleotide
        polymorphism)

        REF (column 4) must be one character long, and ALT (column 5) must contain exactly one
        non-comma character more than it contains commas.
        """
        if len(record[3]) != 1:
            return False
        alt = record[4]
        separators = alt.count(",")
        return len(alt) - separators == separators + 1
    
    
    def num_genotypes(record, num_samples):
        """
        Count the samples of a split VCF record that have a genotype

        A sample column that starts with '.' is counted as missing; the result is the number of
        samples minus the number of missing ones.
        """
        missing = sum(1 for i in range(9, num_samples + 9) if record[i].startswith("."))
        return num_samples - missing
    
    
    def get_matrix_column(record, num_samples, resolve_IUPAC):
        """
        Build one alignment column (one character per sample) from a split VCF record

        Allele indices in the genotypes are translated to nucleotides: '0' is REF, '1', '2', ... are
        the comma-separated ALT alleles, and '.' becomes 'N'. A '-' in REF or ALT is rewritten as '*'.
        When resolve_IUPAC is true one allele of each genotype is picked at random, otherwise the
        distinct alleles of the genotype are looked up in the 'ambiguities' table.
        """
        alleles = {"0": record[3].replace("-", "*"), ".": "N"}
        alt_alleles = record[4].replace("-", "*").split(",")
        for number, allele in enumerate(alt_alleles, start=1):
            alleles[str(number)] = allele

        calls = []
        for i in range(9, num_samples + 9):
            # Keep only the GT field and drop the phasing separators, leaving the allele indices
            gt_field = record[i].split(":")[0]
            indices = gt_field.replace("/", "").replace("|", "")
            if resolve_IUPAC:
                calls.append(alleles[random.choice(indices)])
            else:
                key = "".join(sorted({alleles[index] for index in indices}))
                calls.append(ambiguities[key])
        return "".join(calls)
    
    
    def get_matrix_column_bin(record, num_samples):
        """
        If VCF is diploid, return an alignment column in NEXUS binary from a VCF record
        """
        column = ""
        for i in range(9, num_samples + 9):
            genotype = record[i].split(":")[0]
            if len(genotype) == 3:
                column += gen_bin[genotype]
            else:
                column += "?"
        return column
    
    
    def main():
        """
        Command-line entry point: convert the SNPs of a VCF file into alignment matrices

        Parses the command-line options, reads the (optionally gzipped) VCF, skips records with an
        unexpected number of columns, records with fewer genotyped samples than 'min_samples_locus'
        and records that are not SNPs, and writes the requested matrices (PHYLIP unless disabled,
        optionally FASTA, NEXUS and binary NEXUS).

        Accepted sites are first written one per line to temporary files named after the input file;
        those files are then re-read once per sample to transpose sites into sequences, and are
        deleted at the end. Output file names are derived from the input file name.
        """
        parser = argparse.ArgumentParser(description=__doc__, 
                                         formatter_class=argparse.RawDescriptionHelpFormatter)
        parser.add_argument("-i", "--input",
            action = "store",
            dest = "filename",
            required = True,
            help = "Name of the input VCF file, can be gzipped")
        parser.add_argument("-m", "--min-samples-locus",
            action = "store",
            dest = "min_samples_locus",
            type = int,
            default = 4,
            help = "Minimum of samples required to be present at a locus (default=4)")
        parser.add_argument("-o", "--outgroup",
            action = "store",
            dest = "outgroup",
            default = "",
            help = "Name of the outgroup in the matrix. Sequence will be written as first taxon in the "
                   "alignment.")
        parser.add_argument("-p", "--phylip-disable",
            action = "store_true",
            dest = "phylipdisable",
            help = "A PHYLIP matrix is written by default unless you enable this flag")
        parser.add_argument("-f", "--fasta",
            action = "store_true",
            dest = "fasta",
            help = "Write a FASTA matrix, disabled by default")
        parser.add_argument("-n", "--nexus",
            action = "store_true",
            dest = "nexus",
            help = "Write a NEXUS matrix, disabled by default")
        parser.add_argument("-b", "--nexus-binary",
            action = "store_true",
            dest = "nexusbin",
            help = "Write a binary NEXUS matrix for analysis of biallelic SNPs in SNAPP, only diploid "
                   "genotypes will be processed, disabled by default.")
        parser.add_argument("-r", "--resolve-IUPAC",
            action = "store_true",
            dest = "resolve_IUPAC",
            help = "Randomly resolve heterozygous genotypes to avoid IUPAC ambiguities in the matrices")
        parser.add_argument("-v", "--version",
            action = "version",
            version = "%(prog)s {version}".format(version=__version__))
        args = parser.parse_args()
    
    
        filename = args.filename
        min_samples_locus = args.min_samples_locus
        # Only the text before the first ',' or ';' is kept as the outgroup name
        outgroup = args.outgroup.split(",")[0].split(";")[0]
        phylipdisable = args.phylipdisable
        fasta = args.fasta
        nexus = args.nexus
        nexusbin = args.nexusbin
        resolve_IUPAC = args.resolve_IUPAC
    
    
        # Get samples names and number of samples in VCF
        sample_names = extract_sample_names(filename)
        num_samples = len(sample_names)
        if len(sample_names) == 0:
            print("\nSample names not found in VCF, your file may be corrupt or missing the header.\n")
            sys.exit()
        print("\nConverting file '{}':\n".format(filename))
        print("Number of samples in VCF: {:d}".format(len(sample_names)))
    
        # If the 'min_samples_locus' is larger than the actual number of samples in VCF readjust it
        min_samples_locus = min(num_samples, min_samples_locus)
    
        # Output filename will be the same as input file, indicating the minimum of samples specified
        if filename.endswith(".gz"):
            outfile = filename.replace(".vcf.gz",".min"+str(min_samples_locus))
        else:
            outfile = filename.replace(".vcf",".min"+str(min_samples_locus))
        # We need to create an intermediate file to hold the sequence data vertically and then transpose 
        # it to create the matrices
        if fasta or nexus or not phylipdisable:
            temporal = open(outfile+".tmp", "w")
        # If binary NEXUS is selected also create a separate temporal
        if nexusbin:
            temporalbin = open(outfile+".bin.tmp", "w")
    
    
        ##########################
        # PROCESS GENOTYPES IN VCF
    
        if filename.endswith(".gz"):
            opener = gzip.open
        else:
            opener = open
    
        with opener(filename, "rt") as vcf:
            # Initialize line counter
            snp_num = 0
            snp_accepted = 0
            snp_shallow = 0
            mnp_num = 0
            snp_biallelic = 0
    
            while 1:
                # Load large chunks of file into memory
                vcf_chunk = vcf.readlines(50000)
                if not vcf_chunk:
                    break
    
                for line in vcf_chunk:
                    line = line.strip()
    
                    if line and not line.startswith("#"): # skip empty and commented lines
                        # Split line into columns
                        record = line.split("\t")
                        # Keep track of number of genotypes processed
                        snp_num += 1
                        # Print progress every 500000 lines
                        if snp_num % 500000 == 0:
                            print("{:d} genotypes processed.".format(snp_num))
                        if is_anomalous(record, num_samples):
                            print("Skipped potentially malformed line: {}".format(line))
                            continue
                        else:
                            # Check if the SNP has the minimum number of samples required
                            if num_genotypes(record, num_samples) < min_samples_locus:
                                # Keep track of loci rejected due to exceeded missing data
                                snp_shallow += 1
                                continue
                            else:
                                # Check that neither REF nor ALT contain MNPs
                                if is_snp(record):
                                    # Add to running sum of accepted SNPs
                                    snp_accepted += 1
                                    # If nucleotide matrices are requested
                                    if fasta or nexus or not phylipdisable:
                                        # Transform VCF record into an alignment column
                                        site_tmp = get_matrix_column(record, num_samples, resolve_IUPAC)
                                        # Uncomment for debugging
                                        # print(site_tmp)
                                        # Write entire row of single nucleotide genotypes to temp file
                                        temporal.write(site_tmp+"\n")
                                    # Write binary NEXUS for SNAPP if requested
                                    if nexusbin:
                                        # Check that the SNP only has two alleles
                                        # (ALT is a single character, so there is exactly one ALT allele)
                                        if len(record[4]) == 1:
                                            # Add to running sum of biallelic SNPs
                                            snp_biallelic += 1
                                            # Translate genotype into 0 for homozygous REF, 1 for 
                                            # heterozygous, and 2 for homozygous ALT
                                            binsite_tmp = get_matrix_column_bin(record, num_samples)
                                            # Write entire row to temporary file
                                            temporalbin.write(binsite_tmp+"\n")
                                else:
                                    # Keep track of loci rejected due to multinucleotide genotypes
                                    mnp_num += 1
    
            # Print useful information about filtering of SNPs
            print("Total of genotypes processed: {:d}".format(snp_num))
            print("Genotypes excluded because they exceeded the amount "
                  "of missing data allowed: {:d}".format(snp_shallow))
            print("Genotypes that passed missing data filter but were "
                  "excluded for being MNPs: {:d}".format(mnp_num))
            print("SNPs that passed the filters: {:d}".format(snp_accepted))
            if nexusbin:
                print("Biallelic SNPs selected for binary NEXUS: {:d}".format(snp_biallelic))
            print("")
    
        # Close the temporary files so that they can be re-read below
        if fasta or nexus or not phylipdisable:
            temporal.close()
        if nexusbin:
            temporalbin.close()
    
    
        #######################
        # WRITE OUTPUT MATRICES
    
        if not phylipdisable:
            output_phy = open(outfile+".phy", "w")
            # PHYLIP header line: number of taxa and number of sites
            output_phy.write("{:d} {:d}\n".format(len(sample_names), snp_accepted))
    
        if fasta:
            output_fas = open(outfile+".fasta", "w")
    
        if nexus:
            output_nex = open(outfile+".nexus", "w")
            output_nex.write("#NEXUS\n\nBEGIN DATA;\n\tDIMENSIONS NTAX={:d} NCHAR={:d};\n\tFORMAT "
                             "DATATYPE=DNA MISSING=N GAP=- ;\nMATRIX\n".format(len(sample_names),
                                                                                          snp_accepted))
    
        if nexusbin:
            output_nexbin = open(outfile+".bin.nexus", "w")
            output_nexbin.write("#NEXUS\n\nBEGIN DATA;\n\tDIMENSIONS NTAX={:d} NCHAR={:d};\n\tFORMAT "
                                "DATATYPE=SNP MISSING=? GAP=- ;\nMATRIX\n".format(len(sample_names),
                                                                                         snp_biallelic))
    
        # Get length of longest sequence name
        len_longest_name = 0
        for name in sample_names:
            if len(name) > len_longest_name:
                len_longest_name = len(name)
    
        # Write outgroup as first sequence in alignment if the name is specified
        # idx_outgroup stays None when no sample has the outgroup name; in that case every sample
        # is written by the ingroup loop below
        idx_outgroup = None
        if outgroup in sample_names:
            idx_outgroup = sample_names.index(outgroup)
    
            if fasta or nexus or not phylipdisable:
                with open(outfile+".tmp") as tmp_seq:
                    seqout = ""
    
                    # This is where the transposing happens
                    # (each line of the temporary file is one site, its character at position
                    # idx_outgroup is the call of the outgroup sample at that site)
                    for line in tmp_seq:
                        seqout += line[idx_outgroup]
    
                    # Write FASTA line
                    if fasta:
                        output_fas.write(">"+sample_names[idx_outgroup]+"\n"+seqout+"\n")
    
                    # Pad sequences names and write PHYLIP or NEXUS lines
                    padding = (len_longest_name + 3 - len(sample_names[idx_outgroup])) * " "
                    if not phylipdisable:
                        output_phy.write(sample_names[idx_outgroup]+padding+seqout+"\n")
                    if nexus:
                        output_nex.write(sample_names[idx_outgroup]+padding+seqout+"\n")
    
                    # Print current progress
                    print("Outgroup, '{}', added to the matrix(ces).".format(outgroup))
    
            if nexusbin:
                with open(outfile+".bin.tmp") as bin_tmp_seq:
                    seqout = ""
    
                    # This is where the transposing happens
                    for line in bin_tmp_seq:
                        seqout += line[idx_outgroup]
    
                    # Write line of binary SNPs to NEXUS
                    padding = (len_longest_name + 3 - len(sample_names[idx_outgroup])) * " "
                    output_nexbin.write(sample_names[idx_outgroup]+padding+seqout+"\n")
    
                    # Print current progress
                    print("Outgroup, '{}', added to the binary matrix.".format(outgroup))
    
        # Write sequences of the ingroup
        # The temporary file is re-read from the start for every sample
        for s in range(0, len(sample_names)):
            if s != idx_outgroup:
                if fasta or nexus or not phylipdisable:
                    with open(outfile+".tmp") as tmp_seq:
                        seqout = ""
    
                        # This is where the transposing happens
                        for line in tmp_seq:
                            seqout += line[s]
    
                        # Write FASTA line
                        if fasta:
                            output_fas.write(">"+sample_names[s]+"\n"+seqout+"\n")
    
                        # Pad sequences names and write PHYLIP or NEXUS lines
                        padding = (len_longest_name + 3 - len(sample_names[s])) * " "
                        if not phylipdisable:
                            output_phy.write(sample_names[s]+padding+seqout+"\n")
                        if nexus:
                            output_nex.write(sample_names[s]+padding+seqout+"\n")
    
                        # Print current progress
                        print("Sample {:d} of {:d}, '{}', added to the nucleotide matrix(ces).".format(
                                                               s+1, len(sample_names), sample_names[s]))
    
                if nexusbin:
                    with open(outfile+".bin.tmp") as bin_tmp_seq:
                        seqout = ""
    
                        # This is where the transposing happens
                        for line in bin_tmp_seq:
                            seqout += line[s]
    
                        # Write line of binary SNPs to NEXUS
                        padding = (len_longest_name + 3 - len(sample_names[s])) * " "
                        output_nexbin.write(sample_names[s]+padding+seqout+"\n")
    
                        # Print current progress
                        print("Sample {:d} of {:d}, '{}', added to the binary matrix.".format(
                                                               s+1, len(sample_names), sample_names[s]))
    
        # Terminate the NEXUS blocks and close every output file that was opened
        if not phylipdisable:
            output_phy.close()
        if fasta:
            output_fas.close()
        if nexus:
            output_nex.write(";\nEND;\n")
            output_nex.close()
        if nexusbin:
            output_nexbin.write(";\nEND;\n")
            output_nexbin.close()
    
        # Remove the temporary site-per-line files
        if fasta or nexus or not phylipdisable:
            os.remove(outfile+".tmp")
        if nexusbin:
            os.remove(outfile+".bin.tmp")
    
        print( "\nDone!\n")
    
    # Run the converter only when the file is executed as a script
    if __name__ == "__main__":
        main()
    
    
    注意:VCF文件中至少要有四组样本才能正常运行。

    注意:vcf里分组的材料名长度必须要小于等于10个字符,多于10个字符的在后续分析中会被自动截断为10个。
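    python3 vcf2phylip.py -i sample.out.4DTv.vcf
    注意:vcf2phylip.py 的 -o 参数是外类群(outgroup)的名称,不是输出文件名;输出文件名由输入文件名自动生成,上述命令在默认参数下会得到 sample.out.4DTv.min4.phy(后续步骤如需使用 sample.phy 这个名字,可自行重命名)。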

    进化树构建

    安装phylip

    http://evolution.genetics.washington.edu/phylip.html
    可执行程序位于安装目录的exe文件夹里,把该文件夹添加到环境变量PATH即可。

    运行脚本phylip_tree.sh,即可自动生成constree文件,然后使用figtree桌面版可视化树。
    运行方法:
    bash phylip_tree.sh sample.phy sample_name
    phylip_tree.sh内容如下:

    # Purpose: automatically generate the .par answer files that the PHYLIP programs read
    # Usage: bash phylip_tree.sh sample.phy sample_name
    # sample_name is the prefix of the output files
    # Both arguments are required; print the usage and stop when fewer than two are given
    if [ "$#" -lt 2 ];then
        echo "Usage:
            bash phylip_tree.sh sample.phy sample_name"
            exit 1
    fi
    
    # Input definitions
    sample=$1  # the .phy alignment file
    simple=$2  # prefix of the result files
    
    # Write the four answer files (one answer per line) that are later fed to the PHYLIP
    # programs on standard input. Reads the global variables $sample and $simple.
    # printf with quoted arguments is used so that the files do not depend on how the
    # script is indented and so that names containing spaces or glob characters work.
    function make_par(){
        # seqboot: input file, R (bootstrap replicate count), 1000 replicates, Y (confirm),
        # 9 (odd random seed)
        printf '%s\n' "$sample" R 1000 Y 9 > "$simple.seqboot.par"
        # dnadist: input file, T + 2.3628 (transition/transversion ratio),
        # M + D + 1000 (multiple data sets), 2 (progress display option), Y (confirm)
        printf '%s\n' "$simple.seqboot.out" T 2.3628 M D 1000 2 Y > "$simple.dnadist.par"
        # neighbor: input file, M + 1000 (multiple data sets), 9 (odd random seed), Y (confirm)
        printf '%s\n' "$simple.dnadist.out" M 1000 9 Y > "$simple.neighbor.par"
        # consense: input file, Y (confirm)
        printf '%s\n' "$simple.nei.tree" Y > "$simple.consense.par"
    }
    
    
    ###par文件参数讲解
    <<!
    #cat seqboot.par
    $sample #设定输入.phy文件的名称,否则输入默认的名为infile的文件
    R #选择bootstrap
    1000 #设置bootstrap的值,即重复的replicate的数目,通常使用1000或者100,注意此处设定好后,后续两步的M值也为1000或者100
    Y #yes确认以上设定的参数
    9 #设定随机参数,输入奇数值。
    
    #cat dnadist.par
    $simple.seqboot.out #本程序的输入文件
    T #选择设定Transition/transversion的比值
    2.3628 #比值大小
    M #修改M值,即开启多数据集分析
    D #选择D(Data sets,多数据集),而不是W(Weights,多权重)
    1000 #设定M值大小
    2 #将软件运行情况显示出来
    Y #确认以上设定的参数
    
    #cat neighbor.par
    $simple.dnadist.out #本程序的输入文件
    M
    1000  #设定M值大小
    9 #设定随机数,输入奇数值
    Y #确认以上设定的参数
    
    # cat consense.par
    $simple.nei.tree  #本程序的输入文件
    Y #确认以上设定的参数
    !
    
    # Run seqboot, dnadist, neighbor and consense in turn, feeding each program its answer
    # file on standard input and renaming the fixed 'outfile'/'outtree' results with the
    # $simple prefix. The chain stops at the first command that fails.
    # All expansions are quoted so that prefixes containing spaces or glob characters work.
    function get_tree(){
        seqboot < "$simple.seqboot.par" && mv outfile "$simple.seqboot.out" && \
        dnadist < "$simple.dnadist.par" && mv outfile "$simple.dnadist.out" && \
        neighbor < "$simple.neighbor.par" && mv outfile "$simple.nei.out" && mv outtree "$simple.nei.tree" && \
        consense < "$simple.consense.par" && mv outfile "$simple.cons.out" && mv outtree "$simple.constree"
    }
    
    # Run both steps: write the answer files, then build the consensus tree
    make_par
    get_tree
    
    

    或者使用mega7构建进化树参考1 参考2 mega的使用方法

    研究表明,GS(基因组大小)和TE(转座子)的数量有着非常大的关系,自交过程中,TE被清除,导致GS变小。NC 参考文献
    maize TE annotation 《nature》

    相关文章

      网友评论

        本文标题:用snpEff产出的vcf提取4DTv位点,构建进化树

        本文链接:https://www.haomeiwen.com/subject/yeluwktx.html