美文网首页组学学习
泛基因组分析-基本流程代码

泛基因组分析-基本流程代码

作者: 花生学生信 | 来源:发表于2024-01-24 19:18 被阅读0次

    以下是自己使用map-to-pan分析泛基因组的部分代码,仅供参考:
    统计fasta序列的N50、N90以及最短、最长序列

    ##tj_N50.py 
    import sys
    from Bio import SeqIO
    import os
    
    def calculate_Nx(sequence_lengths, x):
        """Return the Nx statistic (e.g. N50, N90) for a set of sequence lengths.

        Nx is the length of the shortest sequence in the smallest group of
        longest sequences that together cover at least x% of the total
        assembly length.  Returns None for an empty input list.
        """
        threshold = sum(sequence_lengths) * x / 100
        running_total = 0
        # Walk the lengths from longest to shortest until the cumulative
        # length reaches the x% threshold.
        for seq_len in sorted(sequence_lengths)[::-1]:
            running_total += seq_len
            if running_total >= threshold:
                return seq_len
    
    def calculate_stats(fasta_file):
        """Parse a FASTA file and return per-sequence stats plus summary metrics.

        Returns a 5-tuple:
          (list of (record_id, length) pairs sorted by length ascending,
           N50, N90, shortest length, longest length).
        Raises IndexError when the file contains no sequences.
        """
        records = ((rec.id, len(rec.seq)) for rec in SeqIO.parse(fasta_file, "fasta"))
        stats = sorted(records, key=lambda pair: pair[1])

        lengths = [pair[1] for pair in stats]
        return (
            stats,
            calculate_Nx(lengths, 50),
            calculate_Nx(lengths, 90),
            lengths[0],   # shortest: first after ascending sort
            lengths[-1],  # longest: last after ascending sort
        )
    
    # FASTA file paths are taken from the command line.
    fasta_files = sys.argv[1:]

    # One TSV row per input file: name, min/max length, N50, N90.
    print("文件名\t最短序列长度\t最长序列长度\tN50\tN90")
    for fasta_path in fasta_files:
        _, n50, n90, shortest, longest = calculate_stats(fasta_path)
        print(f"{os.path.basename(fasta_path)}\t{shortest}\t{longest}\t{n50}\t{n90}")
    
    过滤前

    现在需要筛选大于500bp的序列做后续分析:

    ##tiqu_500.py
    import sys
    
    def extract_long_sequences(fasta_file, min_length=500):
        """Collect sequences strictly longer than ``min_length`` from a FASTA file.

        Parameters
        ----------
        fasta_file : str
            Path to the FASTA file to read.
        min_length : int, optional
            Length cutoff; only sequences longer than this are kept.
            Defaults to 500, matching the original hard-coded threshold.

        Returns
        -------
        list of (str, str)
            (header without the leading ">", full sequence) tuples in file order.
        """
        long_sequences = []
        current_name = ""
        current_chunks = []  # list of lines; joined once per record (avoids O(n^2) str +=)
        with open(fasta_file, "r") as handle:
            for line in handle:
                line = line.strip()
                if line.startswith(">"):
                    # Flush the previous record before starting a new one.
                    sequence = "".join(current_chunks)
                    if sequence and len(sequence) > min_length:
                        long_sequences.append((current_name, sequence))
                    current_name = line[1:]
                    current_chunks = []
                else:
                    current_chunks.append(line)
            # Flush the final record (no trailing ">" line triggers it).
            sequence = "".join(current_chunks)
            if sequence and len(sequence) > min_length:
                long_sequences.append((current_name, sequence))
        return long_sequences
    
    # FASTA file path from the command line
    fasta_file = sys.argv[1]

    # Emit the surviving records on stdout in FASTA format.
    for seq_name, seq in extract_long_sequences(fasta_file):
        print(f">{seq_name}")
        print(seq)
    
    过滤后

    再写个简单的批处理shell:

    # Batch-filter every sample listed in ../fqid: keep contigs >500 bp,
    # then append length statistics for the filtered file to N50_inf.txt.
    cat ../fqid | while read id
    do
        python tiqu_500.py ../contigs/"$id".fa > 01_500bp/"$id".500.fa
        # BUG FIX: was "01_500bp/$id_500.fa" — the shell parses that as the
        # undefined variable $id_500, so tj_N50.py was pointed at a wrong
        # (empty) path instead of the file written on the line above.
        python tj_N50.py 01_500bp/"$id".500.fa >> N50_inf.txt
    done
    
    quast评估,嫌慢可以写成parallel并行
    # QUAST assembly evaluation per sample (wrap in GNU parallel if too slow).
    # Reference genome, annotation and thread count are loop-invariant,
    # so they are set once before the loop.
    ref=/public/home/fengting/work/zyyy/DK/jelly/ref_IR64/IR64.genome
    refgff=/public/home/fengting/work/cax/NIP/new_add/gff/IR64.IGDBv1.Allset.gff
    th=4
    cat ../fqid | while read samples
    do
    #mkdir 02_quast/${samples}
        # NOTE(review): --large is a boolean flag; the FASTA path after it is
        # consumed as QUAST's positional assembly argument — confirm with quast --help.
        /public/home/fengting/demo/pan111/EUPAN/tools/quast-5.1.0rc1/quast.py \
            -t ${th} \
            --large 01_500bp/${samples}.500.fa \
            -r ${ref} \
            -g ${refgff} \
            -o ./02_quast/${samples} \
            --no-icarus \
            --no-snps \
            --min-identity 90.0
    done
    

    使用blastx将未比对上的核苷酸数据与unirice蛋白数据库比对

    # Emit one blastx command line per sample into blxwo.sh so the searches
    # can be launched in parallel later.  Note ">>" appends: reruns will
    # accumulate duplicate lines unless blxwo.sh is removed first.
    cat id | while read samples
    do
        echo "blastx -query 01_500bp/${samples}.500.fa -out bl/${samples}.bl -db os_nr/unirice -outfmt 6 -evalue 1e-6 -num_threads 8" >>blxwo.sh
    done
    

    题外话
    昨晚挂了一个blastn的任务,跑了一天,导致其它任务进行的极其缓慢,cpu占用也极其低,关掉这个任务就好了。

    ###去掉文件名的后缀
    import os

    def strip_bl_suffix(folder_path=None):
        """Rename every "*.bl" file in ``folder_path`` by dropping the ".bl" suffix.

        Parameters
        ----------
        folder_path : str, optional
            Directory to process.  Defaults to the current working directory,
            which preserves the original script's behavior.
        """
        if folder_path is None:
            folder_path = os.getcwd()
        for filename in os.listdir(folder_path):
            # Only touch files that actually end with the .bl extension.
            if filename.endswith(".bl"):
                new_filename = filename[:-len(".bl")]
                os.rename(os.path.join(folder_path, filename),
                          os.path.join(folder_path, new_filename))

    if __name__ == "__main__":
        # Guard added so importing this module no longer renames files as a
        # side effect; running it as a script behaves exactly as before.
        strip_bl_suffix()
    

    提取blast结果

    # For each sample: collect the query IDs that had a BLAST hit, then pull
    # the matching sequences out of the filtered FASTA.
    cat sxid | while read id
    do
        # BUG FIX: the original piped "uniq | sort -V"; uniq only removes
        # ADJACENT duplicates, so any BLAST output whose hits were not grouped
        # by query left repeated IDs in blid/$id.  Sorting before uniq makes
        # the de-duplication correct regardless of input order.
        cut -f 1 ../bl/"$id".bl | sort -V | uniq > blid/"$id"
        perl ~/scripts/perl/per.pl blid/"$id" ../01_500bp/"$id".500.fa > blfa/"$id".fa
    done
    

    去冗余

    ### Extract the nucleotide sequences for the BLAST-hit IDs
    perl ~/scripts/perl/per.pl DL001 DL001-1.500.fa >DL001.fa
    #### cd-hit redundancy removal (command updated 2024-01-25)
    # NOTE(review): per cd-hit docs, -M is memory (MB), -T threads,
    # -aL/-aS alignment coverage fractions — confirm against the manual.
    cdhit -i in.fasta -o out.fasta -d 30 -G 1 -M 16000 -T 20  -uL 0.05 -S 2 -U 2 -g 1 -aL 0.95 -aS 1
    
    
    ### EUPAN redundancy removal (two alternative clustering back-ends)
    ## blastCluster back-end
    eupan rmRedundant blastCluster DL001.fa 1 ~/miniconda3/bin/
    
    ## cdhit back-end
    eupan rmRedundant cdhitCluster DL001.fa 2 ~/miniconda3/bin/
    
    

    为了防止合并后出现重复的序列名,我们将每个文件的样本(sample)名追加到序列名之后

    import os

    # Current working directory holds the per-sample FASTA files.
    folder_path = os.getcwd()

    # Files whose names begin with the sample prefix "DL".
    sample_files = [name for name in os.listdir(folder_path) if name.startswith('DL')]

    # For each sample file, write a "new_<name>" copy in which every FASTA
    # header line gets the file name appended, so merged headers stay unique.
    for sample_name in sample_files:
        src_path = os.path.join(folder_path, sample_name)
        new_file_path = os.path.join(folder_path, f"new_{sample_name}")

        with open(src_path, 'r') as src_fh, open(new_file_path, 'w') as dst_fh:
            for raw_line in src_fh:
                # Header lines start with ">"; tag them with the file name.
                if raw_line.startswith('>'):
                    raw_line = raw_line.strip() + f" {sample_name}\n"
                dst_fh.write(raw_line)

        print(f"文件 {sample_name} 处理完成,生成新文件 {new_file_path}")
    
    每条序列都含有样本名

    整合所有文件

    import os

    # Current working directory holds the tagged per-sample files.
    folder_path = os.getcwd()

    # Every file produced by the renaming step (names beginning with "new").
    # The listing is taken BEFORE the output file is created, so the combined
    # file can never include itself.
    merged_sources = [name for name in os.listdir(folder_path) if name.startswith('new')]

    # Single output file holding the concatenated contents.
    new_file_path = os.path.join(folder_path, "combined_file.txt")

    with open(new_file_path, 'w') as out_fh:
        for source_name in merged_sources:
            with open(os.path.join(folder_path, source_name), 'r') as in_fh:
                out_fh.write(in_fh.read())

    print(f"所有以new开头的文件已经整合到新文件 {new_file_path} 中")
    

    未完待续

    相关文章

      网友评论

        本文标题:泛基因组分析-基本流程代码

        本文链接:https://www.haomeiwen.com/subject/giiyodtx.html