美文网首页Python
python脚本:将gtf文件以染色体为单位进行拆分

python脚本:将gtf文件以染色体为单位进行拆分

作者: 小明的数据分析笔记本 | 来源:发表于2022-07-19 22:05 被阅读0次

    只能用于linux系统,因为用到了grep命令

    import os
    import argparse
    import subprocess
    from itertools import product
    from multiprocessing import Pool
    
    def get_chromosome_num(in_file):
        """Return the distinct first-column values of a GTF file, in first-seen order.

        Each line is stripped of surrounding whitespace; lines that then start
        with "#" are skipped. The first tab-separated field of every remaining
        line is collected once, ignoring empty values (e.g. blank lines).

        Args:
            in_file: path of the tab-separated text file to scan.

        Returns:
            A list of unique first-column strings in order of first appearance.
        """
        chromo = []
        # Set mirror of `chromo` so the membership test does not rescan the list.
        seen = set()
        with open(in_file,'r') as fr:
            for line in fr:
                stripped = line.strip()
                if stripped.startswith("#"):
                    continue
                # Split once per line instead of once per use.
                name = stripped.split("\t")[0]
                if name and name not in seen:
                    seen.add(name)
                    chromo.append(name)

        return chromo
    
    def split_gtf(chromo_list,gtf_file_path,output_folder):
        """Write one ``<name>.gtf`` file per chromosome into ``output_folder``.

        A record belongs to a chromosome when the first tab-separated field of
        the whitespace-stripped line equals the chromosome name exactly. Lines
        that start with "#" after stripping are not copied. Matching on the
        first column (instead of searching the whole line for the name) keeps
        "chr1" from also collecting "chr10" records or records that merely
        mention the name in another column.

        The input is read once in pure Python, so no external ``grep`` and no
        shell are involved and paths containing spaces are handled correctly.

        Args:
            chromo_list: chromosome names to extract; one output file is
                created for each, even if no record matches it.
            gtf_file_path: path of the GTF file to split.
            output_folder: directory for the output files; created if missing.
        """
        os.makedirs(output_folder,exist_ok=True)
        if not chromo_list:
            return

        out_paths = {}
        for name in chromo_list:
            out_paths[name] = os.path.join(output_folder, name + ".gtf")
            # Create/truncate up front so stale results from an earlier run
            # never survive and the main pass can simply append.
            open(out_paths[name], 'w').close()
            print(gtf_file_path + " -> " + out_paths[name])

        # Only one output handle is kept open at a time: it is swapped whenever
        # the chromosome changes, which avoids exhausting file descriptors on
        # assemblies with very many sequences.
        current_name = None
        current_out = None
        try:
            with open(gtf_file_path,'r') as gtf:
                for line in gtf:
                    stripped = line.strip()
                    if stripped.startswith("#"):
                        continue
                    name = stripped.split("\t")[0]
                    if name not in out_paths:
                        continue
                    if name != current_name:
                        if current_out is not None:
                            current_out.close()
                        current_out = open(out_paths[name], 'a')
                        current_name = name
                    # Copy the record exactly as it appears in the input.
                    current_out.write(line)
        finally:
            if current_out is not None:
                current_out.close()
    
    def final_run():
        """Parse the command line and split the requested GTF by chromosome.

        Reads the required ``-g/--gtf`` and ``-o/--output-folder`` options,
        collects the chromosome names with ``get_chromosome_num``, prints that
        list, and passes it to ``split_gtf`` together with the input path and
        the output folder.
        """
        cli = argparse.ArgumentParser(
            description = "split gtf",
            formatter_class = argparse.RawDescriptionHelpFormatter,
            epilog = '''
            @author: MingYan
            @contact: mingyan24@126.com
            '''
        )
        cli.add_argument("-g","--gtf",required=True,help="specify the input gtf")
        cli.add_argument("-o","--output-folder",required=True,help="specify the output folder")
        # NOTE: a -nt/--num-threads option and a multiprocessing.Pool based
        # variant were sketched here earlier but are disabled; the split runs
        # serially.
        options = cli.parse_args()

        chromosomes = get_chromosome_num(options.gtf)
        print(chromosomes)

        split_gtf(chromosomes,options.gtf,options.output_folder)
    
    # Script entry point: run the command-line workflow only when this file is
    # executed directly, then print a completion message.
    if __name__ == "__main__":
        final_run()
    
        print("Congratulations!")
    

    没啥实际应用,就是为了学python

    使用

    python split_gtf_according_to_chromosome_num.py -g ../20220712/gtf/GCF_000146045.2_R64_genomic.gtf -o output_mingyan_1
    

    相关文章

      网友评论

        本文标题:python脚本:将gtf文件以染色体为单位进行拆分

        本文链接:https://www.haomeiwen.com/subject/qdqjirtx.html