只能用于 Linux 系统，因为用到了 grep 命令。
import os
import argparse
import subprocess
from itertools import product
from multiprocessing import Pool
def get_chromosome_num(in_file):
    """Return the distinct first-column values of a tab-separated GTF file.

    Lines are stripped of surrounding whitespace before inspection. Lines
    that then start with "#" and lines whose first field is empty (for
    example blank lines) are ignored.

    Args:
        in_file: Path of the GTF file to scan.

    Returns:
        A list of first-column strings, each listed once, in order of first
        appearance in the file.
    """
    chromo = []
    # Companion set gives O(1) membership tests; the list keeps file order.
    seen = set()
    with open(in_file, 'r') as fr:
        for line in fr:
            record = line.strip()
            if record.startswith("#"):
                continue
            name = record.split("\t")[0]
            if name and name not in seen:
                seen.add(name)
                chromo.append(name)
    return chromo
def split_gtf(chromo_list,gtf_file_path,output_folder):
    """Write one GTF file per chromosome into ``output_folder``.

    For every name in ``chromo_list`` a file ``<output_folder>/<name>.gtf``
    is created (or truncated). It receives, unchanged and in input order,
    every line of ``gtf_file_path`` whose first tab-separated field equals
    that name exactly. Lines starting with "#" are not copied.

    The split is done in Python rather than by shelling out to ``grep``:
    an unanchored ``grep chr1`` also matches ``chr10`` records and any line
    that merely mentions the name, and a shell command string breaks on
    paths containing spaces or shell metacharacters.

    Args:
        chromo_list: Iterable of chromosome names (first-column values).
        gtf_file_path: Path of the GTF file to split.
        output_folder: Directory for the per-chromosome files; created if
            it does not exist.

    Returns:
        None.
    """
    os.makedirs(output_folder, exist_ok=True)

    # Create/truncate every requested output first, so a chromosome with no
    # matching record still ends up with an (empty) file.
    out_paths = {}
    for chrom in chromo_list:
        out_paths[chrom] = os.path.join(output_folder, chrom + ".gtf")
        print(chrom, "->", out_paths[chrom])
        open(out_paths[chrom], "w").close()

    # Single pass over the input with at most one output handle open at a
    # time: the handle is swapped (append mode) whenever the chromosome
    # changes, so the number of open files does not grow with the number of
    # chromosomes. newline="" keeps the original line endings untouched.
    current = None
    dst = None
    try:
        with open(gtf_file_path, "r", newline="") as src:
            for line in src:
                record = line.strip()
                if record.startswith("#"):
                    continue
                name = record.split("\t")[0]
                if name not in out_paths:
                    continue
                if name != current:
                    if dst is not None:
                        dst.close()
                    dst = open(out_paths[name], "a", newline="")
                    current = name
                dst.write(line)
    finally:
        if dst is not None:
            dst.close()
def final_run():
    """Command-line entry point.

    Parses ``-g/--gtf`` and ``-o/--output-folder`` (both required), collects
    the chromosome names with ``get_chromosome_num``, prints that list, and
    passes it together with the two paths to ``split_gtf``.
    """
    cli = argparse.ArgumentParser(
        description="split gtf",
        epilog="\n@author: MingYan\n@contact: mingyan24@126.com\n",
        # Raw formatter so the epilog's line breaks are shown as written.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    cli.add_argument("-g", "--gtf", required=True, help="specify the input gtf")
    cli.add_argument("-o", "--output-folder", required=True, help="specify the output folder")
    # NOTE: a -nt/--num-threads option driving a multiprocessing Pool was
    # sketched here at one point but is not enabled.
    options = cli.parse_args()

    chromosomes = get_chromosome_num(options.gtf)
    print(chromosomes)
    split_gtf(chromosomes, options.gtf, options.output_folder)
# Run the command-line workflow only when executed as a script (not on import).
if __name__ == "__main__":
    final_run()
    print("Congratulations!")
没啥实际应用,就是为了学python
使用
python split_gtf_according_to_chromosome_num.py -g ../20220712/gtf/GCF_000146045.2_R64_genomic.gtf -o output_mingyan_1
网友评论