美文网首页
利用python对GATK多线程加速

利用python对GATK多线程加速

作者: 陈光辉_山东花生 | 来源:发表于2021-01-15 13:04 被阅读0次

    GATK 变异分析在处理大数据量样本时可能比较慢,因此可以按染色体拆分后并行计算。下面是我写的一个 Python 多线程脚本(每个线程启动一个 GATK 进程,处理一条染色体),仅供参考,不足之处敬请指正。

    #!/usr/bin/python3
    
    import _thread
    import os
    
    import threading
    import time
    
    # Run configuration: input BAM, output file prefix, and the contigs to
    # process. One HaplotypeCaller command is generated per contig.
    bam_file = "a.mkdup.bam"
    out_file_prefix = "flower"
    chr_list = [
        "CHR01", "CHR02", "CHR03", "CHR04", "CHR05", "CHR06", "CHR07",
        "CHR08", "CHR09", "CHR10", "CHR11", "CHR12", "CHR13",
    ]

    # One shell command string per contig, in chr_list order. A comprehension
    # keeps the loop variable out of module scope, so the built-in chr() is
    # not shadowed by a leftover contig name.
    muthreads = [
        "gatk   HaplotypeCaller --intervals " + contig
        + "  -R /mnt/j/BSA/02-read-align/Tifrunner2.fasta -I " + bam_file
        + "   -ERC GVCF -O " + out_file_prefix + "-" + contig + ".erc.g.vcf"
        for contig in chr_list
    ]

    # Stop flag that print_time checks before it runs its command.
    exitFlag = 0
    
    class myThread(threading.Thread):
        """Worker thread that hands a single command string to print_time.

        Args:
            threadID: identifier stored on the instance; run() does not read it.
            name: thread name, shown in the start/exit messages.
            counter: forwarded to print_time as its ``delay`` argument.
            comander: command string forwarded to print_time.
        """

        def __init__(self, threadID, name, counter, comander):
            super().__init__()
            self.threadID, self.name = threadID, name
            self.counter, self.comander = counter, comander

        def run(self):
            """Print a start message, call print_time, then print an exit message."""
            label = self.name
            print("开始线程:" + label)
            # The literal 5 fills print_time's ``counter`` parameter.
            print_time(label, self.counter, 5, self.comander)
            print("退出线程:" + label)
    
    def print_time(threadName, delay, counter, comander):
        """Wait ``delay`` seconds, then echo and run one shell command.

        Args:
            threadName: name of the calling thread (a string); not used here,
                kept so existing callers keep working.
            delay: number of seconds to sleep before launching the command.
            counter: not used; kept so existing callers keep working.
            comander: command line handed to ``os.system``.

        Returns:
            None.
        """
        # When a stop has been requested, return before sleeping or launching
        # anything. threadName is a plain string, so it has no exit() method
        # to call.
        if exitFlag:
            return
        time.sleep(delay)

        print(comander)
        # Run the command through the system shell; this blocks until it ends.
        status = os.system(comander)
        if status != 0:
            # Report the failure instead of ignoring it, so an incomplete
            # output file is not mistaken for a finished one.
            print("命令执行失败(返回状态 " + str(status) + "): " + comander)
    # Create the worker threads and run them in batches.
    # At most max_parallel commands run at the same time. Walking the command
    # list in steps of max_parallel (rather than taking only its first 11
    # entries) ensures every command in muthreads is executed.
    max_parallel = 11
    threadlist = []
    for batch_start in range(0, len(muthreads), max_parallel):
        batch = []
        batch_cmds = muthreads[batch_start:batch_start + max_parallel]
        for i, threadsnu in enumerate(batch_cmds, start=batch_start):
            print(i)
            print(threadsnu)
            # Use the command index as the thread ID so each worker is distinct.
            batch.append(myThread(i, "Thread-" + str(i), 2, threadsnu))
        threadlist.extend(batch)
        # Start the whole batch, then wait for all of it before the next one.
        for threads in batch:
            threads.start()
        for threads in batch:
            threads.join()

    print ("运行结束退出主线程")
    
    

    以下内容来自网络,未经验证。

    合并同一样本按染色体拆分的多个 VCF 文件

    # Load the java/1.8.0_91 environment module and point at the GATK launcher.
    module load java/1.8.0_91
    GATK=/home/jianmingzeng/biosoft/GATK/gatk-4.0.3.0/gatk

    # Build the "-I final_chr<N>.vcf" arguments in the order 1..22, X, Y.
    # For a different contig order (e.g. {10..19} {1..9} M X Y), change the
    # list in the loop below.
    gather_inputs=()
    for contig in {1..22} X Y; do
        gather_inputs+=(-I "final_chr${contig}.vcf")
    done

    # Combine the per-chromosome VCF files into a single merge.vcf.
    $GATK GatherVcfs "${gather_inputs[@]}" -O merge.vcf
    合并时需要注意:通过 -I 传入的 VCF 文件顺序,必须与 VCF 文件头(header)中 contig 的顺序一致。
    

    相关文章

      网友评论

          本文标题:利用python对GATK多线程加速

          本文链接:https://www.haomeiwen.com/subject/mnziaktx.html