tf_tc

作者: keaidelele | 来源:发表于2018-07-28 21:26 被阅读14次
from subprocess import Popen, call, check_output
import argparse
import csv
import sys
import os

parser = argparse.ArgumentParser(description='dbCAN2 Driver Script')
parser.add_argument('--cgc_dis', default=2, help='CGCFinder Distance value')
parser.add_argument('--cgc_sig_genes', default='all', choices=['tp', 'tf','all'], help='CGCFinder Signature Genes value')
args = parser.parse_args()


gff_file = "Abobi1.gff"
fasta_file = "/home/huangle/fungi629/6.29filter/fasta_all/Abobi1.fasta"
cazymeFile = "/home/huangle/fungi629/annotations/extraction1/Abobi1.cazyme.fasta"
hmmerOut = "/home/huangle/fungi629/annotations/1parser/Abobi1.hmm.out"
diamondOut="/home/huangle/fungi629/annotations/2diamond_total/Abobi1.fasta.diamond_dbcan2"
hotpepOut = "/home/huangle/fungi629/annotations/3hotpep_total/Abobi1.hotpep"
tf_out  = '/home/huangle/fungi629/scripts/cgc/tf_final_out/Abobi1.tfu'
tp_out = '/home/huangle/fungi629/scripts/cgc/tp_final_out/Abobi1.tp'
cazyme = open(cazymeFile).readlines()
for i in range(len(cazyme)):
    cazyme[i] = cazyme[i][:-1]
########################
# Begin TF and TP prediction
# call(['diamond', 'blastp', '-d', 'tf', '-e', '1e-10', '-q', 'Abobi1.fasta', '-k', '1', '-p', '4', '-o', 'tf.out', '-f', '6'])
# hmmscan --domtblout output   --cpu  8  -o /dev/null   tf-1.txt test.fa
call(['diamond', 'blastp', '-d', 'db/tc', '-e', '1e-10', '-q', 'Abobi1.fasta', '-k', '1', '-p', '6', '-o', tp_out, '-f', '6'])
tp = set()
tf = set()
tp_genes = {}
tf_genes = {}
with open(tp_out) as f:
    for line in f:
        row = line.rstrip().split('\t')
        tf.add(row[0])
        if not row[0] in tf_genes:
            tf_genes[row[0]] = row[1]
        else:
            tf_genes[row[0]] += ','+row[1]

with open(tp_out) as f:
    for line in f:
        row = line.rstrip().split('\t')
        tp.add(row[0])
        if not row[0] in tp_genes:
            tp_genes[row[0]] = row[1]
        else:
            tp_genes[row[0]] += ','+row[1]
##########################
# Begine CAZyme Extraction
cazyme_genes = {}
dia = set()
hot = set()
hmm = set()

with open(diamondOut) as f:
    next(f)
    for line in f:
        row = line.rstrip().split('\t')
        dia.add(row[0])
        if row[0] not in cazyme_genes:
            cazyme_genes[row[0]] = set()
        cazyme_genes[row[0]].add(row[1].split('|')[1])

with open(hmmerOut) as f:
    next(f)
    for line in f:
        row = line.rstrip().split('\t')
        hmm.add(row[2])
        if row[2] not in cazyme_genes:
            cazyme_genes[row[2]] = set()
        cazyme_genes[row[2]].add(row[0].split('.')[0])

with open(hotpepOut) as f:
    next(f)
    for line in f:
        row = line.rstrip().split('\t')
        hot.add(row[2])
        if row[2] not in cazyme_genes:
            cazyme_genes[row[2]] = set()
        cazyme_genes[row[2]].add(row[0])

temp1 = hmm.intersection(hot)
temp2 = hmm.intersection(dia)
temp3 = dia.intersection(hot)
cazyme = temp1.union(temp2, temp3)



# End CAZyme Extraction

######################tc
# Begin GFF preperation


# gff = False
# print(tf_genes)
prID = ""
end = -100000
lastEnd = -1000000
ok_row = []
with open(gff_file) as f:
    for line in f:
        if not line.startswith('#'):
            if len(line.split('\t')) == 9:
                gff = True
                break
isFungiGff = True
with open(gff_file) as f:
    with open('cgc2.gff', 'w') as out:
        for line in f:

            if "CDS" in line:
                row = line.rstrip().split('\t')
                end = row[4]
                ###fungi gffFile
                if isFungiGff:
                    ppts = row[8].split("; ")
                    for ppt in ppts:
                        key= ppt.split(" ")[0]
                        value = ppt.replace(key,"").replace(" ","").replace("\"","")
                        if key == "name":
                            ID = value
                        if key == "proteinId":
                            no = value
                    jgi_name = ['jgi',os.path.basename(gff_file).split(".gff")[0], no, ID]
                    gene = '|'.join(jgi_name)
                    row[8] = "ID="+gene+"; " + row[8]
                    
                if ID == prID:
                    lastEnd = end
                    continue
                if prID != "":
                    ok_row[4] = lastEnd
                    out.write('\t'.join(ok_row)+'\n')
                prID = ID
                tag = ""
                if gene in tf:
                    row[2] = "TF"
                    tag = tf_genes[gene]

                if gene in tp:
                    row[2] = "TC"
                    tag = tp_genes[gene]

                if gene in cazyme:
                    row[2] = "CAZyme"
                    tag = '|'.join(cazyme_genes[gene])

                if tag != "":
                    row[8] = "DB="+tag+"; " + row[8]
                    # row.insert(8, "DB="+tag+"; ")
                ok_row = row
                lastEnd = end
        # ok_row[4] = lastEnd
        # out.write('\t'.join(ok_row)+'\n')
####################
# Begin CGCFinder call

call(['python', 'CGCFinder.py', '-g', 'cgc2.gff', '-o', 'cgc2.out', '-s', args.cgc_sig_genes, '-d', str(args.cgc_dis)])

# End CGCFinder call
# End CGCFinder
####################

相关文章

网友评论

      本文标题:tf_tc

      本文链接:https://www.haomeiwen.com/subject/bmjamftx.html