from subprocess import Popen, call, check_output
import argparse
import csv
import sys
import os
parser = argparse.ArgumentParser(description='dbCAN2 Driver Script')
parser.add_argument('--cgc_dis', default=2, help='CGCFinder Distance value')
parser.add_argument('--cgc_sig_genes', default='all', choices=['tp', 'tf','all'], help='CGCFinder Signature Genes value')
args = parser.parse_args()
gff_file = "Abobi1.gff"
fasta_file = "/home/huangle/fungi629/6.29filter/fasta_all/Abobi1.fasta"
cazymeFile = "/home/huangle/fungi629/annotations/extraction1/Abobi1.cazyme.fasta"
hmmerOut = "/home/huangle/fungi629/annotations/1parser/Abobi1.hmm.out"
diamondOut="/home/huangle/fungi629/annotations/2diamond_total/Abobi1.fasta.diamond_dbcan2"
hotpepOut = "/home/huangle/fungi629/annotations/3hotpep_total/Abobi1.hotpep"
tf_out = '/home/huangle/fungi629/scripts/cgc/tf_final_out/Abobi1.tfu'
tp_out = '/home/huangle/fungi629/scripts/cgc/tp_final_out/Abobi1.tp'
cazyme = open(cazymeFile).readlines()
for i in range(len(cazyme)):
cazyme[i] = cazyme[i][:-1]
########################
# Begin TF and TP prediction
# call(['diamond', 'blastp', '-d', 'tf', '-e', '1e-10', '-q', 'Abobi1.fasta', '-k', '1', '-p', '4', '-o', 'tf.out', '-f', '6'])
# hmmscan --domtblout output --cpu 8 -o /dev/null tf-1.txt test.fa
call(['diamond', 'blastp', '-d', 'db/tc', '-e', '1e-10', '-q', 'Abobi1.fasta', '-k', '1', '-p', '6', '-o', tp_out, '-f', '6'])
tp = set()
tf = set()
tp_genes = {}
tf_genes = {}
with open(tp_out) as f:
for line in f:
row = line.rstrip().split('\t')
tf.add(row[0])
if not row[0] in tf_genes:
tf_genes[row[0]] = row[1]
else:
tf_genes[row[0]] += ','+row[1]
with open(tp_out) as f:
for line in f:
row = line.rstrip().split('\t')
tp.add(row[0])
if not row[0] in tp_genes:
tp_genes[row[0]] = row[1]
else:
tp_genes[row[0]] += ','+row[1]
##########################
# Begine CAZyme Extraction
cazyme_genes = {}
dia = set()
hot = set()
hmm = set()
with open(diamondOut) as f:
next(f)
for line in f:
row = line.rstrip().split('\t')
dia.add(row[0])
if row[0] not in cazyme_genes:
cazyme_genes[row[0]] = set()
cazyme_genes[row[0]].add(row[1].split('|')[1])
with open(hmmerOut) as f:
next(f)
for line in f:
row = line.rstrip().split('\t')
hmm.add(row[2])
if row[2] not in cazyme_genes:
cazyme_genes[row[2]] = set()
cazyme_genes[row[2]].add(row[0].split('.')[0])
with open(hotpepOut) as f:
next(f)
for line in f:
row = line.rstrip().split('\t')
hot.add(row[2])
if row[2] not in cazyme_genes:
cazyme_genes[row[2]] = set()
cazyme_genes[row[2]].add(row[0])
temp1 = hmm.intersection(hot)
temp2 = hmm.intersection(dia)
temp3 = dia.intersection(hot)
cazyme = temp1.union(temp2, temp3)
# End CAZyme Extraction
######################tc
# Begin GFF preperation
# gff = False
# print(tf_genes)
prID = ""
end = -100000
lastEnd = -1000000
ok_row = []
with open(gff_file) as f:
for line in f:
if not line.startswith('#'):
if len(line.split('\t')) == 9:
gff = True
break
isFungiGff = True
with open(gff_file) as f:
with open('cgc2.gff', 'w') as out:
for line in f:
if "CDS" in line:
row = line.rstrip().split('\t')
end = row[4]
###fungi gffFile
if isFungiGff:
ppts = row[8].split("; ")
for ppt in ppts:
key= ppt.split(" ")[0]
value = ppt.replace(key,"").replace(" ","").replace("\"","")
if key == "name":
ID = value
if key == "proteinId":
no = value
jgi_name = ['jgi',os.path.basename(gff_file).split(".gff")[0], no, ID]
gene = '|'.join(jgi_name)
row[8] = "ID="+gene+"; " + row[8]
if ID == prID:
lastEnd = end
continue
if prID != "":
ok_row[4] = lastEnd
out.write('\t'.join(ok_row)+'\n')
prID = ID
tag = ""
if gene in tf:
row[2] = "TF"
tag = tf_genes[gene]
if gene in tp:
row[2] = "TC"
tag = tp_genes[gene]
if gene in cazyme:
row[2] = "CAZyme"
tag = '|'.join(cazyme_genes[gene])
if tag != "":
row[8] = "DB="+tag+"; " + row[8]
# row.insert(8, "DB="+tag+"; ")
ok_row = row
lastEnd = end
# ok_row[4] = lastEnd
# out.write('\t'.join(ok_row)+'\n')
####################
# Begin CGCFinder call
call(['python', 'CGCFinder.py', '-g', 'cgc2.gff', '-o', 'cgc2.out', '-s', args.cgc_sig_genes, '-d', str(args.cgc_dis)])
# End CGCFinder call
# End CGCFinder
####################
网友评论