最近在做 Circos 图。由于我的重复序列 GFF 文件不是标准格式,无法用现成软件直接生成,只好自己写了一个 Python 脚本。本脚本可以处理各种类似 GFF 格式的文件,将其转化为 Circos 可用的基因密度文件。有需要的可以拿去使用,只需修改我用 # 注释标出的那些部分即可。
import pandas as pd
# ---- Settings to adapt to your own data ----
# Input file: tab-separated, GFF-like, without a header row.
filename = "F:/circos/data/newEGL/XTJ.genome.fasta.out.gff"
# Output file: one "<chr>\t<left>\t<right>\t<count>" line per window.
fileresult = filename + '.re'
# Window (bin) width, in the same units as the end-position column.
WINDOW_SIZE = 100000
# Number of chromosomes; they are looked up as cluster01 .. cluster0N.
CHROMOSOME_COUNT = 9
# Zero-based index of the feature end-position column
# (i.e. the 1-based column number minus 1).
END_COLUMN = 4


def count_ends_per_window(end_positions, window_size=100000):
    """Count feature end positions in consecutive fixed-width windows.

    Windows start at 0 and are half-open, ``[left, left + window_size)``.
    The last window is truncated at the largest end position and is closed
    on the right, so the feature(s) ending exactly at that position are
    counted too.

    Args:
        end_positions: Iterable (e.g. a pandas Series) of numeric feature
            end coordinates. Order does not matter; missing values are
            ignored.
        window_size: Positive window width.

    Returns:
        A list of ``(left, right, count)`` tuples in ascending order, or an
        empty list when there are no end positions.

    Raises:
        ValueError: If ``window_size`` is not positive.
    """
    if window_size <= 0:
        # A non-positive width would never advance and loop forever.
        raise ValueError("window_size must be positive")

    ends = pd.Series(end_positions).dropna()
    if ends.empty:
        return []

    # Use the maximum rather than the last row so unsorted input works.
    chrom_end = int(ends.max())

    rows = []
    left = 0
    while left + window_size < chrom_end:
        right = left + window_size
        in_window = (ends >= left) & (ends < right)
        rows.append((left, right, int(in_window.sum())))
        left = right

    # Final, possibly shorter window: everything from `left` onwards,
    # including features that end exactly at the chromosome end.
    rows.append((left, chrom_end, int((ends >= left).sum())))
    return rows


def main():
    """Read the GFF-like table and write per-window counts per chromosome."""
    df = pd.read_csv(filename, sep='\t', header=None)
    # `with` guarantees the output file is closed even if an error occurs.
    with open(fileresult, "w") as out:
        for i in range(1, CHROMOSOME_COUNT + 1):
            cluster_name = 'cluster0' + str(i)  # chromosome name in column 0
            chrom_rows = df.loc[df.iloc[:, 0] == cluster_name]
            chrom_ends = chrom_rows.iloc[:, END_COLUMN]
            # A chromosome without any rows simply yields no output lines.
            for left, right, count in count_ends_per_window(
                    chrom_ends, WINDOW_SIZE):
                out.write('chr0' + str(i) + '\t' + str(left) + '\t'
                          + str(right) + '\t' + str(count) + '\n')
    print('finished')


if __name__ == "__main__":
    main()
网友评论