来自rMVP的关联后的数据;峰拆分输出QTNs
getposQTNs.py
import os
import sys
from numpy import median
goin = sys.argv[1]
ranges = int(sys.argv[2])
file = open(goin,"r")
lines = list(file.readlines())
dic = {}
dicout = {}
for i in lines:
i = i.strip()
if i.find("SNP") == -1:
i = i.replace('"','')
i = i.split(",")
chrn = i[1]
#print(i)
dicout[i[1]+"-"+str(i[2])] = [i[5],i[7]]
if chrn not in dic.keys():
dic[chrn] = []
dic[chrn].append(int(i[2]))
else:
dic[chrn].append(int(i[2]))
#print(dic)
dicoutmid = {}
for i in dic.keys():
dicoutmid[i] =[]
listpos = sorted(dic[i])#,reverse=True)
#print(listpos)
for j in range(len(listpos)):
if j == 0:
gl = []
gl.append(listpos[j])
if j + 1 < len(listpos):
if listpos[j+1] - listpos[j] < ranges:
gl.append(listpos[j+1])
else:
dicoutmid[i].append(gl)
gl = []
gl.append(listpos[j+1])
else:
dicoutmid[i].append(gl)
#print(dicoutmid)
for i in dicoutmid.keys():
for j in dicoutmid[i]:
if len(j) %2 == 0:
j = j[:-1]
pos = median(j)
effect = dicout[i+"-"+str(int(round(pos)))][0]
pvalue = dicout[i+"-"+str(int(round(pos)))][1]
print(i,int(round(pos)),effect,pvalue)
基于GFF关联基因,以水稻RAP注释为例
(locus.gff文件;https://rapdb.dna.affrc.go.jp/download/archive/irgsp1/IRGSP-1.0_representative_2022-09-01.tar.gz)
需要处理一下chromosome编号保证相同
sed -i 's/chr0//g' locus.gff
sed -i 's/chr//g' locus.gff
mergegeneandpoiresult.py
import sys
goin = sys.argv[2] #上面的QTNs输出
goingff = sys.argv[1] #locus.gff
ranges = int(sys.argv[3]) #关联区间
filegff = open(goingff,"r")
filel = filegff.readlines()
filegff.close()
dicg ={}
for i in filel:
i = i.strip()
i = i.split()
chrg = i[0]
gst = int(i[3])
ged = int(i[4])
chr_locus= chrg +"_"+str(gst) +"_"+ str(ged)
nameID = i[8].split("ID=")[1].split(";")[0]
dicg[nameID] = chr_locus
#print(dicg)
file = open(goin,"r")
fileline = file.readlines()
file.close()
print("Chr\tPos\tEffect\tPvalue\tGenes")
for i in fileline:
i = i.split()
chrt = i[0]
chrtst = int(i[1])-ranges
chrted = int(i[1])+ranges
headstr = ""
#print(chrt,chrtst,chrted)
for k in i:
headstr += k+"\t"
headstr += "gene"
print(headstr,end = ":")
for j in dicg.keys():
#print(dicg[j])
gst = int(dicg[j].split("_")[1])
ged = int(dicg[j].split("_")[2])
chrg = dicg[j].split("_")[0]
#print(chrg,gst,ged)
#break
if chrg == chrt and gst > chrtst and ged<chrted:
print(j,end = "\t")
print()
基于3V结果的一个峰拆分输出独特的QTNs
xlsx转换为csv
xlsx2csv.py
import sys
import pandas as pd
goin = sys.argv[1]
goout = sys.argv[2]
tblnum = int(sys.argv[3])
def xlsx_to_csv_pd():
data_xls = pd.read_excel(goin, sheet_name=tblnum)
data_xls.to_csv(goout, encoding='utf-8')
if __name__ == '__main__':
xlsx_to_csv_pd()
独特QTNs输出
getposQTNs3V.py
import os
import sys
from numpy import median
goin = sys.argv[1]
ranges = int(sys.argv[2])
file = open(goin,"r")
lines = list(file.readlines())
dic = {}
dicout = {}
for i in lines:
i = i.strip()
if i.find("ID") == -1:
i = i.replace('"','')
i = i.split(",")
chrn = i[4]
#print(i)
dicout[i[4]+"-"+str(i[5])] = [i[6],i[11]]
if chrn not in dic.keys():
dic[chrn] = []
dic[chrn].append(int(i[5]))
else:
dic[chrn].append(int(i[5]))
#print(dic)
dicoutmid = {}
for i in dic.keys():
dicoutmid[i] =[]
listpos = sorted(dic[i])#,reverse=True)
#print(listpos)
for j in range(len(listpos)):
if j == 0:
gl = []
gl.append(listpos[j])
if j + 1 < len(listpos):
if listpos[j+1] - listpos[j] < ranges:
gl.append(listpos[j+1])
else:
dicoutmid[i].append(gl)
gl = []
gl.append(listpos[j+1])
else:
dicoutmid[i].append(gl)
#print(dicoutmid)
for i in dicoutmid.keys():
for j in dicoutmid[i]:
if len(j) %2 == 0:
j = j[:-1]
pos = median(j)
effect = dicout[i+"-"+str(int(round(pos)))][0]
pvalue = dicout[i+"-"+str(int(round(pos)))][1]
print(i,int(round(pos)),effect,pvalue)
网友评论