数据分析中常常会有这样的转换需求:
图片.png
脚本如下:
import sys
def collect_go_terms(lines):
    """Group the second tab-separated column by the first column.

    Args:
        lines: iterable of text lines of the form ``accession<TAB>value``.

    Returns:
        A dict mapping each accession to the set of distinct values seen
        for it.  Lines with fewer than two tab-separated fields (blank
        lines, for example) are skipped.
    """
    acc2go = {}
    for line in lines:
        fields = line.strip().split('\t')
        if len(fields) < 2:
            continue
        accession, go = fields[0], fields[1]
        # A set drops repeated values.  To keep duplicates, use a list:
        # acc2go.setdefault(accession, []).append(go)
        acc2go.setdefault(accession, set()).add(go)
    return acc2go


def format_go_rows(acc2go):
    """Yield one ``accession<TAB>count<TAB>v1,v2,...`` line per accession.

    The second column is the number of values collected for the accession.
    """
    for acc, terms in acc2go.items():
        # Sets have no defined order; sort so the output is reproducible.
        yield "%s\t%d\t%s\n" % (acc, len(terms), ",".join(sorted(terms)))


# Usage: python script.py <input.tsv> <output.tsv>
if __name__ == "__main__":
    with open(sys.argv[1], 'r') as infile:
        acc2go = collect_go_terms(infile)
    with open(sys.argv[2], 'w') as res:
        res.writelines(format_go_rows(acc2go))
图片.png
另一种写法(注意这里的输入文件以空格分隔):
import sys
def merge_values(lines):
    """Merge the values of repeated keys into one comma-separated string.

    Args:
        lines: iterable of text lines of the form ``key value`` (separated
            by a single space).

    Returns:
        A dict mapping each key to its values joined with ``,`` in input
        order; repeated values are kept.  Lines with fewer than two fields
        (blank lines, for example) are skipped instead of raising
        IndexError.
    """
    collected = {}
    for line in lines:
        fields = line.strip().split(' ')
        if len(fields) < 2:
            continue
        collected.setdefault(fields[0], []).append(fields[1])
    # Join once per key instead of growing a string with += for every line.
    return {key: ",".join(values) for key, values in collected.items()}


# Usage: python script.py <input.txt> <output.tsv>
if __name__ == "__main__":
    with open(sys.argv[1], 'r') as old:
        num = merge_values(old)
    with open(sys.argv[2], 'w') as new:
        for key, value in num.items():
            print(f'{key}\t{value}', file=new)
OK,换个思路:如果想把数据从右边的格式转换回左边的格式,该怎么做呢?
图片.png
第一种
import sys
def Trans_file(f1,f2):
    """Expand ``key v1,v2,...`` lines from f1 into ``key<TAB>value`` lines.

    Args:
        f1: readable text file object; each line holds a key and a
            comma-separated value list separated by a single space.
        f2: writable text file object receiving one ``key<TAB>value`` line
            per value.

    Each row is also echoed to stdout.  Lines without a second
    space-separated field (blank lines, for example) are skipped instead
    of raising IndexError.
    """
    # Iterate the file directly rather than loading it all via readlines().
    for line in f1:
        fields = line.split(' ')
        if len(fields) < 2:
            continue
        key = fields[0]
        for value in fields[1].split(','):
            row = key + '\t' + value
            # Only the last value of a line still carries the line's
            # newline, so add one to the others.
            if not row.endswith('\n'):
                row += '\n'
            print(row)
            f2.write(row)
# Usage: python script.py <input.txt> <output.tsv>
# The with-statement closes (and flushes) both files even if Trans_file raises.
if __name__ == "__main__":
    with open(sys.argv[1], 'r') as f1, open(sys.argv[2], 'w') as f2:
        Trans_file(f1, f2)
第二种
import sys
def expand_values(lines):
    """Yield a ``key<TAB>value`` row for every comma-separated value.

    Args:
        lines: iterable of text lines of the form ``key v1,v2,...`` (key
            and value list separated by a single space).

    Yields:
        One ``key<TAB>value`` string (without a trailing newline) per
        value, in input order.

    Rows are produced line by line rather than buffered in a dict keyed by
    gene, so a key that appears on several input lines keeps all of its
    rows instead of only the last one.  Lines with fewer than two fields
    (blank lines, for example) are skipped.
    """
    for line in lines:
        fields = line.strip().split(' ')
        if len(fields) < 2:
            continue
        gene = fields[0]
        for value in fields[1].split(','):
            yield f'{gene}\t{value}'


# Usage: python script.py <input.txt> <output.tsv>
if __name__ == "__main__":
    with open(sys.argv[1], 'r') as p, open(sys.argv[2], 'w') as final:
        for row in expand_values(p):
            print(row, file=final)
嗯,大功告成~~
另外,对于第一种情况,有时我们希望对重复键对应的数值求和。用 Python 实现时,只需在上面脚本的基础上稍微改动几条语句即可(当然,在 R 中已有许多现成的函数可以完成这种处理):
import sys  # this snippet reads sys.argv, so it needs the import to run standalone


def sum_values(lines):
    """Sum the numeric second column for every key in the first column.

    Args:
        lines: iterable of text lines of the form ``key number`` (separated
            by a single space).

    Returns:
        A dict mapping each key to the float sum of its values.  Lines with
        fewer than two fields (blank lines, for example) are skipped
        instead of raising IndexError.

    Raises:
        ValueError: if a second field cannot be converted by ``float``.
    """
    totals = {}
    for line in lines:
        fields = line.strip().split(' ')
        if len(fields) < 2:
            continue
        key, amount = fields[0], float(fields[1])
        if key in totals:
            totals[key] += amount
        else:
            totals[key] = amount
    return totals


# Usage: python script.py <input.txt> <output.tsv>
if __name__ == "__main__":
    with open(sys.argv[1], 'r') as old:
        num = sum_values(old)
    with open(sys.argv[2], 'w') as new:
        for key, value in num.items():
            print(f'{key}\t{value}', file=new)
网友评论