File Processing
Since my day-to-day work often involves batch processing of large numbers of files, I am collecting the scripts I have used.
The main topics covered are:
1. File searching; splitting file contents and extracting fields; merging the fields extracted from multiple files; counting field values
Python's collections.Counter is very convenient for counting and is recommended. I have not tested how fast it is on very large datasets; that would be worth benchmarking later.
(P.S. Counting tens of thousands of records is no problem at all; I have used it for that.)
Since my daily work is done on Linux, these scripts are written with a server environment in mind; if you run them on Windows, some small details may need adjusting. Python can also raise encoding errors when reading files; see other posts for details.
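As a minimal sketch of working around the encoding issue, the snippet below opens a file with an explicit encoding; the file name data.txt and the choice of utf-8 are placeholders, adjust them to your data.

# Minimal sketch: open a file with an explicit encoding to avoid platform-dependent
# decoding errors. 'data.txt' and 'utf-8' are placeholders.
with open('data.txt', encoding='utf-8', errors='replace') as fh:
    for line in fh:
        print(line.rstrip('\n'))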
import os

def search_file(base, tag, outpath):
    '''
    Parameters: base, root directory to search; tag, suffix that the target file names end with;
    outpath, file the matching paths are written to
    '''
    outfile = open(outpath, 'w')
    for dirbase, dirnames, filenames in os.walk(base):
        for filename in filenames:
            if filename.endswith(tag):  # use filename.startswith(tag) to match a prefix instead
                path = os.path.join(dirbase, filename)
                outfile.write(path + '\n')
    outfile.close()
    return
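For example, to collect every .bam file under a project directory (the paths here are placeholders):

# Hypothetical usage: write all *.bam paths under /data/project to bam_paths.txt
search_file('/data/project', '.bam', 'bam_paths.txt')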
def write_merge_scripts(p_list):
    '''
    Parameter: p_list, a list of sample directories. For each sample the *.bwa.srt.bam files
    under its 01.readfilter subdirectory are collected; if a sample has more than one bam,
    a per-sample samtools merge script is generated and registered in merge.sh.
    '''
    sh = open(r'./merge.sh', 'w')  # top-level driver script that calls each per-sample script
    sh.write('echo start' + '\n')
    for base in p_list:
        cur_path = os.path.join(base, '01.readfilter')
        sampleid = os.path.split(base)[-1]
        bam_list = list()
        for dirbase, dirnames, filenames in os.walk(cur_path):
            for filename in filenames:
                if filename.endswith('bwa.srt.bam'):
                    path = os.path.join(dirbase, filename)
                    bam_list.append(path)
        if len(bam_list) > 1:
            name = os.path.join(r'/OLD_LIB/home/bam', sampleid + '.merge.sh')
            sh_merge = open(name, 'w')
            outbam = '/OLD_LIB/home/works/bam/' + sampleid + '.merge.bam'
            cmd = 'samtools merge -@ 10 ' + outbam + ' ' + ' '.join(bam_list)
            index = 'samtools index -@ 5 ' + outbam + ' ' + outbam + '.bai'
            sh_merge.write(cmd + '\n')
            sh_merge.write(index + '\n')
            sh_merge.close()
            sh.write('sh ' + name + '\n')
    sh.close()
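A sketch of how it could be called; the sample directories below are placeholders. Running the generated merge.sh afterwards executes each per-sample script in turn.

# Hypothetical usage: sample directory paths are placeholders.
sample_dirs = ['/OLD_LIB/home/works/sampleA', '/OLD_LIB/home/works/sampleB']
write_merge_scripts(sample_dirs)
# then, on the server:  sh ./merge.sh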
import os
import sys

def file_split(path):
    '''Parameter: path, the file to process (tab-separated, VCF-style)'''
    if not os.path.exists(path):
        print('file does not exist!\nPlease recheck')
        sys.exit(1)
    diction = dict()   # all sites
    diction1 = dict()  # sites with alt-allele frequency above 0.2
    with open(path) as file:
        for line in file:
            if not line.startswith('#'):
                aa = line.strip().split('\t')
                # assumes VCF column order: CHROM, POS, ID, REF, ALT
                chrom, pos, ref, alt = aa[0], aa[1], aa[3], aa[4]
                key = chrom + '\t' + pos + '\t' + ref + '\t' + alt
                names = aa[8].split(':')  # FORMAT field names
                vals = aa[9].split(':')   # values of the first sample column
                infs = dict(zip(names, vals))
                ad = infs['AD']           # alt-supporting depth
                rd = infs['RD']           # ref-supporting depth
                dp = int(ad) + int(rd)
                freq = round(int(ad) / dp, 4)
                if key not in diction:
                    diction[key] = str(dp) + '\t' + str(ad) + '\t' + str(freq)
                    if freq > 0.2:
                        diction1[key] = str(dp) + '\t' + str(ad) + '\t' + str(freq)
    return diction, diction1
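The overview above also lists merging the fields extracted from multiple files. A minimal sketch of that step, assuming file_split above and a hypothetical list of input paths, could look like this:

# Hypothetical sketch: merge the per-site records extracted from several files.
# vcf_paths and merged.txt are placeholders; merged maps each site key to one record per file.
vcf_paths = ['sample1.vcf', 'sample2.vcf']
merged = dict()
for p in vcf_paths:
    diction, diction1 = file_split(p)
    for key, val in diction.items():
        merged.setdefault(key, []).append(p + '\t' + val)
with open('merged.txt', 'w') as out:
    for key, records in merged.items():
        for rec in records:
            out.write(key + '\t' + rec + '\n')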
from collections import Counter

def count_words(path):
    '''The template file has one word per line; if a line has multiple columns,
    split it into individual words first.'''
    words = list()
    with open(path) as file:
        for line in file:
            aa = line.strip()
            words.append(aa)
    return Counter(words).most_common()
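A quick illustration of the call; the file name words.txt is a placeholder. most_common() returns (word, count) pairs sorted by descending count.

# Hypothetical usage: words.txt contains one word per line.
for word, count in count_words('words.txt'):
    print(word, count)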