群体遗传分析中SNP过滤中通常有一步:保留1/3个体平均深度<位点深度<3倍个体平均深度的位点,所以写了个小脚本占个位,有需要方便复制粘贴。
需要所有样本深度,文件获得,参考:二代测序通过bam文件统计深度和覆盖度 - 简书 (jianshu.com)。
依赖安装:pip install pysam
python3 filter_depth.py invcf.gz outvcf.gz depth_file
"""
author: bioshimmer
data: 2023/07/14
info: This script is to filter VCF file,the depth out of 1/3 and 3 multiply
the actually depth of resequencing are setted to ./. and the line which are all
./. are deleted.
"""
import pysam
import sys
def get_depth_dict(indepth_file:str)->dict:
depth_dict = {}
with open(indepth_file,"r") as f:
for line in f.readlines():
line = line.rstrip().split()
depth_dict[line[0]] = float(line[1])
return depth_dict
def main():
if(len(sys.argv) < 3):
print("Usage: InVCF OutVCF Indepth_file" )
exit(1)
Infile = sys.argv[1]
Outfile = sys.argv[2]
indepth = sys.argv[3]
depth_dict = get_depth_dict(indepth_file=indepth)
depth_dict_min = {}
depth_dict_max = {}
for i,j in depth_dict.items():
depth_dict_min[i] = j/3
depth_dict_max[i] = j*3
f1 = pysam.VariantFile(Infile,threads = 2)
f2 = pysam.VariantFile(Outfile,"wb",header = f1.header,threads = 2)
for rec in f1.fetch():
for asample,sample_info in rec.samples.items():
if sample_info['DP'] == None:
pass
elif sample_info['DP'] >= depth_dict_min[asample] and sample_info['DP'] <= depth_dict_max[asample]:
pass
else:
sample_info['GT'] = (None,None)
f2.write(rec)
f1.close()
f2.close()
if __name__ == "__main__":
main()
网友评论