mosdepth是一个用于计算测序深度的软件
使用方法可参考下面这篇文章
https://www.jianshu.com/p/6bd56cd8d59e
mosdepth在计算一个区间的测序深度时,会用这个区间所有碱基比对到的reads数目的和除以这个区间的碱基数目(包括深度为0的位点),在对一些多倍体进行计算深度时可能因此导致总体测序深度偏低
比如测序深度50X的数据可能会因此算出来的平均深度只有5到20X左右
因此为了真实反映测序胜读,我们需要对mosdepth的输出结果进行改善,即把深度为0的碱基位点去掉,再去计算区间的平均深度或中间值深度
首先使用mosdepth输出每个碱基的测序深度
mosdepth -Q 30 -t 8 output input
然后再使用我写的脚本将深度为0的碱基位点去掉,计算区间的深度(平均深度或中间值深度)
脚本如下
import argparse
import sys
import gzip
import numpy as np
pars=argparse.ArgumentParser()
pars.add_argument("-perbase_file",required=True,help="The output file of mosdepth which contains per base depth")
pars.add_argument("-win_size",required=True,help="The window size for calcuating depth")
pars.add_argument("-out",required=True,help="The output dir")
pars.add_argument("-type",required=True,help="The way of calculating depth,it should be mean or median ")
args=pars.parse_args()
if args.perbase_file:
file_index=sys.argv.index("-perbase_file")
perbase_file=sys.argv[file_index+1]
else:
raise Exception("Please input the per base depth file")
if args.win_size:
size_index=sys.argv.index("-win_size")
win_size=int(sys.argv[size_index+1])
else:
raise Exception("Please input the window size")
if args.out:
out_index=sys.argv.index("-out")
out_dir=sys.argv[out_index+1]
else:
raise Exception("Please input the output file")
if args.type:
type_index=sys.argv.index("-type")
cal_type=sys.argv[type_index+1]
else:
raise Exception("Please choose the way of calculating depth,it should be mean or median ")
###读取物种名
out_str=perbase_file.replace("\n","").split("/")[-1]
spe_name=out_str.split(".")[0]
def list_append(add_list,add_num,add_value):
if(add_num==0):
return add_list
else:
for i in range(0,add_num):
add_list.append(add_value)
return add_list
###构建输出函数
def mean_out_put(chr,region_list,depth_list):
start_pos=0
depth_sum=0
depth_num=0
size_num=0
for i in range(0,len(region_list)):
region=region_list[i]
depth=depth_list[i]
if(region[-1]-start_pos<win_size):
if(i==len(region_list)-1):
size_num+=1
if (depth == 0):
with open(r"%s/%s.region.txt" % (out_dir, spe_name), 'a+') as f:
if (depth_num == 0):
print("%s\t%d\t%d\t%f" % (chr, win_size * (size_num - 1), win_size * size_num, 0.0), file=f)
else:
print("%s\t%d\t%d\t%f" % (chr, win_size * (size_num - 1), win_size * size_num, (depth_sum * 1.0 / depth_num)), file=f)
else:
depth_num += (region[-1] - region[0])
depth_sum += ((region[-1] - region[0]) * depth)
with open(r"%s/%s.region.txt" % (out_dir, spe_name), 'a+') as f:
print("%s\t%d\t%d\t%f" % (chr, win_size * (size_num - 1), win_size * size_num, (depth_sum * 1.0 / depth_num)), file=f)
else:
if(depth==0):
continue
else:
depth_num+=(region[-1]-region[0])
depth_sum+=((region[-1]-region[0])*depth)
if(region[-1]-start_pos>win_size):
if (i == len(region_list)-1):
if (depth == 0):
size_num += 1
start_pos = win_size * size_num
with open(r"%s/%s.region.txt" % (out_dir, spe_name), 'a+') as f:
if (depth_num == 0):
print("%s\t%d\t%d\t%f" % (chr, win_size * (size_num - 1), win_size * size_num, 0.0), file=f)
else:
print("%s\t%d\t%d\t%f" % (
chr, win_size * (size_num - 1), win_size * size_num, (depth_sum * 1.0 / depth_num)), file=f)
depth_sum = 0
depth_num = 0
with open(r"%s/%s.region.txt" % (out_dir, spe_name), 'a+') as f:
print("%s\t%d\t%d\t%f" % (chr, win_size * size_num , win_size * (size_num+1), 0.0), file=f)
else:
size_num += 1
start_pos = win_size * size_num
depth_num += (start_pos - region[0])
depth_sum += ((start_pos - region[0]) * depth)
with open(r"%s/%s.region.txt" % (out_dir, spe_name), 'a+') as f:
print("%s\t%d\t%d\t%f" % (
chr, win_size * (size_num - 1), win_size * size_num, (depth_sum * 1.0 / depth_num)), file=f)
depth_sum = 0
depth_num = 0
depth_num += region[-1] - start_pos
depth_sum += ((region[-1] - start_pos) * depth)
with open(r"%s/%s.region.txt" % (out_dir, spe_name), 'a+') as f:
print("%s\t%d\t%d\t%f" % (
chr, win_size * (size_num ), win_size * (size_num+1), (depth_sum * 1.0 / depth_num)), file=f)
else:
if (depth == 0):
size_num += 1
start_pos = win_size * size_num
with open(r"%s/%s.region.txt" % (out_dir, spe_name), 'a+') as f:
if (depth_num == 0):
print("%s\t%d\t%d\t%f" % (chr, win_size * (size_num - 1), win_size * size_num, 0.0), file=f)
else:
print("%s\t%d\t%d\t%f" % (
chr, win_size * (size_num - 1), win_size * size_num, (depth_sum * 1.0 / depth_num)), file=f)
depth_sum = 0
depth_num = 0
else:
size_num += 1
start_pos = win_size * size_num
depth_num += (start_pos - region[0])
depth_sum += ((start_pos - region[0]) * depth)
with open(r"%s/%s.region.txt" % (out_dir, spe_name), 'a+') as f:
print("%s\t%d\t%d\t%f" % (
chr, win_size * (size_num - 1), win_size * size_num, (depth_sum * 1.0 / depth_num)), file=f)
depth_sum = 0
depth_num = 0
depth_num += region[-1] - start_pos
depth_sum += ((region[-1] - start_pos) * depth)
elif(region[-1]-start_pos==win_size):
if(depth==0):
size_num+=1
start_pos=win_size*size_num
with open(r"%s/%s.region.txt" % (out_dir,spe_name),'a+') as f:
if(depth_num==0):
print("%s\t%d\t%d\t%f" % (chr,win_size*(size_num-1),win_size*size_num,0.0),file=f)
else:
print("%s\t%d\t%d\t%f" % (chr, win_size * (size_num - 1), win_size * size_num, (depth_sum*1.0/depth_num)), file=f)
depth_sum=0
depth_num=0
else:
size_num += 1
start_pos = win_size * size_num
depth_num+=(start_pos-region[0])
depth_sum+=((start_pos-region[0])*depth)
with open(r"%s/%s.region.txt" % (out_dir,spe_name),'a+') as f:
print("%s\t%d\t%d\t%f" % (chr, win_size * (size_num - 1), win_size * size_num, (depth_sum * 1.0 / depth_num)), file=f)
depth_sum=0
depth_num=0
depth_num+=region[-1]-start_pos
depth_sum+=((region[-1]-start_pos)*depth)
def mid_out_put(chr,region_list,depth_list):
start_pos = 0
size_num = 0
depth_num=0
depth_value=[]
for i in range(0,len(region_list)):
region=region_list[i]
depth=depth_list[i]
if (region[-1] - start_pos < win_size):
if (i == len(region_list) - 1):
size_num += 1
if (depth == 0):
with open(r"%s/%s.region.txt" % (out_dir, spe_name), 'a+') as f:
if (depth_num == 0):
print("%s\t%d\t%d\t%f" % (chr, win_size * (size_num - 1), win_size * size_num, 0.0), file=f)
else:
print("%s\t%d\t%d\t%f" % (chr, win_size * (size_num - 1), win_size * size_num, np.median(depth_value)), file=f)
else:
depth_num = (region[-1] - region[0])
depth_value=list_append(add_list=depth_value,add_num=depth_num,add_value=depth)
with open(r"%s/%s.region.txt" % (out_dir, spe_name), 'a+') as f:
print("%s\t%d\t%d\t%f" % (
chr, win_size * (size_num - 1), win_size * size_num, np.median(depth_value)), file=f)
else:
if (depth == 0):
continue
else:
depth_num=(region[-1] - region[0])
depth_value=list_append(add_list=depth_value,add_num=depth_num,add_value=depth)
if (region[-1] - start_pos > win_size):
if (i == len(region_list) - 1):
if (depth == 0):
size_num += 1
start_pos = win_size * size_num
with open(r"%s/%s.region.txt" % (out_dir, spe_name), 'a+') as f:
if (depth_num == 0):
print("%s\t%d\t%d\t%f" % (chr, win_size * (size_num - 1), win_size * size_num, 0.0), file=f)
else:
print("%s\t%d\t%d\t%f" % (
chr, win_size * (size_num - 1), win_size * size_num, np.median(depth_value)),
file=f)
with open(r"%s/%s.region.txt" % (out_dir, spe_name), 'a+') as f:
print("%s\t%d\t%d\t%f" % (chr, win_size * size_num, win_size * (size_num + 1), 0.0), file=f)
else:
size_num += 1
start_pos = win_size * size_num
depth_num =(start_pos - region[0])
depth_value=list_append(add_list=depth_value,add_num=depth_num,add_value=depth)
with open(r"%s/%s.region.txt" % (out_dir, spe_name), 'a+') as f:
print("%s\t%d\t%d\t%f" % (
chr, win_size * (size_num - 1), win_size * size_num, np.median(depth_value)), file=f)
depth_value=[]
depth_num = 0
depth_num = region[-1] - start_pos
depth_value=list_append(add_list=depth_value,add_num=depth_num,add_value=depth)
with open(r"%s/%s.region.txt" % (out_dir, spe_name), 'a+') as f:
print("%s\t%d\t%d\t%f" % (
chr, win_size * (size_num), win_size * (size_num + 1), np.median(depth_value)),
file=f)
else:
if (depth == 0):
size_num += 1
start_pos = win_size * size_num
with open(r"%s/%s.region.txt" % (out_dir, spe_name), 'a+') as f:
if (depth_num == 0):
print("%s\t%d\t%d\t%f" % (chr, win_size * (size_num - 1), win_size * size_num, 0.0), file=f)
else:
print("%s\t%d\t%d\t%f" % (
chr, win_size * (size_num - 1), win_size * size_num, np.median(depth_value)),
file=f)
depth_value=[]
depth_num = 0
else:
size_num += 1
start_pos = win_size * size_num
depth_num = (start_pos - region[0])
depth_value=list_append(add_list=depth_value,add_num=depth_num,add_value=depth)
with open(r"%s/%s.region.txt" % (out_dir, spe_name), 'a+') as f:
print("%s\t%d\t%d\t%f" % (
chr, win_size * (size_num - 1), win_size * size_num, np.median(depth_value)), file=f)
depth_value=[]
depth_num = 0
depth_num = region[-1] - start_pos
depth_value=list_append(add_list=depth_value,add_num=depth_num,add_value=depth)
elif (region[-1] - start_pos == win_size):
if (depth == 0):
size_num += 1
start_pos = win_size * size_num
with open(r"%s/%s.region.txt" % (out_dir, spe_name), 'a+') as f:
if (depth_num == 0):
print("%s\t%d\t%d\t%f" % (chr, win_size * (size_num - 1), win_size * size_num, 0.0), file=f)
else:
print("%s\t%d\t%d\t%f" % (
chr, win_size * (size_num - 1), win_size * size_num, np.median(depth_value)), file=f)
depth_value=[]
depth_num = 0
else:
size_num += 1
start_pos = win_size * size_num
depth_num = (start_pos - region[0])
depth_value=list_append(add_list=depth_value,add_num=depth_num,add_value=depth)
with open(r"%s/%s.region.txt" % (out_dir, spe_name), 'a+') as f:
print("%s\t%d\t%d\t%f" % (
chr, win_size * (size_num - 1), win_size * size_num, np.median(depth_value)), file=f)
depth_value=[]
depth_num = 0
###读取文件,将每个depth值的区段和depth值按照染色体来存储
chr_list=[]
###先将第一条染色体来添加
chr_list.append("LG01")
region_list=[]
depth_list=[]
chr_index=0
with gzip.open(perbase_file,'rt') as f:
for line in f:
out_str=line.replace("\n","").split("\t")
if(out_str[0] not in chr_list):
chr_list.append(out_str[0])
chr=chr_list[chr_index]
chr_index+=1
if(cal_type=="mean"):
mean_out_put(chr,region_list=region_list,depth_list=depth_list)
if(cal_type=="median"):
mid_out_put(chr,region_list=region_list,depth_list=depth_list)
region_list=[]
depth_list=[]
else:
region=[]
region.append(int(out_str[1]))
region.append(int(out_str[2]))
depth_list.append(int(out_str[3]))
region_list.append(region)
脚本中第一条染色体名称或者物种名的获取可能有些差异,但大体思路是这样的
最后运行脚本
python mosdepth_change.py -perbase_file $i -win_size 1000000 -out /projects01/DS20070100001/05.Subgenomes/01.split/reseq_depth/02.depths/mosdepth_new -type median
网友评论