输入文件:
#reads ID quality length
CXX:3:000000000-KKRC3:1:1101:18934:1129 36.83 73
CXX:3:000000000-KKRC3:1:1101:17446:1165 32.30 301
如下脚本想实现质量值的频次分布,同时每种质量值的reads对应的长度,来判断质量值和长度的关系。如下图展示了质量值为38的序列长度偏短。
image.png
#!/bin/python
import numpy as np
import matplotlib.pyplot as plt
import sys
key=sys.argv[3]
# 读取文件并提取质量和长度数据
Quales = []
lengths = []
with open(sys.argv[1], 'r') as file:
lines = file.readlines()
for line in lines:
data = line.split()
Qual = float(data[1])
length = float(data[2])
Quales.append(Qual)
lengths.append(length)
# 将质量数据分配到对应的bin中
Qual_bins = np.arange(20, 40, 2) # 设置bin的边界,从20-40,没个两个数值为一个bar
Qual_bin_indices = np.digitize(Quales, Qual_bins)#这个很关键,np.digitize找到了每个bar的数据的index
#print(Qual_bin_indices)
# 计算每个bin的长度总和和数量
bin_sums = np.bincount(Qual_bin_indices, weights=lengths)
bin_counts = np.bincount(Qual_bin_indices)
# 计算每个bin的长度平均值
bin_averages = bin_sums / bin_counts
# 创建一个包含两个子图的图形
# 绘制柱状图
fig, ax1 = plt.subplots(figsize=(10, 5))
bar_width = 1
ax1.bar(Qual_bins, bin_averages, width=bar_width, color='blue', alpha=0.15)
ax1.set_ylabel('Average-length-blue')
ax2 = ax1.twinx() #这个设置是让两种Y值,共用一个X轴
ax2.bar(Qual_bins + bar_width, bin_counts, width=bar_width, color='red', alpha=0.15)
ax2.set_ylabel('Reads_num_with_each_Qual')
# 设置图形标题和轴标签
plt.title('Blue-length Red-Counts '+ key)
plt.xlabel('Reads Average Quality')
#plt.ylabel('Qual-blue or Length-red')
# 设置X轴刻度
plt.xticks((Qual_bins) + bar_width/2, Qual_bins)
# 调整图形大小和边距
fig.tight_layout()
plt.savefig(sys.argv[2])
网友评论