采用的策略:
起始位置减去终止位置为标记长度,如果距离大于100kb
,保留这一行,如果标记是1或2,下一个标记(0,来自亲本1)的长度小于100kb,则合并起始位置
(这里主要保留导入片段)
import sys
def process_file(filename):
# 存储处理后的行
processed_lines = []
# 用于临时存储上一行的信息
last_line = None
with open(filename, 'r') as file:
for line_number, line in enumerate(file, 1):
try:
# 分割每行数据
parts = line.strip().split('\t')
if len(parts) < 4:
continue
chrom, start, end, tag = parts[0], int(parts[1]), int(parts[2]), int(parts[3])
# 计算长度
length = end - start
# 如果是第一个标记,直接保存
if last_line is None:
last_line = (chrom, start, end, tag, length)
continue
# 检查是否需要合并
# if last_line[3] in [1, 2] and length < 100 * 1024: # 100kb
if last_line[0] == chrom and last_line[3] in [1, 2] and length < 100 * 1024: # 100kb
# 合并
last_line = (last_line[0], last_line[1], end, last_line[3], end - last_line[1])
else:
# 添加上一行到结果列表
processed_lines.append(f"{last_line[0]}\t{last_line[1]}\t{last_line[2]}\t{last_line[3]}")
last_line = (chrom, start, end, tag, length)
except ValueError as e:
print(f"Error processing line {line_number}: {e}")
except Exception as e:
print(f"Unexpected error on line {line_number}: {e}")
# 添加最后一行
if last_line is not None:
processed_lines.append(f"{last_line[0]}\t{last_line[1]}\t{last_line[2]}\t{last_line[3]}")
return processed_lines
if __name__ == "__main__":
if len(sys.argv) != 2:
print("Usage: python script.py filename")
sys.exit(1)
filename = sys.argv[1]
result = process_file(filename)
# 输出结果
for line in result:
print(line)
注意这里有相同的连续标记了
合并相同标记
import sys
prev_chr = None
prev_mark = None
min_start = None
max_end = None
with open(sys.argv[1], 'r') as file:
# next(file) # 跳过表头行
for line in file:
columns = line.strip().split()
if len(columns) < 4:
continue
current_chr = columns[0]
current_start = int(columns[1])
current_end = int(columns[2])
current_mark = columns[3]
if current_chr == prev_chr and current_mark == prev_mark:
max_end = current_end
else:
if prev_chr is not None:
print(f"{prev_chr}\t{min_start}\t{max_end}\t{prev_mark}")
prev_chr = current_chr
prev_mark = current_mark
min_start = current_start
max_end = current_end
if prev_chr is not None:
print(f"{prev_chr}\t{min_start}\t{max_end}\t{prev_mark}")
合并后的结果
网友评论