/// Scans a FASTA-style file and splits its records into contiguous byte ranges,
/// at most one range per worker thread.
///
/// A record starts at every line whose first byte is `>` and runs up to the next
/// such line, or to the end of the input for the last record. Anything before
/// the first `>` line is not part of any range.
///
/// Offsets are counted from the point where reading begins, so `file` is
/// expected to be positioned at its start. Reading advances the file cursor to
/// the end of the input.
///
/// # Arguments
///
/// * `file` - the file to scan.
/// * `num_threads` - the maximum number of ranges to produce. A value of `0`
///   is treated as `1` instead of dividing by zero.
///
/// # Returns
///
/// A vector of `(start, end)` byte offsets with `end` exclusive. Every range
/// begins at a `>` line and holds whole records only. Fewer than `num_threads`
/// ranges are returned when there are not enough records to go around, and the
/// vector is empty when the input contains no `>` line.
///
/// # Errors
///
/// Returns any I/O error raised while reading `file`.
fn find_sequence_boundaries(file: &File, num_threads: usize) -> io::Result<Vec<(u64, u64)>> {
    let mut reader = BufReader::new(file);
    let mut sequence_starts: Vec<u64> = Vec::new(); // byte offset of every `>` line
    let mut current_position: u64 = 0; // offset of the line about to be read
    let mut line: Vec<u8> = Vec::new(); // reused for every line

    loop {
        line.clear();
        // `read_until` keeps the terminator and reports the exact number of bytes
        // consumed, so offsets stay correct for `\r\n` endings and for a final
        // line without a trailing newline. Working on raw bytes also means
        // non-UTF-8 content does not abort the scan.
        let bytes_read = reader.read_until(b'\n', &mut line)?;
        if bytes_read == 0 {
            break;
        }
        if line.first() == Some(&b'>') {
            sequence_starts.push(current_position);
        }
        current_position += bytes_read as u64;
    }
    let end_of_input = current_position;

    if sequence_starts.is_empty() {
        return Ok(Vec::new());
    }

    // Ceiling division written without `len + n - 1`, which could overflow.
    let chunk_count = num_threads.max(1);
    let total = sequence_starts.len();
    let sequences_per_chunk = total / chunk_count + usize::from(total % chunk_count != 0);

    // Each chunk begins at every `sequences_per_chunk`-th record and ends where
    // the next chunk begins; the last chunk runs to the end of the input.
    let chunk_starts: Vec<u64> = sequence_starts
        .iter()
        .copied()
        .step_by(sequences_per_chunk)
        .collect();
    let chunk_ends = chunk_starts
        .iter()
        .copied()
        .skip(1)
        .chain(std::iter::once(end_of_input));

    Ok(chunk_starts.iter().copied().zip(chunk_ends).collect())
}
// "Reader comments" — stray web-page heading, not part of the program.