高通量数据比对讲究的就是一个快和准,因此大部分软件都是用C语言实现。BWA是目前基因组序列比对最常用的工具,由于自我感觉已经入门C语言,为了提高自己的水平,因此开始从源码角度学习李恒大神开发的BWA工具。
本次介绍的是bntseq.c
开头这部分看起来强行凑代码量的表格,但本质上是一种用空间换时间哈希函数。
unsigned char nst_nt4_table[256] = {
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5 /*'-'*/, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
};
李恒利用该表格将ACGT/acgt转成0-3的数组,例如A,C,G,T,N的位置分别是65,67, 71,84,78, a,c,g,t对应97,99,103,116. 除了这个四个碱基外,其余都是大于4,对应N。
unsigned char nst_nt4_table[256] = {
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5 /*'-'*/, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 0(A), 4, 1(C), 4, 4, 4, 2(G), 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 3(T), 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 0(a), 4, 1(c), 4, 4, 4, 2(g), 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 3(t), 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
};
当然你可以写多个条件语句,来实现这种转换,例如下面的代码
char nuc;
int c;
switch (nuc){
case tolower(nuc) == 'a'; c = 0;
break;
case tolower(nuc) == 'c'; c = 1;
break;
case tolower(nuc) == 'g'; c = 2;
break;
case tolower(nuc) == 't'; c = 3;
break;
default: c = 4;
break;
}
但是tolower涉及到加减运算,又会额外增加计算量。如果不用tolower,我们再加4个逻辑判断,那么每次将碱基转成数字平均为4次逻辑判断,在数据量很大的情况下,也是不小的负担。
而如果直接编码成长度为256数组,每次查询只要做一次计算,速度相当于提高了四倍。用额外的空间换取了速度的优势。
网友评论