需求
需求是这样来的:对三通道图像的像素值进行线性运算的时候,直接调用 OpenCV 提供的函数很慢,尤其是通道拆分这一步。而 SSE/AVX 这类 SIMD 指令直接从连续内存中 load 数据比较快,用 set 逐个设置特定数据则比较慢,官方也推荐先 load,再使用 shuffle 之类的指令进行顺序变换。
内存变换图
(图:从3通道图像中加载数据.png)
当内存成功拆分成 B、G、R 三个通道后,就方便做后续运算了。
代码
/*
 * De-interleave 48 consecutive bytes starting at Src + i (read with three
 * unaligned 16-byte loads) into three 16-byte vectors: dataB, dataG, dataR.
 * Bytes at offsets 0, 3, 6, ... go to dataB, offsets 1, 4, 7, ... to dataG,
 * and offsets 2, 5, 8, ... to dataR (interleaved B,G,R order, per the names).
 *
 * The caller must guarantee that at least 48 bytes are readable from Src + i.
 *
 * Masks are written with _mm_setr_epi8, so they read left to right in memory
 * order (output byte 0 first). A mask byte of -1 has its high bit set, which
 * makes _mm_shuffle_epi8 write zero to that output byte; the three partial
 * results of a channel therefore occupy disjoint lanes and can be merged
 * with a plain bitwise OR.
 */
const __m128i chunk0 = _mm_loadu_si128((const __m128i *)(Src + i));      /* bytes  0..15 */
const __m128i chunk1 = _mm_loadu_si128((const __m128i *)(Src + i + 16)); /* bytes 16..31 */
const __m128i chunk2 = _mm_loadu_si128((const __m128i *)(Src + i + 32)); /* bytes 32..47 */

/* B channel: 6 bytes from chunk0, 5 from chunk1, 5 from chunk2. */
const __m128i b_sel0 = _mm_setr_epi8(0, 3, 6, 9, 12, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
const __m128i b_sel1 = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, 2, 5, 8, 11, 14, -1, -1, -1, -1, -1);
const __m128i b_sel2 = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 4, 7, 10, 13);
const __m128i b_part0 = _mm_shuffle_epi8(chunk0, b_sel0);
const __m128i b_part1 = _mm_shuffle_epi8(chunk1, b_sel1);
const __m128i b_part2 = _mm_shuffle_epi8(chunk2, b_sel2);
__m128i dataB = _mm_or_si128(_mm_or_si128(b_part0, b_part1), b_part2);

/* G channel: 5 bytes from chunk0, 6 from chunk1, 5 from chunk2. */
const __m128i g_sel0 = _mm_setr_epi8(1, 4, 7, 10, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
const __m128i g_sel1 = _mm_setr_epi8(-1, -1, -1, -1, -1, 0, 3, 6, 9, 12, 15, -1, -1, -1, -1, -1);
const __m128i g_sel2 = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2, 5, 8, 11, 14);
const __m128i g_part0 = _mm_shuffle_epi8(chunk0, g_sel0);
const __m128i g_part1 = _mm_shuffle_epi8(chunk1, g_sel1);
const __m128i g_part2 = _mm_shuffle_epi8(chunk2, g_sel2);
__m128i dataG = _mm_or_si128(_mm_or_si128(g_part0, g_part1), g_part2);

/* R channel: 5 bytes from chunk0, 5 from chunk1, 6 from chunk2. */
const __m128i r_sel0 = _mm_setr_epi8(2, 5, 8, 11, 14, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
const __m128i r_sel1 = _mm_setr_epi8(-1, -1, -1, -1, -1, 1, 4, 7, 10, 13, -1, -1, -1, -1, -1, -1);
const __m128i r_sel2 = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 3, 6, 9, 12, 15);
const __m128i r_part0 = _mm_shuffle_epi8(chunk0, r_sel0);
const __m128i r_part1 = _mm_shuffle_epi8(chunk1, r_sel1);
const __m128i r_part2 = _mm_shuffle_epi8(chunk2, r_sel2);
__m128i dataR = _mm_or_si128(_mm_or_si128(r_part0, r_part1), r_part2);
网友评论