最近在项目开发过程中需要用到形态学中的腐蚀和膨胀算法,以获得图像中的边缘信息。原始的C代码对一帧320x180x1的图像做腐蚀和膨胀需要8ms左右,经过NEON指令加速后只需约0.5ms,大约是16倍的加速。
ps:由于项目的需要,我这边使用的是5x1的卷积核(宽5、高1),用来提取图像中的横条信息,所以下面的加速方法并不适用于3x3或5x5这类卷积核。同时为了方便加速,去除了算法开头的一些判断,所以加速前后的结果会有几个像素的差异。
1.腐蚀和膨胀原始C代码
#include "stdafx.h"
#include "cv.h"
#include "highgui.h"
#include "stdio.h"
#include "stdlib.h"
#include "vector"
/*
 * Erosion with a horizontal structuring element (scalar reference version).
 *
 * Each processed pixel (i, j) of dst receives the bitwise AND of the
 * strutWidth source pixels that start at (i, j) and extend to the right in
 * the same row.  Near the right edge the window is shortened so that it never
 * leaves the row; this keeps the last columns from picking up pixels of the
 * next row and keeps the last row from reading past the end of the buffer.
 *
 * src, dst      : single-channel 8-bit images of width * height bytes whose
 *                 row stride equals width.  Reads only look forward from the
 *                 pixel being written, so src and dst may be the same buffer.
 * width, height : image size in pixels.
 * strutWidth    : number of horizontal taps in the window.
 * structHeight  : only used for the size check and for the border margin
 *                 midY; no vertical neighbours are combined.
 *
 * Pixels closer than midY to any border are left untouched (midY is 0 for a
 * one-row structuring element).  Nothing is written when a structuring
 * element dimension is non-positive or larger than the image.
 */
void MorphErosion(unsigned char* src, unsigned char* dst, int width, int height, int strutWidth, int structHeight)
{
    if (strutWidth <= 0 || structHeight <= 0) return;
    /* Either dimension being too small is enough to bail out. */
    if (width < strutWidth || height < structHeight) return;

    int midY = (structHeight + 1) / 2 - 1;
    for (int i = midY; i < height - midY; i++)
    {
        const unsigned char *rowIn = src + i * width;
        unsigned char *rowOut = dst + i * width;
        for (int j = midY; j < width - midY; j++)
        {
            /* Clamp the window to the pixels that remain in this row. */
            int taps = width - j;
            if (taps > strutWidth) taps = strutWidth;

            unsigned char val = 255;
            for (int n = 0; n < taps; n++)
            {
                val &= rowIn[j + n];
            }
            rowOut[j] = val;
        }
    }
}
/*
 * Dilation with a horizontal structuring element (scalar reference version).
 *
 * Each processed pixel (i, j) of dst receives the bitwise OR of the
 * strutWidth source pixels that start at (i, j) and extend to the right in
 * the same row.  Near the right edge the window is shortened so that it never
 * leaves the row; this keeps the last columns from picking up pixels of the
 * next row and keeps the last row from reading past the end of the buffer.
 *
 * src, dst      : single-channel 8-bit images of width * height bytes whose
 *                 row stride equals width.  Reads only look forward from the
 *                 pixel being written, so src and dst may be the same buffer.
 * width, height : image size in pixels.
 * strutWidth    : number of horizontal taps in the window.
 * structHeight  : only used for the size check and for the border margin
 *                 midY; no vertical neighbours are combined.
 *
 * Pixels closer than midY to any border are left untouched (midY is 0 for a
 * one-row structuring element).  Nothing is written when a structuring
 * element dimension is non-positive or larger than the image.
 */
void MorphDilition(unsigned char* src, unsigned char* dst, int width, int height, int strutWidth, int structHeight)
{
    if (strutWidth <= 0 || structHeight <= 0) return;
    /* Either dimension being too small is enough to bail out. */
    if (width < strutWidth || height < structHeight) return;

    int midY = (structHeight + 1) / 2 - 1;
    for (int i = midY; i < height - midY; i++)
    {
        const unsigned char *rowIn = src + i * width;
        unsigned char *rowOut = dst + i * width;
        for (int j = midY; j < width - midY; j++)
        {
            /* Clamp the window to the pixels that remain in this row. */
            int taps = width - j;
            if (taps > strutWidth) taps = strutWidth;

            unsigned char val = 0;
            for (int n = 0; n < taps; n++)
            {
                val |= rowIn[j + n];
            }
            rowOut[j] = val;
        }
    }
}
/*
 * Morphological opening: erode src into tmp, then dilate tmp in place.
 *
 * src is only read; on return tmp holds the opened image.  All size
 * arguments are forwarded unchanged to both passes.
 */
void MorphOpen(unsigned char* src, unsigned char* tmp, int width, int height, int strutWidth, int structHeight)
{
    unsigned char *const work = tmp; /* receives the eroded image, then the final result */

    /* Pass 1: erosion, src -> work. */
    MorphErosion(src, work, width, height, strutWidth, structHeight);

    /* Pass 2: dilation, work -> work (same buffer as input and output). */
    MorphDilition(work, work, width, height, strutWidth, structHeight);
}
/*
 * Demo driver: load an image, binarize it, run the morphological opening
 * with a 5x1 structuring element and display the input and the result.
 *
 * Returns 0 on success; terminates with exit status 1 when the image cannot
 * be loaded.
 */
int main()
{
    IplImage *src, *dst;

    /* Flag 0 asks OpenCV for a single-channel grayscale image. */
    src = cvLoadImage("./sad_pic/6_R.jpg", 0);
    if (src == NULL)
    {
        printf("open image failed\n");
        exit(1);
    }

    /* Binarize in place with threshold 180 and maximum value 255. */
    cvThreshold(src, src, 180, 255, CV_THRESH_BINARY);

    /* dst starts as a copy of src, so pixels the filter skips keep their source value. */
    dst = cvCloneImage(src);

    /* widthStep (the row stride in bytes) is passed as the width so that
     * row addressing inside the filter matches the image memory layout. */
    MorphOpen((unsigned char*)src->imageData, (unsigned char*)dst->imageData, src->widthStep, src->height, 5, 1);

    cvShowImage("src", src);
    cvShowImage("dst", dst);
    cvWaitKey(0);

    cvReleaseImage(&src);
    cvReleaseImage(&dst);
    return 0;
}
2.腐蚀和膨胀Neon指令加速代码
具体思路:去除了卷积核的概念,直接对每一行中相邻的5个元素进行and或or操作,所以代码中设置了5个寄存器p0~p4,分别从相邻的起始地址(偏移0~4)各加载16个像素。
/*
 * Erosion with a fixed 5x1 horizontal window, NEON-accelerated.
 *
 * Each pixel (i, j) of dst receives the bitwise AND of the five source
 * pixels at columns j .. j+4 of the same row.  Near the right edge the
 * window is shortened so that it never leaves the row.
 *
 * Sixteen output pixels are produced per vector iteration from five
 * overlapping 16-byte loads (p0..p4, at offsets 0..4).  The vector loop only
 * runs while the furthest load stays inside the current row; the remaining
 * columns, including any width % 16 remainder, are finished by a scalar loop.
 * When the compiler does not define the NEON feature macros the scalar loop
 * handles the whole row.
 *
 * src, dst      : single-channel 8-bit images of width * height bytes whose
 *                 row stride equals width.  Reads only look forward from the
 *                 pixels being written, so src and dst may be the same buffer.
 * width, height : image size in pixels.
 * strutWidth, structHeight : only used for the size check; the window is
 *                 always 5 pixels wide and 1 pixel high.
 */
void MorphErosion(unsigned char* src, unsigned char* dst, int width, int height, int strutWidth, int structHeight)
{
    enum { LANES = 16, TAPS = 5 };

    /* Either dimension being too small is enough to bail out. */
    if (width < strutWidth || height < structHeight) return;

    for (int i = 0; i < height; i++)
    {
        const unsigned char *rowIn = src + i * width;
        unsigned char *rowOut = dst + i * width;
        int j = 0;

#if defined(__ARM_NEON) || defined(__ARM_NEON__)
        /* The last byte touched is rowIn[j + LANES + TAPS - 2]; keep it inside the row. */
        for (; j + LANES + TAPS - 1 <= width; j += LANES)
        {
            uint8x16_t p0_8x16 = vld1q_u8(rowIn + j);
            uint8x16_t p1_8x16 = vld1q_u8(rowIn + j + 1);
            uint8x16_t p2_8x16 = vld1q_u8(rowIn + j + 2);
            uint8x16_t p3_8x16 = vld1q_u8(rowIn + j + 3);
            uint8x16_t p4_8x16 = vld1q_u8(rowIn + j + 4);
            uint8x16_t val_8x16 = vandq_u8(vandq_u8(vandq_u8(vandq_u8(p0_8x16, p1_8x16), p2_8x16), p3_8x16), p4_8x16);
            vst1q_u8(rowOut + j, val_8x16);
        }
#endif

        /* Scalar tail: window clamped to the pixels that remain in this row. */
        for (; j < width; j++)
        {
            int taps = width - j;
            if (taps > TAPS) taps = TAPS;

            unsigned char val = 255;
            for (int n = 0; n < taps; n++)
            {
                val &= rowIn[j + n];
            }
            rowOut[j] = val;
        }
    }
}
/*
 * Dilation with a fixed 5x1 horizontal window, NEON-accelerated.
 *
 * Each pixel (i, j) of dst receives the bitwise OR of the five source
 * pixels at columns j .. j+4 of the same row.  Near the right edge the
 * window is shortened so that it never leaves the row.
 *
 * Sixteen output pixels are produced per vector iteration from five
 * overlapping 16-byte loads (p0..p4, at offsets 0..4).  The vector loop only
 * runs while the furthest load stays inside the current row; the remaining
 * columns, including any width % 16 remainder, are finished by a scalar loop.
 * When the compiler does not define the NEON feature macros the scalar loop
 * handles the whole row.
 *
 * src, dst      : single-channel 8-bit images of width * height bytes whose
 *                 row stride equals width.  Reads only look forward from the
 *                 pixels being written, so src and dst may be the same buffer.
 * width, height : image size in pixels.
 * strutWidth, structHeight : only used for the size check; the window is
 *                 always 5 pixels wide and 1 pixel high.
 */
void MorphDilition(unsigned char* src, unsigned char* dst, int width, int height, int strutWidth, int structHeight)
{
    enum { LANES = 16, TAPS = 5 };

    /* Either dimension being too small is enough to bail out. */
    if (width < strutWidth || height < structHeight) return;

    for (int i = 0; i < height; i++)
    {
        const unsigned char *rowIn = src + i * width;
        unsigned char *rowOut = dst + i * width;
        int j = 0;

#if defined(__ARM_NEON) || defined(__ARM_NEON__)
        /* The last byte touched is rowIn[j + LANES + TAPS - 2]; keep it inside the row. */
        for (; j + LANES + TAPS - 1 <= width; j += LANES)
        {
            uint8x16_t p0_8x16 = vld1q_u8(rowIn + j);
            uint8x16_t p1_8x16 = vld1q_u8(rowIn + j + 1);
            uint8x16_t p2_8x16 = vld1q_u8(rowIn + j + 2);
            uint8x16_t p3_8x16 = vld1q_u8(rowIn + j + 3);
            uint8x16_t p4_8x16 = vld1q_u8(rowIn + j + 4);
            uint8x16_t val_8x16 = vorrq_u8(vorrq_u8(vorrq_u8(vorrq_u8(p0_8x16, p1_8x16), p2_8x16), p3_8x16), p4_8x16);
            vst1q_u8(rowOut + j, val_8x16);
        }
#endif

        /* Scalar tail: window clamped to the pixels that remain in this row. */
        for (; j < width; j++)
        {
            int taps = width - j;
            if (taps > TAPS) taps = TAPS;

            unsigned char val = 0;
            for (int n = 0; n < taps; n++)
            {
                val |= rowIn[j + n];
            }
            rowOut[j] = val;
        }
    }
}
效果图
网友评论