Author | shaniadolphin |
---|---|
349948204@qq.com |
1、卷积原理
在CNN(Convolutional Neural Network-卷积神经网络)中,卷积是个必不可少的部件,该网络也因卷积得名。CNN的卷积层中,有许多滤波器,这些滤波器也称为卷积核,用于卷积运算,提取数据特征,这些滤波器的系数由训练得到。
在数字信号处理中,信号经过系统,相当于信号与系统的冲激响应函数 进行卷积,得到输出 ,其定义如下式:
上式中的符号“”表示卷积。
可以看出,信号经过系统,与冲击响应函数序列经过纵轴翻折过的序列对应相乘。
而在卷积神经网络中卷积操作的原理即卷积核与对应的点对应相乘,核心即是乘加。
卷积的方式比较多,假定滤波器的滑动步长stride(步长用表示),以及滤波器的填充方式(填充长度用表示),原始特征图(宽,高),滤波器的尺寸用表示,则输出特征图的尺寸为:
上式中为下取整。
上式左侧为原始数据与卷积核,右侧为卷积输出,其中卷积的步长stride为1,滤波器滑动方向先进行行处理,再进行列处理。
坐标的结果为12: ;
随后滤波器向右滑动1个单位继续计算。处理完行后,滤波器向下移动一个单位处理下一行。
以下是这个过程的动图:
如果要得到和原始数据维度一致的数据,就需要对数据进行填充后再卷积,示意图如下图所示:
以下是用python实现的卷积,使用大小为size的卷积核operator对sourceImage进行卷积(sourceImage为RGB三维数据)。
def convolution(sourceImage, operator, size = 3):#使用大小为size的operator对sourceImage进行卷积返回卷积结果
image = copy.deepcopy(sourceImage)
imageWidth = image.shape[0]
imageHeight = image.shape[1]
newImage = np.zeros([imageWidth - size,imageHeight - size, 3])
for width in range(imageWidth - size):
for height in range(imageHeight - size):
newImage[width,height,0] = np.sum(sourceImage[width:width + size,height:height + size,0] * operator)
newImage[width,height,1] = np.sum(sourceImage[width:width + size,height:height + size,1] * operator)
newImage[width,height,2] = np.sum(sourceImage[width:width + size,height:height + size,2] * operator)
return newImage.astype(np.uint8);
2、canny算子原理
Canny边缘检测算子是John F.Canny于 1986 年开发出来的一个多级边缘检测算法。更为重要的是 Canny 创立了边缘检测计算理论(Computational theory ofedge detection),解释了这项技术是如何工作的。Canny边缘检测算法以Canny的名字命名,被很多人推崇为当今最优的边缘检测的算法。
Canny 边缘检测的步骤
一般情况下,使用高斯平滑滤波器卷积降噪。 如下显示了一个 size = 5 的高斯内核:
高斯平滑滤波器卷积核可以通过以下Python函数求取:
def getGaussianMarix(size = 5, padding = 1):#获取大小为size内偏移为padding的高斯核
sigma = size / 3.0;
gaussian = np.zeros([size,size])
sum = 0
for x in range(size):
for y in range(size):
gaussian[x,y] = math.exp(-1/2 * (np.square(x-padding)/np.square(sigma) + (np.square(y-padding)/np.square(sigma))))
/ (2*math.pi*sigma*sigma)
sum = sum + gaussian[x,y];
gaussian = gaussian / sum
return gaussian
运用一对卷积核 (分别作用于 和 方向)进行卷积,如下的sobel算子加进行边缘加强:
使用下列公式计算梯度幅值和方向,将梯度方向近似到0, 45, 90, 135的三个可能角度之一。
进行非极大值抑制,因为图像梯度幅值矩阵中的元素值越大,图像中该点的梯度值越大,但这并不能说明该点就是边缘。在Canny算法中,非极大值抑制将寻找像素点局部最大值,将非极大值点所对应的灰度值置为0,以剔除掉一大部分非边缘的点。
滞后阈值运算,对于Canny函数的使用,推荐的高低阈值比在2:1到3:1之间。
1. 如果某一像素位置的幅值超过高阈值,该像素被保留为边缘像素。
2. 如果某一像素位置的幅值小于低阈值,该像素被排除。
3. 如果某一像素位置的幅值在两个阈值之间,该像素仅仅在连接到一个高于高阈值的像素时被保留。
Canny 边缘检测的Python实现
以下Python代码使用canny进行了边缘检测,使用了的高斯核滤波后将彩色图像转换为灰度图像,指定canny的卷积核,设定滞后阈值。
def canny(img):
#高斯模糊,降低噪声
blurred = cv.GaussianBlur(img,(5,5),0)
#灰度图像
gray=cv.cvtColor(blurred,cv.COLOR_RGB2GRAY)
#图像梯度
xgrad=cv.Sobel(gray,cv.CV_16SC1,1,0)
ygrad=cv.Sobel(gray,cv.CV_16SC1,0,1)
#计算边缘,参数推荐符合1:3或者1:2
edge_output=cv.Canny(xgrad,ygrad,50,150)
cv.imshow("edge",edge_output)
dst=cv.bitwise_and(img,img,mask=edge_output)
cv.imshow('cedge',dst)
return edge_output
3、OpenCL实现卷积
在OpenCL中将使用个工作项来完成卷积操作,工作项ID为,和取值分别为[0, W-1]和[0, H-1]。滤波器的坐标值为,和取值范围均为[0, 1, 2]。所以,滤波器坐标对应的存储索引值为,图像数据对应的存储索引值为,其中。
__kernel void Conv2D( __global int * image_in, //image input
__global int * filter_in, //filter input
int K, //filter kernel size
__global int * image_out) //feature map output
{
int W; //work group global size
int Wn; //padded image width
int x; //global id x
int y; //global id y
int ki, kj; //filter coordinate,(kj, ki)
int sum = 0; //multiply and sum of filter and data
W = get_global_size(0);
x = get_global_id(0);
y = get_global_id(1);
Wn = W + (K - 1);
for(ki=0; ki<K; ki++)
for(kj=0; kj<K; kj++)
sum = sum + filter_in[ki*K + kj] * image_in[Wn*(y+ki) + x + kj];
image_out[y*W + x] = sum;
}
C++主函数:
#include<stdio.h>
#include<stdlib.h>
#include<CL/cl.h>
#include<string.h>
#include <iostream>
#define MAX_SOURCE_SIZE (0x10000)
int main(void)
{
/*=================================================
define parameters
=================================================*/
cl_platform_id platform_id = NULL;
cl_uint ret_num_platforms;
cl_device_id device_id = NULL;
cl_uint ret_num_devices;
cl_context context = NULL;
cl_command_queue command_queue = NULL;
cl_mem data_in = NULL;
cl_mem data_out = NULL;
cl_mem filter_in = NULL;
cl_program program = NULL;
cl_kernel kernel = NULL;
size_t kernel_code_size;
char *kernel_str;
int *result;
cl_int ret;
FILE *fp;
cl_uint work_dim;
size_t global_item_size[2];
size_t local_item_size[2];
/*=================================================
define parameters for image, filter, kernels
=================================================*/
int const W = 100; //image width
int const H = 100; //image height
int const K = 3; //filter kernel size
int const Wn = (W + K - 1); //padded image width
int const Hn = (H + K - 1); //padded image height
int point_num = Wn * Hn;
int data_vecs[Wn*Hn];
int filter_coex[K*K] = { -1,0,1,-2,0,2,-1,0,1 }; //sobel filter: horizontal gradient
int filter_coey[K*K] = { -1,0,1,-2,0,2,-1,0,1 }; //sobel filter: horizontal gradient
int i, j;
for (i = 0; i < point_num; i++)
{
data_vecs[i] = rand() % 20;
}
//display input data
printf("\n");
printf("Array data_in:\n");
for (i = 0; i < Hn; i++) {
if(i<10)printf("row[%d]:\t", i);
for (j = 0; j < Wn; j++) {
if(j<10)printf("%d,\t", data_vecs[i*Wn + j]);
}
printf("\n");
}
printf("\n");
kernel_str = (char *)malloc(MAX_SOURCE_SIZE);
result = (int *)malloc(W*H * sizeof(int));
//遍历系统中所有OpenCL平台
ret = clGetPlatformIDs(0, 0, &ret_num_platforms);
printf("Num_of_platforms: %d\n", ret_num_platforms);
cl_platform_id* platforms = new cl_platform_id[ret_num_platforms];
ret = clGetPlatformIDs(ret_num_platforms, platforms, 0);
cl_uint selected_platform_index = ret_num_platforms;
for (cl_uint i = 0; i < ret_num_platforms; ++i)
{
size_t platform_name_length = 0;
size_t maxComputeUnits = 0;
ret = clGetPlatformInfo(platforms[i],CL_PLATFORM_NAME,0,0,&platform_name_length);
// 调用两次,第一次是得到名称的长度
char* platform_name = new char[platform_name_length];
ret = clGetPlatformInfo(platforms[i],CL_PLATFORM_NAME,platform_name_length,platform_name,0);
printf("%d:%s", i, platform_name);
ret = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
ret = clGetDeviceInfo(device_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &maxComputeUnits, NULL);
printf(" [PE=%d]", maxComputeUnits);
if (strstr(platform_name, "Intel") && selected_platform_index == ret_num_platforms)
{
printf(" [Selected]");
selected_platform_index = i;
}
printf("\n");
delete[] platform_name;
}
if (selected_platform_index == ret_num_platforms)
{
printf("Intel platforms NOT FOUND!!\n");
return 1;
}
cl_platform_id platform = platforms[selected_platform_index];
ret = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
fp = fopen("kernelfun.cl", "r");
kernel_code_size = fread(kernel_str, 1, MAX_SOURCE_SIZE, fp);
fclose(fp);
program = clCreateProgramWithSource(context, 1, (const char **)&kernel_str, (const size_t *)&kernel_code_size, &ret);
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
#if 0
//Shows the log
char* build_log;
size_t log_size;
//First call to know the proper size
clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
build_log = new char[log_size + 1];
// Second call to get the log
clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL);
build_log[log_size] = '\0';
printf("----------------\n");
printf("build_log:%s", build_log);
printf("----------------\n");
delete[] build_log;
#endif
kernel = clCreateKernel(program, "Conv2D", &ret);
data_in = clCreateBuffer(context, CL_MEM_READ_WRITE, Wn*Hn * sizeof(int), NULL, &ret);
data_out = clCreateBuffer(context, CL_MEM_READ_WRITE, W*H * sizeof(int), NULL, &ret);
filter_in = clCreateBuffer(context, CL_MEM_READ_WRITE, K*K * sizeof(int), NULL, &ret);
ret = clEnqueueWriteBuffer(command_queue, data_in, CL_TRUE, 0, Wn*Hn * sizeof(int), data_vecs, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, filter_in, CL_TRUE, 0, K*K * sizeof(int), filter_coex, 0, NULL, NULL);
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&data_in);
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&filter_in);
ret = clSetKernelArg(kernel, 2, sizeof(int), (void *)&K);
ret = clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&data_out);
work_dim = 2;
global_item_size[0] = { W };
global_item_size[1] = { H };
local_item_size[0] = { 1 };
local_item_size[1] = { 1 };
cl_event ev;
ret = clEnqueueNDRangeKernel(command_queue, kernel, work_dim, NULL,
global_item_size, local_item_size, 0, NULL, &ev);
clFinish(command_queue);
//计算kerenl执行时间
cl_ulong startTime, endTime;
clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_START,
sizeof(cl_ulong), &startTime, NULL);
clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_END,
sizeof(cl_ulong), &endTime, NULL);
cl_ulong kernelExecTimeNs = endTime - startTime;
printf("GPU_1 running:%8.6f ms\n", kernelExecTimeNs*1e-6);
ret = clEnqueueReadBuffer(command_queue, data_out, CL_TRUE, 0,W*H * sizeof(int), result, 0, NULL, NULL);
FILE *f_img_out = fopen("image_out.txt", "w+");
printf("Array data_out: \n");
for (i = 0; i < H; i++) {
if (i<10)printf("row[%d]:\t", i);
for (j = 0; j < W; j++) {
if(j<10)printf("%d,\t", result[i*W + j]);
fprintf(f_img_out, "%d,\t", result[i*W + j]);
}
printf("\n");
fprintf(f_img_out, "\n");
}
printf("\n");
fclose(f_img_out);
ret = clReleaseKernel(kernel);
ret = clReleaseProgram(program);
ret = clReleaseMemObject(data_in);
ret = clReleaseMemObject(data_out);
ret = clReleaseMemObject(filter_in);
ret = clReleaseCommandQueue(command_queue);
ret = clReleaseContext(context);
free(result);
free(kernel_str);
system("pause");
return 0;
}
参考文档
# | 链接地址 | 文档名称 |
---|---|---|
1 | https://blog.csdn.net/zhouxuanyuye/article/details/81143423 |
卷积神经网络中卷积的OpenCL实现 |
2 | https://www.cnblogs.com/Reyzal/p/7389993.html |
Intel核心显卡OpenCL环境搭建 |
3 | https://github.com/shaniadolphin/ |
github源码 |
网友评论