OpenCL的HelloWorld：一维向量相加

作者: Parker2019 | 来源:发表于2020-02-29 16:44 被阅读0次

介绍

一般我们学习一门语言，首先都是从最简单的Hello World开始。但是对于OpenCL C和CUDA C，GPU是用来完成并行计算的好帮手，所以最简单的一维向量相加便成了这两种特殊语言的Hello World。

4-17更新，在更改了部分代码后成功运行

在Mac上失败的经历

众所周知，Apple已经在其最新的系统（包括Mac和iOS）中抛弃了OpenCL，继而转向Metal。

以下是苹果官方的介绍：
Metal 是 macOS、iOS 和 Apple TVOS 中内建的图形和计算技术。通过这项技术，主处理器 (CPU) 和图形处理器 (GPU) 可以更高效地协同工作，从而在游戏中提供更流畅的图形性能，并加快高性能媒体应用的运行速度。

在MacOS High Sierra中，OpenCL的版本是1.2。我最开始因为版本的原因，改动了/Library/Framework/OpenCL.Framework下的一些东西（关闭了rootless机制，强制使用root命令）。结果花了11个小时重装Mac……

后来因为不能更改，便开始做一维向量相加的测试。很遗憾的是每次到创建内核的时候都失败了。初步的想法还是和OpenCL.Framework中的东西有关（无奈不想再重装，所以只是放出结果，不再在Mac上做测试了，后续如果有时间会在其他平台上面做测试）。

错误的结果

断点调试中kernel

执行了创建kernel后
这里就发现了一个问题：之前kernel的值变为0x0000000000000000，然后执行内核赋值。
也就是说内核没有创建成功，最后的结果算出来都是0。

错误的结果

源码（内核源码和c++源码）

注意改./Vadd.cl为自己的内核源码路径。
Vadd.cl

// Element-wise vector addition: each work-item handles exactly one index.
//   A, B - input vectors in global memory (read only)
//   C    - output vector in global memory; C[i] receives A[i] + B[i]
// The index is the work-item's global ID in dimension 0, and the kernel does
// no bounds check on it.
__kernel void vecadd(__global const float* A,
                     __global const float* B,
                     __global float* C)
{
    const int gid = get_global_id(0);
    const float lhs = A[gid];
    const float rhs = B[gid];
    C[gid] = lhs + rhs;
}

cl_test.cpp

#include <time.h>
#include <unistd.h>

#include <cstdio>
#include <cstring>
#include <iostream>
#include <string>

#include "OpenCL/opencl.h"
#define ARRAY_SIZE 6    // Number of elements in every vector

// Forward declarations of the routines defined later in this file.
bool GetFileData(const char* fname,std::string& str);
void process_GPU();
void process_CPU();

// Input operands shared by the CPU and the GPU code paths.
float array_1[ARRAY_SIZE] = {
    1.0f, 2.0f, 3.1f, 4.2f, 5.5f, 7.9f
};
float array_2[ARRAY_SIZE] = {
    2.3f, 3.3f, 6.7f, 11.5f, 13.5f, 8.9f
};

// Output vectors. Their length is known up front, so they are plain
// statically sized arrays: one for the GPU result, one for the CPU result.
float array_result_gpu[ARRAY_SIZE];
float array_result[ARRAY_SIZE];

// Program entry point. Runs the host-side reference addition first and the
// OpenCL version second, so the two printed result lines can be compared.
// The command-line arguments are not used.
int main(int argc, const char** argv) {
    process_CPU();   // reference result, computed on the host
    process_GPU();   // same addition, executed through OpenCL

    return 0;
}

// Adds array_1 and array_2 element by element on the host, stores each sum in
// array_result and prints it, then prints the elapsed clock() time in seconds.
void process_CPU(){
    std::cout << "-----------Start CPU process------------\nResults:" << std::endl;

    const clock_t begin = clock();

    int idx = 0;
    while (idx < ARRAY_SIZE) {
        array_result[idx] = array_1[idx] + array_2[idx];
        std::cout << array_result[idx] << " ";
        ++idx;
    }

    // clock() counts processor time; dividing by CLOCKS_PER_SEC gives seconds.
    const clock_t elapsed_ticks = clock() - begin;
    const double seconds = static_cast<double>(elapsed_ticks) / CLOCKS_PER_SEC;

    std::cout << "\n------------End CPU process-------------\nTime comsume(s):"
              << seconds << std::endl;
}

void process_GPU(){
    std::cout<<"----------Start GPU process-------------"<<std::endl;

    // 查询平台
    cl_uint status;
    cl_platform_id platform_id;

    // 获取平台(Platform)对象
    status = clGetPlatformIDs(1,&platform_id,NULL);
    if(status != CL_SUCCESS){
        std::cout<<"ERROR:failed to find any platform."<<std::endl;
        return ;
    }

    // 获取设备(Device)信息
    cl_device_id devices;
    clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 2, &devices, NULL);
    
    // 创建上下文(Context)
    cl_context context;
    context = clCreateContext(NULL,1,&devices,NULL,NULL,NULL);

    // 创建命令队列(command queue)
    cl_command_queue queue;
    queue = clCreateCommandQueue(context,devices,CL_QUEUE_PROFILING_ENABLE,NULL);

    // 创建3个CL对象(cl memory object)，并都通过显示的方式拷贝到GPU内存：

    // 开始拷贝第一个数组
    cl_mem cl_array_1;
    cl_array_1 = clCreateBuffer(context,CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
                                ARRAY_SIZE*sizeof(cl_float),
                                (void *)array_1,NULL);
    // 第二个
    cl_mem cl_array_2;
    cl_array_2 = clCreateBuffer(context,CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
                                ARRAY_SIZE*sizeof(cl_float),
                                (void *)array_2,NULL);
    
    // 创建第三个
    cl_mem cl_array_result;
    cl_array_result = clCreateBuffer(context,CL_MEM_WRITE_ONLY,
                                ARRAY_SIZE*sizeof(cl_float),
                                (void *)array_result_gpu,NULL);

    // 上传到缓冲区(write buffer)
    clEnqueueWriteBuffer(queue,cl_array_1,1,0,
                        ARRAY_SIZE*sizeof(cl_float),
                        array_1,0,0,0);

    clEnqueueWriteBuffer(queue,cl_array_2,1,0,
                        ARRAY_SIZE*sizeof(cl_float),
                        array_2,0,0,0);

    // 创建程序对象(program)
    cl_program program;
    std::string code_file;
    if(GetFileData("./Vadd.cl",code_file) == false)return ;

    char* buf_code = new char[code_file.size()];
    strcpy(buf_code,code_file.c_str());
    buf_code[code_file.size()-1] = NULL;

    program = clCreateProgramWithSource(context,1,(const char**)&buf_code,NULL,NULL);

    // 构建程序(Build program)
    clBuildProgram(program,1,&devices,NULL,NULL,NULL);
    
    // 创建内核
    cl_kernel kernel;
    kernel = clCreateKernel(program,"vector_add",NULL);
    // 设置参数，开始执行内核(kernel)：
    clSetKernelArg(kernel,0,sizeof(cl_mem),&cl_array_1);
    clSetKernelArg(kernel,1,sizeof(cl_mem),&cl_array_2);
    clSetKernelArg(kernel,2,sizeof(cl_mem),&cl_array_result);

     size_t globalWorkSize[1];
     globalWorkSize[0] = ARRAY_SIZE;

    clEnqueueNDRangeKernel(queue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL);

    // 取回计算结果
    clEnqueueReadBuffer(queue, cl_array_result, CL_TRUE, 0, ARRAY_SIZE*sizeof(cl_float), 
                        array_result_gpu, 0, NULL, NULL);

    // 输出计算结果：
    std::cout<<"Results:"<<std::endl;
    for(int i = 0; i < ARRAY_SIZE;i++){
        std::cout<<array_result_gpu[i]<<" ";
    }
    std::cout<<"\n-----------END-----------"<<std::endl;
}

bool GetFileData(const char* fname,std::string& str){
    FILE* fp = fopen(fname,"r");
    if(fp==NULL){
        printf("ERROR:File opened failed.\n");
        return false;
    }
    while(feof(fp) == 0){
        str += fgetc(fp);
    }
    return true;
}

参考编译命令：（在Mac上可直接链接框架）
clang++ cl_test.cpp -o cl_test -framework OpenCL

个人的一点想法

苹果宣布其未来的系统不支持OpenCL，决定了OpenCL悲惨的命运，没有自家人支持的OpenCL路又会在何方？现在市场上成熟的就只剩下CUDA了，而使用CUDA需要NVIDIA的设备。那么其它GPU呢？不说AMD（有实力研制），很多嵌入式（包括绝大部分的安卓手机）设备都是通过OpenCL实现GPU加速的。一般设备现在还真的是难以用GPU做计算，也许在未来会出现更好的专门针对GPU的加速框架吧。

4.17日更新

// This program implements a vector addition using OpenCL
// System includes
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
// OpenCL includes
#include <CL/cl.h>

// OpenCL C source of the device kernel, embedded as a single string literal
// so that no separate .cl file has to be loaded at run time.
// The kernel, "vecadd", performs an element-wise add of two int arrays: each
// work-item takes its global ID in dimension 0 as the index idx and stores
// A[idx] + B[idx] into C[idx]. The kernel itself does no bounds check on idx.
const char* programSource =
"__kernel void vecadd(__global int *A,__global int *B,__global int *C)\n" 
"{\n"
"// Get the work-item unique ID \n"
"int idx = get_global_id(0);   \n"
"// Add the corresponding locations of \n"
"// 'A' and 'B', and store the result in 'C'.\n"
"   C[idx] = A[idx] + B[idx];        \n"                    
"}\n"
;
int main() {
  // This code executes on the OpenCL host
  
  // Host data
  int *A = NULL;  // Input array
  int *B = NULL;  // Input array
  int *C = NULL;  // Output array
  
  // Elements in each array
  const int elements = 2048;  
  
  // Compute the size of the data 
  size_t datasize = sizeof(int)*elements;

  // Allocate space for input/output data
  A = (int*)malloc(datasize);
  B = (int*)malloc(datasize);
  C = (int*)malloc(datasize);
  // Initialize the input data
  for(int i = 0; i < elements; i++) {
    A[i] = i;
    B[i] = i;
  }
    clock_t start =  clock();
  // Use this to check the output of each API call
  cl_int status;  
   
  //-----------------------------------------------------
  // STEP 1: Discover and initialize the platforms
  //-----------------------------------------------------
  
  cl_uint numPlatforms = 0;
  cl_platform_id *platforms = NULL;
  
  // Use clGetPlatformIDs() to retrieve the number of 
  // platforms
  status = clGetPlatformIDs(0, NULL, &numPlatforms);
 
  // Allocate enough space for each platform
  platforms =  
    (cl_platform_id*)malloc(
      numPlatforms*sizeof(cl_platform_id));
 
  // Fill in platforms with clGetPlatformIDs()
  status = clGetPlatformIDs(numPlatforms, platforms, 
        NULL);

  //-----------------------------------------------------
  // STEP 2: Discover and initialize the devices
  //----------------------------------------------------- 
  
  cl_uint numDevices = 0;
  cl_device_id *devices = NULL;

  // Use clGetDeviceIDs() to retrieve the number of 
  // devices present
  status = clGetDeviceIDs(
    platforms[0], 
    CL_DEVICE_TYPE_CPU, 
    0, 
    NULL, 
    &numDevices);

  // Allocate enough space for each device
  devices = 
    (cl_device_id*)malloc(
      numDevices*sizeof(cl_device_id));

  // Fill in devices with clGetDeviceIDs()
  status = clGetDeviceIDs(
    platforms[0], 
    CL_DEVICE_TYPE_CPU,    
    numDevices, 
    devices, 
    NULL);

  //-----------------------------------------------------
  // STEP 3: Create a context
  //----------------------------------------------------- 
  
  cl_context context = NULL;

  // Create a context using clCreateContext() and 
  // associate it with the devices
  context = clCreateContext(
    NULL, 
    numDevices, 
    devices, 
    NULL, 
    NULL, 
    &status);

  //-----------------------------------------------------
  // STEP 4: Create a command queue
  //----------------------------------------------------- 
  
  cl_command_queue cmdQueue;

  // Create a command queue using clCreateCommandQueue(),
  // and associate it with the device you want to execute 
  // on
  cmdQueue = clCreateCommandQueue(
    context, 
    devices[0], 
    0, 
    &status);

  //-----------------------------------------------------
  // STEP 5: Create device buffers
  //----------------------------------------------------- 
  
  cl_mem bufferA;  // Input array on the device
  cl_mem bufferB;  // Input array on the device
  cl_mem bufferC;  // Output array on the device

  // Use clCreateBuffer() to create a buffer object (d_A) 
  // that will contain the data from the host array A
  bufferA = clCreateBuffer(
    context, 
    CL_MEM_READ_ONLY,             
    datasize, 
    NULL, 
    &status);

  // Use clCreateBuffer() to create a buffer object (d_B)
  // that will contain the data from the host array B
  bufferB = clCreateBuffer(
    context, 
    CL_MEM_READ_ONLY,             
    datasize, 
    NULL, 
    &status);

  // Use clCreateBuffer() to create a buffer object (d_C) 
  // with enough space to hold the output data
  bufferC = clCreateBuffer(
    context, 
    CL_MEM_WRITE_ONLY,         
    datasize, 
    NULL, 
    &status);
  
  //-----------------------------------------------------
  // STEP 6: Write host data to device buffers
  //----------------------------------------------------- 
  
  // Use clEnqueueWriteBuffer() to write input array A to
  // the device buffer bufferA
  status = clEnqueueWriteBuffer(
    cmdQueue, 
    bufferA, 
    CL_FALSE, 
    0, 
    datasize,             
    A, 
    0, 
    NULL, 
    NULL);
  
  // Use clEnqueueWriteBuffer() to write input array B to 
  // the device buffer bufferB
  status = clEnqueueWriteBuffer(
    cmdQueue, 
    bufferB, 
    CL_FALSE, 
    0, 
    datasize,                  
    B, 
    0, 
    NULL, 
    NULL);

  //-----------------------------------------------------
  // STEP 7: Create and compile the program
  //----------------------------------------------------- 
   
  // Create a program using clCreateProgramWithSource()
  cl_program program = clCreateProgramWithSource(
    context, 
    1, 
    (const char**)&programSource,                 
    NULL, 
    &status);

  // Build (compile) the program for the devices with
  // clBuildProgram()
  status = clBuildProgram(
    program, 
    numDevices, 
    devices, 
    NULL, 
    NULL, 
    NULL);
 
  //-----------------------------------------------------
  // STEP 8: Create the kernel
  //----------------------------------------------------- 

  cl_kernel kernel = NULL;

  // Use clCreateKernel() to create a kernel from the 
  // vector addition function (named "vecadd")
  kernel = clCreateKernel(program, "vecadd", &status);

  //-----------------------------------------------------
  // STEP 9: Set the kernel arguments
  //----------------------------------------------------- 
  
  // Associate the input and output buffers with the 
  // kernel 
  // using clSetKernelArg()
  status  = clSetKernelArg(
    kernel, 
    0, 
    sizeof(cl_mem), 
    &bufferA);
  status |= clSetKernelArg(
    kernel, 
    1, 
    sizeof(cl_mem), 
    &bufferB);
  status |= clSetKernelArg(
    kernel, 
    2, 
    sizeof(cl_mem), 
    &bufferC);

  //-----------------------------------------------------
  // STEP 10: Configure the work-item structure
  //----------------------------------------------------- 
  
  // Define an index space (global work size) of work 
  // items for 
  // execution. A workgroup size (local work size) is not 
  // required, 
  // but can be used.
  size_t globalWorkSize[1];  
  // There are 'elements' work-items 
  globalWorkSize[0] = elements;

  //-----------------------------------------------------
  // STEP 11: Enqueue the kernel for execution
  //----------------------------------------------------- 
  
  // Execute the kernel by using 
  // clEnqueueNDRangeKernel().
  // 'globalWorkSize' is the 1D dimension of the 
  // work-items
  status = clEnqueueNDRangeKernel(
    cmdQueue, 
    kernel, 
    1, 
    NULL, 
    globalWorkSize, 
    NULL, 
    0, 
    NULL, 
    NULL);

  //-----------------------------------------------------
  // STEP 12: Read the output buffer back to the host
  //----------------------------------------------------- 
  
  // Use clEnqueueReadBuffer() to read the OpenCL output  
  // buffer (bufferC) 
  // to the host output array (C)
  clEnqueueReadBuffer(
    cmdQueue, 
    bufferC, 
    CL_TRUE, 
    0, 
    datasize, 
    C, 
    0, 
    NULL, 
    NULL);

  // Verify the output
  bool result = true;
  for(int i = 0; i < elements; i++) {
    if(C[i] != i+i) {
      result = false;
      break;
    }
  }
  if(result) {
    printf("Output is correct\n");
  } else {
    printf("Output is incorrect\n");
  }
        double time_consume = (double)((clock_t)clock() - start) / CLOCKS_PER_SEC;
        printf("Time consume(s): %f",time_consume);
  //-----------------------------------------------------
  // STEP 13: Release OpenCL resources
  //----------------------------------------------------- 
  
  // Free OpenCL resources
  clReleaseKernel(kernel);
  clReleaseProgram(program);
  clReleaseCommandQueue(cmdQueue);
  clReleaseMemObject(bufferA);
  clReleaseMemObject(bufferB);
  clReleaseMemObject(bufferC);
  clReleaseContext(context);

  // Free host resources
  free(A);
  free(B);
  free(C);
  free(platforms);
  free(devices);
}

可以更改 STEP 2 中两处 clGetDeviceIDs 调用里的 CL_DEVICE_TYPE_CPU（两处需要保持一致），查看不同设备的运行情况。可选设备：
CL_DEVICE_TYPE_ALL // 选取全部支持OpenCL的设备
CL_DEVICE_TYPE_CPU // 仅选取CPU
CL_DEVICE_TYPE_GPU // 仅选取GPU
输出运行结果：

// CL_DEVICE_TYPE_ALL
Output is correct
Time consume(s): 9.128000

// CL_DEVICE_TYPE_GPU
Output is correct
Time consume(s): 7.775000

// CL_DEVICE_TYPE_CPU
Output is correct
Time consume(s): 5.408000

运行结果图

失败的原因分析

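一般的情况下，Intel 7代以后的CPU都支持OpenCL（截止2020年最新版本是OpenCL 2.2），仔细对比了两次的代码，发现了还是在前面初始化设备的时候出现了问题。clGetPlatformIDs，clGetDeviceIDs两个函数以及最重要的两个malloc（分别位于 STEP 1 和 STEP 2），分别是给平台和设备分配足够的内存，第一次在Mac上操作的时候这些细节没有注意，导致内核总是创建不成功。另外需要注意：传给 clCreateKernel 的内核名必须与内核源码中的函数名（vecadd）完全一致，否则 clCreateKernel 同样会返回空的 kernel（错误码为 CL_INVALID_KERNEL_NAME）；建议对每个 OpenCL 调用都检查返回的错误码，这样可以直接定位失败的步骤。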

OpenCL的HelloWorld：一维向量相加

介绍

4-17更新，在更改了部分代码后成功运行

在Mac上失败的经历

错误的结果

源码（内核源码和c++源码）

个人的一点想法

4.17日更新

失败的原因分析

相关文章

网友评论

延伸阅读

深度阅读

栏目导航

热点阅读

OpenCL编程指南