美文网首页OpenCL编程指南
OpenCL的HelloWorld:一维向量相加

OpenCL的HelloWorld:一维向量相加

作者: Parker2019 | 来源:发表于2020-02-29 16:44 被阅读0次

    介绍


    一般我们学习一门语言,首先都是从最简单的Hello World开始。但是对于OpenCL C和CUDA C来说,GPU是用来完成并行计算的好帮手,所以最简单的一维向量相加便成了这两种特殊语言的Hello World。

    4-17更新,在更改了部分代码后成功运行

    在Mac上失败的经历


    众所周知,Apple已经在其最新的系统(包括Mac和iOS)中抛弃了OpenCL,继而转向Metal。

    以下是苹果官方的介绍:
    Metal 是 macOS、iOS 和 Apple TVOS 中内建的图形和计算技术。通过这项技术,主处理器 (CPU) 和图形处理器 (GPU) 可以更高效地协同工作,从而在游戏中提供更流畅的图形性能,并加快高性能媒体应用的运行速度。

    在macOS High Sierra中,OpenCL的版本是1.2。我最开始因为版本的原因,改动了/System/Library/Frameworks/OpenCL.framework下的一些东西(关闭了rootless机制,强制使用root命令)。结果花了11个小时重装Mac……

    后来因为不能更改,便开始做一维向量相加的测试。很遗憾的是,每次到创建内核的时候都失败了。初步的想法还是和OpenCL.framework中的东西有关(无奈不想再重装,所以只是放出结果,不再在Mac上做测试了,后续如果有时间会在其他平台上面做测试)。

    错误的结果

    断点调试中kernel
    执行了创建kernel后
    这里就发现了一个问题:之前kernel的值变为0x0000000000000000,然后执行内核赋值。
    也就是说内核没有创建成功,最后的结果算出来都是0。
    错误的结果

    源码(内核源码和c++源码)

    注意改./Vadd.cl为自己的内核源码路径。
    Vadd.cl

    // Element-wise vector addition: the work-item with global ID i stores
    // A[i] + B[i] into C[i]. No bounds check is made, so the host is expected
    // to launch exactly one work-item per element.
    __kernel void vecadd(__global const float* A,
                         __global const float* B,
                         __global float* C)
    {
        const int gid = get_global_id(0);
        const float sum = A[gid] + B[gid];
        C[gid] = sum;
    }
    

    cl_test.cpp

    #include <time.h>
    #include <unistd.h>

    #include <cstdio>
    #include <cstring>
    #include <iostream>
    #include <string>

    #include "OpenCL/opencl.h"
    #define ARRAY_SIZE 6    // number of elements in each vector

    // Forward declarations of the routines defined further down.
    bool GetFileData(const char* fname, std::string& str);
    void process_GPU();
    void process_CPU();

    // Input vectors.
    float array_1[ARRAY_SIZE] = {
        1.0f, 2.0f, 3.1f, 4.2f, 5.5f, 7.9f
    };
    float array_2[ARRAY_SIZE] = {
        2.3f, 3.3f, 6.7f, 11.5f, 13.5f, 8.9f
    };

    // Output vectors; the result length is known up front, so they are sized statically.
    float array_result_gpu[ARRAY_SIZE];   // written by process_GPU()
    float array_result[ARRAY_SIZE];       // written by process_CPU()
    
    // Entry point: runs the CPU reference first, then the OpenCL version.
    int main(int argc, const char** argv)
    {
        // The command-line arguments are not used.
        (void)argc;
        (void)argv;

        process_CPU();
        process_GPU();

        return 0;
    }
    
    /**
     * CPU reference implementation: adds array_1 and array_2 element-wise into
     * array_result, prints the sums and then the CPU time spent on the addition.
     *
     * Only the arithmetic is timed. Printing happens after the clock is read, so
     * the reported time is not dominated by console I/O.
     */
    void process_CPU(){
        std::cout<<"-----------Start CPU process------------\nResults:"<<std::endl;

        // Timed region: the additions only.
        const clock_t start = clock();
        for(int i = 0;i < ARRAY_SIZE; i++){
            array_result[i] = array_1[i] + array_2[i];
        }
        const double time_consume = static_cast<double>(clock() - start) / CLOCKS_PER_SEC;

        // Output order is unchanged: all sums first, then the footer with the time.
        for(int i = 0;i < ARRAY_SIZE; i++){
            std::cout<<array_result[i]<<" ";
        }
        std::cout<<"\n------------End CPU process-------------\nTime consume(s):"
        <<time_consume<<std::endl;

    }
    
    void process_GPU(){
        std::cout<<"----------Start GPU process-------------"<<std::endl;
    
        // 查询平台
        cl_uint status;
        cl_platform_id platform_id;
    
        // 获取平台(Platform)对象
        status = clGetPlatformIDs(1,&platform_id,NULL);
        if(status != CL_SUCCESS){
            std::cout<<"ERROR:failed to find any platform."<<std::endl;
            return ;
        }
    
        // 获取设备(Device)信息
        cl_device_id devices;
        clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 2, &devices, NULL);
        
        // 创建上下文(Context)
        cl_context context;
        context = clCreateContext(NULL,1,&devices,NULL,NULL,NULL);
    
        // 创建命令队列(command queue)
        cl_command_queue queue;
        queue = clCreateCommandQueue(context,devices,CL_QUEUE_PROFILING_ENABLE,NULL);
    
        // 创建3个CL对象(cl memory object),并都通过显示的方式拷贝到GPU内存:
    
        // 开始拷贝第一个数组
        cl_mem cl_array_1;
        cl_array_1 = clCreateBuffer(context,CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
                                    ARRAY_SIZE*sizeof(cl_float),
                                    (void *)array_1,NULL);
        // 第二个
        cl_mem cl_array_2;
        cl_array_2 = clCreateBuffer(context,CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
                                    ARRAY_SIZE*sizeof(cl_float),
                                    (void *)array_2,NULL);
        
        // 创建第三个
        cl_mem cl_array_result;
        cl_array_result = clCreateBuffer(context,CL_MEM_WRITE_ONLY,
                                    ARRAY_SIZE*sizeof(cl_float),
                                    (void *)array_result_gpu,NULL);
    
        // 上传到缓冲区(write buffer)
        clEnqueueWriteBuffer(queue,cl_array_1,1,0,
                            ARRAY_SIZE*sizeof(cl_float),
                            array_1,0,0,0);
    
        clEnqueueWriteBuffer(queue,cl_array_2,1,0,
                            ARRAY_SIZE*sizeof(cl_float),
                            array_2,0,0,0);
    
        // 创建程序对象(program)
        cl_program program;
        std::string code_file;
        if(GetFileData("./Vadd.cl",code_file) == false)return ;
    
        char* buf_code = new char[code_file.size()];
        strcpy(buf_code,code_file.c_str());
        buf_code[code_file.size()-1] = NULL;
    
        program = clCreateProgramWithSource(context,1,(const char**)&buf_code,NULL,NULL);
    
        // 构建程序(Build program)
        clBuildProgram(program,1,&devices,NULL,NULL,NULL);
        
        // 创建内核
        cl_kernel kernel;
        kernel = clCreateKernel(program,"vector_add",NULL);
        // 设置参数,开始执行内核(kernel):
        clSetKernelArg(kernel,0,sizeof(cl_mem),&cl_array_1);
        clSetKernelArg(kernel,1,sizeof(cl_mem),&cl_array_2);
        clSetKernelArg(kernel,2,sizeof(cl_mem),&cl_array_result);
    
         size_t globalWorkSize[1];
         globalWorkSize[0] = ARRAY_SIZE;
    
        clEnqueueNDRangeKernel(queue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL);
    
        // 取回计算结果
        clEnqueueReadBuffer(queue, cl_array_result, CL_TRUE, 0, ARRAY_SIZE*sizeof(cl_float), 
                            array_result_gpu, 0, NULL, NULL);
    
        // 输出计算结果:
        std::cout<<"Results:"<<std::endl;
        for(int i = 0; i < ARRAY_SIZE;i++){
            std::cout<<array_result_gpu[i]<<" ";
        }
        std::cout<<"\n-----------END-----------"<<std::endl;
    }
    
    /**
     * Reads the text file `fname` and appends its entire contents to `str`.
     *
     * @param fname  path of the file to read
     * @param str    receives the file contents (appended; existing text is kept)
     * @return true when the file was opened and read to its end; false when it
     *         could not be opened (an error line is printed) or a read error
     *         occurred
     */
    bool GetFileData(const char* fname,std::string& str){
        FILE* fp = (fname != NULL) ? fopen(fname,"r") : NULL;
        if(fp==NULL){
            printf("ERROR:File opened failed.\n");
            return false;
        }
        // fgetc() returns an int so that EOF can be told apart from a real byte.
        // Test the value that was just read; checking feof() *before* the read
        // would append the EOF marker itself as a bogus trailing character.
        int ch;
        while((ch = fgetc(fp)) != EOF){
            str += static_cast<char>(ch);
        }
        const bool read_ok = (ferror(fp) == 0);
        fclose(fp);   // the stream was previously leaked
        return read_ok;
    }
    

    参考编译命令:(在Mac上可直接链接框架)
    clang++ cl_test.cpp -o cl_test -framework OpenCL

    个人的一点想法


    苹果宣布其未来的系统不支持OpenCL,决定了OpenCL悲惨的命运,没有自家人支持的OpenCL路又会在何方。现在市场上成熟的就只剩下CUDA了,而使用CUDA需要NVIDIA的设备。那么其它GPU呢?不说AMD(有实力研制),很多嵌入式(包括绝大部分的安卓手机)设备都是通过OpenCL实现GPU加速的。一般设备现在还真的是难以用GPU做计算,也许在未来会出现更好的专门针对GPU的加速框架吧。

    4.17日更新

    // This program implements a vector addition using OpenCL
    // System includes
    #include <stdio.h>
    #include <stdlib.h>
    #include <time.h>
    // OpenCL includes
    #include <CL/cl.h>
    
    // OpenCL C source of the vector-addition kernel, kept as a string literal and
    // passed to clCreateProgramWithSource() at run time.
    // Kernel "vecadd": each work-item takes its global ID idx and stores
    // A[idx] + B[idx] into C[idx] (element-wise add of two int arrays).
    const char* programSource =
    "__kernel void vecadd(__global int *A,__global int *B,__global int *C)\n" 
    "{\n"
    "// Get the work-item unique ID \n"
    "int idx = get_global_id(0);   \n"
    "// Add the corresponding locations of \n"
    "// 'A' and 'B', and store the result in 'C'.\n"
    "   C[idx] = A[idx] + B[idx];        \n"                    
    "}\n"
    ;
    int main() {
      // This code executes on the OpenCL host
      
      // Host data
      int *A = NULL;  // Input array
      int *B = NULL;  // Input array
      int *C = NULL;  // Output array
      
      // Elements in each array
      const int elements = 2048;  
      
      // Compute the size of the data 
      size_t datasize = sizeof(int)*elements;
    
      // Allocate space for input/output data
      A = (int*)malloc(datasize);
      B = (int*)malloc(datasize);
      C = (int*)malloc(datasize);
      // Initialize the input data
      for(int i = 0; i < elements; i++) {
        A[i] = i;
        B[i] = i;
      }
        clock_t start =  clock();
      // Use this to check the output of each API call
      cl_int status;  
       
      //-----------------------------------------------------
      // STEP 1: Discover and initialize the platforms
      //-----------------------------------------------------
      
      cl_uint numPlatforms = 0;
      cl_platform_id *platforms = NULL;
      
      // Use clGetPlatformIDs() to retrieve the number of 
      // platforms
      status = clGetPlatformIDs(0, NULL, &numPlatforms);
     
      // Allocate enough space for each platform
      platforms =  
        (cl_platform_id*)malloc(
          numPlatforms*sizeof(cl_platform_id));
     
      // Fill in platforms with clGetPlatformIDs()
      status = clGetPlatformIDs(numPlatforms, platforms, 
            NULL);
    
      //-----------------------------------------------------
      // STEP 2: Discover and initialize the devices
      //----------------------------------------------------- 
      
      cl_uint numDevices = 0;
      cl_device_id *devices = NULL;
    
      // Use clGetDeviceIDs() to retrieve the number of 
      // devices present
      status = clGetDeviceIDs(
        platforms[0], 
        CL_DEVICE_TYPE_CPU, 
        0, 
        NULL, 
        &numDevices);
    
      // Allocate enough space for each device
      devices = 
        (cl_device_id*)malloc(
          numDevices*sizeof(cl_device_id));
    
      // Fill in devices with clGetDeviceIDs()
      status = clGetDeviceIDs(
        platforms[0], 
        CL_DEVICE_TYPE_CPU,    
        numDevices, 
        devices, 
        NULL);
    
      //-----------------------------------------------------
      // STEP 3: Create a context
      //----------------------------------------------------- 
      
      cl_context context = NULL;
    
      // Create a context using clCreateContext() and 
      // associate it with the devices
      context = clCreateContext(
        NULL, 
        numDevices, 
        devices, 
        NULL, 
        NULL, 
        &status);
    
      //-----------------------------------------------------
      // STEP 4: Create a command queue
      //----------------------------------------------------- 
      
      cl_command_queue cmdQueue;
    
      // Create a command queue using clCreateCommandQueue(),
      // and associate it with the device you want to execute 
      // on
      cmdQueue = clCreateCommandQueue(
        context, 
        devices[0], 
        0, 
        &status);
    
      //-----------------------------------------------------
      // STEP 5: Create device buffers
      //----------------------------------------------------- 
      
      cl_mem bufferA;  // Input array on the device
      cl_mem bufferB;  // Input array on the device
      cl_mem bufferC;  // Output array on the device
    
      // Use clCreateBuffer() to create a buffer object (d_A) 
      // that will contain the data from the host array A
      bufferA = clCreateBuffer(
        context, 
        CL_MEM_READ_ONLY,             
        datasize, 
        NULL, 
        &status);
    
      // Use clCreateBuffer() to create a buffer object (d_B)
      // that will contain the data from the host array B
      bufferB = clCreateBuffer(
        context, 
        CL_MEM_READ_ONLY,             
        datasize, 
        NULL, 
        &status);
    
      // Use clCreateBuffer() to create a buffer object (d_C) 
      // with enough space to hold the output data
      bufferC = clCreateBuffer(
        context, 
        CL_MEM_WRITE_ONLY,         
        datasize, 
        NULL, 
        &status);
      
      //-----------------------------------------------------
      // STEP 6: Write host data to device buffers
      //----------------------------------------------------- 
      
      // Use clEnqueueWriteBuffer() to write input array A to
      // the device buffer bufferA
      status = clEnqueueWriteBuffer(
        cmdQueue, 
        bufferA, 
        CL_FALSE, 
        0, 
        datasize,             
        A, 
        0, 
        NULL, 
        NULL);
      
      // Use clEnqueueWriteBuffer() to write input array B to 
      // the device buffer bufferB
      status = clEnqueueWriteBuffer(
        cmdQueue, 
        bufferB, 
        CL_FALSE, 
        0, 
        datasize,                  
        B, 
        0, 
        NULL, 
        NULL);
    
      //-----------------------------------------------------
      // STEP 7: Create and compile the program
      //----------------------------------------------------- 
       
      // Create a program using clCreateProgramWithSource()
      cl_program program = clCreateProgramWithSource(
        context, 
        1, 
        (const char**)&programSource,                 
        NULL, 
        &status);
    
      // Build (compile) the program for the devices with
      // clBuildProgram()
      status = clBuildProgram(
        program, 
        numDevices, 
        devices, 
        NULL, 
        NULL, 
        NULL);
     
      //-----------------------------------------------------
      // STEP 8: Create the kernel
      //----------------------------------------------------- 
    
      cl_kernel kernel = NULL;
    
      // Use clCreateKernel() to create a kernel from the 
      // vector addition function (named "vecadd")
      kernel = clCreateKernel(program, "vecadd", &status);
    
      //-----------------------------------------------------
      // STEP 9: Set the kernel arguments
      //----------------------------------------------------- 
      
      // Associate the input and output buffers with the 
      // kernel 
      // using clSetKernelArg()
      status  = clSetKernelArg(
        kernel, 
        0, 
        sizeof(cl_mem), 
        &bufferA);
      status |= clSetKernelArg(
        kernel, 
        1, 
        sizeof(cl_mem), 
        &bufferB);
      status |= clSetKernelArg(
        kernel, 
        2, 
        sizeof(cl_mem), 
        &bufferC);
    
      //-----------------------------------------------------
      // STEP 10: Configure the work-item structure
      //----------------------------------------------------- 
      
      // Define an index space (global work size) of work 
      // items for 
      // execution. A workgroup size (local work size) is not 
      // required, 
      // but can be used.
      size_t globalWorkSize[1];  
      // There are 'elements' work-items 
      globalWorkSize[0] = elements;
    
      //-----------------------------------------------------
      // STEP 11: Enqueue the kernel for execution
      //----------------------------------------------------- 
      
      // Execute the kernel by using 
      // clEnqueueNDRangeKernel().
      // 'globalWorkSize' is the 1D dimension of the 
      // work-items
      status = clEnqueueNDRangeKernel(
        cmdQueue, 
        kernel, 
        1, 
        NULL, 
        globalWorkSize, 
        NULL, 
        0, 
        NULL, 
        NULL);
    
      //-----------------------------------------------------
      // STEP 12: Read the output buffer back to the host
      //----------------------------------------------------- 
      
      // Use clEnqueueReadBuffer() to read the OpenCL output  
      // buffer (bufferC) 
      // to the host output array (C)
      clEnqueueReadBuffer(
        cmdQueue, 
        bufferC, 
        CL_TRUE, 
        0, 
        datasize, 
        C, 
        0, 
        NULL, 
        NULL);
    
      // Verify the output
      bool result = true;
      for(int i = 0; i < elements; i++) {
        if(C[i] != i+i) {
          result = false;
          break;
        }
      }
      if(result) {
        printf("Output is correct\n");
      } else {
        printf("Output is incorrect\n");
      }
            double time_consume = (double)((clock_t)clock() - start) / CLOCKS_PER_SEC;
            printf("Time consume(s): %f",time_consume);
      //-----------------------------------------------------
      // STEP 13: Release OpenCL resources
      //----------------------------------------------------- 
      
      // Free OpenCL resources
      clReleaseKernel(kernel);
      clReleaseProgram(program);
      clReleaseCommandQueue(cmdQueue);
      clReleaseMemObject(bufferA);
      clReleaseMemObject(bufferB);
      clReleaseMemObject(bufferC);
      clReleaseContext(context);
    
      // Free host resources
      free(A);
      free(B);
      free(C);
      free(platforms);
      free(devices);
    }
    

    可以更改代码STEP 2中传给clGetDeviceIDs的设备类型CL_DEVICE_TYPE_CPU,查看不同设备的运行情况。可选设备:
    CL_DEVICE_TYPE_ALL // 选取全部支持OpenCL的设备
    CL_DEVICE_TYPE_CPU // 仅选取CPU
    CL_DEVICE_TYPE_GPU // 仅选取GPU
    输出运行结果:

    // CL_DEVICE_TYPE_ALL
    Output is correct
    Time consume(s): 9.128000
    
    // CL_DEVICE_TYPE_GPU
    Output is correct
    Time consume(s): 7.775000
    
    // CL_DEVICE_TYPE_CPU
    Output is correct
    Time consume(s): 5.408000
    
    运行结果图

    失败的原因分析

    一般的情况下,Intel 7代以后的CPU都支持OpenCL(截至2020年最新版本是OpenCL 2.2)。仔细对比了两次的代码,发现还是在前面初始化设备的时候出现了问题:clGetPlatformIDs、clGetDeviceIDs两个函数,以及最重要的两个malloc(分别给平台platforms和设备devices分配足够的内存)。第一次在Mac上操作的时候没有注意这些细节,导致内核总是创建不成功。

    相关文章

      网友评论

        本文标题:OpenCL的HelloWorld:一维向量相加

        本文链接:https://www.haomeiwen.com/subject/wlzphhtx.html