美文网首页
在RK3399上使用openCL和numpy对比

在RK3399上使用openCL和numpy对比

作者: shaniadolphin | 来源:发表于2019-02-15 21:49 被阅读0次

    点乘的python实现

    python里的点乘非常简单,通过numpy自带的函数numpy.dot即可实现,以下是实现的代码,通过给数组pInMatA和pInMatB赋特定值(为了比较不同算法下的值是否一致,没有使用随机值),再调用np.dot(A, B)即可对矩阵A和B进行点乘。

    #encoding: utf-8
    import numpy as np
    import os
    import gc
    import datetime
    import time
    
    # Matrix dimensions used by testopencl.matrixmul: the first operand is
    # reshaped to (M, P) and the second to (P, N) before np.dot is applied.
    M = 256
    P = 256
    N = 256
    
    class testopencl(object):
        """Benchmark helper that times a NumPy matrix product on fixed data."""

        @staticmethod
        def matrixmul():
            """Build two deterministic matrices, multiply them and print timing.

            The operands are filled with a repeating 20-value pattern divided
            by 3.3 (fixed data rather than random data, so that results can be
            compared across implementations).  The first operand has shape
            (M, P) and the second (P, N), where M, P and N are the module-level
            dimensions.

            Prints the first nine values of row 0 of the product, followed by
            the elapsed time of the ``np.dot`` call in seconds.  Returns None.
            """
            # The fill pattern has a period of 20 values.
            seed_a = [37, 50, 54, 50, 56, 0, 43, 43, 74, 71,
                      32, 36, 16, 43, 56, 100, 50, 25, 15, 17]
            seed_b = [35, 51, 54, 58, 55, 32, 36, 69, 27, 39,
                      35, 40, 16, 44, 55, 14, 58, 75, 18, 15]

            # Divide in double precision and round to float32 afterwards, which
            # matches assigning a Python float into a float32 array element.
            pattern_a = (np.array(seed_a, dtype=np.float64) / 3.3).astype(np.float32)
            pattern_b = (np.array(seed_b, dtype=np.float64) / 3.3).astype(np.float32)

            # Each operand is sized by its own element count, so the fill is
            # also correct when M, P and N differ.
            pInMatA = pattern_a[np.arange(M * P) % len(seed_a)]
            pInMatB = pattern_b[np.arange(P * N) % len(seed_b)]

            A = np.reshape(pInMatA, (M, P))
            B = np.reshape(pInMatB, (P, N))

            # perf_counter is monotonic and high resolution, which suits
            # measuring a short interval.
            start = time.perf_counter()
            data2 = np.dot(A, B)
            t2 = time.perf_counter() - start

            print(data2[0][0:9])
            print(t2)
            gc.collect()
            
    # Script entry point: run the benchmark once. The closing banner is in a
    # finally clause, so it is printed even when matrixmul raises.
    if __name__ == "__main__":
        try:
            testopencl.matrixmul()
        finally:
            print('test end!!')
    

    运行结果如下,可以看到前9个数据的结果:

    python3 testpy.py
    [40661.8   49127.918 39687.023 42113.31  37694.105 48052.055 41653.062 40994.29  38949.03 ]
    0.009578227996826172
    test end!!
    

    openCL程序

    新建一个OpenCLMulMatrix.cl的文件,将以下内容复制到文件中:

    // Matrix product C = A * B with one work-item per output element.
    // The indexing treats A as M x P, B as P x N and C as M x N, all row-major.
    // NDRange dimension 0 selects the column (x), dimension 1 the row (y).
    __kernel void RunAsGpu_2(
        __global const float *A,
        __global const float *B,
        int M,
        int N,
        int P,
        __global float *C)
    {
        int x = get_global_id(0);
        int y = get_global_id(1);
        // Work-items that fall outside the output matrix (for example when
        // the host rounds the global size up) must not read or write memory.
        if (x >= N || y >= M)
        {
            return;
        }
        float sum = 0.0f;
        for (int i = 0; i < P; i++)
        {
            // Row y of A times column x of B.
            sum += A[y * P + i] * B[i * N + x];
        }
        C[y * N + x] = sum;
    }
    // Element-wise integer addition: each work-item writes c[i] = a[i] + b[i]
    // for the element selected by its global id in dimension 0.
    __kernel void VectorAdd(
        __global int* c, 
        __global int* a,
        __global int* b)
    {
        // Position of this work-item, used as the element index
        unsigned int idx = get_global_id(0);
        int lhs = a[idx];
        int rhs = b[idx];
        c[idx] = lhs + rhs;
    }
    

    这个程序中包括两个内核功能:RunAsGpu_2和VectorAdd

    主机程序

    通过ssh登陆到RK3399的主机上,挂载共享文件夹:

    sudo mount -o username=guest,password="" //192.168.199.1/vr /samba
    cd /samba/testocl
    

    新建cl_mulmatrix.c文件,复制以下代码到文件中:

    //************************************************************
    // Demo OpenCL application: multiplies two matrices on the GPU
    // and times the same product computed on the CPU
    // ************************************************************
    #include <stdio.h>
    #include <stdlib.h>
    #include <time.h>
    #include <CL/cl.h>
    
    // Matrix dimensions: RunAsCpu iterates i < M, j < N, k < P, reading its
    // first input with row stride P and writing the output with row stride N.
    #define M 256
    #define P 256
    #define N 256
    
    // Inline OpenCL source for a one-dimensional element-wise kernel, stored
    // as 7 string fragments.
    // NOTE(review): the kernel is named VectorAdd and its embedded comment
    // says "Sum", but the body multiplies (a[n] * b[n]) - confirm which is
    // intended. The only visible reference to this array is a commented-out
    // clCreateProgramWithSource call in main().
    const char* OpenCLSource[] = {
    "__kernel void VectorAdd(__global float* c, __global float* a,__global float* b)",
    "{",
    " // Index of the elements to add \n",
    " unsigned int n = get_global_id(0);",
    " // Sum the nth element of vectors a and b and store in c \n",
    " c[n] = a[n] * b[n];",
    "}"
    };
    
    /*
     * Reads the whole of "OpenCLMulMatrix.cl" into a newly allocated,
     * NUL-terminated buffer and prints its size in bytes.
     *
     * Returns the buffer on success; the caller owns it and releases it with
     * free(). Returns NULL, after writing a diagnostic to stderr, when the
     * file cannot be opened, measured or read completely, or when the
     * allocation fails.
     */
    char *openclfile(void)
    {
        const char *path = "OpenCLMulMatrix.cl";
        char *source = NULL;
        long file_len = -1;

        FILE *fp = fopen(path, "rb");
        if (fp == NULL)
        {
            perror(path);
            return NULL;
        }

        // Measure the file by seeking to its end, then rewind for reading.
        if (fseek(fp, 0, SEEK_END) == 0)
        {
            file_len = ftell(fp);
        }
        if (file_len < 0 || fseek(fp, 0, SEEK_SET) != 0)
        {
            fprintf(stderr, "%s: cannot determine file size\n", path);
            fclose(fp);
            return NULL;
        }

        size_t src_size = (size_t)file_len;
        printf("src_size = %zu\n", src_size);

        // One extra byte for the terminating NUL.
        source = (char *)malloc(src_size + 1);
        if (source == NULL)
        {
            fprintf(stderr, "%s: out of memory\n", path);
            fclose(fp);
            return NULL;
        }

        if (fread(source, sizeof(char), src_size, fp) != src_size)
        {
            fprintf(stderr, "%s: short read\n", path);
            free(source);
            fclose(fp);
            return NULL;
        }
        source[src_size] = '\0';
        fclose(fp);

        // The buffer is handed to the caller, so it must not be freed here.
        return source;
    }
    
    /*
     * Reference CPU matrix product c = a * b.
     *
     * a is read as an M x P row-major matrix, b as P x N, and c is written as
     * M x N; every element of c is overwritten.
     */
    void RunAsCpu(
        const float *a,
        const float *b,
        float* c)
    {
        for (int i = 0; i < M; i++)
        {
            for (int j = 0; j < N; j++)
            {
                // Accumulate row i of a times column j of b.
                float sum = 0.0f;
                for (int k = 0; k < P; k++)
                {
                    sum += a[i*P + k] * b[k*N + j];
                }
                c[i*N + j] = sum;
            }
        }
    }
    
    //计时函数
    cl_ulong time_stamp()
    {
        clock_t ticks;
        cl_ulong timet;
        ticks = clock();
        
        timet = ticks * 1000 / CLOCKS_PER_SEC;
        //printf("ticks = %ld;timet= %ld;clocks= %ld\n", ticks, timet, CLOCKS_PER_SEC);
        return timet;
    }
    
    // Seed values for the input matrices. main() indexes both tables with
    // c%20, so only the first 20 entries of each table are ever read.
    int InitialData1[80] = {37,50,54,50,56,0,43,43,74,71,32,36,16,43,56,100,50,25,15,17,37,50,54,50,56,0,43,43,74,71,32,36,16,43,56,100,50,25,15,17,37,50,54,50,56,0,43,43,74,71,32,36,16,43,56,100,50,25,15,17,37,50,54,50,56,0,43,43,74,71,32,36,16,43,56,100,50,25,15,17};
    int InitialData2[80] = {35,51,54,58,55,32,36,69,27,39,35,40,16,44,55,14,58,75,18,15,35,51,54,58,55,32,36,69,27,39,35,40,16,44,55,14,58,75,18,15,35,51,54,58,55,32,36,69,27,39,35,40,16,44,55,14,58,75,18,15,35,51,54,58,55,32,36,69,27,39,35,40,16,44,55,14,58,75,18,15};
    // Number of elements in the M x N output matrix
    #define SIZE (M*N)
    // Main function
    // ************************************************************
    int main(int argc, char **argv)
    {   
        // Two integer source vectors in Host memory
        float HostVector1[SIZE], HostVector2[SIZE];
        //Output Vector
        float HostOutputVector[SIZE];
        double total_time;
        char cBuffer[1024];
        cl_ulong start = 0, end = 0;
        const char* sourcefile[1];
        //读取OpenCLSum.cl文件内容
        FILE* fp = fopen("OpenCLMulMatrix.cl", "rb");   
        // Initialize with some interesting repeating data
        for(int c = 0; c < SIZE; c++)
        {
            HostVector1[c] = (float)InitialData1[c%20] / 3.3f;
            HostVector2[c] = (float)InitialData2[c%20] / 3.3f;
            HostOutputVector[c] = 0.0f;
        }
        //Get an OpenCL platform
        cl_platform_id cpPlatform;
        clGetPlatformIDs(1, &cpPlatform, NULL);
        // Get a GPU device
        cl_device_id cdDevice;
        clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &cdDevice, NULL);
    
        clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(cBuffer), &cBuffer, NULL);
        printf("CL_DEVICE_NAME: %s\n", cBuffer);
        clGetDeviceInfo(cdDevice, CL_DRIVER_VERSION, sizeof(cBuffer), &cBuffer, NULL);
        printf("CL_DRIVER_VERSION: %s\n\n", cBuffer);
        // Create a context to run OpenCL enabled GPU
        cl_context GPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, NULL, NULL, NULL);     
        // Create a command-queue on the GPU device
        cl_command_queue cqCommandQueue = clCreateCommandQueue(GPUContext, cdDevice, CL_QUEUE_PROFILING_ENABLE, NULL);
        // Allocate GPU memory for source vectors AND initialize from CPU memory
    
        // Create OpenCL program with source code
        //
    #if 0   
        fseek(fp, 0, SEEK_END);
        size_t src_size = ftell(fp);
        //source = (char *)malloc(src_size);
        printf("src_size = %ld\n", src_size);
        fseek(fp, 0, SEEK_SET);
        fread((void*)sourcefile, sizeof(char), src_size, fp);
        for(int i=0;i<100;i++)
            printf("%c",sourcefile[i]);
        printf("\n");
        fclose(fp);
        cl_program OpenCLProgram = clCreateProgramWithSource(GPUContext, 7, &sourcefile, NULL, NULL);
        //free(&source);
    #else
        sourcefile[0] = openclfile();
        //for(int i=0;i<1000;i++)
        //  printf("%c",sourcefile[0][i]);
        //printf("\n");
        cl_program OpenCLProgram = clCreateProgramWithSource(GPUContext, 1, sourcefile, NULL, NULL);
        //cl_program OpenCLProgram = clCreateProgramWithSource(GPUContext, 7, OpenCLSource, NULL, NULL);
    #endif
        // Build the program (OpenCL JIT compilation)
        clBuildProgram(OpenCLProgram, 0, NULL, NULL, NULL, NULL);
        
    #if 1
        //Shows the log
        char* build_log;
        size_t log_size;
        //First call to know the proper size
        clGetProgramBuildInfo(OpenCLProgram, cdDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
        build_log = (char*)malloc(log_size + 1);
        // Second call to get the log
        clGetProgramBuildInfo(OpenCLProgram, cdDevice, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL);
        build_log[log_size] = '\0';
        printf("build_log %ld:%s", log_size,build_log);
        printf("\n");
        free(build_log);
    #endif
        cl_mem GPUVector1 = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float) * SIZE, HostVector1, NULL);
        cl_mem GPUVector2 = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float) * SIZE, HostVector2, NULL);
        // Allocate output memory on GPU
        cl_mem GPUOutputVector = clCreateBuffer(GPUContext, CL_MEM_WRITE_ONLY, sizeof(float) * SIZE, NULL, NULL);
    #if 0
        // Create a handle to the compiled OpenCL function (Kernel)
        cl_kernel OpenCLVectorAdd = clCreateKernel(OpenCLProgram, "VectorAdd", NULL);
        // In the next step we associate the GPU memory with the Kernel arguments
        clSetKernelArg(OpenCLVectorAdd, 0, sizeof(cl_mem), (void*)&GPUOutputVector);
        clSetKernelArg(OpenCLVectorAdd, 1, sizeof(cl_mem), (void*)&GPUVector1);
        clSetKernelArg(OpenCLVectorAdd, 2, sizeof(cl_mem), (void*)&GPUVector2);
    
        //create event
        cl_event event = clCreateUserEvent(GPUContext, NULL);
    
        // Launch the Kernel on the GPU
        // This kernel only uses global data
        size_t WorkSize[1] = {SIZE}; // one dimensional Range
        clEnqueueNDRangeKernel(cqCommandQueue, OpenCLVectorAdd, 1, NULL, WorkSize, NULL, 0, NULL, &event);
        // Copy the output in GPU memory back to CPU memory
        clEnqueueReadBuffer(cqCommandQueue, GPUOutputVector, CL_TRUE, 0, SIZE * sizeof(float), HostOutputVector, 0, NULL, NULL);
        // Cleanup
        clReleaseKernel(OpenCLVectorAdd);
        clReleaseProgram(OpenCLProgram);
        clReleaseCommandQueue(cqCommandQueue);
        clReleaseContext(GPUContext);
        clReleaseMemObject(GPUVector1);
        clReleaseMemObject(GPUVector2);
        clReleaseMemObject(GPUOutputVector);    
    #else
        //Extracting the kernel
        cl_kernel run_as_gpu_1 = clCreateKernel(OpenCLProgram, "RunAsGpu_2", NULL);
        //设置kernel参数
        cl_int M_d = M;
        cl_int P_d = P;
        cl_int N_d = N;
        clSetKernelArg(run_as_gpu_1, 0, sizeof(cl_mem), &GPUVector1);
        clSetKernelArg(run_as_gpu_1, 1, sizeof(cl_mem), &GPUVector2);
        clSetKernelArg(run_as_gpu_1, 2, sizeof(int), &M_d);
        clSetKernelArg(run_as_gpu_1, 3, sizeof(int), &N_d);
        clSetKernelArg(run_as_gpu_1, 4, sizeof(int), &P_d);
        clSetKernelArg(run_as_gpu_1, 5, sizeof(cl_mem), &GPUOutputVector);
        //create event
        cl_event event = clCreateUserEvent(GPUContext, NULL);
        size_t WorkSize[2] = { M,N };
        clEnqueueNDRangeKernel(cqCommandQueue, run_as_gpu_1, 2, NULL, WorkSize, NULL, 0, NULL, &event);
        clEnqueueReadBuffer(cqCommandQueue, GPUOutputVector, CL_TRUE, 0, SIZE * sizeof(float), HostOutputVector, 0, NULL, NULL);
        clReleaseKernel(run_as_gpu_1);
        clReleaseProgram(OpenCLProgram);
        clReleaseCommandQueue(cqCommandQueue);
        clReleaseContext(GPUContext);
        clReleaseMemObject(GPUVector1);
        clReleaseMemObject(GPUVector2);
        clReleaseMemObject(GPUOutputVector);   
    #endif
        clWaitForEvents(1, &event);
    
        clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);
        clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);
    
        total_time = (double)(end - start);     
    
        for( int i =0 ; i < 10; i++)
        {
            printf("%0.3f ", HostOutputVector[i]);
        }
        printf("\n");
        printf("\nExecution 2 time in milliseconds = %0.3f ms\n", (total_time / 1000000.0) );     
        start = time_stamp(); 
        RunAsCpu(HostVector1, HostVector2, HostOutputVector);
        end = time_stamp();
        total_time = (double)(end - start);
        printf("\nExecution 1 time in milliseconds = %0.3f ms\n", total_time);  
        return 0;
    }
    

    在终端输入以下命令编译:

    sudo gcc -o cl_mulmatrix  cl_mulmatrix.c -lOpenCL
    

    -lOpenCL用于链接openCL的库,编译后会生成cl_mulmatrix的可执行文件。
    运行应用,查看结果:

    sudo gcc -o cl_mulmatrix  cl_mulmatrix.c -lOpenCL
    ./cl_mulmatrix
    CL_DEVICE_NAME: Mali-T860
    CL_DRIVER_VERSION: 1.2
    src_size = 1089
    build_log 1:
    40661.801 49127.918 39687.023 42113.297 37694.113 48052.055 41653.059 40994.293 38949.027 49471.305
    Execution 2 time in milliseconds = 8.442 ms
    Execution 1 time in milliseconds = 309.000 ms
    

    结果比较

    通过两个程序可以看到,两个256*256的矩阵相乘,python通过numpy的运算,时间是9.578 ms,而如果通过openCL运算,时间是8.442 ms,如果不经任何优化直接在CPU上计算,时间是309 ms。
    两个平台的运算结果也是一致的:

    #python
    [40661.8   49127.918 39687.023 42113.31  37694.105 48052.055 41653.062 40994.29  38949.03 ]
    #openCL
    40661.801 49127.918 39687.023 42113.297 37694.113 48052.055 41653.059 40994.293 38949.027
    

    如果将数组的大小调整为512*512,再来看两个的运行数据:

    #python
    python3 testpy.py
    [76084.414 99685.69  80474.23  76532.68  77556.67  99632.86  82275.73
     78062.05  81567.11 ]
    0.07016563415527344
    test end!!
    #openCL
    ./cl_mulmatrix
    CL_DEVICE_NAME: Mali-T860
    CL_DRIVER_VERSION: 1.2
    src_size = 1089
    build_log 1:
    76084.375 99685.688 80474.227 76532.664 77556.680 99632.859 82275.727 78062.008 81567.109
    Execution 2 time in milliseconds = 81.042 ms
    Execution 1 time in milliseconds = 2994.000 ms
    

    python通过numpy的运算是70ms,而openCL则需要81ms,在这个平台上openCL的优势并不明显。
    以下是运行算法时CPU的占用率分析:
    openCL:



    numpy:


    相关文章

      网友评论

          本文标题:在RK3399上使用openCL和numpy对比

          本文链接:https://www.haomeiwen.com/subject/yrzmeqtx.html