在RK3399上使用openCL和numpy对比

作者: shaniadolphin | 来源:发表于2019-02-15 21:49 被阅读0次

在RK3399上使用openCL和numpy对比
OpenCL 使用速度对比
快 2 倍，TFLite 推出基于 OpenCL 的 Andro
使用opencl
Numpy中的数组叠加问题
OpenCV配置
Numpy | 与Matlab科学计算对比
torch和numpy的对比
Python NumPy 实例教程 [译]
Python NumPy 实例教程 [译]

点乘的python实现

python里的点乘非常简单，通过numpy自带的函数numpy.dot即可实现。以下是实现的代码：先给数组pInMatA和pInMatB赋特定值（为了比较不同算法下的值是否一致，没有使用随机值），再通过np.dot(A, B)对矩阵A和B进行点乘。

#encoding: utf-8
import numpy as np
import os
import gc
import datetime
import time

# Matrix dimensions: A is reshaped to (M, P) and B to (P, N), so A.dot(B) is (M, N).
M = 256
P = 256
N = 256

class testopencl(object):
    """NumPy reference benchmark for the matrix product computed by the OpenCL demo."""

    @staticmethod
    def matrixmul(m=None, p=None, n=None):
        """Time ``np.dot`` on an (m x p) by (p x n) float32 matrix product.

        Both inputs are filled with a fixed, repeating 20-value pattern
        divided by 3.3 (no random data), so that results can be compared
        between implementations.

        Args:
            m: Rows of A; defaults to the module-level ``M``.
            p: Columns of A / rows of B; defaults to the module-level ``P``.
            n: Columns of B; defaults to the module-level ``N``.

        Returns:
            The (m x n) float32 product. The first (up to) nine values of its
            first row and the elapsed seconds are also printed.

        Raises:
            ValueError: If any dimension is not a positive integer.
        """
        # One period of the test pattern; the data is indexed modulo 20.
        listA = [37, 50, 54, 50, 56, 0, 43, 43, 74, 71,
                 32, 36, 16, 43, 56, 100, 50, 25, 15, 17]
        listB = [35, 51, 54, 58, 55, 32, 36, 69, 27, 39,
                 35, 40, 16, 44, 55, 14, 58, 75, 18, 15]

        # Module-level sizes are looked up only when no override is given.
        rows = M if m is None else m
        inner = P if p is None else p
        cols = N if n is None else n
        if min(rows, inner, cols) < 1:
            raise ValueError("matrix dimensions must be positive")

        # Divide in float64 first and round to float32 afterwards, which is
        # what assigning a Python float into a float32 array does.
        seedA = np.array(listA, dtype=np.float64) / 3.3
        seedB = np.array(listB, dtype=np.float64) / 3.3
        # np.resize repeats the pattern cyclically; each matrix is sized by
        # its own shape, so the fill stays correct when M, P and N differ.
        pInMatA = np.resize(seedA, rows * inner).astype(np.float32)
        pInMatB = np.resize(seedB, inner * cols).astype(np.float32)

        A = pInMatA.reshape(rows, inner)
        B = pInMatB.reshape(inner, cols)

        # perf_counter is monotonic and has finer resolution than time.time().
        start = time.perf_counter()
        data2 = np.dot(A, B)
        t2 = time.perf_counter() - start

        print(data2[0][0:9])
        print(t2)
        gc.collect()
        return data2
        
# Script entry point: run the NumPy benchmark once.
if __name__ == "__main__":
    try:
        testopencl.matrixmul()
    finally:
        # Printed whether or not the benchmark raised.
        print('test end!!')

运行结果如下，可以看到结果第一行前9个数据：

python3 testpy.py
[40661.8   49127.918 39687.023 42113.31  37694.105 48052.055 41653.062 40994.29  38949.03 ]
0.009578227996826172
test end!!

openCL程序

新建一个OpenCLMulMatrix.cl的文件，将以下内容复制到文件中：

// Naive matrix product C = A * B; each work-item computes one element of C.
//   A is M x P, B is P x N, C is M x N, all stored row-major.
//   get_global_id(0) selects the column of C, get_global_id(1) the row.
__kernel void RunAsGpu_2(
    __global const float *A,
    __global const float *B,
    int M,
    int N,
    int P,
    __global float *C)
{
    int col = get_global_id(0);
    int row = get_global_id(1);

    // Work-items outside the matrix (oversized or rounded-up NDRange) must
    // not touch memory.
    if (col >= N || row >= M)
    {
        return;
    }

    float acc = 0.0f;
    for (int k = 0; k < P; k++)
    {
        acc += A[row * P + k] * B[k * N + col];
    }
    C[row * N + col] = acc;
}
// Element-wise integer addition: c[i] = a[i] + b[i], one work-item per element.
__kernel void VectorAdd(
    __global int* c,
    __global int* a,
    __global int* b)
{
    // This work-item's position in the one-dimensional range
    const unsigned int gid = get_global_id(0);
    const int lhs = a[gid];
    const int rhs = b[gid];
    c[gid] = lhs + rhs;
}

这个程序中包括两个内核功能，RunAsGpu_2和VectorAdd。

主机程序

通过ssh登陆到RK3399的主机上，挂载共享文件夹：

sudo mount -o username=guest,password="" //192.168.199.1/vr /samba
cd /samba/testocl

新建cl_mulmatrix.c文件，复制以下代码到文件中：

//************************************************************
// Demo OpenCL application to compute a simple matrix multiplication
// of two matrices on the GPU, with a plain CPU loop for comparison
// ************************************************************
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <CL/cl.h>

// Matrix dimensions: A is M x P, B is P x N, the product C is M x N.
#define M 256
#define P 256
#define N 256

// OpenCL source code
// Embedded element-wise kernel, stored as string fragments (7 entries).
// In the code visible here it is referenced only from a commented-out
// clCreateProgramWithSource call.
// NOTE(review): despite the VectorAdd name and the "Sum" comment inside the
// string, the kernel body multiplies a[n] by b[n] - confirm which is intended.
const char* OpenCLSource[] = {
"__kernel void VectorAdd(__global float* c, __global float* a,__global float* b)",
"{",
" // Index of the elements to add \n",
" unsigned int n = get_global_id(0);",
" // Sum the nth element of vectors a and b and store in c \n",
" c[n] = a[n] * b[n];",
"}"
};

// Read the whole of OpenCLMulMatrix.cl (looked up in the current directory)
// into a freshly malloc'ed, NUL-terminated buffer.
//
// Returns the buffer on success; the caller owns it and may free() it once
// the program has been created. Returns NULL, after printing a diagnostic to
// stderr, when the file cannot be opened, measured, allocated or read.
char *openclfile(void)
{
    const char *path = "OpenCLMulMatrix.cl";
    char *source = NULL;
    long file_len = -1;

    FILE *fp = fopen(path, "rb");
    if (fp == NULL)
    {
        perror(path);
        return NULL;
    }

    // Determine the file length by seeking to the end, then rewind.
    if (fseek(fp, 0, SEEK_END) == 0)
    {
        file_len = ftell(fp);
    }
    if (file_len >= 0 && fseek(fp, 0, SEEK_SET) == 0)
    {
        // One extra byte for the terminating NUL.
        source = (char *)malloc((size_t)file_len + 1);
    }

    if (source == NULL)
    {
        fprintf(stderr, "%s: cannot determine size or allocate buffer\n", path);
    }
    else
    {
        printf("src_size = %ld\n", file_len);
        size_t got = fread(source, sizeof(char), (size_t)file_len, fp);
        if (got != (size_t)file_len)
        {
            fprintf(stderr, "%s: short read\n", path);
            free(source);
            source = NULL;
        }
        else
        {
            source[got] = '\0';
        }
    }

    fclose(fp);
    // The buffer is deliberately NOT freed here: it is the return value.
    return source;
}

// Reference matrix product on the CPU: c = a * b.
//   a: M x P, row-major (input)
//   b: P x N, row-major (input)
//   c: M x N, row-major (output, fully overwritten)
void RunAsCpu(
    const float *a,
    const float *b,
    float* c)
{
    for (int i = 0; i < M; i++)
    {
        for (int j = 0; j < N; j++)
        {
            float acc = 0.0f;
            for (int k = 0; k < P; k++)
            {
                // The right-hand factor comes from b; reading a here would
                // compute a * a instead of a * b.
                acc += a[i*P + k] * b[k*N + j];
            }
            c[i*N + j] = acc;
        }
    }
}

//计时函数
cl_ulong time_stamp()
{
    clock_t ticks;
    cl_ulong timet;
    ticks = clock();
    
    timet = ticks * 1000 / CLOCKS_PER_SEC;
    //printf("ticks = %ld;timet= %ld;clocks= %ld\n", ticks, timet, CLOCKS_PER_SEC);
    return timet;
}

// Some interesting data for the vectors
// Each array holds one 20-value pattern repeated four times (80 entries).
int InitialData1[80] = {37,50,54,50,56,0,43,43,74,71,32,36,16,43,56,100,50,25,15,17,37,50,54,50,56,0,43,43,74,71,32,36,16,43,56,100,50,25,15,17,37,50,54,50,56,0,43,43,74,71,32,36,16,43,56,100,50,25,15,17,37,50,54,50,56,0,43,43,74,71,32,36,16,43,56,100,50,25,15,17};
int InitialData2[80] = {35,51,54,58,55,32,36,69,27,39,35,40,16,44,55,14,58,75,18,15,35,51,54,58,55,32,36,69,27,39,35,40,16,44,55,14,58,75,18,15,35,51,54,58,55,32,36,69,27,39,35,40,16,44,55,14,58,75,18,15,35,51,54,58,55,32,36,69,27,39,35,40,16,44,55,14,58,75,18,15};
// Number of elements in an M x N matrix
#define SIZE (M*N)
// Main function
// ************************************************************
int main(int argc, char **argv)
{   
    // Two integer source vectors in Host memory
    float HostVector1[SIZE], HostVector2[SIZE];
    //Output Vector
    float HostOutputVector[SIZE];
    double total_time;
    char cBuffer[1024];
    cl_ulong start = 0, end = 0;
    const char* sourcefile[1];
    //读取OpenCLSum.cl文件内容
    FILE* fp = fopen("OpenCLMulMatrix.cl", "rb");   
    // Initialize with some interesting repeating data
    for(int c = 0; c < SIZE; c++)
    {
        HostVector1[c] = (float)InitialData1[c%20] / 3.3f;
        HostVector2[c] = (float)InitialData2[c%20] / 3.3f;
        HostOutputVector[c] = 0.0f;
    }
    //Get an OpenCL platform
    cl_platform_id cpPlatform;
    clGetPlatformIDs(1, &cpPlatform, NULL);
    // Get a GPU device
    cl_device_id cdDevice;
    clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &cdDevice, NULL);

    clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(cBuffer), &cBuffer, NULL);
    printf("CL_DEVICE_NAME: %s\n", cBuffer);
    clGetDeviceInfo(cdDevice, CL_DRIVER_VERSION, sizeof(cBuffer), &cBuffer, NULL);
    printf("CL_DRIVER_VERSION: %s\n\n", cBuffer);
    // Create a context to run OpenCL enabled GPU
    cl_context GPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, NULL, NULL, NULL);     
    // Create a command-queue on the GPU device
    cl_command_queue cqCommandQueue = clCreateCommandQueue(GPUContext, cdDevice, CL_QUEUE_PROFILING_ENABLE, NULL);
    // Allocate GPU memory for source vectors AND initialize from CPU memory

    // Create OpenCL program with source code
    //
#if 0   
    fseek(fp, 0, SEEK_END);
    size_t src_size = ftell(fp);
    //source = (char *)malloc(src_size);
    printf("src_size = %ld\n", src_size);
    fseek(fp, 0, SEEK_SET);
    fread((void*)sourcefile, sizeof(char), src_size, fp);
    for(int i=0;i<100;i++)
        printf("%c",sourcefile[i]);
    printf("\n");
    fclose(fp);
    cl_program OpenCLProgram = clCreateProgramWithSource(GPUContext, 7, &sourcefile, NULL, NULL);
    //free(&source);
#else
    sourcefile[0] = openclfile();
    //for(int i=0;i<1000;i++)
    //  printf("%c",sourcefile[0][i]);
    //printf("\n");
    cl_program OpenCLProgram = clCreateProgramWithSource(GPUContext, 1, sourcefile, NULL, NULL);
    //cl_program OpenCLProgram = clCreateProgramWithSource(GPUContext, 7, OpenCLSource, NULL, NULL);
#endif
    // Build the program (OpenCL JIT compilation)
    clBuildProgram(OpenCLProgram, 0, NULL, NULL, NULL, NULL);
    
#if 1
    //Shows the log
    char* build_log;
    size_t log_size;
    //First call to know the proper size
    clGetProgramBuildInfo(OpenCLProgram, cdDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
    build_log = (char*)malloc(log_size + 1);
    // Second call to get the log
    clGetProgramBuildInfo(OpenCLProgram, cdDevice, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL);
    build_log[log_size] = '\0';
    printf("build_log %ld:%s", log_size,build_log);
    printf("\n");
    free(build_log);
#endif
    cl_mem GPUVector1 = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float) * SIZE, HostVector1, NULL);
    cl_mem GPUVector2 = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float) * SIZE, HostVector2, NULL);
    // Allocate output memory on GPU
    cl_mem GPUOutputVector = clCreateBuffer(GPUContext, CL_MEM_WRITE_ONLY, sizeof(float) * SIZE, NULL, NULL);
#if 0
    // Create a handle to the compiled OpenCL function (Kernel)
    cl_kernel OpenCLVectorAdd = clCreateKernel(OpenCLProgram, "VectorAdd", NULL);
    // In the next step we associate the GPU memory with the Kernel arguments
    clSetKernelArg(OpenCLVectorAdd, 0, sizeof(cl_mem), (void*)&GPUOutputVector);
    clSetKernelArg(OpenCLVectorAdd, 1, sizeof(cl_mem), (void*)&GPUVector1);
    clSetKernelArg(OpenCLVectorAdd, 2, sizeof(cl_mem), (void*)&GPUVector2);

    //create event
    cl_event event = clCreateUserEvent(GPUContext, NULL);

    // Launch the Kernel on the GPU
    // This kernel only uses global data
    size_t WorkSize[1] = {SIZE}; // one dimensional Range
    clEnqueueNDRangeKernel(cqCommandQueue, OpenCLVectorAdd, 1, NULL, WorkSize, NULL, 0, NULL, &event);
    // Copy the output in GPU memory back to CPU memory
    clEnqueueReadBuffer(cqCommandQueue, GPUOutputVector, CL_TRUE, 0, SIZE * sizeof(float), HostOutputVector, 0, NULL, NULL);
    // Cleanup
    clReleaseKernel(OpenCLVectorAdd);
    clReleaseProgram(OpenCLProgram);
    clReleaseCommandQueue(cqCommandQueue);
    clReleaseContext(GPUContext);
    clReleaseMemObject(GPUVector1);
    clReleaseMemObject(GPUVector2);
    clReleaseMemObject(GPUOutputVector);    
#else
    //Extracting the kernel
    cl_kernel run_as_gpu_1 = clCreateKernel(OpenCLProgram, "RunAsGpu_2", NULL);
    //设置kernel参数
    cl_int M_d = M;
    cl_int P_d = P;
    cl_int N_d = N;
    clSetKernelArg(run_as_gpu_1, 0, sizeof(cl_mem), &GPUVector1);
    clSetKernelArg(run_as_gpu_1, 1, sizeof(cl_mem), &GPUVector2);
    clSetKernelArg(run_as_gpu_1, 2, sizeof(int), &M_d);
    clSetKernelArg(run_as_gpu_1, 3, sizeof(int), &N_d);
    clSetKernelArg(run_as_gpu_1, 4, sizeof(int), &P_d);
    clSetKernelArg(run_as_gpu_1, 5, sizeof(cl_mem), &GPUOutputVector);
    //create event
    cl_event event = clCreateUserEvent(GPUContext, NULL);
    size_t WorkSize[2] = { M,N };
    clEnqueueNDRangeKernel(cqCommandQueue, run_as_gpu_1, 2, NULL, WorkSize, NULL, 0, NULL, &event);
    clEnqueueReadBuffer(cqCommandQueue, GPUOutputVector, CL_TRUE, 0, SIZE * sizeof(float), HostOutputVector, 0, NULL, NULL);
    clReleaseKernel(run_as_gpu_1);
    clReleaseProgram(OpenCLProgram);
    clReleaseCommandQueue(cqCommandQueue);
    clReleaseContext(GPUContext);
    clReleaseMemObject(GPUVector1);
    clReleaseMemObject(GPUVector2);
    clReleaseMemObject(GPUOutputVector);   
#endif
    clWaitForEvents(1, &event);

    clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);
    clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);

    total_time = (double)(end - start);     

    for( int i =0 ; i < 10; i++)
    {
        printf("%0.3f ", HostOutputVector[i]);
    }
    printf("\n");
    printf("\nExecution 2 time in milliseconds = %0.3f ms\n", (total_time / 1000000.0) );     
    start = time_stamp(); 
    RunAsCpu(HostVector1, HostVector2, HostOutputVector);
    end = time_stamp();
    total_time = (double)(end - start);
    printf("\nExecution 1 time in milliseconds = %0.3f ms\n", total_time);  
    return 0;
}

在终端输入以下命令编译：

sudo gcc -o cl_mulmatrix  cl_mulmatrix.c -lOpenCL

-lOpenCL用于链接openCL的库，编译后会生成cl_mulmatrix的可执行文件。
运行应用，查看结果：

sudo gcc -o cl_mulmatrix  cl_mulmatrix.c -lOpenCL
./cl_mulmatrix
CL_DEVICE_NAME: Mali-T860
CL_DRIVER_VERSION: 1.2
src_size = 1089
build_log 1:
40661.801 49127.918 39687.023 42113.297 37694.113 48052.055 41653.059 40994.293 38949.027 49471.305
Execution 2 time in milliseconds = 8.442 ms
Execution 1 time in milliseconds = 309.000 ms

结果比较

通过两个程序可以看到，对两个256*256的矩阵做乘法运算，python通过numpy的运算，时间是9.578 ms，而如果通过openCL运算，时间是8.442 ms，如果不经任何优化直接在CPU上用C循环计算，时间是309 ms。
两个平台的运算结果也是一致的：

#python
[40661.8   49127.918 39687.023 42113.31  37694.105 48052.055 41653.062 40994.29  38949.03 ]
#openCL
40661.801 49127.918 39687.023 42113.297 37694.113 48052.055 41653.059 40994.293 38949.027

如果将数组的大小调整为512*512，再来看两个的运行数据：

#python
python3 testpy.py
[76084.414 99685.69  80474.23  76532.68  77556.67  99632.86  82275.73
 78062.05  81567.11 ]
0.07016563415527344
test end!!
#openCL
./cl_mulmatrix
CL_DEVICE_NAME: Mali-T860
CL_DRIVER_VERSION: 1.2
src_size = 1089
build_log 1:
76084.375 99685.688 80474.227 76532.664 77556.680 99632.859 82275.727 78062.008 81567.109
Execution 2 time in milliseconds = 81.042 ms
Execution 1 time in milliseconds = 2994.000 ms

python通过numpy的运算是70ms，而openCL则需要81ms，在这个平台上openCL的优势并不明显。
以下是运行算法时CPU的占用率分析：
openCL：

numpy：

在RK3399上使用openCL和numpy对比
点乘的python实现 python里的点乘非常简单，通过numpy自带的运算符号numpy.dot即可实现，以下...
OpenCL 使用速度对比
1.仅使用 OpenCV 2.集显使用 OpenCL 3.独显使用 OpenCL
快 2 倍，TFLite 推出基于 OpenCL 的 Andro
TensorFlow Lite（TFLite）现在支持在 Android 设备上使用 OpenCL 进行 GPU ...
使用opencl
标签： python opencl 矩阵乘法对于以下是一个常见的线性方程组，用矩阵表示就是：推导出矩...
Numpy中的数组叠加问题
Numpy中的函数r_和c_ 问题：如何使用numpy中的函数实现数组在水平方向和垂直方向的上的叠加？答案： ...
OpenCV配置
OpenCL & OpenCV环境配置本次实验需要配置OpenCL和OpenCV环境。需求硬件要求：首先需要...
Numpy | 与Matlab科学计算对比
Numpy与Matlab科学计算对比 MATLAB®和NumPy / SciPy有很多共同之处。但是有很多不同之处...
torch和numpy的对比
一、numpy和torch numpy是python中处理数据的模块，可以处理各种的矩阵(matrix)。 Tor...
Python NumPy 实例教程 [译]
# Python NumPy 实例教程 [译] *在本教程中，您将找到使用NumPy解决数值计算和科学计算问题的解...
Python NumPy 实例教程 [译]
在本教程中，您将找到使用NumPy解决数值计算和科学计算问题的解决方案。 NumPy（Numerical Pyth...