Python implementation of the dot product
The dot product is trivial to implement in Python: NumPy's numpy.dot function does it directly. A short illustrative example comes first, followed by the full benchmark script, which fills the arrays pInMatA and pInMatB with fixed values (rather than random ones, so that results from the different implementations can be compared) and then calls np.dot(A, B) to multiply the matrices A and B.
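A minimal sketch (small, hand-checkable matrices; the values here are just examples) showing that np.dot and the @ operator compute the same matrix product:
import numpy as np

A = np.array([[1.0, 2.0],
              [3.0, 4.0]])
B = np.array([[5.0, 6.0],
              [7.0, 8.0]])

C1 = np.dot(A, B)   # matrix product via np.dot
C2 = A @ B          # the @ operator is equivalent for 2-D arrays

print(C1)                       # [[19. 22.] [43. 50.]]
print(np.array_equal(C1, C2))   # True
The full script used for the timing comparison follows.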
#encoding: utf-8
import numpy as np
import os
import gc
import datetime
import time
M = 256
P = 256
N = 256
class testopencl(object):
    @staticmethod
    def matrixmul():
        listA = [37, 50, 54, 50, 56, 0, 43, 43, 74, 71, 32, 36, 16, 43, 56, 100, 50, 25, 15, 17,
                 37, 50, 54, 50, 56, 0, 43, 43, 74, 71, 32, 36, 16, 43, 56, 100, 50, 25, 15, 17,
                 37, 50, 54, 50, 56, 0, 43, 43, 74, 71, 32, 36, 16, 43, 56, 100, 50, 25, 15, 17,
                 37, 50, 54, 50, 56, 0, 43, 43, 74, 71, 32, 36, 16, 43, 56, 100, 50, 25, 15, 17]
        listB = [35, 51, 54, 58, 55, 32, 36, 69, 27, 39, 35, 40, 16, 44, 55, 14, 58, 75, 18, 15,
                 35, 51, 54, 58, 55, 32, 36, 69, 27, 39, 35, 40, 16, 44, 55, 14, 58, 75, 18, 15,
                 35, 51, 54, 58, 55, 32, 36, 69, 27, 39, 35, 40, 16, 44, 55, 14, 58, 75, 18, 15,
                 35, 51, 54, 58, 55, 32, 36, 69, 27, 39, 35, 40, 16, 44, 55, 14, 58, 75, 18, 15]
        pInMatA = np.arange(M * P, dtype=np.float32)
        pInMatB = np.arange(P * N, dtype=np.float32)
        pOutMat = np.empty(M * N, dtype=np.float32)
        # Fill the inputs with a repeating 20-element pattern (only listA[0:20] and listB[0:20] are used)
        for c in range(M * N):
            pInMatA[c] = listA[c % 20] / 3.3
            pInMatB[c] = listB[c % 20] / 3.3
        A = np.reshape(pInMatA, (M, P))
        B = np.reshape(pInMatB, (P, N))
        start = time.time()
        data2 = np.dot(A, B)
        t2 = time.time() - start
        print(data2[0][0:9])
        print(t2)
        gc.collect()

if __name__ == "__main__":
    try:
        testopencl.matrixmul()
    finally:
        print('test end!!')
The output is shown below; the script prints the first nine elements of the first row of the result, followed by the elapsed time:
python3 testpy.py
[40661.8 49127.918 39687.023 42113.31 37694.105 48052.055 41653.062 40994.29 38949.03 ]
0.009578227996826172
test end!!
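As a quick sanity check of the first printed value, a minimal sketch that can be appended to the end of matrixmul(), where A, B and P are still in scope: data2[0][0] is simply the dot product of row 0 of A with column 0 of B.
# Recompute data2[0][0] by explicit summation over row 0 of A and column 0 of B
manual = sum(float(A[0, k]) * float(B[k, 0]) for k in range(P))
print(manual)   # should agree with the first printed value (~40661.8) up to float rounding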
OpenCL program
Create a file named OpenCLMulMatrix.cl and copy the following into it:
__kernel void RunAsGpu_2(
    __global float *A,
    __global float *B,
    int M,
    int N,
    int P,
    __global float *C)
{
    int x = get_global_id(0);
    int y = get_global_id(1);
    float sum = 0;
    for(int i = 0; i < P; i++)
    {
        sum += A[y*P + i] * B[i*N + x];
    }
    C[y*N + x] = sum;
}

__kernel void VectorAdd(
    __global int* c,
    __global int* a,
    __global int* b)
{
    // Index of the elements to add
    unsigned int n = get_global_id(0);
    // Sum the nth element of vectors a and b and store in c
    c[n] = a[n] + b[n];
}
This file contains two kernels: RunAsGpu_2, which computes the matrix product, and VectorAdd, which adds two vectors element by element (only RunAsGpu_2 is exercised in the host program's enabled path).
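To make the index arithmetic in RunAsGpu_2 concrete, here is a plain-Python sketch (illustration only, not part of the benchmark) of what a single work-item with global IDs (x, y) computes:
# Python equivalent of one RunAsGpu_2 work-item.
# A, B and C are flat 1-D buffers, exactly as the kernel sees them.
def run_as_gpu_2_single_item(A, B, M, N, P, C, x, y):
    s = 0.0
    for i in range(P):
        s += A[y * P + i] * B[i * N + x]   # row y of A times column x of B
    C[y * N + x] = s

# The host enqueues a 2-D range of size {M, N}, so every (x, y) with
# x in range(M) and y in range(N) runs once (here M == N == P == 256).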
Host program
Log in to the RK3399 board over ssh and mount the shared folder:
sudo mount -o username=guest,password="" //192.168.199.1/vr /samba
cd /samba/testocl
Create a file named cl_mulmatrix.c and copy the following code into it:
//************************************************************
// Demo OpenCL application: matrix multiplication on the GPU,
// compared against a naive CPU implementation
//************************************************************
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <CL/cl.h>
#define M 256
#define P 256
#define N 256
// OpenCL source code (embedded fallback; the program actually loads OpenCLMulMatrix.cl)
const char* OpenCLSource[] = {
    "__kernel void VectorAdd(__global float* c, __global float* a,__global float* b)",
    "{",
    "    // Index of the elements to process \n",
    "    unsigned int n = get_global_id(0);",
    "    // Multiply the nth elements of a and b and store in c \n",
    "    c[n] = a[n] * b[n];",
    "}"
};
// Read the contents of OpenCLMulMatrix.cl into a heap-allocated, NUL-terminated string.
// The caller is responsible for freeing the returned buffer.
char *openclfile(void)
{
    char* source;
    FILE* fp = fopen("OpenCLMulMatrix.cl", "rb");
    fseek(fp, 0, SEEK_END);
    size_t src_size = ftell(fp);
    source = (char *)malloc(src_size + 1);
    printf("src_size = %zu\n", src_size);
    fseek(fp, 0, SEEK_SET);
    fread(source, sizeof(char), src_size, fp);
    source[src_size] = '\0';
    fclose(fp);
    return source;   // do not free here: the buffer is still needed by the caller
}
// Naive CPU reference implementation of the same matrix product
void RunAsCpu(
    const float *a,
    const float *b,
    float* c)
{
    for (int i = 0; i < M; i++)
    {
        for (int j = 0; j < N; j++)
        {
            c[i*N + j] = 0.0;
            for (int k = 0; k < P; k++)
            {
                c[i*N + j] += a[i*P + k] * b[k*N + j];
            }
        }
    }
}
// Millisecond timestamp based on clock() (CPU time used by the process)
cl_ulong time_stamp()
{
    clock_t ticks;
    cl_ulong timet;
    ticks = clock();
    timet = ticks * 1000 / CLOCKS_PER_SEC;
    return timet;
}
// Some interesting data for the vectors
int InitialData1[80] = {37,50,54,50,56,0,43,43,74,71,32,36,16,43,56,100,50,25,15,17,37,50,54,50,56,0,43,43,74,71,32,36,16,43,56,100,50,25,15,17,37,50,54,50,56,0,43,43,74,71,32,36,16,43,56,100,50,25,15,17,37,50,54,50,56,0,43,43,74,71,32,36,16,43,56,100,50,25,15,17};
int InitialData2[80] = {35,51,54,58,55,32,36,69,27,39,35,40,16,44,55,14,58,75,18,15,35,51,54,58,55,32,36,69,27,39,35,40,16,44,55,14,58,75,18,15,35,51,54,58,55,32,36,69,27,39,35,40,16,44,55,14,58,75,18,15,35,51,54,58,55,32,36,69,27,39,35,40,16,44,55,14,58,75,18,15};
// Number of elements in the vectors to be added
#define SIZE (M*N)
// Main function
// ************************************************************
int main(int argc, char **argv)
{
    // Two source matrices in host memory, stored as flat float arrays
    float HostVector1[SIZE], HostVector2[SIZE];
    // Output matrix
    float HostOutputVector[SIZE];
    double total_time;
    char cBuffer[1024];
    cl_ulong start = 0, end = 0;
    const char* sourcefile[1];
    // Open OpenCLMulMatrix.cl (only used by the disabled in-line loading path below)
    FILE* fp = fopen("OpenCLMulMatrix.cl", "rb");
    // Initialize with some interesting repeating data
    for(int c = 0; c < SIZE; c++)
    {
        HostVector1[c] = (float)InitialData1[c%20] / 3.3f;
        HostVector2[c] = (float)InitialData2[c%20] / 3.3f;
        HostOutputVector[c] = 0.0f;
    }
    // Get an OpenCL platform
    cl_platform_id cpPlatform;
    clGetPlatformIDs(1, &cpPlatform, NULL);
    // Get a GPU device
    cl_device_id cdDevice;
    clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &cdDevice, NULL);
    clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(cBuffer), &cBuffer, NULL);
    printf("CL_DEVICE_NAME: %s\n", cBuffer);
    clGetDeviceInfo(cdDevice, CL_DRIVER_VERSION, sizeof(cBuffer), &cBuffer, NULL);
    printf("CL_DRIVER_VERSION: %s\n\n", cBuffer);
    // Create a context to run OpenCL on the GPU
    cl_context GPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, NULL, NULL, NULL);
    // Create a command-queue on the GPU device, with profiling enabled
    cl_command_queue cqCommandQueue = clCreateCommandQueue(GPUContext, cdDevice, CL_QUEUE_PROFILING_ENABLE, NULL);
    // Create the OpenCL program from source
#if 0
    // (disabled) alternative: read the kernel source directly in main()
    fseek(fp, 0, SEEK_END);
    size_t src_size = ftell(fp);
    printf("src_size = %zu\n", src_size);
    fseek(fp, 0, SEEK_SET);
    fread((void*)sourcefile, sizeof(char), src_size, fp);
    for(int i = 0; i < 100; i++)
        printf("%c", sourcefile[i]);
    printf("\n");
    fclose(fp);
    cl_program OpenCLProgram = clCreateProgramWithSource(GPUContext, 7, &sourcefile, NULL, NULL);
#else
    // Load the kernel source from OpenCLMulMatrix.cl
    sourcefile[0] = openclfile();
    cl_program OpenCLProgram = clCreateProgramWithSource(GPUContext, 1, sourcefile, NULL, NULL);
    //cl_program OpenCLProgram = clCreateProgramWithSource(GPUContext, 7, OpenCLSource, NULL, NULL);
#endif
    // Build the program (OpenCL JIT compilation)
    clBuildProgram(OpenCLProgram, 0, NULL, NULL, NULL, NULL);
#if 1
    // Show the build log
    char* build_log;
    size_t log_size;
    // First call, to learn the required size
    clGetProgramBuildInfo(OpenCLProgram, cdDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
    build_log = (char*)malloc(log_size + 1);
    // Second call, to get the log itself
    clGetProgramBuildInfo(OpenCLProgram, cdDevice, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL);
    build_log[log_size] = '\0';
    printf("build_log %zu:%s", log_size, build_log);
    printf("\n");
    free(build_log);
#endif
    // Allocate GPU memory for the source matrices and initialize it from host memory
    cl_mem GPUVector1 = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float) * SIZE, HostVector1, NULL);
    cl_mem GPUVector2 = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float) * SIZE, HostVector2, NULL);
    // Allocate output memory on the GPU
    cl_mem GPUOutputVector = clCreateBuffer(GPUContext, CL_MEM_WRITE_ONLY, sizeof(float) * SIZE, NULL, NULL);
#if 0
    // (disabled) vector-addition path using the VectorAdd kernel
    // Create a handle to the compiled OpenCL function (kernel)
    cl_kernel OpenCLVectorAdd = clCreateKernel(OpenCLProgram, "VectorAdd", NULL);
    // Associate the GPU memory with the kernel arguments
    clSetKernelArg(OpenCLVectorAdd, 0, sizeof(cl_mem), (void*)&GPUOutputVector);
    clSetKernelArg(OpenCLVectorAdd, 1, sizeof(cl_mem), (void*)&GPUVector1);
    clSetKernelArg(OpenCLVectorAdd, 2, sizeof(cl_mem), (void*)&GPUVector2);
    // Event used to collect profiling information (filled in by clEnqueueNDRangeKernel)
    cl_event event = clCreateUserEvent(GPUContext, NULL);
    // Launch the kernel on the GPU; this kernel only uses global data
    size_t WorkSize[1] = {SIZE}; // one-dimensional range
    clEnqueueNDRangeKernel(cqCommandQueue, OpenCLVectorAdd, 1, NULL, WorkSize, NULL, 0, NULL, &event);
    // Copy the output in GPU memory back to CPU memory
    clEnqueueReadBuffer(cqCommandQueue, GPUOutputVector, CL_TRUE, 0, SIZE * sizeof(float), HostOutputVector, 0, NULL, NULL);
    // Cleanup
    clReleaseKernel(OpenCLVectorAdd);
    clReleaseProgram(OpenCLProgram);
    clReleaseCommandQueue(cqCommandQueue);
    clReleaseContext(GPUContext);
    clReleaseMemObject(GPUVector1);
    clReleaseMemObject(GPUVector2);
    clReleaseMemObject(GPUOutputVector);
#else
    // Create a handle to the matrix-multiplication kernel
    cl_kernel run_as_gpu_1 = clCreateKernel(OpenCLProgram, "RunAsGpu_2", NULL);
    // Set the kernel arguments
    cl_int M_d = M;
    cl_int P_d = P;
    cl_int N_d = N;
    clSetKernelArg(run_as_gpu_1, 0, sizeof(cl_mem), &GPUVector1);
    clSetKernelArg(run_as_gpu_1, 1, sizeof(cl_mem), &GPUVector2);
    clSetKernelArg(run_as_gpu_1, 2, sizeof(int), &M_d);
    clSetKernelArg(run_as_gpu_1, 3, sizeof(int), &N_d);
    clSetKernelArg(run_as_gpu_1, 4, sizeof(int), &P_d);
    clSetKernelArg(run_as_gpu_1, 5, sizeof(cl_mem), &GPUOutputVector);
    // Event used to collect profiling information (filled in by clEnqueueNDRangeKernel)
    cl_event event = clCreateUserEvent(GPUContext, NULL);
    // Launch an M x N grid of work-items; each one computes one output element
    size_t WorkSize[2] = { M, N };
    clEnqueueNDRangeKernel(cqCommandQueue, run_as_gpu_1, 2, NULL, WorkSize, NULL, 0, NULL, &event);
    // Copy the result back to host memory (blocking read)
    clEnqueueReadBuffer(cqCommandQueue, GPUOutputVector, CL_TRUE, 0, SIZE * sizeof(float), HostOutputVector, 0, NULL, NULL);
    // Cleanup
    clReleaseKernel(run_as_gpu_1);
    clReleaseProgram(OpenCLProgram);
    clReleaseCommandQueue(cqCommandQueue);
    clReleaseContext(GPUContext);
    clReleaseMemObject(GPUVector1);
    clReleaseMemObject(GPUVector2);
    clReleaseMemObject(GPUOutputVector);
#endif
    // Wait for the kernel to finish and read its profiling timestamps
    clWaitForEvents(1, &event);
    clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);
    clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);
    total_time = (double)(end - start);
    // Print the first ten results
    for(int i = 0; i < 10; i++)
    {
        printf("%0.3f ", HostOutputVector[i]);
    }
    printf("\n");
    printf("\nExecution 2 time in milliseconds = %0.3f ms\n", (total_time / 1000000.0));
    // Time the naive CPU implementation for comparison
    start = time_stamp();
    RunAsCpu(HostVector1, HostVector2, HostOutputVector);
    end = time_stamp();
    total_time = (double)(end - start);
    printf("\nExecution 1 time in milliseconds = %0.3f ms\n", total_time);
    return 0;
}
Compile it in a terminal with:
sudo gcc -o cl_mulmatrix cl_mulmatrix.c -lOpenCL
The -lOpenCL flag links against the OpenCL library; the build produces an executable named cl_mulmatrix.
Run the application and look at the output:
./cl_mulmatrix
CL_DEVICE_NAME: Mali-T860
CL_DRIVER_VERSION: 1.2
src_size = 1089
build_log 1:
40661.801 49127.918 39687.023 42113.297 37694.113 48052.055 41653.059 40994.293 38949.027 49471.305
Execution 2 time in milliseconds = 8.442 ms
Execution 1 time in milliseconds = 309.000 ms
Comparing the results
Comparing the two programs on the 256*256 matrix multiplication: Python with NumPy takes 9.578 ms, the OpenCL kernel takes 8.442 ms, and the completely unoptimized CPU loop takes several hundred milliseconds (309 ms in the run above).
The results from the two implementations also agree (a quick programmatic check follows the listings):
#python
[40661.8 49127.918 39687.023 42113.31 37694.105 48052.055 41653.062 40994.29 38949.03 ]
#openCL
40661.801 49127.918 39687.023 42113.297 37694.113 48052.055 41653.059 40994.293 38949.027
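Rather than comparing the listings by eye, the printed values can also be checked programmatically; a minimal sketch using the nine values shown above:
import numpy as np

# First nine elements of each result, copied from the two runs above
py_vals = np.array([40661.8, 49127.918, 39687.023, 42113.31, 37694.105,
                    48052.055, 41653.062, 40994.29, 38949.03], dtype=np.float32)
cl_vals = np.array([40661.801, 49127.918, 39687.023, 42113.297, 37694.113,
                    48052.055, 41653.059, 40994.293, 38949.027], dtype=np.float32)

# Differences of this size are ordinary float32 rounding; a relative
# tolerance around 1e-5 treats the two results as equal.
print(np.allclose(py_vals, cl_vals, rtol=1e-5))   # True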
If the matrix size is increased to 512*512, the two runs look like this:
#python
python3 testpy.py
[76084.414 99685.69 80474.23 76532.68 77556.67 99632.86 82275.73
78062.05 81567.11 ]
0.07016563415527344
test end!!
#openCL
./cl_mulmatrix
CL_DEVICE_NAME: Mali-T860
CL_DRIVER_VERSION: 1.2
src_size = 1089
build_log 1:
76084.375 99685.688 80474.227 76532.664 77556.680 99632.859 82275.727 78062.008 81567.109
Execution 2 time in milliseconds = 81.042 ms
Execution 1 time in milliseconds = 2994.000 ms
With NumPy, Python takes about 70 ms, while OpenCL needs 81 ms, so on this platform OpenCL's advantage is not obvious.
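To explore where the crossover lies on the NumPy side, a simple sketch (the sizes are just examples) that times np.dot at several matrix sizes, keeping the best of a few repetitions to smooth out run-to-run jitter:
import time
import numpy as np

for n in (256, 512, 1024):      # example sizes
    A = np.random.rand(n, n).astype(np.float32)
    B = np.random.rand(n, n).astype(np.float32)
    times = []
    for _ in range(5):          # repeat and keep the best time
        start = time.time()
        np.dot(A, B)
        times.append(time.time() - start)
    print("%dx%d: %.3f ms" % (n, n, min(times) * 1000))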
CPU utilization while each implementation runs, for the OpenCL version and for the NumPy version (screenshots).