美文网首页
CUDA C初学者编程 (VS2017)

CUDA C初学者编程 (VS2017)

作者: Eden0503 | 来源:发表于2018-10-24 20:13 被阅读0次

    打开VS2017后,文件——新建——项目


    找到NVIDIA,有的人说自己的VS中没看见NVIDIA这一项啊,那是因为你没有安装CUDA,或者你在安装CUDA的时候参照某教程将Visual Studio Integration 取消勾选安装,其实后来再重新装上就行。

    创建一个文件夹名为 cuda_test 的项目,然后我们发现其实里面已经有 .cu 文件了,如下图所示。

    然后,我们像C语言一样生成编译文件,最终结果如下:

    接下来,我们修改代码如下,并运行以下代码。

    #include <stdio.h>
    
    const int N = 16;
    const int blocksize = 16;
    
    // Kernel: thread i adds offset b[i] to character a[i], turning the
    // device copy of "Hello " into "World!".
    __global__  void hello(char *a, int *b)
    {
        a[threadIdx.x] += b[threadIdx.x];
    }
    
    int main()
    {
        char a[N] = "Hello \0\0\0\0\0\0";
        int b[N] = { 15, 10, 6, 0, -11, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
    
        const int csize = N * sizeof(char);
        const int isize = N * sizeof(int);
    
        // Device-side copies of the two host arrays.
        char *devText;
        int *devOffsets;
    
        printf("%s", a);
    
        cudaMalloc((void**)&devText, csize);
        cudaMalloc((void**)&devOffsets, isize);
        cudaMemcpy(devText, a, csize, cudaMemcpyHostToDevice);
        cudaMemcpy(devOffsets, b, isize, cudaMemcpyHostToDevice);
    
        // One block of `blocksize` threads: one thread per character.
        dim3 threadsPerBlock(blocksize, 1);
        dim3 numBlocks(1, 1);
        hello<<<numBlocks, threadsPerBlock>>>(devText, devOffsets);
    
        // Blocking device-to-host copy; also synchronizes with the kernel.
        cudaMemcpy(a, devText, csize, cudaMemcpyDeviceToHost);
        cudaFree(devText);
        cudaFree(devOffsets);
    
        printf("%s\n", a);
        return EXIT_SUCCESS;
    }
    
    

    编译运行成功,最终结果如下:


    运行以下代码,查看电脑设备。

    /*********************************************/
    
    #include <stdio.h>
    #include <cuda_runtime.h>
    
    // Print the key hardware properties of a CUDA device.
    // Fix: totalGlobalMem, sharedMemPerBlock, memPitch, totalConstMem and
    // textureAlignment are size_t fields in cudaDeviceProp, so they must be
    // printed with %zu — %d reads only 32 bits and garbles them on 64-bit builds.
    void printDeviceProp(const cudaDeviceProp &prop)
    {
        printf("Device Name : %s.\n", prop.name);
        printf("totalGlobalMem : %zu.\n", prop.totalGlobalMem);
        printf("sharedMemPerBlock : %zu.\n", prop.sharedMemPerBlock);
        printf("regsPerBlock : %d.\n", prop.regsPerBlock);
        printf("warpSize : %d.\n", prop.warpSize);
        printf("memPitch : %zu.\n", prop.memPitch);
        printf("maxThreadsPerBlock : %d.\n", prop.maxThreadsPerBlock);
        printf("maxThreadsDim[0 - 2] : %d %d %d.\n", prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
        printf("maxGridSize[0 - 2] : %d %d %d.\n", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
        printf("totalConstMem : %zu.\n", prop.totalConstMem);
        printf("major.minor : %d.%d.\n", prop.major, prop.minor);
        printf("clockRate : %d.\n", prop.clockRate);   // GPU clock frequency (kHz)
        printf("textureAlignment : %zu.\n", prop.textureAlignment);
        printf("deviceOverlap : %d.\n", prop.deviceOverlap);
        printf("multiProcessorCount : %d.\n", prop.multiProcessorCount);
    }
    
    // Find the first CUDA device with compute capability >= 1.x, print its
    // properties, and select it as the current device.
    // Returns false (after a message on stderr) when no usable device exists.
    bool InitCUDA()
    {
        //used to count the device numbers
        int count;
    
        // query how many CUDA-capable devices are present
        cudaGetDeviceCount(&count);  
        if (count == 0) {
            fprintf(stderr, "There is no device.\n");
            return false;
        }
    
        // find the device >= 1.X
        int i;
        for (i = 0; i < count; ++i) {
            cudaDeviceProp prop;
            if (cudaGetDeviceProperties(&prop, i) == cudaSuccess) {  // fetch the properties of device i
                if (prop.major >= 1) {
                    printDeviceProp(prop);
                    break;
                }
            }
        }
    
        // if can't find the device
        if (i == count) {
            fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
            return false;
        }
    
        // set cuda device
        cudaSetDevice(i);
    
        return true;
    }
    
    int main(int argc, char const *argv[])
    {
        // InitCUDA() selects the first compute-capability >= 1.x device and
        // prints its properties; report success only when that worked.
        if (!InitCUDA()) {
            return 0;
        }
        printf("CUDA initialized.\n");
        return 0;
    }
    
    

    程序运行结果:


    查看当前系统上安装的GPU设备的详细参数

    打开文件夹C:\ProgramData\NVIDIA Corporation\CUDA Samples\v9.2\1_Utilities
    其中有一个名为deviceQuery的程序,可编译执行,即可打出当前系统上安装的GPU设备的详细参数。结果如下:

    CUDA 初始化

    /*
    Minimal CUDA initialization: pick the first device with compute
    capability >= 1.x and make it the current device.
    */
    
    #include "cuda_runtime.h"
    #include <stdio.h>
    
    // CUDA initialization
    bool InitCUDA(){
        int count;
        cudaGetDeviceCount(&count);// number of CUDA-capable devices present
        if (count == 0) {
            fprintf(stderr, "There is no device.\n");
            return false;
        }
        int i;
        for (i = 0; i < count; i++) {
            cudaDeviceProp prop;
            if (cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
                // accept the first device with compute capability >= 1.x
                if (prop.major >= 1) {
                    break;
                }
            }
        }
        if (i == count) {
            fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
            return false;
        }
        cudaSetDevice(i);
        return true;
    }
    
    int main(){ 
        if (!InitCUDA()) {
            return 0;
        }// CUDA initialization
        printf("CUDA initialized.\n");
        return 0;
    }
    
    
    运行得到结果:

    说明系统上有支持CUDA的装置。

    完整的向量点积CUDA程序

    /*
    Complete vector dot-product CUDA program
    a=[a1,a2,…an], b=[b1,b2,…bn]
    a*b=a1*b1+a2*b2+…+an*bn
    */
    #include "cuda_runtime.h"
    #include "device_launch_parameters.h"
    #include "device_functions.h"
    #include <cuda.h> 
    #include <stdio.h>
    #include <stdlib.h>
    #include <malloc.h>
    #define N 10
    __global__ void Dot(int *a, int *b, int *c) // kernel: one thread per vector element
    {
        __shared__ int temp[N]; // per-block scratch array in shared memory
        temp[threadIdx.x] = a[threadIdx.x] * b[threadIdx.x];
        __syncthreads();  // all partial products must be written before thread 0 reads them
                          // (the red underline in VS is an IntelliSense glitch, not an error)
        if (0 == threadIdx.x)
        {
            // thread 0 reduces the N partial products into *c
            int sum = 0;
            for (int i = 0; i < N; i++)
                sum += temp[i];
            *c = sum;
            printf("sum Calculated on Device:%d\n", *c);
        }
    }
    // Fill a[0..n-1] with pseudo-random values in [0, 9].
    void random_ints(int *a, int n)
    {
        for (int i = 0; i < n; i++)
            *(a + i) = rand() % 10;
    }
    int main()
    {
        int *a, *b, *c;        // host buffers
        int *d_a, *d_b, *d_c;  // device buffers
        int size = N * sizeof(int);
        cudaMalloc((void **)&d_a, size);
        cudaMalloc((void **)&d_b, size);
        cudaMalloc((void **)&d_c, sizeof(int));
        a = (int *)malloc(size); random_ints(a, N);
        b = (int *)malloc(size); random_ints(b, N);
        c = (int *)malloc(sizeof(int));
        printf("Array a[N]:\n");
        for (int i = 0; i < N; i++) printf("%d ", a[i]);
        printf("\n");
        printf("Array b[N]:\n");
        for (int i = 0; i < N; i++) printf("%d ", b[i]);
        printf("\n\n");
        cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice);
        cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice);
        Dot << <1, N >> > (d_a, d_b, d_c); // single block, N threads
        cudaMemcpy(c, d_c, sizeof(int), cudaMemcpyDeviceToHost); // blocking copy also syncs with the kernel
        // reference result computed on the host for comparison
        int sumHost = 0;
        for (int i = 0; i < N; i++)
            sumHost += a[i] * b[i];
        printf("sum Calculated on Host=%d\n", sumHost);
        printf("Device to Host: a*b=%d\n", *c);
        free(a); free(b); free(c);
        cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
        return 0;
    }
    
    运行结果如下:

    计算随机数向量立方和的CUDA程序

    /*
    CUDA program that computes the sum of cubes of a vector of random numbers
    */
    
    #include "cuda_runtime.h"
    #include "device_launch_parameters.h"
    #include "device_functions.h"
    #include <cuda.h> 
    #include <stdio.h>
    #include <stdlib.h>  // for rand()
    #include <malloc.h>
    #define DATA_SIZE 1048576 // number of input elements
    int data[DATA_SIZE];
    // Fill number[0..size-1] with random values in [0, 9].
    void GenerateNumbers(int *number, int size)
    {
        for (int i = 0; i < size; i++) {
            number[i] = rand() % 10;  // % 10 keeps each value in 0..9 
        }
    }
    
    // CUDA initialization: select the first device with compute capability >= 1.x.
    bool InitCUDA()
    {
        int count;
        // number of CUDA-capable devices present
        cudaGetDeviceCount(&count);
        if (count == 0) {
            fprintf(stderr, "There is no device.\n");
            return false;
        }
        int i;
        for (i = 0; i < count; i++) {
            cudaDeviceProp prop;
            if (cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
                if (prop.major >= 1) {
                    break;
                }
            }
        }
        if (i == count) {
            fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
            return false;
        }
        cudaSetDevice(i);
        return true;
    }
    // __global__ function (runs on the GPU): sum of cubes, computed by a single thread
    __global__ static void sumOfcubes(int *num, int* result)
    {
        int sum = 0;
        int i;
        for (i = 0; i < DATA_SIZE; i++) {
            sum += num[i] * num[i] * num[i];
        }
        *result = sum;
    }
    int main()
    { // CUDA initialization
        if (!InitCUDA()) {
            return 0;
        }
        // generate the random input
        GenerateNumbers(data, DATA_SIZE);
        int* gpudata, *result;  // device-side buffers
        int gpusum = 0; // host-side variable that receives the GPU sum
        cudaMalloc((void**)&gpudata, sizeof(int)* DATA_SIZE);
        cudaMalloc((void**)&result, sizeof(int));
    
        
        cudaMemcpy(gpudata, data, sizeof(int)* DATA_SIZE, cudaMemcpyHostToDevice); // copy the random input from host to device
        sumOfcubes <<<1,1,0>>> (gpudata, result); // launch with 1 block x 1 thread; gpudata/result are device pointers
        cudaMemcpy(&gpusum, result, sizeof(int), cudaMemcpyDeviceToHost);  // copy the GPU result back (blocking, so it also syncs)
        cudaFree(gpudata);
        cudaFree(result);
        printf("GPUsum: %d \n", gpusum);
        // reference result computed on the CPU
        int sum = 0;
        for (int i = 0; i < DATA_SIZE; i++) {
            sum += data[i] * data[i] * data[i];
        }
        printf("CPUsum: %d \n", sum);
        return 0;
    }
    
    
    运行结果如下:

    计算随机数向量立方和的CUDA程序(加入统计时间的函数)

    /*
    CUDA program that computes the sum of cubes of a random vector,
    timing the kernel with the on-device clock() counter
    */
    
    #include "cuda_runtime.h"
    #include "device_launch_parameters.h"
    #include "device_functions.h"
    #include <time.h>  // for clock_t
    #include <cuda.h> 
    #include <stdio.h>
    #include <stdlib.h>  // for rand()
    #include <malloc.h>
    #define DATA_SIZE 1048576 // number of input elements
    int data[DATA_SIZE];
    // Fill number[0..size-1] with random values in [0, 9].
    void GenerateNumbers(int *number, int size)
    {
        for (int i = 0; i < size; i++) {
            number[i] = rand() % 10;  // % 10 keeps each value in 0..9 
        }
    }
    
    // CUDA initialization: select the first device with compute capability >= 1.x.
    bool InitCUDA()
    {
        int count;
        // number of CUDA-capable devices present
        cudaGetDeviceCount(&count);
        if (count == 0) {
            fprintf(stderr, "There is no device.\n");
            return false;
        }
        int i;
        for (i = 0; i < count; i++) {
            cudaDeviceProp prop;
            if (cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
                if (prop.major >= 1) {
                    break;
                }
            }
        }
        if (i == count) {
            fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
            return false;
        }
        cudaSetDevice(i);
        return true;
    }
    // __global__ function (runs on the GPU): single-thread sum of cubes,
    // reporting the elapsed device clock ticks through *time
    
    __global__ static void sumOfcubes(int *num, int *result, clock_t *time)
    {
        int sum = 0;
        int i;
        clock_t start = clock();
        for (i = 0; i < DATA_SIZE; i++) {
            sum += num[i] * num[i] * num[i];
        }
        *result = sum;
        *time = clock() - start;
    }
    
    int main()
    { // CUDA initialization
        if (!InitCUDA()) {
            return 0;
        }
        // generate the random input
        GenerateNumbers(data, DATA_SIZE);
        int* gpudata, *result;  // device-side buffers
        int gpusum = 0; // host-side variable that receives the GPU sum
    
        clock_t* time;      // device-side clock-tick counter
        clock_t time_used;  // host-side copy of the elapsed ticks
    
        cudaMalloc((void**)&gpudata, sizeof(int)* DATA_SIZE);
        cudaMalloc((void**)&result, sizeof(int));
        cudaMalloc((void**)&time, sizeof(clock_t));
        
        cudaMemcpy(gpudata, data, sizeof(int)* DATA_SIZE, cudaMemcpyHostToDevice); // copy the random input from host to device
        sumOfcubes <<<1,1,0>>> (gpudata, result,time); // launch with 1 block x 1 thread
        cudaMemcpy(&gpusum, result, sizeof(int), cudaMemcpyDeviceToHost);  // copy the GPU result back to the host
        cudaMemcpy(&time_used, time, sizeof(clock_t), cudaMemcpyDeviceToHost); 
        
        cudaFree(gpudata);
        cudaFree(result);
        cudaFree(time);
    
        printf("GPUsum: %d——time:%d\n", gpusum, time_used);
    
        // reference result computed on the CPU
        int sum = 0;
        for (int i = 0; i < DATA_SIZE; i++) {
            sum += data[i] * data[i] * data[i];
        }
        printf("CPUsum: %d \n", sum);
        return 0;
    }
    
    结果如下:

    GTX 1050上运行核函数部分的时间约为:
    1992949251/ 1493000 = 1334.8 mS 其中1493000是GPU频率。
    重要!!!
    也就是clock() 统计出来的 (end - start) / GPU频率 = 毫秒级单位的时间。其中 GPU 频率可以通过 cudaGetDeviceProperties 返回的 clockRate 字段获取,单位是千赫兹,比如我的GTX1050 的频率是 1493000千赫兹。

    进一步优化上述求立方和的程序(利用单block多thread)

    /*
    CUDA program that computes the sum of cubes of a random vector
    using a single block with THREAD_NUM threads
    */
    
    #include "cuda_runtime.h"
    #include <time.h>  // for clock_t
    #include <stdio.h>
    #include <stdlib.h>  // for rand()
    
    //#include "device_launch_parameters.h"
    //#include "device_functions.h"
    //#include <cuda.h> 
    //#include <malloc.h>
    
    #define DATA_SIZE 1048576 // number of input elements
    #define THREAD_NUM 256  // number of threads
    int data[DATA_SIZE];
    
    // Fill number[0..size-1] with random values in [0, 9].
    void GenerateNumbers(int *number, int size){
        for (int i = 0; i < size; i++) {
            number[i] = rand() % 10;  // % 10 keeps each value in 0..9 
        }
    }
    
    // CUDA initialization: select the first device with compute capability >= 1.x.
    bool InitCUDA(){
        int count;
        // number of CUDA-capable devices present
        cudaGetDeviceCount(&count);
        if (count == 0) {
            fprintf(stderr, "There is no device.\n");
            return false;
        }
        int i;
        for (i = 0; i < count; i++) {
            cudaDeviceProp prop;
            if (cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
                if (prop.major >= 1) {
                    break;
                }
            }
        }
        if (i == count) {
            fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
            return false;
        }
        cudaSetDevice(i);
        return true;
    }
    // __global__ function (runs on the GPU): each thread sums the cubes of a
    // contiguous chunk of the input and writes its partial sum to result[tid]
    
    __global__ static void sumOfcubes(int *num, int *result, clock_t *time){
        const int tid = threadIdx.x;
        const int size = DATA_SIZE / THREAD_NUM;// elements each thread must process
        int sum = 0;
        int i;
    
        clock_t start; // device clock value at the start of the computation 
    
        if (tid == 0) start = clock(); // only thread 0 (threadIdx.x == 0) records the time
        for (i = tid * size; i < (tid + 1) * size; i++){
            sum += num[i] * num[i] * num[i];
        }
        result[tid] = sum;
        // timing is done only by thread 0 (threadIdx.x == 0)
        if (tid == 0) *time = clock() - start;
    }
    
    // Host driver: generates the input, launches the single-block kernel,
    // reduces the THREAD_NUM partial sums on the CPU, and compares against a
    // CPU reference.
    // Fix: the original declared an unused local `clock_t time_used;` that was
    // shadowed in purpose by `time_use` (the variable actually copied into) —
    // the dead declaration is removed.
    int main(){ 
        if (!InitCUDA()) {return 0;}// CUDA initialization
        GenerateNumbers(data, DATA_SIZE); // generate the random input
        int* gpudata, *result;   // device-side buffers
        int gpusum[THREAD_NUM];  // host-side copy of the per-thread partial sums
        clock_t* time;           // device-side clock-tick counter
    
        cudaMalloc((void**)&gpudata, sizeof(int)* DATA_SIZE);
        cudaMalloc((void**)&time, sizeof(clock_t));
        cudaMalloc((void**)&result, sizeof(int)*THREAD_NUM);
    
        // copy the random input from host memory to device memory
        cudaMemcpy(gpudata, data, sizeof(int)* DATA_SIZE, cudaMemcpyHostToDevice); 
        // launch the kernel: 1 block x THREAD_NUM threads, no dynamic shared memory
        sumOfcubes<<<1,THREAD_NUM,0>>> (gpudata, result,time);
    
        clock_t time_use;  // host-side copy of the elapsed device ticks
        
        // copy the partial sums and the elapsed clock count back to the host
        // (blocking copies, so they also synchronize with the kernel)
        cudaMemcpy(&gpusum, result, sizeof(int) * THREAD_NUM, cudaMemcpyDeviceToHost);
        cudaMemcpy(&time_use, time, sizeof(clock_t), cudaMemcpyDeviceToHost);
        
        cudaFree(gpudata);
        cudaFree(result);
        cudaFree(time);
    
        int final_sum = 0; /* reduce the per-thread partial sums on the CPU */
        for (int i = 0; i < THREAD_NUM; i++){
            final_sum += gpusum[i];
        }
        printf("GPUsum: %d gputime: %d\n", final_sum, time_use);
    
        // reference result computed on the CPU
        final_sum = 0;
        for (int i = 0; i < DATA_SIZE; i++) {
            final_sum += data[i] * data[i] * data[i];
        }
        printf("CPUsum: %d \n", final_sum);
        return 0;
    }
    
    
    结果如下:

    然而,上述程序还能够优化,上述代码:

        if (tid == 0) start = clock(); //只在thread 0(即threadIdx.x = 0 的时候)进行记录
        for (i = tid * size; i < (tid + 1) * size; i++){
            sum += num[i] * num[i] * num[i];
        }
        result[tid] = sum;
        //计算时间的动作,只在thread 0(即threadIdx.x = 0 的时候)进行
        if (tid == 0) *time = clock() - start;
    }
    

    中核函数并不是连续存取的,读取数字完全是跳跃式的读取,这会非常影响内存的存取效率。因此我们下一步要将取数字的过程变成:thread 0 读取第一个数字,thread 1 读取第二个数字,以此类推......
    程序部分修改为:

        if (tid == 0) start = clock(); //只在thread 0(即threadIdx.x = 0 的时候)进行记录
        for (i = tid; i < DATA_SIZE; i+= THREAD_NUM){
            sum += num[i] * num[i] * num[i];
        }
        result[tid] = sum;
        //计算时间的动作,只在thread 0(即threadIdx.x = 0 的时候)进行
        if (tid == 0) *time = clock() - start;
    
    实验运行结果:

    ???问题来了,老师上课的时候明明说,这样的优化会使得程序运行时间缩短很多倍,但是为啥我的运行时间结果没啥大的改变呢。8746726/8365217=1.04,几乎就是没有改进嘛.....难道是程序出现了问题,于是,我就将两个程序放在服务器上运行了。结果显示 1103530/185263=5.96,有近6倍的提升,说明这样的优化程序是正确的,只是在VS2017的编译运行中没有体现出来,可这又是为啥?



    经过一番查询资料,发现VS2017中我在编译运行的时候用的是debug模式,其实需要改成Release模式。
    debug.png Release.png

    修改完编译器中的模式之后,发现结果果然不一样了,1060761/166149 = 6.38,说明优化后的程序有很大的提升,因此,得出一个结果,VS中的debug模式只能用来修改程序中的错误,但是不能说明真正的程序性能的好坏,谨记,要改为Release模式。


    优化前.png 优化后.png

    更进一步的优化,多block多thread

    CUDA不仅提供了Thread,还提供了Grid和Block以及Share Memory这些非常重要的机制,每个block中Thread极限是1024,但是通过block和Grid,线程的数量还能成倍增长,甚至用几万个线程。
    /*
    CUDA program that computes the sum of cubes of a random vector
    using BLOCK_NUM blocks of THREAD_NUM threads each
    */
    #include "cuda_runtime.h"
    #include <time.h>  // for clock_t
    #include <stdio.h>
    #include <stdlib.h>  // for rand()
    
    //#include "device_launch_parameters.h"
    //#include "device_functions.h"
    //#include <cuda.h> 
    //#include <malloc.h>
    
    #define DATA_SIZE 1048576 // number of input elements
    #define THREAD_NUM 256 // threads per block
    #define BLOCK_NUM  32// number of blocks 
    // 32 blocks x 256 threads each = 8192 threads in total.
    int data[DATA_SIZE];
    
    // Fill number[0..size-1] with random values in [0, 9].
    void GenerateNumbers(int *number, int size) {
        for (int i = 0; i < size; i++) {
            number[i] = rand() % 10;  // % 10 keeps each value in 0..9 
        }
    }
    
    // CUDA initialization: select the first device with compute capability >= 1.x.
    bool InitCUDA() {
        int count;
        cudaGetDeviceCount(&count);// number of CUDA-capable devices present
        if (count == 0) {
            fprintf(stderr, "There is no device.\n");
            return false;
        }
        int i;
        for (i = 0; i < count; i++) {
            cudaDeviceProp prop;
            if (cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
                if (prop.major >= 1) {
                    break;
                }
            }
        }
        if (i == count) {
            fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
            return false;
        }
        cudaSetDevice(i);
        return true;
    }
    // __global__ function (runs on the GPU): each of the BLOCK_NUM*THREAD_NUM
    // threads accumulates an interleaved slice of the input
    
    __global__ static void sumOfcubes(int *num, int *result, clock_t *time) {
        const int tid = threadIdx.x;
        const int bid = blockIdx.x;
        int sum = 0;
        int i;
    
        // only thread 0 records times; each block logs its own start and end
        if (tid == 0) time[bid] = clock();
        // the element index combines bid and tid; the BLOCK_NUM*THREAD_NUM
        // stride keeps accesses by adjacent threads contiguous in memory
        for (i = bid * THREAD_NUM + tid; i < DATA_SIZE; i += BLOCK_NUM * THREAD_NUM) {
            sum += num[i] * num[i] * num[i];
        }
        // one partial result per thread, so `result` holds THREAD_NUM*BLOCK_NUM entries
        result[bid * THREAD_NUM + tid] = sum;
        // only thread 0 records times; each block logs its own start and end
        if (tid == 0) time[bid + BLOCK_NUM] = clock();
    }
    
    int main() {
        if (!InitCUDA()) { return 0; }// CUDA initialization
        GenerateNumbers(data, DATA_SIZE); // generate the random input
        int *gpudata, *result;  // device-side buffers
        clock_t *time;          // device-side per-block start/end clock values
    
        cudaMalloc((void**)&gpudata, sizeof(int)* DATA_SIZE);
        cudaMalloc((void**)&time, sizeof(clock_t)*BLOCK_NUM * 2);
        cudaMalloc((void**)&result, sizeof(int)*THREAD_NUM*BLOCK_NUM);
    
        // copy the input from host memory to device memory
        cudaMemcpy(gpudata, data, sizeof(int)* DATA_SIZE, cudaMemcpyHostToDevice);
        // launch the kernel: BLOCK_NUM blocks x THREAD_NUM threads
        sumOfcubes << <BLOCK_NUM, THREAD_NUM, 0 >> > (gpudata, result, time); // gpudata, result, time are device pointers
    
        int gpusum[THREAD_NUM*BLOCK_NUM];  // host-side copy of the per-thread partial sums
        clock_t time_use[BLOCK_NUM * 2];   // host-side copy of the per-block timestamps
    
        // copy the results from device memory back to host memory
        cudaMemcpy(&gpusum, result, sizeof(int) * THREAD_NUM * BLOCK_NUM, cudaMemcpyDeviceToHost);
        cudaMemcpy(&time_use, time, sizeof(clock_t)*BLOCK_NUM * 2, cudaMemcpyDeviceToHost);
    
        cudaFree(gpudata);
        cudaFree(result);
        cudaFree(time);
    
        int final_sum = 0; /* reduce the partial sums on the CPU */
        for (int i = 0; i < THREAD_NUM * BLOCK_NUM; i++) {
            final_sum += gpusum[i];
        }
    
        // timing strategy: earliest block start minus latest block end gives
        // the total kernel running time
        clock_t min_start, max_end;
        min_start = time_use[0];
        max_end = time_use[BLOCK_NUM];
        for (int i = 1; i < BLOCK_NUM; i++) {  // scan all blocks for the earliest start and latest end
            if (min_start > time_use[i])
                min_start = time_use[i];
            if (max_end < time_use[i + BLOCK_NUM])
                max_end = time_use[i + BLOCK_NUM];
        }
        printf("GPUsum: %d gputime: %ld\n", final_sum, max_end - min_start);
    
    
        // reference result computed on the CPU
        final_sum = 0;
        for (int i = 0; i < DATA_SIZE; i++) {
            final_sum += data[i] * data[i] * data[i];
        }
        printf("CPUsum: %d \n", final_sum);
        return 0;
    }
    
    会出现以下错误: error.png

    解决方案:release模式会优化很多东西,但是多重新编译几次,又会成功,真是搞笑,搞不懂这些编译器......... 算了,还是建议去服务器上运行吧,VS中实在有太多无法未知的错误了。

    程序运行结果: Release_threads256_blocks32.png

    后来修改 THREAD_NUM = 1024 , BLOCK_NUM = 128 ,总共 1024 * 128 = 131072 个threads,性能并没有很大提升。

     #define THREAD_NUM 1024  //thread 数量
     #define BLOCK_NUM 128 // block 数量 
    
    Release_threads1024_blocks128.png

    为什么不用极限的1024个线程呢?

    • 如果1024个线程全部用上,那样就是32*1024 = 32768 个线程,难道不是更好吗?从线程运行的原理来看,线程数量达到一定大小后,再一味地增加线程也不会取得性能的提升,反而有可能会让性能下降。线程数达到隐藏各种latency的程度后,之后线程数量的提升就没有太大的作用了。
    • 程序中加和部分是在CPU上进行的,越多的线程意味着越多的结果,而这也意味着CPU上的运算压力会越来越大。

    Thread 的同步

    一个block内的所有的 thread 可以有共享的内存,也可以进行同步,因此,还可以让每个block 内所有 thread 将自己的计算结果相加起来。代码可以进一步修改:

    // Sum-of-cubes with a per-block shared-memory reduction: each block's
    // threads accumulate into shared memory, synchronize, and thread 0 folds
    // the block's partials into a single value.
    #include "cuda_runtime.h"
    #include <time.h>  // for clock_t
    #include <stdio.h>
    #include <stdlib.h>  // for rand()
    
    //#include "device_launch_parameters.h"
    //#include "device_functions.h"
    //#include <cuda.h> 
    //#include <malloc.h>
    
    #define DATA_SIZE 1048576 // number of input elements
    #define THREAD_NUM 256 // threads per block
    #define BLOCK_NUM 32// number of blocks 
    // 32 blocks x 256 threads each = 8192 threads in total.
    int data[DATA_SIZE];
    
    // Fill number[0..size-1] with random values in [0, 9].
    void GenerateNumbers(int *number, int size) {
        for (int i = 0; i < size; i++) {
            number[i] = rand() % 10;  // % 10 keeps each value in 0..9 
        }
    }
    
    // CUDA initialization: select the first device with compute capability >= 1.x.
    bool InitCUDA() {
        int count;
        cudaGetDeviceCount(&count);// number of CUDA-capable devices present
        if (count == 0) {
            fprintf(stderr, "There is no device.\n");
            return false;
        }
        int i;
        for (i = 0; i < count; i++) {
            cudaDeviceProp prop;
            if (cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
                if (prop.major >= 1) {
                    break;
                }
            }
        }
        if (i == count) {
            fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
            return false;
        }
        cudaSetDevice(i);
        return true;
    }
    // __global__ 函数(GPU上执行) 计算立方和
    
    // __global__ function (runs on the GPU): per-block sum of cubes with a
    // shared-memory reduction.
    //
    // Launch configuration: <<<BLOCK_NUM, THREAD_NUM, sizeof(int) * THREAD_NUM>>>
    // (the third argument supplies the dynamic shared array `shared`).
    // Each block reduces its THREAD_NUM partial sums into shared[0] and then
    // publishes ONE value, so `result` must hold at least BLOCK_NUM ints.
    // time[bid] / time[bid + BLOCK_NUM] receive each block's start/end clock().
    __global__ static void sumOfcubes(int *num, int *result, clock_t *time) {
        extern __shared__ int shared[];  // THREAD_NUM ints of dynamic shared memory
        const int tid = threadIdx.x;
        const int bid = blockIdx.x;
        int i;
    
        // only thread 0 records times; each block logs its own start and end
        if (tid == 0) time[bid] = clock(); 
        shared[tid] = 0;
        // the element index combines bid and tid; the BLOCK_NUM*THREAD_NUM
        // stride keeps accesses by adjacent threads contiguous in memory
        for (i = bid * THREAD_NUM + tid; i < DATA_SIZE; i += BLOCK_NUM * THREAD_NUM) {
            shared[tid] += num[i] * num[i] * num[i];
        }
    
    
        // every thread's partial sum must be in shared[] before the reduction
        __syncthreads();
        if (tid == 0) {
            for (i = 1; i < THREAD_NUM; i++) {
                shared[0] += shared[i];
            }
            // BUG FIX: was `result[tid]` — inside this branch tid is always 0,
            // so every block overwrote result[0] and result[1..BLOCK_NUM-1]
            // stayed uninitialized while the host summed all BLOCK_NUM slots.
            // Each block must publish its sum in its own slot.
            result[bid] = shared[0];
        }
        if (tid == 0) time[bid + BLOCK_NUM] = clock();
    }
    
    // Host driver: launches the shared-memory kernel (one dynamic shared int
    // per thread), reduces the BLOCK_NUM per-block sums on the CPU, and
    // compares against a CPU reference.
    int main() {
        if (!InitCUDA()) { return 0; }// CUDA initialization
        GenerateNumbers(data, DATA_SIZE); // generate the random input
        int *gpudata, *result;  // device-side buffers
        clock_t *time;          // device-side per-block start/end clock values
    
        cudaMalloc((void**)&gpudata, sizeof(int)* DATA_SIZE);
        cudaMalloc((void**)&time, sizeof(clock_t)*BLOCK_NUM * 2);
        cudaMalloc((void**)&result, sizeof(int)*BLOCK_NUM);
    
        cudaMemcpy(gpudata, data, sizeof(int)* DATA_SIZE, cudaMemcpyHostToDevice);
        // launch the kernel; the third <<<>>> argument is the dynamic
        // shared-memory size: one int per thread
        sumOfcubes << <BLOCK_NUM, THREAD_NUM, sizeof(int)* THREAD_NUM >> > (gpudata, result, time); // gpudata, result, time are device pointers
    
        int gpusum[BLOCK_NUM];  // host-side copy of the per-block sums
        clock_t time_use[BLOCK_NUM * 2];  // host-side copy of the per-block timestamps
    
        cudaMemcpy(&gpusum, result, sizeof(int) * BLOCK_NUM, cudaMemcpyDeviceToHost);
        cudaMemcpy(&time_use, time, sizeof(clock_t)*BLOCK_NUM * 2, cudaMemcpyDeviceToHost);
    
        cudaFree(gpudata);
        cudaFree(result);
        cudaFree(time);
    
        int final_sum = 0; /* reduce the per-block sums on the CPU */
        for (int i = 0; i <BLOCK_NUM; i++) {
            final_sum += gpusum[i];
        }
    
        // timing strategy: earliest block start minus latest block end gives
        // the total kernel running time
        clock_t min_start, max_end;
        min_start = time_use[0];
        max_end = time_use[BLOCK_NUM];
        for (int i = 1; i < BLOCK_NUM; i++) {  // scan all blocks for the earliest start and latest end
            if (min_start > time_use[i])
                min_start = time_use[i];
            if (max_end < time_use[i + BLOCK_NUM])
                max_end = time_use[i + BLOCK_NUM];
        }
        printf("GPUsum: %d gputime: %ld\n", final_sum, max_end - min_start);
    
    
        // reference result computed on the CPU
        final_sum = 0;
        for (int i = 0; i < DATA_SIZE; i++) {
            final_sum += data[i] * data[i] * data[i];
        }
        printf("CPUsum: %d \n", final_sum);
        return 0;
    }
    

    树状加法

    参考:
    http://hpc.pku.edu.cn/docs/20170829223804057249.pdf 深入浅出谈CUDA

    #include "device_functions.h"
    #include "cuda_runtime.h"
    #include "device_launch_parameters.h"
    #include "device_functions.h"
    #include <stdio.h>
    #include <cuda.h>
    #include <cublas.h>
    
    #define BLOCK_DIM 16
    
    
    /**
     * Computes the squared Euclidean distance matrix between the query points and the reference points.
     *
     * @param ref          reference points stored in the global memory
     * @param ref_width    number of reference points
     * @param ref_pitch    pitch of the reference points array in number of columns
     * @param query        query points stored in the global memory
     * @param query_width  number of query points
     * @param query_pitch  pitch of the query points array in number of columns
     * @param height       dimension of points = height of texture `ref` and of the array `query`
     * @param dist         array containing the query_width x ref_width computed distances
     */
    __global__ void compute_distances(float * ref,
        int     ref_width,
        int     ref_pitch,
        float * query,
        int     query_width,
        int     query_pitch,
        int     height,
        float * dist) {
    
        // Declaration of the shared memory arrays As and Bs used to store the sub-matrix of A and B
        __shared__ float shared_A[BLOCK_DIM][BLOCK_DIM];
        __shared__ float shared_B[BLOCK_DIM][BLOCK_DIM];
    
        // Sub-matrix of A (begin, step, end) and Sub-matrix of B (begin, step)
        __shared__ int begin_A;
        __shared__ int begin_B;
        __shared__ int step_A;
        __shared__ int step_B;
        __shared__ int end_A;
    
        // Thread index
        int tx = threadIdx.x;
        int ty = threadIdx.y;
    
        // Initialization of the SSD (sum of squared differences) for the current thread
        float ssd = 0.f;
    
        // Loop parameters
        begin_A = BLOCK_DIM * blockIdx.y;
        begin_B = BLOCK_DIM * blockIdx.x;
        step_A = BLOCK_DIM * ref_pitch;
        step_B = BLOCK_DIM * query_pitch;
        end_A = begin_A + (height - 1) * ref_pitch;
    
        // Conditions
        int cond0 = (begin_A + tx < ref_width); // used to write in shared memory
        int cond1 = (begin_B + tx < query_width); // used to write in shared memory, for computations and to write in the output array 
        int cond2 = (begin_A + ty < ref_width); // used for computations and to write in the output matrix
    
        // Loop over all the sub-matrices of A and B required to compute the block sub-matrix
        for (int a = begin_A, b = begin_B; a <= end_A; a += step_A, b += step_B) {
    
            // Load the matrices from device memory to shared memory; each thread loads one element of each matrix
            if (a / ref_pitch + ty < height) {
                shared_A[ty][tx] = (cond0) ? ref[a + ref_pitch * ty + tx] : 0;
                shared_B[ty][tx] = (cond1) ? query[b + query_pitch * ty + tx] : 0;
            }
            else {
                shared_A[ty][tx] = 0;
                shared_B[ty][tx] = 0;
            }
    
            // Synchronize to make sure the matrices are loaded
            __syncthreads();
    
            // Compute the difference between the two matrices; each thread computes one element of the block sub-matrix
            if (cond2 && cond1) {
                for (int k = 0; k < BLOCK_DIM; ++k) {
                    float tmp = shared_A[k][ty] - shared_B[k][tx];
                    ssd += tmp * tmp;
                }
            }
    
            // Synchronize to make sure that the preceding computation is done before loading two new sub-matrices of A and B in the next iteration
            __syncthreads();
        }
    
        // Write the block sub-matrix to device memory; each thread writes one element
        if (cond2 && cond1) {
            dist[(begin_A + ty) * query_pitch + begin_B + tx] = ssd;
        }
    }
    
    
    /**
     * Computes the squared Euclidean distance matrix between the query points and the reference points.
     *
     * @param ref          reference points stored in the texture memory
     * @param ref_width    number of reference points
     * @param query        query points stored in the global memory
     * @param query_width  number of query points
     * @param query_pitch  pitch of the query points array in number of columns
     * @param height       dimension of points = height of texture `ref` and of the array `query`
     * @param dist         array containing the query_width x ref_width computed distances
     */
    __global__ void compute_distance_texture(cudaTextureObject_t ref,
        int                 ref_width,
        float *             query,
        int                 query_width,
        int                 query_pitch,
        int                 height,
        float*              dist) {
        // One thread per (query, reference) pair on a 2D grid.
        unsigned int xIndex = blockIdx.x * blockDim.x + threadIdx.x;
        unsigned int yIndex = blockIdx.y * blockDim.y + threadIdx.y;
        if (xIndex < query_width && yIndex < ref_width) {
            // Accumulate the squared differences over all `height` dimensions.
            float ssd = 0.f;
            for (int i = 0; i < height; i++) {
                float tmp = tex2D<float>(ref, (float)yIndex, (float)i) - query[i * query_pitch + xIndex];
                ssd += tmp * tmp;
            }
            dist[yIndex * query_pitch + xIndex] = ssd;
        }
    }
    
    
    /**
     * For each query point (i.e. each column of the distance matrix) finds the
     * k smallest distances and their respective indexes and gathers them at the
     * top of the 2 arrays.
     *
     * Since we only need to locate the k smallest distances, sorting the entire
     * array would not be very efficient if k is relatively small. Instead, we
     * perform a simple insertion sort that only ever inserts a given distance
     * into the first k values.
     *
     * @param dist         distance matrix
     * @param dist_pitch   pitch of the distance matrix given in number of columns
     * @param index        index matrix
     * @param index_pitch  pitch of the index matrix given in number of columns
     * @param width        width of the distance matrix and of the index matrix
     * @param height       height of the distance matrix
     * @param k            number of values to find
     */
    __global__ void modified_insertion_sort(float * dist,
        int     dist_pitch,
        int *   index,
        int     index_pitch,
        int     width,
        int     height,
        int     k) {

        // Column position handled by this thread (one thread per column)
        unsigned int xIndex = blockIdx.x * blockDim.x + threadIdx.x;

        // Do nothing if we are out of bounds
        if (xIndex < width) {

            // Pointer shift: p_dist / p_index now address column xIndex;
            // element i of the column lives at offset i * pitch
            float * p_dist = dist + xIndex;
            int *   p_index = index + xIndex;

            // The first candidate (row 0) is trivially "sorted"
            p_index[0] = 0;

            // Go through all remaining points of the column
            for (int i = 1; i < height; ++i) {

                // Store current distance and associated index
                float curr_dist = p_dist[i*dist_pitch];
                int   curr_index = i;

                // Skip the current value if its index is >= k and if it is
                // higher than the k-th already-sorted smallest value
                if (i >= k && curr_dist >= p_dist[(k - 1)*dist_pitch]) {
                    continue;
                }

                // Shift values (and indexes) higher than the current distance
                // to the right; only the first k slots are ever maintained
                int j = min(i, k - 1);
                while (j > 0 && p_dist[(j - 1)*dist_pitch] > curr_dist) {
                    p_dist[j*dist_pitch] = p_dist[(j - 1)*dist_pitch];
                    p_index[j*index_pitch] = p_index[(j - 1)*index_pitch];
                    --j;
                }

                // Write the current distance and index at their position
                p_dist[j*dist_pitch] = curr_dist;
                p_index[j*index_pitch] = curr_index;
            }
        }
    }
    
    
    /**
     * Computes the square root of the first k lines of the distance matrix.
     *
     * @param dist   distance matrix
     * @param width  width of the distance matrix
     * @param pitch  pitch of the distance matrix given in number of columns
     * @param k      number of values to consider
     */
    __global__ void compute_sqrt(float * dist, int width, int pitch, int k) {
        unsigned int xIndex = blockIdx.x * blockDim.x + threadIdx.x;
        unsigned int yIndex = blockIdx.y * blockDim.y + threadIdx.y;
        // sqrtf keeps the computation in single precision; the double-precision
        // sqrt overload would force a float->double->float round trip per element.
        if (xIndex < width && yIndex < k)
            dist[yIndex*pitch + xIndex] = sqrtf(dist[yIndex*pitch + xIndex]);
    }
    
    
    /**
     * Computes the squared norm of each column of the input array.
     *
     * @param array   input array
     * @param width   number of columns of `array` = number of points
     * @param pitch   pitch of `array` in number of columns
     * @param height  number of rows of `array` = dimension of the points
     * @param norm    output array containing the squared norm values
     */
    __global__ void compute_squared_norm(float * array, int width, int pitch, int height, float * norm) {
        // One thread per column (i.e. per point)
        const unsigned int col = blockIdx.x * blockDim.x + threadIdx.x;
        if (col >= width)
            return;

        // Sum the squared coordinates of the point down its column
        float acc = 0.f;
        for (int row = 0; row < height; ++row) {
            const float v = array[row * pitch + col];
            acc += v * v;
        }
        norm[col] = acc;
    }
    
    
    /**
     * Add the reference points norm (column vector) to each column of the input array.
     *
     * Precondition: the kernel must be launched with blockDim = (16, 16), matching
     * the fixed size of `shared_vec` (the launch sites in this file use
     * dim3(16, 16, 1)). A larger blockDim.y would overflow the shared array.
     *
     * @param array   input array
     * @param width   number of columns of `array` = number of points
     * @param pitch   pitch of `array` in number of columns
     * @param height  number of rows of `array` = dimension of the points
     * @param norm    reference points norm stored as a column vector
     */
    __global__ void add_reference_points_norm(float * array, int width, int pitch, int height, float * norm) {
        unsigned int tx = threadIdx.x;
        unsigned int ty = threadIdx.y;
        unsigned int xIndex = blockIdx.x * blockDim.x + tx;
        unsigned int yIndex = blockIdx.y * blockDim.y + ty;
        // One norm value per row of the block: loaded once by the first thread
        // of each row and broadcast to the other threads via shared memory.
        __shared__ float shared_vec[16];
        if (tx == 0 && yIndex < height)
            shared_vec[ty] = norm[yIndex];
        // Barrier is outside the conditional, so every thread of the block
        // reaches it; the shared loads are visible before they are read below.
        __syncthreads();
        if (xIndex < width && yIndex < height)
            array[yIndex*pitch + xIndex] += shared_vec[ty];
    }
    
    
    /**
     * Adds the query points norm (row vector) to the k first lines of the input
     * array and computes the square root of the resulting values.
     *
     * @param array   input array
     * @param width   number of columns of `array` = number of points
     * @param pitch   pitch of `array` in number of columns
     * @param k       number of neighbors to consider
     * @param norm    query points norm stored as a row vector
     */
    __global__ void add_query_points_norm_and_sqrt(float * array, int width, int pitch, int k, float * norm) {
        unsigned int xIndex = blockIdx.x * blockDim.x + threadIdx.x;
        unsigned int yIndex = blockIdx.y * blockDim.y + threadIdx.y;
        // sqrtf keeps the computation in single precision; the double-precision
        // sqrt overload would force a float->double->float round trip per element.
        if (xIndex < width && yIndex < k)
            array[yIndex*pitch + xIndex] = sqrtf(array[yIndex*pitch + xIndex] + norm[xIndex]);
    }
    
    
    /**
     * Frees the device buffers used by knn_cuda_global.
     * cudaFree(NULL) is a documented no-op, so this helper is safe to call from
     * any error path, regardless of how many allocations succeeded.
     */
    static void knn_global_free_dev(float * ref_dev, float * query_dev, float * dist_dev, int * index_dev) {
        cudaFree(ref_dev);
        cudaFree(query_dev);
        cudaFree(dist_dev);
        cudaFree(index_dev);
    }

    /**
     * Computes the k nearest neighbors of each query point, keeping the
     * reference points in plain (pitched) global memory.
     *
     * @param ref        reference points: dim rows of ref_nb values (row-major)
     * @param ref_nb     number of reference points
     * @param query      query points: dim rows of query_nb values (row-major)
     * @param query_nb   number of query points
     * @param dim        dimension of the points
     * @param k          number of neighbors to find
     * @param knn_dist   output: k rows of query_nb distances to the k nearest neighbors
     * @param knn_index  output: k rows of query_nb indexes of the k nearest neighbors
     * @return true on success, false on any CUDA error (a message is printed)
     */
    bool knn_cuda_global(const float * ref,
        int           ref_nb,
        const float * query,
        int           query_nb,
        int           dim,
        int           k,
        float *       knn_dist,
        int *         knn_index) {

        // Constants
        const unsigned int size_of_float = sizeof(float);
        const unsigned int size_of_int = sizeof(int);

        // Return variables
        cudaError_t err0, err1, err2, err3;

        // Check that we have at least one CUDA device
        int nb_devices;
        err0 = cudaGetDeviceCount(&nb_devices);
        if (err0 != cudaSuccess || nb_devices == 0) {
            printf("ERROR: No CUDA device found\n");
            return false;
        }

        // Select the first CUDA device as default
        err0 = cudaSetDevice(0);
        if (err0 != cudaSuccess) {
            printf("ERROR: Cannot set the chosen CUDA device\n");
            return false;
        }

        // Allocate pitched global memory so that every row is properly aligned
        float * ref_dev = NULL;
        float * query_dev = NULL;
        float * dist_dev = NULL;
        int   * index_dev = NULL;
        size_t  ref_pitch_in_bytes;
        size_t  query_pitch_in_bytes;
        size_t  dist_pitch_in_bytes;
        size_t  index_pitch_in_bytes;
        err0 = cudaMallocPitch((void**)&ref_dev, &ref_pitch_in_bytes, ref_nb   * size_of_float, dim);
        err1 = cudaMallocPitch((void**)&query_dev, &query_pitch_in_bytes, query_nb * size_of_float, dim);
        err2 = cudaMallocPitch((void**)&dist_dev, &dist_pitch_in_bytes, query_nb * size_of_float, ref_nb);
        err3 = cudaMallocPitch((void**)&index_dev, &index_pitch_in_bytes, query_nb * size_of_int, k);
        if (err0 != cudaSuccess || err1 != cudaSuccess || err2 != cudaSuccess || err3 != cudaSuccess) {
            printf("ERROR: Memory allocation error\n");
            knn_global_free_dev(ref_dev, query_dev, dist_dev, index_dev);
            return false;
        }

        // Deduce pitch values expressed in number of elements
        size_t ref_pitch = ref_pitch_in_bytes / size_of_float;
        size_t query_pitch = query_pitch_in_bytes / size_of_float;
        size_t dist_pitch = dist_pitch_in_bytes / size_of_float;
        size_t index_pitch = index_pitch_in_bytes / size_of_int;

        // The kernels below assume dist and index share the query pitch
        if (query_pitch != dist_pitch || query_pitch != index_pitch) {
            printf("ERROR: Invalid pitch value\n");
            knn_global_free_dev(ref_dev, query_dev, dist_dev, index_dev);
            return false;
        }

        // Copy reference and query data from the host to the device
        err0 = cudaMemcpy2D(ref_dev, ref_pitch_in_bytes, ref, ref_nb * size_of_float, ref_nb * size_of_float, dim, cudaMemcpyHostToDevice);
        err1 = cudaMemcpy2D(query_dev, query_pitch_in_bytes, query, query_nb * size_of_float, query_nb * size_of_float, dim, cudaMemcpyHostToDevice);
        if (err0 != cudaSuccess || err1 != cudaSuccess) {
            printf("ERROR: Unable to copy data from host to device\n");
            knn_global_free_dev(ref_dev, query_dev, dist_dev, index_dev);
            return false;
        }

        // Compute the squared Euclidean distances (grid rounded up to cover the tail)
        dim3 block0(BLOCK_DIM, BLOCK_DIM, 1);
        dim3 grid0(query_nb / BLOCK_DIM, ref_nb / BLOCK_DIM, 1);
        if (query_nb % BLOCK_DIM != 0) grid0.x += 1;
        if (ref_nb   % BLOCK_DIM != 0) grid0.y += 1;
        compute_distances << <grid0, block0 >> > (ref_dev, ref_nb, ref_pitch, query_dev, query_nb, query_pitch, dim, dist_dev);
        if (cudaGetLastError() != cudaSuccess) {
            printf("ERROR: Unable to execute kernel\n");
            knn_global_free_dev(ref_dev, query_dev, dist_dev, index_dev);
            return false;
        }

        // Sort the distances with their respective indexes (one thread per query)
        dim3 block1(256, 1, 1);
        dim3 grid1(query_nb / 256, 1, 1);
        if (query_nb % 256 != 0) grid1.x += 1;
        modified_insertion_sort << <grid1, block1 >> > (dist_dev, dist_pitch, index_dev, index_pitch, query_nb, ref_nb, k);
        if (cudaGetLastError() != cudaSuccess) {
            printf("ERROR: Unable to execute kernel\n");
            knn_global_free_dev(ref_dev, query_dev, dist_dev, index_dev);
            return false;
        }

        // Compute the square root of the k smallest distances
        dim3 block2(16, 16, 1);
        dim3 grid2(query_nb / 16, k / 16, 1);
        if (query_nb % 16 != 0) grid2.x += 1;
        if (k % 16 != 0)        grid2.y += 1;
        compute_sqrt << <grid2, block2 >> > (dist_dev, query_nb, query_pitch, k);
        if (cudaGetLastError() != cudaSuccess) {
            printf("ERROR: Unable to execute kernel\n");
            knn_global_free_dev(ref_dev, query_dev, dist_dev, index_dev);
            return false;
        }

        // Copy k smallest distances / indexes from the device to the host
        err0 = cudaMemcpy2D(knn_dist, query_nb * size_of_float, dist_dev, dist_pitch_in_bytes, query_nb * size_of_float, k, cudaMemcpyDeviceToHost);
        err1 = cudaMemcpy2D(knn_index, query_nb * size_of_int, index_dev, index_pitch_in_bytes, query_nb * size_of_int, k, cudaMemcpyDeviceToHost);
        if (err0 != cudaSuccess || err1 != cudaSuccess) {
            printf("ERROR: Unable to copy data from device to host\n");
            knn_global_free_dev(ref_dev, query_dev, dist_dev, index_dev);
            return false;
        }

        // Memory clean-up
        knn_global_free_dev(ref_dev, query_dev, dist_dev, index_dev);

        return true;
    }
    
    
    /**
     * Frees the device resources used by knn_cuda_texture.
     * cudaFree(NULL) is a documented no-op; the CUDA array and the texture
     * object are released only if they were actually created.
     */
    static void knn_texture_free_dev(float * query_dev, float * dist_dev, int * index_dev,
        cudaArray * ref_array_dev, cudaTextureObject_t ref_tex_dev) {
        cudaFree(query_dev);
        cudaFree(dist_dev);
        cudaFree(index_dev);
        if (ref_array_dev)
            cudaFreeArray(ref_array_dev);
        if (ref_tex_dev)
            cudaDestroyTextureObject(ref_tex_dev);
    }

    /**
     * Computes the k nearest neighbors of each query point, keeping the
     * reference points in texture memory (read through a texture object).
     *
     * @param ref        reference points: dim rows of ref_nb values (row-major)
     * @param ref_nb     number of reference points
     * @param query      query points: dim rows of query_nb values (row-major)
     * @param query_nb   number of query points
     * @param dim        dimension of the points
     * @param k          number of neighbors to find
     * @param knn_dist   output: k rows of query_nb distances to the k nearest neighbors
     * @param knn_index  output: k rows of query_nb indexes of the k nearest neighbors
     * @return true on success, false on any CUDA error (a message is printed)
     */
    bool knn_cuda_texture(const float * ref,
        int           ref_nb,
        const float * query,
        int           query_nb,
        int           dim,
        int           k,
        float *       knn_dist,
        int *         knn_index) {

        // Constants
        unsigned int size_of_float = sizeof(float);
        unsigned int size_of_int = sizeof(int);

        // Return variables
        cudaError_t err0, err1, err2;

        // Check that we have at least one CUDA device
        int nb_devices;
        err0 = cudaGetDeviceCount(&nb_devices);
        if (err0 != cudaSuccess || nb_devices == 0) {
            printf("ERROR: No CUDA device found\n");
            return false;
        }

        // Select the first CUDA device as default
        err0 = cudaSetDevice(0);
        if (err0 != cudaSuccess) {
            printf("ERROR: Cannot set the chosen CUDA device\n");
            return false;
        }

        // Device resources, declared up-front (NULL / 0 = not yet created) so
        // that the cleanup helper can be used from every error path
        float * query_dev = NULL;
        float * dist_dev = NULL;
        int *   index_dev = NULL;
        cudaArray *         ref_array_dev = NULL;
        cudaTextureObject_t ref_tex_dev = 0;

        // Allocate pitched global memory
        size_t  query_pitch_in_bytes;
        size_t  dist_pitch_in_bytes;
        size_t  index_pitch_in_bytes;
        err0 = cudaMallocPitch((void**)&query_dev, &query_pitch_in_bytes, query_nb * size_of_float, dim);
        err1 = cudaMallocPitch((void**)&dist_dev, &dist_pitch_in_bytes, query_nb * size_of_float, ref_nb);
        err2 = cudaMallocPitch((void**)&index_dev, &index_pitch_in_bytes, query_nb * size_of_int, k);
        if (err0 != cudaSuccess || err1 != cudaSuccess || err2 != cudaSuccess) {
            printf("ERROR: Memory allocation error (cudaMallocPitch)\n");
            knn_texture_free_dev(query_dev, dist_dev, index_dev, ref_array_dev, ref_tex_dev);
            return false;
        }

        // Deduce pitch values expressed in number of elements
        size_t query_pitch = query_pitch_in_bytes / size_of_float;
        size_t dist_pitch = dist_pitch_in_bytes / size_of_float;
        size_t index_pitch = index_pitch_in_bytes / size_of_int;

        // The kernels below assume dist and index share the query pitch
        if (query_pitch != dist_pitch || query_pitch != index_pitch) {
            printf("ERROR: Invalid pitch value\n");
            knn_texture_free_dev(query_dev, dist_dev, index_dev, ref_array_dev, ref_tex_dev);
            return false;
        }

        // Copy query data from the host to the device
        err0 = cudaMemcpy2D(query_dev, query_pitch_in_bytes, query, query_nb * size_of_float, query_nb * size_of_float, dim, cudaMemcpyHostToDevice);
        if (err0 != cudaSuccess) {
            printf("ERROR: Unable to copy data from host to device\n");
            knn_texture_free_dev(query_dev, dist_dev, index_dev, ref_array_dev, ref_tex_dev);
            return false;
        }

        // Allocate CUDA array for reference points (one 32-bit float channel)
        cudaChannelFormatDesc channel_desc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
        err0 = cudaMallocArray(&ref_array_dev, &channel_desc, ref_nb, dim);
        if (err0 != cudaSuccess) {
            printf("ERROR: Memory allocation error (cudaMallocArray)\n");
            knn_texture_free_dev(query_dev, dist_dev, index_dev, ref_array_dev, ref_tex_dev);
            return false;
        }

        // Copy reference points from host to device.
        // cudaMemcpy2DToArray replaces the deprecated cudaMemcpyToArray; the
        // source is contiguous, so spitch == row width == ref_nb * size_of_float.
        err0 = cudaMemcpy2DToArray(ref_array_dev, 0, 0, ref, ref_nb * size_of_float, ref_nb * size_of_float, dim, cudaMemcpyHostToDevice);
        if (err0 != cudaSuccess) {
            printf("ERROR: Unable to copy data from host to device\n");
            knn_texture_free_dev(query_dev, dist_dev, index_dev, ref_array_dev, ref_tex_dev);
            return false;
        }

        // Resource descriptor: texture backed by the CUDA array
        struct cudaResourceDesc res_desc;
        memset(&res_desc, 0, sizeof(res_desc));
        res_desc.resType = cudaResourceTypeArray;
        res_desc.res.array.array = ref_array_dev;

        // Texture descriptor: point sampling, unnormalized coordinates
        struct cudaTextureDesc tex_desc;
        memset(&tex_desc, 0, sizeof(tex_desc));
        tex_desc.addressMode[0] = cudaAddressModeClamp;
        tex_desc.addressMode[1] = cudaAddressModeClamp;
        tex_desc.filterMode = cudaFilterModePoint;
        tex_desc.readMode = cudaReadModeElementType;
        tex_desc.normalizedCoords = 0;

        // Create the texture
        err0 = cudaCreateTextureObject(&ref_tex_dev, &res_desc, &tex_desc, NULL);
        if (err0 != cudaSuccess) {
            printf("ERROR: Unable to create the texture\n");
            knn_texture_free_dev(query_dev, dist_dev, index_dev, ref_array_dev, ref_tex_dev);
            return false;
        }

        // Compute the squared Euclidean distances (grid rounded up to cover the tail)
        dim3 block0(16, 16, 1);
        dim3 grid0(query_nb / 16, ref_nb / 16, 1);
        if (query_nb % 16 != 0) grid0.x += 1;
        if (ref_nb % 16 != 0) grid0.y += 1;
        compute_distance_texture << <grid0, block0 >> > (ref_tex_dev, ref_nb, query_dev, query_nb, query_pitch, dim, dist_dev);
        if (cudaGetLastError() != cudaSuccess) {
            printf("ERROR: Unable to execute kernel\n");
            knn_texture_free_dev(query_dev, dist_dev, index_dev, ref_array_dev, ref_tex_dev);
            return false;
        }

        // Sort the distances with their respective indexes (one thread per query)
        dim3 block1(256, 1, 1);
        dim3 grid1(query_nb / 256, 1, 1);
        if (query_nb % 256 != 0) grid1.x += 1;
        modified_insertion_sort << <grid1, block1 >> > (dist_dev, dist_pitch, index_dev, index_pitch, query_nb, ref_nb, k);
        if (cudaGetLastError() != cudaSuccess) {
            printf("ERROR: Unable to execute kernel\n");
            knn_texture_free_dev(query_dev, dist_dev, index_dev, ref_array_dev, ref_tex_dev);
            return false;
        }

        // Compute the square root of the k smallest distances
        dim3 block2(16, 16, 1);
        dim3 grid2(query_nb / 16, k / 16, 1);
        if (query_nb % 16 != 0) grid2.x += 1;
        if (k % 16 != 0)        grid2.y += 1;
        compute_sqrt << <grid2, block2 >> > (dist_dev, query_nb, query_pitch, k);
        if (cudaGetLastError() != cudaSuccess) {
            printf("ERROR: Unable to execute kernel\n");
            knn_texture_free_dev(query_dev, dist_dev, index_dev, ref_array_dev, ref_tex_dev);
            return false;
        }

        // Copy k smallest distances / indexes from the device to the host
        err0 = cudaMemcpy2D(knn_dist, query_nb * size_of_float, dist_dev, dist_pitch_in_bytes, query_nb * size_of_float, k, cudaMemcpyDeviceToHost);
        err1 = cudaMemcpy2D(knn_index, query_nb * size_of_int, index_dev, index_pitch_in_bytes, query_nb * size_of_int, k, cudaMemcpyDeviceToHost);
        if (err0 != cudaSuccess || err1 != cudaSuccess) {
            printf("ERROR: Unable to copy data from device to host\n");
            knn_texture_free_dev(query_dev, dist_dev, index_dev, ref_array_dev, ref_tex_dev);
            return false;
        }

        // Memory clean-up
        knn_texture_free_dev(query_dev, dist_dev, index_dev, ref_array_dev, ref_tex_dev);

        return true;
    }
    
    
    /**
     * Frees the device buffers used by knn_cublas and shuts CUBLAS down.
     * cudaFree(NULL) is a documented no-op, so this helper is safe to call from
     * any error path, regardless of how many allocations succeeded.
     */
    static void knn_cublas_free_dev(float * ref_dev, float * query_dev, float * dist_dev,
        int * index_dev, float * ref_norm_dev, float * query_norm_dev) {
        cudaFree(ref_dev);
        cudaFree(query_dev);
        cudaFree(dist_dev);
        cudaFree(index_dev);
        cudaFree(ref_norm_dev);
        cudaFree(query_norm_dev);
        cublasShutdown();
    }

    /**
     * Computes the k nearest neighbors of each query point using the identity
     * ||q - r||^2 = ||q||^2 - 2 q.r + ||r||^2: the cross term is produced with
     * a single CUBLAS SGEMM, the norms with dedicated kernels.
     *
     * NOTE(review): this uses the legacy CUBLAS API (cublasInit / cublasSgemm
     * with char transpose flags / cublasGetError / cublasShutdown), which has
     * been removed from recent CUDA toolkits — verify against the installed
     * toolkit version; migrating to the cublas_v2 handle-based API would also
     * require changing the includes at the top of this file.
     *
     * @param ref        reference points: dim rows of ref_nb values (row-major)
     * @param ref_nb     number of reference points
     * @param query      query points: dim rows of query_nb values (row-major)
     * @param query_nb   number of query points
     * @param dim        dimension of the points
     * @param k          number of neighbors to find
     * @param knn_dist   output: k rows of query_nb distances to the k nearest neighbors
     * @param knn_index  output: k rows of query_nb indexes of the k nearest neighbors
     * @return true on success, false on any CUDA/CUBLAS error (a message is printed)
     */
    bool knn_cublas(const float * ref,
        int           ref_nb,
        const float * query,
        int           query_nb,
        int           dim,
        int           k,
        float *       knn_dist,
        int *         knn_index) {

        // Constants
        const unsigned int size_of_float = sizeof(float);
        const unsigned int size_of_int = sizeof(int);

        // Return variables
        cudaError_t  err0, err1, err2, err3, err4, err5;

        // Check that we have at least one CUDA device
        int nb_devices;
        err0 = cudaGetDeviceCount(&nb_devices);
        if (err0 != cudaSuccess || nb_devices == 0) {
            printf("ERROR: No CUDA device found\n");
            return false;
        }

        // Select the first CUDA device as default
        err0 = cudaSetDevice(0);
        if (err0 != cudaSuccess) {
            printf("ERROR: Cannot set the chosen CUDA device\n");
            return false;
        }

        // Initialize CUBLAS (legacy API)
        cublasInit();

        // Allocate global memory: pitched 2D buffers plus two linear norm vectors
        float * ref_dev = NULL;
        float * query_dev = NULL;
        float * dist_dev = NULL;
        int   * index_dev = NULL;
        float * ref_norm_dev = NULL;
        float * query_norm_dev = NULL;
        size_t  ref_pitch_in_bytes;
        size_t  query_pitch_in_bytes;
        size_t  dist_pitch_in_bytes;
        size_t  index_pitch_in_bytes;
        err0 = cudaMallocPitch((void**)&ref_dev, &ref_pitch_in_bytes, ref_nb   * size_of_float, dim);
        err1 = cudaMallocPitch((void**)&query_dev, &query_pitch_in_bytes, query_nb * size_of_float, dim);
        err2 = cudaMallocPitch((void**)&dist_dev, &dist_pitch_in_bytes, query_nb * size_of_float, ref_nb);
        err3 = cudaMallocPitch((void**)&index_dev, &index_pitch_in_bytes, query_nb * size_of_int, k);
        err4 = cudaMalloc((void**)&ref_norm_dev, ref_nb   * size_of_float);
        err5 = cudaMalloc((void**)&query_norm_dev, query_nb * size_of_float);
        if (err0 != cudaSuccess || err1 != cudaSuccess || err2 != cudaSuccess || err3 != cudaSuccess || err4 != cudaSuccess || err5 != cudaSuccess) {
            printf("ERROR: Memory allocation error\n");
            knn_cublas_free_dev(ref_dev, query_dev, dist_dev, index_dev, ref_norm_dev, query_norm_dev);
            return false;
        }

        // Deduce pitch values expressed in number of elements
        size_t ref_pitch = ref_pitch_in_bytes / size_of_float;
        size_t query_pitch = query_pitch_in_bytes / size_of_float;
        size_t dist_pitch = dist_pitch_in_bytes / size_of_float;
        size_t index_pitch = index_pitch_in_bytes / size_of_int;

        // The kernels below assume dist and index share the query pitch
        if (query_pitch != dist_pitch || query_pitch != index_pitch) {
            printf("ERROR: Invalid pitch value\n");
            knn_cublas_free_dev(ref_dev, query_dev, dist_dev, index_dev, ref_norm_dev, query_norm_dev);
            return false;
        }

        // Copy reference and query data from the host to the device
        err0 = cudaMemcpy2D(ref_dev, ref_pitch_in_bytes, ref, ref_nb * size_of_float, ref_nb * size_of_float, dim, cudaMemcpyHostToDevice);
        err1 = cudaMemcpy2D(query_dev, query_pitch_in_bytes, query, query_nb * size_of_float, query_nb * size_of_float, dim, cudaMemcpyHostToDevice);
        if (err0 != cudaSuccess || err1 != cudaSuccess) {
            printf("ERROR: Unable to copy data from host to device\n");
            knn_cublas_free_dev(ref_dev, query_dev, dist_dev, index_dev, ref_norm_dev, query_norm_dev);
            return false;
        }

        // Compute the squared norm of the reference points
        dim3 block0(256, 1, 1);
        dim3 grid0(ref_nb / 256, 1, 1);
        if (ref_nb % 256 != 0) grid0.x += 1;
        compute_squared_norm << <grid0, block0 >> > (ref_dev, ref_nb, ref_pitch, dim, ref_norm_dev);
        if (cudaGetLastError() != cudaSuccess) {
            printf("ERROR: Unable to execute kernel\n");
            knn_cublas_free_dev(ref_dev, query_dev, dist_dev, index_dev, ref_norm_dev, query_norm_dev);
            return false;
        }

        // Compute the squared norm of the query points
        dim3 block1(256, 1, 1);
        dim3 grid1(query_nb / 256, 1, 1);
        if (query_nb % 256 != 0) grid1.x += 1;
        compute_squared_norm << <grid1, block1 >> > (query_dev, query_nb, query_pitch, dim, query_norm_dev);
        if (cudaGetLastError() != cudaSuccess) {
            printf("ERROR: Unable to execute kernel\n");
            knn_cublas_free_dev(ref_dev, query_dev, dist_dev, index_dev, ref_norm_dev, query_norm_dev);
            return false;
        }

        // Computation of -2 * query * transpose(reference): fills dist_dev with
        // the cross term of the squared-distance expansion
        cublasSgemm('n', 't', (int)query_pitch, (int)ref_pitch, dim, (float)-2.0, query_dev, query_pitch, ref_dev, ref_pitch, (float)0.0, dist_dev, query_pitch);
        if (cublasGetError() != CUBLAS_STATUS_SUCCESS) {
            printf("ERROR: Unable to execute cublasSgemm\n");
            knn_cublas_free_dev(ref_dev, query_dev, dist_dev, index_dev, ref_norm_dev, query_norm_dev);
            return false;
        }

        // Add reference points norm (||r||^2 term, one value per row)
        dim3 block2(16, 16, 1);
        dim3 grid2(query_nb / 16, ref_nb / 16, 1);
        if (query_nb % 16 != 0) grid2.x += 1;
        if (ref_nb % 16 != 0) grid2.y += 1;
        add_reference_points_norm << <grid2, block2 >> > (dist_dev, query_nb, dist_pitch, ref_nb, ref_norm_dev);
        if (cudaGetLastError() != cudaSuccess) {
            printf("ERROR: Unable to execute kernel\n");
            knn_cublas_free_dev(ref_dev, query_dev, dist_dev, index_dev, ref_norm_dev, query_norm_dev);
            return false;
        }

        // Sort each column (the missing ||q||^2 term is constant per column, so
        // it does not change the ordering and can be added after the sort)
        modified_insertion_sort << <grid1, block1 >> > (dist_dev, dist_pitch, index_dev, index_pitch, query_nb, ref_nb, k);
        if (cudaGetLastError() != cudaSuccess) {
            printf("ERROR: Unable to execute kernel\n");
            knn_cublas_free_dev(ref_dev, query_dev, dist_dev, index_dev, ref_norm_dev, query_norm_dev);
            return false;
        }

        // Add query norm and compute the square root of the k first elements
        dim3 block3(16, 16, 1);
        dim3 grid3(query_nb / 16, k / 16, 1);
        if (query_nb % 16 != 0) grid3.x += 1;
        if (k % 16 != 0) grid3.y += 1;
        add_query_points_norm_and_sqrt << <grid3, block3 >> > (dist_dev, query_nb, dist_pitch, k, query_norm_dev);
        if (cudaGetLastError() != cudaSuccess) {
            printf("ERROR: Unable to execute kernel\n");
            knn_cublas_free_dev(ref_dev, query_dev, dist_dev, index_dev, ref_norm_dev, query_norm_dev);
            return false;
        }

        // Copy k smallest distances / indexes from the device to the host
        err0 = cudaMemcpy2D(knn_dist, query_nb * size_of_float, dist_dev, dist_pitch_in_bytes, query_nb * size_of_float, k, cudaMemcpyDeviceToHost);
        err1 = cudaMemcpy2D(knn_index, query_nb * size_of_int, index_dev, index_pitch_in_bytes, query_nb * size_of_int, k, cudaMemcpyDeviceToHost);
        if (err0 != cudaSuccess || err1 != cudaSuccess) {
            printf("ERROR: Unable to copy data from device to host\n");
            knn_cublas_free_dev(ref_dev, query_dev, dist_dev, index_dev, ref_norm_dev, query_norm_dev);
            return false;
        }

        // Memory clean-up and CUBLAS shutdown
        knn_cublas_free_dev(ref_dev, query_dev, dist_dev, index_dev, ref_norm_dev, query_norm_dev);

        return true;
    }
    

    相关文章

      网友评论

          本文标题:CUDA C初学者编程 (VS2017)

          本文链接:https://www.haomeiwen.com/subject/hhozzftx.html