美文网首页
CUDA C初学者编程 (VS2017)

CUDA C初学者编程 (VS2017)

作者: Eden0503 | 来源:发表于2018-10-24 20:13 被阅读0次

    打开VS2017后,文件——新建——项目


    找到NVIDIA,有的人说自己的VS中没看见NVIDIA这一项啊,那是因为你没有安装CUDA,或者你在安装CUDA的时候参照某教程将Visual Studio Integration 取消勾选安装,其实后来再重新装上就行。

    创建一个文件夹名为 cuda_test 的项目,然后我们发现其实里面已经有 .cu 文件了,如下图所示。

    然后,我们像C语言一样生成编译文件,最终结果如下:

    接下来,我们修改代码如下,并运行以下代码。

    #include <stdio.h>
    
    const int N = 16;
    const int blocksize = 16;
    
    // Kernel: thread i adds offset b[i] to character a[i], turning the
    // device copy of "Hello " into "World!".
    __global__  void hello(char *a, int *b)
    {
        a[threadIdx.x] += b[threadIdx.x];
    }
    
    int main()
    {
        char a[N] = "Hello \0\0\0\0\0\0";
        int b[N] = { 15, 10, 6, 0, -11, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
    
        const int csize = N * sizeof(char);
        const int isize = N * sizeof(int);
    
        // Device-side copies of the two host arrays.
        char *devText;
        int *devOffsets;
    
        printf("%s", a);
    
        cudaMalloc((void**)&devText, csize);
        cudaMalloc((void**)&devOffsets, isize);
        cudaMemcpy(devText, a, csize, cudaMemcpyHostToDevice);
        cudaMemcpy(devOffsets, b, isize, cudaMemcpyHostToDevice);
    
        // One block of `blocksize` threads: one thread per character.
        dim3 threadsPerBlock(blocksize, 1);
        dim3 numBlocks(1, 1);
        hello<<<numBlocks, threadsPerBlock>>>(devText, devOffsets);
    
        // Blocking device-to-host copy; also synchronizes with the kernel.
        cudaMemcpy(a, devText, csize, cudaMemcpyDeviceToHost);
        cudaFree(devText);
        cudaFree(devOffsets);
    
        printf("%s\n", a);
        return EXIT_SUCCESS;
    }
    
    

    编译运行成功,最终结果如下:


    运行以下代码,查看电脑设备。

    /*********************************************/
    
    #include <stdio.h>
    #include <cuda_runtime.h>
    
    // Print the key hardware properties of a CUDA device.
    // Fix: totalGlobalMem, sharedMemPerBlock, memPitch, totalConstMem and
    // textureAlignment are size_t fields in cudaDeviceProp, so they must be
    // printed with %zu — %d reads only 32 bits and garbles them on 64-bit builds.
    void printDeviceProp(const cudaDeviceProp &prop)
    {
        printf("Device Name : %s.\n", prop.name);
        printf("totalGlobalMem : %zu.\n", prop.totalGlobalMem);
        printf("sharedMemPerBlock : %zu.\n", prop.sharedMemPerBlock);
        printf("regsPerBlock : %d.\n", prop.regsPerBlock);
        printf("warpSize : %d.\n", prop.warpSize);
        printf("memPitch : %zu.\n", prop.memPitch);
        printf("maxThreadsPerBlock : %d.\n", prop.maxThreadsPerBlock);
        printf("maxThreadsDim[0 - 2] : %d %d %d.\n", prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
        printf("maxGridSize[0 - 2] : %d %d %d.\n", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
        printf("totalConstMem : %zu.\n", prop.totalConstMem);
        printf("major.minor : %d.%d.\n", prop.major, prop.minor);
        printf("clockRate : %d.\n", prop.clockRate);   // GPU clock frequency (kHz)
        printf("textureAlignment : %zu.\n", prop.textureAlignment);
        printf("deviceOverlap : %d.\n", prop.deviceOverlap);
        printf("multiProcessorCount : %d.\n", prop.multiProcessorCount);
    }
    
    // Find the first CUDA device with compute capability >= 1.x, print its
    // properties, and select it as the current device.
    // Returns false (after a message on stderr) when no usable device exists.
    bool InitCUDA()
    {
        //used to count the device numbers
        int count;
    
        // query how many CUDA-capable devices are present
        cudaGetDeviceCount(&count);  
        if (count == 0) {
            fprintf(stderr, "There is no device.\n");
            return false;
        }
    
        // find the device >= 1.X
        int i;
        for (i = 0; i < count; ++i) {
            cudaDeviceProp prop;
            if (cudaGetDeviceProperties(&prop, i) == cudaSuccess) {  // fetch the properties of device i
                if (prop.major >= 1) {
                    printDeviceProp(prop);
                    break;
                }
            }
        }
    
        // if can't find the device
        if (i == count) {
            fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
            return false;
        }
    
        // set cuda device
        cudaSetDevice(i);
    
        return true;
    }
    
    int main(int argc, char const *argv[])
    {
        // InitCUDA() selects the first compute-capability >= 1.x device and
        // prints its properties; report success only when that worked.
        if (!InitCUDA()) {
            return 0;
        }
        printf("CUDA initialized.\n");
        return 0;
    }
    
    

    程序运行结果:


    查看当前系统上安装的GPU设备的详细参数

    打开文件夹C:\ProgramData\NVIDIA Corporation\CUDA Samples\v9.2\1_Utilities
    其中有一个名为deviceQuery的程序,可编译执行,即可打出当前系统上安装的GPU设备的详细参数。结果如下:

    CUDA 初始化

    /*
    Minimal CUDA initialization: pick the first device with compute
    capability >= 1.x and make it the current device.
    */
    
    #include "cuda_runtime.h"
    #include <stdio.h>
    
    // CUDA initialization
    bool InitCUDA(){
        int count;
        cudaGetDeviceCount(&count);// number of CUDA-capable devices present
        if (count == 0) {
            fprintf(stderr, "There is no device.\n");
            return false;
        }
        int i;
        for (i = 0; i < count; i++) {
            cudaDeviceProp prop;
            if (cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
                // accept the first device with compute capability >= 1.x
                if (prop.major >= 1) {
                    break;
                }
            }
        }
        if (i == count) {
            fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
            return false;
        }
        cudaSetDevice(i);
        return true;
    }
    
    int main(){ 
        if (!InitCUDA()) {
            return 0;
        }// CUDA initialization
        printf("CUDA initialized.\n");
        return 0;
    }
    
    
    运行得到结果:

    说明系统上有支持CUDA的装置。

    完整的向量点积CUDA程序

    /*
    Complete vector dot-product CUDA program
    a=[a1,a2,…an], b=[b1,b2,…bn]
    a*b=a1*b1+a2*b2+…+an*bn
    */
    #include "cuda_runtime.h"
    #include "device_launch_parameters.h"
    #include "device_functions.h"
    #include <cuda.h> 
    #include <stdio.h>
    #include <stdlib.h>
    #include <malloc.h>
    #define N 10
    __global__ void Dot(int *a, int *b, int *c) // kernel: one thread per vector element
    {
        __shared__ int temp[N]; // per-block scratch array in shared memory
        temp[threadIdx.x] = a[threadIdx.x] * b[threadIdx.x];
        __syncthreads();  // all partial products must be written before thread 0 reads them
                          // (the red underline in VS is an IntelliSense glitch, not an error)
        if (0 == threadIdx.x)
        {
            // thread 0 reduces the N partial products into *c
            int sum = 0;
            for (int i = 0; i < N; i++)
                sum += temp[i];
            *c = sum;
            printf("sum Calculated on Device:%d\n", *c);
        }
    }
    // Fill a[0..n-1] with pseudo-random values in [0, 9].
    void random_ints(int *a, int n)
    {
        for (int i = 0; i < n; i++)
            *(a + i) = rand() % 10;
    }
    int main()
    {
        int *a, *b, *c;        // host buffers
        int *d_a, *d_b, *d_c;  // device buffers
        int size = N * sizeof(int);
        cudaMalloc((void **)&d_a, size);
        cudaMalloc((void **)&d_b, size);
        cudaMalloc((void **)&d_c, sizeof(int));
        a = (int *)malloc(size); random_ints(a, N);
        b = (int *)malloc(size); random_ints(b, N);
        c = (int *)malloc(sizeof(int));
        printf("Array a[N]:\n");
        for (int i = 0; i < N; i++) printf("%d ", a[i]);
        printf("\n");
        printf("Array b[N]:\n");
        for (int i = 0; i < N; i++) printf("%d ", b[i]);
        printf("\n\n");
        cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice);
        cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice);
        Dot << <1, N >> > (d_a, d_b, d_c); // single block, N threads
        cudaMemcpy(c, d_c, sizeof(int), cudaMemcpyDeviceToHost); // blocking copy also syncs with the kernel
        // reference result computed on the host for comparison
        int sumHost = 0;
        for (int i = 0; i < N; i++)
            sumHost += a[i] * b[i];
        printf("sum Calculated on Host=%d\n", sumHost);
        printf("Device to Host: a*b=%d\n", *c);
        free(a); free(b); free(c);
        cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
        return 0;
    }
    
    运行结果如下:

    计算随机数向量立方和的CUDA程序

    /*
    CUDA program that computes the sum of cubes of a vector of random numbers
    */
    
    #include "cuda_runtime.h"
    #include "device_launch_parameters.h"
    #include "device_functions.h"
    #include <cuda.h> 
    #include <stdio.h>
    #include <stdlib.h>  // for rand()
    #include <malloc.h>
    #define DATA_SIZE 1048576 // number of input elements
    int data[DATA_SIZE];
    // Fill number[0..size-1] with random values in [0, 9].
    void GenerateNumbers(int *number, int size)
    {
        for (int i = 0; i < size; i++) {
            number[i] = rand() % 10;  // % 10 keeps each value in 0..9 
        }
    }
    
    // CUDA initialization: select the first device with compute capability >= 1.x.
    bool InitCUDA()
    {
        int count;
        // number of CUDA-capable devices present
        cudaGetDeviceCount(&count);
        if (count == 0) {
            fprintf(stderr, "There is no device.\n");
            return false;
        }
        int i;
        for (i = 0; i < count; i++) {
            cudaDeviceProp prop;
            if (cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
                if (prop.major >= 1) {
                    break;
                }
            }
        }
        if (i == count) {
            fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
            return false;
        }
        cudaSetDevice(i);
        return true;
    }
    // __global__ function (runs on the GPU): sum of cubes, computed by a single thread
    __global__ static void sumOfcubes(int *num, int* result)
    {
        int sum = 0;
        int i;
        for (i = 0; i < DATA_SIZE; i++) {
            sum += num[i] * num[i] * num[i];
        }
        *result = sum;
    }
    int main()
    { // CUDA initialization
        if (!InitCUDA()) {
            return 0;
        }
        // generate the random input
        GenerateNumbers(data, DATA_SIZE);
        int* gpudata, *result;  // device-side buffers
        int gpusum = 0; // host-side variable that receives the GPU sum
        cudaMalloc((void**)&gpudata, sizeof(int)* DATA_SIZE);
        cudaMalloc((void**)&result, sizeof(int));
    
        
        cudaMemcpy(gpudata, data, sizeof(int)* DATA_SIZE, cudaMemcpyHostToDevice); // copy the random input from host to device
        sumOfcubes <<<1,1,0>>> (gpudata, result); // launch with 1 block x 1 thread; gpudata/result are device pointers
        cudaMemcpy(&gpusum, result, sizeof(int), cudaMemcpyDeviceToHost);  // copy the GPU result back (blocking, so it also syncs)
        cudaFree(gpudata);
        cudaFree(result);
        printf("GPUsum: %d \n", gpusum);
        // reference result computed on the CPU
        int sum = 0;
        for (int i = 0; i < DATA_SIZE; i++) {
            sum += data[i] * data[i] * data[i];
        }
        printf("CPUsum: %d \n", sum);
        return 0;
    }
    
    
    运行结果如下:

    计算随机数向量立方和的CUDA程序(加入统计时间的函数)

    /*
    CUDA program that computes the sum of cubes of a random vector,
    timing the kernel with the on-device clock() counter
    */
    
    #include "cuda_runtime.h"
    #include "device_launch_parameters.h"
    #include "device_functions.h"
    #include <time.h>  // for clock_t
    #include <cuda.h> 
    #include <stdio.h>
    #include <stdlib.h>  // for rand()
    #include <malloc.h>
    #define DATA_SIZE 1048576 // number of input elements
    int data[DATA_SIZE];
    // Fill number[0..size-1] with random values in [0, 9].
    void GenerateNumbers(int *number, int size)
    {
        for (int i = 0; i < size; i++) {
            number[i] = rand() % 10;  // % 10 keeps each value in 0..9 
        }
    }
    
    // CUDA initialization: select the first device with compute capability >= 1.x.
    bool InitCUDA()
    {
        int count;
        // number of CUDA-capable devices present
        cudaGetDeviceCount(&count);
        if (count == 0) {
            fprintf(stderr, "There is no device.\n");
            return false;
        }
        int i;
        for (i = 0; i < count; i++) {
            cudaDeviceProp prop;
            if (cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
                if (prop.major >= 1) {
                    break;
                }
            }
        }
        if (i == count) {
            fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
            return false;
        }
        cudaSetDevice(i);
        return true;
    }
    // __global__ function (runs on the GPU): single-thread sum of cubes,
    // reporting the elapsed device clock ticks through *time
    
    __global__ static void sumOfcubes(int *num, int *result, clock_t *time)
    {
        int sum = 0;
        int i;
        clock_t start = clock();
        for (i = 0; i < DATA_SIZE; i++) {
            sum += num[i] * num[i] * num[i];
        }
        *result = sum;
        *time = clock() - start;
    }
    
    int main()
    { // CUDA initialization
        if (!InitCUDA()) {
            return 0;
        }
        // generate the random input
        GenerateNumbers(data, DATA_SIZE);
        int* gpudata, *result;  // device-side buffers
        int gpusum = 0; // host-side variable that receives the GPU sum
    
        clock_t* time;      // device-side clock-tick counter
        clock_t time_used;  // host-side copy of the elapsed ticks
    
        cudaMalloc((void**)&gpudata, sizeof(int)* DATA_SIZE);
        cudaMalloc((void**)&result, sizeof(int));
        cudaMalloc((void**)&time, sizeof(clock_t));
        
        cudaMemcpy(gpudata, data, sizeof(int)* DATA_SIZE, cudaMemcpyHostToDevice); // copy the random input from host to device
        sumOfcubes <<<1,1,0>>> (gpudata, result,time); // launch with 1 block x 1 thread
        cudaMemcpy(&gpusum, result, sizeof(int), cudaMemcpyDeviceToHost);  // copy the GPU result back to the host
        cudaMemcpy(&time_used, time, sizeof(clock_t), cudaMemcpyDeviceToHost); 
        
        cudaFree(gpudata);
        cudaFree(result);
        cudaFree(time);
    
        printf("GPUsum: %d——time:%d\n", gpusum, time_used);
    
        // reference result computed on the CPU
        int sum = 0;
        for (int i = 0; i < DATA_SIZE; i++) {
            sum += data[i] * data[i] * data[i];
        }
        printf("CPUsum: %d \n", sum);
        return 0;
    }
    
    结果如下:

    GTX 1050上运行核函数部分的时间约为:
    1992949251/ 1493000 = 1334.8 mS 其中1493000是GPU频率。
    重要!!!
    也就是clock() 统计出来的 (end - start) / GPU频率 = 毫秒级单位的时间。其中 GPU 频率可以通过 cudaGetDeviceProperties 返回的 clockRate 字段获取,单位是千赫兹,比如我的GTX1050 的频率是 1493000千赫兹。

    进一步优化上述求立方和的程序(利用单block多thread)

    /*
    CUDA program that computes the sum of cubes of a random vector
    using a single block with THREAD_NUM threads
    */
    
    #include "cuda_runtime.h"
    #include <time.h>  // for clock_t
    #include <stdio.h>
    #include <stdlib.h>  // for rand()
    
    //#include "device_launch_parameters.h"
    //#include "device_functions.h"
    //#include <cuda.h> 
    //#include <malloc.h>
    
    #define DATA_SIZE 1048576 // number of input elements
    #define THREAD_NUM 256  // number of threads
    int data[DATA_SIZE];
    
    // Fill number[0..size-1] with random values in [0, 9].
    void GenerateNumbers(int *number, int size){
        for (int i = 0; i < size; i++) {
            number[i] = rand() % 10;  // % 10 keeps each value in 0..9 
        }
    }
    
    // CUDA initialization: select the first device with compute capability >= 1.x.
    bool InitCUDA(){
        int count;
        // number of CUDA-capable devices present
        cudaGetDeviceCount(&count);
        if (count == 0) {
            fprintf(stderr, "There is no device.\n");
            return false;
        }
        int i;
        for (i = 0; i < count; i++) {
            cudaDeviceProp prop;
            if (cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
                if (prop.major >= 1) {
                    break;
                }
            }
        }
        if (i == count) {
            fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
            return false;
        }
        cudaSetDevice(i);
        return true;
    }
    // __global__ function (runs on the GPU): each thread sums the cubes of a
    // contiguous chunk of the input and writes its partial sum to result[tid]
    
    __global__ static void sumOfcubes(int *num, int *result, clock_t *time){
        const int tid = threadIdx.x;
        const int size = DATA_SIZE / THREAD_NUM;// elements each thread must process
        int sum = 0;
        int i;
    
        clock_t start; // device clock value at the start of the computation 
    
        if (tid == 0) start = clock(); // only thread 0 (threadIdx.x == 0) records the time
        for (i = tid * size; i < (tid + 1) * size; i++){
            sum += num[i] * num[i] * num[i];
        }
        result[tid] = sum;
        // timing is done only by thread 0 (threadIdx.x == 0)
        if (tid == 0) *time = clock() - start;
    }
    
    // Host driver: generates the input, launches the single-block kernel,
    // reduces the THREAD_NUM partial sums on the CPU, and compares against a
    // CPU reference.
    // Fix: the original declared an unused local `clock_t time_used;` that was
    // shadowed in purpose by `time_use` (the variable actually copied into) —
    // the dead declaration is removed.
    int main(){ 
        if (!InitCUDA()) {return 0;}// CUDA initialization
        GenerateNumbers(data, DATA_SIZE); // generate the random input
        int* gpudata, *result;   // device-side buffers
        int gpusum[THREAD_NUM];  // host-side copy of the per-thread partial sums
        clock_t* time;           // device-side clock-tick counter
    
        cudaMalloc((void**)&gpudata, sizeof(int)* DATA_SIZE);
        cudaMalloc((void**)&time, sizeof(clock_t));
        cudaMalloc((void**)&result, sizeof(int)*THREAD_NUM);
    
        // copy the random input from host memory to device memory
        cudaMemcpy(gpudata, data, sizeof(int)* DATA_SIZE, cudaMemcpyHostToDevice); 
        // launch the kernel: 1 block x THREAD_NUM threads, no dynamic shared memory
        sumOfcubes<<<1,THREAD_NUM,0>>> (gpudata, result,time);
    
        clock_t time_use;  // host-side copy of the elapsed device ticks
        
        // copy the partial sums and the elapsed clock count back to the host
        // (blocking copies, so they also synchronize with the kernel)
        cudaMemcpy(&gpusum, result, sizeof(int) * THREAD_NUM, cudaMemcpyDeviceToHost);
        cudaMemcpy(&time_use, time, sizeof(clock_t), cudaMemcpyDeviceToHost);
        
        cudaFree(gpudata);
        cudaFree(result);
        cudaFree(time);
    
        int final_sum = 0; /* reduce the per-thread partial sums on the CPU */
        for (int i = 0; i < THREAD_NUM; i++){
            final_sum += gpusum[i];
        }
        printf("GPUsum: %d gputime: %d\n", final_sum, time_use);
    
        // reference result computed on the CPU
        final_sum = 0;
        for (int i = 0; i < DATA_SIZE; i++) {
            final_sum += data[i] * data[i] * data[i];
        }
        printf("CPUsum: %d \n", final_sum);
        return 0;
    }
    
    
    结果如下:

    然而,上述程序还能够优化,上述代码:

        if (tid == 0) start = clock(); //只在thread 0(即threadIdx.x = 0 的时候)进行记录
        for (i = tid * size; i < (tid + 1) * size; i++){
            sum += num[i] * num[i] * num[i];
        }
        result[tid] = sum;
        //计算时间的动作,只在thread 0(即threadIdx.x = 0 的时候)进行
        if (tid == 0) *time = clock() - start;
    }
    

    中核函数并不是连续存取的,读取数字完全是跳跃式的读取,这会非常影响内存的存取效率。因此我们下一步要将取数字的过程变成:thread 0 读取第一个数字,thread 1 读取第二个数字,以此类推......
    程序部分修改为:

        if (tid == 0) start = clock(); //只在thread 0(即threadIdx.x = 0 的时候)进行记录
        for (i = tid; i < DATA_SIZE; i+= THREAD_NUM){
            sum += num[i] * num[i] * num[i];
        }
        result[tid] = sum;
        //计算时间的动作,只在thread 0(即threadIdx.x = 0 的时候)进行
        if (tid == 0) *time = clock() - start;
    
    实验运行结果:

    ???问题来了,老师上课的时候明明说,这样的优化会使得程序运行时间缩短很多倍,但是为啥我的运行时间结果没啥大的改变呢。8746726/8365217=1.04,几乎就是没有改进嘛.....难道是程序出现了问题,于是,我就将两个程序放在服务器上运行了。结果显示 1103530/185263=5.96,有近6倍的提升,说明这样的优化程序是正确的,只是在VS2017的编译运行中没有体现出来,可这又是为啥?



    经过一番查询资料,发现VS2017中我在编译运行的时候用的是debug模式,其实需要改成Release模式。
    debug.png Release.png

    修改完编译器中的模式之后,发现结果果然不一样了,1060761/166149 = 6.38,说明优化后的程序有很大的提升,因此,得出一个结果,VS中的debug模式只能用来修改程序中的错误,但是不能说明真正的程序性能的好坏,谨记,要改为Release模式。


    优化前.png 优化后.png

    更进一步的优化,多block多thread

    CUDA不仅提供了Thread,还提供了Grid和Block以及Share Memory这些非常重要的机制,每个block中Thread极限是1024,但是通过block和Grid,线程的数量还能成倍增长,甚至用几万个线程。
    /*
    CUDA program that computes the sum of cubes of a random vector
    using BLOCK_NUM blocks of THREAD_NUM threads each
    */
    #include "cuda_runtime.h"
    #include <time.h>  // for clock_t
    #include <stdio.h>
    #include <stdlib.h>  // for rand()
    
    //#include "device_launch_parameters.h"
    //#include "device_functions.h"
    //#include <cuda.h> 
    //#include <malloc.h>
    
    #define DATA_SIZE 1048576 // number of input elements
    #define THREAD_NUM 256 // threads per block
    #define BLOCK_NUM  32// number of blocks 
    // 32 blocks x 256 threads each = 8192 threads in total.
    int data[DATA_SIZE];
    
    // Fill number[0..size-1] with random values in [0, 9].
    void GenerateNumbers(int *number, int size) {
        for (int i = 0; i < size; i++) {
            number[i] = rand() % 10;  // % 10 keeps each value in 0..9 
        }
    }
    
    // CUDA initialization: select the first device with compute capability >= 1.x.
    bool InitCUDA() {
        int count;
        cudaGetDeviceCount(&count);// number of CUDA-capable devices present
        if (count == 0) {
            fprintf(stderr, "There is no device.\n");
            return false;
        }
        int i;
        for (i = 0; i < count; i++) {
            cudaDeviceProp prop;
            if (cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
                if (prop.major >= 1) {
                    break;
                }
            }
        }
        if (i == count) {
            fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
            return false;
        }
        cudaSetDevice(i);
        return true;
    }
    // __global__ function (runs on the GPU): each of the BLOCK_NUM*THREAD_NUM
    // threads accumulates an interleaved slice of the input
    
    __global__ static void sumOfcubes(int *num, int *result, clock_t *time) {
        const int tid = threadIdx.x;
        const int bid = blockIdx.x;
        int sum = 0;
        int i;
    
        // only thread 0 records times; each block logs its own start and end
        if (tid == 0) time[bid] = clock();
        // the element index combines bid and tid; the BLOCK_NUM*THREAD_NUM
        // stride keeps accesses by adjacent threads contiguous in memory
        for (i = bid * THREAD_NUM + tid; i < DATA_SIZE; i += BLOCK_NUM * THREAD_NUM) {
            sum += num[i] * num[i] * num[i];
        }
        // one partial result per thread, so `result` holds THREAD_NUM*BLOCK_NUM entries
        result[bid * THREAD_NUM + tid] = sum;
        // only thread 0 records times; each block logs its own start and end
        if (tid == 0) time[bid + BLOCK_NUM] = clock();
    }
    
    int main() {
        if (!InitCUDA()) { return 0; }// CUDA initialization
        GenerateNumbers(data, DATA_SIZE); // generate the random input
        int *gpudata, *result;  // device-side buffers
        clock_t *time;          // device-side per-block start/end clock values
    
        cudaMalloc((void**)&gpudata, sizeof(int)* DATA_SIZE);
        cudaMalloc((void**)&time, sizeof(clock_t)*BLOCK_NUM * 2);
        cudaMalloc((void**)&result, sizeof(int)*THREAD_NUM*BLOCK_NUM);
    
        // copy the input from host memory to device memory
        cudaMemcpy(gpudata, data, sizeof(int)* DATA_SIZE, cudaMemcpyHostToDevice);
        // launch the kernel: BLOCK_NUM blocks x THREAD_NUM threads
        sumOfcubes << <BLOCK_NUM, THREAD_NUM, 0 >> > (gpudata, result, time); // gpudata, result, time are device pointers
    
        int gpusum[THREAD_NUM*BLOCK_NUM];  // host-side copy of the per-thread partial sums
        clock_t time_use[BLOCK_NUM * 2];   // host-side copy of the per-block timestamps
    
        // copy the results from device memory back to host memory
        cudaMemcpy(&gpusum, result, sizeof(int) * THREAD_NUM * BLOCK_NUM, cudaMemcpyDeviceToHost);
        cudaMemcpy(&time_use, time, sizeof(clock_t)*BLOCK_NUM * 2, cudaMemcpyDeviceToHost);
    
        cudaFree(gpudata);
        cudaFree(result);
        cudaFree(time);
    
        int final_sum = 0; /* reduce the partial sums on the CPU */
        for (int i = 0; i < THREAD_NUM * BLOCK_NUM; i++) {
            final_sum += gpusum[i];
        }
    
        // timing strategy: earliest block start minus latest block end gives
        // the total kernel running time
        clock_t min_start, max_end;
        min_start = time_use[0];
        max_end = time_use[BLOCK_NUM];
        for (int i = 1; i < BLOCK_NUM; i++) {  // scan all blocks for the earliest start and latest end
            if (min_start > time_use[i])
                min_start = time_use[i];
            if (max_end < time_use[i + BLOCK_NUM])
                max_end = time_use[i + BLOCK_NUM];
        }
        printf("GPUsum: %d gputime: %ld\n", final_sum, max_end - min_start);
    
    
        // reference result computed on the CPU
        final_sum = 0;
        for (int i = 0; i < DATA_SIZE; i++) {
            final_sum += data[i] * data[i] * data[i];
        }
        printf("CPUsum: %d \n", final_sum);
        return 0;
    }
    
    会出现以下错误: error.png

    解决方案:release模式会优化很多东西,但是多重新编译几次,又会成功,真是搞笑,搞不懂这些编译器......... 算了,还是建议去服务器上运行吧,VS中实在有太多无法未知的错误了。

    程序运行结果: Release_threads256_blocks32.png

    后来修改 THREAD_NUM = 1024 , BLOCK_NUM = 128 ,总共 1024 * 128 = 131072 个threads,性能并没有很大提升。

     #define THREAD_NUM 1024  //thread 数量
     #define BLOCK_NUM 128 // block 数量 
    
    Release_threads1024_blocks128.png

    为什么不用极限的1024个线程呢?

    • 如果1024个线程全部用上,那样就是32*1024 = 32768 个线程,难道不是更好吗?从线程运行的原理来看,线程数量达到一定大小后,再一味地增加线程也不会取得性能的提升,反而有可能会让性能下降。线程数达到隐藏各种latency的程度后,之后线程数量的提升就没有太大的作用了。
    • 程序中加和部分是在CPU上进行的,越多的线程意味着越多的结果,而这也意味着CPU上的运算压力会越来越大。

    Thread 的同步

    一个block内的所有的 thread 可以有共享的内存,也可以进行同步,因此,还可以让每个block 内所有 thread 将自己的计算结果相加起来。代码可以进一步修改:

    // Sum-of-cubes with a per-block shared-memory reduction: each block's
    // threads accumulate into shared memory, synchronize, and thread 0 folds
    // the block's partials into a single value.
    #include "cuda_runtime.h"
    #include <time.h>  // for clock_t
    #include <stdio.h>
    #include <stdlib.h>  // for rand()
    
    //#include "device_launch_parameters.h"
    //#include "device_functions.h"
    //#include <cuda.h> 
    //#include <malloc.h>
    
    #define DATA_SIZE 1048576 // number of input elements
    #define THREAD_NUM 256 // threads per block
    #define BLOCK_NUM 32// number of blocks 
    // 32 blocks x 256 threads each = 8192 threads in total.
    int data[DATA_SIZE];
    
    // Fill number[0..size-1] with random values in [0, 9].
    void GenerateNumbers(int *number, int size) {
        for (int i = 0; i < size; i++) {
            number[i] = rand() % 10;  // % 10 keeps each value in 0..9 
        }
    }
    
    // CUDA initialization: select the first device with compute capability >= 1.x.
    bool InitCUDA() {
        int count;
        cudaGetDeviceCount(&count);// number of CUDA-capable devices present
        if (count == 0) {
            fprintf(stderr, "There is no device.\n");
            return false;
        }
        int i;
        for (i = 0; i < count; i++) {
            cudaDeviceProp prop;
            if (cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
                if (prop.major >= 1) {
                    break;
                }
            }
        }
        if (i == count) {
            fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
            return false;
        }
        cudaSetDevice(i);
        return true;
    }
    // __global__ 函数(GPU上执行) 计算立方和
    
    // __global__ function (runs on the GPU): per-block sum of cubes with a
    // shared-memory reduction.
    //
    // Launch configuration: <<<BLOCK_NUM, THREAD_NUM, sizeof(int) * THREAD_NUM>>>
    // (the third argument supplies the dynamic shared array `shared`).
    // Each block reduces its THREAD_NUM partial sums into shared[0] and then
    // publishes ONE value, so `result` must hold at least BLOCK_NUM ints.
    // time[bid] / time[bid + BLOCK_NUM] receive each block's start/end clock().
    __global__ static void sumOfcubes(int *num, int *result, clock_t *time) {
        extern __shared__ int shared[];  // THREAD_NUM ints of dynamic shared memory
        const int tid = threadIdx.x;
        const int bid = blockIdx.x;
        int i;
    
        // only thread 0 records times; each block logs its own start and end
        if (tid == 0) time[bid] = clock(); 
        shared[tid] = 0;
        // the element index combines bid and tid; the BLOCK_NUM*THREAD_NUM
        // stride keeps accesses by adjacent threads contiguous in memory
        for (i = bid * THREAD_NUM + tid; i < DATA_SIZE; i += BLOCK_NUM * THREAD_NUM) {
            shared[tid] += num[i] * num[i] * num[i];
        }
    
    
        // every thread's partial sum must be in shared[] before the reduction
        __syncthreads();
        if (tid == 0) {
            for (i = 1; i < THREAD_NUM; i++) {
                shared[0] += shared[i];
            }
            // BUG FIX: was `result[tid]` — inside this branch tid is always 0,
            // so every block overwrote result[0] and result[1..BLOCK_NUM-1]
            // stayed uninitialized while the host summed all BLOCK_NUM slots.
            // Each block must publish its sum in its own slot.
            result[bid] = shared[0];
        }
        if (tid == 0) time[bid + BLOCK_NUM] = clock();
    }
    
    // Host driver: launches the shared-memory kernel (one dynamic shared int
    // per thread), reduces the BLOCK_NUM per-block sums on the CPU, and
    // compares against a CPU reference.
    int main() {
        if (!InitCUDA()) { return 0; }// CUDA initialization
        GenerateNumbers(data, DATA_SIZE); // generate the random input
        int *gpudata, *result;  // device-side buffers
        clock_t *time;          // device-side per-block start/end clock values
    
        cudaMalloc((void**)&gpudata, sizeof(int)* DATA_SIZE);
        cudaMalloc((void**)&time, sizeof(clock_t)*BLOCK_NUM * 2);
        cudaMalloc((void**)&result, sizeof(int)*BLOCK_NUM);
    
        cudaMemcpy(gpudata, data, sizeof(int)* DATA_SIZE, cudaMemcpyHostToDevice);
        // launch the kernel; the third <<<>>> argument is the dynamic
        // shared-memory size: one int per thread
        sumOfcubes << <BLOCK_NUM, THREAD_NUM, sizeof(int)* THREAD_NUM >> > (gpudata, result, time); // gpudata, result, time are device pointers
    
        int gpusum[BLOCK_NUM];  // host-side copy of the per-block sums
        clock_t time_use[BLOCK_NUM * 2];  // host-side copy of the per-block timestamps
    
        cudaMemcpy(&gpusum, result, sizeof(int) * BLOCK_NUM, cudaMemcpyDeviceToHost);
        cudaMemcpy(&time_use, time, sizeof(clock_t)*BLOCK_NUM * 2, cudaMemcpyDeviceToHost);
    
        cudaFree(gpudata);
        cudaFree(result);
        cudaFree(time);
    
        int final_sum = 0; /* reduce the per-block sums on the CPU */
        for (int i = 0; i <BLOCK_NUM; i++) {
            final_sum += gpusum[i];
        }
    
        // timing strategy: earliest block start minus latest block end gives
        // the total kernel running time
        clock_t min_start, max_end;
        min_start = time_use[0];
        max_end = time_use[BLOCK_NUM];
        for (int i = 1; i < BLOCK_NUM; i++) {  // scan all blocks for the earliest start and latest end
            if (min_start > time_use[i])
                min_start = time_use[i];
            if (max_end < time_use[i + BLOCK_NUM])
                max_end = time_use[i + BLOCK_NUM];
        }
        printf("GPUsum: %d gputime: %ld\n", final_sum, max_end - min_start);
    
    
        // reference result computed on the CPU
        final_sum = 0;
        for (int i = 0; i < DATA_SIZE; i++) {
            final_sum += data[i] * data[i] * data[i];
        }
        printf("CPUsum: %d \n", final_sum);
        return 0;
    }
    

    树状加法

    参考:
    http://hpc.pku.edu.cn/docs/20170829223804057249.pdf 深入浅出谈CUDA

    #include "device_functions.h"
    #include "cuda_runtime.h"
    #include "device_launch_parameters.h"
    #include "device_functions.h"
    #include <stdio.h>
    #include <cuda.h>
    #include <cublas.h>
    
    #define BLOCK_DIM 16
    
    
    /**
     * Computes the squared Euclidean distance matrix between the query points and the reference points.
     *
     * @param ref          reference points stored in the global memory
     * @param ref_width    number of reference points
     * @param ref_pitch    pitch of the reference points array in number of columns
     * @param query        query points stored in the global memory
     * @param query_width  number of query points
     * @param query_pitch  pitch of the query points array in number of columns
     * @param height       dimension of points = height of texture `ref` and of the array `query`
     * @param dist         array containing the query_width x ref_width computed distances
     */
    __global__ void compute_distances(float * ref,
        int     ref_width,
        int     ref_pitch,
        float * query,
        int     query_width,
        int     query_pitch,
        int     height,
        float * dist) {
    
        // Declaration of the shared memory arrays As and Bs used to store the sub-matrix of A and B
        __shared__ float shared_A[BLOCK_DIM][BLOCK_DIM];
        __shared__ float shared_B[BLOCK_DIM][BLOCK_DIM];
    
        // Sub-matrix of A (begin, step, end) and Sub-matrix of B (begin, step)
        __shared__ int begin_A;
        __shared__ int begin_B;
        __shared__ int step_A;
        __shared__ int step_B;
        __shared__ int end_A;
    
        // Thread index
        int tx = threadIdx.x;
        int ty = threadIdx.y;
    
        // Initialization of the SSD (sum of squared differences) for the current thread
        float ssd = 0.f;
    
        // Loop parameters
        begin_A = BLOCK_DIM * blockIdx.y;
        begin_B = BLOCK_DIM * blockIdx.x;
        step_A = BLOCK_DIM * ref_pitch;
        step_B = BLOCK_DIM * query_pitch;
        end_A = begin_A + (height - 1) * ref_pitch;
    
        // Conditions
        int cond0 = (begin_A + tx < ref_width); // used to write in shared memory
        int cond1 = (begin_B + tx < query_width); // used to write in shared memory, for computations and to write in the output array 
        int cond2 = (begin_A + ty < ref_width); // used for computations and to write in the output matrix
    
        // Loop over all the sub-matrices of A and B required to compute the block sub-matrix
        for (int a = begin_A, b = begin_B; a <= end_A; a += step_A, b += step_B) {
    
            // Load the matrices from device memory to shared memory; each thread loads one element of each matrix
            if (a / ref_pitch + ty < height) {
                shared_A[ty][tx] = (cond0) ? ref[a + ref_pitch * ty + tx] : 0;
                shared_B[ty][tx] = (cond1) ? query[b + query_pitch * ty + tx] : 0;
            }
            else {
                shared_A[ty][tx] = 0;
                shared_B[ty][tx] = 0;
            }
    
            // Synchronize to make sure the matrices are loaded
            __syncthreads();
    
            // Compute the difference between the two matrices; each thread computes one element of the block sub-matrix
            if (cond2 && cond1) {
                for (int k = 0; k < BLOCK_DIM; ++k) {
                    float tmp = shared_A[k][ty] - shared_B[k][tx];
                    ssd += tmp * tmp;
                }
            }
    
            // Synchronize to make sure that the preceding computation is done before loading two new sub-matrices of A and B in the next iteration
            __syncthreads();
        }
    
        // Write the block sub-matrix to device memory; each thread writes one element
        if (cond2 && cond1) {
            dist[(begin_A + ty) * query_pitch + begin_B + tx] = ssd;
        }
    }
    
    
    /**
     * Computes the squared Euclidean distance matrix between the query points and the reference points.
     *
     * @param ref          reference points stored in the texture memory
     * @param ref_width    number of reference points
     * @param query        query points stored in the global memory
     * @param query_width  number of query points
     * @param query_pitch  pitch of the query points array in number of columns
     * @param height       dimension of points = height of texture `ref` and of the array `query`
     * @param dist         array containing the query_width x ref_width computed distances
     */
    __global__ void compute_distance_texture(cudaTextureObject_t ref,
        int                 ref_width,
        float *             query,
        int                 query_width,
        int                 query_pitch,
        int                 height,
        float*              dist) {
        // One thread per (query, reference) pair on a 2D grid.
        unsigned int xIndex = blockIdx.x * blockDim.x + threadIdx.x;
        unsigned int yIndex = blockIdx.y * blockDim.y + threadIdx.y;
        if (xIndex < query_width && yIndex < ref_width) {
            // Accumulate the squared differences over all `height` dimensions.
            float ssd = 0.f;
            for (int i = 0; i < height; i++) {
                float tmp = tex2D<float>(ref, (float)yIndex, (float)i) - query[i * query_pitch + xIndex];
                ssd += tmp * tmp;
            }
            dist[yIndex * query_pitch + xIndex] = ssd;
        }
    }
    
    
    /**
     * For each query point (i.e. each column of the distance matrix) finds the
     * k smallest distances and their respective indexes and gathers them at the
     * top of the 2 arrays.
     *
     * Since we only need to locate the k smallest distances, sorting the entire
     * array would not be very efficient if k is relatively small. Instead, we
     * perform a simple insertion sort that only ever inserts a given distance
     * into the first k values.
     *
     * @param dist         distance matrix
     * @param dist_pitch   pitch of the distance matrix given in number of columns
     * @param index        index matrix
     * @param index_pitch  pitch of the index matrix given in number of columns
     * @param width        width of the distance matrix and of the index matrix
     * @param height       height of the distance matrix
     * @param k            number of values to find
     */
    __global__ void modified_insertion_sort(float * dist,
        int     dist_pitch,
        int *   index,
        int     index_pitch,
        int     width,
        int     height,
        int     k) {

        // Column position handled by this thread (one thread per column)
        unsigned int xIndex = blockIdx.x * blockDim.x + threadIdx.x;

        // Do nothing if we are out of bounds
        if (xIndex < width) {

            // Pointer shift: p_dist / p_index now address column xIndex;
            // element i of the column lives at offset i * pitch
            float * p_dist = dist + xIndex;
            int *   p_index = index + xIndex;

            // The first candidate (row 0) is trivially "sorted"
            p_index[0] = 0;

            // Go through all remaining points of the column
            for (int i = 1; i < height; ++i) {

                // Store current distance and associated index
                float curr_dist = p_dist[i*dist_pitch];
                int   curr_index = i;

                // Skip the current value if its index is >= k and if it is
                // higher than the k-th already-sorted smallest value
                if (i >= k && curr_dist >= p_dist[(k - 1)*dist_pitch]) {
                    continue;
                }

                // Shift values (and indexes) higher than the current distance
                // to the right; only the first k slots are ever maintained
                int j = min(i, k - 1);
                while (j > 0 && p_dist[(j - 1)*dist_pitch] > curr_dist) {
                    p_dist[j*dist_pitch] = p_dist[(j - 1)*dist_pitch];
                    p_index[j*index_pitch] = p_index[(j - 1)*index_pitch];
                    --j;
                }

                // Write the current distance and index at their position
                p_dist[j*dist_pitch] = curr_dist;
                p_index[j*index_pitch] = curr_index;
            }
        }
    }
    
    
    /**
     * Computes the square root of the first k lines of the distance matrix.
     *
     * @param dist   distance matrix
     * @param width  width of the distance matrix
     * @param pitch  pitch of the distance matrix given in number of columns
     * @param k      number of values to consider
     */
    __global__ void compute_sqrt(float * dist, int width, int pitch, int k) {
        unsigned int xIndex = blockIdx.x * blockDim.x + threadIdx.x;
        unsigned int yIndex = blockIdx.y * blockDim.y + threadIdx.y;
        // sqrtf keeps the computation in single precision; the double-precision
        // sqrt overload would force a float->double->float round trip per element.
        if (xIndex < width && yIndex < k)
            dist[yIndex*pitch + xIndex] = sqrtf(dist[yIndex*pitch + xIndex]);
    }
    
    
    /**
     * Computes the squared norm of each column of the input array.
     *
     * @param array   input array
     * @param width   number of columns of `array` = number of points
     * @param pitch   pitch of `array` in number of columns
     * @param height  number of rows of `array` = dimension of the points
     * @param norm    output array containing the squared norm values
     */
    __global__ void compute_squared_norm(float * array, int width, int pitch, int height, float * norm) {
        // One thread per column (i.e. per point)
        const unsigned int col = blockIdx.x * blockDim.x + threadIdx.x;
        if (col >= width)
            return;

        // Sum the squared coordinates of the point down its column
        float acc = 0.f;
        for (int row = 0; row < height; ++row) {
            const float v = array[row * pitch + col];
            acc += v * v;
        }
        norm[col] = acc;
    }
    
    
    /**
     * Add the reference points norm (column vector) to each column of the input array.
     *
     * Precondition: the kernel must be launched with blockDim = (16, 16), matching
     * the fixed size of `shared_vec` (the launch sites in this file use
     * dim3(16, 16, 1)). A larger blockDim.y would overflow the shared array.
     *
     * @param array   input array
     * @param width   number of columns of `array` = number of points
     * @param pitch   pitch of `array` in number of columns
     * @param height  number of rows of `array` = dimension of the points
     * @param norm    reference points norm stored as a column vector
     */
    __global__ void add_reference_points_norm(float * array, int width, int pitch, int height, float * norm) {
        unsigned int tx = threadIdx.x;
        unsigned int ty = threadIdx.y;
        unsigned int xIndex = blockIdx.x * blockDim.x + tx;
        unsigned int yIndex = blockIdx.y * blockDim.y + ty;
        // One norm value per row of the block: loaded once by the first thread
        // of each row and broadcast to the other threads via shared memory.
        __shared__ float shared_vec[16];
        if (tx == 0 && yIndex < height)
            shared_vec[ty] = norm[yIndex];
        // Barrier is outside the conditional, so every thread of the block
        // reaches it; the shared loads are visible before they are read below.
        __syncthreads();
        if (xIndex < width && yIndex < height)
            array[yIndex*pitch + xIndex] += shared_vec[ty];
    }
    
    
    /**
     * Adds the query points norm (row vector) to the k first lines of the input
     * array and computes the square root of the resulting values.
     *
     * @param array   input array
     * @param width   number of columns of `array` = number of points
     * @param pitch   pitch of `array` in number of columns
     * @param k       number of neighbors to consider
     * @param norm    query points norm stored as a row vector
     */
    __global__ void add_query_points_norm_and_sqrt(float * array, int width, int pitch, int k, float * norm) {
        unsigned int xIndex = blockIdx.x * blockDim.x + threadIdx.x;
        unsigned int yIndex = blockIdx.y * blockDim.y + threadIdx.y;
        // sqrtf keeps the computation in single precision; the double-precision
        // sqrt overload would force a float->double->float round trip per element.
        if (xIndex < width && yIndex < k)
            array[yIndex*pitch + xIndex] = sqrtf(array[yIndex*pitch + xIndex] + norm[xIndex]);
    }
    
    
    /**
     * Frees the device buffers used by knn_cuda_global.
     * cudaFree(NULL) is a documented no-op, so this helper is safe to call from
     * any error path, regardless of how many allocations succeeded.
     */
    static void knn_global_free_dev(float * ref_dev, float * query_dev, float * dist_dev, int * index_dev) {
        cudaFree(ref_dev);
        cudaFree(query_dev);
        cudaFree(dist_dev);
        cudaFree(index_dev);
    }

    /**
     * Computes the k nearest neighbors of each query point, keeping the
     * reference points in plain (pitched) global memory.
     *
     * @param ref        reference points: dim rows of ref_nb values (row-major)
     * @param ref_nb     number of reference points
     * @param query      query points: dim rows of query_nb values (row-major)
     * @param query_nb   number of query points
     * @param dim        dimension of the points
     * @param k          number of neighbors to find
     * @param knn_dist   output: k rows of query_nb distances to the k nearest neighbors
     * @param knn_index  output: k rows of query_nb indexes of the k nearest neighbors
     * @return true on success, false on any CUDA error (a message is printed)
     */
    bool knn_cuda_global(const float * ref,
        int           ref_nb,
        const float * query,
        int           query_nb,
        int           dim,
        int           k,
        float *       knn_dist,
        int *         knn_index) {

        // Constants
        const unsigned int size_of_float = sizeof(float);
        const unsigned int size_of_int = sizeof(int);

        // Return variables
        cudaError_t err0, err1, err2, err3;

        // Check that we have at least one CUDA device
        int nb_devices;
        err0 = cudaGetDeviceCount(&nb_devices);
        if (err0 != cudaSuccess || nb_devices == 0) {
            printf("ERROR: No CUDA device found\n");
            return false;
        }

        // Select the first CUDA device as default
        err0 = cudaSetDevice(0);
        if (err0 != cudaSuccess) {
            printf("ERROR: Cannot set the chosen CUDA device\n");
            return false;
        }

        // Allocate pitched global memory so that every row is properly aligned
        float * ref_dev = NULL;
        float * query_dev = NULL;
        float * dist_dev = NULL;
        int   * index_dev = NULL;
        size_t  ref_pitch_in_bytes;
        size_t  query_pitch_in_bytes;
        size_t  dist_pitch_in_bytes;
        size_t  index_pitch_in_bytes;
        err0 = cudaMallocPitch((void**)&ref_dev, &ref_pitch_in_bytes, ref_nb   * size_of_float, dim);
        err1 = cudaMallocPitch((void**)&query_dev, &query_pitch_in_bytes, query_nb * size_of_float, dim);
        err2 = cudaMallocPitch((void**)&dist_dev, &dist_pitch_in_bytes, query_nb * size_of_float, ref_nb);
        err3 = cudaMallocPitch((void**)&index_dev, &index_pitch_in_bytes, query_nb * size_of_int, k);
        if (err0 != cudaSuccess || err1 != cudaSuccess || err2 != cudaSuccess || err3 != cudaSuccess) {
            printf("ERROR: Memory allocation error\n");
            knn_global_free_dev(ref_dev, query_dev, dist_dev, index_dev);
            return false;
        }

        // Deduce pitch values expressed in number of elements
        size_t ref_pitch = ref_pitch_in_bytes / size_of_float;
        size_t query_pitch = query_pitch_in_bytes / size_of_float;
        size_t dist_pitch = dist_pitch_in_bytes / size_of_float;
        size_t index_pitch = index_pitch_in_bytes / size_of_int;

        // The kernels below assume dist and index share the query pitch
        if (query_pitch != dist_pitch || query_pitch != index_pitch) {
            printf("ERROR: Invalid pitch value\n");
            knn_global_free_dev(ref_dev, query_dev, dist_dev, index_dev);
            return false;
        }

        // Copy reference and query data from the host to the device
        err0 = cudaMemcpy2D(ref_dev, ref_pitch_in_bytes, ref, ref_nb * size_of_float, ref_nb * size_of_float, dim, cudaMemcpyHostToDevice);
        err1 = cudaMemcpy2D(query_dev, query_pitch_in_bytes, query, query_nb * size_of_float, query_nb * size_of_float, dim, cudaMemcpyHostToDevice);
        if (err0 != cudaSuccess || err1 != cudaSuccess) {
            printf("ERROR: Unable to copy data from host to device\n");
            knn_global_free_dev(ref_dev, query_dev, dist_dev, index_dev);
            return false;
        }

        // Compute the squared Euclidean distances (grid rounded up to cover the tail)
        dim3 block0(BLOCK_DIM, BLOCK_DIM, 1);
        dim3 grid0(query_nb / BLOCK_DIM, ref_nb / BLOCK_DIM, 1);
        if (query_nb % BLOCK_DIM != 0) grid0.x += 1;
        if (ref_nb   % BLOCK_DIM != 0) grid0.y += 1;
        compute_distances << <grid0, block0 >> > (ref_dev, ref_nb, ref_pitch, query_dev, query_nb, query_pitch, dim, dist_dev);
        if (cudaGetLastError() != cudaSuccess) {
            printf("ERROR: Unable to execute kernel\n");
            knn_global_free_dev(ref_dev, query_dev, dist_dev, index_dev);
            return false;
        }

        // Sort the distances with their respective indexes (one thread per query)
        dim3 block1(256, 1, 1);
        dim3 grid1(query_nb / 256, 1, 1);
        if (query_nb % 256 != 0) grid1.x += 1;
        modified_insertion_sort << <grid1, block1 >> > (dist_dev, dist_pitch, index_dev, index_pitch, query_nb, ref_nb, k);
        if (cudaGetLastError() != cudaSuccess) {
            printf("ERROR: Unable to execute kernel\n");
            knn_global_free_dev(ref_dev, query_dev, dist_dev, index_dev);
            return false;
        }

        // Compute the square root of the k smallest distances
        dim3 block2(16, 16, 1);
        dim3 grid2(query_nb / 16, k / 16, 1);
        if (query_nb % 16 != 0) grid2.x += 1;
        if (k % 16 != 0)        grid2.y += 1;
        compute_sqrt << <grid2, block2 >> > (dist_dev, query_nb, query_pitch, k);
        if (cudaGetLastError() != cudaSuccess) {
            printf("ERROR: Unable to execute kernel\n");
            knn_global_free_dev(ref_dev, query_dev, dist_dev, index_dev);
            return false;
        }

        // Copy k smallest distances / indexes from the device to the host
        err0 = cudaMemcpy2D(knn_dist, query_nb * size_of_float, dist_dev, dist_pitch_in_bytes, query_nb * size_of_float, k, cudaMemcpyDeviceToHost);
        err1 = cudaMemcpy2D(knn_index, query_nb * size_of_int, index_dev, index_pitch_in_bytes, query_nb * size_of_int, k, cudaMemcpyDeviceToHost);
        if (err0 != cudaSuccess || err1 != cudaSuccess) {
            printf("ERROR: Unable to copy data from device to host\n");
            knn_global_free_dev(ref_dev, query_dev, dist_dev, index_dev);
            return false;
        }

        // Memory clean-up
        knn_global_free_dev(ref_dev, query_dev, dist_dev, index_dev);

        return true;
    }
    
    
    /**
     * Frees the device resources used by knn_cuda_texture.
     * cudaFree(NULL) is a documented no-op; the CUDA array and the texture
     * object are released only if they were actually created.
     */
    static void knn_texture_free_dev(float * query_dev, float * dist_dev, int * index_dev,
        cudaArray * ref_array_dev, cudaTextureObject_t ref_tex_dev) {
        cudaFree(query_dev);
        cudaFree(dist_dev);
        cudaFree(index_dev);
        if (ref_array_dev)
            cudaFreeArray(ref_array_dev);
        if (ref_tex_dev)
            cudaDestroyTextureObject(ref_tex_dev);
    }

    /**
     * Computes the k nearest neighbors of each query point, keeping the
     * reference points in texture memory (read through a texture object).
     *
     * @param ref        reference points: dim rows of ref_nb values (row-major)
     * @param ref_nb     number of reference points
     * @param query      query points: dim rows of query_nb values (row-major)
     * @param query_nb   number of query points
     * @param dim        dimension of the points
     * @param k          number of neighbors to find
     * @param knn_dist   output: k rows of query_nb distances to the k nearest neighbors
     * @param knn_index  output: k rows of query_nb indexes of the k nearest neighbors
     * @return true on success, false on any CUDA error (a message is printed)
     */
    bool knn_cuda_texture(const float * ref,
        int           ref_nb,
        const float * query,
        int           query_nb,
        int           dim,
        int           k,
        float *       knn_dist,
        int *         knn_index) {

        // Constants
        unsigned int size_of_float = sizeof(float);
        unsigned int size_of_int = sizeof(int);

        // Return variables
        cudaError_t err0, err1, err2;

        // Check that we have at least one CUDA device
        int nb_devices;
        err0 = cudaGetDeviceCount(&nb_devices);
        if (err0 != cudaSuccess || nb_devices == 0) {
            printf("ERROR: No CUDA device found\n");
            return false;
        }

        // Select the first CUDA device as default
        err0 = cudaSetDevice(0);
        if (err0 != cudaSuccess) {
            printf("ERROR: Cannot set the chosen CUDA device\n");
            return false;
        }

        // Device resources, declared up-front (NULL / 0 = not yet created) so
        // that the cleanup helper can be used from every error path
        float * query_dev = NULL;
        float * dist_dev = NULL;
        int *   index_dev = NULL;
        cudaArray *         ref_array_dev = NULL;
        cudaTextureObject_t ref_tex_dev = 0;

        // Allocate pitched global memory
        size_t  query_pitch_in_bytes;
        size_t  dist_pitch_in_bytes;
        size_t  index_pitch_in_bytes;
        err0 = cudaMallocPitch((void**)&query_dev, &query_pitch_in_bytes, query_nb * size_of_float, dim);
        err1 = cudaMallocPitch((void**)&dist_dev, &dist_pitch_in_bytes, query_nb * size_of_float, ref_nb);
        err2 = cudaMallocPitch((void**)&index_dev, &index_pitch_in_bytes, query_nb * size_of_int, k);
        if (err0 != cudaSuccess || err1 != cudaSuccess || err2 != cudaSuccess) {
            printf("ERROR: Memory allocation error (cudaMallocPitch)\n");
            knn_texture_free_dev(query_dev, dist_dev, index_dev, ref_array_dev, ref_tex_dev);
            return false;
        }

        // Deduce pitch values expressed in number of elements
        size_t query_pitch = query_pitch_in_bytes / size_of_float;
        size_t dist_pitch = dist_pitch_in_bytes / size_of_float;
        size_t index_pitch = index_pitch_in_bytes / size_of_int;

        // The kernels below assume dist and index share the query pitch
        if (query_pitch != dist_pitch || query_pitch != index_pitch) {
            printf("ERROR: Invalid pitch value\n");
            knn_texture_free_dev(query_dev, dist_dev, index_dev, ref_array_dev, ref_tex_dev);
            return false;
        }

        // Copy query data from the host to the device
        err0 = cudaMemcpy2D(query_dev, query_pitch_in_bytes, query, query_nb * size_of_float, query_nb * size_of_float, dim, cudaMemcpyHostToDevice);
        if (err0 != cudaSuccess) {
            printf("ERROR: Unable to copy data from host to device\n");
            knn_texture_free_dev(query_dev, dist_dev, index_dev, ref_array_dev, ref_tex_dev);
            return false;
        }

        // Allocate CUDA array for reference points (one 32-bit float channel)
        cudaChannelFormatDesc channel_desc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
        err0 = cudaMallocArray(&ref_array_dev, &channel_desc, ref_nb, dim);
        if (err0 != cudaSuccess) {
            printf("ERROR: Memory allocation error (cudaMallocArray)\n");
            knn_texture_free_dev(query_dev, dist_dev, index_dev, ref_array_dev, ref_tex_dev);
            return false;
        }

        // Copy reference points from host to device.
        // cudaMemcpy2DToArray replaces the deprecated cudaMemcpyToArray; the
        // source is contiguous, so spitch == row width == ref_nb * size_of_float.
        err0 = cudaMemcpy2DToArray(ref_array_dev, 0, 0, ref, ref_nb * size_of_float, ref_nb * size_of_float, dim, cudaMemcpyHostToDevice);
        if (err0 != cudaSuccess) {
            printf("ERROR: Unable to copy data from host to device\n");
            knn_texture_free_dev(query_dev, dist_dev, index_dev, ref_array_dev, ref_tex_dev);
            return false;
        }

        // Resource descriptor: texture backed by the CUDA array
        struct cudaResourceDesc res_desc;
        memset(&res_desc, 0, sizeof(res_desc));
        res_desc.resType = cudaResourceTypeArray;
        res_desc.res.array.array = ref_array_dev;

        // Texture descriptor: point sampling, unnormalized coordinates
        struct cudaTextureDesc tex_desc;
        memset(&tex_desc, 0, sizeof(tex_desc));
        tex_desc.addressMode[0] = cudaAddressModeClamp;
        tex_desc.addressMode[1] = cudaAddressModeClamp;
        tex_desc.filterMode = cudaFilterModePoint;
        tex_desc.readMode = cudaReadModeElementType;
        tex_desc.normalizedCoords = 0;

        // Create the texture
        err0 = cudaCreateTextureObject(&ref_tex_dev, &res_desc, &tex_desc, NULL);
        if (err0 != cudaSuccess) {
            printf("ERROR: Unable to create the texture\n");
            knn_texture_free_dev(query_dev, dist_dev, index_dev, ref_array_dev, ref_tex_dev);
            return false;
        }

        // Compute the squared Euclidean distances (grid rounded up to cover the tail)
        dim3 block0(16, 16, 1);
        dim3 grid0(query_nb / 16, ref_nb / 16, 1);
        if (query_nb % 16 != 0) grid0.x += 1;
        if (ref_nb % 16 != 0) grid0.y += 1;
        compute_distance_texture << <grid0, block0 >> > (ref_tex_dev, ref_nb, query_dev, query_nb, query_pitch, dim, dist_dev);
        if (cudaGetLastError() != cudaSuccess) {
            printf("ERROR: Unable to execute kernel\n");
            knn_texture_free_dev(query_dev, dist_dev, index_dev, ref_array_dev, ref_tex_dev);
            return false;
        }

        // Sort the distances with their respective indexes (one thread per query)
        dim3 block1(256, 1, 1);
        dim3 grid1(query_nb / 256, 1, 1);
        if (query_nb % 256 != 0) grid1.x += 1;
        modified_insertion_sort << <grid1, block1 >> > (dist_dev, dist_pitch, index_dev, index_pitch, query_nb, ref_nb, k);
        if (cudaGetLastError() != cudaSuccess) {
            printf("ERROR: Unable to execute kernel\n");
            knn_texture_free_dev(query_dev, dist_dev, index_dev, ref_array_dev, ref_tex_dev);
            return false;
        }

        // Compute the square root of the k smallest distances
        dim3 block2(16, 16, 1);
        dim3 grid2(query_nb / 16, k / 16, 1);
        if (query_nb % 16 != 0) grid2.x += 1;
        if (k % 16 != 0)        grid2.y += 1;
        compute_sqrt << <grid2, block2 >> > (dist_dev, query_nb, query_pitch, k);
        if (cudaGetLastError() != cudaSuccess) {
            printf("ERROR: Unable to execute kernel\n");
            knn_texture_free_dev(query_dev, dist_dev, index_dev, ref_array_dev, ref_tex_dev);
            return false;
        }

        // Copy k smallest distances / indexes from the device to the host
        err0 = cudaMemcpy2D(knn_dist, query_nb * size_of_float, dist_dev, dist_pitch_in_bytes, query_nb * size_of_float, k, cudaMemcpyDeviceToHost);
        err1 = cudaMemcpy2D(knn_index, query_nb * size_of_int, index_dev, index_pitch_in_bytes, query_nb * size_of_int, k, cudaMemcpyDeviceToHost);
        if (err0 != cudaSuccess || err1 != cudaSuccess) {
            printf("ERROR: Unable to copy data from device to host\n");
            knn_texture_free_dev(query_dev, dist_dev, index_dev, ref_array_dev, ref_tex_dev);
            return false;
        }

        // Memory clean-up
        knn_texture_free_dev(query_dev, dist_dev, index_dev, ref_array_dev, ref_tex_dev);

        return true;
    }
    
    
    /**
     * Frees the device buffers used by knn_cublas and shuts CUBLAS down.
     * cudaFree(NULL) is a documented no-op, so this helper is safe to call from
     * any error path, regardless of how many allocations succeeded.
     */
    static void knn_cublas_free_dev(float * ref_dev, float * query_dev, float * dist_dev,
        int * index_dev, float * ref_norm_dev, float * query_norm_dev) {
        cudaFree(ref_dev);
        cudaFree(query_dev);
        cudaFree(dist_dev);
        cudaFree(index_dev);
        cudaFree(ref_norm_dev);
        cudaFree(query_norm_dev);
        cublasShutdown();
    }

    /**
     * Computes the k nearest neighbors of each query point using the identity
     * ||q - r||^2 = ||q||^2 - 2 q.r + ||r||^2: the cross term is produced with
     * a single CUBLAS SGEMM, the norms with dedicated kernels.
     *
     * NOTE(review): this uses the legacy CUBLAS API (cublasInit / cublasSgemm
     * with char transpose flags / cublasGetError / cublasShutdown), which has
     * been removed from recent CUDA toolkits — verify against the installed
     * toolkit version; migrating to the cublas_v2 handle-based API would also
     * require changing the includes at the top of this file.
     *
     * @param ref        reference points: dim rows of ref_nb values (row-major)
     * @param ref_nb     number of reference points
     * @param query      query points: dim rows of query_nb values (row-major)
     * @param query_nb   number of query points
     * @param dim        dimension of the points
     * @param k          number of neighbors to find
     * @param knn_dist   output: k rows of query_nb distances to the k nearest neighbors
     * @param knn_index  output: k rows of query_nb indexes of the k nearest neighbors
     * @return true on success, false on any CUDA/CUBLAS error (a message is printed)
     */
    bool knn_cublas(const float * ref,
        int           ref_nb,
        const float * query,
        int           query_nb,
        int           dim,
        int           k,
        float *       knn_dist,
        int *         knn_index) {

        // Constants
        const unsigned int size_of_float = sizeof(float);
        const unsigned int size_of_int = sizeof(int);

        // Return variables
        cudaError_t  err0, err1, err2, err3, err4, err5;

        // Check that we have at least one CUDA device
        int nb_devices;
        err0 = cudaGetDeviceCount(&nb_devices);
        if (err0 != cudaSuccess || nb_devices == 0) {
            printf("ERROR: No CUDA device found\n");
            return false;
        }

        // Select the first CUDA device as default
        err0 = cudaSetDevice(0);
        if (err0 != cudaSuccess) {
            printf("ERROR: Cannot set the chosen CUDA device\n");
            return false;
        }

        // Initialize CUBLAS (legacy API)
        cublasInit();

        // Allocate global memory: pitched 2D buffers plus two linear norm vectors
        float * ref_dev = NULL;
        float * query_dev = NULL;
        float * dist_dev = NULL;
        int   * index_dev = NULL;
        float * ref_norm_dev = NULL;
        float * query_norm_dev = NULL;
        size_t  ref_pitch_in_bytes;
        size_t  query_pitch_in_bytes;
        size_t  dist_pitch_in_bytes;
        size_t  index_pitch_in_bytes;
        err0 = cudaMallocPitch((void**)&ref_dev, &ref_pitch_in_bytes, ref_nb   * size_of_float, dim);
        err1 = cudaMallocPitch((void**)&query_dev, &query_pitch_in_bytes, query_nb * size_of_float, dim);
        err2 = cudaMallocPitch((void**)&dist_dev, &dist_pitch_in_bytes, query_nb * size_of_float, ref_nb);
        err3 = cudaMallocPitch((void**)&index_dev, &index_pitch_in_bytes, query_nb * size_of_int, k);
        err4 = cudaMalloc((void**)&ref_norm_dev, ref_nb   * size_of_float);
        err5 = cudaMalloc((void**)&query_norm_dev, query_nb * size_of_float);
        if (err0 != cudaSuccess || err1 != cudaSuccess || err2 != cudaSuccess || err3 != cudaSuccess || err4 != cudaSuccess || err5 != cudaSuccess) {
            printf("ERROR: Memory allocation error\n");
            knn_cublas_free_dev(ref_dev, query_dev, dist_dev, index_dev, ref_norm_dev, query_norm_dev);
            return false;
        }

        // Deduce pitch values expressed in number of elements
        size_t ref_pitch = ref_pitch_in_bytes / size_of_float;
        size_t query_pitch = query_pitch_in_bytes / size_of_float;
        size_t dist_pitch = dist_pitch_in_bytes / size_of_float;
        size_t index_pitch = index_pitch_in_bytes / size_of_int;

        // The kernels below assume dist and index share the query pitch
        if (query_pitch != dist_pitch || query_pitch != index_pitch) {
            printf("ERROR: Invalid pitch value\n");
            knn_cublas_free_dev(ref_dev, query_dev, dist_dev, index_dev, ref_norm_dev, query_norm_dev);
            return false;
        }

        // Copy reference and query data from the host to the device
        err0 = cudaMemcpy2D(ref_dev, ref_pitch_in_bytes, ref, ref_nb * size_of_float, ref_nb * size_of_float, dim, cudaMemcpyHostToDevice);
        err1 = cudaMemcpy2D(query_dev, query_pitch_in_bytes, query, query_nb * size_of_float, query_nb * size_of_float, dim, cudaMemcpyHostToDevice);
        if (err0 != cudaSuccess || err1 != cudaSuccess) {
            printf("ERROR: Unable to copy data from host to device\n");
            knn_cublas_free_dev(ref_dev, query_dev, dist_dev, index_dev, ref_norm_dev, query_norm_dev);
            return false;
        }

        // Compute the squared norm of the reference points
        dim3 block0(256, 1, 1);
        dim3 grid0(ref_nb / 256, 1, 1);
        if (ref_nb % 256 != 0) grid0.x += 1;
        compute_squared_norm << <grid0, block0 >> > (ref_dev, ref_nb, ref_pitch, dim, ref_norm_dev);
        if (cudaGetLastError() != cudaSuccess) {
            printf("ERROR: Unable to execute kernel\n");
            knn_cublas_free_dev(ref_dev, query_dev, dist_dev, index_dev, ref_norm_dev, query_norm_dev);
            return false;
        }

        // Compute the squared norm of the query points
        dim3 block1(256, 1, 1);
        dim3 grid1(query_nb / 256, 1, 1);
        if (query_nb % 256 != 0) grid1.x += 1;
        compute_squared_norm << <grid1, block1 >> > (query_dev, query_nb, query_pitch, dim, query_norm_dev);
        if (cudaGetLastError() != cudaSuccess) {
            printf("ERROR: Unable to execute kernel\n");
            knn_cublas_free_dev(ref_dev, query_dev, dist_dev, index_dev, ref_norm_dev, query_norm_dev);
            return false;
        }

        // Computation of -2 * query * transpose(reference): fills dist_dev with
        // the cross term of the squared-distance expansion
        cublasSgemm('n', 't', (int)query_pitch, (int)ref_pitch, dim, (float)-2.0, query_dev, query_pitch, ref_dev, ref_pitch, (float)0.0, dist_dev, query_pitch);
        if (cublasGetError() != CUBLAS_STATUS_SUCCESS) {
            printf("ERROR: Unable to execute cublasSgemm\n");
            knn_cublas_free_dev(ref_dev, query_dev, dist_dev, index_dev, ref_norm_dev, query_norm_dev);
            return false;
        }

        // Add reference points norm (||r||^2 term, one value per row)
        dim3 block2(16, 16, 1);
        dim3 grid2(query_nb / 16, ref_nb / 16, 1);
        if (query_nb % 16 != 0) grid2.x += 1;
        if (ref_nb % 16 != 0) grid2.y += 1;
        add_reference_points_norm << <grid2, block2 >> > (dist_dev, query_nb, dist_pitch, ref_nb, ref_norm_dev);
        if (cudaGetLastError() != cudaSuccess) {
            printf("ERROR: Unable to execute kernel\n");
            knn_cublas_free_dev(ref_dev, query_dev, dist_dev, index_dev, ref_norm_dev, query_norm_dev);
            return false;
        }

        // Sort each column (the missing ||q||^2 term is constant per column, so
        // it does not change the ordering and can be added after the sort)
        modified_insertion_sort << <grid1, block1 >> > (dist_dev, dist_pitch, index_dev, index_pitch, query_nb, ref_nb, k);
        if (cudaGetLastError() != cudaSuccess) {
            printf("ERROR: Unable to execute kernel\n");
            knn_cublas_free_dev(ref_dev, query_dev, dist_dev, index_dev, ref_norm_dev, query_norm_dev);
            return false;
        }

        // Add query norm and compute the square root of the k first elements
        dim3 block3(16, 16, 1);
        dim3 grid3(query_nb / 16, k / 16, 1);
        if (query_nb % 16 != 0) grid3.x += 1;
        if (k % 16 != 0) grid3.y += 1;
        add_query_points_norm_and_sqrt << <grid3, block3 >> > (dist_dev, query_nb, dist_pitch, k, query_norm_dev);
        if (cudaGetLastError() != cudaSuccess) {
            printf("ERROR: Unable to execute kernel\n");
            knn_cublas_free_dev(ref_dev, query_dev, dist_dev, index_dev, ref_norm_dev, query_norm_dev);
            return false;
        }

        // Copy k smallest distances / indexes from the device to the host
        err0 = cudaMemcpy2D(knn_dist, query_nb * size_of_float, dist_dev, dist_pitch_in_bytes, query_nb * size_of_float, k, cudaMemcpyDeviceToHost);
        err1 = cudaMemcpy2D(knn_index, query_nb * size_of_int, index_dev, index_pitch_in_bytes, query_nb * size_of_int, k, cudaMemcpyDeviceToHost);
        if (err0 != cudaSuccess || err1 != cudaSuccess) {
            printf("ERROR: Unable to copy data from device to host\n");
            knn_cublas_free_dev(ref_dev, query_dev, dist_dev, index_dev, ref_norm_dev, query_norm_dev);
            return false;
        }

        // Memory clean-up and CUBLAS shutdown
        knn_cublas_free_dev(ref_dev, query_dev, dist_dev, index_dev, ref_norm_dev, query_norm_dev);

        return true;
    }
    

    相关文章

      网友评论

          本文标题:CUDA C初学者编程 (VS2017)

          本文链接:https://www.haomeiwen.com/subject/hhozzftx.html