Introduction to CUDA C/C++

Author: 松鼠伴石 | Published 2019-05-30 23:07

1. Hello World

The CPU and its memory are called the host (host memory), while the GPU and its memory are called the device (device memory). Let's look at a sample program from NVIDIA's official materials:

#include <iostream>
#include <algorithm>

using namespace std;

#define N          1024
#define RADIUS     3
#define BLOCK_SIZE 16

// parallel function (kernel, runs on the device)
__global__ void stencil_1d(int *in, int *out) {
    __shared__ int temp[BLOCK_SIZE + 2 * RADIUS];
    int gindex = threadIdx.x + blockIdx.x * blockDim.x;
    int lindex = threadIdx.x + RADIUS;

    // Read input elements into shared memory
    temp[lindex] = in[gindex];
    if (threadIdx.x < RADIUS) {
        temp[lindex - RADIUS] = in[gindex - RADIUS];
        temp[lindex + BLOCK_SIZE] = in[gindex + BLOCK_SIZE];
    }

    // Synchronize (ensure all the data is available)
    __syncthreads();

    // Apply the stencil
    int result = 0;
    for (int offset = -RADIUS ; offset <= RADIUS ; offset++)
        result += temp[lindex + offset];

    // Store the result
    out[gindex] = result;
}

void fill_ints(int *x, int n) {
    fill_n(x, n, 1);
}
// Host code (copies data to the GPU, launches the parallel computation, copies results back)
int main(void) {
    int *in, *out;              // host copies of in, out
    int *d_in, *d_out;          // device copies of in, out
    int size = (N + 2*RADIUS) * sizeof(int);

    // Alloc space for host copies and setup values
    in  = (int *)malloc(size); fill_ints(in,  N + 2*RADIUS);
    out = (int *)malloc(size); fill_ints(out, N + 2*RADIUS);
    
    // Alloc space for device copies
    cudaMalloc((void **)&d_in,  size);
    cudaMalloc((void **)&d_out, size);

    // Copy to device
    cudaMemcpy(d_in,  in,  size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_out, out, size, cudaMemcpyHostToDevice);

    // Launch stencil_1d() kernel on GPU
    stencil_1d<<<N/BLOCK_SIZE,BLOCK_SIZE>>>(d_in + RADIUS, d_out + RADIUS);

    // Copy result back to host
    cudaMemcpy(out, d_out, size, cudaMemcpyDeviceToHost);

    // Cleanup
    free(in); free(out);
    cudaFree(d_in); cudaFree(d_out);
    return 0;
}
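
This program is built with NVIDIA's nvcc compiler, e.g. nvcc stencil_1d.cu -o stencil_1d (the file name here is just an assumption); nvcc separates the source into host code, compiled by the system C++ compiler, and device code, compiled for the GPU.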

The following three figures illustrate the overall flow of the code above:

(Figures: Simple Processing Flow 1, Simple Processing Flow 2, Simple Processing Flow 3)

Compare the C hello world with the CUDA hello world:

// C
#include <stdio.h>

int main(void) {
    printf("Hello World!\n");
    return 0;
}

// CUDA
#include <cstdlib>
#include <cstdio>
#include <cuda.h>

__global__ void mykernel(void) {

}

int main(void) {
    mykernel<<<1,1>>>();
    printf("Hello World!\n");
    return 0;
}
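
The kernel above is empty, so the greeting still comes from the host. As a small variant sketch (my own addition, not from the original), the device itself can print, as long as the host waits for the kernel to finish, since kernel launches are asynchronous:

#include <cstdio>
#include <cuda.h>

__global__ void mykernel(void) {
    printf("Hello World from the GPU!\n"); // device-side printf
}

int main(void) {
    mykernel<<<1,1>>>();
    cudaDeviceSynchronize(); // wait for the kernel before the program exits
    return 0;
}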

In CUDA, device code uses function parameters just as C does. For example:

__global__ void add(int *a, int *b, int *c) {
    *c = *a + *b;
}

Memory Management

CUDA provides analogues of C's malloc(), free(), and memcpy(): cudaMalloc(), cudaFree(), and cudaMemcpy().

int main(void) {
    int a, b, c;              // host copies of a, b, c
    int *d_a, *d_b, *d_c;     // device copies of a, b, c
    int size = sizeof(int);

    // Allocate space for device copies of a, b, c
    cudaMalloc((void **)&d_a, size);
    cudaMalloc((void **)&d_b, size);
    cudaMalloc((void **)&d_c, size);

    // Setup input values
    a = 2;
    b = 7;

    // Copy inputs to device
    cudaMemcpy(d_a, &a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, &b, size, cudaMemcpyHostToDevice);

    // Launch add() kernel on GPU
    add<<<1,1>>>(d_a, d_b, d_c);

    // Copy result back to host
    cudaMemcpy(&c, d_c, size, cudaMemcpyDeviceToHost);

    // Cleanup
    cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
    return 0;
}
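
As an aside not covered above: CUDA toolkits from version 6 onward also offer cudaMallocManaged(), which allocates unified memory accessible from both host and device and removes the explicit cudaMemcpy calls. A minimal sketch of the same addition under that assumption:

#include <cstdio>

__global__ void add(int *a, int *b, int *c) {
    *c = *a + *b;
}

int main(void) {
    int *a, *b, *c; // accessible from both host and device
    cudaMallocManaged(&a, sizeof(int));
    cudaMallocManaged(&b, sizeof(int));
    cudaMallocManaged(&c, sizeof(int));
    *a = 2; *b = 7;
    add<<<1,1>>>(a, b, c);
    cudaDeviceSynchronize(); // required before the host reads the result
    printf("%d\n", *c);      // prints 9
    cudaFree(a); cudaFree(b); cudaFree(c);
    return 0;
}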

2. CUDA Blocks

To run the computation in parallel on the GPU, change add<<<1,1>>>() to add<<<N,1>>>(). A simple vector-addition example:

__global__ void add(int *a, int *b, int *c) {
    c[blockIdx.x] = a[blockIdx.x] + b[blockIdx.x];
}
// blockIdx.x lets each block handle a different element

The block computation is illustrated below:

(Figure: Block computation)

Sample code for parallel vector addition (random_ints(), which fills an array with random integers, is defined at the end of Section 7):

#define N 512
int main(void) {
    int *a, *b, *c;     // host copies of a, b, c
    int *d_a, *d_b, *d_c;   // device copies of a, b, c
    int size = N * sizeof(int);
        
    // Alloc space for device copies of a, b, c
    cudaMalloc((void **)&d_a, size);
    cudaMalloc((void **)&d_b, size);
    cudaMalloc((void **)&d_c, size);

    // Alloc space for host copies of a, b, c and setup input values
    a = (int *)malloc(size); random_ints(a, N);
    b = (int *)malloc(size); random_ints(b, N);
    c = (int *)malloc(size);
    // Copy inputs to device
    cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice);

    // Launch add() kernel on GPU with N blocks
    add<<<N,1>>>(d_a, d_b, d_c);

    // Copy result back to host
    cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost);

    // Cleanup
    free(a); free(b); free(c);
    cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
    return 0;
}

3. CUDA Threads

Analogous to blockIdx.x above, there is threadIdx.x:

__global__ void add(int *a, int *b, int *c) {
    c[threadIdx.x] = a[threadIdx.x] + b[threadIdx.x];
}

And the corresponding vector-addition code using threads:

#define N 512
int main(void) {
    int *a, *b, *c;           // host copies of a, b, c
    int *d_a, *d_b, *d_c;     // device copies of a, b, c
    int size = N * sizeof(int);

    // Alloc space for device copies of a, b, c
    cudaMalloc((void **)&d_a, size);
    cudaMalloc((void **)&d_b, size);
    cudaMalloc((void **)&d_c, size);

    // Alloc space for host copies of a, b, c and setup input values
    a = (int *)malloc(size); random_ints(a, N);
    b = (int *)malloc(size); random_ints(b, N);
    c = (int *)malloc(size);

    // Copy inputs to device
    cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice);

    // Launch add() kernel on GPU with N threads
    add<<<1,N>>>(d_a, d_b, d_c);

    // Copy result back to host
    cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost);

    // Cleanup
    free(a); free(b); free(c);
    cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
    return 0;
}

Clearly, the only difference is the launch configuration: add<<<N,1>>> versus add<<<1,N>>>.
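
The practical difference matters, though: threads within a block can share memory and synchronize with each other, while blocks cannot. A quick way to see both indices (an illustrative sketch, not from the original) is a kernel that prints them:

#include <cstdio>

__global__ void whoami(void) {
    printf("block %d, thread %d\n", blockIdx.x, threadIdx.x);
}

int main(void) {
    whoami<<<2,3>>>();       // 2 blocks of 3 threads each: 6 lines of output
    cudaDeviceSynchronize(); // flush device-side printf before exiting
    return 0;                // note: output order is not deterministic
}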

4. Combining Blocks and Threads

First, consider the figure below:

(Figure: Blocks and Threads)

With M threads per block, a thread in block blockIdx.x gets the global index:

int index = threadIdx.x + blockIdx.x * M;

The add kernel and main function can then be written as follows:
__global__ void add(int *a, int *b, int *c) {
    int index = threadIdx.x + blockIdx.x * blockDim.x;
    c[index] = a[index] + b[index];
}


#define N (2048*2048)
#define THREADS_PER_BLOCK 512
int main(void) {
    int *a, *b, *c;         // host copies of a, b, c
    int *d_a, *d_b, *d_c;       // device copies of a, b, c
    int size = N * sizeof(int);

    // Alloc space for device copies of a, b, c
    cudaMalloc((void **)&d_a, size);
    cudaMalloc((void **)&d_b, size);
    cudaMalloc((void **)&d_c, size);

    // Alloc space for host copies of a, b, c and setup input values
    a = (int *)malloc(size); random_ints(a, N);
    b = (int *)malloc(size); random_ints(b, N);
    c = (int *)malloc(size);
    // Copy inputs to device
    cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice);

    // Launch add() kernel on GPU
    add<<<N/THREADS_PER_BLOCK,THREADS_PER_BLOCK>>>(d_a, d_b, d_c);

    // Copy result back to host
    cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost);

    // Cleanup
    free(a); free(b); free(c);
    cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
    return 0;
}

In general, the array length is not necessarily a multiple of blocks × threads. The add kernel and launch code above can be modified as follows:

__global__ void add(int *a, int *b, int *c, int n) {
    int index = threadIdx.x + blockIdx.x * blockDim.x;
    if (index < n)
        c[index] = a[index] + b[index];
}

add<<<(N + M - 1) / M, M>>>(d_a, d_b, d_c, N);
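
Here M is the number of threads per block (THREADS_PER_BLOCK above). (N + M - 1) / M is integer division that rounds the block count up, so every element is covered, and the index < n guard keeps the surplus threads in the last block from reading or writing out of bounds.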

5. Cooperating Threads

Consider the following problem: given a radius, apply a 1D stencil to an input array to produce an output array.
With a radius of 3, each output element is the sum of the corresponding input element and its neighbors within distance 3, i.e. 7 input elements in total; consequently, each input element is read up to 7 times during the computation.
Sharing data between threads in CUDA:

  • Within a block, threads share data via shared memory;
  • Shared memory is extremely fast, user-managed on-chip memory;
  • It is declared with the __shared__ qualifier;
  • It is not visible to threads in other blocks, and must be managed explicitly.

__global__ void stencil_1d(int *in, int *out) {
  __shared__ int temp[BLOCK_SIZE + 2 * RADIUS]; // the first bar in the figure below
  int gindex = threadIdx.x + blockIdx.x * blockDim.x;
  int lindex = threadIdx.x + RADIUS;

  // Read input elements into shared memory
  temp[lindex] = in[gindex];  // the second bar in the figure below
  if (threadIdx.x < RADIUS) {
    temp[lindex - RADIUS] = in[gindex - RADIUS];         // the third bar
    temp[lindex + BLOCK_SIZE] = in[gindex + BLOCK_SIZE]; // the fourth bar
  }
  // Synchronize (ensure all the data is available)
  __syncthreads();

  // Apply the stencil
  int result = 0;
  for (int offset = -RADIUS ; offset <= RADIUS ; offset++)
    result += temp[lindex + offset];

  // Store the result
  out[gindex] = result;
}

(Figure: stencil kernel)

In the code above, __syncthreads() synchronizes all threads within a block.
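
Note that __syncthreads() must be reached by every thread in the block; calling it inside a conditional that only some threads take leads to undefined behavior. In the stencil kernel above, it is correctly placed outside the if.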

6. Synchronizing the Host and Device
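
A kernel launch is asynchronous: control returns to the host immediately. cudaMemcpy() blocks the host until the copy has finished, while cudaDeviceSynchronize() blocks the host until all previously issued device work is complete.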

Error reporting: every CUDA API call returns an error code (cudaError_t):

// Get a human-readable string for a CUDA error code
const char *cudaGetErrorString(cudaError_t error);
printf("%s\n", cudaGetErrorString(cudaGetLastError()));

Some other commonly used functions for managing the device:

cudaGetDeviceCount(int *count)                            // number of CUDA devices
cudaSetDevice(int device)                                 // select the active device
cudaGetDevice(int *device)                                // query the active device
cudaGetDeviceProperties(cudaDeviceProp *prop, int device) // query device properties
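
For example, a short sketch that enumerates the devices and prints a few cudaDeviceProp fields (name, multiProcessorCount, totalGlobalMem):

#include <cstdio>

int main(void) {
    int count = 0;
    cudaGetDeviceCount(&count);
    for (int i = 0; i < count; i++) {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, i);
        printf("Device %d: %s, %d SMs, %.1f GiB global memory\n",
               i, prop.name, prop.multiProcessorCount,
               prop.totalGlobalMem / (1024.0 * 1024.0 * 1024.0));
    }
    return 0;
}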

7. A CUDA Version of Bubble Sort

The parallel variant below is odd-even transposition sort: odd phases compare-and-swap the pairs (0,1), (2,3), …, even phases the pairs (1,2), (3,4), …; after at most N phases the array is sorted.

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
 
#include <stdio.h>
#include <stdlib.h>
 
#define N 400
void random_ints(int *);
 
__global__ void myKernel(int *d_a)
{
    __shared__ int s_a[N]; // shared-memory copy of the array
    int tid = threadIdx.x;
    s_a[tid] = d_a[tid];   // each thread loads its element into shared memory
    __syncthreads();       // synchronize threads

    for (int i = 1; i <= N; i++) { // at most N phases are needed

        if (i % 2 == 1 && (2 * tid + 1) < N) { // odd phase: pairs (0,1), (2,3), ...
            if (s_a[2 * tid] > s_a[2 * tid + 1]) {
                int temp = s_a[2 * tid];
                s_a[2 * tid] = s_a[2 * tid + 1];
                s_a[2 * tid + 1] = temp;
            }
        }
        __syncthreads(); // synchronize threads
        if (i % 2 == 0 && (2 * tid + 2) < N) { // even phase: pairs (1,2), (3,4), ...
            if (s_a[2 * tid + 1] > s_a[2 * tid + 2]) {
                int temp = s_a[2 * tid + 1];
                s_a[2 * tid + 1] = s_a[2 * tid + 2];
                s_a[2 * tid + 2] = temp;
            }
        }
        __syncthreads(); // synchronize threads
    }
    d_a[tid] = s_a[tid]; // write the sorted result back to global memory
}
 
int main()
{
    // Declare variables
    int *a, *d_a;
    int size = N * sizeof(int);

    // Allocate host memory
    a = (int *)malloc(size);
    // Initialize the array to be sorted
    random_ints(a);

    // Allocate device memory
    cudaMalloc((void **)&d_a, size);

    // Copy data from host to device
    cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice);

    // Launch the kernel
    myKernel<<<1, N>>>(d_a);

    // Copy data from device back to host
    cudaMemcpy(a, d_a, size, cudaMemcpyDeviceToHost);

    // Print the sorted result
    for (int i = 0; i < N; i++) {
        printf("%d ", a[i]);
    }

    // Free memory
    free(a);
    cudaFree(d_a);
    return 0;
}
void random_ints(int *a)
{
    if (!a) { // guard against a null pointer
        return;
    }
    for (int i = 0; i < N; i++) {
        a[i] = rand() % N; // random integers in [0, N)
    }
}