一、Hello Wold
CPU和CPU内存被称为host memory,GPU和显存被称为device memory,下面我们来看英伟达官网给出的一段代码:
#include <iostream>
#include <algorithm>
using namespace std;
#define N 1024
#define RADIUS 3
#define BLOCK_SIZE 16
// parallel fn(并行函数)
__global__ void stencil_1d(int *in, int *out) {
__shared__ int temp[BLOCK_SIZE + 2 * RADIUS];
int gindex = threadIdx.x + blockIdx.x * blockDim.x;
int lindex = threadIdx.x + RADIUS;
// Read input elements into shared memory
temp[lindex] = in[gindex];
if (threadIdx.x < RADIUS) {
temp[lindex - RADIUS] = in[gindex - RADIUS];
temp[lindex + BLOCK_SIZE] = in[gindex + BLOCK_SIZE];
}
// Synchronize (ensure all the data is available)
__syncthreads();
// Apply the stencil
int result = 0;
for (int offset = -RADIUS ; offset <= RADIUS ; offset++)
result += temp[lindex + offset];
// Store the result
out[gindex] = result;
}
void fill_ints(int *x, int n) {
fill_n(x, n, 1);
}
// CPU代码(包括CPU数据拷贝到GPU,并行计算,GPU数据拷贝到CPU)
int main(void) {
int *in, *out; // host copies of a, b, c
int *d_in, *d_out; // device copies of a, b, c
int size = (N + 2*RADIUS) * sizeof(int);
// Alloc space for host copies and setup values
in = (int *)malloc(size); fill_ints(in, N + 2*RADIUS);
out = (int *)malloc(size); fill_ints(out, N + 2*RADIUS);
// Alloc space for device copies
cudaMalloc((void **)&d_in, size);
cudaMalloc((void **)&d_out, size);
// Copy to device
cudaMemcpy(d_in, in, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_out, out, size, cudaMemcpyHostToDevice);
// Launch stencil_1d() kernel on GPU
stencil_1d<<<N/BLOCK_SIZE,BLOCK_SIZE>>>(d_in + RADIUS, d_out + RADIUS);
// Copy result back to host
cudaMemcpy(out, d_out, size, cudaMemcpyDeviceToHost);
// Cleanup
free(in); free(out);
cudaFree(d_in); cudaFree(d_out);
return 0;
}
下面三张是表示上面代码的整体过程:
Simple Processing Flow1.png Simple Processing Flow2.png Simple Procesing Flow3.png比较下C的hello world和cuda的hello world
// C
#include <stdio.h>
int main(void) {
printf("Hello World!\n");
return 0;
}
//Cuda
#include <cstdlib>
#include <cstdio>
#include <cuda.h>
__global__ void mykernel(void) {
}
int main(void) {
mykernel<<<1,1>>> ();
printf("Hello World!\n");
return 0;
}
CUDA, device代码中参数的使用方式是和C类似的,例如:
__global__ void add(int *a, int *b, int *c) {
*c = *a + *b;
}
内存管理
cuda中和c类似的malloc(),free(),memcpy()有cudaMalloc(),cudaFree(),cudaMemory()
int main(void) {
int a, b, c; // host copies of a, b, c
int *d_a, *d_b, *d_c; // device copies of a, b, c
int size = sizeof(int);
// Allocate space for device copies of a, b, c
cudaMalloc((void **)&d_a, size);
cudaMalloc((void **)&d_b, size);
cudaMalloc((void **)&d_c, size);
// Setup input values
a = 2;
b = 7;
// Copy inputs to device
cudaMemcpy(d_a, &a, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_b, &b, size, cudaMemcpyHostToDevice);
// Launch add() kernel on GPU
add<<<1,1>>>(d_a, d_b, d_c);
// Copy result back to host
cudaMemcpy(&c, d_c, size, cudaMemcpyDeviceToHost);
// Cleanup
cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
return 0;
}
二、CUDA Block
GPU可以将add<<<1,1>>>() 修改为add<<<N, 1>>>()来并行计算代码,对向量的加法的简单例子:
__global__ void add(int *a, int *b, int *c) {
c[blockIdx.x] = a[blockIdx.x] + b[blockIdx.x];
}
//使用blockIdx.x可以使得不同block处理不同的index
block运算如下图.
Block运算.png
向量加法的并行运算示例代码:
#define N 512
int main(void) {
int *a, *b, *c; // host copies of a, b, c
int *d_a, *d_b, *d_c; // device copies of a, b, c
int size = N * sizeof(int);
// Alloc space for device copies of a, b, c
cudaMalloc((void **)&d_a, size);
cudaMalloc((void **)&d_b, size);
cudaMalloc((void **)&d_c, size);
// Alloc space for host copies of a, b, c and setup input values
a = (int *)malloc(size); random_ints(a, N);
b = (int *)malloc(size); random_ints(b, N);
c = (int *)malloc(size);
// Copy inputs to device
cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice);
// Launch add() kernel on GPU with N blocks
add<<<N,1>>>(d_a, d_b, d_c);
// Copy result back to host
cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost);
// Cleanup
free(a); free(b); free(c);
cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
return 0;
}
三、CUDA Thread
类似的和上面的blockIdx.x类似的,有
__global__ void add(int *a, int *b, int *c) {
c[threadIdx.x] = a[threadIdx.x] + b[threadIdx.x];
}
类似的也有使用线程向量加法的代码:
#define N 512
int main(void) {
int *a, *b, *c; // host copies of a, b, c
int *d_a, *d_b, *d_c; // device copies of a, b, c
int size = N * sizeof(int);
// Alloc space for device copies of a, b, c
cudaMalloc((void **)&d_a, size);
cudaMalloc((void **)&d_b, size);
cudaMalloc((void **)&d_c, size);
// Alloc space for host copies of a, b, c and setup input values
a = (int *)malloc(size); random_ints(a, N);
b = (int *)malloc(size); random_ints(b, N);
c = (int *)malloc(size);
// Copy inputs to device
cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice);
// Launch add() kernel on GPU with N threads
add<<<1,N>>>(d_a, d_b, d_c);
// Copy result back to host
cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost);
// Cleanup
free(a); free(b); free(c);
cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
return 0;
}
可以很明显的看出,唯一的不同其实就是add<<<N,1>>>和add<<<1,N>>>的区别.
四、Blocks和Threads的结合
先来看一张图
对于第M个blockIdx的threadInx可以到新的index:
int index = threadIdx.x + blockIdx.x * M
分别可以写出它们的add函数和main函数
__global__ void add(int *a, int *b, int *c) {
int index = threadIdx.x + blockIdx.x * blockDim.x;
c[index] = a[index] + b[index];
}
#define N (2048*2048)
#define THREADS_PER_BLOCK 512
int main(void) {
int *a, *b, *c; // host copies of a, b, c
int *d_a, *d_b, *d_c; // device copies of a, b, c
int size = N * sizeof(int);
// Alloc space for device copies of a, b, c
cudaMalloc((void **)&d_a, size);
cudaMalloc((void **)&d_b, size);
cudaMalloc((void **)&d_c, size);
// Alloc space for host copies of a, b, c and setup input values
a = (int *)malloc(size); random_ints(a, N);
b = (int *)malloc(size); random_ints(b, N);
c = (int *)malloc(size);
// Copy inputs to device
cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice);
// Launch add() kernel on GPU
add<<<N/THREADS_PER_BLOCK,THREADS_PER_BLOCK>>>(d_a, d_b, d_c);
// Copy result back to host
cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost);
// Cleanup
free(a); free(b); free(c);
cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
return 0;
}
在通常情况下,数组长度并不一定等于blocks* threads,上面的add函数和kernel启动代码可以修改如下
__global__ void add(int *a, int *b, int *c, int n) {
int index = threadIdx.x + blockIdx.x * blockDim.x;
if (index < n)
c[index] = a[index] + b[index];
}
add <<<(N + M -1) / M , M>>>(d_a, d_b, d_c, N)
五、Cooperation Threads(线程间的协作)
考虑这么个问题,指定半径radius,将一个1维的stencil(数组环)中的值变成另外一个一维数组;
如果半径是3,一维数组的每个数都是stencil中数和相邻距离小于或等于3的总共7个数的和,在转变过程中,stencil中每个数都会被访问7次.
CUDA中线程数据:
- 在同一个block,threads通过shared memory来共享数据;
- Extremely fast on-chip memory, user-managed (?)
- 使用_shared_来声明shared memory
- 对于同一个thread的其他block数据是不可见的.这部分内存都是需要单独管理的.
__global__ void stencil_1d(int *in, int *out) {
__shared__ int temp[BLOCK_SIZE + 2 * RADIUS]; //对应下图中的第一个长条
int gindex = threadIdx.x + blockIdx.x * blockDim.x;
int lindex = threadIdx.x + RADIUS;
// Read input elements into shared memory
temp[lindex] = in[gindex]; //对应下图中的第二个长条
if (threadIdx.x < RADIUS) {
temp[lindex - RADIUS] = in[gindex - RADIUS]; //对应下图中的第三个长条
temp[lindex + BLOCK_SIZE] = in[gindex + BLOCK_SIZE]; //对应下图中的第四个长条
}
// Synchronize (ensure all the data is available)
__syncthreads();
// Apply the stencil
int result = 0;
for (int offset = -RADIUS ; offset <= RADIUS ; offset++)
result += temp[lindex + offset];
// Store the result
out[gindex] = result;
}
stencil kernel.png
上述操作过程中__syncthreds()适用于同步一个block的所有线程.
六、同步host和device
CUDA所有错误都会返回cuda错误码(cudaError_t)
// 获取cuda error string
char *cudaGetErrorString(cudaError_t)
printf("%s\n", cudaGetErrorString(cudaGetLastError()));
另外还有一些常用的方法:
cudaGetDeviceCount(int *count)
cudaSetDevice(int device)
cudaGetDevice(int *device)
cudaGetDeviceProperties(cudaDeviceProp *prop, int device)
七、冒泡排序的CUDA版本
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#define N 400
void random_ints(int *);
__global__ void myKernel(int *d_a)
{
__shared__ int s_a[N]; //定义共享变量
int tid = threadIdx.x;
s_a[tid] = d_a[tid]; //每个线程搬运对应的数据到共享内存中
__syncthreads(); //线程同步
for (int i = 1; i <= N; i++) { //最多N次排序完成
if (i % 2 == 1 && (2 * tid + 1) < N) { //奇数步
if (s_a[2 * tid] > s_a[2 * tid + 1]) {
int temp = s_a[2 * tid];
s_a[2 * tid] = s_a[2 * tid+1];
s_a[2 * tid + 1] = temp;
}
}
__syncthreads(); //线程同步
if (i % 2 == 0 && (2 * tid + 2) < N ) { //偶数步
if (s_a[2 * tid+1] > s_a[2 * tid + 2]) {
int temp = s_a[2 * tid+1];
s_a[2 * tid+1] = s_a[2 * tid + 2];
s_a[2 * tid + 2] = temp;
}
}
__syncthreads(); //线程同步
}
d_a[tid] = s_a[tid]; //将排序结果搬回到Global Memory
}
int main()
{
//定义变量
int *a,*d_a;
int size = N * sizeof(int);
//Host端变量分配内存
a = (int *)malloc(size);
//初始化待排序数组
random_ints(a);
//Device端变量分配内存
cudaMalloc((void **)&d_a, size);
//将数据从Host端拷贝到Device端
cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice);
//调用核函数
myKernel<<<1, N >>> (d_a);
//将数据从Device端拷贝到Host端
cudaMemcpy(a, d_a, size, cudaMemcpyDeviceToHost);
//打印排序结果
for (int i = 0; i < N; i++) {
printf("%d ", a[i]);
}
//释放内存
free(a);
cudaFree(d_a);
return 0;
}
void random_ints(int *a)
{
if (!a) { //异常判断
return;
}
for (int i = 0; i < N; i++) {
a[i] = rand() % N; //产生0-N之间的随机整数
}
网友评论