2020-10-23
运行环境:VS2015 + CUDA 8.0
一.创建工程
1. 打开VS2015,选择菜单栏中的 “文件” --> "新建" --> "项目"

2. 选择 “NVIDIA” --> "CUDA 8.0" ,点击 ”确定“

3.工程创建成功。
4.在 kernel.cu 中, main() 函数中 添加 getchar()
(使程序停住)

5.保存并编译运行程序,结果图如下:
substitute 代替品,替补,代替,取代
二.线程的定义与描述
1.一维线程的定义与描述

代码如下:
__global__ void addKernel(int *c, const int *a, const int *b)
{
int i = threadIdx.x;
c[i] = a[i] + b[i];
}

代码如下:
// Launch a kernel on the GPU with one thread for each element.
addKernel <<<1, size >>>(dev_c, dev_a, dev_b);
2.二维线程的定义与描述

代码如下:
__global__ void addKernel(int *c, const int *a, const int *b)
{
int i = threadIdx + blockIdx.x*blockDim.x;
c[i] = a[i] + b[i];
}

代码如下:(#define dataSize 100000)
//二维
int blockSize = 512;
int blockNum = ceil(dataSize / double(blockSize));//ceil函数作用是把一个小数向上取整
dim3 dimBlock(blockSize,1,1);
dim3 dimGrid(blockNum, 1, 1);
addKernel <<<dimGrid, dimBlock >>>(dev_c, dev_a, dev_b);
三.程序测试:
1.a和b两数组对加10万次

代码如下:(添加头文件 #include "time.h")
//add
clock_t startTime = clock();
for (int i = 0; i < 100000; i++) {
cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
}
clock_t endTime = clock();
printf("Add Time = %0.1f s\n", (endTime - startTime) / 1000.0f);
2.a数组中的数据拷贝10万次

代码如下:
//copy
// Copy input vectors from host memory to GPU buffers.
clock_t startTime = clock();
for (int i = 0; i < 100000; i++) {
cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
}
clock_t endTime = clock();
printf("Copy Time = %0.1f s\n", (endTime - startTime) / 1000.0f);
3.SM的资源
在main ()函数中添加如下代码,查看本机SM的资源。

测试结果如下:

4.a和b数组内10万个随机数对加,分别在CPU和GPU中计算
代码如下:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include "time.h"
#include <math.h>
__global__ void add_in_parallel(int *array_a, int *array_b, int *array_c)
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
array_c[tid] = array_a[tid] + array_b[tid];
}
int main()
{
// --------------------------------------------
printf("Begin...\n");
int arraysize = 100000;
int *a_host;
int *b_host;
int *c_host;
int *devresult_host;
a_host = (int *)malloc(arraysize * sizeof(int));
b_host = (int *)malloc(arraysize * sizeof(int));
c_host = (int *)malloc(arraysize * sizeof(int));
devresult_host = (int *)malloc(arraysize * sizeof(int));
for (int i = 0; i < arraysize; i++)
{
a_host[i] = i;
b_host[i] = i;
}
// ---------------------------------------------
printf("Allocating device memory...\n");
int *a_dev;
int *b_dev;
int *c_dev;
cudaMalloc((void**)&a_dev, arraysize * sizeof(int));
cudaMalloc((void**)&b_dev, arraysize * sizeof(int));
cudaMalloc((void**)&c_dev, arraysize * sizeof(int));
// ----------------------------------------------
cudaEvent_t start, stop;
float time_from_host_to_dev;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
cudaMemcpy(a_dev, a_host, arraysize * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(b_dev, b_host, arraysize * sizeof(int), cudaMemcpyHostToDevice);
cudaEventRecord(stop, 0);
cudaEventSynchronize(start);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time_from_host_to_dev, start, stop);
printf("Copy host data to device, time used: %0.5g seconds\n", time_from_host_to_dev / 1000);
// ----------------------------------------------
float time_of_kernel;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
int blocksize = 512; //block 可改为 128,256,512,1024
int blocknum = ceil(arraysize / double(blocksize));
dim3 dimBlock(blocksize, 1, 1);
dim3 dimGrid(blocknum, 1, 1);
add_in_parallel << <dimGrid, dimBlock >> >(a_dev, b_dev, c_dev);
cudaEventRecord(stop, 0);
cudaEventSynchronize(start);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time_of_kernel, start, stop);
printf("Add in parallel, time used: %0.5g seconds\n", time_of_kernel / 1000);
// ----------------------------------------------
float time_from_dev_to_host;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
cudaMemcpy(devresult_host, c_dev, arraysize * sizeof(int), cudaMemcpyDeviceToHost);
cudaEventRecord(stop, 0);
cudaEventSynchronize(start);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time_from_dev_to_host, start, stop);
printf("Copy dev data to host, time used: %0.5g seconds\n", time_from_dev_to_host / 1000);
// -------------------------------------------------
printf("Verify result...\n");
int status = 0;
clock_t start_cpu, end_cpu;
float time_cpu;
start_cpu = clock();
for (int i = 0; i < arraysize; i++)
{
c_host[i] = a_host[i] + b_host[i];
}
end_cpu = clock();
time_cpu = (double)(end_cpu - start_cpu) / CLOCKS_PER_SEC;
for (int i = 0; i < arraysize; i++)
{
if (c_host[i] != devresult_host[i])
{
status = 1;
}
}
if (status)
{
printf("Failed vervified.\n");
}
else
{
printf("Sucessdully verified.\n");
}
// ----------------------------------------------
printf("Free dev memory\n");
cudaFree(a_dev);
cudaFree(b_dev);
cudaFree(c_dev);
// ----------------------------------------
printf("Free host memory\n");
free(a_host);
free(b_host);
free(c_host);
// ----------------------------------------
printf("\nPerformance: CPU vs. GPU\n");
printf("time cpu:%f\n", time_cpu);
printf("time gpu(kernel):%f\n", time_of_kernel / 1000);
getchar();
return 1;
}
测试结果如下(每次运行误差是真的大,害):

网友评论