美文网首页
Cuda编程---工程创建&程序测试(ab两数组)

Cuda编程---工程创建&程序测试(ab两数组)

作者: 小白兔555 | 来源:发表于2020-10-24 11:33 被阅读0次

2020-10-23
运行环境:VS2015 + CUDA 8.0

一.创建工程

1. 打开VS2015,选择菜单栏中的 “文件” --> "新建" --> "项目"
concern 关心,担忧,涉及
2. 选择 “NVIDIA” --> "CUDA 8.0" ,点击 ”确定“
evaluate 评估,评价
3.工程创建成功。
4.在 kernel.cu 的 main() 函数中、return 语句之前添加 getchar()

(使程序停住)


compromise 妥协,危害,破坏
5.保存并编译运行程序,结果图如下:

substitute 代替品,替补,代替,取代

二.线程的定义与描述

1.一维线程的定义与描述
sample 样品,取样
代码如下:
// Element-wise add: each thread writes c[k] = a[k] + b[k], where k is the
// thread's x index inside its block. Only threadIdx.x is used, so the kernel
// addresses at most one block's worth of elements.
__global__ void addKernel(int *c, const int *a, const int *b)
{
    const int k = threadIdx.x;
    const int sum = a[k] + b[k];
    c[k] = sum;
}
brand 铭记,加污名于...上,品牌
代码如下:
    // Launch a kernel on the GPU with one thread for each element:
    // a single block (grid size 1) containing `size` threads.
    addKernel <<<1, size >>>(dev_c, dev_a, dev_b);
2.二维线程的定义与描述(此处“二维”指“线程块 × 块内线程”两级索引,数据下标仍是一维)
confirm 确定,证实
代码如下:
    // Element-wise add for a multi-block launch: c[i] = a[i] + b[i], where i is
    // the thread's global x index (block offset + index inside the block).
    // NOTE(review): there is no bounds guard, so every launched thread touches
    // memory. If gridDim.x * blockDim.x exceeds the array length, the tail
    // threads read and write out of bounds; a guard needs the length passed in.
    __global__ void addKernel(int *c, const int *a, const int *b)
{
    // threadIdx is a 3-component vector; the .x component is the 1-D index.
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    c[i] = a[i] + b[i];
}
enhance 提高,增强
代码如下:(#define dataSize 100000)
    // Multi-block launch (the article calls this "two-dimensional").
    int blockSize = 512;
    // Round up so that blockNum * blockSize covers all dataSize elements.
    int blockNum = (dataSize + blockSize - 1) / blockSize;

    dim3 dimBlock(blockSize, 1, 1);
    dim3 dimGrid(blockNum, 1, 1);
    addKernel<<<dimGrid, dimBlock>>>(dev_c, dev_a, dev_b);

三.程序测试:

1.a和b两数组对加10万次
symbol 象征,符号

代码如下:(添加头文件 #include "time.h")

//add
    // Time 100000 launches of addKernel (c = a + b).
    // Must run after dev_a, dev_b and dev_c have been allocated and the inputs
    // copied to the device. The buffers are reused on every iteration, so no
    // device memory is allocated (or leaked) inside the timed loop.
    // Declared without initializers so that `goto Error` never jumps over an
    // initialization.
    clock_t startTime;
    clock_t endTime;

    startTime = clock();
    for (int i = 0; i < 100000; i++) {
        addKernel<<<1, size>>>(dev_c, dev_a, dev_b);
        // A launch reports configuration errors only through cudaGetLastError().
        cudaStatus = cudaGetLastError();
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
            goto Error;
        }
    }
    // Launches are asynchronous: wait for all of them before stopping the clock.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }
    endTime = clock();
    // clock() counts in units of 1/CLOCKS_PER_SEC seconds.
    printf("Add Time = %0.1f s\n", (endTime - startTime) / (float)CLOCKS_PER_SEC);
2.a数组中的数据拷贝10万次
symptom 症状
代码如下:
       //copy
    // Time 100000 host-to-device copies of the input vector `a`.
    // Declared without initializers so that `goto Error` never jumps over an
    // initialization.
    clock_t startTime;
    clock_t endTime;

    startTime = clock();
    for (int i = 0; i < 100000; i++) {
        // Copy input vector from host memory to the GPU buffer.
        cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaMemcpy failed!");
            goto Error;
        }
    }
    endTime = clock();
    // clock() counts in units of 1/CLOCKS_PER_SEC seconds.
    printf("Copy Time = %0.1f s\n", (endTime - startTime) / (float)CLOCKS_PER_SEC);
3.SM的资源

在main ()函数中添加如下代码,查看本机SM的资源。


pressure 压力,强迫,迫使

测试结果如下:


consolidate 巩固,加强,联合
4.a和b数组内10万个数对加(代码中 a[i] = b[i] = i,并非随机数),分别在CPU和GPU中计算

代码如下:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <stdlib.h>
#include "time.h"
#include <math.h>

// Abort with file/line and the CUDA error string when a runtime call fails.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                                \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)

/*
 * Element-wise vector add: array_c[i] = array_a[i] + array_b[i] for i in [0, n).
 *
 * Each thread handles the element at its global x index. Threads whose index
 * is >= n do nothing, so the grid may be rounded up past n.
 */
__global__ void add_in_parallel(const int *array_a, const int *array_b, int *array_c, int n)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < n)
    {
        array_c[tid] = array_a[tid] + array_b[tid];
    }
}


/*
 * Adds two 100000-element int arrays on the GPU and on the CPU, times the
 * host->device copy, the kernel, the device->host copy and the CPU loop,
 * and checks that both results match.
 *
 * Returns EXIT_SUCCESS when the GPU result matches the CPU result and
 * EXIT_FAILURE otherwise (or if a host allocation fails).
 */
int main()
{
    // --------------------------------------------
    printf("Begin...\n");
    const int arraysize = 100000;
    const size_t bytes = arraysize * sizeof(int);

    int *a_host = (int *)malloc(bytes);
    int *b_host = (int *)malloc(bytes);
    int *c_host = (int *)malloc(bytes);          // CPU reference result
    int *devresult_host = (int *)malloc(bytes);  // GPU result copied back

    if (a_host == NULL || b_host == NULL || c_host == NULL || devresult_host == NULL)
    {
        fprintf(stderr, "Host memory allocation failed\n");
        free(a_host);
        free(b_host);
        free(c_host);
        free(devresult_host);
        return EXIT_FAILURE;
    }

    for (int i = 0; i < arraysize; i++)
    {
        a_host[i] = i;
        b_host[i] = i;
    }

    // ---------------------------------------------
    printf("Allocating device memory...\n");
    int *a_dev = NULL;
    int *b_dev = NULL;
    int *c_dev = NULL;

    CUDA_CHECK(cudaMalloc((void**)&a_dev, bytes));
    CUDA_CHECK(cudaMalloc((void**)&b_dev, bytes));
    CUDA_CHECK(cudaMalloc((void**)&c_dev, bytes));

    // One pair of events is created here, reused for all three measurements
    // and destroyed at the end.
    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    // ----------------------------------------------
    float time_from_host_to_dev;
    CUDA_CHECK(cudaEventRecord(start, 0));
    CUDA_CHECK(cudaMemcpy(a_dev, a_host, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(b_dev, b_host, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaEventRecord(stop, 0));
    CUDA_CHECK(cudaEventSynchronize(stop));
    CUDA_CHECK(cudaEventElapsedTime(&time_from_host_to_dev, start, stop));
    printf("Copy host data to device, time used: %0.5g seconds\n", time_from_host_to_dev / 1000);

    // ----------------------------------------------
    float time_of_kernel;
    int blocksize = 512;   // threads per block; 128, 256, 512 or 1024 also work
    // Round up so the grid covers every element; the kernel guards the tail.
    int blocknum = (arraysize + blocksize - 1) / blocksize;

    dim3 dimBlock(blocksize, 1, 1);
    dim3 dimGrid(blocknum, 1, 1);

    CUDA_CHECK(cudaEventRecord(start, 0));
    add_in_parallel<<<dimGrid, dimBlock>>>(a_dev, b_dev, c_dev, arraysize);
    // A launch reports configuration errors only through cudaGetLastError().
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaEventRecord(stop, 0));
    CUDA_CHECK(cudaEventSynchronize(stop));
    CUDA_CHECK(cudaEventElapsedTime(&time_of_kernel, start, stop));
    printf("Add in parallel, time used: %0.5g seconds\n", time_of_kernel / 1000);


    // ----------------------------------------------
    float time_from_dev_to_host;
    CUDA_CHECK(cudaEventRecord(start, 0));
    CUDA_CHECK(cudaMemcpy(devresult_host, c_dev, bytes, cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaEventRecord(stop, 0));
    CUDA_CHECK(cudaEventSynchronize(stop));
    CUDA_CHECK(cudaEventElapsedTime(&time_from_dev_to_host, start, stop));
    printf("Copy dev data to host, time used: %0.5g seconds\n", time_from_dev_to_host / 1000);

    // -------------------------------------------------
    printf("Verify result...\n");
    int status = 0;
    clock_t start_cpu, end_cpu;
    float time_cpu;
    start_cpu = clock();
    for (int i = 0; i < arraysize; i++)
    {
        c_host[i] = a_host[i] + b_host[i];
    }
    end_cpu = clock();
    time_cpu = (float)((double)(end_cpu - start_cpu) / CLOCKS_PER_SEC);

    for (int i = 0; i < arraysize; i++)
    {
        if (c_host[i] != devresult_host[i])
        {
            status = 1;
            break;  // one mismatch is enough to fail verification
        }
    }

    if (status)
    {
        printf("Failed vervified.\n");
    }
    else
    {
        printf("Sucessdully verified.\n");
    }

    // ----------------------------------------------
    printf("Free dev memory\n");
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
    CUDA_CHECK(cudaFree(a_dev));
    CUDA_CHECK(cudaFree(b_dev));
    CUDA_CHECK(cudaFree(c_dev));

    // ----------------------------------------
    printf("Free host memory\n");
    free(a_host);
    free(b_host);
    free(c_host);
    free(devresult_host);

    // ----------------------------------------
    printf("\nPerformance: CPU vs. GPU\n");
    printf("time cpu:%f\n", time_cpu);
    printf("time gpu(kernel):%f\n", time_of_kernel / 1000);
    getchar();  // keep the console window open until a key is pressed

    return status == 0 ? EXIT_SUCCESS : EXIT_FAILURE;
}

测试结果如下(每次运行误差是真的大,害):


conspicuous 显而易见的,显著的

相关文章

网友评论

      本文标题:Cuda编程---工程创建&程序测试(ab两数组)

      本文链接:https://www.haomeiwen.com/subject/lygmmktx.html