Cuda编程---工程创建&程序测试（ab两数组）

作者: 小白兔555 | 来源:发表于2020-10-24 11:33 被阅读0次

Cuda编程---工程创建&程序测试（ab两数组）
Maven常用命令
2020-06-02
小程序初学-小程序框架及其需要注意的点
A/B测试知识点总结
web进阶之二十一：数组及其操作方法
3、Groovy对文件读与写操作
单元测试(二)
apache压力测试
“CUDA driver version is insuffic

2020-10-23
运行环境：VS2015 + CUDA 8.0

一.创建工程

1. 打开VS2015，选择菜单栏中的 “文件” --> "新建" --> "项目"

concern 关心，担忧，涉及

2. 选择 “NVIDIA” --> "CUDA 8.0" ，点击 ”确定“

evaluate 评估，评价

3.工程创建成功。

4.在 kernel.cu 中， main() 函数中添加 getchar()

（使程序停住）

compromise 妥协，危害，破坏

5.保存并编译运行程序，结果图如下：

substitute 代替品，替补，代替，取代

二.线程的定义与描述

1.一维线程的定义与描述

sample 样品，取样
代码如下：

__global__ void addKernel(int *c, const int *a, const int *b)
{
    int i = threadIdx.x;
    c[i] = a[i] + b[i];
}

brand 铭记，加污名于...上，品牌
代码如下：

    // Launch a kernel on the GPU with one thread for each element.
    addKernel <<<1, size >>>(dev_c, dev_a, dev_b);

2.二维线程的定义与描述

confirm 确定，证实
代码如下：

    __global__ void addKernel(int *c, const int *a, const int *b)
{
    int i = threadIdx + blockIdx.x*blockDim.x;
    c[i] = a[i] + b[i];
}

enhance 提高，增强
代码如下：（#define dataSize 100000）

    //二维
    int blockSize = 512;
    int blockNum = ceil(dataSize / double(blockSize));//ceil函数作用是把一个小数向上取整

    dim3 dimBlock(blockSize,1,1);
    dim3 dimGrid(blockNum, 1, 1);
    addKernel <<<dimGrid, dimBlock >>>(dev_c, dev_a, dev_b);

三.程序测试：

1.a和b两数组对加10万次

symbol 象征，符号

代码如下：（添加头文件 #include "time.h"）

//add
    clock_t startTime = clock();
    for (int i = 0; i < 100000; i++) {
        cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaMalloc failed!");
            goto Error;
        }
    }
    clock_t endTime = clock();
    printf("Add Time = %0.1f s\n", (endTime - startTime) / 1000.0f);

2.a数组中的数据拷贝10万次

symptom 症状
代码如下：

       //copy 
    // Copy input vectors from host memory to GPU buffers.

    clock_t startTime = clock();
    for (int i = 0; i < 100000; i++) {
        cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaMemcpy failed!");
            goto Error;
        }
    }
    clock_t endTime = clock();
    printf("Copy Time = %0.1f s\n", (endTime - startTime) / 1000.0f);

3.SM的资源

在main ()函数中添加如下代码，查看本机SM的资源。

pressure 压力，强迫，迫使

测试结果如下：

consolidate 巩固，加强，联合

4.a和b数组内10万个随机数对加，分别在CPU和GPU中计算

代码如下：

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <stdlib.h>
#include "time.h"
#include <math.h>

__global__ void add_in_parallel(int *array_a, int *array_b, int *array_c)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    array_c[tid] = array_a[tid] + array_b[tid];
}


int main()
{
    // --------------------------------------------
    printf("Begin...\n");
    int arraysize = 100000;
    int *a_host;
    int *b_host;
    int *c_host;
    int *devresult_host;

    a_host = (int *)malloc(arraysize * sizeof(int));
    b_host = (int *)malloc(arraysize * sizeof(int));
    c_host = (int *)malloc(arraysize * sizeof(int));
    devresult_host = (int *)malloc(arraysize * sizeof(int));

    for (int i = 0; i < arraysize; i++)
    {
        a_host[i] = i;
        b_host[i] = i;
    }

    // ---------------------------------------------
    printf("Allocating device memory...\n");
    int *a_dev;
    int *b_dev;
    int *c_dev;

    cudaMalloc((void**)&a_dev, arraysize * sizeof(int));
    cudaMalloc((void**)&b_dev, arraysize * sizeof(int));
    cudaMalloc((void**)&c_dev, arraysize * sizeof(int));

    // ----------------------------------------------
    cudaEvent_t start, stop;
    float time_from_host_to_dev;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);
    cudaMemcpy(a_dev, a_host, arraysize * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(b_dev, b_host, arraysize * sizeof(int), cudaMemcpyHostToDevice);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(start);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time_from_host_to_dev, start, stop);
    printf("Copy host data to device, time used: %0.5g seconds\n", time_from_host_to_dev / 1000);

    // ----------------------------------------------
    float time_of_kernel;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);
    int blocksize = 512;   //block 可改为 128，256,512,1024
    int blocknum = ceil(arraysize / double(blocksize));

    dim3 dimBlock(blocksize, 1, 1);
    dim3 dimGrid(blocknum, 1, 1);

    add_in_parallel << <dimGrid, dimBlock >> >(a_dev, b_dev, c_dev);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(start);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time_of_kernel, start, stop);
    printf("Add in parallel, time used: %0.5g seconds\n", time_of_kernel / 1000);


    // ----------------------------------------------
    float time_from_dev_to_host;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);
    cudaMemcpy(devresult_host, c_dev, arraysize * sizeof(int), cudaMemcpyDeviceToHost);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(start);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time_from_dev_to_host, start, stop);
    printf("Copy dev data to host, time used: %0.5g seconds\n", time_from_dev_to_host / 1000);

    // -------------------------------------------------
    printf("Verify result...\n");
    int status = 0;
    clock_t start_cpu, end_cpu;
    float time_cpu;
    start_cpu = clock();
    for (int i = 0; i < arraysize; i++)
    {
        c_host[i] = a_host[i] + b_host[i];
    }
    end_cpu = clock();
    time_cpu = (double)(end_cpu - start_cpu) / CLOCKS_PER_SEC;

    for (int i = 0; i < arraysize; i++)
    {
        if (c_host[i] != devresult_host[i])
        {
            status = 1;
        }
    }

    if (status)
    {
        printf("Failed vervified.\n");
    }
    else
    {
        printf("Sucessdully verified.\n");
    }

    // ----------------------------------------------
    printf("Free dev memory\n");
    cudaFree(a_dev);
    cudaFree(b_dev);
    cudaFree(c_dev);

    // ----------------------------------------
    printf("Free host memory\n");
    free(a_host);
    free(b_host);
    free(c_host);

    // ----------------------------------------
    printf("\nPerformance: CPU vs. GPU\n");
    printf("time cpu:%f\n", time_cpu);
    printf("time gpu(kernel):%f\n", time_of_kernel / 1000);
    getchar();

    return 1;
}

测试结果如下（每次运行误差是真的大，害）：

conspicuous 显而易见的，显著的

网友评论

本文标题：Cuda编程---工程创建&程序测试（ab两数组）

本文链接：https://www.haomeiwen.com/subject/lygmmktx.html

延伸阅读

深度阅读

您也可以注册成为美文阅读网的作者，发表您的原创作品、分享您的心情！