CUDA02_05GPU计时与性能评估

作者: 杨强AT南京 | 来源:发表于2020-01-20 08:09 被阅读0次

CUDA02_05GPU计时与性能评估
Jmeter组件
Jmeter安装及使用
[JPT_02]性能测试-性能指标的分析 & 定义
评估方法与性能度量
无标题文章
JVM（十一）内存与垃圾回收|垃圾回收器
我是如何为Serverless配置内存和超时时间的
2019-05-17资产评估概述01
模型性能评估

该是分析GPU性能的时候了，GPU性能与如下因素有关：
1. GPU硬件架构；
2. PCI吞吐量；
3. 流处理器；
4. GPU内存；
本主题说明GPU的事件计时方式，并解释了Grid，Block，Thread对性能的影响。

时间与事件记录

函数说明

创建/释放一个事件cudaEventCreate/cudaEventDestroy
- __host__ cudaError_t cudaEventCreate ( cudaEvent_t* event )
- __host__ __device__ cudaError_t cudaEventDestroy ( cudaEvent_t event )
开始一个事件cudaEventRecord
- __host__ __device__ cudaError_t cudaEventRecord ( cudaEvent_t event, cudaStream_t stream = 0 )
  - 其中 cudaStream_t stream 默认使用缺省stream = 0。
  - 可以多次调用，但会覆盖上次的调用记录的状态。
等待事件结束cudaEventSynchronize
- __host__ cudaError_t cudaEventSynchronize ( cudaEvent_t event )
计算两个事件之间消耗的时间cudaEventElapsedTime
- __host__ cudaError_t cudaEventElapsedTime ( float* ms, cudaEvent_t start, cudaEvent_t end )
事件的其他使用
- cudaEventCreateWithFlags()与cudaEventQuery() + cudaStreamWaitEvent().
  - 使用cudaEventDisableTiming标记，实现最好的事件性能处理。
- cudaEventCreateWithFlags()与cudaIpcGetEventHandle()
  - 使用cudaEventInterprocess + cudaEventDisableTiming结合，实现进程间事件

使用模式

调用模式：
1. cudaEventCreate : 创建；
2. cudaEventRecord : 记录开始状态start；
3. cudaEventRecord : 记录结束状态stop；
4. cudaEventSynchronize：等待两个记录完成；
5. cudaEventElapsedTime：计算两个状态之间的耗时；
6. cudaEventDestroy：释放事件；

使用例子

使用图像的像素颜色分量通道交换作为例子，来记录几个核心步骤的计算耗时。

核心代码

   // 1. Declare the five event handles that mark the boundaries of the timed stages.
    cudaEvent_t t1,t2,t3,t4,t5;

    // 2. Create the events.
    cudaEventCreate (&t1);
    cudaEventCreate (&t2);
    cudaEventCreate (&t3);
    cudaEventCreate (&t4);
    cudaEventCreate (&t5);

    // 3. Record one event before/after each stage (all on stream 0).
    //    NOTE(review): t1..t2 brackets read_bmp(), which is host-only work; presumably a
    //    GPU-side event pair does not measure that reliably — confirm with a host timer.
    cudaEventRecord (t1, 0);   // Only C++ has the default stream argument; C must pass 0 explicitly.
    read_bmp();
    cudaEventRecord (t2, 0); 
    move_to_device();
    cudaEventRecord (t3, 0); 
    //////////////////////////////// GPU processing call
    // For simplicity threads are laid out by image rows and columns: rows define the blocks,
    // columns define the threads; each thread handles 4 chars and swaps the pixel's channels.
    dim3 grid(header.height * 2);    // Number of blocks: twice the image height.
    dim3 block(header.width / 2);    // Threads per block: half the width (integer division). Author's note: a block holds at most 1024 threads.
    shift_color_channels<<<grid, block>>>(img_gpu);
    cudaEventRecord (t4, 0); 
    ////////////////////////////////
    move_to_host();
    cudaEventRecord (t5, 0); 
    // 4. Wait until every recorded event has completed.
    cudaEventSynchronize(t1);
    cudaEventSynchronize(t2);
    cudaEventSynchronize(t3);
    cudaEventSynchronize(t4);
    cudaEventSynchronize(t5);

    // 5. Compute the elapsed time (milliseconds) between event pairs.
    float t_total, t_readbmp, t_move2gpu, t_kernel, t_move2host;
    cudaEventElapsedTime(&t_total, t1, t5);

    cudaEventElapsedTime(&t_readbmp, t1, t2);
    cudaEventElapsedTime(&t_move2gpu, t2, t3);
    cudaEventElapsedTime(&t_kernel, t3, t4);
    cudaEventElapsedTime(&t_move2host, t4, t5);
    printf("总耗时:%f\n", t_total);
    printf("\t|-读取:%f\n", t_readbmp);
    printf("\t|-拷贝到Device:%f\n", t_move2gpu);
    printf("\t|-计算:%f\n", t_kernel);
    printf("\t|-拷贝到Host:%f\n", t_move2host);

    // Before saving the image, wait for all device work to finish.
    cudaDeviceSynchronize();

完整代码


#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>
// 结构体定义
// Byte-pack the struct so its in-memory layout mirrors the 54 header bytes that
// read_bmp()/save_bmp() transfer with a single fread/fwrite. push/pop limits the
// packing to this struct instead of leaking it into every later declaration.
#pragma pack(push, 1)
struct img_header{
    // ---- file header (14 bytes) ----
    char                  magic[2];                  // signature bytes
    unsigned int          file_size;                 // file size
    unsigned char         reserve1[4];               // 4 skipped bytes
    unsigned int          data_off;                  // offset where the pixel data starts
    // ---- info header (40 bytes) ----
    unsigned char         reserve2[4];               // 4 skipped bytes
    int                   width;                     // image width in pixels
    int                   height;                    // image height in pixels (may be negative in the file)
    unsigned char         reserve3[2];               // 2 skipped bytes
    unsigned short int    bit_count;                 // bits per pixel: 1, 4, 8, 16, 24 or 32
    unsigned char         reserve4[24];              // 24 skipped bytes
};
#pragma pack(pop)

// The I/O code hard-codes 54 bytes; fail the build if the layout ever drifts.
static_assert(sizeof(struct img_header) == 54, "img_header must be exactly 54 bytes");


// Shared program state is kept in file-scope globals so that the helper
// functions below need no parameters.

// Input / output file names.
const char *in_filename  = "gpu.bmp";
const char *out_filename = "gpu_out.bmp";

// Image state.
img_header  header;        // header as read from the input file
uchar4     *img;           // host pixel buffer (uchar4 is a CUDA vector type)
uchar4     *img_gpu;       // device pixel buffer

// Host-side helpers (all operate on the globals above).
void read_bmp();           // load header + pixels from in_filename
void move_to_device();     // allocate img_gpu and upload img
void move_to_host();       // download the device result into img
void save_bmp();           // write header + pixels to out_filename
void free_mem();           // release host and device buffers

// Device kernel that processes the pixels in place.
__global__ void shift_color_channels(uchar4 *data);

// Print the failing call together with the CUDA error string, then terminate.
static void check_cuda_call(cudaError_t status, const char *what){
    if(status != cudaSuccess){
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Runs the channel-swap pipeline (read -> upload -> kernel -> download -> save)
 * and prints the time between the CUDA events recorded around each stage.
 *
 * Every CUDA runtime call is checked; a failed kernel launch (for example a
 * block size the device rejects) is reported instead of silently producing
 * an unmodified image.
 *
 * Returns 0 on success; terminates with EXIT_FAILURE on any CUDA error.
 */
int main(int argc, const char **argv){
    // 1. Create the five events that delimit the four timed stages.
    cudaEvent_t t1, t2, t3, t4, t5;
    check_cuda_call(cudaEventCreate(&t1), "cudaEventCreate(t1)");
    check_cuda_call(cudaEventCreate(&t2), "cudaEventCreate(t2)");
    check_cuda_call(cudaEventCreate(&t3), "cudaEventCreate(t3)");
    check_cuda_call(cudaEventCreate(&t4), "cudaEventCreate(t4)");
    check_cuda_call(cudaEventCreate(&t5), "cudaEventCreate(t5)");

    // 2. Record an event on stream 0 before/after each stage.
    //    NOTE(review): read_bmp() is host-only work, so the t1..t2 interval is
    //    presumably not a reliable measure of file-read time — confirm with a host timer.
    check_cuda_call(cudaEventRecord(t1, 0), "cudaEventRecord(t1)");
    read_bmp();
    check_cuda_call(cudaEventRecord(t2, 0), "cudaEventRecord(t2)");
    move_to_device();
    check_cuda_call(cudaEventRecord(t3, 0), "cudaEventRecord(t3)");

    // 3. Launch (height * 2) blocks of (width / 2) threads. The kernel has no
    //    bounds guard, so this product is what decides which pixels are touched.
    dim3 grid(header.height * 2);
    dim3 block(header.width / 2);
    shift_color_channels<<<grid, block>>>(img_gpu);
    // A launch does not return an error itself; fetch it explicitly.
    check_cuda_call(cudaGetLastError(), "shift_color_channels launch");
    check_cuda_call(cudaEventRecord(t4, 0), "cudaEventRecord(t4)");

    move_to_host();
    check_cuda_call(cudaEventRecord(t5, 0), "cudaEventRecord(t5)");

    // 4. All events were recorded in order on stream 0, so once the last one
    //    has completed the earlier ones have completed as well.
    check_cuda_call(cudaEventSynchronize(t5), "cudaEventSynchronize(t5)");

    // 5. Elapsed times in milliseconds.
    float t_total, t_readbmp, t_move2gpu, t_kernel, t_move2host;
    check_cuda_call(cudaEventElapsedTime(&t_total,     t1, t5), "cudaEventElapsedTime(total)");
    check_cuda_call(cudaEventElapsedTime(&t_readbmp,   t1, t2), "cudaEventElapsedTime(read)");
    check_cuda_call(cudaEventElapsedTime(&t_move2gpu,  t2, t3), "cudaEventElapsedTime(to device)");
    check_cuda_call(cudaEventElapsedTime(&t_kernel,    t3, t4), "cudaEventElapsedTime(kernel)");
    check_cuda_call(cudaEventElapsedTime(&t_move2host, t4, t5), "cudaEventElapsedTime(to host)");
    printf("总耗时:%f\n", t_total);
    printf("\t|-读取:%f\n", t_readbmp);
    printf("\t|-拷贝到Device:%f\n", t_move2gpu);
    printf("\t|-计算:%f\n", t_kernel);
    printf("\t|-拷贝到Host:%f\n", t_move2host);

    // 6. Make sure the device is idle (this also surfaces any asynchronous
    //    kernel fault) before the result is written and buffers are freed.
    check_cuda_call(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
    save_bmp();
    free_mem();

    check_cuda_call(cudaEventDestroy(t1), "cudaEventDestroy(t1)");
    check_cuda_call(cudaEventDestroy(t2), "cudaEventDestroy(t2)");
    check_cuda_call(cudaEventDestroy(t3), "cudaEventDestroy(t3)");
    check_cuda_call(cudaEventDestroy(t4), "cudaEventDestroy(t4)");
    check_cuda_call(cudaEventDestroy(t5), "cudaEventDestroy(t5)");

    return 0;
}

/*
 * Rotates the first three components of one uchar4 element per thread:
 *   new x = old z, new y = old x, new z = old y; the w component is untouched.
 *
 * Launch layout: 1-D grid of 1-D blocks. There is no bounds guard, so
 * gridDim.x * blockDim.x must not exceed the number of elements in `data`.
 */
__global__ void shift_color_channels(uchar4 *data){
    // Flat global index of the element owned by this thread.
    const int pixel = threadIdx.x + blockDim.x * blockIdx.x;
    uchar4 *px = &data[pixel];

    // Read all three components before overwriting any of them.
    const unsigned char old_x = px->x;
    const unsigned char old_y = px->y;
    const unsigned char old_z = px->z;

    px->x = old_z;
    px->y = old_x;
    px->z = old_y;
}
/*
 * Copies the processed pixels from the device buffer img_gpu back into the
 * host buffer img. Terminates the program if the copy fails.
 */
void move_to_host(){
    // Multiply in size_t so height * width cannot overflow int for large images.
    const size_t n_bytes = (size_t)header.height * (size_t)header.width * sizeof(uchar4);
    const cudaError_t status = cudaMemcpy((void*)img, (void*)img_gpu, n_bytes, cudaMemcpyDeviceToHost);
    if(status != cudaSuccess){
        fprintf(stderr, "cudaMemcpy (device to host) failed: %s\n", cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}
/*
 * Allocates the device buffer img_gpu and uploads the host pixels from img.
 * Terminates the program if the allocation or the copy fails.
 */
void move_to_device(){
    // Multiply in size_t so height * width cannot overflow int for large images.
    const size_t n_bytes = (size_t)header.height * (size_t)header.width * sizeof(uchar4);

    // cudaMalloc returns the pointer through its first argument, hence the pointer-to-pointer.
    cudaError_t status = cudaMalloc((void**)&img_gpu, n_bytes);
    if(status != cudaSuccess){
        fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }

    status = cudaMemcpy((void*)img_gpu, (void*)img, n_bytes, cudaMemcpyHostToDevice);
    if(status != cudaSuccess){
        fprintf(stderr, "cudaMemcpy (host to device) failed: %s\n", cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}
/*
 * Loads in_filename into the globals: the first 54 bytes go into `header`,
 * then height * width uchar4 elements are read into a freshly malloc'ed `img`.
 * A negative stored height is replaced by its absolute value.
 *
 * The pixel data is read as 4 bytes per pixel directly after the header, so
 * the input is assumed to be a 32-bit image whose pixels start at byte 54.
 * Terminates the program if the file cannot be opened, is too short, has
 * non-positive dimensions, or the pixel buffer cannot be allocated.
 */
void read_bmp(){
    FILE *file = fopen(in_filename, "rb");
    if(file == NULL){
        perror(in_filename);
        exit(EXIT_FAILURE);
    }

    // Header: exactly 54 bytes.
    if(fread(&header, 1, 54, file) != 54){
        fprintf(stderr, "%s: file is too short to contain the 54-byte header\n", in_filename);
        fclose(file);
        exit(EXIT_FAILURE);
    }

    // Work with a non-negative height from here on.
    header.height = header.height >= 0 ? header.height : -header.height;
    if(header.width <= 0 || header.height <= 0){
        fprintf(stderr, "%s: invalid image size %d x %d\n", in_filename, header.width, header.height);
        fclose(file);
        exit(EXIT_FAILURE);
    }

    // Multiply in size_t so height * width cannot overflow int.
    const size_t n_pixels = (size_t)header.height * (size_t)header.width;
    img = (uchar4 *)malloc(n_pixels * sizeof(uchar4));
    if(img == NULL){
        fprintf(stderr, "%s: cannot allocate %zu bytes for the pixel buffer\n",
                in_filename, n_pixels * sizeof(uchar4));
        fclose(file);
        exit(EXIT_FAILURE);
    }

    // A short read would leave part of img uninitialised, so treat it as an error.
    if(fread(img, sizeof(uchar4), n_pixels, file) != n_pixels){
        fprintf(stderr, "%s: pixel data is shorter than width * height * 4 bytes\n", in_filename);
        fclose(file);
        exit(EXIT_FAILURE);
    }

    fclose(file);
}
/*
 * Writes the image to out_filename: the 54-byte `header` (with its height
 * negated for the duration of the write) followed by height * width uchar4
 * elements from `img`. Terminates the program if the file cannot be opened
 * or is not written completely.
 */
void save_bmp(){
    FILE *file = fopen(out_filename, "wb");
    if(file == NULL){
        perror(out_filename);
        exit(EXIT_FAILURE);
    }

    // The header goes out with a negated height; the in-memory value is
    // restored immediately so later code still sees the non-negative height.
    header.height = -header.height;
    const size_t n_header = fwrite(&header, 1, 54, file);
    header.height = -header.height;

    // Pixel data, counted in size_t so height * width cannot overflow int.
    const size_t n_pixels  = (size_t)header.height * (size_t)header.width;
    const size_t n_written = fwrite(img, sizeof(uchar4), n_pixels, file);

    // fclose flushes buffered data, so its result matters for a written file.
    const int close_status = fclose(file);
    if(n_header != 54 || n_written != n_pixels || close_status != 0){
        fprintf(stderr, "%s: failed to write the complete image\n", out_filename);
        exit(EXIT_FAILURE);
    }
}
/*
 * Releases the pixel buffers: the device allocation img_gpu via cudaFree and
 * the host allocation img via free.
 */
void free_mem(){
    // Device buffer.
    cudaFree(img_gpu);
    // Host buffer (free needs no size; the allocator tracks it).
    free(img);
}

// nvcc -o main.exe  -Xcompiler /source-charset:utf-8 c04_gpu_app.cu

运行结果
- GPU不参与图像的读取；读取是纯Host端操作，而CUDA事件记录的是GPU流上的时间戳，所以这里“读取”一项接近0并不代表读文件的真实耗时（Host端操作应使用Host端计时器测量）。
- 如果还记得我们前面CPU并发的计算结果，这个结果是非常令人欣喜的。
- 其中耗时的就在于GPU与CPU之间的内存捣腾耗时。

C:\01works\02cuda\c05_event_timer>main
总耗时:4.196352
        |-读取:0.002048
        |-拷贝到Device:1.976320
        |-计算:0.116032
        |-拷贝到Host:2.101952

线程分配方案的性能比较

为了在时间上体现一定的复杂度，下面的逻辑采用图像旋转的处理。
而且依然与前面一样，一个线程处理一个像素。

GPU上的数学运算

CUDA提供了GPU上数学运算，传统来自标准C/C++的数学函数不能在GPU上运算，大致提供的运算按照数据类型分成几个模块：
- 1. Half Precision Intrinsics
- 1. Mathematical Functions
- 1. Single Precision Mathematical Functions
- 1. Double Precision Mathematical Functions
- 1. Single Precision Intrinsics
- 1. Double Precision Intrinsics
- 1. Integer Intrinsics
- 1. Type Casting Intrinsics
- 1. SIMD Intrinsics

由于数学运算已经非常成熟，并形成使用习惯，所以基本上与C/C++中函数差不多的使用，可以在需要使用的时候查阅文档，这里不再介绍。

用代码比较性能

核心代码的实现
- 下面代码是减少重复计算的代码来优化。

/*
 * Rotates a fixed-size 1920 x 1080 image about its centre by `degree` degrees,
 * scaled down by (shorter side / diagonal), using forward mapping: each thread
 * copies one source pixel to its rotated position in `dst`.
 *
 *   src    - source pixels, 1920 * 1080 elements, row-major
 *   dst    - destination pixels, same size; positions no source pixel maps to
 *            keep whatever value they already hold
 *   degree - rotation angle in degrees
 *
 * Launch layout: 1-D grid of 1-D blocks, one thread per source pixel. Threads
 * whose index lies beyond the image, or whose destination falls outside the
 * image, do nothing instead of touching memory out of bounds.
 */
__global__ void rotate(uchar4 *src,  uchar4 *dst, double degree){
    // Image size is hard-coded rather than passed in.
    const int h = 1080;
    const int w = 1920;

    // Flat index of the source pixel owned by this thread.
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if(idx >= w * h){
        return;                      // surplus thread: no pixel to move
    }

    // Row / column of the source pixel.
    const int y = idx / w;           // integer division
    const int x = idx - y * w;       // remainder

    // Shrink factor: shorter side divided by the diagonal.
    const double d = sqrt((double)(w * w + h * h));
    const double scale = w < h ? w / d : h / d;

    // Angle in radians and its sine / cosine.
    const double arc = 2 * 3.141592 / 360.0 * degree;
    const double f_sin = sin(arc);
    const double f_cos = cos(arc);

    // Image centre.
    const int ox = w / 2;
    const int oy = h / 2;
    // Source coordinates relative to the centre.
    const double fo_x = x - ox;
    const double fo_y = y - oy;
    // Rotate about the centre and scale.
    const double fr_x = ( fo_x * f_cos + fo_y * f_sin) * scale;
    const double fr_y = (-fo_x * f_sin + fo_y * f_cos) * scale;

    // Truncate to integers and shift back to image coordinates.
    const int r_x = (int)fr_x + ox;
    const int r_y = (int)fr_y + oy;
    // The corner pixel lies exactly half a diagonal from the centre, so for
    // some angles the destination can land on row h; never write outside dst.
    if(r_x < 0 || r_x >= w || r_y < 0 || r_y >= h){
        return;
    }
    // Move the pixel to its rotated position (converted to a flat index).
    dst[r_y * w + r_x] = src[idx];
}

调用代码

// Launch configuration: (height * 2) blocks of (width / 2) threads each.
const dim3 n_threads(header.width / 2);
const dim3 n_blocks(header.height * 2);

// The kernel hard-codes the image size, so only the two device buffers and
// the angle (in degrees) are passed.
rotate<<<n_blocks, n_threads>>>(img_gpu, rotate_gpu, 45.0);

45度的旋转效果

GPU旋转处理效果

不同的Grid与Block分配对性能的影响
- 计时代码这里不介绍了，参考附录。

计时项	128	64	32
总耗时:	6.309856	6.560128	6.444032
拷贝到Device:	2.715040	2.953216	2.711584
计算:	1.555168	1.557856	1.556448
拷贝到Host:	2.039648	2.049056	2.176000

其中128，64，32表示每个块的线程数

性能的分析

上面性能的结果差别非常微小，但是多运行几次，会发现一个规律：
- 每块128个线程时的耗时总是最小的。
- Host2Device的时间总是占用最多的（这个当运算更多的时候，就不是问题了，旋转一幅8M大小的图像的时间不到2毫秒，这是很激动的事情）。
原因：
- GPU结构设计中线程基本组织单位是warps（线程组/也称线程束）=32，这是代码执行动用线程的最小单位
- GPU实际组织线程是使用的块，块的线程数是warps的倍数。一般程序员是不用关注warps的，一般关注的是block。
  - warps线程执行的最小单位；
  - block是线程启动的单位；
  - block的线程数一般是32，64，128，256，512，1024。
分析：
- 可以根据块中线程数，计算块启动时核函数拷贝的数据字节，然后根据GPU带宽，观察带宽的合理利用。一般128据说是最合理的利用。大于128，带宽也会得到比较好的利用。这是内存块连续、合适的大小都会导致较好的性能，这一点前面CPU计算中是有常识的。
- 还有一个问题是，当设计的块包含的线程多于需要调用核函数的次数时，我们需要在核函数中判定并跳过不必要的计算。
  - 这样也会浪费多余的线程，所以block的设计是GPU程序员的很重要的能力。
Grid与Block的维数因为是硬件产生，所以对程序的性能不产生任何影响。
- 程序员可以巧妙的把某些参数设计为gridDim与blockDim，而不是通过核函数参数传递。

附录

使用GPU旋转图像的完整代码

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>

// 结构体定义
// Byte-pack the struct so its in-memory layout mirrors the 54 header bytes that
// read_bmp()/save_bmp() transfer with a single fread/fwrite. push/pop limits the
// packing to this struct instead of leaking it into every later declaration.
#pragma pack(push, 1)
struct img_header{
    // ---- file header (14 bytes) ----
    char                  magic[2];                  // signature bytes
    unsigned int          file_size;                 // file size
    unsigned char         reserve1[4];               // 4 skipped bytes
    unsigned int          data_off;                  // offset where the pixel data starts
    // ---- info header (40 bytes) ----
    unsigned char         reserve2[4];               // 4 skipped bytes
    int                   width;                     // image width in pixels
    int                   height;                    // image height in pixels (may be negative in the file)
    unsigned char         reserve3[2];               // 2 skipped bytes
    unsigned short int    bit_count;                 // bits per pixel: 1, 4, 8, 16, 24 or 32
    unsigned char         reserve4[24];              // 24 skipped bytes
};
#pragma pack(pop)

// The I/O code hard-codes 54 bytes; fail the build if the layout ever drifts.
static_assert(sizeof(struct img_header) == 54, "img_header must be exactly 54 bytes");


// Shared program state is kept in file-scope globals so that the helper
// functions below need no parameters.

// Input / output file names.
const char *in_filename  = "gpu.bmp";
const char *out_filename = "gpu_out.bmp";

// Image state.
img_header  header;        // header as read from the input file
uchar4     *img;           // host pixel buffer (uchar4 is a CUDA vector type)
uchar4     *img_gpu;       // device buffer holding the source image
uchar4     *rotate_gpu;    // device buffer receiving the rotated image

// Host-side helpers (all operate on the globals above).
void read_bmp();           // load header + pixels from in_filename
void move_to_device();     // allocate device buffers and upload img
void move_to_host();       // download the device result into img
void save_bmp();           // write header + pixels to out_filename
void free_mem();           // release host and device buffers

// Device kernel that writes the rotated copy of src into dst.
__global__ void rotate(uchar4 *src,  uchar4 *dst, double degree);

// Print the failing call together with the CUDA error string, then terminate.
static void check_cuda_call(cudaError_t status, const char *what){
    if(status != cudaSuccess){
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Rotates the input image by 45 degrees on the GPU and saves the result:
 * read -> upload -> rotate kernel -> download -> save -> free.
 *
 * The kernel launch and its execution are checked, so a rejected launch
 * configuration or an in-kernel fault is reported instead of silently saving
 * an unrotated or blank image.
 *
 * Returns 0 on success; terminates with EXIT_FAILURE on a CUDA error.
 */
int main(int argc, const char **argv){
    read_bmp();
    move_to_device();

    // (height * 2) blocks of (width / 2) threads each.
    dim3 grid(header.height * 2);
    dim3 block(header.width / 2);

    // The kernel hard-codes the image size; only buffers and the angle are passed.
    rotate<<<grid, block>>>(img_gpu, rotate_gpu, 45.0);
    // A launch does not return an error itself; fetch it explicitly.
    check_cuda_call(cudaGetLastError(), "rotate launch");
    // Wait for the kernel so that execution faults are reported here.
    check_cuda_call(cudaDeviceSynchronize(), "rotate execution");

    move_to_host();
    save_bmp();
    free_mem();
    return 0;
}

/*
 * Rotates a fixed-size 1920 x 1080 image about its centre by `degree` degrees,
 * scaled down by (shorter side / diagonal), using forward mapping: each thread
 * copies one source pixel to its rotated position in `dst`.
 *
 *   src    - source pixels, 1920 * 1080 elements, row-major
 *   dst    - destination pixels, same size; positions no source pixel maps to
 *            keep whatever value they already hold
 *   degree - rotation angle in degrees
 *
 * Launch layout: 1-D grid of 1-D blocks, one thread per source pixel. Threads
 * whose index lies beyond the image, or whose destination falls outside the
 * image, do nothing instead of touching memory out of bounds.
 */
__global__ void rotate(uchar4 *src,  uchar4 *dst, double degree){
    // Image size is hard-coded rather than passed in.
    const int h = 1080;
    const int w = 1920;

    // Flat index of the source pixel owned by this thread.
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if(idx >= w * h){
        return;                      // surplus thread: no pixel to move
    }

    // Row / column of the source pixel.
    const int y = idx / w;           // integer division
    const int x = idx - y * w;       // remainder

    // Shrink factor: shorter side divided by the diagonal.
    const double d = sqrt((double)(w * w + h * h));
    const double scale = w < h ? w / d : h / d;

    // Angle in radians and its sine / cosine.
    const double arc = 2 * 3.141592 / 360.0 * degree;
    const double f_sin = sin(arc);
    const double f_cos = cos(arc);

    // Image centre.
    const int ox = w / 2;
    const int oy = h / 2;
    // Source coordinates relative to the centre.
    const double fo_x = x - ox;
    const double fo_y = y - oy;
    // Rotate about the centre and scale.
    const double fr_x = ( fo_x * f_cos + fo_y * f_sin) * scale;
    const double fr_y = (-fo_x * f_sin + fo_y * f_cos) * scale;

    // Truncate to integers and shift back to image coordinates.
    const int r_x = (int)fr_x + ox;
    const int r_y = (int)fr_y + oy;
    // The corner pixel lies exactly half a diagonal from the centre, so for
    // some angles the destination can land on row h; never write outside dst.
    if(r_x < 0 || r_x >= w || r_y < 0 || r_y >= h){
        return;
    }
    // Move the pixel to its rotated position (converted to a flat index).
    dst[r_y * w + r_x] = src[idx];
}

/*
 * Copies the rotated image from the device buffer rotate_gpu into the host
 * buffer img so it can be saved to disk. Terminates the program if the copy fails.
 */
void move_to_host(){
    // Multiply in size_t so height * width cannot overflow int for large images.
    const size_t n_bytes = (size_t)header.height * (size_t)header.width * sizeof(uchar4);
    const cudaError_t status = cudaMemcpy((void*)img, (void*)rotate_gpu, n_bytes, cudaMemcpyDeviceToHost);
    if(status != cudaSuccess){
        fprintf(stderr, "cudaMemcpy (device to host) failed: %s\n", cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}
/*
 * Prepares the device buffers: allocates img_gpu and uploads the host pixels
 * from img, then allocates rotate_gpu and clears it to zero bytes.
 * Terminates the program if any allocation, copy or memset fails.
 */
void move_to_device(){
    // Multiply in size_t so height * width cannot overflow int for large images.
    const size_t n_bytes = (size_t)header.height * (size_t)header.width * sizeof(uchar4);

    // Source image. cudaMalloc returns the pointer through its first argument.
    cudaError_t status = cudaMalloc((void**)&img_gpu, n_bytes);
    if(status != cudaSuccess){
        fprintf(stderr, "cudaMalloc (source image) failed: %s\n", cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
    status = cudaMemcpy((void*)img_gpu, (void*)img, n_bytes, cudaMemcpyHostToDevice);
    if(status != cudaSuccess){
        fprintf(stderr, "cudaMemcpy (host to device) failed: %s\n", cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }

    // Rotated image. Cleared first because the kernel does not write every
    // destination pixel; 0 is the fill byte (another value such as 255 also works).
    status = cudaMalloc((void**)&rotate_gpu, n_bytes);
    if(status != cudaSuccess){
        fprintf(stderr, "cudaMalloc (rotated image) failed: %s\n", cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
    status = cudaMemset((void*)rotate_gpu, 0, n_bytes);
    if(status != cudaSuccess){
        fprintf(stderr, "cudaMemset (rotated image) failed: %s\n", cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}
/*
 * Loads in_filename into the globals: the first 54 bytes go into `header`,
 * then height * width uchar4 elements are read into a freshly malloc'ed `img`.
 * A negative stored height is replaced by its absolute value.
 *
 * The pixel data is read as 4 bytes per pixel directly after the header, so
 * the input is assumed to be a 32-bit image whose pixels start at byte 54.
 * Terminates the program if the file cannot be opened, is too short, has
 * non-positive dimensions, or the pixel buffer cannot be allocated.
 */
void read_bmp(){
    FILE *file = fopen(in_filename, "rb");
    if(file == NULL){
        perror(in_filename);
        exit(EXIT_FAILURE);
    }

    // Header: exactly 54 bytes.
    if(fread(&header, 1, 54, file) != 54){
        fprintf(stderr, "%s: file is too short to contain the 54-byte header\n", in_filename);
        fclose(file);
        exit(EXIT_FAILURE);
    }

    // Work with a non-negative height from here on.
    header.height = header.height >= 0 ? header.height : -header.height;
    if(header.width <= 0 || header.height <= 0){
        fprintf(stderr, "%s: invalid image size %d x %d\n", in_filename, header.width, header.height);
        fclose(file);
        exit(EXIT_FAILURE);
    }

    // Multiply in size_t so height * width cannot overflow int.
    const size_t n_pixels = (size_t)header.height * (size_t)header.width;
    img = (uchar4 *)malloc(n_pixels * sizeof(uchar4));
    if(img == NULL){
        fprintf(stderr, "%s: cannot allocate %zu bytes for the pixel buffer\n",
                in_filename, n_pixels * sizeof(uchar4));
        fclose(file);
        exit(EXIT_FAILURE);
    }

    // A short read would leave part of img uninitialised, so treat it as an error.
    if(fread(img, sizeof(uchar4), n_pixels, file) != n_pixels){
        fprintf(stderr, "%s: pixel data is shorter than width * height * 4 bytes\n", in_filename);
        fclose(file);
        exit(EXIT_FAILURE);
    }

    fclose(file);
}
/*
 * Writes the image to out_filename: the 54-byte `header` (with its height
 * negated for the duration of the write) followed by height * width uchar4
 * elements from `img`. Terminates the program if the file cannot be opened
 * or is not written completely.
 */
void save_bmp(){
    FILE *file = fopen(out_filename, "wb");
    if(file == NULL){
        perror(out_filename);
        exit(EXIT_FAILURE);
    }

    // The header goes out with a negated height; the in-memory value is
    // restored immediately so later code still sees the non-negative height.
    header.height = -header.height;
    const size_t n_header = fwrite(&header, 1, 54, file);
    header.height = -header.height;

    // Pixel data, counted in size_t so height * width cannot overflow int.
    const size_t n_pixels  = (size_t)header.height * (size_t)header.width;
    const size_t n_written = fwrite(img, sizeof(uchar4), n_pixels, file);

    // fclose flushes buffered data, so its result matters for a written file.
    const int close_status = fclose(file);
    if(n_header != 54 || n_written != n_pixels || close_status != 0){
        fprintf(stderr, "%s: failed to write the complete image\n", out_filename);
        exit(EXIT_FAILURE);
    }
}
/*
 * Releases all pixel buffers: the two device allocations (img_gpu and
 * rotate_gpu) via cudaFree and the host allocation img via free.
 */
void free_mem(){
    // Device buffers.
    cudaFree(rotate_gpu);
    cudaFree(img_gpu);
    // Host buffer (free needs no size; the allocator tracks it).
    free(img);
}

// @nvcc -o main.exe  -Xcompiler /source-charset:utf-8 c02_rotate_gpu.cu

性能比较的完整代码

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>

// 结构体定义
// Byte-pack the struct so its in-memory layout mirrors the 54 header bytes that
// read_bmp()/save_bmp() transfer with a single fread/fwrite. push/pop limits the
// packing to this struct instead of leaking it into every later declaration.
#pragma pack(push, 1)
struct img_header{
    // ---- file header (14 bytes) ----
    char                  magic[2];                  // signature bytes
    unsigned int          file_size;                 // file size
    unsigned char         reserve1[4];               // 4 skipped bytes
    unsigned int          data_off;                  // offset where the pixel data starts
    // ---- info header (40 bytes) ----
    unsigned char         reserve2[4];               // 4 skipped bytes
    int                   width;                     // image width in pixels
    int                   height;                    // image height in pixels (may be negative in the file)
    unsigned char         reserve3[2];               // 2 skipped bytes
    unsigned short int    bit_count;                 // bits per pixel: 1, 4, 8, 16, 24 or 32
    unsigned char         reserve4[24];              // 24 skipped bytes
};
#pragma pack(pop)

// The I/O code hard-codes 54 bytes; fail the build if the layout ever drifts.
static_assert(sizeof(struct img_header) == 54, "img_header must be exactly 54 bytes");


// Shared program state is kept in file-scope globals so that the helper
// functions below need no parameters.

// Input / output file names.
const char *in_filename  = "gpu.bmp";
const char *out_filename = "gpu_out.bmp";

// Image state.
img_header  header;        // header as read from the input file
uchar4     *img;           // host pixel buffer (uchar4 is a CUDA vector type)
uchar4     *img_gpu;       // device buffer holding the source image
uchar4     *rotate_gpu;    // device buffer receiving the rotated image

// Host-side helpers (all operate on the globals above).
void read_bmp();           // load header + pixels from in_filename
void move_to_device();     // allocate device buffers and upload img
void move_to_host();       // download the device result into img
void save_bmp();           // write header + pixels to out_filename
void free_mem();           // release host and device buffers

// Device kernel that writes the rotated copy of src into dst.
__global__ void rotate(uchar4 *src,  uchar4 *dst, double degree);

// Print the failing call together with the CUDA error string, then terminate.
static void check_cuda_call(cudaError_t status, const char *what){
    if(status != cudaSuccess){
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Times the 45-degree rotation for a chosen grid/block split.
 *
 * Usage: main [times]
 *   times - split factor (default 2): the kernel is launched with
 *           (height * times) blocks of (width / times) threads, e.g. for a
 *           1920-pixel-wide image 15 -> 128, 30 -> 64, 60 -> 32 threads per block.
 *
 * `times` must be positive and divide the image width evenly; otherwise the
 * launch would divide by zero or leave pixels without a thread. A block size
 * the device rejects is reported through the launch check.
 *
 * Returns 0 on success, 1 on an invalid split factor; terminates with
 * EXIT_FAILURE on a CUDA error.
 */
int main(int argc, const char **argv){
    // Split factor from the command line (default 2).
    int times = 2;
    if(argc > 1){
        times = atoi(argv[1]);
    }

    read_bmp();

    // Validate before any device work so nothing needs to be rolled back on the GPU.
    if(times <= 0 || header.width % times != 0){
        fprintf(stderr, "invalid split factor %d: it must be positive and divide the image width %d evenly\n",
                times, header.width);
        free(img);
        return 1;
    }

    // Events delimiting the three timed stages.
    cudaEvent_t t1, t2, t3, t4;
    check_cuda_call(cudaEventCreate(&t1), "cudaEventCreate(t1)");
    check_cuda_call(cudaEventCreate(&t2), "cudaEventCreate(t2)");
    check_cuda_call(cudaEventCreate(&t3), "cudaEventCreate(t3)");
    check_cuda_call(cudaEventCreate(&t4), "cudaEventCreate(t4)");

    check_cuda_call(cudaEventRecord(t1, 0), "cudaEventRecord(t1)");
    move_to_device();
    check_cuda_call(cudaEventRecord(t2, 0), "cudaEventRecord(t2)");

    // (height * times) blocks of (width / times) threads: one thread per pixel.
    dim3 grid(header.height * times);
    dim3 block(header.width / times);

    // The kernel hard-codes the image size; only buffers and the angle are passed.
    rotate<<<grid, block>>>(img_gpu, rotate_gpu, 45.0);
    // A launch does not return an error itself; fetch it explicitly.
    check_cuda_call(cudaGetLastError(), "rotate launch");
    check_cuda_call(cudaEventRecord(t3, 0), "cudaEventRecord(t3)");

    move_to_host();
    check_cuda_call(cudaEventRecord(t4, 0), "cudaEventRecord(t4)");

    // All events were recorded in order on stream 0, so once the last one has
    // completed the earlier ones have completed as well.
    check_cuda_call(cudaEventSynchronize(t4), "cudaEventSynchronize(t4)");

    // Elapsed times in milliseconds.
    float t_total, t_move2gpu, t_kernel, t_move2host;
    check_cuda_call(cudaEventElapsedTime(&t_total,     t1, t4), "cudaEventElapsedTime(total)");
    check_cuda_call(cudaEventElapsedTime(&t_move2gpu,  t1, t2), "cudaEventElapsedTime(to device)");
    check_cuda_call(cudaEventElapsedTime(&t_kernel,    t2, t3), "cudaEventElapsedTime(kernel)");
    check_cuda_call(cudaEventElapsedTime(&t_move2host, t3, t4), "cudaEventElapsedTime(to host)");
    printf("总耗时:%f\n", t_total);
    printf("\t|-拷贝到Device:%f\n", t_move2gpu);
    printf("\t|-计算:%f\n", t_kernel);
    printf("\t|-拷贝到Host:%f\n", t_move2host);

    save_bmp();
    free_mem();
    check_cuda_call(cudaEventDestroy(t1), "cudaEventDestroy(t1)");
    check_cuda_call(cudaEventDestroy(t2), "cudaEventDestroy(t2)");
    check_cuda_call(cudaEventDestroy(t3), "cudaEventDestroy(t3)");
    check_cuda_call(cudaEventDestroy(t4), "cudaEventDestroy(t4)");
    return 0;
}

/*
 * Rotates a fixed-size 1920 x 1080 image about its centre by `degree` degrees,
 * scaled down by (shorter side / diagonal), using forward mapping: each thread
 * copies one source pixel to its rotated position in `dst`.
 *
 *   src    - source pixels, 1920 * 1080 elements, row-major
 *   dst    - destination pixels, same size; positions no source pixel maps to
 *            keep whatever value they already hold
 *   degree - rotation angle in degrees
 *
 * Launch layout: 1-D grid of 1-D blocks, one thread per source pixel. Threads
 * whose index lies beyond the image, or whose destination falls outside the
 * image, do nothing instead of touching memory out of bounds.
 */
__global__ void rotate(uchar4 *src,  uchar4 *dst, double degree){
    // Image size is hard-coded rather than passed in.
    const int h = 1080;
    const int w = 1920;

    // Flat index of the source pixel owned by this thread.
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if(idx >= w * h){
        return;                      // surplus thread: no pixel to move
    }

    // Row / column of the source pixel.
    const int y = idx / w;           // integer division
    const int x = idx - y * w;       // remainder

    // Shrink factor: shorter side divided by the diagonal.
    const double d = sqrt((double)(w * w + h * h));
    const double scale = w < h ? w / d : h / d;

    // Angle in radians and its sine / cosine.
    const double arc = 2 * 3.141592 / 360.0 * degree;
    const double f_sin = sin(arc);
    const double f_cos = cos(arc);

    // Image centre.
    const int ox = w / 2;
    const int oy = h / 2;
    // Source coordinates relative to the centre.
    const double fo_x = x - ox;
    const double fo_y = y - oy;
    // Rotate about the centre and scale.
    const double fr_x = ( fo_x * f_cos + fo_y * f_sin) * scale;
    const double fr_y = (-fo_x * f_sin + fo_y * f_cos) * scale;

    // Truncate to integers and shift back to image coordinates.
    const int r_x = (int)fr_x + ox;
    const int r_y = (int)fr_y + oy;
    // The corner pixel lies exactly half a diagonal from the centre, so for
    // some angles the destination can land on row h; never write outside dst.
    if(r_x < 0 || r_x >= w || r_y < 0 || r_y >= h){
        return;
    }
    // Move the pixel to its rotated position (converted to a flat index).
    dst[r_y * w + r_x] = src[idx];
}

/*
 * Copies the rotated image from the device buffer rotate_gpu into the host
 * buffer img so it can be saved to disk. Terminates the program if the copy fails.
 */
void move_to_host(){
    // Multiply in size_t so height * width cannot overflow int for large images.
    const size_t n_bytes = (size_t)header.height * (size_t)header.width * sizeof(uchar4);
    const cudaError_t status = cudaMemcpy((void*)img, (void*)rotate_gpu, n_bytes, cudaMemcpyDeviceToHost);
    if(status != cudaSuccess){
        fprintf(stderr, "cudaMemcpy (device to host) failed: %s\n", cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}
/*
 * Prepares the device buffers: allocates img_gpu and uploads the host pixels
 * from img, then allocates rotate_gpu and clears it to zero bytes.
 * Terminates the program if any allocation, copy or memset fails.
 */
void move_to_device(){
    // Multiply in size_t so height * width cannot overflow int for large images.
    const size_t n_bytes = (size_t)header.height * (size_t)header.width * sizeof(uchar4);

    // Source image. cudaMalloc returns the pointer through its first argument.
    cudaError_t status = cudaMalloc((void**)&img_gpu, n_bytes);
    if(status != cudaSuccess){
        fprintf(stderr, "cudaMalloc (source image) failed: %s\n", cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
    status = cudaMemcpy((void*)img_gpu, (void*)img, n_bytes, cudaMemcpyHostToDevice);
    if(status != cudaSuccess){
        fprintf(stderr, "cudaMemcpy (host to device) failed: %s\n", cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }

    // Rotated image. Cleared first because the kernel does not write every
    // destination pixel; 0 is the fill byte (another value such as 255 also works).
    status = cudaMalloc((void**)&rotate_gpu, n_bytes);
    if(status != cudaSuccess){
        fprintf(stderr, "cudaMalloc (rotated image) failed: %s\n", cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
    status = cudaMemset((void*)rotate_gpu, 0, n_bytes);
    if(status != cudaSuccess){
        fprintf(stderr, "cudaMemset (rotated image) failed: %s\n", cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}
/*
 * Loads in_filename into the globals: the first 54 bytes go into `header`,
 * then height * width uchar4 elements are read into a freshly malloc'ed `img`.
 * A negative stored height is replaced by its absolute value.
 *
 * The pixel data is read as 4 bytes per pixel directly after the header, so
 * the input is assumed to be a 32-bit image whose pixels start at byte 54.
 * Terminates the program if the file cannot be opened, is too short, has
 * non-positive dimensions, or the pixel buffer cannot be allocated.
 */
void read_bmp(){
    FILE *file = fopen(in_filename, "rb");
    if(file == NULL){
        perror(in_filename);
        exit(EXIT_FAILURE);
    }

    // Header: exactly 54 bytes.
    if(fread(&header, 1, 54, file) != 54){
        fprintf(stderr, "%s: file is too short to contain the 54-byte header\n", in_filename);
        fclose(file);
        exit(EXIT_FAILURE);
    }

    // Work with a non-negative height from here on.
    header.height = header.height >= 0 ? header.height : -header.height;
    if(header.width <= 0 || header.height <= 0){
        fprintf(stderr, "%s: invalid image size %d x %d\n", in_filename, header.width, header.height);
        fclose(file);
        exit(EXIT_FAILURE);
    }

    // Multiply in size_t so height * width cannot overflow int.
    const size_t n_pixels = (size_t)header.height * (size_t)header.width;
    img = (uchar4 *)malloc(n_pixels * sizeof(uchar4));
    if(img == NULL){
        fprintf(stderr, "%s: cannot allocate %zu bytes for the pixel buffer\n",
                in_filename, n_pixels * sizeof(uchar4));
        fclose(file);
        exit(EXIT_FAILURE);
    }

    // A short read would leave part of img uninitialised, so treat it as an error.
    if(fread(img, sizeof(uchar4), n_pixels, file) != n_pixels){
        fprintf(stderr, "%s: pixel data is shorter than width * height * 4 bytes\n", in_filename);
        fclose(file);
        exit(EXIT_FAILURE);
    }

    fclose(file);
}
/*
 * Writes the image to out_filename: the 54-byte `header` (with its height
 * negated for the duration of the write) followed by height * width uchar4
 * elements from `img`. Terminates the program if the file cannot be opened
 * or is not written completely.
 */
void save_bmp(){
    FILE *file = fopen(out_filename, "wb");
    if(file == NULL){
        perror(out_filename);
        exit(EXIT_FAILURE);
    }

    // The header goes out with a negated height; the in-memory value is
    // restored immediately so later code still sees the non-negative height.
    header.height = -header.height;
    const size_t n_header = fwrite(&header, 1, 54, file);
    header.height = -header.height;

    // Pixel data, counted in size_t so height * width cannot overflow int.
    const size_t n_pixels  = (size_t)header.height * (size_t)header.width;
    const size_t n_written = fwrite(img, sizeof(uchar4), n_pixels, file);

    // fclose flushes buffered data, so its result matters for a written file.
    const int close_status = fclose(file);
    if(n_header != 54 || n_written != n_pixels || close_status != 0){
        fprintf(stderr, "%s: failed to write the complete image\n", out_filename);
        exit(EXIT_FAILURE);
    }
}
/*
 * Releases all pixel buffers: the two device allocations (img_gpu and
 * rotate_gpu) via cudaFree and the host allocation img via free.
 */
void free_mem(){
    // Device buffers.
    cudaFree(rotate_gpu);
    cudaFree(img_gpu);
    // Host buffer (free needs no size; the allocator tracks it).
    free(img);
}

// @nvcc -o main.exe  -Xcompiler /source-charset:utf-8 c03_rotate_timer.cu

执行方式
- main 15 # 128
- main 30 # 64
- main 60 # 32

CUDA02_05GPU计时与性能评估
该是分析GPU性能的时候了，GPU性能与如下因素有关： 1. GPU硬件架构； 2. PCI吞吐...
Jmeter组件
一. 性能测试 1.为什么要进行性能测试？ ①评估当前系统的能力 ②寻找性能瓶颈，优化系统性能 ③评估软件是否满...
Jmeter安装及使用
一. 性能测试 1.0 为什么要进行性能测试？评估当前系统的能力寻找性能瓶颈，优化系统性能评估软件是否满足未...
[JPT_02]性能测试-性能指标的分析 & 定义
目录结构一、性能测试需求分析与定义通过前文[JPT_01]性能测试需求分析对性能测试的必要性评估之后，敏捷开发...
评估方法与性能度量
模型评估方法留出法（训练集2/3~4/5）交叉验证（数据量充足）自助法（数据集较小、难以有效划分训练/测试集） ...
无标题文章
Linux服务器性能评估与优化(一) http://blog.csdn.net/hguisu/article/d...
JVM（十一）内存与垃圾回收|垃圾回收器
本文介绍垃圾回收器。目录 1 GC的分类与性能指标 2 评估GC的性能指标 2.1 吞吐量 2.2 暂停时...
我是如何为Serverless配置内存和超时时间的
之前写过一篇文章：Serverless架构与资源评估：性能与成本探索是关于性能和成本的探索，探索之后，就不得不...
2019-05-17资产评估概述01
资产评估概述（倒计时：127） [if !supportLists]一、[endif]资产评估的概念及特点 [i...
模型性能评估
不同的机器学习任务对应有不同的评估指标：评估分类模型混淆矩阵（Confusion matrix）： True ...

CUDA02_05GPU计时与性能评估

时间与事件记录

函数说明

使用模式

使用例子

线程分配方案的性能比较

GPU上的数学运算

用代码比较性能

性能的分析

附录

相关文章

CUDA02_05GPU计时与性能评估

Jmeter组件

Jmeter安装及使用

[JPT_02]性能测试-性能指标的分析 & 定义

评估方法与性能度量

无标题文章

JVM（十一）内存与垃圾回收|垃圾回收器

我是如何为Serverless配置内存和超时时间的

2019-05-17资产评估概述01

模型性能评估

网友评论

延伸阅读

深度阅读

栏目导航

热点阅读