前言
开发GPU端的程序主要是为了性能优化, 而衡量程序/代码性能的主要指标是执行时间. 为此, CUDA专门提供了用于测量时间的事件(event)类型 cudaEvent_t, 以及 cudaEventRecord(), cudaEventElapsedTime() 等API函数.
下面通过实践了解一下具体用法.
测试环境
- OS: Ubuntu 20.04
- CUDA: v11
- GCC-10
- VSCode
工程测试
CUDA测量时间的API函数介绍:
CUDA首先提供了一个数据类型: cudaEvent_t, 表示一个CUDA事件(event), 实际上是一个指针类型.
API:
-
cudaEventCreate(cudaEvent_t *event) --- 创建event变量
-
cudaEventRecord(cudaEvent_t event, cudaStream_t stream __dv(0)) ---记录event事件, 对同一个event, 可以调用多次cudaEventRecord(), 最新调用的值覆盖之前的旧值.
::cudaEventRecord() can be called multiple times on the same event and will overwrite the previously captured state.
-
cudaEventSynchronize(cudaEvent_t event) --- 用于event事件的同步: 阻塞主机(CPU)线程, 直到该event所记录的工作全部完成; 此函数返回之后可以保证event已经结束
Waits for an event to complete. Waits until the completion of all work currently captured in event.
-
cudaEventElapsedTime(float *ms, cudaEvent_t start, cudaEvent_t end) --- 统计start和end 2个event之间的时间间隔(start在前, end在后), 单位为毫秒(ms), 此函数的时间分辨率约为0.5us
Computes the elapsed time between two events (in milliseconds with a resolution of around 0.5 microseconds).
Demo代码实现
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <vector>

#include <cuda_runtime.h>
// ref: https://blog.csdn.net/qq_17239003/article/details/78991567
using namespace std;
// CHECK(call): evaluates the CUDA runtime expression `call` exactly once.
// If the result is not cudaSuccess, the file, line, numeric error code and
// the text from cudaGetErrorString() are written to stderr and the process
// exits with status 1.
//
// Notes:
//  - `call` is parenthesized so that any expression passed in is evaluated
//    as a single unit.
//  - The do { ... } while (0) wrapper makes the macro expand to a single
//    statement, so it can be used inside an unbraced if/else.
//  - A kernel launch is not an expression with a cudaError_t result; wrap
//    cudaGetLastError() with CHECK after the launch instead.
#define CHECK(call)                                                       \
    do                                                                    \
    {                                                                     \
        const cudaError_t error_code = (call);                            \
        if (error_code != cudaSuccess)                                    \
        {                                                                 \
            std::fprintf(stderr, "CUDA Error\n");                         \
            std::fprintf(stderr, " File: %s\n", __FILE__);                \
            std::fprintf(stderr, " Line: %d\n", __LINE__);                \
            std::fprintf(stderr, " Error code: %d\n",                     \
                         static_cast<int>(error_code));                   \
            std::fprintf(stderr, " Error text: %s\n",                     \
                         cudaGetErrorString(error_code));                 \
            std::exit(1);                                                 \
        }                                                                 \
    } while (0)
// GPU Kernel func, perform element-wise add
// Element-wise integer addition: out[i] = arr1[i] + arr2[i].
//
// Each thread builds one flat index from blockIdx.x, blockDim.x and
// threadIdx.x and processes at most one element. Threads whose index is
// N or larger return without touching memory, so the launch may contain
// more threads than there are elements.
//
// Parameters:
//   arr1, arr2 - input arrays, read at the thread's index
//   out        - output array, written at the thread's index
//   N          - number of elements to process
__global__ void kernel_sum(int *arr1, int *arr2, int *out, int N)
{
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;

    // Guard the tail: skip threads that fall past the last element.
    if (idx >= N)
    {
        return;
    }

    const int lhs = arr1[idx];
    const int rhs = arr2[idx];
    out[idx] = lhs + rhs;
}
// Demo entry point: fills two host arrays with pseudo-random integers, adds
// them element-wise on the GPU with kernel_sum, and prints the kernel time
// measured with a pair of CUDA events.
//
// Every CUDA runtime call is wrapped in CHECK, so any failure prints a
// diagnostic and terminates the process. Returns 0 on success.
int main()
{
    const int N = 2048 * 2048;
    const size_t bytes = sizeof(int) * N;

    // Host buffers; std::vector releases them automatically on return.
    std::vector<int> arr1(N);
    std::vector<int> arr2(N);
    std::vector<int> out(N);

    srand(123456);
    for (int i = 0; i < N; i++)
    {
        // Widen before multiplying: rand() may be as large as RAND_MAX, so
        // rand() * 5 computed in plain int can overflow (undefined behavior).
        arr1[i] = static_cast<int>(static_cast<long long>(rand()) * 5 % 255);
        arr2[i] = rand() % 128 + 5;
    }

    // 1. Allocate device memory.
    int *d_arr1 = nullptr;
    int *d_arr2 = nullptr;
    int *d_out = nullptr;
    CHECK(cudaMalloc((void **)&d_arr1, bytes));
    CHECK(cudaMalloc((void **)&d_arr2, bytes));
    CHECK(cudaMalloc((void **)&d_out, bytes));

    // 2. Copy the inputs from host memory to device memory.
    CHECK(cudaMemcpy(d_arr1, arr1.data(), bytes, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(d_arr2, arr2.data(), bytes, cudaMemcpyHostToDevice));

    // 3. Choose the launch configuration, then time the kernel.
    //    The configuration is computed before the start event so that only
    //    the kernel itself lies between the two recorded events.
    const int blk_size = 128;
    const int grid_size = (N + blk_size - 1) / blk_size; // ceil(N / blk_size)

    cudaEvent_t start, end;
    CHECK(cudaEventCreate(&start));
    CHECK(cudaEventCreate(&end));

    // Start timing.
    CHECK(cudaEventRecord(start));
    kernel_sum<<<grid_size, blk_size>>>(d_arr1, d_arr2, d_out, N);
    // A launch returns no status itself; query the launch error explicitly.
    CHECK(cudaGetLastError());
    // Stop timing, and wait until the end event has actually completed.
    CHECK(cudaEventRecord(end));
    CHECK(cudaEventSynchronize(end));

    // Report the elapsed time between the two events.
    float time_ms = 0.f;
    CHECK(cudaEventElapsedTime(&time_ms, start, end));
    std::cout << "CUDA Kernel time: " << time_ms << " ms" << std::endl;

    // 4. Copy the GPU result back to the host.
    CHECK(cudaMemcpy(out.data(), d_out, bytes, cudaMemcpyDeviceToHost));

    // 5. Release the events and the device memory.
    CHECK(cudaEventDestroy(start));
    CHECK(cudaEventDestroy(end));
    CHECK(cudaFree(d_arr1));
    CHECK(cudaFree(d_arr2));
    CHECK(cudaFree(d_out));

    return 0;
}
运行结果:
![](https://img.haomeiwen.com/i11478104/7567d9df3d79be4c.png)
将CUDA计时代码封装为工具类: GPUTimer
为了代码的重用, 将上述CUDA计时代码封装为一个工具类:
// GPUTimer: measures the time between start() and stop() with a pair of
// CUDA events.
//
// Usage:
//   GPUTimer timer;
//   timer.start();
//   my_kernel<<<grid, block>>>(...);
//   timer.stop();
//   float ms = timer.elapsed_ms();
//
// The class owns its two events and destroys them in the destructor, so it
// is non-copyable: a copy would destroy the same events a second time.
class GPUTimer
{
public:
    // Creates the start/end events. If creation fails the handle stays
    // null, and elapsed_ms() will later report the failure.
    GPUTimer() : m_start(nullptr), m_end(nullptr)
    {
        cudaEventCreate(&m_start);
        cudaEventCreate(&m_end);
    }

    // Destroys whichever events were successfully created.
    ~GPUTimer()
    {
        if (m_start != nullptr)
        {
            cudaEventDestroy(m_start);
        }
        if (m_end != nullptr)
        {
            cudaEventDestroy(m_end);
        }
    }

    // Copying would lead to a double cudaEventDestroy on the same handles.
    GPUTimer(const GPUTimer &) = delete;
    GPUTimer &operator=(const GPUTimer &) = delete;

    // Returns the time in milliseconds between the most recent start() and
    // stop(). Returns -1.0f if cudaEventElapsedTime reports an error (for
    // example when start()/stop() were never called).
    float elapsed_ms() const
    {
        float ms = 0.0f;
        if (cudaEventElapsedTime(&ms, m_start, m_end) != cudaSuccess)
        {
            return -1.0f;
        }
        return ms;
    }

    // Records the start event on `stream` (stream 0 by default). Calling
    // it again overwrites the previously recorded start point.
    void start(cudaStream_t stream = 0)
    {
        cudaEventRecord(m_start, stream);
    }

    // Records the end event on `stream` (stream 0 by default) and blocks
    // the calling host thread until that event has completed, so that
    // elapsed_ms() can be queried right afterwards. Pass the same stream
    // that was given to start().
    void stop(cudaStream_t stream = 0)
    {
        cudaEventRecord(m_end, stream);
        cudaEventSynchronize(m_end);
    }

private:
    cudaEvent_t m_start; // event recorded by start()
    cudaEvent_t m_end;   // event recorded by stop()
};
补充: CPU端测量时间的工具代码
注意: 编译器需要支持>=C++11, 包含头文件: <chrono>
// CPUTimer: host-side wall-clock interval timer built on <chrono>.
//
// Usage:
//   CPUTimer timer;          // also marks a start point
//   timer.start();           // optional: restart the measurement
//   do_work();
//   timer.stop();
//   float ms = timer.elapsed_ms();
//
// std::chrono::steady_clock is used because it is monotonic: the measured
// interval cannot go backwards or jump when the system time is adjusted.
class CPUTimer
{
public:
    // Marks the start point at construction. The end point is set to the
    // same instant, so elapsed_ms() is 0 until stop() is called.
    CPUTimer() : m_start(clock_type::now()), m_end(m_start) {}

    ~CPUTimer() {}

    // Restarts the measurement from the current instant and resets the
    // end point, so elapsed_ms() is 0 until the next stop().
    void start()
    {
        m_start = clock_type::now();
        m_end = m_start;
    }

    // Marks the end point of the measured interval.
    void stop()
    {
        m_end = clock_type::now();
    }

    // Returns the time between the start point and the last stop() in
    // milliseconds, keeping the clock's full sub-millisecond precision.
    float elapsed_ms() const
    {
        return std::chrono::duration<float, std::milli>(m_end - m_start).count();
    }

private:
    typedef std::chrono::steady_clock clock_type;

    // Declared in this order on purpose: m_end is initialized from m_start.
    clock_type::time_point m_start; // start of the measured interval
    clock_type::time_point m_end;   // end of the measured interval
};
网友评论