#include<iostream>
#include<cuda_runtime.h>
#include"utils.cuh"
// Precision switch: 1 -> double, 0 -> float.
#define USE_DP 1
#if USE_DP
using real = double;
#else
using real = float;
#endif // USE_DP

// Number of elements to reduce (200 Mi elements).
constexpr int N = 1024 * 1024 * 200;
// Buffer size in bytes. Declared size_t (was int) so the sizeof(real) * N
// product cannot overflow a 32-bit int for larger N or wider element types;
// all consumers (cudaMalloc/cudaMemcpy) take size_t anyway.
constexpr size_t M = sizeof(real) * static_cast<size_t>(N);
// Threads per block; the reduction kernels require this to be a power of two.
constexpr int BLOCK_SIZE = 128;
// Sequential host-side reference sum of arr[0..N).
// Note: the parameter N shadows the file-scope constant of the same name.
real reduce_cpu(real* arr, int N) {
    real total = 0.0;
    for (int idx = 0; idx != N; ++idx)
        total += arr[idx];
    return total;
}
// In-place tree reduction over global memory: each block sums its own
// blockDim.x-element slice of d_x and writes the partial sum to d_y[blockIdx.x].
//
// Preconditions (not checked in-kernel):
//   - blockDim.x must be a power of two (offset halves each pass);
//   - there is no (index < N) guard, so the grid must tile d_x exactly
//     (N divisible by blockDim.x) or the last block reads out of bounds.
// Side effect: d_x is clobbered — the slice is reduced destructively, so a
// second launch on the same buffer produces a different (wrong) result.
__global__ void reduce_global(real* d_x, real* d_y) {
const int tid = threadIdx.x;
// Base of this block's private slice of the input.
real* x = d_x + blockDim.x * blockIdx.x;
for(int offset = blockDim.x >> 1; offset > 0; offset >>= 1) {
if(tid < offset) {
// Fold the upper half of the active range into the lower half.
x[tid] += x[tid + offset];
}
// Barrier is outside the divergent branch: every thread reaches it,
// and the pass's writes are visible before the next pass reads them.
__syncthreads();
}
if(tid==0) {
// Write this block's result to d_y.
d_y[blockIdx.x] = x[0];
}
}
// Per-block tree reduction using a statically sized shared-memory buffer.
// Grid layout: 1-D; blockDim.x must equal 128 (the size of s_y) and be a
// power of two. Each block writes its partial sum to d_y[blockIdx.x].
// Unlike reduce_global, d_x is read-only here and the tail past N is padded
// with zeros, so N need not be a multiple of blockDim.x.
__global__ void reduce_shared(real* d_x, real* d_y) {
    __shared__ real s_y[128];

    const int tid = threadIdx.x;
    const int bid = blockIdx.x;
    const int n = bid * blockDim.x + tid;

    // Stage this thread's element (or 0 past the end of the array).
    s_y[tid] = (n < N) ? d_x[n] : 0.0;
    __syncthreads();

    // Halve the active range each pass until one value remains.
    int offset = blockDim.x >> 1;
    while (offset > 0) {
        if (tid < offset)
            s_y[tid] += s_y[tid + offset];
        __syncthreads();  // barrier kept outside the divergent branch
        offset >>= 1;
    }

    if (tid == 0)
        d_y[bid] = s_y[0];
}
// Same tree reduction as reduce_shared, but the shared buffer is sized at
// launch time via the third <<<...>>> argument, which must be
// blockDim.x * sizeof(real) bytes. blockDim.x must be a power of two.
__global__ void reduce_dynamic(real* d_x, real* d_y) {
    extern __shared__ real s_y[];

    const int tid = threadIdx.x;
    const int idx = blockIdx.x * blockDim.x + tid;

    // One element per thread; zero-pad past the end of the array.
    s_y[tid] = (idx < N) ? d_x[idx] : 0.0;
    __syncthreads();

    for (int stride = blockDim.x / 2; stride; stride /= 2) {
        if (tid < stride) {
            s_y[tid] += s_y[tid + stride];
        }
        __syncthreads();  // every thread must reach this barrier
    }

    if (tid == 0) {
        d_y[blockIdx.x] = s_y[0];
    }
}
// Runs (and times) a one-pass GPU reduction over the N elements at device
// pointer d_x, copies the per-block partial sums back, and finishes the sum
// on the host. Returns the total. Allocates and frees a temporary device
// buffer of grid_size partials.
real reduce(real* d_x) {
    const int grid_size = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;  // ceil-div
    const int ymem = sizeof(real) * grid_size;   // bytes for per-block partials
    const int smem = sizeof(real) * BLOCK_SIZE;  // dynamic shared mem for reduce_dynamic
    real* d_y;
    CHECK(cudaMalloc((void**)&d_y, ymem));
    real* h_y = (real*)malloc(ymem);

    GPUTimer timer;
    timer.start();
    reduce_global<<<grid_size, BLOCK_SIZE>>>(d_x, d_y);
    // Alternative kernels for comparison:
    // reduce_shared<<<grid_size, BLOCK_SIZE>>>(d_x, d_y);
    // reduce_dynamic<<<grid_size, BLOCK_SIZE, smem>>>(d_x, d_y);
    CHECK(cudaGetLastError());  // surface launch-configuration errors
    timer.stop();
    std::cout << "time:" << timer.elapsed_ms() << "ms" << std::endl;

    // Blocking copy: also synchronizes with the kernel before reading h_y.
    CHECK(cudaMemcpy(h_y, d_y, ymem, cudaMemcpyDeviceToHost));

    real result = 0.0;
    for (int i = 0; i < grid_size; i++) {
        result += h_y[i];  // BUGFIX: was h_y[0], which added the first partial grid_size times
    }
    free(h_y);
    CHECK(cudaFree(d_y));
    return result;
}
// Fills N host elements with 1.23, copies them to the device, runs the GPU
// reduction, and prints the GPU and CPU results side by side for comparison.
int main() {
    WARN_UP;  // project macro from utils.cuh — presumably warms up the GPU; TODO confirm
    WARN_UP;
    real* h_x = new real[N];
    for (int i = 0; i < N; i++) {
        h_x[i] = 1.23;
    }
    real* d_x;
    CHECK(cudaMalloc((void**)&d_x, M));
    CHECK(cudaMemcpy(d_x, h_x, M, cudaMemcpyHostToDevice));
    real result_gpu = reduce(d_x);
    std::cout << "result_gpu:" << result_gpu << std::endl;
    std::cout << "result_cpu:" << reduce_cpu(h_x, N) << std::endl;
    // BUGFIX: release device and host buffers (both previously leaked).
    CHECK(cudaFree(d_x));
    delete[] h_x;
    return 0;
}
// (removed stray scraped text "网友评论" / "user comments" — page residue, not valid C++)