
CUDA (5) - Reduce Sum Parallel Reduction

Author: 侠之大者_7d3f | Published 2022-01-06 17:21
#include<iostream>
#include<cuda_runtime.h>
#include"utils.cuh"

#define USE_DP 1

#if USE_DP
    using real = double;
#else
    using real = float;
#endif // USE_DP

constexpr int N = 1024 * 1024 * 200;
constexpr int M = sizeof(real) * N;
constexpr int BLOCK_SIZE = 128;

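// Serial reference implementation on the host, used to check the GPU result.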
real reduce_cpu(real* arr, int N) {
    real sum = 0.0;
    for(int i=0;i<N;i++) {
        sum += arr[i];
    }
    return sum;
}


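// Version 1: reduction entirely in global memory. Each block reduces its own
// contiguous chunk of d_x in place (d_x is modified) and writes its partial sum to d_y.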
__global__ void reduce_global(real* d_x, real* d_y) {
    const int tid = threadIdx.x;
    real* x = d_x + blockDim.x * blockIdx.x;

    for(int offset = blockDim.x >> 1; offset > 0; offset >>= 1) {
        if(tid < offset) {
            x[tid] += x[tid + offset];
        }
        __syncthreads();
    }

    if(tid==0) {
        // write this block's partial result to d_y
        d_y[blockIdx.x] = x[0];
    }
}

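// Version 2: reduction in statically allocated shared memory. Each thread first copies
// one element of d_x into s_y, so d_x is read only once and is left unchanged.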
__global__ void reduce_shared(real* d_x, real* d_y) {
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;
    const int n = bid * blockDim.x + tid;
    __shared__ real s_y[BLOCK_SIZE];  // static shared memory; size must equal the block size
    s_y[tid] = (n < N) ? d_x[n]:0.0;
    __syncthreads();

    for(int offset = blockDim.x >> 1; offset > 0;offset >>= 1) {
        if(tid < offset) {
            s_y[tid] += s_y[tid + offset];
        }
        __syncthreads();
    }

    if(tid == 0) {
        d_y[bid] = s_y[0];
    }
}

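// Version 3: same as version 2, but the shared-memory array is allocated dynamically;
// its size is passed as the third parameter of the kernel launch (<<<grid, block, smem>>>).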
__global__ void reduce_dynamic(real* d_x, real* d_y) {
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;
    const int n = bid * blockDim.x + tid;
    extern __shared__ real s_y[];
    s_y[tid] = (n < N) ? d_x[n]:0.0;
    __syncthreads();

    for(int offset = blockDim.x >> 1; offset > 0; offset >>=1) {
        if(tid < offset) {
            s_y[tid] += s_y[tid + offset];
        }
        __syncthreads();
    }

    if(tid == 0) {
        d_y[bid] = s_y[0];
    }
}

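// Host wrapper: launches one of the kernels, copies the per-block partial sums back,
// and finishes the reduction on the CPU.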
real reduce(real* d_x) {
    int grid_size = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;
    const int ymem = sizeof(real) * grid_size;
    const int smem = sizeof(real) * BLOCK_SIZE;
    real* d_y;
    CHECK(cudaMalloc((void**)&d_y, ymem));
    real* h_y = (real*)malloc(ymem);

    GPUTimer timer;
    timer.start();
    // Uncomment one launch at a time to compare the three kernels; reduce_dynamic
    // takes the dynamic shared-memory size (smem) as the third launch parameter.
    reduce_global<<<grid_size, BLOCK_SIZE>>>(d_x, d_y);
    // reduce_shared<<<grid_size, BLOCK_SIZE>>>(d_x, d_y);
    // reduce_dynamic<<<grid_size, BLOCK_SIZE, smem>>>(d_x, d_y);
    timer.stop();
    std::cout << "time:" << timer.elapsed_ms() << "ms" << std::endl;
    CHECK(cudaMemcpy(h_y, d_y, ymem, cudaMemcpyDeviceToHost));

    real result = 0.0;
    for(int i=0;i<grid_size;i++) {
        result += h_y[i];  // accumulate every block's partial sum
    }

    free(h_y);
    CHECK(cudaFree(d_y));
    return result;
}

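// Entry point: warm up the GPU, fill the input with 1.23, and compare the GPU result
// against the straightforward CPU sum.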
int main() {

    WARN_UP;
    WARN_UP;

    real* h_x = new real[N];
    for(int i=0;i<N;i++) {
        h_x[i] = 1.23;
    }

    real* d_x;
    CHECK(cudaMalloc((void**)&d_x, M));
    CHECK(cudaMemcpy(d_x, h_x, M, cudaMemcpyHostToDevice));
    real result_gpu = 0.0;
    result_gpu = reduce(d_x);

    std::cout << "result_gpu:" << result_gpu << std::endl;
    std::cout << "result_cpu:" << reduce_cpu(h_x, N) << std::endl;

    delete[] h_x;
    CHECK(cudaFree(d_x));
    return 0;
}
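
The helper header utils.cuh is not shown in the post. Below is a minimal sketch of what it might provide, inferred only from how CHECK, GPUTimer, and WARN_UP are used above; the names and details are assumptions, not the author's actual header.

#pragma once
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Abort with a message if a CUDA runtime call returns an error.
#define CHECK(call)                                                     \
    do {                                                                \
        const cudaError_t err = (call);                                 \
        if (err != cudaSuccess) {                                       \
            printf("CUDA error %s at %s:%d\n",                          \
                   cudaGetErrorString(err), __FILE__, __LINE__);        \
            exit(EXIT_FAILURE);                                         \
        }                                                               \
    } while (0)

// Launch an empty kernel so that later timings do not include
// context-creation or first-launch overhead.
__global__ void warm_up_kernel() {}
#define WARN_UP warm_up_kernel<<<1, 1>>>()

// Simple event-based GPU timer.
class GPUTimer {
public:
    GPUTimer()  { cudaEventCreate(&start_); cudaEventCreate(&stop_); }
    ~GPUTimer() { cudaEventDestroy(start_); cudaEventDestroy(stop_); }
    void start() { cudaEventRecord(start_); }
    void stop()  { cudaEventRecord(stop_); cudaEventSynchronize(stop_); }
    float elapsed_ms() {
        float ms = 0.0f;
        cudaEventElapsedTime(&ms, start_, stop_);
        return ms;
    }
private:
    cudaEvent_t start_, stop_;
};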
