美文网首页
CUDA(五) - Reduce Sum并行归约

CUDA(五) - Reduce Sum并行归约

作者: 侠之大者_7d3f | 来源:发表于2022-01-06 17:21 被阅读0次
    #include<iostream>
    #include<cuda_runtime.h>
    #include"utils.cuh"
    
    // Precision switch: 1 = double precision, 0 = single precision.
    #define USE_DP 1
    
    #if USE_DP
        using real = double;
    #else
        using real = float;
    #endif // USE_DP
    
    // Number of elements to reduce (~200M) and the buffer size in bytes.
    // NOTE(review): M is an int; sizeof(real) * N fits for these values, but a
    // larger N would overflow — consider size_t if N grows.
    constexpr int N = 1024 * 1024 * 200;
    constexpr int M = sizeof(real) * N;
    // Threads per block used by every kernel launch below.
    constexpr int BLOCK_SIZE = 128;
    
    // Sequential host-side reference: returns the sum of the first N
    // elements of arr, accumulated in `real` precision.
    real reduce_cpu(real* arr, int N) {
        real total = 0.0;
        for (int idx = 0; idx != N; ++idx) {
            total += arr[idx];
        }
        return total;
    }
    
    
    // In-place block reduction on global memory: each block tree-reduces its
    // blockDim.x-element slice of d_x and writes the block's sum to
    // d_y[blockIdx.x].
    // NOTE(review): destructive — overwrites d_x, so it cannot be re-run on the
    // same input. It also has no tail guard, so it assumes the array length is
    // an exact multiple of blockDim.x (true here: N is a multiple of BLOCK_SIZE).
    __global__ void reduce_global(real* d_x, real* d_y) {
        const int tid = threadIdx.x;
        // Base pointer of this block's contiguous slice of the input.
        real* x = d_x + blockDim.x * blockIdx.x;
    
        // Tree reduction: each pass folds the upper half into the lower half,
        // halving the active range until one value remains at x[0].
        for(int offset = blockDim.x >> 1; offset > 0; offset >>= 1) {
            if(tid < offset) {
                x[tid] += x[tid + offset];
            }
            // Barrier is outside the divergent branch so all threads reach it.
            __syncthreads();
        }
    
        if(tid==0) {
            // Write each block's result into d_y
            d_y[blockIdx.x] = x[0];
        }
    }
    
    // Block reduction staged through statically sized shared memory.
    // Each block loads up to blockDim.x elements of d_x into s_y (zero-padding
    // past N), tree-reduces them, and writes the block sum to d_y[blockIdx.x].
    // Requires blockDim.x <= BLOCK_SIZE (the shared-buffer capacity).
    __global__ void reduce_shared(real* d_x, real* d_y) {
        const int tid = threadIdx.x;
        const int bid = blockIdx.x;
        const int n = bid * blockDim.x + tid;
        // FIX: size the buffer with BLOCK_SIZE instead of a hard-coded 128 so
        // it stays in sync with the launch configuration constant.
        __shared__ real s_y[BLOCK_SIZE];
        s_y[tid] = (n < N) ? d_x[n] : 0.0;
        __syncthreads();
    
        // Tree reduction: halve the active thread count each pass.
        for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) {
            if (tid < offset) {
                s_y[tid] += s_y[tid + offset];
            }
            __syncthreads();  // barrier outside the divergent branch
        }
    
        if (tid == 0) {
            d_y[bid] = s_y[0];
        }
    }
    
    // Block reduction using dynamically sized shared memory. The shared buffer
    // size is supplied as the third <<<>>> launch argument and must be at least
    // blockDim.x * sizeof(real). Each block writes its sum to d_y[blockIdx.x].
    __global__ void reduce_dynamic(real* d_x, real* d_y) {
        extern __shared__ real partial[];
        const int tid = threadIdx.x;
        const int bid = blockIdx.x;
        const int idx = bid * blockDim.x + tid;
    
        // Stage one element per thread; pad the tail beyond N with zero.
        partial[tid] = (idx < N) ? d_x[idx] : 0.0;
        __syncthreads();
    
        // Tree reduction: fold the upper half into the lower half each pass.
        for (int stride = blockDim.x / 2; stride != 0; stride /= 2) {
            if (tid < stride) {
                partial[tid] += partial[tid + stride];
            }
            __syncthreads();  // all threads must reach the barrier
        }
    
        // Thread 0 publishes this block's partial sum.
        if (tid == 0) {
            d_y[bid] = partial[0];
        }
    }
    
    // Launches a block-wise reduction over the N-element device array d_x,
    // copies the per-block partial sums back, and finishes the sum on the host.
    // Returns the total. Times the kernel with GPUTimer and prints the elapsed
    // milliseconds. NOTE: reduce_global modifies d_x in place.
    real reduce(real* d_x) {
        int grid_size = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;
        const int ymem = sizeof(real) * grid_size;      // partial-sum buffer bytes
        const int smem = sizeof(real) * BLOCK_SIZE;     // dynamic shared mem (reduce_dynamic)
        real* d_y;
        CHECK(cudaMalloc((void**)&d_y, ymem));
        real* h_y = (real*)malloc(ymem);
    
        GPUTimer timer;
        timer.start();
        reduce_global<<<grid_size, BLOCK_SIZE>>>(d_x, d_y);
        // Alternative kernels (same grid/block; reduce_dynamic needs smem):
        // reduce_shared<<<grid_size, BLOCK_SIZE>>>(d_x, d_y);
        // reduce_dynamic<<<grid_size, BLOCK_SIZE, smem>>>(d_x, d_y);
        CHECK(cudaGetLastError());  // surface launch-configuration errors
        timer.stop();
        std::cout << "time:" << timer.elapsed_ms() << "ms" << std::endl;
        CHECK(cudaMemcpy(h_y, d_y, ymem, cudaMemcpyDeviceToHost));
    
        // Host-side accumulation of the per-block partial sums.
        real result = 0.0;
        for (int i = 0; i < grid_size; i++) {
            // BUG FIX: previously read h_y[0] every iteration, which returned
            // grid_size * (first block's sum) instead of the true total.
            result += h_y[i];
        }
    
        free(h_y);
        CHECK(cudaFree(d_y));
        return result;
    }
    
    int main() {
        // Warm up the device so the first timed launch is not penalized by
        // context/driver initialization.
        WARN_UP;
        WARN_UP;
    
        // Host input: N copies of 1.23; expected sum is 1.23 * N.
        real* h_x = new real[N];
        for (int i = 0; i < N; i++) {
            h_x[i] = 1.23;
        }
    
        real* d_x;
        CHECK(cudaMalloc((void**)&d_x, M));
        CHECK(cudaMemcpy(d_x, h_x, M, cudaMemcpyHostToDevice));
        real result_gpu = reduce(d_x);
    
        std::cout << "result_gpu:" << result_gpu << std::endl;
        std::cout << "result_cpu:" << reduce_cpu(h_x, N) << std::endl;
    
        // FIX: release device and host buffers that were previously leaked.
        CHECK(cudaFree(d_x));
        delete[] h_x;
        return 0;
    }
    

    相关文章

      网友评论

          本文标题:CUDA(五) - Reduce Sum并行归约

          本文链接:https://www.haomeiwen.com/subject/svdrcrtx.html