美文网首页
CUDA Samples-matMul

CUDA Samples-matMul

作者: 凉凉zz | 来源:发表于2018-11-26 22:24 被阅读0次
    /**
     * Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
     *
     * Please refer to the NVIDIA end user license agreement (EULA) associated
     * with this source code for terms and conditions that govern your use of
     * this software. Any use, reproduction, disclosure, or distribution of
     * this software and related documentation outside the terms of the EULA
     * is strictly prohibited.
     *
     */
    
    /**
     * Matrix multiplication: C = A * B.
     * Host code.
     *
     * This sample implements matrix multiplication as described in Chapter 3
     * of the programming guide.
     * It has been written for clarity of exposition to illustrate various CUDA
     * programming principles, not with the goal of providing the most
     * performant generic kernel for matrix multiplication.
     *
     * See also:
     * V. Volkov and J. Demmel, "Benchmarking GPUs to tune dense linear algebra,"
     * in Proc. 2008 ACM/IEEE Conf. on Supercomputing (SC '08),
     * Piscataway, NJ: IEEE Press, 2008, pp. Art. 31:1-11.
     */
    
    // System includes
    #include <stdio.h>
    #include <assert.h>
    
    // CUDA runtime
    #include <cuda_runtime.h>
    
    // Helper functions and utilities to work with CUDA
    #include <helper_functions.h>
    #include <helper_cuda.h>
    
    /**
     * Matrix multiplication (CUDA Kernel) on the device: C = A * B
     * wA is A's width and wB is B's width
     */
    template <int BLOCK_SIZE> __global__ void
    matrixMulCUDA(float *C, float *A, float *B, int wA, int wB)
    {
        // Block index
        int bx = blockIdx.x;
        int by = blockIdx.y;
    
        // Thread index
        int tx = threadIdx.x;
        int ty = threadIdx.y;
    
        // Index of the first sub-matrix of A processed by the block
        int aBegin = wA * BLOCK_SIZE * by;
    
        // Index of the last sub-matrix of A processed by the block
        int aEnd   = aBegin + wA - 1;
    
        // Step size used to iterate through the sub-matrices of A
        int aStep  = BLOCK_SIZE;
    
        // Index of the first sub-matrix of B processed by the block
        int bBegin = BLOCK_SIZE * bx;
    
        // Step size used to iterate through the sub-matrices of B
        int bStep  = BLOCK_SIZE * wB;
    
        // Csub is used to store the element of the block sub-matrix
        // that is computed by the thread
        float Csub = 0;
    
        // Loop over all the sub-matrices of A and B
        // required to compute the block sub-matrix
        for (int a = aBegin, b = bBegin;
             a <= aEnd;
             a += aStep, b += bStep)
        {
    
            // Declaration of the shared memory array As used to
            // store the sub-matrix of A
            __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
    
            // Declaration of the shared memory array Bs used to
            // store the sub-matrix of B
            __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
    
            // Load the matrices from device memory
            // to shared memory; each thread loads
            // one element of each matrix
            As[ty][tx] = A[a + wA * ty + tx];
            Bs[ty][tx] = B[b + wB * ty + tx];
    
            // Synchronize to make sure the matrices are loaded
            __syncthreads();
    
            // Multiply the two matrices together;
            // each thread computes one element
            // of the block sub-matrix
    #pragma unroll
    
            for (int k = 0; k < BLOCK_SIZE; ++k)
            {
                Csub += As[ty][k] * Bs[k][tx];
            }
    
            // Synchronize to make sure that the preceding
            // computation is done before loading two new
            // sub-matrices of A and B in the next iteration
            __syncthreads();
        }
    
        // Write the block sub-matrix to device memory;
        // each thread writes one element
        int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
        C[c + wB * ty + tx] = Csub;
    }
    
    void constantInit(float *data, int size, float val)
    {
        for (int i = 0; i < size; ++i)
        {
            data[i] = val;
        }
    }
    
    /**
     * Run a simple test of matrix multiplication using CUDA
     */
    int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, dim3 &dimsB)
    {
        // Allocate host memory for matrices A and B
        unsigned int size_A = dimsA.x * dimsA.y;
        unsigned int mem_size_A = sizeof(float) * size_A;
        float *h_A = (float *)malloc(mem_size_A);
        unsigned int size_B = dimsB.x * dimsB.y;
        unsigned int mem_size_B = sizeof(float) * size_B;
        float *h_B = (float *)malloc(mem_size_B);
    
        // Initialize host memory
        const float valB = 0.01f;
        constantInit(h_A, size_A, 1.0f);
        constantInit(h_B, size_B, valB);
    
        // Allocate device memory
        float *d_A, *d_B, *d_C;
    
        // Allocate host matrix C
        dim3 dimsC(dimsB.x, dimsA.y, 1);
        unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float);
        float *h_C = (float *) malloc(mem_size_C);
    
        if (h_C == NULL)
        {
            fprintf(stderr, "Failed to allocate host matrix C!\n");
            exit(EXIT_FAILURE);
        }
    
        cudaError_t error;
    
        error = cudaMalloc((void **) &d_A, mem_size_A);
    
        if (error != cudaSuccess)
        {
            printf("cudaMalloc d_A returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__);
            exit(EXIT_FAILURE);
        }
    
        error = cudaMalloc((void **) &d_B, mem_size_B);
    
        if (error != cudaSuccess)
        {
            printf("cudaMalloc d_B returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__);
            exit(EXIT_FAILURE);
        }
    
        error = cudaMalloc((void **) &d_C, mem_size_C);
    
        if (error != cudaSuccess)
        {
            printf("cudaMalloc d_C returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__);
            exit(EXIT_FAILURE);
        }
    
        // copy host memory to device
        error = cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice);
    
        if (error != cudaSuccess)
        {
            printf("cudaMemcpy (d_A,h_A) returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__);
            exit(EXIT_FAILURE);
        }
    
        error = cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice);
    
        if (error != cudaSuccess)
        {
            printf("cudaMemcpy (d_B,h_B) returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__);
            exit(EXIT_FAILURE);
        }
    
        // Setup execution parameters
        dim3 threads(block_size, block_size);
        dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y);
    
        // Create and start timer
        printf("Computing result using CUDA Kernel...\n");
    
        // Performs warmup operation using matrixMul CUDA kernel
        if (block_size == 16)
        {
            matrixMulCUDA<16><<< grid, threads >>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
        }
        else
        {
            matrixMulCUDA<32><<< grid, threads >>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
        }
    
        printf("done\n");
    
        cudaDeviceSynchronize();
    
        // Allocate CUDA events that we'll use for timing
        cudaEvent_t start;
        error = cudaEventCreate(&start);
    
        if (error != cudaSuccess)
        {
            fprintf(stderr, "Failed to create start event (error code %s)!\n", cudaGetErrorString(error));
            exit(EXIT_FAILURE);
        }
    
        cudaEvent_t stop;
        error = cudaEventCreate(&stop);
    
        if (error != cudaSuccess)
        {
            fprintf(stderr, "Failed to create stop event (error code %s)!\n", cudaGetErrorString(error));
            exit(EXIT_FAILURE);
        }
    
        // Record the start event
        error = cudaEventRecord(start, NULL);
    
        if (error != cudaSuccess)
        {
            fprintf(stderr, "Failed to record start event (error code %s)!\n", cudaGetErrorString(error));
            exit(EXIT_FAILURE);
        }
    
        // Execute the kernel
        int nIter = 300;
    
        for (int j = 0; j < nIter; j++)
        {
            if (block_size == 16)
            {
                matrixMulCUDA<16><<< grid, threads >>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
            }
            else
            {
                matrixMulCUDA<32><<< grid, threads >>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
            }
        }
    
        // Record the stop event
        error = cudaEventRecord(stop, NULL);
    
        if (error != cudaSuccess)
        {
            fprintf(stderr, "Failed to record stop event (error code %s)!\n", cudaGetErrorString(error));
            exit(EXIT_FAILURE);
        }
    
        // Wait for the stop event to complete
        error = cudaEventSynchronize(stop);
    
        if (error != cudaSuccess)
        {
            fprintf(stderr, "Failed to synchronize on the stop event (error code %s)!\n", cudaGetErrorString(error));
            exit(EXIT_FAILURE);
        }
    
        float msecTotal = 0.0f;
        error = cudaEventElapsedTime(&msecTotal, start, stop);
    
        if (error != cudaSuccess)
        {
            fprintf(stderr, "Failed to get time elapsed between events (error code %s)!\n", cudaGetErrorString(error));
            exit(EXIT_FAILURE);
        }
    
        // Compute and print the performance
        float msecPerMatrixMul = msecTotal / nIter;
        double flopsPerMatrixMul = 2.0 * (double)dimsA.x * (double)dimsA.y * (double)dimsB.x;
        double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
        printf(
            "Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops, WorkgroupSize= %u threads/block\n",
            gigaFlops,
            msecPerMatrixMul,
            flopsPerMatrixMul,
            threads.x * threads.y);
    
        // Copy result from device to host
        error = cudaMemcpy(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost);
    
        if (error != cudaSuccess)
        {
            printf("cudaMemcpy (h_C,d_C) returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__);
            exit(EXIT_FAILURE);
        }
    
        printf("Checking computed result for correctness: ");
        bool correct = true;
    
        // test relative error by the formula
        //     |<x, y>_cpu - <x,y>_gpu|/<|x|, |y|>  < eps
        double eps = 1.e-6 ; // machine zero
    
        for (int i = 0; i < (int)(dimsC.x * dimsC.y); i++)
        {
            double abs_err = fabs(h_C[i] - (dimsA.x * valB));
            double dot_length = dimsA.x;
            double abs_val = fabs(h_C[i]);
            double rel_err = abs_err/abs_val/dot_length ;
    
            if (rel_err > eps)
            {
                printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i, h_C[i], dimsA.x*valB, eps);
                correct = false;
            }
        }
    
        printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");
    
        // Clean up memory
        free(h_A);
        free(h_B);
        free(h_C);
        cudaFree(d_A);
        cudaFree(d_B);
        cudaFree(d_C);
    
        printf("\nNOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.\n");
    
        if (correct)
        {
            return EXIT_SUCCESS;
        }
        else
        {
            return EXIT_FAILURE;
        }
    }
    
    
    /**
     * Program main
     */
    int main(int argc, char **argv)
    {
        printf("[Matrix Multiply Using CUDA] - Starting...\n");
    
        if (checkCmdLineFlag(argc, (const char **)argv, "help") ||
            checkCmdLineFlag(argc, (const char **)argv, "?"))
        {
            printf("Usage -device=n (n >= 0 for deviceID)\n");
            printf("      -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n");
            printf("      -wB=WidthB -hB=HeightB (Width x Height of Matrix B)\n");
            printf("  Note: Outer matrix dimensions of A & B matrices must be equal.\n");
    
            exit(EXIT_SUCCESS);
        }
    
        // By default, we use device 0, otherwise we override the device ID based on what is provided at the command line
        int devID = 0;
    
        if (checkCmdLineFlag(argc, (const char **)argv, "device"))
        {
            devID = getCmdLineArgumentInt(argc, (const char **)argv, "device");
            cudaSetDevice(devID);
        }
    
        cudaError_t error;
        cudaDeviceProp deviceProp;
        error = cudaGetDevice(&devID);
    
        if (error != cudaSuccess)
        {
            printf("cudaGetDevice returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__);
        }
    
        error = cudaGetDeviceProperties(&deviceProp, devID);
    
        if (deviceProp.computeMode == cudaComputeModeProhibited)
        {
            fprintf(stderr, "Error: device is running in <Compute Mode Prohibited>, no threads can use ::cudaSetDevice().\n");
            exit(EXIT_SUCCESS);
        }
    
        if (error != cudaSuccess)
        {
            printf("cudaGetDeviceProperties returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__);
        }
        else
        {
            printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor);
        }
    
        int block_size = 32;
    
        dim3 dimsA(5*2*block_size, 5*2*block_size, 1);
        dim3 dimsB(5*4*block_size, 5*2*block_size, 1);
    
        // width of Matrix A
        if (checkCmdLineFlag(argc, (const char **)argv, "wA"))
        {
            dimsA.x = getCmdLineArgumentInt(argc, (const char **)argv, "wA");
        }
    
        // height of Matrix A
        if (checkCmdLineFlag(argc, (const char **)argv, "hA"))
        {
            dimsA.y = getCmdLineArgumentInt(argc, (const char **)argv, "hA");
        }
    
        // width of Matrix B
        if (checkCmdLineFlag(argc, (const char **)argv, "wB"))
        {
            dimsB.x = getCmdLineArgumentInt(argc, (const char **)argv, "wB");
        }
    
        // height of Matrix B
        if (checkCmdLineFlag(argc, (const char **)argv, "hB"))
        {
            dimsB.y = getCmdLineArgumentInt(argc, (const char **)argv, "hB");
        }
    
        if (dimsA.x != dimsB.y)
        {
            printf("Error: outer matrix dimensions must be equal. (%d != %d)\n",
                   dimsA.x, dimsB.y);
            exit(EXIT_FAILURE);
        }
    
        printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y, dimsB.x, dimsB.y);
    
        int matrix_result = matrixMultiply(argc, argv, block_size, dimsA, dimsB);
    
        getchar();
        exit(matrix_result);
    }
    
    

    matrixMul-nvrtv

    /**
     * Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
     *
     * Please refer to the NVIDIA end user license agreement (EULA) associated
     * with this source code for terms and conditions that govern your use of
     * this software. Any use, reproduction, disclosure, or distribution of
     * this software and related documentation outside the terms of the EULA
     * is strictly prohibited.
     *
     */
    
    /**
     * Matrix multiplication: C = A * B.
     * Host code.
     *
     * This sample implements matrix multiplication as described in Chapter 3
     * of the programming guide.
     * It has been written for clarity of exposition to illustrate various CUDA
     * programming principles, not with the goal of providing the most
     * performant generic kernel for matrix multiplication.
     *
     * See also:
     * V. Volkov and J. Demmel, "Benchmarking GPUs to tune dense linear algebra,"
     * in Proc. 2008 ACM/IEEE Conf. on Supercomputing (SC '08),
     * Piscataway, NJ: IEEE Press, 2008, pp. Art. 31:1-11.
     */
    
    /**
     * Matrix multiplication (CUDA Kernel) on the device: C = A * B
     * wA is A's width and wB is B's width
     */
     
     #include <cooperative_groups.h>
    
    
    template <int BLOCK_SIZE> __device__ void
    
    matrixMulCUDA(float *C, float *A, float *B, int wA, int wB)
    {
        // Handle to thread block group
        cooperative_groups::thread_block cta = cooperative_groups::this_thread_block();
        // Block index
        int bx = blockIdx.x;
        int by = blockIdx.y;
    
        // Thread index
        int tx = threadIdx.x;
        int ty = threadIdx.y;
        
        // Index of the first sub-matrix of A processed by the block
        int aBegin = wA * BLOCK_SIZE * by;
    
        // Index of the last sub-matrix of A processed by the block
        int aEnd   = aBegin + wA - 1;
    
        // Step size used to iterate through the sub-matrices of A
        int aStep  = BLOCK_SIZE;
    
        // Index of the first sub-matrix of B processed by the block
        int bBegin = BLOCK_SIZE * bx;
    
        // Step size used to iterate through the sub-matrices of B
        int bStep  = BLOCK_SIZE * wB;
    
        // Csub is used to store the element of the block sub-matrix
        // that is computed by the thread
        float Csub = 0;
    
        // Loop over all the sub-matrices of A and B
        // required to compute the block sub-matrix
        for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep)
        {
            // Declaration of the shared memory array As used to
            // store the sub-matrix of A
            __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
    
            // Declaration of the shared memory array Bs used to
            // store the sub-matrix of B
            __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
    
            // Load the matrices from device memory
            // to shared memory; each thread loads
            // one element of each matrix
            As[ty][tx] = A[a + wA * ty + tx];
            Bs[ty][tx] = B[b + wB * ty + tx];
    
            // Synchronize to make sure the matrices are loaded
           cooperative_groups::sync(cta);
    
    
            // Multiply the two matrices together;
            // each thread computes one element
            // of the block sub-matrix
    #pragma unroll
            for (int k = 0; k < BLOCK_SIZE; ++k)
            {
                Csub += As[ty][k] * Bs[k][tx];
            }
    
            // Synchronize to make sure that the preceding
            // computation is done before loading two new
            // sub-matrices of A and B in the next iteration
           cooperative_groups::sync(cta);
    
        }
    
        // Write the block sub-matrix to device memory;
        // each thread writes one element
        int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
        C[c + wB * ty + tx] = Csub;
    }
    
    extern "C" __global__ void  matrixMulCUDA_block16(float *C, float *A, float *B, int wA, int wB)
    {
        matrixMulCUDA<16>(C,A,B,wA,wB);
    }
    
    extern "C" __global__ void  matrixMulCUDA_block32(float *C, float *A, float *B, int wA, int wB)
    {
        matrixMulCUDA<32>(C,A,B,wA,wB);
    }
    
    
    
    /**
     * Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
     *
     * Please refer to the NVIDIA end user license agreement (EULA) associated
     * with this source code for terms and conditions that govern your use of
     * this software. Any use, reproduction, disclosure, or distribution of
     * this software and related documentation outside the terms of the EULA
     * is strictly prohibited.
     *
     */
    
    
    /**
     * Matrix multiplication: C = A * B.
     * Host code.
     *
     * This sample implements matrix multiplication as described in Chapter 3
     * of the programming guide.
     * It has been written for clarity of exposition to illustrate various CUDA
     * programming principles, not with the goal of providing the most
     * performant generic kernel for matrix multiplication.
     *
     * See also:
     * V. Volkov and J. Demmel, "Benchmarking GPUs to tune dense linear algebra,"
     * in Proc. 2008 ACM/IEEE Conf. on Supercomputing (SC '08),
     * Piscataway, NJ: IEEE Press, 2008, pp. Art. 31:1-11.
     */
    
    
    // System includes
    #include <stdio.h>
    #include <assert.h>
    
    // CUDA runtime
    #include <cuda_runtime.h>
    #include "nvrtc_helper.h"
    
    // Helper functions and utilities to work with CUDA
    #include <helper_functions.h>
    
    void constantInit(float *data, int size, float val)
    {
        for (int i = 0; i < size; ++i)
        {
            data[i] = val;
        }
    }
    
    
    /**
     * Run a simple test of matrix multiplication using CUDA
     */
    int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, dim3 &dimsB)
    {
        // Allocate host memory for matrices A and B
        unsigned int size_A = dimsA.x * dimsA.y;
        unsigned int mem_size_A = sizeof(float) * size_A;
        float *h_A = (float *)malloc(mem_size_A);
        unsigned int size_B = dimsB.x * dimsB.y;
        unsigned int mem_size_B = sizeof(float) * size_B;
        float *h_B = (float *)malloc(mem_size_B);
    
        // Initialize host memory
        const float valB = 0.01f;
        constantInit(h_A, size_A, 1.0f);
        constantInit(h_B, size_B, valB);
    
        // Allocate device memory
        CUdeviceptr d_A, d_B, d_C;
    
        char *ptx, *kernel_file;
        size_t ptxSize;
    
        kernel_file = sdkFindFilePath("matrixMul_kernel.cu", argv[0]);
        compileFileToPTX(kernel_file, argc, argv, &ptx, &ptxSize, 1);
    
        CUmodule module = loadPTX(ptx, argc, argv);
    
        // Allocate host matrix C
        dim3 dimsC(dimsB.x, dimsA.y, 1);
        unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float);
        float *h_C = (float *) malloc(mem_size_C);
    
        if (h_C == NULL)
        {
            fprintf(stderr, "Failed to allocate host matrix C!\n");
            exit(EXIT_FAILURE);
        }
    
        checkCudaErrors(cuMemAlloc(&d_A, mem_size_A));
        checkCudaErrors(cuMemAlloc(&d_B, mem_size_B));
        checkCudaErrors(cuMemAlloc(&d_C, mem_size_C));
    
        // copy host memory to device
        checkCudaErrors(cuMemcpyHtoD(d_A, h_A, mem_size_A));
        checkCudaErrors(cuMemcpyHtoD(d_B, h_B, mem_size_B));
    
        // Setup execution parameters
        dim3 threads(block_size, block_size);
        dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y);
    
        // Create and start timer
        printf("Computing result using CUDA Kernel...\n");
    
        CUfunction kernel_addr;
        if (block_size == 16)
        {
          checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "matrixMulCUDA_block16"));
        }
        else
        {
          checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "matrixMulCUDA_block32"));
        }
    
        void *arr[] = { (void *)&d_C, (void *)&d_A, (void *)&d_B, (void *)&dimsA.x, (void *)&dimsB.x };
    
        // Execute the kernel
        int nIter = 300;
    
        for (int j = 0; j < nIter; j++)
        {
            checkCudaErrors(cuLaunchKernel(kernel_addr,
                                                grid.x, grid.y, grid.z, /* grid dim */
                                                threads.x, threads.y, threads.z, /* block dim */
                                                0,0, /* shared mem, stream */
                                                &arr[0], /* arguments */
                                                0));
    
            checkCudaErrors(cuCtxSynchronize());
        }
    
        // Copy result from device to host
        checkCudaErrors(cuMemcpyDtoH(h_C, d_C, mem_size_C));
    
        printf("Checking computed result for correctness: ");
    
        bool correct = true;
    
        // test relative error by the formula
        //     |<x, y>_cpu - <x,y>_gpu|/<|x|, |y|>  < eps
    
        double eps = 1.e-6 ; // machine zero
    
        for (int i = 0; i < (int)(dimsC.x * dimsC.y); i++)
        {
            double abs_err = fabs(h_C[i] - (dimsA.x * valB));
            double dot_length = dimsA.x;
            double abs_val = fabs(h_C[i]);
            double rel_err = abs_err/abs_val/dot_length ;
    
            if (rel_err > eps)
            {
                printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i, h_C[i], dimsA.x*valB, eps);
                correct = false;
            }
        }
    
        printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");
    
        printf("\nNOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.\n");
    
        // Clean up memory
        free(h_A);
        free(h_B);
        free(h_C);
    
        checkCudaErrors(cuMemFree(d_A));
        checkCudaErrors(cuMemFree(d_B));
        checkCudaErrors(cuMemFree(d_C));
    
        if (correct)
        {
            return EXIT_SUCCESS;
        }
        else
        {
            return EXIT_FAILURE;
        }
    }
    
    /**
     * Program main
     */
    
    int main(int argc, char **argv)
    {
    
        printf("[Matrix Multiply Using CUDA] - Starting...\n");
    
        if (checkCmdLineFlag(argc, (const char **)argv, "help") ||
            checkCmdLineFlag(argc, (const char **)argv, "?"))
        {
    
            printf("Usage -device=n (n >= 0 for deviceID)\n");
            printf("      -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n");
            printf("      -wB=WidthB -hB=HeightB (Width x Height of Matrix B)\n");
            printf("  Note: Outer matrix dimensions of A & B matrices must be equal.\n");
    
            exit(EXIT_SUCCESS);
        }
    
        int block_size = 32;
    
        //original:
        dim3 dimsA(5*2*block_size, 5*2*block_size, 1);
        dim3 dimsB(5*4*block_size, 5*2*block_size, 1);
    
        // reduce sizes to avoid running out of memory
        //dim3 dimsA(32,32, 1);
        //dim3 dimsB(32,32,1);
    
        // width of Matrix A
        if (checkCmdLineFlag(argc, (const char **)argv, "wA"))
        {
            dimsA.x = getCmdLineArgumentInt(argc, (const char **)argv, "wA");
        }
    
        // height of Matrix A
        if (checkCmdLineFlag(argc, (const char **)argv, "hA"))
        {
            dimsA.y = getCmdLineArgumentInt(argc, (const char **)argv, "hA");
        }
    
        // width of Matrix B
        if (checkCmdLineFlag(argc, (const char **)argv, "wB"))
        {
            dimsB.x = getCmdLineArgumentInt(argc, (const char **)argv, "wB");
        }
    
        // height of Matrix B
        if (checkCmdLineFlag(argc, (const char **)argv, "hB"))
        {
            dimsB.y = getCmdLineArgumentInt(argc, (const char **)argv, "hB");
        }
    
        if (dimsA.x != dimsB.y)
        {
            printf("Error: outer matrix dimensions must be equal. (%d != %d)\n", dimsA.x, dimsB.y);
            exit(EXIT_FAILURE);
        }
    
        printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y, dimsB.x, dimsB.y);
    
        int matrix_result = matrixMultiply(argc, argv, block_size, dimsA, dimsB);
    
        exit(matrix_result);
    }
    
    
    

    matrixMul-CUBLAS

    ////////////////////////////////////////////////////////////////////////////
    //
    // Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
    //
    // Please refer to the NVIDIA end user license agreement (EULA) associated
    // with this source code for terms and conditions that govern your use of
    // this software. Any use, reproduction, disclosure, or distribution of
    // this software and related documentation outside the terms of the EULA
    // is strictly prohibited.
    //
    ////////////////////////////////////////////////////////////////////////////
    
    //
    // Matrix multiplication: C = A * B.
    // Host code.
    //
    // This sample implements matrix multiplication as described in Chapter 3
    // of the programming guide and uses the CUBLAS library to demonstrate
    // the best performance.
    
    // SOME PRECAUTIONS:
    // IF WE WANT TO CALCULATE ROW-MAJOR MATRIX MULTIPLY C = A * B,
    // WE JUST NEED CALL CUBLAS API IN A REVERSE ORDER: cublasSegemm(B, A)!
    // The reason is explained as follows:
    
    // CUBLAS library uses column-major storage, but C/C++ use row-major storage.
    // When passing the matrix pointer to CUBLAS, the memory layout alters from
    // row-major to column-major, which is equivalent to an implicit transpose.
    
    // In the case of row-major C/C++ matrix A, B, and a simple matrix multiplication
    // C = A * B, we can't use the input order like cublasSgemm(A, B)  because of
    // implicit transpose. The actual result of cublasSegemm(A, B) is A(T) * B(T).
    // If col(A(T)) != row(B(T)), equal to row(A) != col(B), A(T) and B(T) are not
    // multipliable. Moreover, even if A(T) and B(T) are multipliable, the result C
    // is a column-based cublas matrix, which means C(T) in C/C++, we need extra
    // transpose code to convert it to a row-based C/C++ matrix.
    
    // To solve the problem, let's consider our desired result C, a row-major matrix.
    // In cublas format, it is C(T) actually (because of the implicit transpose).
    // C = A * B, so C(T) = (A * B) (T) = B(T) * A(T). Cublas matrice B(T) and A(T)
    // happen to be C/C++ matrice B and A (still because of the implicit transpose)!
    // We don't need extra transpose code, we only need alter the input order!
    //
    // CUBLAS provides high-performance matrix multiplication.
    // See also:
    // V. Volkov and J. Demmel, "Benchmarking GPUs to tune dense linear algebra,"
    // in Proc. 2008 ACM/IEEE Conf. on Supercomputing (SC '08),
    // Piscataway, NJ: IEEE Press, 2008, pp. Art. 31:1-11.
    //
    
    // Utilities and system includes
    #include <assert.h>
    #include <helper_string.h>  // helper for shared functions common to CUDA Samples
    
    // CUDA runtime
    #include <cuda_runtime.h>
    #include <cublas_v2.h>
    
    // CUDA and CUBLAS functions
    #include <helper_functions.h>
    #include <helper_cuda.h>
    
    #ifndef min
    #define min(a,b) ((a < b) ? a : b)
    #endif
    #ifndef max
    #define max(a,b) ((a > b) ? a : b)
    #endif
    
    typedef struct _matrixSize      // Optional Command-line multiplier for matrix sizes
    {
        unsigned int uiWA, uiHA, uiWB, uiHB, uiWC, uiHC;
    } sMatrixSize;
    
    ////////////////////////////////////////////////////////////////////////////////
    //! Compute reference data set matrix multiply on CPU
    //! C = A * B
    //! @param C          reference data, computed but preallocated
    //! @param A          matrix A as provided to device
    //! @param B          matrix B as provided to device
    //! @param hA         height of matrix A
    //! @param wB         width of matrix B
    ////////////////////////////////////////////////////////////////////////////////
    void
    matrixMulCPU(float *C, const float *A, const float *B, unsigned int hA, unsigned int wA, unsigned int wB)
    {
        for (unsigned int i = 0; i < hA; ++i)
            for (unsigned int j = 0; j < wB; ++j)
            {
                double sum = 0;
    
                for (unsigned int k = 0; k < wA; ++k)
                {
                    double a = A[i * wA + k];
                    double b = B[k * wB + j];
                    sum += a * b;
                }
    
                C[i * wB + j] = (float)sum;
            }
    }
    
    // Allocates a matrix with random float entries.
    void randomInit(float *data, int size)
    {
        for (int i = 0; i < size; ++i)
            data[i] = rand() / (float)RAND_MAX;
    }
    
    void printDiff(float *data1, float *data2, int width, int height, int iListLength, float fListTol)
    {
        printf("Listing first %d Differences > %.6f...\n", iListLength, fListTol);
        int i,j,k;
        int error_count=0;
    
        for (j = 0; j < height; j++)
        {
            if (error_count < iListLength)
            {
                printf("\n  Row %d:\n", j);
            }
    
            for (i = 0; i < width; i++)
            {
                k = j * width + i;
                float fDiff = fabs(data1[k] - data2[k]);
    
                if (fDiff > fListTol)
                {
                    if (error_count < iListLength)
                    {
                        printf("    Loc(%d,%d)\tCPU=%.5f\tGPU=%.5f\tDiff=%.6f\n", i, j, data1[k], data2[k], fDiff);
                    }
    
                    error_count++;
                }
            }
        }
    
        printf(" \n  Total Errors = %d\n", error_count);
    }
    
    void initializeCUDA(int argc, char **argv, int &devID, int &iSizeMultiple, sMatrixSize &matrix_size)
    {
        // By default, we use device 0, otherwise we override the device ID based on what is provided at the command line
        cudaError_t error;
        devID = 0;
    
        if (checkCmdLineFlag(argc, (const char **)argv, "device"))
        {
            devID = getCmdLineArgumentInt(argc, (const char **)argv, "device");
            error = cudaSetDevice(devID);
    
            if (error != cudaSuccess)
            {
                printf("cudaSetDevice returned error code %d, line(%d)\n", error, __LINE__);
                exit(EXIT_FAILURE);
            }
        }
    
        // get number of SMs on this GPU
        error = cudaGetDevice(&devID);
    
        if (error != cudaSuccess)
        {
            printf("cudaGetDevice returned error code %d, line(%d)\n", error, __LINE__);
            exit(EXIT_FAILURE);
        }
    
    
        if (checkCmdLineFlag(argc, (const char **)argv, "sizemult"))
        {
            iSizeMultiple = getCmdLineArgumentInt(argc, (const char **)argv, "sizemult");
        }
    
        iSizeMultiple = min(iSizeMultiple, 10);
        iSizeMultiple = max(iSizeMultiple, 1);
    
        cudaDeviceProp deviceProp;
    
        error = cudaGetDeviceProperties(&deviceProp, devID);
    
        if (error != cudaSuccess)
        {
            printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", error, __LINE__);
            exit(EXIT_FAILURE);
        }
    
        printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor);
    
        int block_size = 32;
    
        matrix_size.uiWA = 3 * block_size * iSizeMultiple;
        matrix_size.uiHA = 4 * block_size * iSizeMultiple;
        matrix_size.uiWB = 2 * block_size * iSizeMultiple;
        matrix_size.uiHB = 3 * block_size * iSizeMultiple;
        matrix_size.uiWC = 2 * block_size * iSizeMultiple;
        matrix_size.uiHC = 4 * block_size * iSizeMultiple;
    
        printf("MatrixA(%u,%u), MatrixB(%u,%u), MatrixC(%u,%u)\n",
               matrix_size.uiHA, matrix_size.uiWA,
               matrix_size.uiHB, matrix_size.uiWB,
               matrix_size.uiHC, matrix_size.uiWC);
    
        if( matrix_size.uiWA != matrix_size.uiHB ||
            matrix_size.uiHA != matrix_size.uiHC ||
            matrix_size.uiWB != matrix_size.uiWC)
        {
           printf("ERROR: Matrix sizes do not match!\n");
           exit(-1);
        }
    }
    
    ////////////////////////////////////////////////////////////////////////////////
    //! Run a simple test matrix multiply using CUBLAS
    ////////////////////////////////////////////////////////////////////////////////
    int matrixMultiply(int argc, char **argv, int devID, sMatrixSize &matrix_size)
    {
        cudaDeviceProp deviceProp;
    
        checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));
    
        int block_size = 32;
    
        // set seed for rand()
        srand(2006);
    
        // allocate host memory for matrices A and B
        unsigned int size_A = matrix_size.uiWA * matrix_size.uiHA;
        unsigned int mem_size_A = sizeof(float) * size_A;
        float *h_A = (float *)malloc(mem_size_A);
        unsigned int size_B = matrix_size.uiWB * matrix_size.uiHB;
        unsigned int mem_size_B = sizeof(float) * size_B;
        float *h_B = (float *)malloc(mem_size_B);
    
        // set seed for rand()
        srand(2006);
    
        // initialize host memory
        randomInit(h_A, size_A);
        randomInit(h_B, size_B);
    
        // allocate device memory
        float *d_A, *d_B, *d_C;
        unsigned int size_C = matrix_size.uiWC * matrix_size.uiHC;
        unsigned int mem_size_C = sizeof(float) * size_C;
    
        // allocate host memory for the result
        float *h_C      = (float *) malloc(mem_size_C);
        float *h_CUBLAS = (float *) malloc(mem_size_C);
    
        checkCudaErrors(cudaMalloc((void **) &d_A, mem_size_A));
        checkCudaErrors(cudaMalloc((void **) &d_B, mem_size_B));
        checkCudaErrors(cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice));
        checkCudaErrors(cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice));
        checkCudaErrors(cudaMalloc((void **) &d_C, mem_size_C));
    
        // setup execution parameters
        dim3 threads(block_size, block_size);
        dim3 grid(matrix_size.uiWC / threads.x, matrix_size.uiHC / threads.y);
    
        // create and start timer
        printf("Computing result using CUBLAS...");
    
        // execute the kernel
        int nIter = 30;
    
        // CUBLAS version 2.0
        {
            const float alpha = 1.0f;
            const float beta  = 0.0f;
            cublasHandle_t handle;
            cudaEvent_t start, stop;
    
            checkCudaErrors(cublasCreate(&handle));
    
            //Perform warmup operation with cublas
            checkCudaErrors(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA, &alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWB));
    
            // Allocate CUDA events that we'll use for timing
            checkCudaErrors(cudaEventCreate(&start));
            checkCudaErrors(cudaEventCreate(&stop));
    
            // Record the start event
            checkCudaErrors(cudaEventRecord(start, NULL));
    
            for (int j = 0; j < nIter; j++)
            {
                //note cublas is column primary!
                //need to transpose the order
                checkCudaErrors(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA, &alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWB));
    
            }
    
            printf("done.\n");
    
            // Record the stop event
            checkCudaErrors(cudaEventRecord(stop, NULL));
    
            // Wait for the stop event to complete
            checkCudaErrors(cudaEventSynchronize(stop));
    
            float msecTotal = 0.0f;
            checkCudaErrors(cudaEventElapsedTime(&msecTotal, start, stop));
    
            // Compute and print the performance
            float msecPerMatrixMul = msecTotal / nIter;
            double flopsPerMatrixMul = 2.0 * (double)matrix_size.uiHC * (double)matrix_size.uiWC * (double)matrix_size.uiHB;
            double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
            printf(
                "Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops\n",
                gigaFlops,
                msecPerMatrixMul,
                flopsPerMatrixMul);
    
            // copy result from device to host
            checkCudaErrors(cudaMemcpy(h_CUBLAS, d_C, mem_size_C, cudaMemcpyDeviceToHost));
    
            // Destroy the handle
            checkCudaErrors(cublasDestroy(handle));
        }
    
        // compute reference solution
        printf("Computing result using host CPU...");
        float *reference = (float *)malloc(mem_size_C);
        matrixMulCPU(reference, h_A, h_B, matrix_size.uiHA, matrix_size.uiWA, matrix_size.uiWB);
        printf("done.\n");
    
        // check result (CUBLAS)
        bool resCUBLAS = sdkCompareL2fe(reference, h_CUBLAS, size_C, 1.0e-6f);
    
        if (resCUBLAS != true)
        {
            printDiff(reference, h_CUBLAS, matrix_size.uiWC, matrix_size.uiHC, 100, 1.0e-5f);
        }
    
        printf("Comparing CUBLAS Matrix Multiply with CPU results: %s\n", (true == resCUBLAS) ? "PASS" : "FAIL");
    
        printf("\nNOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.\n");
    
        // clean up memory
        free(h_A);
        free(h_B);
        free(h_C);
        free(reference);
        checkCudaErrors(cudaFree(d_A));
        checkCudaErrors(cudaFree(d_B));
        checkCudaErrors(cudaFree(d_C));
    
        if (resCUBLAS == true)
        {
            return EXIT_SUCCESS;    // return value = 1
        }
        else
        {
            return EXIT_FAILURE;     // return value = 0
        }
    }
    
    ////////////////////////////////////////////////////////////////////////////////
    // Program main
    ////////////////////////////////////////////////////////////////////////////////
    int main(int argc, char **argv)
    {
        printf("[Matrix Multiply CUBLAS] - Starting...\n");
    
        int devID = 0, sizeMult = 5;
        sMatrixSize matrix_size;
    
        initializeCUDA(argc, argv, devID, sizeMult, matrix_size);
    
        int matrix_result = matrixMultiply(argc, argv, devID, matrix_size);
    
        return matrix_result;
    }
    
    

    相关文章

      网友评论

          本文标题:CUDA Samples-matMul

          本文链接:https://www.haomeiwen.com/subject/csgiqqtx.html