    为了快速了解GPU以及入门CUDA编程,很有必要对GPU硬件有一个基本的认识, 下面通过cuda_runtime提供的API进行GPU设备查询以及相关硬件参数的了解.


    • OS: Ubuntu 20.04
    • CUDA: v11
    • GCC-10
    • VSCode



    • cudaGetDeviceCount()
    • cudaGetDeviceProperties()
    #include <iostream>
    #include <cuda_runtime.h>
    #include <stdio.h>
    /*
     * CHECK(call): evaluates a CUDA runtime call exactly once and, when the
     * returned cudaError_t is not cudaSuccess, prints the source file, the line,
     * the numeric error code and the text from cudaGetErrorString() to stdout.
     * Execution continues after the report; the macro does not abort or return.
     *
     * The argument is parenthesized and the temporary uses a name unlikely to
     * clash with caller variables, so expressions such as CHECK(foo(ret)) work.
     * The do { ... } while (0) wrapper lets the macro be used as one statement.
     */
    #define CHECK(call)                                                              \
        do                                                                           \
        {                                                                            \
            const cudaError_t check_status_ = (call);                                \
            if (check_status_ != cudaSuccess)                                        \
            {                                                                        \
                printf("        CUDA Error\n");                                      \
                printf("        File:%s\n", __FILE__);                               \
                printf("        Line:%d\n", __LINE__);                               \
                printf("        Error code:%d\n", (int)check_status_);               \
                printf("        Error text:%s\n", cudaGetErrorString(check_status_)); \
            }                                                                        \
        } while (0)
    /**
     * Lists every CUDA device visible to the runtime and prints, per device,
     * its name, global memory size in GB, memory bus width, SM count and the
     * maximum number of threads per SM.
     *
     * Returns 0 in all cases; failures of the runtime calls are reported by CHECK.
     */
    int main()
    {
        int device_count = 0;

        // Ask the runtime how many devices exist. Without this call the counter
        // stays 0 and the "no GPU" branch is always taken.
        const cudaError_t count_status = cudaGetDeviceCount(&device_count);
        CHECK(count_status);
        if (count_status != cudaSuccess) {
            // Do not trust the counter after a failed query.
            device_count = 0;
        }

        if(device_count > 0) {
            std::cout << "Found " << device_count << " GPUs!" << std::endl;
            for(int id=0;id<device_count;id++) {
                std::cout << "Device: " << id << std::endl;
                cudaDeviceProp prop;
                const cudaError_t prop_status = cudaGetDeviceProperties(&prop, id);
                CHECK(prop_status);
                if (prop_status != cudaSuccess) {
                    // prop was not filled in; skip printing garbage for this device.
                    continue;
                }
                printf("GPU Name: %s\n", prop.name);
                // totalGlobalMem is in bytes; divide by 2^30 to report GB.
                printf("GPU Global Memory(显存容量): %f GB\n", (float)prop.totalGlobalMem/(1024*1024*1024));
                printf("GPU Memory 位宽:%d bit\n", prop.memoryBusWidth);
                printf("GPU SM个数:%d\n", prop.multiProcessorCount);
                printf("GPU 每个SM上最大线程数量:%d\n", prop.maxThreadsPerMultiProcessor);
            }
        }else {
            std::cout << "No NVIDIA GPU Exist !" << std::endl;
        }

        return 0;
    }



    CUDA提供的GPU 属性介绍:

    /**
     * Device property record filled in by cudaGetDeviceProperties().
     * The fields below are quoted as a listing from the CUDA runtime headers;
     * the exact member set may differ between toolkit versions.
     */
    struct __device_builtin__ cudaDeviceProp
    {
        char         name[256];                  /**< ASCII string identifying device */
        cudaUUID_t   uuid;                       /**< 16-byte unique identifier */
        char         luid[8];                    /**< 8-byte locally unique identifier. Value is undefined on TCC and non-Windows platforms */
        unsigned int luidDeviceNodeMask;         /**< LUID device node mask. Value is undefined on TCC and non-Windows platforms */
        size_t       totalGlobalMem;             /**< Global memory available on device in bytes */
        size_t       sharedMemPerBlock;          /**< Shared memory available per block in bytes */
        int          regsPerBlock;               /**< 32-bit registers available per block */
        int          warpSize;                   /**< Warp size in threads */
        size_t       memPitch;                   /**< Maximum pitch in bytes allowed by memory copies */
        int          maxThreadsPerBlock;         /**< Maximum number of threads per block */
        int          maxThreadsDim[3];           /**< Maximum size of each dimension of a block */
        int          maxGridSize[3];             /**< Maximum size of each dimension of a grid */
        int          clockRate;                  /**< Clock frequency in kilohertz */
        size_t       totalConstMem;              /**< Constant memory available on device in bytes */
        int          major;                      /**< Major compute capability */
        int          minor;                      /**< Minor compute capability */
        size_t       textureAlignment;           /**< Alignment requirement for textures */
        size_t       texturePitchAlignment;      /**< Pitch alignment requirement for texture references bound to pitched memory */
        int          deviceOverlap;              /**< Device can concurrently copy memory and execute a kernel. Deprecated. Use instead asyncEngineCount. */
        int          multiProcessorCount;        /**< Number of multiprocessors on device */
        int          kernelExecTimeoutEnabled;   /**< Specified whether there is a run time limit on kernels */
        int          integrated;                 /**< Device is integrated as opposed to discrete */
        int          canMapHostMemory;           /**< Device can map host memory with cudaHostAlloc/cudaHostGetDevicePointer */
        int          computeMode;                /**< Compute mode (See ::cudaComputeMode) */
        int          maxTexture1D;               /**< Maximum 1D texture size */
        int          maxTexture1DMipmap;         /**< Maximum 1D mipmapped texture size */
        int          maxTexture1DLinear;         /**< Maximum size for 1D textures bound to linear memory */
        int          maxTexture2D[2];            /**< Maximum 2D texture dimensions */
        int          maxTexture2DMipmap[2];      /**< Maximum 2D mipmapped texture dimensions */
        int          maxTexture2DLinear[3];      /**< Maximum dimensions (width, height, pitch) for 2D textures bound to pitched memory */
        int          maxTexture2DGather[2];      /**< Maximum 2D texture dimensions if texture gather operations have to be performed */
        int          maxTexture3D[3];            /**< Maximum 3D texture dimensions */
        int          maxTexture3DAlt[3];         /**< Maximum alternate 3D texture dimensions */
        int          maxTextureCubemap;          /**< Maximum Cubemap texture dimensions */
        int          maxTexture1DLayered[2];     /**< Maximum 1D layered texture dimensions */
        int          maxTexture2DLayered[3];     /**< Maximum 2D layered texture dimensions */
        int          maxTextureCubemapLayered[2];/**< Maximum Cubemap layered texture dimensions */
        int          maxSurface1D;               /**< Maximum 1D surface size */
        int          maxSurface2D[2];            /**< Maximum 2D surface dimensions */
        int          maxSurface3D[3];            /**< Maximum 3D surface dimensions */
        int          maxSurface1DLayered[2];     /**< Maximum 1D layered surface dimensions */
        int          maxSurface2DLayered[3];     /**< Maximum 2D layered surface dimensions */
        int          maxSurfaceCubemap;          /**< Maximum Cubemap surface dimensions */
        int          maxSurfaceCubemapLayered[2];/**< Maximum Cubemap layered surface dimensions */
        size_t       surfaceAlignment;           /**< Alignment requirements for surfaces */
        int          concurrentKernels;          /**< Device can possibly execute multiple kernels concurrently */
        int          ECCEnabled;                 /**< Device has ECC support enabled */
        int          pciBusID;                   /**< PCI bus ID of the device */
        int          pciDeviceID;                /**< PCI device ID of the device */
        int          pciDomainID;                /**< PCI domain ID of the device */
        int          tccDriver;                  /**< 1 if device is a Tesla device using TCC driver, 0 otherwise */
        int          asyncEngineCount;           /**< Number of asynchronous engines */
        int          unifiedAddressing;          /**< Device shares a unified address space with the host */
        int          memoryClockRate;            /**< Peak memory clock frequency in kilohertz */
        int          memoryBusWidth;             /**< Global memory bus width in bits */
        int          l2CacheSize;                /**< Size of L2 cache in bytes */
        int          persistingL2CacheMaxSize;   /**< Device's maximum l2 persisting lines capacity setting in bytes */
        int          maxThreadsPerMultiProcessor;/**< Maximum resident threads per multiprocessor */
        int          streamPrioritiesSupported;  /**< Device supports stream priorities */
        int          globalL1CacheSupported;     /**< Device supports caching globals in L1 */
        int          localL1CacheSupported;      /**< Device supports caching locals in L1 */
        size_t       sharedMemPerMultiprocessor; /**< Shared memory available per multiprocessor in bytes */
        int          regsPerMultiprocessor;      /**< 32-bit registers available per multiprocessor */
        int          managedMemory;              /**< Device supports allocating managed memory on this system */
        int          isMultiGpuBoard;            /**< Device is on a multi-GPU board */
        int          multiGpuBoardGroupID;       /**< Unique identifier for a group of devices on the same multi-GPU board */
        int          hostNativeAtomicSupported;  /**< Link between the device and the host supports native atomic operations */
        int          singleToDoublePrecisionPerfRatio; /**< Ratio of single precision performance (in floating-point operations per second) to double precision performance */
        int          pageableMemoryAccess;       /**< Device supports coherently accessing pageable memory without calling cudaHostRegister on it */
        int          concurrentManagedAccess;    /**< Device can coherently access managed memory concurrently with the CPU */
        int          computePreemptionSupported; /**< Device supports Compute Preemption */
        int          canUseHostPointerForRegisteredMem; /**< Device can access host registered memory at the same virtual address as the CPU */
        int          cooperativeLaunch;          /**< Device supports launching cooperative kernels via ::cudaLaunchCooperativeKernel */
        int          cooperativeMultiDeviceLaunch; /**< Device can participate in cooperative kernels launched via ::cudaLaunchCooperativeKernelMultiDevice */
        size_t       sharedMemPerBlockOptin;     /**< Per device maximum shared memory per block usable by special opt in */
        int          pageableMemoryAccessUsesHostPageTables; /**< Device accesses pageable memory via the host's page tables */
        int          directManagedMemAccessFromHost; /**< Host can directly access managed memory on the device without migration. */
        int          maxBlocksPerMultiProcessor; /**< Maximum number of resident blocks per multiprocessor */
        int          accessPolicyMaxWindowSize;  /**< The maximum value of ::cudaAccessPolicyWindow::num_bytes. */
        size_t       reservedSharedMemPerBlock;  /**< Shared memory reserved by CUDA driver per block in bytes */
    };

    deviceQuery App

    在安装好CUDA之后, CUDA提供了一些sample, 其中包含可执行的deviceQuery app, 用于查看本机上的GPU硬件. sample的位置: /usr/local/cuda/samples

    ├── 0_Simple
    ├── 1_Utilities
    ├── 2_Graphics
    ├── 3_Imaging
    ├── 4_Finance
    ├── 5_Simulations
    ├── 6_Advanced
    ├── 7_CUDALibraries
    ├── bin
    ├── common
    ├── EULA.txt
    └── Makefile

    samples目录包含了若干例子, 其根目录提供了Makefile, 因此只需在根目录执行make即可生成所有app:
    # Run from the samples root directory; -j8 lets make run up to 8 jobs in parallel.
    make -j8

    查找 deviceQuery 可执行程序:

    # Enter the output directory for the x86_64 Linux release build
    # (the original path "x84_x64" was a typo and does not exist).
    cd bin/x86_64/linux/release
    # Confirm that the deviceQuery binary was produced.
    ls | grep deviceQuery
    # Run
    ./deviceQuery


    ./deviceQuery Starting...
     CUDA Device Query (Runtime API) version (CUDART static linking)
    Detected 1 CUDA Capable device(s)
    Device 0: "NVIDIA GeForce MX250"
      CUDA Driver Version / Runtime Version          11.5 / 11.1
      CUDA Capability Major/Minor version number:    6.1
      Total amount of global memory:                 2003 MBytes (2099904512 bytes)
      ( 3) Multiprocessors, (128) CUDA Cores/MP:     384 CUDA Cores
      GPU Max Clock rate:                            1582 MHz (1.58 GHz)
      Memory Clock rate:                             3004 Mhz
      Memory Bus Width:                              64-bit
      L2 Cache Size:                                 524288 bytes
      Maximum Texture Dimension Size (x,y,z)         1D=(131072), 2D=(131072, 65536), 3D=(16384, 16384, 16384)
      Maximum Layered 1D Texture Size, (num) layers  1D=(32768), 2048 layers
      Maximum Layered 2D Texture Size, (num) layers  2D=(32768, 32768), 2048 layers
      Total amount of constant memory:               65536 bytes
      Total amount of shared memory per block:       49152 bytes
      Total shared memory per multiprocessor:        98304 bytes
      Total number of registers available per block: 65536
      Warp size:                                     32
      Maximum number of threads per multiprocessor:  2048
      Maximum number of threads per block:           1024
      Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
      Max dimension size of a grid size    (x,y,z): (2147483647, 65535, 65535)
      Maximum memory pitch:                          2147483647 bytes
      Texture alignment:                             512 bytes
      Concurrent copy and kernel execution:          Yes with 2 copy engine(s)
      Run time limit on kernels:                     Yes
      Integrated GPU sharing Host Memory:            No
      Support host page-locked memory mapping:       Yes
      Alignment requirement for Surfaces:            Yes
      Device has ECC support:                        Disabled
      Device supports Unified Addressing (UVA):      Yes
      Device supports Managed Memory:                Yes
      Device supports Compute Preemption:            Yes
      Supports Cooperative Kernel Launch:            Yes
      Supports MultiDevice Co-op Kernel Launch:      Yes
      Device PCI Domain ID / Bus ID / location ID:   0 / 1 / 0
      Compute Mode:
         < Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >
    deviceQuery, CUDA Driver = CUDART, CUDA Driver Version = 11.5, CUDA Runtime Version = 11.1, NumDevs = 1
    Result = PASS

    bandwidthTest App

    用途: 测试GPU显存的传输带宽

    [CUDA Bandwidth Test] - Starting...
    Running on...
     Device 0: NVIDIA GeForce MX250
     Quick Mode
     Host to Device Bandwidth, 1 Device(s)
     PINNED Memory Transfers
       Transfer Size (Bytes)    Bandwidth(GB/s)
       32000000         3.1
     Device to Host Bandwidth, 1 Device(s)
     PINNED Memory Transfers
       Transfer Size (Bytes)    Bandwidth(GB/s)
       32000000         3.3
     Device to Device Bandwidth, 1 Device(s)
     PINNED Memory Transfers
       Transfer Size (Bytes)    Bandwidth(GB/s)
       32000000         41.2
    Result = PASS
    NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.
    • Device To Host bandwidth 和 Host to Device bandwidth基本上都是 3GB/s, GPU和CPU的通信是通过PCIe总线, PCIe的带宽是CPU-GPU的通信瓶颈.
    • Device To Device bandwidth: 41GB/s, GPU显存自身的带宽比PCIe通信的带宽大得多, 因此大多数GPU程序都是提前把输入数据准备好, 一次性复制到GPU显存, 这样可以减少CPU-GPU通信带来的访存overhead.



