device query

Author: jianpengma | Published 2018-11-18 19:48

Below: building and running the deviceQuery sample from the CUDA 9.1 samples tree, followed by the sample's full source (deviceQuery.cpp).

    ~/samples/NVIDIA_CUDA-9.1_Samples/1_Utilities/deviceQuery$ make

    /usr/local/cuda-9.1/bin/nvcc -ccbin g++ -I../../common/inc  -m64    -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_70,code=compute_70 -o deviceQuery.o -c deviceQuery.cpp

    /usr/local/cuda-9.1/bin/nvcc -ccbin g++  -m64      -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_70,code=compute_70 -o deviceQuery deviceQuery.o

    mkdir -p ../../bin/x86_64/linux/release

    cp deviceQuery ../../bin/x86_64/linux/release
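    # Note: the samples' common Makefile also accepts an SMS override (an assumption about
    # that Makefile, not shown here) to narrow the -gencode list to the installed GPU, e.g.:
    # ~/samples/NVIDIA_CUDA-9.1_Samples/1_Utilities/deviceQuery$ make SMS="61"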

    ~/samples/NVIDIA_CUDA-9.1_Samples/1_Utilities/deviceQuery$ ls

    deviceQuery  deviceQuery.cpp  deviceQuery.o  Makefile  NsightEclipse.xml  readme.txt

    ~/samples/NVIDIA_CUDA-9.1_Samples/1_Utilities/deviceQuery$ ./deviceQuery

    ./deviceQuery Starting...

    CUDA Device Query (Runtime API) version (CUDART static linking)

    Detected 1 CUDA Capable device(s)

    Device 0: "GeForce GTX 1050"

      CUDA Driver Version / Runtime Version          10.1 / 9.1

      CUDA Capability Major/Minor version number:    6.1

      Total amount of global memory:                1998 MBytes (2095382528 bytes)

      ( 5) Multiprocessors, (128) CUDA Cores/MP:    640 CUDA Cores

      GPU Max Clock rate:                            1455 MHz (1.46 GHz)

      Memory Clock rate:                            3504 Mhz

      Memory Bus Width:                              128-bit

      L2 Cache Size:                                1048576 bytes

      Maximum Texture Dimension Size (x,y,z)        1D=(131072), 2D=(131072, 65536), 3D=(16384, 16384, 16384)

      Maximum Layered 1D Texture Size, (num) layers  1D=(32768), 2048 layers

      Maximum Layered 2D Texture Size, (num) layers  2D=(32768, 32768), 2048 layers

      Total amount of constant memory:              65536 bytes

      Total amount of shared memory per block:      49152 bytes

      Total number of registers available per block: 65536

      Warp size:                                    32

      Maximum number of threads per multiprocessor:  2048

      Maximum number of threads per block:          1024

      Max dimension size of a thread block (x,y,z): (1024, 1024, 64)

      Max dimension size of a grid size    (x,y,z): (2147483647, 65535, 65535)

      Maximum memory pitch:                          2147483647 bytes

      Texture alignment:                            512 bytes

      Concurrent copy and kernel execution:          Yes with 2 copy engine(s)

      Run time limit on kernels:                    No

      Integrated GPU sharing Host Memory:            No

      Support host page-locked memory mapping:      Yes

      Alignment requirement for Surfaces:            Yes

      Device has ECC support:                        Disabled

      Device supports Unified Addressing (UVA):      Yes

      Supports Cooperative Kernel Launch:            Yes

      Supports MultiDevice Co-op Kernel Launch:      Yes

      Device PCI Domain ID / Bus ID / location ID:  0 / 2 / 0

      Compute Mode:

        < Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >

    deviceQuery, CUDA Driver = CUDART, CUDA Driver Version = 10.1, CUDA Runtime Version = 9.1, NumDevs = 1

    Result = PASS

    ~/samples/NVIDIA_CUDA-9.1_Samples/1_Utilities/deviceQuery$ cat deviceQuery.cpp

    /*

    * Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.

    *

    * Please refer to the NVIDIA end user license agreement (EULA) associated

    * with this source code for terms and conditions that govern your use of

    * this software. Any use, reproduction, disclosure, or distribution of

    * this software and related documentation outside the terms of the EULA

    * is strictly prohibited.

    *

    */

    /* This sample queries the properties of the CUDA devices present in the system via CUDA Runtime API. */

    // Shared Utilities (QA Testing)

    // std::system includes

    #include <memory>

    #include <iostream>

    #include <cuda_runtime.h>
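    // helper_cuda.h comes from the samples' common/inc directory (hence the -I../../common/inc
    // flag in the build line above); it provides checkCudaErrors() and _ConvertSMVer2Cores().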

    #include <helper_cuda.h>

    int *pArgc = NULL;

    char **pArgv = NULL;

    #if CUDART_VERSION < 5000

    // CUDA-C includes

    #include <cuda.h>

    // This function wraps the CUDA Driver API into a template function

    template <class T>

    inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, int device)

    {

        CUresult error =    cuDeviceGetAttribute(attribute, device_attribute, device);

        if (CUDA_SUCCESS != error)

        {

            fprintf(stderr, "cuSafeCallNoSync() Driver API error = %04d from file <%s>, line %i.\n",

                    error, __FILE__, __LINE__);

            exit(EXIT_FAILURE);

        }

    }

    #endif /* CUDART_VERSION < 5000 */

    ////////////////////////////////////////////////////////////////////////////////

    // Program main

    ////////////////////////////////////////////////////////////////////////////////

    int

    main(int argc, char **argv)

    {

        pArgc = &argc;

        pArgv = argv;

        printf("%s Starting...\n\n", argv[0]);

        printf(" CUDA Device Query (Runtime API) version (CUDART static linking)\n\n");

        int deviceCount = 0;

        cudaError_t error_id = cudaGetDeviceCount(&deviceCount);

        if (error_id != cudaSuccess)

        {

            printf("cudaGetDeviceCount returned %d\n-> %s\n", (int)error_id, cudaGetErrorString(error_id));

            printf("Result = FAIL\n");

            exit(EXIT_FAILURE);

        }

        // This function call returns 0 if there are no CUDA capable devices.

        if (deviceCount == 0)

        {

            printf("There are no available device(s) that support CUDA\n");

        }

        else

        {

            printf("Detected %d CUDA Capable device(s)\n", deviceCount);

        }

        int dev, driverVersion = 0, runtimeVersion = 0;

        for (dev = 0; dev < deviceCount; ++dev)

        {

            cudaSetDevice(dev);

            cudaDeviceProp deviceProp;

            cudaGetDeviceProperties(&deviceProp, dev);

            printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name);

            // Console log

            cudaDriverGetVersion(&driverVersion);

            cudaRuntimeGetVersion(&runtimeVersion);
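        // Both calls encode the version as 1000*major + 10*minor, e.g. 10010 -> 10.1 and 9010 -> 9.1,
        // which is what the arithmetic in the next printf decodes.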

            printf("  CUDA Driver Version / Runtime Version          %d.%d / %d.%d\n", driverVersion/1000, (driverVersion%100)/10, runtimeVersion/1000, (runtimeVersion%100)/10);

            printf("  CUDA Capability Major/Minor version number:    %d.%d\n", deviceProp.major, deviceProp.minor);

            char msg[256];

            SPRINTF(msg, "  Total amount of global memory:                %.0f MBytes (%llu bytes)\n",

                    (float)deviceProp.totalGlobalMem/1048576.0f, (unsigned long long) deviceProp.totalGlobalMem);

            printf("%s", msg);

            printf("  (%2d) Multiprocessors, (%3d) CUDA Cores/MP:    %d CUDA Cores\n",

                  deviceProp.multiProcessorCount,

                  _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),

                  _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount);

            printf("  GPU Max Clock rate:                            %.0f MHz (%0.2f GHz)\n", deviceProp.clockRate * 1e-3f, deviceProp.clockRate * 1e-6f);

    #if CUDART_VERSION >= 5000

            // This is supported in CUDA 5.0 (runtime API device properties)

            printf("  Memory Clock rate:                            %.0f Mhz\n", deviceProp.memoryClockRate * 1e-3f);

            printf("  Memory Bus Width:                              %d-bit\n",  deviceProp.memoryBusWidth);

            if (deviceProp.l2CacheSize)

            {

                printf("  L2 Cache Size:                                %d bytes\n", deviceProp.l2CacheSize);

            }

    #else

            // This only available in CUDA 4.0-4.2 (but these were only exposed in the CUDA Driver API)

            int memoryClock;

            getCudaAttribute<int>(&memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev);

            printf("  Memory Clock rate:                            %.0f Mhz\n", memoryClock * 1e-3f);

            int memBusWidth;

            getCudaAttribute<int>(&memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev);

            printf("  Memory Bus Width:                              %d-bit\n", memBusWidth);

            int L2CacheSize;

            getCudaAttribute<int>(&L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev);

            if (L2CacheSize)

            {

                printf("  L2 Cache Size:                                %d bytes\n", L2CacheSize);

            }

    #endif

            printf("  Maximum Texture Dimension Size (x,y,z)        1D=(%d), 2D=(%d, %d), 3D=(%d, %d, %d)\n",

                  deviceProp.maxTexture1D  , deviceProp.maxTexture2D[0], deviceProp.maxTexture2D[1],

                  deviceProp.maxTexture3D[0], deviceProp.maxTexture3D[1], deviceProp.maxTexture3D[2]);

            printf("  Maximum Layered 1D Texture Size, (num) layers  1D=(%d), %d layers\n",

                  deviceProp.maxTexture1DLayered[0], deviceProp.maxTexture1DLayered[1]);

            printf("  Maximum Layered 2D Texture Size, (num) layers  2D=(%d, %d), %d layers\n",

                  deviceProp.maxTexture2DLayered[0], deviceProp.maxTexture2DLayered[1], deviceProp.maxTexture2DLayered[2]);

            printf("  Total amount of constant memory:              %lu bytes\n", deviceProp.totalConstMem);

            printf("  Total amount of shared memory per block:      %lu bytes\n", deviceProp.sharedMemPerBlock);

            printf("  Total number of registers available per block: %d\n", deviceProp.regsPerBlock);

            printf("  Warp size:                                    %d\n", deviceProp.warpSize);

            printf("  Maximum number of threads per multiprocessor:  %d\n", deviceProp.maxThreadsPerMultiProcessor);

            printf("  Maximum number of threads per block:          %d\n", deviceProp.maxThreadsPerBlock);

            printf("  Max dimension size of a thread block (x,y,z): (%d, %d, %d)\n",

                  deviceProp.maxThreadsDim[0],

                  deviceProp.maxThreadsDim[1],

                  deviceProp.maxThreadsDim[2]);

            printf("  Max dimension size of a grid size    (x,y,z): (%d, %d, %d)\n",

                  deviceProp.maxGridSize[0],

                  deviceProp.maxGridSize[1],

                  deviceProp.maxGridSize[2]);

            printf("  Maximum memory pitch:                          %lu bytes\n", deviceProp.memPitch);

            printf("  Texture alignment:                            %lu bytes\n", deviceProp.textureAlignment);

            printf("  Concurrent copy and kernel execution:          %s with %d copy engine(s)\n", (deviceProp.deviceOverlap ? "Yes" : "No"), deviceProp.asyncEngineCount);

            printf("  Run time limit on kernels:                    %s\n", deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No");

            printf("  Integrated GPU sharing Host Memory:            %s\n", deviceProp.integrated ? "Yes" : "No");

            printf("  Support host page-locked memory mapping:      %s\n", deviceProp.canMapHostMemory ? "Yes" : "No");

            printf("  Alignment requirement for Surfaces:            %s\n", deviceProp.surfaceAlignment ? "Yes" : "No");

            printf("  Device has ECC support:                        %s\n", deviceProp.ECCEnabled ? "Enabled" : "Disabled");

    #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)

            printf("  CUDA Device Driver Mode (TCC or WDDM):        %s\n", deviceProp.tccDriver ? "TCC (Tesla Compute Cluster Driver)" : "WDDM (Windows Display Driver Model)");

    #endif

            printf("  Device supports Unified Addressing (UVA):      %s\n", deviceProp.unifiedAddressing ? "Yes" : "No");

            printf("  Supports Cooperative Kernel Launch:            %s\n", deviceProp.cooperativeLaunch ? "Yes" : "No");

            printf("  Supports MultiDevice Co-op Kernel Launch:      %s\n", deviceProp.cooperativeMultiDeviceLaunch ? "Yes" : "No");

            printf("  Device PCI Domain ID / Bus ID / location ID:  %d / %d / %d\n", deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID);

            const char *sComputeMode[] =

            {

                "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)",

                "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)",

                "Prohibited (no host thread can use ::cudaSetDevice() with this device)",

                "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)",

                "Unknown",

                NULL

            };

            printf("  Compute Mode:\n");

            printf("    < %s >\n", sComputeMode[deviceProp.computeMode]);

        }

        // If there are 2 or more GPUs, query to determine whether RDMA is supported

        if (deviceCount >= 2)

        {

            cudaDeviceProp prop[64];

            int gpuid[64]; // we want to find the first two GPUs that can support P2P

            int gpu_p2p_count = 0;

            for (int i=0; i < deviceCount; i++)

            {

                checkCudaErrors(cudaGetDeviceProperties(&prop[i], i));

                // Only boards based on Fermi or later can support P2P

                if ((prop[i].major >= 2)

    #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)

                    // on Windows (64-bit), the Tesla Compute Cluster driver for windows must be enabled to support this

                    && prop[i].tccDriver

    #endif

                  )

                {

                    // This is an array of P2P capable GPUs

                    gpuid[gpu_p2p_count++] = i;

                }

            }

            // Show all the combinations of support P2P GPUs

            int can_access_peer;
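            // cudaDeviceCanAccessPeer() is directional (whether gpuid[i] can map gpuid[j]'s memory),
            // so the loops below check both orders of every pair.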

            if (gpu_p2p_count >= 2)

            {

                for (int i = 0; i < gpu_p2p_count; i++)

                {

                    for (int j = 0; j < gpu_p2p_count; j++)

                    {

                        if (gpuid[i] == gpuid[j])

                        {

                            continue;

                        }

                        checkCudaErrors(cudaDeviceCanAccessPeer(&can_access_peer, gpuid[i], gpuid[j]));

                            printf("> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n", prop[gpuid[i]].name, gpuid[i],

                              prop[gpuid[j]].name, gpuid[j] ,

                              can_access_peer ? "Yes" : "No");

                    }

                }

            }

        }

        // csv masterlog info

        // *****************************

        // exe and CUDA driver name

        printf("\n");

        std::string sProfileString = "deviceQuery, CUDA Driver = CUDART";

        char cTemp[16];

        // driver version

        sProfileString += ", CUDA Driver Version = ";

    #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)

        sprintf_s(cTemp, 10, "%d.%d", driverVersion/1000, (driverVersion%100)/10);

    #else

        sprintf(cTemp, "%d.%d", driverVersion/1000, (driverVersion%100)/10);

    #endif

        sProfileString +=  cTemp;

        // Runtime version

        sProfileString += ", CUDA Runtime Version = ";

    #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)

        sprintf_s(cTemp, 10, "%d.%d", runtimeVersion/1000, (runtimeVersion%100)/10);

    #else

        sprintf(cTemp, "%d.%d", runtimeVersion/1000, (runtimeVersion%100)/10);

    #endif

        sProfileString +=  cTemp;

        // Device count

        sProfileString += ", NumDevs = ";

    #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)

        sprintf_s(cTemp, 10, "%d", deviceCount);

    #else

        sprintf(cTemp, "%d", deviceCount);

    #endif

        sProfileString += cTemp;

        sProfileString += "\n";

        printf("%s", sProfileString.c_str());

        printf("Result = PASS\n");

        // finish

        exit(EXIT_SUCCESS);

    }
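
For comparison, below is a minimal stand-alone sketch that prints just a few of the properties listed above. The file name minimal_query.cu is hypothetical and the program is not part of the NVIDIA sample; it uses only the CUDA runtime API, so it needs none of the samples' helper headers and should build with a plain "nvcc -o minimal_query minimal_query.cu".

    // minimal_query.cu -- hypothetical stand-alone sketch, not part of the CUDA samples tree
    #include <cstdio>
    #include <cstdlib>
    #include <cuda_runtime.h>

    int main()
    {
        int deviceCount = 0;
        cudaError_t err = cudaGetDeviceCount(&deviceCount);
        if (err != cudaSuccess)
        {
            printf("cudaGetDeviceCount returned %d -> %s\n", (int)err, cudaGetErrorString(err));
            return EXIT_FAILURE;
        }
        printf("Detected %d CUDA capable device(s)\n", deviceCount);

        for (int dev = 0; dev < deviceCount; ++dev)
        {
            cudaDeviceProp prop;                     // filled in by the runtime for this device
            cudaGetDeviceProperties(&prop, dev);
            printf("Device %d: \"%s\", compute capability %d.%d, %.0f MBytes global memory, %d multiprocessors\n",
                  dev, prop.name, prop.major, prop.minor,
                  (float)prop.totalGlobalMem / 1048576.0f, prop.multiProcessorCount);
        }
        return EXIT_SUCCESS;
    }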
