3.2.1 : Le main.cpp
Écrivons le main.cpp :
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 |
#include <stdio.h> #include <stdlib.h> #include <time.h> #include <cuda.h> #include <cuda_runtime.h> #include <sstream> #include <iomanip> #include <iostream> // Beginning of GPU Architecture definitions /** @param major : */ inline int _ConvertSMVer2Cores(int major, int minor){ // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM typedef struct { int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM minor version int Cores; } sSMtoCores; sSMtoCores nGpuArchCoresPerSM[] = { { 0x10, 8 }, // Tesla Generation (SM 1.0) G80 class { 0x11, 8 }, // Tesla Generation (SM 1.1) G8x class { 0x12, 8 }, // Tesla Generation (SM 1.2) G9x class { 0x13, 8 }, // Tesla Generation (SM 1.3) GT200 class { 0x20, 32 }, // Fermi Generation (SM 2.0) GF100 class { 0x21, 48 }, // Fermi Generation (SM 2.1) GF10x class { 0x30, 192}, // Kepler Generation (SM 3.0) GK10x class { 0x32, 192}, // Kepler Generation (SM 3.2) GK10x class { 0x35, 192}, // Kepler Generation (SM 3.5) GK11x class { 0x37, 192}, // Kepler Generation (SM 3.7) GK21x class { 0x50, 128}, // Maxwell Generation (SM 5.0) GM10x class { 0x52, 128}, // Maxwell Generation (SM 5.2) { 0x53, 128}, // Maxwell Generation (SM 5.3) { 0x60, 64}, // Pascal Generation (SM 6.0) { 0x61, 128}, // Pascal Generation (SM 6.1) { 0x62, 128}, // Pascal Generation (SM 6.2) { 0x70, 64}, // Volta Generation (SM 7.0) { 0x72, 64}, // Volta Generation (SM 7.0) { 0x75, 64}, // Volta Generation (SM 7.0) { 0x80, 64}, //Ampere Generation (SM 8.0) { -1, -1 } }; int index = 0; while (nGpuArchCoresPerSM[index].SM != -1){ if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)){ return nGpuArchCoresPerSM[index].Cores; } index++; } // If we don't find the values, we default use the previous one to run properly printf("\tMapSMtoCores for SM %d.%d is undefined. Default to use %d Cores/SM\n", major, minor, nGpuArchCoresPerSM[index-1].Cores); return nGpuArchCoresPerSM[index-1].Cores; } ///Get the UUID of the given Cuda Device /** @param deviceProp : Cuda Device to be used * @return corresponding UUID */ std::string phoenix_cuda_getUuid(const cudaDeviceProp & deviceProp){ const unsigned char * tabByte = (const unsigned char *)deviceProp.uuid.bytes; std::stringstream out; for(int i(0); i < 16; ++i){ // printf("%02x,", (unsigned int)(tabByte[i])); out << std::hex << std::setfill('0') << std::setw(2) << (unsigned int)tabByte[i]; } return out.str(); } int main(int argc, char** argv){ int deviceCount = 0; cudaError_t error_id = cudaGetDeviceCount(&deviceCount); if(error_id != cudaSuccess){ printf("cudaGetDeviceCount returned %d\n-> %s\n", (int)error_id, cudaGetErrorString(error_id)); printf("deviceCount = %d\n", deviceCount); printf("Result = FAIL\n"); exit(EXIT_FAILURE); } // This function call returns 0 if there are no CUDA capable devices. if(deviceCount == 0){ printf("There are no available device(s) that support CUDA\n"); }else{ printf("Detected %d CUDA Capable device(s)\n", deviceCount); } int driverVersion = 0, runtimeVersion = 0; for(int dev = 0; dev < deviceCount; ++dev){ cudaSetDevice(dev); cudaDeviceProp deviceProp; cudaGetDeviceProperties(&deviceProp, dev); printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name); std::string uuid(phoenix_cuda_getUuid(deviceProp)); printf("\tUUID : \"%s\"\n", uuid.c_str()); cudaDriverGetVersion(&driverVersion); cudaRuntimeGetVersion(&runtimeVersion); printf("\tCUDA Driver Version / Runtime Version %d.%d / %d.%d\n", driverVersion/1000, (driverVersion%100)/10, runtimeVersion/1000, (runtimeVersion%100)/10); printf("\tCUDA Capability Major/Minor version number: %d.%d\n", deviceProp.major, deviceProp.minor); char msg[256]; sprintf(msg, "\tTotal amount of global memory: %.0f MBytes (%llu bytes)\n", (float)deviceProp.totalGlobalMem/1048576.0f, (unsigned long long) deviceProp.totalGlobalMem); printf("%s", msg); int nbCore = _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor); printf("\t(%2d) Multiprocessors, (%3d) CUDA Cores/MP: %d CUDA Cores\n", deviceProp.multiProcessorCount, nbCore, nbCore * deviceProp.multiProcessorCount); printf("\tGPU Clock rate: %.0f MHz (%0.2f GHz)\n", deviceProp.clockRate * 1e-3f, deviceProp.clockRate * 1e-6f); printf("\tMaximum Texture Dimension Size (x,y,z) 1D=(%d), 2D=(%d, %d), 3D=(%d, %d, %d)\n", deviceProp.maxTexture1D , deviceProp.maxTexture2D[0], deviceProp.maxTexture2D[1], deviceProp.maxTexture3D[0], deviceProp.maxTexture3D[1], deviceProp.maxTexture3D[2]); printf("\tMaximum Layered 1D Texture Size, (num) layers 1D=(%d), %d layers\n", deviceProp.maxTexture1DLayered[0], deviceProp.maxTexture1DLayered[1]); printf("\tMaximum Layered 2D Texture Size, (num) layers 2D=(%d, %d), %d layers\n", deviceProp.maxTexture2DLayered[0], deviceProp.maxTexture2DLayered[1], deviceProp.maxTexture2DLayered[2]); printf("\tTotal amount of constant memory: %lu bytes\n", deviceProp.totalConstMem); printf("\tTotal amount of shared memory per block: %lu bytes\n", deviceProp.sharedMemPerBlock); printf("\tTotal number of registers available per block: %d\n", deviceProp.regsPerBlock); printf("\tWarp size: %d\n", deviceProp.warpSize); printf("\tMaximum number of threads per multiprocessor: %d\n", deviceProp.maxThreadsPerMultiProcessor); printf("\tMaximum number of threads per block: %d\n", deviceProp.maxThreadsPerBlock); printf("\tMax dimension size of a thread block (x,y,z): (%d, %d, %d)\n", deviceProp.maxThreadsDim[0], deviceProp.maxThreadsDim[1], deviceProp.maxThreadsDim[2]); printf("\tMax dimension size of a grid size (x,y,z): (%d, %d, %d)\n", deviceProp.maxGridSize[0], deviceProp.maxGridSize[1], deviceProp.maxGridSize[2]); printf("\tMaximum memory pitch: %lu bytes\n", deviceProp.memPitch); printf("\tTexture alignment: %lu bytes\n", deviceProp.textureAlignment); printf("\tConcurrent copy and kernel execution: %s with %d copy engine(s)\n", (deviceProp.deviceOverlap ? "Yes" : "No"), deviceProp.asyncEngineCount); printf("\tRun time limit on kernels: %s\n", deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No"); printf("\tIntegrated GPU sharing Host Memory: %s\n", deviceProp.integrated ? "Yes" : "No"); printf("\tSupport host page-locked memory mapping: %s\n", deviceProp.canMapHostMemory ? "Yes" : "No"); printf("\tAlignment requirement for Surfaces: %s\n", deviceProp.surfaceAlignment ? "Yes" : "No"); printf("\tDevice has ECC support: %s\n", deviceProp.ECCEnabled ? "Enabled" : "Disabled"); } return 0; } |
Le fichier main.cpp complet :
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 |
/*************************************** Auteur : Pierre Aubert Mail : pierre.aubert@lapp.in2p3.fr Licence : CeCILL-C ****************************************/ #include <stdio.h> #include <stdlib.h> #include <time.h> #include <cuda.h> #include <cuda_runtime.h> #include <sstream> #include <iomanip> #include <iostream> // Beginning of GPU Architecture definitions /** @param major : */ inline int _ConvertSMVer2Cores(int major, int minor){ // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM typedef struct { int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM minor version int Cores; } sSMtoCores; sSMtoCores nGpuArchCoresPerSM[] = { { 0x10, 8 }, // Tesla Generation (SM 1.0) G80 class { 0x11, 8 }, // Tesla Generation (SM 1.1) G8x class { 0x12, 8 }, // Tesla Generation (SM 1.2) G9x class { 0x13, 8 }, // Tesla Generation (SM 1.3) GT200 class { 0x20, 32 }, // Fermi Generation (SM 2.0) GF100 class { 0x21, 48 }, // Fermi Generation (SM 2.1) GF10x class { 0x30, 192}, // Kepler Generation (SM 3.0) GK10x class { 0x32, 192}, // Kepler Generation (SM 3.2) GK10x class { 0x35, 192}, // Kepler Generation (SM 3.5) GK11x class { 0x37, 192}, // Kepler Generation (SM 3.7) GK21x class { 0x50, 128}, // Maxwell Generation (SM 5.0) GM10x class { 0x52, 128}, // Maxwell Generation (SM 5.2) { 0x53, 128}, // Maxwell Generation (SM 5.3) { 0x60, 64}, // Pascal Generation (SM 6.0) { 0x61, 128}, // Pascal Generation (SM 6.1) { 0x62, 128}, // Pascal Generation (SM 6.2) { 0x70, 64}, // Volta Generation (SM 7.0) { 0x72, 64}, // Volta Generation (SM 7.0) { 0x75, 64}, // Volta Generation (SM 7.0) { 0x80, 64}, //Ampere Generation (SM 8.0) { -1, -1 } }; int index = 0; while (nGpuArchCoresPerSM[index].SM != -1){ if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)){ return nGpuArchCoresPerSM[index].Cores; } index++; } // If we don't find the values, we default use the previous one to run properly printf("\tMapSMtoCores for SM %d.%d is undefined. Default to use %d Cores/SM\n", major, minor, nGpuArchCoresPerSM[index-1].Cores); return nGpuArchCoresPerSM[index-1].Cores; } ///Get the UUID of the given Cuda Device /** @param deviceProp : Cuda Device to be used * @return corresponding UUID */ std::string phoenix_cuda_getUuid(const cudaDeviceProp & deviceProp){ const unsigned char * tabByte = (const unsigned char *)deviceProp.uuid.bytes; std::stringstream out; for(int i(0); i < 16; ++i){ // printf("%02x,", (unsigned int)(tabByte[i])); out << std::hex << std::setfill('0') << std::setw(2) << (unsigned int)tabByte[i]; } return out.str(); } int main(int argc, char** argv){ int deviceCount = 0; cudaError_t error_id = cudaGetDeviceCount(&deviceCount); if(error_id != cudaSuccess){ printf("cudaGetDeviceCount returned %d\n-> %s\n", (int)error_id, cudaGetErrorString(error_id)); printf("deviceCount = %d\n", deviceCount); printf("Result = FAIL\n"); exit(EXIT_FAILURE); } // This function call returns 0 if there are no CUDA capable devices. if(deviceCount == 0){ printf("There are no available device(s) that support CUDA\n"); }else{ printf("Detected %d CUDA Capable device(s)\n", deviceCount); } int driverVersion = 0, runtimeVersion = 0; for(int dev = 0; dev < deviceCount; ++dev){ cudaSetDevice(dev); cudaDeviceProp deviceProp; cudaGetDeviceProperties(&deviceProp, dev); printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name); std::string uuid(phoenix_cuda_getUuid(deviceProp)); printf("\tUUID : \"%s\"\n", uuid.c_str()); cudaDriverGetVersion(&driverVersion); cudaRuntimeGetVersion(&runtimeVersion); printf("\tCUDA Driver Version / Runtime Version %d.%d / %d.%d\n", driverVersion/1000, (driverVersion%100)/10, runtimeVersion/1000, (runtimeVersion%100)/10); printf("\tCUDA Capability Major/Minor version number: %d.%d\n", deviceProp.major, deviceProp.minor); char msg[256]; sprintf(msg, "\tTotal amount of global memory: %.0f MBytes (%llu bytes)\n", (float)deviceProp.totalGlobalMem/1048576.0f, (unsigned long long) deviceProp.totalGlobalMem); printf("%s", msg); int nbCore = _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor); printf("\t(%2d) Multiprocessors, (%3d) CUDA Cores/MP: %d CUDA Cores\n", deviceProp.multiProcessorCount, nbCore, nbCore * deviceProp.multiProcessorCount); printf("\tGPU Clock rate: %.0f MHz (%0.2f GHz)\n", deviceProp.clockRate * 1e-3f, deviceProp.clockRate * 1e-6f); printf("\tMaximum Texture Dimension Size (x,y,z) 1D=(%d), 2D=(%d, %d), 3D=(%d, %d, %d)\n", deviceProp.maxTexture1D , deviceProp.maxTexture2D[0], deviceProp.maxTexture2D[1], deviceProp.maxTexture3D[0], deviceProp.maxTexture3D[1], deviceProp.maxTexture3D[2]); printf("\tMaximum Layered 1D Texture Size, (num) layers 1D=(%d), %d layers\n", deviceProp.maxTexture1DLayered[0], deviceProp.maxTexture1DLayered[1]); printf("\tMaximum Layered 2D Texture Size, (num) layers 2D=(%d, %d), %d layers\n", deviceProp.maxTexture2DLayered[0], deviceProp.maxTexture2DLayered[1], deviceProp.maxTexture2DLayered[2]); printf("\tTotal amount of constant memory: %lu bytes\n", deviceProp.totalConstMem); printf("\tTotal amount of shared memory per block: %lu bytes\n", deviceProp.sharedMemPerBlock); printf("\tTotal number of registers available per block: %d\n", deviceProp.regsPerBlock); printf("\tWarp size: %d\n", deviceProp.warpSize); printf("\tMaximum number of threads per multiprocessor: %d\n", deviceProp.maxThreadsPerMultiProcessor); printf("\tMaximum number of threads per block: %d\n", deviceProp.maxThreadsPerBlock); printf("\tMax dimension size of a thread block (x,y,z): (%d, %d, %d)\n", deviceProp.maxThreadsDim[0], deviceProp.maxThreadsDim[1], deviceProp.maxThreadsDim[2]); printf("\tMax dimension size of a grid size (x,y,z): (%d, %d, %d)\n", deviceProp.maxGridSize[0], deviceProp.maxGridSize[1], deviceProp.maxGridSize[2]); printf("\tMaximum memory pitch: %lu bytes\n", deviceProp.memPitch); printf("\tTexture alignment: %lu bytes\n", deviceProp.textureAlignment); printf("\tConcurrent copy and kernel execution: %s with %d copy engine(s)\n", (deviceProp.deviceOverlap ? "Yes" : "No"), deviceProp.asyncEngineCount); printf("\tRun time limit on kernels: %s\n", deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No"); printf("\tIntegrated GPU sharing Host Memory: %s\n", deviceProp.integrated ? "Yes" : "No"); printf("\tSupport host page-locked memory mapping: %s\n", deviceProp.canMapHostMemory ? "Yes" : "No"); printf("\tAlignment requirement for Surfaces: %s\n", deviceProp.surfaceAlignment ? "Yes" : "No"); printf("\tDevice has ECC support: %s\n", deviceProp.ECCEnabled ? "Enabled" : "Disabled"); } return 0; } |
Vous pouvez le télécharger ici.