3.2.1 : Le main.cpp

Écrivons le fichier main.cpp, qui interroge le runtime CUDA et affiche les propriétés de chaque GPU détecté :

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#include <cuda.h>
#include <cuda_runtime.h>

#include <sstream>
#include <iomanip>
#include <iostream>

// Beginning of GPU Architecture definitions
///Get the number of CUDA cores per streaming multiprocessor (SM) for a given compute capability
/**	@param major : major number of the compute capability (SM version) of the device
 * 	@param minor : minor number of the compute capability (SM version) of the device
 * 	@return number of CUDA cores per SM for this compute capability. If the compute capability is not in the table, a message is printed on stdout and the value of the last entry of the table is returned
*/
inline int _ConvertSMVer2Cores(int major, int minor){
	// Defines for GPU Architecture types (using the SM version to determine the # of cores per SM)
	typedef struct
	{
		int SM; // 0xMm (hexadecimal notation), M = SM Major version, and m = SM minor version
		int Cores;
	} sSMtoCores;

	// static const : the table is built once instead of on every call
	static const sSMtoCores nGpuArchCoresPerSM[] =
	{
		{ 0x10,  8 },	// Tesla Generation (SM 1.0) G80 class
		{ 0x11,  8 },	// Tesla Generation (SM 1.1) G8x class
		{ 0x12,  8 },	// Tesla Generation (SM 1.2) G9x class
		{ 0x13,  8 },	// Tesla Generation (SM 1.3) GT200 class
		{ 0x20, 32 },	// Fermi Generation (SM 2.0) GF100 class
		{ 0x21, 48 },	// Fermi Generation (SM 2.1) GF10x class
		{ 0x30, 192},	// Kepler Generation (SM 3.0) GK10x class
		{ 0x32, 192},	// Kepler Generation (SM 3.2) GK10x class
		{ 0x35, 192},	// Kepler Generation (SM 3.5) GK11x class
		{ 0x37, 192},	// Kepler Generation (SM 3.7) GK21x class
		{ 0x50, 128},	// Maxwell Generation (SM 5.0) GM10x class
		{ 0x52, 128},	// Maxwell Generation (SM 5.2)
		{ 0x53, 128},	// Maxwell Generation (SM 5.3)
		{ 0x60, 64},	// Pascal Generation (SM 6.0)
		{ 0x61, 128},	// Pascal Generation (SM 6.1)
		{ 0x62, 128},	// Pascal Generation (SM 6.2)
		{ 0x70, 64},	// Volta Generation (SM 7.0)
		{ 0x72, 64},	// Xavier (SM 7.2)
		{ 0x75, 64},	// Turing Generation (SM 7.5)
		{ 0x80, 64},	// Ampere Generation (SM 8.0)
		{ 0x86, 128},	// Ampere Generation (SM 8.6)
		{ 0x87, 128},	// Ampere Generation (SM 8.7)
		{ 0x89, 128},	// Ada Lovelace Generation (SM 8.9)
		{ 0x90, 128}	// Hopper Generation (SM 9.0)
	};
	const int nbEntry = (int)(sizeof(nGpuArchCoresPerSM)/sizeof(nGpuArchCoresPerSM[0]));

	// The key packs the minor number in the low 4 bits, so a minor outside [0, 15]
	// (or a negative major) must not be looked up : it would alias another architecture
	if(major >= 0 && minor >= 0 && minor < 16){
		const int smVersion = (major << 4) + minor;
		for(int index = 0; index < nbEntry; ++index){
			if(nGpuArchCoresPerSM[index].SM == smVersion){
				return nGpuArchCoresPerSM[index].Cores;
			}
		}
	}
	// If we don't find the values, we default to the last entry of the table to run properly
	const int defaultCores = nGpuArchCoresPerSM[nbEntry - 1].Cores;
	printf("\tMapSMtoCores for SM %d.%d is undefined.  Default to use %d Cores/SM\n", major, minor, defaultCores);
	return defaultCores;
}

///Build the hexadecimal text form of the UUID of a CUDA device
/**	@param deviceProp : properties of the CUDA device whose UUID is wanted
 * 	@return the first 16 bytes of deviceProp.uuid.bytes, each written as two hexadecimal digits, without any separator
*/
std::string phoenix_cuda_getUuid(const cudaDeviceProp & deviceProp){
	const int nbUuidByte = 16;
	std::stringstream hexStream;
	// Base and fill character stay set on the stream, only the width has to be given again for each byte
	hexStream << std::hex << std::setfill('0');
	for(int idx = 0; idx < nbUuidByte; ++idx){
		// Go through unsigned char first so that a byte above 0x7f is not sign-extended
		const unsigned char byteValue = static_cast<unsigned char>(deviceProp.uuid.bytes[idx]);
		hexStream << std::setw(2) << static_cast<unsigned int>(byteValue);
	}
	return hexStream.str();
}

///Print the properties of every CUDA device reported by the CUDA runtime
/**	@param argc : number of command line arguments (unused)
 * 	@param argv : table of command line arguments (unused)
 * 	@return EXIT_SUCCESS if every CUDA runtime call succeeded, EXIT_FAILURE otherwise
*/
int main(int argc, char** argv){
	(void)argc;
	(void)argv;
	int deviceCount = 0;
	cudaError_t error_id = cudaGetDeviceCount(&deviceCount);

	if(error_id != cudaSuccess){
		printf("cudaGetDeviceCount returned %d\n-> %s\n", (int)error_id, cudaGetErrorString(error_id));
		printf("deviceCount = %d\n", deviceCount);
		printf("Result = FAIL\n");
		return EXIT_FAILURE;
	}
	// This function call returns 0 if there are no CUDA capable devices.
	if(deviceCount == 0){
		printf("There are no available device(s) that support CUDA\n");
	}else{
		printf("Detected %d CUDA Capable device(s)\n", deviceCount);
	}

	int exitStatus = EXIT_SUCCESS;
	for(int dev = 0; dev < deviceCount; ++dev){
		// Do not print anything from deviceProp if it could not be filled : it would be uninitialised
		cudaDeviceProp deviceProp;
		error_id = cudaSetDevice(dev);
		if(error_id == cudaSuccess){
			error_id = cudaGetDeviceProperties(&deviceProp, dev);
		}
		if(error_id != cudaSuccess){
			fprintf(stderr, "Device %d: cannot get the device properties (error %d)\n-> %s\n",
				dev, (int)error_id, cudaGetErrorString(error_id));
			exitStatus = EXIT_FAILURE;
			continue;
		}

		printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name);
		std::string uuid(phoenix_cuda_getUuid(deviceProp));
		printf("\tUUID :                                         \"%s\"\n", uuid.c_str());

		// The versions stay at 0 (printed as 0.0) if they cannot be queried
		int driverVersion = 0, runtimeVersion = 0;
		error_id = cudaDriverGetVersion(&driverVersion);
		if(error_id == cudaSuccess){
			error_id = cudaRuntimeGetVersion(&runtimeVersion);
		}
		if(error_id != cudaSuccess){
			fprintf(stderr, "Device %d: cannot get the driver/runtime version (error %d)\n-> %s\n",
				dev, (int)error_id, cudaGetErrorString(error_id));
			exitStatus = EXIT_FAILURE;
		}
		printf("\tCUDA Driver Version / Runtime Version          %d.%d / %d.%d\n", 
			driverVersion/1000, (driverVersion%100)/10, runtimeVersion/1000, (runtimeVersion%100)/10);
		printf("\tCUDA Capability Major/Minor version number:    %d.%d\n", deviceProp.major, deviceProp.minor);
		printf("\tTotal amount of global memory:                 %.0f MBytes (%llu bytes)\n",
			(float)deviceProp.totalGlobalMem/1048576.0f, (unsigned long long) deviceProp.totalGlobalMem);
		int nbCore = _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor);
		printf("\t(%2d) Multiprocessors, (%3d) CUDA Cores/MP:     %d CUDA Cores\n",
			deviceProp.multiProcessorCount,
			nbCore,
			nbCore * deviceProp.multiProcessorCount);
		printf("\tGPU Clock rate:                                %.0f MHz (%0.2f GHz)\n", deviceProp.clockRate * 1e-3f, deviceProp.clockRate * 1e-6f);
		
		printf("\tMaximum Texture Dimension Size (x,y,z)         1D=(%d), 2D=(%d, %d), 3D=(%d, %d, %d)\n",
			deviceProp.maxTexture1D   , deviceProp.maxTexture2D[0], deviceProp.maxTexture2D[1],
			deviceProp.maxTexture3D[0], deviceProp.maxTexture3D[1], deviceProp.maxTexture3D[2]);
		printf("\tMaximum Layered 1D Texture Size, (num) layers  1D=(%d), %d layers\n",
			deviceProp.maxTexture1DLayered[0], deviceProp.maxTexture1DLayered[1]);
		printf("\tMaximum Layered 2D Texture Size, (num) layers  2D=(%d, %d), %d layers\n",
			deviceProp.maxTexture2DLayered[0], deviceProp.maxTexture2DLayered[1], deviceProp.maxTexture2DLayered[2]);
		// The size_t fields are cast to unsigned long long and printed with %llu :
		// %lu is only correct where unsigned long has the same size as size_t
		printf("\tTotal amount of constant memory:               %llu bytes\n", (unsigned long long)deviceProp.totalConstMem);
		printf("\tTotal amount of shared memory per block:       %llu bytes\n", (unsigned long long)deviceProp.sharedMemPerBlock);
		printf("\tTotal number of registers available per block: %d\n", deviceProp.regsPerBlock);
		printf("\tWarp size:                                     %d\n", deviceProp.warpSize);
		printf("\tMaximum number of threads per multiprocessor:  %d\n", deviceProp.maxThreadsPerMultiProcessor);
		printf("\tMaximum number of threads per block:           %d\n", deviceProp.maxThreadsPerBlock);
		printf("\tMax dimension size of a thread block (x,y,z): (%d, %d, %d)\n",
			deviceProp.maxThreadsDim[0],
			deviceProp.maxThreadsDim[1],
			deviceProp.maxThreadsDim[2]);
		printf("\tMax dimension size of a grid size    (x,y,z): (%d, %d, %d)\n",
			deviceProp.maxGridSize[0],
			deviceProp.maxGridSize[1],
			deviceProp.maxGridSize[2]);
		printf("\tMaximum memory pitch:                          %llu bytes\n", (unsigned long long)deviceProp.memPitch);
		printf("\tTexture alignment:                             %llu bytes\n", (unsigned long long)deviceProp.textureAlignment);
		printf("\tConcurrent copy and kernel execution:          %s with %d copy engine(s)\n", (deviceProp.deviceOverlap ? "Yes" : "No"), deviceProp.asyncEngineCount);
		printf("\tRun time limit on kernels:                     %s\n", deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No");
		printf("\tIntegrated GPU sharing Host Memory:            %s\n", deviceProp.integrated ? "Yes" : "No");
		printf("\tSupport host page-locked memory mapping:       %s\n", deviceProp.canMapHostMemory ? "Yes" : "No");
		printf("\tAlignment requirement for Surfaces:            %s\n", deviceProp.surfaceAlignment ? "Yes" : "No");
		printf("\tDevice has ECC support:                        %s\n", deviceProp.ECCEnabled ? "Enabled" : "Disabled");
	}
	return exitStatus;
}


Le fichier main.cpp complet :

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
/***************************************
	Auteur : Pierre Aubert
	Mail : pierre.aubert@lapp.in2p3.fr
	Licence : CeCILL-C
****************************************/

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#include <cuda.h>
#include <cuda_runtime.h>

#include <sstream>
#include <iomanip>
#include <iostream>

// Beginning of GPU Architecture definitions
///Get the number of CUDA cores per streaming multiprocessor (SM) for a given compute capability
/**	@param major : major number of the compute capability (SM version) of the device
 * 	@param minor : minor number of the compute capability (SM version) of the device
 * 	@return number of CUDA cores per SM for this compute capability. If the compute capability is not in the table, a message is printed on stdout and the value of the last entry of the table is returned
*/
inline int _ConvertSMVer2Cores(int major, int minor){
	// Defines for GPU Architecture types (using the SM version to determine the # of cores per SM)
	typedef struct
	{
		int SM; // 0xMm (hexadecimal notation), M = SM Major version, and m = SM minor version
		int Cores;
	} sSMtoCores;

	// static const : the table is built once instead of on every call
	static const sSMtoCores nGpuArchCoresPerSM[] =
	{
		{ 0x10,  8 },	// Tesla Generation (SM 1.0) G80 class
		{ 0x11,  8 },	// Tesla Generation (SM 1.1) G8x class
		{ 0x12,  8 },	// Tesla Generation (SM 1.2) G9x class
		{ 0x13,  8 },	// Tesla Generation (SM 1.3) GT200 class
		{ 0x20, 32 },	// Fermi Generation (SM 2.0) GF100 class
		{ 0x21, 48 },	// Fermi Generation (SM 2.1) GF10x class
		{ 0x30, 192},	// Kepler Generation (SM 3.0) GK10x class
		{ 0x32, 192},	// Kepler Generation (SM 3.2) GK10x class
		{ 0x35, 192},	// Kepler Generation (SM 3.5) GK11x class
		{ 0x37, 192},	// Kepler Generation (SM 3.7) GK21x class
		{ 0x50, 128},	// Maxwell Generation (SM 5.0) GM10x class
		{ 0x52, 128},	// Maxwell Generation (SM 5.2)
		{ 0x53, 128},	// Maxwell Generation (SM 5.3)
		{ 0x60, 64},	// Pascal Generation (SM 6.0)
		{ 0x61, 128},	// Pascal Generation (SM 6.1)
		{ 0x62, 128},	// Pascal Generation (SM 6.2)
		{ 0x70, 64},	// Volta Generation (SM 7.0)
		{ 0x72, 64},	// Xavier (SM 7.2)
		{ 0x75, 64},	// Turing Generation (SM 7.5)
		{ 0x80, 64},	// Ampere Generation (SM 8.0)
		{ 0x86, 128},	// Ampere Generation (SM 8.6)
		{ 0x87, 128},	// Ampere Generation (SM 8.7)
		{ 0x89, 128},	// Ada Lovelace Generation (SM 8.9)
		{ 0x90, 128}	// Hopper Generation (SM 9.0)
	};
	const int nbEntry = (int)(sizeof(nGpuArchCoresPerSM)/sizeof(nGpuArchCoresPerSM[0]));

	// The key packs the minor number in the low 4 bits, so a minor outside [0, 15]
	// (or a negative major) must not be looked up : it would alias another architecture
	if(major >= 0 && minor >= 0 && minor < 16){
		const int smVersion = (major << 4) + minor;
		for(int index = 0; index < nbEntry; ++index){
			if(nGpuArchCoresPerSM[index].SM == smVersion){
				return nGpuArchCoresPerSM[index].Cores;
			}
		}
	}
	// If we don't find the values, we default to the last entry of the table to run properly
	const int defaultCores = nGpuArchCoresPerSM[nbEntry - 1].Cores;
	printf("\tMapSMtoCores for SM %d.%d is undefined.  Default to use %d Cores/SM\n", major, minor, defaultCores);
	return defaultCores;
}

///Build the hexadecimal text form of the UUID of a CUDA device
/**	@param deviceProp : properties of the CUDA device whose UUID is wanted
 * 	@return the first 16 bytes of deviceProp.uuid.bytes, each written as two hexadecimal digits, without any separator
*/
std::string phoenix_cuda_getUuid(const cudaDeviceProp & deviceProp){
	const int nbUuidByte = 16;
	std::stringstream hexStream;
	// Base and fill character stay set on the stream, only the width has to be given again for each byte
	hexStream << std::hex << std::setfill('0');
	for(int idx = 0; idx < nbUuidByte; ++idx){
		// Go through unsigned char first so that a byte above 0x7f is not sign-extended
		const unsigned char byteValue = static_cast<unsigned char>(deviceProp.uuid.bytes[idx]);
		hexStream << std::setw(2) << static_cast<unsigned int>(byteValue);
	}
	return hexStream.str();
}

///Print the properties of every CUDA device reported by the CUDA runtime
/**	@param argc : number of command line arguments (unused)
 * 	@param argv : table of command line arguments (unused)
 * 	@return EXIT_SUCCESS if every CUDA runtime call succeeded, EXIT_FAILURE otherwise
*/
int main(int argc, char** argv){
	(void)argc;
	(void)argv;
	int deviceCount = 0;
	cudaError_t error_id = cudaGetDeviceCount(&deviceCount);

	if(error_id != cudaSuccess){
		printf("cudaGetDeviceCount returned %d\n-> %s\n", (int)error_id, cudaGetErrorString(error_id));
		printf("deviceCount = %d\n", deviceCount);
		printf("Result = FAIL\n");
		return EXIT_FAILURE;
	}
	// This function call returns 0 if there are no CUDA capable devices.
	if(deviceCount == 0){
		printf("There are no available device(s) that support CUDA\n");
	}else{
		printf("Detected %d CUDA Capable device(s)\n", deviceCount);
	}

	int exitStatus = EXIT_SUCCESS;
	for(int dev = 0; dev < deviceCount; ++dev){
		// Do not print anything from deviceProp if it could not be filled : it would be uninitialised
		cudaDeviceProp deviceProp;
		error_id = cudaSetDevice(dev);
		if(error_id == cudaSuccess){
			error_id = cudaGetDeviceProperties(&deviceProp, dev);
		}
		if(error_id != cudaSuccess){
			fprintf(stderr, "Device %d: cannot get the device properties (error %d)\n-> %s\n",
				dev, (int)error_id, cudaGetErrorString(error_id));
			exitStatus = EXIT_FAILURE;
			continue;
		}

		printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name);
		std::string uuid(phoenix_cuda_getUuid(deviceProp));
		printf("\tUUID :                                         \"%s\"\n", uuid.c_str());

		// The versions stay at 0 (printed as 0.0) if they cannot be queried
		int driverVersion = 0, runtimeVersion = 0;
		error_id = cudaDriverGetVersion(&driverVersion);
		if(error_id == cudaSuccess){
			error_id = cudaRuntimeGetVersion(&runtimeVersion);
		}
		if(error_id != cudaSuccess){
			fprintf(stderr, "Device %d: cannot get the driver/runtime version (error %d)\n-> %s\n",
				dev, (int)error_id, cudaGetErrorString(error_id));
			exitStatus = EXIT_FAILURE;
		}
		printf("\tCUDA Driver Version / Runtime Version          %d.%d / %d.%d\n", 
			driverVersion/1000, (driverVersion%100)/10, runtimeVersion/1000, (runtimeVersion%100)/10);
		printf("\tCUDA Capability Major/Minor version number:    %d.%d\n", deviceProp.major, deviceProp.minor);
		printf("\tTotal amount of global memory:                 %.0f MBytes (%llu bytes)\n",
			(float)deviceProp.totalGlobalMem/1048576.0f, (unsigned long long) deviceProp.totalGlobalMem);
		int nbCore = _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor);
		printf("\t(%2d) Multiprocessors, (%3d) CUDA Cores/MP:     %d CUDA Cores\n",
			deviceProp.multiProcessorCount,
			nbCore,
			nbCore * deviceProp.multiProcessorCount);
		printf("\tGPU Clock rate:                                %.0f MHz (%0.2f GHz)\n", deviceProp.clockRate * 1e-3f, deviceProp.clockRate * 1e-6f);
		
		printf("\tMaximum Texture Dimension Size (x,y,z)         1D=(%d), 2D=(%d, %d), 3D=(%d, %d, %d)\n",
			deviceProp.maxTexture1D   , deviceProp.maxTexture2D[0], deviceProp.maxTexture2D[1],
			deviceProp.maxTexture3D[0], deviceProp.maxTexture3D[1], deviceProp.maxTexture3D[2]);
		printf("\tMaximum Layered 1D Texture Size, (num) layers  1D=(%d), %d layers\n",
			deviceProp.maxTexture1DLayered[0], deviceProp.maxTexture1DLayered[1]);
		printf("\tMaximum Layered 2D Texture Size, (num) layers  2D=(%d, %d), %d layers\n",
			deviceProp.maxTexture2DLayered[0], deviceProp.maxTexture2DLayered[1], deviceProp.maxTexture2DLayered[2]);
		// The size_t fields are cast to unsigned long long and printed with %llu :
		// %lu is only correct where unsigned long has the same size as size_t
		printf("\tTotal amount of constant memory:               %llu bytes\n", (unsigned long long)deviceProp.totalConstMem);
		printf("\tTotal amount of shared memory per block:       %llu bytes\n", (unsigned long long)deviceProp.sharedMemPerBlock);
		printf("\tTotal number of registers available per block: %d\n", deviceProp.regsPerBlock);
		printf("\tWarp size:                                     %d\n", deviceProp.warpSize);
		printf("\tMaximum number of threads per multiprocessor:  %d\n", deviceProp.maxThreadsPerMultiProcessor);
		printf("\tMaximum number of threads per block:           %d\n", deviceProp.maxThreadsPerBlock);
		printf("\tMax dimension size of a thread block (x,y,z): (%d, %d, %d)\n",
			deviceProp.maxThreadsDim[0],
			deviceProp.maxThreadsDim[1],
			deviceProp.maxThreadsDim[2]);
		printf("\tMax dimension size of a grid size    (x,y,z): (%d, %d, %d)\n",
			deviceProp.maxGridSize[0],
			deviceProp.maxGridSize[1],
			deviceProp.maxGridSize[2]);
		printf("\tMaximum memory pitch:                          %llu bytes\n", (unsigned long long)deviceProp.memPitch);
		printf("\tTexture alignment:                             %llu bytes\n", (unsigned long long)deviceProp.textureAlignment);
		printf("\tConcurrent copy and kernel execution:          %s with %d copy engine(s)\n", (deviceProp.deviceOverlap ? "Yes" : "No"), deviceProp.asyncEngineCount);
		printf("\tRun time limit on kernels:                     %s\n", deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No");
		printf("\tIntegrated GPU sharing Host Memory:            %s\n", deviceProp.integrated ? "Yes" : "No");
		printf("\tSupport host page-locked memory mapping:       %s\n", deviceProp.canMapHostMemory ? "Yes" : "No");
		printf("\tAlignment requirement for Surfaces:            %s\n", deviceProp.surfaceAlignment ? "Yes" : "No");
		printf("\tDevice has ECC support:                        %s\n", deviceProp.ECCEnabled ? "Enabled" : "Disabled");
	}
	return exitStatus;
}


Vous pouvez le télécharger ici.