Le main.cpp

3.2.1 : Le main.cpp

Écrivons le main.cpp :

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#include <cuda.h>
#include <cuda_runtime.h>

#include <sstream>
#include <iomanip>
#include <iostream>

// Beginning of GPU Architecture definitions
///Get the number of CUDA cores per multiprocessor (SM) for a given compute capability
/**	@param major : major number of the compute capability (SM version) of the device
 * 	@param minor : minor number of the compute capability (SM version) of the device
 * 	@return number of CUDA cores per multiprocessor for this SM version.
 * 		If the SM version is not in the table, a message is printed and the value of the last entry of the table is returned
*/
inline int _ConvertSMVer2Cores(int major, int minor){
	// Defines for GPU Architecture types (using the SM version to determine the # of cores per SM)
	typedef struct
	{
		int SM; // 0xMm (hexadecimal notation), M = SM Major version, and m = SM minor version
		int Cores;
	} sSMtoCores;

	// The table is read-only, static const avoids rebuilding it at every call
	static const sSMtoCores nGpuArchCoresPerSM[] =
	{
		{ 0x10,  8 },	// Tesla Generation (SM 1.0) G80 class
		{ 0x11,  8 },	// Tesla Generation (SM 1.1) G8x class
		{ 0x12,  8 },	// Tesla Generation (SM 1.2) G9x class
		{ 0x13,  8 },	// Tesla Generation (SM 1.3) GT200 class
		{ 0x20, 32 },	// Fermi Generation (SM 2.0) GF100 class
		{ 0x21, 48 },	// Fermi Generation (SM 2.1) GF10x class
		{ 0x30, 192},	// Kepler Generation (SM 3.0) GK10x class
		{ 0x32, 192},	// Kepler Generation (SM 3.2) GK10x class
		{ 0x35, 192},	// Kepler Generation (SM 3.5) GK11x class
		{ 0x37, 192},	// Kepler Generation (SM 3.7) GK21x class
		{ 0x50, 128},	// Maxwell Generation (SM 5.0) GM10x class
		{ 0x52, 128},	// Maxwell Generation (SM 5.2)
		{ 0x53, 128},	// Maxwell Generation (SM 5.3)
		{ 0x60, 64},	// Pascal Generation (SM 6.0)
		{ 0x61, 128},	// Pascal Generation (SM 6.1)
		{ 0x62, 128},	// Pascal Generation (SM 6.2)
		{ 0x70, 64},	// Volta Generation (SM 7.0)
		{ 0x72, 64},	// Xavier Generation (SM 7.2)
		{ 0x75, 64},	// Turing Generation (SM 7.5)
		{ 0x80, 64},	// Ampere Generation (SM 8.0)
		{ 0x86, 128},	// Ampere Generation (SM 8.6)
		{ 0x87, 128},	// Ampere Generation (SM 8.7)
		{ 0x89, 128},	// Ada Lovelace Generation (SM 8.9)
		{ 0x90, 128},	// Hopper Generation (SM 9.0)
		{   -1, -1 }	// Sentinel : end of the table
	};
	// Same 0xMm encoding as the SM field of the table
	const int smVersion = (major << 4) + minor;
	int index = 0;
	while (nGpuArchCoresPerSM[index].SM != -1){
		if (nGpuArchCoresPerSM[index].SM == smVersion){
			return nGpuArchCoresPerSM[index].Cores;
		}
		index++;
	}
	// If we don't find the values, we default use the previous one to run properly
	// (index is on the sentinel here, so index-1 is the last real entry of the table)
	printf("\tMapSMtoCores for SM %d.%d is undefined.  Default to use %d Cores/SM\n", major, minor, nGpuArchCoresPerSM[index-1].Cores);
	return nGpuArchCoresPerSM[index-1].Cores;
}

///Build the hexadecimal representation of the UUID of the given Cuda Device
/**	@param deviceProp : properties of the Cuda Device to be used
 * 	@return UUID written as hexadecimal digits, two per byte, for the 16 bytes read, without any separator
*/
std::string phoenix_cuda_getUuid(const cudaDeviceProp & deviceProp){
	// The bytes are read as unsigned so that they are printed as values in [0, 255]
	const unsigned char * uuidBegin = reinterpret_cast<const unsigned char *>(deviceProp.uuid.bytes);
	const unsigned char * uuidEnd = uuidBegin + 16;
	std::stringstream hexStream;
	// std::hex and std::setfill stay active on the stream, they are set once
	hexStream << std::hex << std::setfill('0');
	for(const unsigned char * ptrByte = uuidBegin; ptrByte != uuidEnd; ++ptrByte){
		// std::setw only applies to the next insertion, so it is given for each byte
		hexStream << std::setw(2) << static_cast<unsigned int>(*ptrByte);
	}
	return hexStream.str();
}

int main(int argc, char** argv){
	int deviceCount = 0;
	cudaError_t error_id = cudaGetDeviceCount(&deviceCount);

	if(error_id != cudaSuccess){
		printf("cudaGetDeviceCount returned %d\n-> %s\n", (int)error_id, cudaGetErrorString(error_id));
		printf("deviceCount = %d\n", deviceCount);
		printf("Result = FAIL\n");
		exit(EXIT_FAILURE);
	}
	// This function call returns 0 if there are no CUDA capable devices.
	if(deviceCount == 0){
		printf("There are no available device(s) that support CUDA\n");
	}else{
		printf("Detected %d CUDA Capable device(s)\n", deviceCount);
	}

	int driverVersion = 0, runtimeVersion = 0;
	for(int dev = 0; dev < deviceCount; ++dev){
		cudaSetDevice(dev);
		cudaDeviceProp deviceProp;
		cudaGetDeviceProperties(&deviceProp, dev);
		
		printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name);
		std::string uuid(phoenix_cuda_getUuid(deviceProp));
		printf("\tUUID :                                         \"%s\"\n", uuid.c_str());
		cudaDriverGetVersion(&driverVersion);
		cudaRuntimeGetVersion(&runtimeVersion);
		printf("\tCUDA Driver Version / Runtime Version          %d.%d / %d.%d\n", 
			driverVersion/1000, (driverVersion%100)/10, runtimeVersion/1000, (runtimeVersion%100)/10);
		printf("\tCUDA Capability Major/Minor version number:    %d.%d\n", deviceProp.major, deviceProp.minor);
		char msg[256];
		sprintf(msg, "\tTotal amount of global memory:                 %.0f MBytes (%llu bytes)\n",
			(float)deviceProp.totalGlobalMem/1048576.0f, (unsigned long long) deviceProp.totalGlobalMem);
		printf("%s", msg);
		int nbCore = _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor);
		printf("\t(%2d) Multiprocessors, (%3d) CUDA Cores/MP:     %d CUDA Cores\n",
			deviceProp.multiProcessorCount,
			nbCore,
			nbCore * deviceProp.multiProcessorCount);
		printf("\tGPU Clock rate:                                %.0f MHz (%0.2f GHz)\n", deviceProp.clockRate * 1e-3f, deviceProp.clockRate * 1e-6f);
		
		printf("\tMaximum Texture Dimension Size (x,y,z)         1D=(%d), 2D=(%d, %d), 3D=(%d, %d, %d)\n",
			deviceProp.maxTexture1D   , deviceProp.maxTexture2D[0], deviceProp.maxTexture2D[1],
			deviceProp.maxTexture3D[0], deviceProp.maxTexture3D[1], deviceProp.maxTexture3D[2]);
		printf("\tMaximum Layered 1D Texture Size, (num) layers  1D=(%d), %d layers\n",
			deviceProp.maxTexture1DLayered[0], deviceProp.maxTexture1DLayered[1]);
		printf("\tMaximum Layered 2D Texture Size, (num) layers  2D=(%d, %d), %d layers\n",
			deviceProp.maxTexture2DLayered[0], deviceProp.maxTexture2DLayered[1], deviceProp.maxTexture2DLayered[2]);
		printf("\tTotal amount of constant memory:               %lu bytes\n", deviceProp.totalConstMem);
		printf("\tTotal amount of shared memory per block:       %lu bytes\n", deviceProp.sharedMemPerBlock);
		printf("\tTotal number of registers available per block: %d\n", deviceProp.regsPerBlock);
		printf("\tWarp size:                                     %d\n", deviceProp.warpSize);
		printf("\tMaximum number of threads per multiprocessor:  %d\n", deviceProp.maxThreadsPerMultiProcessor);
		printf("\tMaximum number of threads per block:           %d\n", deviceProp.maxThreadsPerBlock);
		printf("\tMax dimension size of a thread block (x,y,z): (%d, %d, %d)\n",
			deviceProp.maxThreadsDim[0],
			deviceProp.maxThreadsDim[1],
			deviceProp.maxThreadsDim[2]);
		printf("\tMax dimension size of a grid size    (x,y,z): (%d, %d, %d)\n",
			deviceProp.maxGridSize[0],
			deviceProp.maxGridSize[1],
			deviceProp.maxGridSize[2]);
		printf("\tMaximum memory pitch:                          %lu bytes\n", deviceProp.memPitch);
		printf("\tTexture alignment:                             %lu bytes\n", deviceProp.textureAlignment);
		printf("\tConcurrent copy and kernel execution:          %s with %d copy engine(s)\n", (deviceProp.deviceOverlap ? "Yes" : "No"), deviceProp.asyncEngineCount);
		printf("\tRun time limit on kernels:                     %s\n", deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No");
		printf("\tIntegrated GPU sharing Host Memory:            %s\n", deviceProp.integrated ? "Yes" : "No");
		printf("\tSupport host page-locked memory mapping:       %s\n", deviceProp.canMapHostMemory ? "Yes" : "No");
		printf("\tAlignment requirement for Surfaces:            %s\n", deviceProp.surfaceAlignment ? "Yes" : "No");
		printf("\tDevice has ECC support:                        %s\n", deviceProp.ECCEnabled ? "Enabled" : "Disabled");
	}
	return 0;
}

Le fichier main.cpp complet :

/***************************************
	Auteur : Pierre Aubert
	Mail : pierre.aubert@lapp.in2p3.fr
	Licence : CeCILL-C
****************************************/

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#include <cuda.h>
#include <cuda_runtime.h>

#include <sstream>
#include <iomanip>
#include <iostream>

// Beginning of GPU Architecture definitions
///Get the number of CUDA cores per multiprocessor (SM) for a given compute capability
/**	@param major : major number of the compute capability (SM version) of the device
 * 	@param minor : minor number of the compute capability (SM version) of the device
 * 	@return number of CUDA cores per multiprocessor for this SM version.
 * 		If the SM version is not in the table, a message is printed and the value of the last entry of the table is returned
*/
inline int _ConvertSMVer2Cores(int major, int minor){
	// Defines for GPU Architecture types (using the SM version to determine the # of cores per SM)
	typedef struct
	{
		int SM; // 0xMm (hexadecimal notation), M = SM Major version, and m = SM minor version
		int Cores;
	} sSMtoCores;

	// The table is read-only, static const avoids rebuilding it at every call
	static const sSMtoCores nGpuArchCoresPerSM[] =
	{
		{ 0x10,  8 },	// Tesla Generation (SM 1.0) G80 class
		{ 0x11,  8 },	// Tesla Generation (SM 1.1) G8x class
		{ 0x12,  8 },	// Tesla Generation (SM 1.2) G9x class
		{ 0x13,  8 },	// Tesla Generation (SM 1.3) GT200 class
		{ 0x20, 32 },	// Fermi Generation (SM 2.0) GF100 class
		{ 0x21, 48 },	// Fermi Generation (SM 2.1) GF10x class
		{ 0x30, 192},	// Kepler Generation (SM 3.0) GK10x class
		{ 0x32, 192},	// Kepler Generation (SM 3.2) GK10x class
		{ 0x35, 192},	// Kepler Generation (SM 3.5) GK11x class
		{ 0x37, 192},	// Kepler Generation (SM 3.7) GK21x class
		{ 0x50, 128},	// Maxwell Generation (SM 5.0) GM10x class
		{ 0x52, 128},	// Maxwell Generation (SM 5.2)
		{ 0x53, 128},	// Maxwell Generation (SM 5.3)
		{ 0x60, 64},	// Pascal Generation (SM 6.0)
		{ 0x61, 128},	// Pascal Generation (SM 6.1)
		{ 0x62, 128},	// Pascal Generation (SM 6.2)
		{ 0x70, 64},	// Volta Generation (SM 7.0)
		{ 0x72, 64},	// Xavier Generation (SM 7.2)
		{ 0x75, 64},	// Turing Generation (SM 7.5)
		{ 0x80, 64},	// Ampere Generation (SM 8.0)
		{ 0x86, 128},	// Ampere Generation (SM 8.6)
		{ 0x87, 128},	// Ampere Generation (SM 8.7)
		{ 0x89, 128},	// Ada Lovelace Generation (SM 8.9)
		{ 0x90, 128},	// Hopper Generation (SM 9.0)
		{   -1, -1 }	// Sentinel : end of the table
	};
	// Same 0xMm encoding as the SM field of the table
	const int smVersion = (major << 4) + minor;
	int index = 0;
	while (nGpuArchCoresPerSM[index].SM != -1){
		if (nGpuArchCoresPerSM[index].SM == smVersion){
			return nGpuArchCoresPerSM[index].Cores;
		}
		index++;
	}
	// If we don't find the values, we default use the previous one to run properly
	// (index is on the sentinel here, so index-1 is the last real entry of the table)
	printf("\tMapSMtoCores for SM %d.%d is undefined.  Default to use %d Cores/SM\n", major, minor, nGpuArchCoresPerSM[index-1].Cores);
	return nGpuArchCoresPerSM[index-1].Cores;
}

///Build the hexadecimal representation of the UUID of the given Cuda Device
/**	@param deviceProp : properties of the Cuda Device to be used
 * 	@return UUID written as hexadecimal digits, two per byte, for the 16 bytes read, without any separator
*/
std::string phoenix_cuda_getUuid(const cudaDeviceProp & deviceProp){
	// The bytes are read as unsigned so that they are printed as values in [0, 255]
	const unsigned char * uuidBegin = reinterpret_cast<const unsigned char *>(deviceProp.uuid.bytes);
	const unsigned char * uuidEnd = uuidBegin + 16;
	std::stringstream hexStream;
	// std::hex and std::setfill stay active on the stream, they are set once
	hexStream << std::hex << std::setfill('0');
	for(const unsigned char * ptrByte = uuidBegin; ptrByte != uuidEnd; ++ptrByte){
		// std::setw only applies to the next insertion, so it is given for each byte
		hexStream << std::setw(2) << static_cast<unsigned int>(*ptrByte);
	}
	return hexStream.str();
}

int main(int argc, char** argv){
	int deviceCount = 0;
	cudaError_t error_id = cudaGetDeviceCount(&deviceCount);

	if(error_id != cudaSuccess){
		printf("cudaGetDeviceCount returned %d\n-> %s\n", (int)error_id, cudaGetErrorString(error_id));
		printf("deviceCount = %d\n", deviceCount);
		printf("Result = FAIL\n");
		exit(EXIT_FAILURE);
	}
	// This function call returns 0 if there are no CUDA capable devices.
	if(deviceCount == 0){
		printf("There are no available device(s) that support CUDA\n");
	}else{
		printf("Detected %d CUDA Capable device(s)\n", deviceCount);
	}

	int driverVersion = 0, runtimeVersion = 0;
	for(int dev = 0; dev < deviceCount; ++dev){
		cudaSetDevice(dev);
		cudaDeviceProp deviceProp;
		cudaGetDeviceProperties(&deviceProp, dev);
		
		printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name);
		std::string uuid(phoenix_cuda_getUuid(deviceProp));
		printf("\tUUID :                                         \"%s\"\n", uuid.c_str());
		cudaDriverGetVersion(&driverVersion);
		cudaRuntimeGetVersion(&runtimeVersion);
		printf("\tCUDA Driver Version / Runtime Version          %d.%d / %d.%d\n", 
			driverVersion/1000, (driverVersion%100)/10, runtimeVersion/1000, (runtimeVersion%100)/10);
		printf("\tCUDA Capability Major/Minor version number:    %d.%d\n", deviceProp.major, deviceProp.minor);
		char msg[256];
		sprintf(msg, "\tTotal amount of global memory:                 %.0f MBytes (%llu bytes)\n",
			(float)deviceProp.totalGlobalMem/1048576.0f, (unsigned long long) deviceProp.totalGlobalMem);
		printf("%s", msg);
		int nbCore = _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor);
		printf("\t(%2d) Multiprocessors, (%3d) CUDA Cores/MP:     %d CUDA Cores\n",
			deviceProp.multiProcessorCount,
			nbCore,
			nbCore * deviceProp.multiProcessorCount);
		printf("\tGPU Clock rate:                                %.0f MHz (%0.2f GHz)\n", deviceProp.clockRate * 1e-3f, deviceProp.clockRate * 1e-6f);
		
		printf("\tMaximum Texture Dimension Size (x,y,z)         1D=(%d), 2D=(%d, %d), 3D=(%d, %d, %d)\n",
			deviceProp.maxTexture1D   , deviceProp.maxTexture2D[0], deviceProp.maxTexture2D[1],
			deviceProp.maxTexture3D[0], deviceProp.maxTexture3D[1], deviceProp.maxTexture3D[2]);
		printf("\tMaximum Layered 1D Texture Size, (num) layers  1D=(%d), %d layers\n",
			deviceProp.maxTexture1DLayered[0], deviceProp.maxTexture1DLayered[1]);
		printf("\tMaximum Layered 2D Texture Size, (num) layers  2D=(%d, %d), %d layers\n",
			deviceProp.maxTexture2DLayered[0], deviceProp.maxTexture2DLayered[1], deviceProp.maxTexture2DLayered[2]);
		printf("\tTotal amount of constant memory:               %lu bytes\n", deviceProp.totalConstMem);
		printf("\tTotal amount of shared memory per block:       %lu bytes\n", deviceProp.sharedMemPerBlock);
		printf("\tTotal number of registers available per block: %d\n", deviceProp.regsPerBlock);
		printf("\tWarp size:                                     %d\n", deviceProp.warpSize);
		printf("\tMaximum number of threads per multiprocessor:  %d\n", deviceProp.maxThreadsPerMultiProcessor);
		printf("\tMaximum number of threads per block:           %d\n", deviceProp.maxThreadsPerBlock);
		printf("\tMax dimension size of a thread block (x,y,z): (%d, %d, %d)\n",
			deviceProp.maxThreadsDim[0],
			deviceProp.maxThreadsDim[1],
			deviceProp.maxThreadsDim[2]);
		printf("\tMax dimension size of a grid size    (x,y,z): (%d, %d, %d)\n",
			deviceProp.maxGridSize[0],
			deviceProp.maxGridSize[1],
			deviceProp.maxGridSize[2]);
		printf("\tMaximum memory pitch:                          %lu bytes\n", deviceProp.memPitch);
		printf("\tTexture alignment:                             %lu bytes\n", deviceProp.textureAlignment);
		printf("\tConcurrent copy and kernel execution:          %s with %d copy engine(s)\n", (deviceProp.deviceOverlap ? "Yes" : "No"), deviceProp.asyncEngineCount);
		printf("\tRun time limit on kernels:                     %s\n", deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No");
		printf("\tIntegrated GPU sharing Host Memory:            %s\n", deviceProp.integrated ? "Yes" : "No");
		printf("\tSupport host page-locked memory mapping:       %s\n", deviceProp.canMapHostMemory ? "Yes" : "No");
		printf("\tAlignment requirement for Surfaces:            %s\n", deviceProp.surfaceAlignment ? "Yes" : "No");
		printf("\tDevice has ECC support:                        %s\n", deviceProp.ECCEnabled ? "Enabled" : "Disabled");
	}
	return 0;
}

Vous pouvez le télécharger ici.