4.1.7.2 : Le fichier Cuda

Écrivons le fichier gray_scott_cuda.cu :

Commençons par les includes :
1
2
3
4
#include <stdio.h>
#include <stdlib.h>            //pour avoir abort()
#include <math.h>
#include "gray_scott_cuda.h"


Puis la fonction qui calcule la réaction de Gray Scott :
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
///Propagate the U and V species in the matU and matV with CUda
/**	@param[out] outMatU : updated matrix U version (with padding)
 * 	@param[out] outMatV : updated matrix V version (with padding)
 * 	@param matU : input of matrix U (with padding)
 * 	@param matV : input of matrix V (with padding)
 * 	@param nbRow : number of rows of the matrices (with padding)
 * 	@param nbCol : number of columns of the matrices (with padding)
 * 	@param matDeltaSquare : matrix of the delta square values
 * 	@param nbStencilRow : number of rows of the matrix matDeltaSquare
 * 	@param nbStencilCol : number of columns of the matrix matDeltaSquare
 * 	@param diffudionRateU : diffusion rate of the U specie
 * 	@param diffudionRateV : diffusion rate of the V specie
 * 	@param feedRate : rate of the process which feeds U and drains U, V and P
 * 	@param killRate : rate of the process which converts V into P
 * 	@param dt : time interval between two steps
*/
__global__ void gray_scott_cuda_kernel(float * outMatU, float * outMatV, const float * matU, const float * matV, size_t nbRow, size_t nbCol,
		       const float * matDeltaSquare, long nbStencilRow, long nbStencilCol,
		       float diffudionRateU, float diffusionRateV, float feedRate, float killRate, float dt)
{


Il faut calculer l'élément courant que l'on doit traiter avec la position du thread courant :
1
2
3
4
5
6
7
	// identifiant de thread a deux dimensions, comme la matrice
	// La bonne nouvelle c'est que l'on a un thread par element
	int indexElRow = blockIdx.x*blockDim.x + threadIdx.x;	//on rows
	int indexElCol = blockIdx.y*blockDim.y + threadIdx.y;	//on columns
	
	int i = indexElRow + 1;	//We have one padding top (and bottom)
	int j = indexElCol + 1;	//We have one padding left (and right)


On lit les données de la cellule courante :
1
	float u = matU[i*nbCol + j], v = matV[i*nbCol + j];


On applique notre stencil :
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
	float fullU = 0.0f, fullV = 0.0f;
	for(long k = 0l; k < nbStencilRow; ++k){
		for(long l = 0l; l < nbStencilCol; ++l){
			float deltaSquare = matDeltaSquare[k*nbStencilCol + l];
			
			long idxRow = i + k - 1;	//i shifted of -1, 0 or 1
			long idxCol = j + l - 1;	//j shifted of -1, 0 or 1
			
			fullU += (matU[idxRow*nbCol + idxCol] - u)*deltaSquare;
			fullV += (matV[idxRow*nbCol + idxCol] - v)*deltaSquare;
		}
	}
	float uvSquare = u*v*v;
	float du = diffudionRateU*fullU - uvSquare + feedRate*(1.0f - u);
	float dv = diffusionRateV*fullV + uvSquare - (feedRate + killRate)*v;


On sauve le résultat :
1
2
3
	outMatU[i*nbCol + j] = u + du*dt;
	outMatV[i*nbCol + j] = v + dv*dt;
}


On implemente une petite fonction qui va nous permettre de calculer la taille d'un bloc en fonction de la taille de l'image en entrée :
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
///Get the size of a block (works for both X and Y dimensions)
/**	@param maxBlockSize : maximum size of a block on the given dimension
 * 	@param imageSize : size of the image on this dimension
 * 	@return size of the block which evenly divides imageSize (so the grid
 * 		covers the image exactly), or imageSize itself when the image is
 * 		smaller than maxBlockSize
*/
int getBlockSize(int maxBlockSize, size_t imageSize){
	//A small image fits entirely in a single block
	if((long)imageSize < maxBlockSize){return (int)imageSize;}
	int blockSize = maxBlockSize;
	int restSize = imageSize % blockSize;
	//Decrease the block size until it divides the image size
	//(blockSize == 1 always divides, so the loop ends with a valid size;
	// the old `if(blockSize == 0)` fallback was unreachable and is removed)
	while(restSize != 0 && blockSize > 1){
		--blockSize;
		restSize = imageSize % blockSize;
	}
	return blockSize;
}


Une fonction qui échange des pointeurs de données :
1
2
3
4
5
6
7
8
9
///Swap two float pointers
/**	@param[out] ptr1 : first pointer to be swapped
 * 	@param[out] ptr2 : second pointer to be swapped
*/
void swapPointer(float ** ptr1, float ** ptr2){
	float * keepSecond = *ptr2;
	*ptr2 = *ptr1;
	*ptr1 = keepSecond;
}


La fonction C qui appellera notre kernel de calcul :
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
///Interface of the Gray Scott reaction kernel in Cuda
/**	@param[out] matOutV : result of the computing (size : nbImage*nbRow*nbCol)
 * 	@param matInU : input matrix of U concentration (size : paddedNbRow*paddedNbCol)
 * 	@param matInV : input matrix of V concentration (size : paddedNbRow*paddedNbCol)
 * 	@param nbImage : total number of images to be created
 * 	@param nbExtraStep : number of extra steps to be computed between images
 * 	@param nbRow : number of rows of the images to be created
 * 	@param nbCol : number of columns of the images to be created
 * 	@param paddedNbRow : padded number of rows of the images to be created
 * 	@param paddedNbCol : number of columns of the images to be created
 * 	@param matDeltaSquare : matrix of the delta square values (size : nbStencilRow*nbStencilCol)
 * 	@param nbStencilRow : number of rows of the matrix matDeltaSquare
 * 	@param nbStencilCol : number of columns of the matrix matDeltaSquare
 * 	@param diffusionRateU : diffusion rate of the U specie
 * 	@param diffusionRateV : diffusion rate of the V specie
 * 	@param feedRate : rate of the process which feeds U and drains U, V and P
 * 	@param killRate : rate of the process which converts V into P
 * 	@param dt : time interval between two computation
 * 	@param maxNbThreadPerBlockX : maximum number of thread per block on X
 * 	@param maxNbBlockX : maximum number of block in the grid on X
 * 	@param totalGpuMemory : total available memory on GPU
*/
void gray_scott_cuda(float * matOutV, const float * matInU, const float * matInV,
			size_t nbImage, size_t nbExtraStep, size_t nbRow, size_t nbCol,
			size_t paddedNbRow, size_t paddedNbCol,
			const float * matDeltaSquare, long nbStencilRow, long nbStencilCol,
			float diffusionRateU, float diffusionRateV, float feedRate, float killRate, float dt,
			int maxNbThreadPerBlockX, int maxNbBlockX, size_t totalGpuMemory)
{


On commence par afficher des paramètres généraux sur le calcul :
1
2
3
4
5
6
7
	size_t totalNeededMemory = (nbStencilRow*nbStencilCol + paddedNbRow*paddedNbCol*4lu + nbImage*nbRow*nbCol)*sizeof(float);
	size_t nbGpuCall = 1lu + totalNeededMemory/(totalGpuMemory - 290000000lu);	//Combien de fois il faudra appeler le kernel pour traiter toutes les images
	printf("gray_scott_cuda : number of bunch %lu\n", nbGpuCall);
	size_t nbImagePerCall = nbImage/nbGpuCall;			//Nombre d'images traitees par le GPU en une fois
	printf("gray_scott_cuda : nbImagePerCall = %lu\n", nbImagePerCall);
	size_t nbImageLastCall = nbImage - nbImagePerCall*nbGpuCall;	//Dernières images a traiter
	printf("gray_scott_cuda : nbImageLastCall = %lu\n", nbImageLastCall);


On alloue les données de notre stencil sur le GPU :
1
2
3
4
	//Allocation des donnees sur GPU
	size_t sizeDeltaSquareByte = nbStencilRow*nbStencilCol*sizeof(float);
	float * dmatDeltaSquare = NULL;
	cudaMalloc((void**)&dmatDeltaSquare, sizeDeltaSquareByte);


On vérifie que l'allocation s'est bien passée :
1
2
	PLIB_CUDA_CHECK_FILE
	


Ensuite on alloue les matrices de concentration de nos produit U et V en entrée et sortie :
1
2
3
4
5
6
7
8
9
10
11
12
13
	size_t sizeMatFloat = nbRow*nbCol;
	size_t sizeMatByte = sizeMatFloat*sizeof(float);
	size_t sizePaddedMatByte = paddedNbRow*paddedNbCol*sizeof(float);
	float *doutMatU = NULL, *doutMatV, *dmatU = NULL, *dmatV = NULL;	//Le d est pour device, pour se rappeler que ce sont les donnees sur le GPU
	cudaMalloc((void**)&doutMatU, sizePaddedMatByte);
	PLIB_CUDA_CHECK_FILE
	cudaMalloc((void**)&doutMatV, sizePaddedMatByte);
	PLIB_CUDA_CHECK_FILE
	cudaMalloc((void**)&dmatU, sizePaddedMatByte);
	PLIB_CUDA_CHECK_FILE
	cudaMalloc((void**)&dmatV, sizePaddedMatByte);
	PLIB_CUDA_CHECK_FILE
	


On déclare notre premier pointeur qui sera échangé par la suite :
1
	float *dRefFirstOutputV = doutMatV;


On alloue le gros tableau de résultats sur le GPU :
1
2
3
4
5
6
7
8
	size_t sizeMatOutputInByte = sizeMatByte*nbImagePerCall;		//Le gros tableau de resultat, côte GPU
	float * dMatOutV = NULL;
	cudaError_t err = cudaMalloc((void**)&dMatOutV, sizeMatOutputInByte);
	if(err != cudaSuccess){
		printf("gray_scott_cuda : Cannot allocate temporary of %lu, %lu bytes : %s \n", nbImagePerCall, sizeMatOutputInByte, cudaGetErrorString(err));
		abort();
	}
	


On calcule la taille des blocs que nous utiliserons pour exécuter notre kernel de calcul :
1
2
3
4
5
6
7
	int maxBlockSize = sqrt(maxNbThreadPerBlockX);
	//Convention X => row, Y => col
	int dimBlockX = getBlockSize(maxBlockSize, nbRow), dimBlockY = getBlockSize(maxBlockSize, nbCol);
	int dimGridX = nbRow/dimBlockX, dimGridY = nbCol/dimBlockY;
	printf("gray_scott_cuda : Grid size (%d, %d, 1)\n", dimGridX, dimGridY);
	printf("gray_scott_cuda : block size (%d, %d, 1)\n", dimBlockX, dimBlockY);
	


On définit la taille de la grille et celle des blocs (il y aura aussi des clusters dans CUDA 12) :
1
2
3
	dim3 dimGrid(dimGridX, dimGridY, 1);
	dim3 dimBlock(dimBlockX, dimBlockY, 1);
	


On copie les données de l'hôte au GPU :
1
2
3
4
5
6
7
8
9
	//Data transfert from host to device
	cudaMemcpy(dmatDeltaSquare, matDeltaSquare, sizeDeltaSquareByte, cudaMemcpyHostToDevice);
	PLIB_CUDA_CHECK_FILE
	
	cudaMemcpy(dmatU, matInU, sizePaddedMatByte, cudaMemcpyHostToDevice);
	PLIB_CUDA_CHECK_FILE
	cudaMemcpy(dmatV, matInV, sizePaddedMatByte, cudaMemcpyHostToDevice);
	PLIB_CUDA_CHECK_FILE
	


On finalise les calculs pour prendre en compte le padding de nos tableaux :
1
2
	size_t colPitch = paddedNbCol*sizeof(float), colShift = paddedNbCol + 1lu;	//colShift is added with float, so there is no sizeof(float) needed
	size_t sizeCol = nbCol*sizeof(float);


On boucle sur les bunchs (dans le cas où le GPU que l'on utilise a moins de mémoire que le nombre de résultats total que l'on veut calculer) :
1
2
	//let's call the computation
	for(size_t i = 0lu; i < nbGpuCall; ++i){


On boucle sur toutes les images que l'on veut calculer par bunch :
1
2
		printf("gray_scott_cuda : bunch %lu/%lu\n", (i+1), nbGpuCall);
		for(size_t j = 0lu; j < nbImagePerCall; ++j){


On boucle sur les étapes intermédiaires ce qui nous permet d'accélérer la vitesse de la réaction :
1
2
3
4
5
			for(size_t k = 0lu; k < nbExtraStep; ++k){
				//Let's call the kernel
				gray_scott_cuda_kernel<<<dimGrid, dimBlock>>>(doutMatU, doutMatV, dmatU, dmatV, paddedNbRow, paddedNbCol,
										dmatDeltaSquare, nbStencilRow, nbStencilCol,
										diffusionRateU, diffusionRateV, feedRate, killRate, dt);


On échange nos pointeurs ce qui évite une copie :
1
2
3
4
				//Now pointer swap
				swapPointer(&doutMatU, &dmatU);
				swapPointer(&doutMatV, &dmatV);
			}


On copie nos résultats du temporaire au tableau de résultats (tous sur le GPU) en faisant attention d'utiliser les bons pointeurs :
1
2
3
4
5
6
7
8
9
10
11
			//To deal with 2d copy :
			// https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g3a58270f6775efe56c65ac47843e7cee
			size_t shifImage = sizeMatFloat*j;	//shift is in float
			if(doutMatV == dRefFirstOutputV){
				cudaMemcpy2D(dMatOutV + shifImage, sizeCol, doutMatV + colShift, colPitch, sizeCol, nbRow, cudaMemcpyDeviceToDevice);
				PLIB_CUDA_CHECK_FILE
			}else{
				cudaMemcpy2D(dMatOutV + shifImage, sizeCol, dmatV + colShift, colPitch, sizeCol, nbRow, cudaMemcpyDeviceToDevice);
				PLIB_CUDA_CHECK_FILE
			}
		}


On transfère nos résultats du GPU à l'hôte :
1
2
3
4
		//Flush computed data to Host
		cudaMemcpy(matOutV + i*nbImagePerCall*sizeMatFloat, dMatOutV, sizeMatOutputInByte, cudaMemcpyDeviceToHost);
		PLIB_CUDA_CHECK_FILE
	}


On traite le cas où il reste des images à traiter, car le nombre total d'images à calculer n'est pas forcément un multiple du nombre d'images traitées par bunch :
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
	if(nbImageLastCall != 0lu){	//If there are remaning images to compute
		printf("gray_scott_cuda : last bunch of %lu image(s) after %lu bunche(s) of %lu images each\n", nbImageLastCall, nbGpuCall, nbImagePerCall);
		for(size_t j = 0lu; j < nbImageLastCall; ++j){
			for(size_t k = 0lu; k < nbExtraStep; ++k){
				//Let's call the kernel
				gray_scott_cuda_kernel<<<dimGrid, dimBlock>>>(doutMatU, doutMatV, dmatU, dmatV, paddedNbRow, paddedNbCol,
										dmatDeltaSquare, nbStencilRow, nbStencilCol,
										diffusionRateU, diffusionRateV, feedRate, killRate, dt);
				//Now pointer swap
				swapPointer(&doutMatU, &dmatU);
				swapPointer(&doutMatV, &dmatV);
			}
			//To deal with 2d copy : 
			// https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g3a58270f6775efe56c65ac47843e7cee
			size_t shifImage = sizeMatFloat*j;	//shift is in float
			if(doutMatV == dRefFirstOutputV){
				cudaMemcpy2D(dMatOutV + shifImage, sizeCol, doutMatV + colShift, colPitch, sizeCol, nbRow, cudaMemcpyDeviceToDevice);
				PLIB_CUDA_CHECK_FILE
			}else{
				cudaMemcpy2D(dMatOutV + shifImage, sizeCol, dmatV + colShift, colPitch, sizeCol, nbRow, cudaMemcpyDeviceToDevice);
				PLIB_CUDA_CHECK_FILE
			}
		}
		//Flush computed data to Host
		cudaMemcpy(matOutV + nbGpuCall*nbImagePerCall*sizeMatFloat, dMatOutV, nbImageLastCall*sizeMatByte, cudaMemcpyDeviceToHost);
		PLIB_CUDA_CHECK_FILE
	}


On libère la mémoire allouée sur le GPU :
1
2
3
4
5
6
7
8
	//Free the temporaries
	cudaFree(doutMatU);
	cudaFree(doutMatV);
	cudaFree(dmatU);
	cudaFree(dmatV);
	//Free the big images result
	cudaFree(dMatOutV);
}


Appelons notre kernel de calcul sans utiliser de tableau de résultats temporaire sur le GPU. Notez que ce sera moins efficace, mais comme cela nous pourrons évaluer à quel point :
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
///Interface of the Gray Scott reaction kernel in Cuda
/**	@param[out] matOutV : result of the computing (size : nbImage*nbRow*nbCol)
 * 	@param matInU : input matrix of U concentration (size : paddedNbRow*paddedNbCol)
 * 	@param matInV : input matrix of V concentration (size : paddedNbRow*paddedNbCol)
 * 	@param nbImage : total number of images to be created
 * 	@param nbExtraStep : number of extra steps to be computed between images
 * 	@param nbRow : number of rows of the images to be created
 * 	@param nbCol : number of columns of the images to be created
 * 	@param paddedNbRow : padded number of rows of the images to be created
 * 	@param paddedNbCol : number of columns of the images to be created
 * 	@param matDeltaSquare : matrix of the delta square values (size : nbStencilRow*nbStencilCol)
 * 	@param nbStencilRow : number of rows of the matrix matDeltaSquare
 * 	@param nbStencilCol : number of columns of the matrix matDeltaSquare
 * 	@param diffusionRateU : diffusion rate of the U specie
 * 	@param diffusionRateV : diffusion rate of the V specie
 * 	@param feedRate : rate of the process which feeds U and drains U, V and P
 * 	@param killRate : rate of the process which converts V into P
 * 	@param dt : time interval between two computation
 * 	@param maxNbThreadPerBlockX : maximum number of thread per block on X
 * 	@param maxNbBlockX : maximum number of block in the grid on X
 * 	@param totalGpuMemory : total available memory on GPU
*/
void gray_scott_cuda_stupid(float * matOutV, const float * matInU, const float * matInV,
			size_t nbImage, size_t nbExtraStep, size_t nbRow, size_t nbCol,
			size_t paddedNbRow, size_t paddedNbCol,
			const float * matDeltaSquare, long nbStencilRow, long nbStencilCol,
			float diffusionRateU, float diffusionRateV, float feedRate, float killRate, float dt,
			int maxNbThreadPerBlockX, int maxNbBlockX, size_t totalGpuMemory)
{


On alloue les données de notre stencil sur le GPU :
1
2
3
4
	//Allocation des donnees sur GPU
	size_t sizeDeltaSquareByte = nbStencilRow*nbStencilCol*sizeof(float);
	float * dmatDeltaSquare = NULL;
	cudaMalloc((void**)&dmatDeltaSquare, sizeDeltaSquareByte);


On vérifie que l'allocation s'est bien passée :
1
2
	PLIB_CUDA_CHECK_FILE
	


Ensuite on alloue les matrices de concentration de nos produit U et V en entrée et sortie :
1
2
3
4
5
6
7
8
9
10
11
12
	size_t sizeMatFloat = nbRow*nbCol;
	size_t sizePaddedMatByte = paddedNbRow*paddedNbCol*sizeof(float);
	float *doutMatU = NULL, *doutMatV, *dmatU = NULL, *dmatV = NULL;	//Le d est pour device, pour se rappeler que ce sont les donnees sur le GPU
	cudaMalloc((void**)&doutMatU, sizePaddedMatByte);
	PLIB_CUDA_CHECK_FILE
	cudaMalloc((void**)&doutMatV, sizePaddedMatByte);
	PLIB_CUDA_CHECK_FILE
	cudaMalloc((void**)&dmatU, sizePaddedMatByte);
	PLIB_CUDA_CHECK_FILE
	cudaMalloc((void**)&dmatV, sizePaddedMatByte);
	PLIB_CUDA_CHECK_FILE
	


On déclare notre premier pointeur qui sera échangé par la suite :
1
2
	float *dRefFirstOutputV = doutMatV;
	


On calcule la taille des blocs que nous utiliserons pour exécuter notre kernel de calcul :
1
2
3
4
5
6
7
	int maxBlockSize = sqrt(maxNbThreadPerBlockX);
	//Convention X => row, Y => col
	int dimBlockX = getBlockSize(maxBlockSize, nbRow), dimBlockY = getBlockSize(maxBlockSize, nbCol);
	int dimGridX = nbRow/dimBlockX, dimGridY = nbCol/dimBlockY;
	printf("gray_scott_cuda_stupid : Grid size (%d, %d, 1)\n", dimGridX, dimGridY);
	printf("gray_scott_cuda_stupid : block size (%d, %d, 1)\n", dimBlockX, dimBlockY);
	


On définit la taille de la grille et celle des blocs (il y aura aussi des clusters dans CUDA 12) :
1
2
3
	dim3 dimGrid(dimGridX, dimGridY, 1);
	dim3 dimBlock(dimBlockX, dimBlockY, 1);
	


On copie les données de l'hôte au GPU :
1
2
3
4
5
6
7
8
9
	//Data transfert from host to device
	cudaMemcpy(dmatDeltaSquare, matDeltaSquare, sizeDeltaSquareByte, cudaMemcpyHostToDevice);
	PLIB_CUDA_CHECK_FILE
	
	cudaMemcpy(dmatU, matInU, sizePaddedMatByte, cudaMemcpyHostToDevice);
	PLIB_CUDA_CHECK_FILE
	cudaMemcpy(dmatV, matInV, sizePaddedMatByte, cudaMemcpyHostToDevice);
	PLIB_CUDA_CHECK_FILE
	


On finalise les calculs pour prendre en compte le padding de nos tableaux :
1
2
	size_t colPitch = paddedNbCol*sizeof(float), colShift = paddedNbCol + 1lu;	//colShift is added with float, so there is no sizeof(float) needed
	size_t sizeCol = nbCol*sizeof(float);


On boucle sur toutes les images à calculer car nous n'avons pas de problème de mémoire puisque nous n'utilisons pas de gros temporaire pour stocker nos résultats :
1
2
	//let's call the computation
	for(size_t j = 0lu; j < nbImage; ++j){


On boucle sur les étapes intermédiaires ce qui nous permet d'accélérer la vitesse de la réaction :
1
2
3
4
5
		for(size_t k = 0lu; k < nbExtraStep; ++k){
			//Let's call the kernel
			gray_scott_cuda_kernel<<<dimGrid, dimBlock>>>(doutMatU, doutMatV, dmatU, dmatV, paddedNbRow, paddedNbCol,
									dmatDeltaSquare, nbStencilRow, nbStencilCol,
									diffusionRateU, diffusionRateV, feedRate, killRate, dt);


On échange nos pointeurs ce qui évite une copie :
1
2
3
4
			//Now pointer swap
			swapPointer(&doutMatU, &dmatU);
			swapPointer(&doutMatV, &dmatV);
		}


On copie nos résultats du temporaire au tableau de résultats (tous sur le GPU) en faisant attention d'utiliser les bons pointeurs :
1
2
3
4
5
6
7
8
9
10
11
		//To deal with 2d copy :
		// https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g3a58270f6775efe56c65ac47843e7cee
		float * outputMatPtr = matOutV + sizeMatFloat*j;
		if(doutMatV == dRefFirstOutputV){
			cudaMemcpy2D(outputMatPtr, sizeCol, doutMatV + colShift, colPitch, sizeCol, nbRow, cudaMemcpyDeviceToHost);
			PLIB_CUDA_CHECK_FILE
		}else{
			cudaMemcpy2D(outputMatPtr, sizeCol, dmatV + colShift, colPitch, sizeCol, nbRow, cudaMemcpyDeviceToHost);
			PLIB_CUDA_CHECK_FILE
		}
	}


On libère la mémoire allouée sur le GPU :
1
2
3
4
5
6
	//Free the temporaries
	cudaFree(doutMatU);
	cudaFree(doutMatV);
	cudaFree(dmatU);
	cudaFree(dmatV);
}


Le fichier gray_scott_cuda.cu complet :

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
/***************************************
	Auteur : Pierre Aubert
	Mail : pierre.aubert@lapp.in2p3.fr
	Licence : CeCILL-C
****************************************/
#include <stdio.h>
#include <stdlib.h>            //pour avoir abort()
#include <math.h>
#include "gray_scott_cuda.h"

///Propagate the U and V species in the matU and matV with CUDA
/**	One thread updates exactly one interior cell. The host wrapper sizes the
 * 	grid so that it covers the interior exactly (see getBlockSize), but an
 * 	explicit bounds guard is kept as a safety net for other launch configs.
 * 	@param[out] outMatU : updated matrix U version (with padding)
 * 	@param[out] outMatV : updated matrix V version (with padding)
 * 	@param matU : input of matrix U (with padding)
 * 	@param matV : input of matrix V (with padding)
 * 	@param nbRow : number of rows of the matrices (with padding)
 * 	@param nbCol : number of columns of the matrices (with padding)
 * 	@param matDeltaSquare : matrix of the delta square values (stencil weights)
 * 	@param nbStencilRow : number of rows of the matrix matDeltaSquare
 * 	@param nbStencilCol : number of columns of the matrix matDeltaSquare
 * 	@param diffudionRateU : diffusion rate of the U specie
 * 	@param diffusionRateV : diffusion rate of the V specie
 * 	@param feedRate : rate of the process which feeds U and drains U, V and P
 * 	@param killRate : rate of the process which converts V into P
 * 	@param dt : time interval between two steps
*/
__global__ void gray_scott_cuda_kernel(float * outMatU, float * outMatV, const float * matU, const float * matV, size_t nbRow, size_t nbCol,
		       const float * matDeltaSquare, long nbStencilRow, long nbStencilCol,
		       float diffudionRateU, float diffusionRateV, float feedRate, float killRate, float dt)
{
	// Two dimensional thread index, matching the matrix layout
	// (convention : X => row, Y => column, one thread per element)
	int indexElRow = blockIdx.x*blockDim.x + threadIdx.x;	//on rows
	int indexElCol = blockIdx.y*blockDim.y + threadIdx.y;	//on columns
	
	int i = indexElRow + 1;	//We have one padding top (and bottom)
	int j = indexElCol + 1;	//We have one padding left (and right)
	//Bounds guard : never touch the padding border nor memory past it when
	//the grid is larger than the interior (i and j are >= 1 by construction)
	if((size_t)i >= nbRow - 1lu || (size_t)j >= nbCol - 1lu){return;}
	//Read the current cell once
	float u = matU[i*nbCol + j], v = matV[i*nbCol + j];
	//Apply the laplacian stencil around the current cell
	float fullU = 0.0f, fullV = 0.0f;
	for(long k = 0l; k < nbStencilRow; ++k){
		for(long l = 0l; l < nbStencilCol; ++l){
			float deltaSquare = matDeltaSquare[k*nbStencilCol + l];
			
			long idxRow = i + k - 1;	//i shifted of -1, 0 or 1
			long idxCol = j + l - 1;	//j shifted of -1, 0 or 1
			
			fullU += (matU[idxRow*nbCol + idxCol] - u)*deltaSquare;
			fullV += (matV[idxRow*nbCol + idxCol] - v)*deltaSquare;
		}
	}
	//Gray-Scott reaction terms
	float uvSquare = u*v*v;
	float du = diffudionRateU*fullU - uvSquare + feedRate*(1.0f - u);
	float dv = diffusionRateV*fullV + uvSquare - (feedRate + killRate)*v;
	//Explicit Euler time integration step
	outMatU[i*nbCol + j] = u + du*dt;
	outMatV[i*nbCol + j] = v + dv*dt;
}

///Get the size of a block (works for both X and Y dimensions)
/**	@param maxBlockSize : maximum size of a block on the given dimension
 * 	@param imageSize : size of the image on this dimension
 * 	@return size of the block which evenly divides imageSize (so the grid
 * 		covers the image exactly), or imageSize itself when the image is
 * 		smaller than maxBlockSize
*/
int getBlockSize(int maxBlockSize, size_t imageSize){
	//A small image fits entirely in a single block
	if((long)imageSize < maxBlockSize){return (int)imageSize;}
	int blockSize = maxBlockSize;
	int restSize = imageSize % blockSize;
	//Decrease the block size until it divides the image size
	//(blockSize == 1 always divides, so the loop ends with a valid size;
	// the old `if(blockSize == 0)` fallback was unreachable and is removed)
	while(restSize != 0 && blockSize > 1){
		--blockSize;
		restSize = imageSize % blockSize;
	}
	return blockSize;
}

///Swap two float pointers
/**	@param[out] ptr1 : first pointer to be swapped
 * 	@param[out] ptr2 : second pointer to be swapped
*/
void swapPointer(float ** ptr1, float ** ptr2){
	float * keepSecond = *ptr2;
	*ptr2 = *ptr1;
	*ptr1 = keepSecond;
}

///Interface of the Gray Scott reaction kernel in Cuda
/**	@param[out] matOutV : result of the computing (size : nbImage*nbRow*nbCol)
 * 	@param matInU : input matrix of U concentration (size : paddedNbRow*paddedNbCol)
 * 	@param matInV : input matrix of V concentration (size : paddedNbRow*paddedNbCol)
 * 	@param nbImage : total number of images to be created
 * 	@param nbExtraStep : number of extra steps to be computed between images
 * 	@param nbRow : number of rows of the images to be created
 * 	@param nbCol : number of columns of the images to be created
 * 	@param paddedNbRow : padded number of rows of the images to be created
 * 	@param paddedNbCol : padded number of columns of the images to be created
 * 	@param matDeltaSquare : matrix of the delta square values (size : nbStencilRow*nbStencilCol)
 * 	@param nbStencilRow : number of rows of the matrix matDeltaSquare
 * 	@param nbStencilCol : number of columns of the matrix matDeltaSquare
 * 	@param diffusionRateU : diffusion rate of the U specie
 * 	@param diffusionRateV : diffusion rate of the V specie
 * 	@param feedRate : rate of the process which feeds U and drains U, V and P
 * 	@param killRate : rate of the process which converts V into P
 * 	@param dt : time interval between two computation
 * 	@param maxNbThreadPerBlockX : maximum number of thread per block on X
 * 	@param maxNbBlockX : maximum number of block in the grid on X
 * 	@param totalGpuMemory : total available memory on GPU
*/
void gray_scott_cuda(float * matOutV, const float * matInU, const float * matInV,
			size_t nbImage, size_t nbExtraStep, size_t nbRow, size_t nbCol,
			size_t paddedNbRow, size_t paddedNbCol,
			const float * matDeltaSquare, long nbStencilRow, long nbStencilCol,
			float diffusionRateU, float diffusionRateV, float feedRate, float killRate, float dt,
			int maxNbThreadPerBlockX, int maxNbBlockX, size_t totalGpuMemory)
{
	//Estimate the GPU memory footprint : stencil + 4 padded work matrices + all result images
	size_t totalNeededMemory = (nbStencilRow*nbStencilCol + paddedNbRow*paddedNbCol*4lu + nbImage*nbRow*nbCol)*sizeof(float);
	//How many kernel-call bunches are needed to process all the images
	//(290000000 bytes are kept as a safety margin ; assumes totalGpuMemory is larger than that — TODO confirm)
	size_t nbGpuCall = 1lu + totalNeededMemory/(totalGpuMemory - 290000000lu);
	printf("gray_scott_cuda : number of bunch %lu\n", nbGpuCall);
	size_t nbImagePerCall = nbImage/nbGpuCall;			//Number of images computed by the GPU in one bunch
	printf("gray_scott_cuda : nbImagePerCall = %lu\n", nbImagePerCall);
	size_t nbImageLastCall = nbImage - nbImagePerCall*nbGpuCall;	//Remaining images for the last partial bunch
	printf("gray_scott_cuda : nbImageLastCall = %lu\n", nbImageLastCall);
	//Allocate the stencil on the GPU
	size_t sizeDeltaSquareByte = nbStencilRow*nbStencilCol*sizeof(float);
	float * dmatDeltaSquare = NULL;
	cudaMalloc((void**)&dmatDeltaSquare, sizeDeltaSquareByte);
	PLIB_CUDA_CHECK_FILE
	
	size_t sizeMatFloat = nbRow*nbCol;
	size_t sizeMatByte = sizeMatFloat*sizeof(float);
	size_t sizePaddedMatByte = paddedNbRow*paddedNbCol*sizeof(float);
	//The d prefix stands for device, to remember these are the GPU-side buffers
	//(doutMatV is now initialised to NULL like its siblings)
	float *doutMatU = NULL, *doutMatV = NULL, *dmatU = NULL, *dmatV = NULL;
	cudaMalloc((void**)&doutMatU, sizePaddedMatByte);
	PLIB_CUDA_CHECK_FILE
	cudaMalloc((void**)&doutMatV, sizePaddedMatByte);
	PLIB_CUDA_CHECK_FILE
	cudaMalloc((void**)&dmatU, sizePaddedMatByte);
	PLIB_CUDA_CHECK_FILE
	cudaMalloc((void**)&dmatV, sizePaddedMatByte);
	PLIB_CUDA_CHECK_FILE
	
	//Big device-side result buffer holding one full bunch of V images
	size_t sizeMatOutputInByte = sizeMatByte*nbImagePerCall;
	float * dMatOutV = NULL;
	cudaError_t err = cudaMalloc((void**)&dMatOutV, sizeMatOutputInByte);
	if(err != cudaSuccess){
		printf("gray_scott_cuda : Cannot allocate temporary of %lu, %lu bytes : %s \n", nbImagePerCall, sizeMatOutputInByte, cudaGetErrorString(err));
		abort();
	}
	
	//Square blocks : at most maxNbThreadPerBlockX threads per block in total
	int maxBlockSize = sqrt(maxNbThreadPerBlockX);
	//Convention X => row, Y => col
	int dimBlockX = getBlockSize(maxBlockSize, nbRow), dimBlockY = getBlockSize(maxBlockSize, nbCol);
	//getBlockSize guarantees the divisions below are exact
	int dimGridX = nbRow/dimBlockX, dimGridY = nbCol/dimBlockY;
	printf("gray_scott_cuda : Grid size (%d, %d, 1)\n", dimGridX, dimGridY);
	printf("gray_scott_cuda : block size (%d, %d, 1)\n", dimBlockX, dimBlockY);
	
	dim3 dimGrid(dimGridX, dimGridY, 1);
	dim3 dimBlock(dimBlockX, dimBlockY, 1);
	
	//Data transfert from host to device
	cudaMemcpy(dmatDeltaSquare, matDeltaSquare, sizeDeltaSquareByte, cudaMemcpyHostToDevice);
	PLIB_CUDA_CHECK_FILE
	
	cudaMemcpy(dmatU, matInU, sizePaddedMatByte, cudaMemcpyHostToDevice);
	PLIB_CUDA_CHECK_FILE
	cudaMemcpy(dmatV, matInV, sizePaddedMatByte, cudaMemcpyHostToDevice);
	PLIB_CUDA_CHECK_FILE
	
	size_t colPitch = paddedNbCol*sizeof(float), colShift = paddedNbCol + 1lu;	//colShift is added with float, so there is no sizeof(float) needed
	size_t sizeCol = nbCol*sizeof(float);
	//let's call the computation
	for(size_t i = 0lu; i < nbGpuCall; ++i){
		printf("gray_scott_cuda : bunch %lu/%lu\n", (i+1), nbGpuCall);
		for(size_t j = 0lu; j < nbImagePerCall; ++j){
			for(size_t k = 0lu; k < nbExtraStep; ++k){
				//Let's call the kernel
				gray_scott_cuda_kernel<<<dimGrid, dimBlock>>>(doutMatU, doutMatV, dmatU, dmatV, paddedNbRow, paddedNbCol,
										dmatDeltaSquare, nbStencilRow, nbStencilCol,
										diffusionRateU, diffusionRateV, feedRate, killRate, dt);
				//Swap the pointers instead of copying : the freshest state becomes the next input
				swapPointer(&doutMatU, &dmatU);
				swapPointer(&doutMatV, &dmatV);
			}
			//To deal with 2d copy :
			// https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g3a58270f6775efe56c65ac47843e7cee
			size_t shifImage = sizeMatFloat*j;	//shift is in float
			//After each launch the pointers are swapped, so dmatV ALWAYS references
			//the buffer holding the latest V state (the input itself when
			//nbExtraStep == 0). The previous `doutMatV == dRefFirstOutputV` test
			//copied a one-step-stale buffer whenever it was true ; copy dmatV instead.
			cudaMemcpy2D(dMatOutV + shifImage, sizeCol, dmatV + colShift, colPitch, sizeCol, nbRow, cudaMemcpyDeviceToDevice);
			PLIB_CUDA_CHECK_FILE
		}
		//Flush computed data to Host
		cudaMemcpy(matOutV + i*nbImagePerCall*sizeMatFloat, dMatOutV, sizeMatOutputInByte, cudaMemcpyDeviceToHost);
		PLIB_CUDA_CHECK_FILE
	}
	if(nbImageLastCall != 0lu){	//If there are remaning images to compute
		printf("gray_scott_cuda : last bunch of %lu image(s) after %lu bunche(s) of %lu images each\n", nbImageLastCall, nbGpuCall, nbImagePerCall);
		for(size_t j = 0lu; j < nbImageLastCall; ++j){
			for(size_t k = 0lu; k < nbExtraStep; ++k){
				//Let's call the kernel
				gray_scott_cuda_kernel<<<dimGrid, dimBlock>>>(doutMatU, doutMatV, dmatU, dmatV, paddedNbRow, paddedNbCol,
										dmatDeltaSquare, nbStencilRow, nbStencilCol,
										diffusionRateU, diffusionRateV, feedRate, killRate, dt);
				//Now pointer swap
				swapPointer(&doutMatU, &dmatU);
				swapPointer(&doutMatV, &dmatV);
			}
			//To deal with 2d copy :
			// https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g3a58270f6775efe56c65ac47843e7cee
			size_t shifImage = sizeMatFloat*j;	//shift is in float
			//Same reasoning as above : dmatV holds the latest V state after the swaps
			cudaMemcpy2D(dMatOutV + shifImage, sizeCol, dmatV + colShift, colPitch, sizeCol, nbRow, cudaMemcpyDeviceToDevice);
			PLIB_CUDA_CHECK_FILE
		}
		//Flush computed data to Host
		cudaMemcpy(matOutV + nbGpuCall*nbImagePerCall*sizeMatFloat, dMatOutV, nbImageLastCall*sizeMatByte, cudaMemcpyDeviceToHost);
		PLIB_CUDA_CHECK_FILE
	}
	//Free the temporaries
	cudaFree(doutMatU);
	cudaFree(doutMatV);
	cudaFree(dmatU);
	cudaFree(dmatV);
	//Free the big images result
	cudaFree(dMatOutV);
}

///Interface of the Gray Scott reaction kernel in Cuda
/**	@param[out] matOutV : result of the computing (size : nbImage*nbRow*nbCol)
 * 	@param matInU : input matrix of U concentration (size : paddedNbRow*paddedNbCol)
 * 	@param matInV : input matrix of V concentration (size : paddedNbRow*paddedNbCol)
 * 	@param nbImage : total number of images to be created
 * 	@param nbExtraStep : number of extra steps to be computed between images
 * 	@param nbRow : number of rows of the images to be created
 * 	@param nbCol : number of columns of the images to be created
 * 	@param paddedNbRow : padded number of rows of the images to be created
 * 	@param paddedNbCol : padded number of columns of the images to be created
 * 	@param matDeltaSquare : matrix of the delta square values (size : nbStencilRow*nbStencilCol)
 * 	@param nbStencilRow : number of rows of the matrix matDeltaSquare
 * 	@param nbStencilCol : number of columns of the matrix matDeltaSquare
 * 	@param diffusionRateU : diffusion rate of the U specie
 * 	@param diffusionRateV : diffusion rate of the V specie
 * 	@param feedRate : rate of the process which feeds U and drains U, V and P
 * 	@param killRate : rate of the process which converts V into P
 * 	@param dt : time interval between two computation
 * 	@param maxNbThreadPerBlockX : maximum number of thread per block on X
 * 	@param maxNbBlockX : maximum number of block in the grid on X
 * 	@param totalGpuMemory : total available memory on GPU
*/
void gray_scott_cuda_stupid(float * matOutV, const float * matInU, const float * matInV,
			size_t nbImage, size_t nbExtraStep, size_t nbRow, size_t nbCol,
			size_t paddedNbRow, size_t paddedNbCol,
			const float * matDeltaSquare, long nbStencilRow, long nbStencilCol,
			float diffusionRateU, float diffusionRateV, float feedRate, float killRate, float dt,
			int maxNbThreadPerBlockX, int maxNbBlockX, size_t totalGpuMemory)
{
	//Allocation of the data on the GPU
	size_t sizeDeltaSquareByte = nbStencilRow*nbStencilCol*sizeof(float);
	float * dmatDeltaSquare = NULL;
	cudaMalloc((void**)&dmatDeltaSquare, sizeDeltaSquareByte);
	PLIB_CUDA_CHECK_FILE
	
	size_t sizePaddedMatByte = paddedNbRow*paddedNbCol*sizeof(float);
	//The leading d stands for device, to remember these data live on the GPU
	float *doutMatU = NULL, *doutMatV = NULL, *dmatU = NULL, *dmatV = NULL;
	cudaMalloc((void**)&doutMatU, sizePaddedMatByte);
	PLIB_CUDA_CHECK_FILE
	cudaMalloc((void**)&doutMatV, sizePaddedMatByte);
	PLIB_CUDA_CHECK_FILE
	cudaMalloc((void**)&dmatU, sizePaddedMatByte);
	PLIB_CUDA_CHECK_FILE
	cudaMalloc((void**)&dmatV, sizePaddedMatByte);
	PLIB_CUDA_CHECK_FILE
	
	//Launch configuration : a square-ish block, at most maxNbThreadPerBlockX threads in total
	int maxBlockSize = sqrt(maxNbThreadPerBlockX);
	//Convention X => row, Y => col
	int dimBlockX = getBlockSize(maxBlockSize, nbRow), dimBlockY = getBlockSize(maxBlockSize, nbCol);
	//NOTE(review): assumes getBlockSize returns an exact divisor of nbRow/nbCol,
	//otherwise the integer division below drops the tail rows/columns — confirm
	int dimGridX = nbRow/dimBlockX, dimGridY = nbCol/dimBlockY;
	printf("gray_scott_cuda_stupid : Grid size (%d, %d, 1)\n", dimGridX, dimGridY);
	printf("gray_scott_cuda_stupid : block size (%d, %d, 1)\n", dimBlockX, dimBlockY);
	
	dim3 dimGrid(dimGridX, dimGridY, 1);
	dim3 dimBlock(dimBlockX, dimBlockY, 1);
	
	//Data transfert from host to device
	cudaMemcpy(dmatDeltaSquare, matDeltaSquare, sizeDeltaSquareByte, cudaMemcpyHostToDevice);
	PLIB_CUDA_CHECK_FILE
	
	cudaMemcpy(dmatU, matInU, sizePaddedMatByte, cudaMemcpyHostToDevice);
	PLIB_CUDA_CHECK_FILE
	cudaMemcpy(dmatV, matInV, sizePaddedMatByte, cudaMemcpyHostToDevice);
	PLIB_CUDA_CHECK_FILE
	
	size_t sizeMatFloat = nbRow*nbCol;	//number of pixels of one output image
	//colShift skips the one-row + one-column top-left padding; it is added to a
	//float pointer, so there is no sizeof(float) needed
	size_t colPitch = paddedNbCol*sizeof(float), colShift = paddedNbCol + 1lu;
	size_t sizeCol = nbCol*sizeof(float);
	//Let's call the computation
	for(size_t j = 0lu; j < nbImage; ++j){
		for(size_t k = 0lu; k < nbExtraStep; ++k){
			//Let's call the kernel
			gray_scott_cuda_kernel<<<dimGrid, dimBlock>>>(doutMatU, doutMatV, dmatU, dmatV, paddedNbRow, paddedNbCol,
									dmatDeltaSquare, nbStencilRow, nbStencilCol,
									diffusionRateU, diffusionRateV, feedRate, killRate, dt);
			//Now pointer swap
			swapPointer(&doutMatU, &dmatU);
			swapPointer(&doutMatV, &dmatV);
		}
		//To deal with 2d copy :
		// https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g3a58270f6775efe56c65ac47843e7cee
		float * outputMatPtr = matOutV + sizeMatFloat*j;
		//A pointer swap follows every kernel launch, so the freshest V matrix is
		//always in dmatV (and if nbExtraStep == 0, dmatV still holds the current
		//state). The previous buffer-identity test could copy a one-step-stale
		//image when the swap count was even.
		//The 2D copy strips the one-element padding border of the device matrix.
		cudaMemcpy2D(outputMatPtr, sizeCol, dmatV + colShift, colPitch, sizeCol, nbRow, cudaMemcpyDeviceToHost);
		PLIB_CUDA_CHECK_FILE
	}
	//Free the temporaries
	cudaFree(doutMatU);
	cudaFree(doutMatV);
	cudaFree(dmatU);
	cudaFree(dmatV);
	//Free the stencil too (was leaked before)
	cudaFree(dmatDeltaSquare);
}


Vous pouvez le télécharger ici.