4.1.7.2 : Le fichier Cuda

Écrivons le fichier gray_scott_cuda.cu :

Commençons par les includes :
1
2
3
4
#include <stdio.h>
#include <stdlib.h>            //pour avoir abort()
#include <math.h>
#include "gray_scott_cuda.h"


Puis la fonction qui calcule la réaction de Gray Scott :
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
///Propagate the U and V species in the matU and matV with CUda
/**	@param[out] outMatU : updated matrix U version (with padding)
 * 	@param[out] outMatV : updated matrix V version (with padding)
 * 	@param matU : input of matrix U (with padding)
 * 	@param matV : input of matrix V (with padding)
 * 	@param nbRow : number of rows of the matrices (with padding)
 * 	@param nbCol : number of columns of the matrices (with padding)
 * 	@param matDeltaSquare : matrix of the delta square values
 * 	@param nbStencilRow : number of rows of the matrix matDeltaSquare
 * 	@param nbStencilCol : number of columns of the matrix matDeltaSquare
 * 	@param diffudionRateU : diffusion rate of the U specie
 * 	@param diffudionRateV : diffusion rate of the V specie
 * 	@param feedRate : rate of the process which feeds U and drains U, V and P
 * 	@param killRate : rate of the process which converts V into P
 * 	@param dt : time interval between two steps
*/
__global__ void gray_scott_cuda_kernel(float * outMatU, float * outMatV, const float * matU, const float * matV, size_t nbRow, size_t nbCol,
		       const float * matDeltaSquare, long nbStencilRow, long nbStencilCol,
		       float diffudionRateU, float diffusionRateV, float feedRate, float killRate, float dt)
{


Il faut calculer l'élément courant que l'on doit traiter avec la position du thread courant :
1
2
3
4
5
6
7
	// identifiant de thread a deux dimensions, comme la matrice
	// La bonne nouvelle c'est que l'on a un thread par element
	int indexElRow = blockIdx.x*blockDim.x + threadIdx.x;	//on rows
	int indexElCol = blockIdx.y*blockDim.y + threadIdx.y;	//on columns
	
	int i = indexElRow + 1;	//We have one padding top (and bottom)
	int j = indexElCol + 1;	//We have one padding left (and right)


On lit les données de la cellule courante :
1
	float u = matU[i*nbCol + j], v = matV[i*nbCol + j];


On applique notre stencil :
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
	float fullU = 0.0f, fullV = 0.0f;
	for(long k = 0l; k < nbStencilRow; ++k){
		for(long l = 0l; l < nbStencilCol; ++l){
			float deltaSquare = matDeltaSquare[k*nbStencilCol + l];
			
			long idxRow = i + k - 1;	//i shifted of -1, 0 or 1
			long idxCol = j + l - 1;	//j shifted of -1, 0 or 1
			
			fullU += (matU[idxRow*nbCol + idxCol] - u)*deltaSquare;
			fullV += (matV[idxRow*nbCol + idxCol] - v)*deltaSquare;
		}
	}
	float uvSquare = u*v*v;
	float du = diffudionRateU*fullU - uvSquare + feedRate*(1.0f - u);
	float dv = diffusionRateV*fullV + uvSquare - (feedRate + killRate)*v;


On sauve le résultat :
1
2
3
	outMatU[i*nbCol + j] = u + du*dt;
	outMatV[i*nbCol + j] = v + dv*dt;
}


On implemente une petite fonction qui va nous permettre de calculer la taille d'un bloc en fonction de la taille de l'image en entrée :
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
///Get the size of a block (works for both X and Y dimensions)
/**	@param maxBlockSize : maximum size of a block on the given dimension
 * 	@param imageSize : size of the image on this dimension
 * 	@return size of the block which evenly divides imageSize (so the grid
 * 		covers the image exactly), or imageSize itself when the image is
 * 		smaller than maxBlockSize
*/
int getBlockSize(int maxBlockSize, size_t imageSize){
	//A small image fits entirely in a single block
	if((long)imageSize < maxBlockSize){return (int)imageSize;}
	int blockSize = maxBlockSize;
	int restSize = imageSize % blockSize;
	//Decrease the block size until it divides the image size
	//(blockSize == 1 always divides, so the loop ends with a valid size;
	// the old `if(blockSize == 0)` fallback was unreachable and is removed)
	while(restSize != 0 && blockSize > 1){
		--blockSize;
		restSize = imageSize % blockSize;
	}
	return blockSize;
}


Une fonction qui échange des pointeurs de données :
1
2
3
4
5
6
7
8
9
///Swap two float pointers
/**	@param[out] ptr1 : first pointer to be swapped
 * 	@param[out] ptr2 : second pointer to be swapped
*/
void swapPointer(float ** ptr1, float ** ptr2){
	float * keepSecond = *ptr2;
	*ptr2 = *ptr1;
	*ptr1 = keepSecond;
}


La fonction C qui appellera notre kernel de calcul :
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
///Interface of the Gray Scott reaction kernel in Cuda
/**	@param[out] matOutV : result of the computing (size : nbImage*nbRow*nbCol)
 * 	@param matInU : input matrix of U concentration (size : paddedNbRow*paddedNbCol)
 * 	@param matInV : input matrix of V concentration (size : paddedNbRow*paddedNbCol)
 * 	@param nbImage : total number of images to be created
 * 	@param nbExtraStep : number of extra steps to be computed between images
 * 	@param nbRow : number of rows of the images to be created
 * 	@param nbCol : number of columns of the images to be created
 * 	@param paddedNbRow : padded number of rows of the images to be created
 * 	@param paddedNbCol : number of columns of the images to be created
 * 	@param matDeltaSquare : matrix of the delta square values (size : nbStencilRow*nbStencilCol)
 * 	@param nbStencilRow : number of rows of the matrix matDeltaSquare
 * 	@param nbStencilCol : number of columns of the matrix matDeltaSquare
 * 	@param diffusionRateU : diffusion rate of the U specie
 * 	@param diffusionRateV : diffusion rate of the V specie
 * 	@param feedRate : rate of the process which feeds U and drains U, V and P
 * 	@param killRate : rate of the process which converts V into P
 * 	@param dt : time interval between two computation
 * 	@param maxNbThreadPerBlockX : maximum number of thread per block on X
 * 	@param maxNbBlockX : maximum number of block in the grid on X
 * 	@param totalGpuMemory : total available memory on GPU
*/
void gray_scott_cuda(float * matOutV, const float * matInU, const float * matInV,
			size_t nbImage, size_t nbExtraStep, size_t nbRow, size_t nbCol,
			size_t paddedNbRow, size_t paddedNbCol,
			const float * matDeltaSquare, long nbStencilRow, long nbStencilCol,
			float diffusionRateU, float diffusionRateV, float feedRate, float killRate, float dt,
			int maxNbThreadPerBlockX, int maxNbBlockX, size_t totalGpuMemory)
{


On commence par afficher des paramètres généraux sur le calcul :
1
2
3
4
5
6
7
	size_t totalNeededMemory = (nbStencilRow*nbStencilCol + paddedNbRow*paddedNbCol*4lu + nbImage*nbRow*nbCol)*sizeof(float);
	size_t nbGpuCall = 1lu + totalNeededMemory/(totalGpuMemory - 290000000lu);	//Combien de fois il faudra appeler le kernel pour traiter toutes les images
	printf("gray_scott_cuda : number of bunch %lu\n", nbGpuCall);
	size_t nbImagePerCall = nbImage/nbGpuCall;			//Nombre d'images traitees par le GPU en une fois
	printf("gray_scott_cuda : nbImagePerCall = %lu\n", nbImagePerCall);
	size_t nbImageLastCall = nbImage - nbImagePerCall*nbGpuCall;	//Dernières images a traiter
	printf("gray_scott_cuda : nbImageLastCall = %lu\n", nbImageLastCall);


On alloue les données de notre stencil sur le GPU :
1
2
3
4
	//Allocation des donnees sur GPU
	size_t sizeDeltaSquareByte = nbStencilRow*nbStencilCol*sizeof(float);
	float * dmatDeltaSquare = NULL;
	cudaMalloc((void**)&dmatDeltaSquare, sizeDeltaSquareByte);


On vérifie que l'allocation s'est bien passée :
1
2
	PLIB_CUDA_CHECK_FILE
	


Ensuite on alloue les matrices de concentration de nos produit U et V en entrée et sortie :
1
2
3
4
5
6
7
8
9
10
11
12
13
	size_t sizeMatFloat = nbRow*nbCol;
	size_t sizeMatByte = sizeMatFloat*sizeof(float);
	size_t sizePaddedMatByte = paddedNbRow*paddedNbCol*sizeof(float);
	float *doutMatU = NULL, *doutMatV, *dmatU = NULL, *dmatV = NULL;	//Le d est pour device, pour se rappeler que ce sont les donnees sur le GPU
	cudaMalloc((void**)&doutMatU, sizePaddedMatByte);
	PLIB_CUDA_CHECK_FILE
	cudaMalloc((void**)&doutMatV, sizePaddedMatByte);
	PLIB_CUDA_CHECK_FILE
	cudaMalloc((void**)&dmatU, sizePaddedMatByte);
	PLIB_CUDA_CHECK_FILE
	cudaMalloc((void**)&dmatV, sizePaddedMatByte);
	PLIB_CUDA_CHECK_FILE
	


On déclare notre premier pointeur qui sera échangé par la suite :
1
	float *dRefFirstOutputV = doutMatV;


On alloue le gros tableau de résultats sur le GPU :
1
2
3
4
5
6
7
8
	size_t sizeMatOutputInByte = sizeMatByte*nbImagePerCall;		//Le gros tableau de resultat, côte GPU
	float * dMatOutV = NULL;
	cudaError_t err = cudaMalloc((void**)&dMatOutV, sizeMatOutputInByte);
	if(err != cudaSuccess){
		printf("gray_scott_cuda : Cannot allocate temporary of %lu, %lu bytes : %s \n", nbImagePerCall, sizeMatOutputInByte, cudaGetErrorString(err));
		abort();
	}
	


On calcule la taille des blocs que nous utiliserons pour exécuter notre kernel de calcul :
1
2
3
4
5
6
7
	int maxBlockSize = sqrt(maxNbThreadPerBlockX);
	//Convention X => row, Y => col
	int dimBlockX = getBlockSize(maxBlockSize, nbRow), dimBlockY = getBlockSize(maxBlockSize, nbCol);
	int dimGridX = nbRow/dimBlockX, dimGridY = nbCol/dimBlockY;
	printf("gray_scott_cuda : Grid size (%d, %d, 1)\n", dimGridX, dimGridY);
	printf("gray_scott_cuda : block size (%d, %d, 1)\n", dimBlockX, dimBlockY);
	


On définit la taille de la grille et celle des blocs (il y aura aussi des clusters dans CUDA 12) :
1
2
3
	dim3 dimGrid(dimGridX, dimGridY, 1);
	dim3 dimBlock(dimBlockX, dimBlockY, 1);
	


On copie les données de l'hôte au GPU :
1
2
3
4
5
6
7
8
9
	//Data transfert from host to device
	cudaMemcpy(dmatDeltaSquare, matDeltaSquare, sizeDeltaSquareByte, cudaMemcpyHostToDevice);
	PLIB_CUDA_CHECK_FILE
	
	cudaMemcpy(dmatU, matInU, sizePaddedMatByte, cudaMemcpyHostToDevice);
	PLIB_CUDA_CHECK_FILE
	cudaMemcpy(dmatV, matInV, sizePaddedMatByte, cudaMemcpyHostToDevice);
	PLIB_CUDA_CHECK_FILE
	


On finalise les calculs pour prendre en compte le padding de nos tableaux :
1
2
	size_t colPitch = paddedNbCol*sizeof(float), colShift = paddedNbCol + 1lu;	//colShift is added with float, so there is no sizeof(float) needed
	size_t sizeCol = nbCol*sizeof(float);


On boucle sur les bunchs (dans le cas où le GPU que l'on utilise a moins de mémoire que le nombre de résultats total que l'on veut calculer) :
1
2
	//let's call the computation
	for(size_t i = 0lu; i < nbGpuCall; ++i){


On boucle sur toutes les images que l'on veut calculer par bunch :
1
2
		printf("gray_scott_cuda : bunch %lu/%lu\n", (i+1), nbGpuCall);
		for(size_t j = 0lu; j < nbImagePerCall; ++j){


On boucle sur les étapes intermédiaires ce qui nous permet d'accélérer la vitesse de la réaction :
1
2
3
4
5
			for(size_t k = 0lu; k < nbExtraStep; ++k){
				//Let's call the kernel
				gray_scott_cuda_kernel<<<dimGrid, dimBlock>>>(doutMatU, doutMatV, dmatU, dmatV, paddedNbRow, paddedNbCol,
										dmatDeltaSquare, nbStencilRow, nbStencilCol,
										diffusionRateU, diffusionRateV, feedRate, killRate, dt);


On échange nos pointeurs ce qui évite une copie :
1
2
3
4
				//Now pointer swap
				swapPointer(&doutMatU, &dmatU);
				swapPointer(&doutMatV, &dmatV);
			}


On copie nos résultats du temporaire au tableau de résultats (tous sur le GPU) en faisant attention d'utiliser les bons pointeurs :
1
2
3
4
5
6
7
8
9
10
11
			//To deal with 2d copy :
			// https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g3a58270f6775efe56c65ac47843e7cee
			size_t shifImage = sizeMatFloat*j;	//shift is in float
			if(doutMatV == dRefFirstOutputV){
				cudaMemcpy2D(dMatOutV + shifImage, sizeCol, doutMatV + colShift, colPitch, sizeCol, nbRow, cudaMemcpyDeviceToDevice);
				PLIB_CUDA_CHECK_FILE
			}else{
				cudaMemcpy2D(dMatOutV + shifImage, sizeCol, dmatV + colShift, colPitch, sizeCol, nbRow, cudaMemcpyDeviceToDevice);
				PLIB_CUDA_CHECK_FILE
			}
		}


On transfère nos résultats du GPU à l'hôte :
1
2
3
4
		//Flush computed data to Host
		cudaMemcpy(matOutV + i*nbImagePerCall*sizeMatFloat, dMatOutV, sizeMatOutputInByte, cudaMemcpyDeviceToHost);
		PLIB_CUDA_CHECK_FILE
	}


On traite le cas où il reste des images à traiter, car le nombre total d'images à calculer n'est pas forcément un multiple du nombre d'images traitées par bunch :
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
	if(nbImageLastCall != 0lu){	//If there are remaning images to compute
		printf("gray_scott_cuda : last bunch of %lu image(s) after %lu bunche(s) of %lu images each\n", nbImageLastCall, nbGpuCall, nbImagePerCall);
		for(size_t j = 0lu; j < nbImageLastCall; ++j){
			for(size_t k = 0lu; k < nbExtraStep; ++k){
				//Let's call the kernel
				gray_scott_cuda_kernel<<<dimGrid, dimBlock>>>(doutMatU, doutMatV, dmatU, dmatV, paddedNbRow, paddedNbCol,
										dmatDeltaSquare, nbStencilRow, nbStencilCol,
										diffusionRateU, diffusionRateV, feedRate, killRate, dt);
				//Now pointer swap
				swapPointer(&doutMatU, &dmatU);
				swapPointer(&doutMatV, &dmatV);
			}
			//To deal with 2d copy : 
			// https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g3a58270f6775efe56c65ac47843e7cee
			size_t shifImage = sizeMatFloat*j;	//shift is in float
			if(doutMatV == dRefFirstOutputV){
				cudaMemcpy2D(dMatOutV + shifImage, sizeCol, doutMatV + colShift, colPitch, sizeCol, nbRow, cudaMemcpyDeviceToDevice);
				PLIB_CUDA_CHECK_FILE
			}else{
				cudaMemcpy2D(dMatOutV + shifImage, sizeCol, dmatV + colShift, colPitch, sizeCol, nbRow, cudaMemcpyDeviceToDevice);
				PLIB_CUDA_CHECK_FILE
			}
		}
		//Flush computed data to Host
		cudaMemcpy(matOutV + nbGpuCall*nbImagePerCall*sizeMatFloat, dMatOutV, nbImageLastCall*sizeMatByte, cudaMemcpyDeviceToHost);
		PLIB_CUDA_CHECK_FILE
	}


On libère la mémoire allouée sur le GPU :
1
2
3
4
5
6
7
8
	//Free the temporaries
	cudaFree(doutMatU);
	cudaFree(doutMatV);
	cudaFree(dmatU);
	cudaFree(dmatV);
	//Free the big images result
	cudaFree(dMatOutV);
}


Appelons notre kernel de calcul sans utiliser de tableau de résultats temporaire sur le GPU. Notez que ce sera moins efficace, mais comme cela nous pourrons évaluer à quel point :
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
///Interface of the Gray Scott reaction kernel in Cuda
/**	@param[out] matOutV : result of the computing (size : nbImage*nbRow*nbCol)
 * 	@param matInU : input matrix of U concentration (size : paddedNbRow*paddedNbCol)
 * 	@param matInV : input matrix of V concentration (size : paddedNbRow*paddedNbCol)
 * 	@param nbImage : total number of images to be created
 * 	@param nbExtraStep : number of extra steps to be computed between images
 * 	@param nbRow : number of rows of the images to be created
 * 	@param nbCol : number of columns of the images to be created
 * 	@param paddedNbRow : padded number of rows of the images to be created
 * 	@param paddedNbCol : number of columns of the images to be created
 * 	@param matDeltaSquare : matrix of the delta square values (size : nbStencilRow*nbStencilCol)
 * 	@param nbStencilRow : number of rows of the matrix matDeltaSquare
 * 	@param nbStencilCol : number of columns of the matrix matDeltaSquare
 * 	@param diffusionRateU : diffusion rate of the U specie
 * 	@param diffusionRateV : diffusion rate of the V specie
 * 	@param feedRate : rate of the process which feeds U and drains U, V and P
 * 	@param killRate : rate of the process which converts V into P
 * 	@param dt : time interval between two computation
 * 	@param maxNbThreadPerBlockX : maximum number of thread per block on X
 * 	@param maxNbBlockX : maximum number of block in the grid on X
 * 	@param totalGpuMemory : total available memory on GPU
*/
void gray_scott_cuda_stupid(float * matOutV, const float * matInU, const float * matInV,
			size_t nbImage, size_t nbExtraStep, size_t nbRow, size_t nbCol,
			size_t paddedNbRow, size_t paddedNbCol,
			const float * matDeltaSquare, long nbStencilRow, long nbStencilCol,
			float diffusionRateU, float diffusionRateV, float feedRate, float killRate, float dt,
			int maxNbThreadPerBlockX, int maxNbBlockX, size_t totalGpuMemory)
{


On alloue les données de notre stencil sur le GPU :
1
2
3
4
	//Allocation des donnees sur GPU
	size_t sizeDeltaSquareByte = nbStencilRow*nbStencilCol*sizeof(float);
	float * dmatDeltaSquare = NULL;
	cudaMalloc((void**)&dmatDeltaSquare, sizeDeltaSquareByte);


On vérifie que l'allocation s'est bien passée :
1
2
	PLIB_CUDA_CHECK_FILE
	


Ensuite on alloue les matrices de concentration de nos produit U et V en entrée et sortie :
1
2
3
4
5
6
7
8
9
10
11
12
	size_t sizeMatFloat = nbRow*nbCol;
	size_t sizePaddedMatByte = paddedNbRow*paddedNbCol*sizeof(float);
	float *doutMatU = NULL, *doutMatV, *dmatU = NULL, *dmatV = NULL;	//Le d est pour device, pour se rappeler que ce sont les donnees sur le GPU
	cudaMalloc((void**)&doutMatU, sizePaddedMatByte);
	PLIB_CUDA_CHECK_FILE
	cudaMalloc((void**)&doutMatV, sizePaddedMatByte);
	PLIB_CUDA_CHECK_FILE
	cudaMalloc((void**)&dmatU, sizePaddedMatByte);
	PLIB_CUDA_CHECK_FILE
	cudaMalloc((void**)&dmatV, sizePaddedMatByte);
	PLIB_CUDA_CHECK_FILE
	


On déclare notre premier pointeur qui sera échangé par la suite :
1
2
	float *dRefFirstOutputV = doutMatV;
	


On calcule la taille des blocs que nous utiliserons pour exécuter notre kernel de calcul :
1
2
3
4
5
6
7
	int maxBlockSize = sqrt(maxNbThreadPerBlockX);
	//Convention X => row, Y => col
	int dimBlockX = getBlockSize(maxBlockSize, nbRow), dimBlockY = getBlockSize(maxBlockSize, nbCol);
	int dimGridX = nbRow/dimBlockX, dimGridY = nbCol/dimBlockY;
	printf("gray_scott_cuda_stupid : Grid size (%d, %d, 1)\n", dimGridX, dimGridY);
	printf("gray_scott_cuda_stupid : block size (%d, %d, 1)\n", dimBlockX, dimBlockY);
	


On définit la taille de la grille et celle des blocs (il y aura aussi des clusters dans CUDA 12) :
1
2
3
	dim3 dimGrid(dimGridX, dimGridY, 1);
	dim3 dimBlock(dimBlockX, dimBlockY, 1);
	


On copie les données de l'hôte au GPU :
1
2
3
4
5
6
7
8
9
	//Data transfert from host to device
	cudaMemcpy(dmatDeltaSquare, matDeltaSquare, sizeDeltaSquareByte, cudaMemcpyHostToDevice);
	PLIB_CUDA_CHECK_FILE
	
	cudaMemcpy(dmatU, matInU, sizePaddedMatByte, cudaMemcpyHostToDevice);
	PLIB_CUDA_CHECK_FILE
	cudaMemcpy(dmatV, matInV, sizePaddedMatByte, cudaMemcpyHostToDevice);
	PLIB_CUDA_CHECK_FILE
	


On finalise les calculs pour prendre en compte le padding de nos tableaux :
1
2
	size_t colPitch = paddedNbCol*sizeof(float), colShift = paddedNbCol + 1lu;	//colShift is added with float, so there is no sizeof(float) needed
	size_t sizeCol = nbCol*sizeof(float);


On boucle sur toutes les images à calculer car nous n'avons pas de problème de mémoire puisque nous n'utilisons pas de gros temporaire pour stocker nos résultats :
1
2
	//let's call the computation
	for(size_t j = 0lu; j < nbImage; ++j){


On boucle sur les étapes intermédiaires ce qui nous permet d'accélérer la vitesse de la réaction :
1
2
3
4
5
		for(size_t k = 0lu; k < nbExtraStep; ++k){
			//Let's call the kernel
			gray_scott_cuda_kernel<<<dimGrid, dimBlock>>>(doutMatU, doutMatV, dmatU, dmatV, paddedNbRow, paddedNbCol,
									dmatDeltaSquare, nbStencilRow, nbStencilCol,
									diffusionRateU, diffusionRateV, feedRate, killRate, dt);


On échange nos pointeurs ce qui évite une copie :
1
2
3
4
			//Now pointer swap
			swapPointer(&doutMatU, &dmatU);
			swapPointer(&doutMatV, &dmatV);
		}


On copie nos résultats du temporaire au tableau de résultats (tous sur le GPU) en faisant attention d'utiliser les bons pointeurs :
1
2
3
4
5
6
7
8
9
10
11
		//To deal with 2d copy :
		// https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g3a58270f6775efe56c65ac47843e7cee
		float * outputMatPtr = matOutV + sizeMatFloat*j;
		if(doutMatV == dRefFirstOutputV){
			cudaMemcpy2D(outputMatPtr, sizeCol, doutMatV + colShift, colPitch, sizeCol, nbRow, cudaMemcpyDeviceToHost);
			PLIB_CUDA_CHECK_FILE
		}else{
			cudaMemcpy2D(outputMatPtr, sizeCol, dmatV + colShift, colPitch, sizeCol, nbRow, cudaMemcpyDeviceToHost);
			PLIB_CUDA_CHECK_FILE
		}
	}


On libère la mémoire allouée sur le GPU :
1
2
3
4
5
6
	//Free the temporaries
	cudaFree(doutMatU);
	cudaFree(doutMatV);
	cudaFree(dmatU);
	cudaFree(dmatV);
}


Le fichier gray_scott_cuda.cu complet :

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
/***************************************
	Auteur : Pierre Aubert
	Mail : pierre.aubert@lapp.in2p3.fr
	Licence : CeCILL-C
****************************************/
#include <stdio.h>
#include <stdlib.h>            //pour avoir abort()
#include <math.h>
#include "gray_scott_cuda.h"

///Propagate the U and V species in the matU and matV with CUDA
/**	One thread updates exactly one interior cell. The host wrapper sizes the
 * 	grid so that it covers the interior exactly (see getBlockSize), but an
 * 	explicit bounds guard is kept as a safety net for other launch configs.
 * 	@param[out] outMatU : updated matrix U version (with padding)
 * 	@param[out] outMatV : updated matrix V version (with padding)
 * 	@param matU : input of matrix U (with padding)
 * 	@param matV : input of matrix V (with padding)
 * 	@param nbRow : number of rows of the matrices (with padding)
 * 	@param nbCol : number of columns of the matrices (with padding)
 * 	@param matDeltaSquare : matrix of the delta square values (stencil weights)
 * 	@param nbStencilRow : number of rows of the matrix matDeltaSquare
 * 	@param nbStencilCol : number of columns of the matrix matDeltaSquare
 * 	@param diffudionRateU : diffusion rate of the U specie
 * 	@param diffusionRateV : diffusion rate of the V specie
 * 	@param feedRate : rate of the process which feeds U and drains U, V and P
 * 	@param killRate : rate of the process which converts V into P
 * 	@param dt : time interval between two steps
*/
__global__ void gray_scott_cuda_kernel(float * outMatU, float * outMatV, const float * matU, const float * matV, size_t nbRow, size_t nbCol,
		       const float * matDeltaSquare, long nbStencilRow, long nbStencilCol,
		       float diffudionRateU, float diffusionRateV, float feedRate, float killRate, float dt)
{
	// Two dimensional thread index, matching the matrix layout
	// (convention : X => row, Y => column, one thread per element)
	int indexElRow = blockIdx.x*blockDim.x + threadIdx.x;	//on rows
	int indexElCol = blockIdx.y*blockDim.y + threadIdx.y;	//on columns
	
	int i = indexElRow + 1;	//We have one padding top (and bottom)
	int j = indexElCol + 1;	//We have one padding left (and right)
	//Bounds guard : never touch the padding border nor memory past it when
	//the grid is larger than the interior (i and j are >= 1 by construction)
	if((size_t)i >= nbRow - 1lu || (size_t)j >= nbCol - 1lu){return;}
	//Read the current cell once
	float u = matU[i*nbCol + j], v = matV[i*nbCol + j];
	//Apply the laplacian stencil around the current cell
	float fullU = 0.0f, fullV = 0.0f;
	for(long k = 0l; k < nbStencilRow; ++k){
		for(long l = 0l; l < nbStencilCol; ++l){
			float deltaSquare = matDeltaSquare[k*nbStencilCol + l];
			
			long idxRow = i + k - 1;	//i shifted of -1, 0 or 1
			long idxCol = j + l - 1;	//j shifted of -1, 0 or 1
			
			fullU += (matU[idxRow*nbCol + idxCol] - u)*deltaSquare;
			fullV += (matV[idxRow*nbCol + idxCol] - v)*deltaSquare;
		}
	}
	//Gray-Scott reaction terms
	float uvSquare = u*v*v;
	float du = diffudionRateU*fullU - uvSquare + feedRate*(1.0f - u);
	float dv = diffusionRateV*fullV + uvSquare - (feedRate + killRate)*v;
	//Explicit Euler time integration step
	outMatU[i*nbCol + j] = u + du*dt;
	outMatV[i*nbCol + j] = v + dv*dt;
}

///Get the size of a block (works for both X and Y dimensions)
/**	@param maxBlockSize : maximum size of a block on the given dimension
 * 	@param imageSize : size of the image on this dimension
 * 	@return size of the block which evenly divides imageSize (so the grid
 * 		covers the image exactly), or imageSize itself when the image is
 * 		smaller than maxBlockSize
*/
int getBlockSize(int maxBlockSize, size_t imageSize){
	//A small image fits entirely in a single block
	if((long)imageSize < maxBlockSize){return (int)imageSize;}
	int blockSize = maxBlockSize;
	int restSize = imageSize % blockSize;
	//Decrease the block size until it divides the image size
	//(blockSize == 1 always divides, so the loop ends with a valid size;
	// the old `if(blockSize == 0)` fallback was unreachable and is removed)
	while(restSize != 0 && blockSize > 1){
		--blockSize;
		restSize = imageSize % blockSize;
	}
	return blockSize;
}

///Swap two float pointers
/**	@param[out] ptr1 : first pointer to be swapped
 * 	@param[out] ptr2 : second pointer to be swapped
*/
void swapPointer(float ** ptr1, float ** ptr2){
	float * keepSecond = *ptr2;
	*ptr2 = *ptr1;
	*ptr1 = keepSecond;
}

///Interface of the Gray Scott reaction kernel in Cuda
/**	@param[out] matOutV : result of the computing (size : nbImage*nbRow*nbCol)
 * 	@param matInU : input matrix of U concentration (size : paddedNbRow*paddedNbCol)
 * 	@param matInV : input matrix of V concentration (size : paddedNbRow*paddedNbCol)
 * 	@param nbImage : total number of images to be created
 * 	@param nbExtraStep : number of extra steps to be computed between images
 * 	@param nbRow : number of rows of the images to be created
 * 	@param nbCol : number of columns of the images to be created
 * 	@param paddedNbRow : padded number of rows of the images to be created
 * 	@param paddedNbCol : padded number of columns of the images to be created
 * 	@param matDeltaSquare : matrix of the delta square values (size : nbStencilRow*nbStencilCol)
 * 	@param nbStencilRow : number of rows of the matrix matDeltaSquare
 * 	@param nbStencilCol : number of columns of the matrix matDeltaSquare
 * 	@param diffusionRateU : diffusion rate of the U specie
 * 	@param diffusionRateV : diffusion rate of the V specie
 * 	@param feedRate : rate of the process which feeds U and drains U, V and P
 * 	@param killRate : rate of the process which converts V into P
 * 	@param dt : time interval between two computation
 * 	@param maxNbThreadPerBlockX : maximum number of thread per block on X
 * 	@param maxNbBlockX : maximum number of block in the grid on X
 * 	@param totalGpuMemory : total available memory on GPU
*/
void gray_scott_cuda(float * matOutV, const float * matInU, const float * matInV,
			size_t nbImage, size_t nbExtraStep, size_t nbRow, size_t nbCol,
			size_t paddedNbRow, size_t paddedNbCol,
			const float * matDeltaSquare, long nbStencilRow, long nbStencilCol,
			float diffusionRateU, float diffusionRateV, float feedRate, float killRate, float dt,
			int maxNbThreadPerBlockX, int maxNbBlockX, size_t totalGpuMemory)
{
	//Estimate the GPU memory footprint : stencil + 4 padded work matrices + all result images
	size_t totalNeededMemory = (nbStencilRow*nbStencilCol + paddedNbRow*paddedNbCol*4lu + nbImage*nbRow*nbCol)*sizeof(float);
	//How many kernel-call bunches are needed to process all the images
	//(290000000 bytes are kept as a safety margin ; assumes totalGpuMemory is larger than that — TODO confirm)
	size_t nbGpuCall = 1lu + totalNeededMemory/(totalGpuMemory - 290000000lu);
	printf("gray_scott_cuda : number of bunch %lu\n", nbGpuCall);
	size_t nbImagePerCall = nbImage/nbGpuCall;			//Number of images computed by the GPU in one bunch
	printf("gray_scott_cuda : nbImagePerCall = %lu\n", nbImagePerCall);
	size_t nbImageLastCall = nbImage - nbImagePerCall*nbGpuCall;	//Remaining images for the last partial bunch
	printf("gray_scott_cuda : nbImageLastCall = %lu\n", nbImageLastCall);
	//Allocate the stencil on the GPU
	size_t sizeDeltaSquareByte = nbStencilRow*nbStencilCol*sizeof(float);
	float * dmatDeltaSquare = NULL;
	cudaMalloc((void**)&dmatDeltaSquare, sizeDeltaSquareByte);
	PLIB_CUDA_CHECK_FILE
	
	size_t sizeMatFloat = nbRow*nbCol;
	size_t sizeMatByte = sizeMatFloat*sizeof(float);
	size_t sizePaddedMatByte = paddedNbRow*paddedNbCol*sizeof(float);
	//The d prefix stands for device, to remember these are the GPU-side buffers
	//(doutMatV is now initialised to NULL like its siblings)
	float *doutMatU = NULL, *doutMatV = NULL, *dmatU = NULL, *dmatV = NULL;
	cudaMalloc((void**)&doutMatU, sizePaddedMatByte);
	PLIB_CUDA_CHECK_FILE
	cudaMalloc((void**)&doutMatV, sizePaddedMatByte);
	PLIB_CUDA_CHECK_FILE
	cudaMalloc((void**)&dmatU, sizePaddedMatByte);
	PLIB_CUDA_CHECK_FILE
	cudaMalloc((void**)&dmatV, sizePaddedMatByte);
	PLIB_CUDA_CHECK_FILE
	
	//Big device-side result buffer holding one full bunch of V images
	size_t sizeMatOutputInByte = sizeMatByte*nbImagePerCall;
	float * dMatOutV = NULL;
	cudaError_t err = cudaMalloc((void**)&dMatOutV, sizeMatOutputInByte);
	if(err != cudaSuccess){
		printf("gray_scott_cuda : Cannot allocate temporary of %lu, %lu bytes : %s \n", nbImagePerCall, sizeMatOutputInByte, cudaGetErrorString(err));
		abort();
	}
	
	//Square blocks : at most maxNbThreadPerBlockX threads per block in total
	int maxBlockSize = sqrt(maxNbThreadPerBlockX);
	//Convention X => row, Y => col
	int dimBlockX = getBlockSize(maxBlockSize, nbRow), dimBlockY = getBlockSize(maxBlockSize, nbCol);
	//getBlockSize guarantees the divisions below are exact
	int dimGridX = nbRow/dimBlockX, dimGridY = nbCol/dimBlockY;
	printf("gray_scott_cuda : Grid size (%d, %d, 1)\n", dimGridX, dimGridY);
	printf("gray_scott_cuda : block size (%d, %d, 1)\n", dimBlockX, dimBlockY);
	
	dim3 dimGrid(dimGridX, dimGridY, 1);
	dim3 dimBlock(dimBlockX, dimBlockY, 1);
	
	//Data transfert from host to device
	cudaMemcpy(dmatDeltaSquare, matDeltaSquare, sizeDeltaSquareByte, cudaMemcpyHostToDevice);
	PLIB_CUDA_CHECK_FILE
	
	cudaMemcpy(dmatU, matInU, sizePaddedMatByte, cudaMemcpyHostToDevice);
	PLIB_CUDA_CHECK_FILE
	cudaMemcpy(dmatV, matInV, sizePaddedMatByte, cudaMemcpyHostToDevice);
	PLIB_CUDA_CHECK_FILE
	
	size_t colPitch = paddedNbCol*sizeof(float), colShift = paddedNbCol + 1lu;	//colShift is added with float, so there is no sizeof(float) needed
	size_t sizeCol = nbCol*sizeof(float);
	//let's call the computation
	for(size_t i = 0lu; i < nbGpuCall; ++i){
		printf("gray_scott_cuda : bunch %lu/%lu\n", (i+1), nbGpuCall);
		for(size_t j = 0lu; j < nbImagePerCall; ++j){
			for(size_t k = 0lu; k < nbExtraStep; ++k){
				//Let's call the kernel
				gray_scott_cuda_kernel<<<dimGrid, dimBlock>>>(doutMatU, doutMatV, dmatU, dmatV, paddedNbRow, paddedNbCol,
										dmatDeltaSquare, nbStencilRow, nbStencilCol,
										diffusionRateU, diffusionRateV, feedRate, killRate, dt);
				//Swap the pointers instead of copying : the freshest state becomes the next input
				swapPointer(&doutMatU, &dmatU);
				swapPointer(&doutMatV, &dmatV);
			}
			//To deal with 2d copy :
			// https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g3a58270f6775efe56c65ac47843e7cee
			size_t shifImage = sizeMatFloat*j;	//shift is in float
			//After each launch the pointers are swapped, so dmatV ALWAYS references
			//the buffer holding the latest V state (the input itself when
			//nbExtraStep == 0). The previous `doutMatV == dRefFirstOutputV` test
			//copied a one-step-stale buffer whenever it was true ; copy dmatV instead.
			cudaMemcpy2D(dMatOutV + shifImage, sizeCol, dmatV + colShift, colPitch, sizeCol, nbRow, cudaMemcpyDeviceToDevice);
			PLIB_CUDA_CHECK_FILE
		}
		//Flush computed data to Host
		cudaMemcpy(matOutV + i*nbImagePerCall*sizeMatFloat, dMatOutV, sizeMatOutputInByte, cudaMemcpyDeviceToHost);
		PLIB_CUDA_CHECK_FILE
	}
	if(nbImageLastCall != 0lu){	//If there are remaning images to compute
		printf("gray_scott_cuda : last bunch of %lu image(s) after %lu bunche(s) of %lu images each\n", nbImageLastCall, nbGpuCall, nbImagePerCall);
		for(size_t j = 0lu; j < nbImageLastCall; ++j){
			for(size_t k = 0lu; k < nbExtraStep; ++k){
				//Let's call the kernel
				gray_scott_cuda_kernel<<<dimGrid, dimBlock>>>(doutMatU, doutMatV, dmatU, dmatV, paddedNbRow, paddedNbCol,
										dmatDeltaSquare, nbStencilRow, nbStencilCol,
										diffusionRateU, diffusionRateV, feedRate, killRate, dt);
				//Now pointer swap
				swapPointer(&doutMatU, &dmatU);
				swapPointer(&doutMatV, &dmatV);
			}
			//To deal with 2d copy :
			// https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g3a58270f6775efe56c65ac47843e7cee
			size_t shifImage = sizeMatFloat*j;	//shift is in float
			//Same reasoning as above : dmatV holds the latest V state after the swaps
			cudaMemcpy2D(dMatOutV + shifImage, sizeCol, dmatV + colShift, colPitch, sizeCol, nbRow, cudaMemcpyDeviceToDevice);
			PLIB_CUDA_CHECK_FILE
		}
		//Flush computed data to Host
		cudaMemcpy(matOutV + nbGpuCall*nbImagePerCall*sizeMatFloat, dMatOutV, nbImageLastCall*sizeMatByte, cudaMemcpyDeviceToHost);
		PLIB_CUDA_CHECK_FILE
	}
	//Free the temporaries
	cudaFree(doutMatU);
	cudaFree(doutMatV);
	cudaFree(dmatU);
	cudaFree(dmatV);
	//Free the big images result
	cudaFree(dMatOutV);
}

///Interface of the Gray Scott reaction kernel in Cuda
/**	@param[out] matOutV : result of the computing (size : nbImage*nbRow*nbCol)
 * 	@param matInU : input matrix of U concentration (size : paddedNbRow*paddedNbCol)
 * 	@param matInV : input matrix of V concentration (size : paddedNbRow*paddedNbCol)
 * 	@param nbImage : total number of images to be created
 * 	@param nbExtraStep : number of extra steps to be computed between images
 * 	@param nbRow : number of rows of the images to be created
 * 	@param nbCol : number of columns of the images to be created
 * 	@param paddedNbRow : padded number of rows of the images to be created
 * 	@param paddedNbCol : padded number of columns of the images to be created
 * 	@param matDeltaSquare : matrix of the delta square values (size : nbStencilRow*nbStencilCol)
 * 	@param nbStencilRow : number of rows of the matrix matDeltaSquare
 * 	@param nbStencilCol : number of columns of the matrix matDeltaSquare
 * 	@param diffusionRateU : diffusion rate of the U specie
 * 	@param diffusionRateV : diffusion rate of the V specie
 * 	@param feedRate : rate of the process which feeds U and drains U, V and P
 * 	@param killRate : rate of the process which converts V into P
 * 	@param dt : time interval between two computation
 * 	@param maxNbThreadPerBlockX : maximum number of thread per block on X
 * 	@param maxNbBlockX : maximum number of block in the grid on X
 * 	@param totalGpuMemory : total available memory on GPU
*/
void gray_scott_cuda_stupid(float * matOutV, const float * matInU, const float * matInV,
			size_t nbImage, size_t nbExtraStep, size_t nbRow, size_t nbCol,
			size_t paddedNbRow, size_t paddedNbCol,
			const float * matDeltaSquare, long nbStencilRow, long nbStencilCol,
			float diffusionRateU, float diffusionRateV, float feedRate, float killRate, float dt,
			int maxNbThreadPerBlockX, int maxNbBlockX, size_t totalGpuMemory)
{
	//Allocation of the data on the GPU
	size_t sizeDeltaSquareByte = nbStencilRow*nbStencilCol*sizeof(float);
	float * dmatDeltaSquare = NULL;
	cudaMalloc((void**)&dmatDeltaSquare, sizeDeltaSquareByte);
	PLIB_CUDA_CHECK_FILE
	
	size_t sizePaddedMatByte = paddedNbRow*paddedNbCol*sizeof(float);
	//The leading d stands for device, to remember these data live on the GPU
	float *doutMatU = NULL, *doutMatV = NULL, *dmatU = NULL, *dmatV = NULL;
	cudaMalloc((void**)&doutMatU, sizePaddedMatByte);
	PLIB_CUDA_CHECK_FILE
	cudaMalloc((void**)&doutMatV, sizePaddedMatByte);
	PLIB_CUDA_CHECK_FILE
	cudaMalloc((void**)&dmatU, sizePaddedMatByte);
	PLIB_CUDA_CHECK_FILE
	cudaMalloc((void**)&dmatV, sizePaddedMatByte);
	PLIB_CUDA_CHECK_FILE
	
	//Launch configuration : a square-ish block, at most maxNbThreadPerBlockX threads in total
	int maxBlockSize = sqrt(maxNbThreadPerBlockX);
	//Convention X => row, Y => col
	int dimBlockX = getBlockSize(maxBlockSize, nbRow), dimBlockY = getBlockSize(maxBlockSize, nbCol);
	//NOTE(review): assumes getBlockSize returns an exact divisor of nbRow/nbCol,
	//otherwise the integer division below drops the tail rows/columns — confirm
	int dimGridX = nbRow/dimBlockX, dimGridY = nbCol/dimBlockY;
	printf("gray_scott_cuda_stupid : Grid size (%d, %d, 1)\n", dimGridX, dimGridY);
	printf("gray_scott_cuda_stupid : block size (%d, %d, 1)\n", dimBlockX, dimBlockY);
	
	dim3 dimGrid(dimGridX, dimGridY, 1);
	dim3 dimBlock(dimBlockX, dimBlockY, 1);
	
	//Data transfert from host to device
	cudaMemcpy(dmatDeltaSquare, matDeltaSquare, sizeDeltaSquareByte, cudaMemcpyHostToDevice);
	PLIB_CUDA_CHECK_FILE
	
	cudaMemcpy(dmatU, matInU, sizePaddedMatByte, cudaMemcpyHostToDevice);
	PLIB_CUDA_CHECK_FILE
	cudaMemcpy(dmatV, matInV, sizePaddedMatByte, cudaMemcpyHostToDevice);
	PLIB_CUDA_CHECK_FILE
	
	size_t sizeMatFloat = nbRow*nbCol;	//number of pixels of one output image
	//colShift skips the one-row + one-column top-left padding; it is added to a
	//float pointer, so there is no sizeof(float) needed
	size_t colPitch = paddedNbCol*sizeof(float), colShift = paddedNbCol + 1lu;
	size_t sizeCol = nbCol*sizeof(float);
	//Let's call the computation
	for(size_t j = 0lu; j < nbImage; ++j){
		for(size_t k = 0lu; k < nbExtraStep; ++k){
			//Let's call the kernel
			gray_scott_cuda_kernel<<<dimGrid, dimBlock>>>(doutMatU, doutMatV, dmatU, dmatV, paddedNbRow, paddedNbCol,
									dmatDeltaSquare, nbStencilRow, nbStencilCol,
									diffusionRateU, diffusionRateV, feedRate, killRate, dt);
			//Now pointer swap
			swapPointer(&doutMatU, &dmatU);
			swapPointer(&doutMatV, &dmatV);
		}
		//To deal with 2d copy :
		// https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g3a58270f6775efe56c65ac47843e7cee
		float * outputMatPtr = matOutV + sizeMatFloat*j;
		//A pointer swap follows every kernel launch, so the freshest V matrix is
		//always in dmatV (and if nbExtraStep == 0, dmatV still holds the current
		//state). The previous buffer-identity test could copy a one-step-stale
		//image when the swap count was even.
		//The 2D copy strips the one-element padding border of the device matrix.
		cudaMemcpy2D(outputMatPtr, sizeCol, dmatV + colShift, colPitch, sizeCol, nbRow, cudaMemcpyDeviceToHost);
		PLIB_CUDA_CHECK_FILE
	}
	//Free the temporaries
	cudaFree(doutMatU);
	cudaFree(doutMatV);
	cudaFree(dmatU);
	cudaFree(dmatV);
	//Free the stencil too (was leaked before)
	cudaFree(dmatDeltaSquare);
}


Vous pouvez le télécharger ici.