5.8.6.2 : Le fichier source gray_scott_nvcpp.cpp

Écrivons le fichier gray_scott_nvcpp.cpp :

1
2
#include <string.h>
#include <vector>


Nous aurons besoin de la fonction std::for_each qui est disponible dans algorithm et d'un mode d'execution qui est définit dans execution :

1
2
3
4
5
#include <algorithm>
#include <execution>


#include "gray_scott_nvcpp.h"


Nous aurons besoin d'échanger régulièrement les pointeurs de nos temporaires. Donc nous définissons une fonction pour cela :
1
2
3
4
5
6
7
8
9
///Swap pointer
/**	@param[out] ptr1 : pointer to be swaped
 * 	@param[out] ptr2 : pointer to be swaped
*/
void swapPointer(float *& ptr1, float *& ptr2){
	float *& tmp = ptr1;
	ptr1 = ptr2;
	ptr2 = tmp;
}


Puis nous ajoutons la fonction qui calcule une itération de la réaction de Gray Scott :
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
///Propagate the U and V species in the matU and matV
/**	@param[out] outMatU : updated matrix U version (with padding)
 * 	@param[out] outMatV : updated matrix V version (with padding)
 * 	@param matU : input of matrix U (with padding)
 * 	@param matV : input of matrix V (with padding)
 * 	@param vecIndex : vector of index to compute all pixels
 * 	@param nbCol : number of columns of the matrices (with padding)
 * 	@param paddedNbCol : number of columns of the images to be created
 * 	@param matDeltaSquare : matrix of the delta square values
 * 	@param nbStencilRow : number of rows of the matrix matDeltaSquare
 * 	@param nbStencilCol : number of columns of the matrix matDeltaSquare
 * 	@param diffusionRateU : diffusion rate of the U specie
 * 	@param diffudionRateV : diffusion rate of the V specie
 * 	@param feedRate : rate of the process which feeds U and drains U, V and P
 * 	@param killRate : rate of the process which converts V into P
 * 	@param dt : time interval between two steps
 * 	@param nbStep : number of steps to perform
*/
void gray_scott_nvcpp_step(float * outMatU, float * outMatV, float * matU, float * matV, const std::vector<int> & vecIndex,
			size_t nbCol, size_t paddedNbCol,
			const float * matDeltaSquare, long nbStencilRow, long nbStencilCol,
			float diffusionRateU, float diffusionRateV, float feedRate, float killRate, float dt, size_t nbStep)
{


On commence par itérer sur le nombre d'étapes de calcul (puisque chaque étape dépend de la précédente) :
1
	for(size_t iStep(0lu); iStep < nbStep; ++iStep){	//Perform all the steps


Puis, nous utilisons un std::for_each qui nous permet de traiter chaque pixel de notre image :
1
2
3
		std::for_each(std::execution::par_unseq, vecIndex.begin(), vecIndex.end(),
			[=](int indexPixel)
		{


Le début de l'implémentation est très similaire à l'implémentation CUDA puisque nous avons aussi besoin de connaître les données dont le pixel courrant à besoin :
1
2
3
4
5
			int indexElRow = indexPixel/nbCol;	//on rows
			int indexElCol = indexPixel - indexElRow*nbCol;	//on columns
			
			int i = indexElRow + 1;	//We have one padding top (and bottom)
			int j = indexElCol + 1;	//We have one padding left (and right)


Ensuite, nous calculons la réaction de Gray Scott comme avant :
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
			float u = matU[i*paddedNbCol + j], v = matV[i*paddedNbCol + j];
			float fullU = 0.0f, fullV = 0.0f;
			for(long k = 0l; k < nbStencilRow; ++k){
				for(long l = 0l; l < nbStencilCol; ++l){
					float deltaSquare = matDeltaSquare[k*nbStencilCol + l];
					
					long idxRow = i + k - 1;	//i shifted of -1, 0 or 1
					long idxCol = j + l - 1;	//j shifted of -1, 0 or 1
					
					fullU += (matU[idxRow*paddedNbCol + idxCol] - u)*deltaSquare;
					fullV += (matV[idxRow*paddedNbCol + idxCol] - v)*deltaSquare;
				}
			}
			float uvSquare = u*v*v;
			float du = diffusionRateU*fullU - uvSquare + feedRate*(1.0f - u);
			float dv = diffusionRateV*fullV + uvSquare - (feedRate + killRate)*v;


On sauve les résultats :
1
2
3
			outMatU[i*paddedNbCol + j] = u + du*dt;
			outMatV[i*paddedNbCol + j] = v + dv*dt;
		});


On échange les pointeurs de nos temporaires :
1
2
3
4
5
		//Now pointer swap
		swapPointer(outMatU, matU);
		swapPointer(outMatV, matV);
	}
}


Finalement, la fonction qui calcule la totalité de la réaction de Gray Scott :
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
///Interface of the Gray Scott reaction kernel
/**	@param[out] matOutV : result of the computing (size : nbImage*nbRow*nbCol)
 * 	@param[out] matInU : input matrix of U concentration (size : paddedNbRow*paddedNbCol)
 * 	@param[out] matInV : input matrix of V concentration (size : paddedNbRow*paddedNbCol)
 * 	@param[out] matTmpU : temporary matrix of U concentration (size : paddedNbRow*paddedNbCol)
 * 	@param[out] matTmpV : temporary matrix of V concentration (size : paddedNbRow*paddedNbCol)
 * 	@param nbImage : total number of images to be created
 * 	@param nbExtraStep : number of extra steps to be computed between images
 * 	@param nbGpuCall : number of time to call the GPU to create nbImage
 * 	@param nbRow : number of rows of the images to be created
 * 	@param nbCol : number of columns of the images to be created
 * 	@param paddedNbRow : padded number of rows of the images to be created
 * 	@param paddedNbCol : number of columns of the images to be created
 * 	@param matDeltaSquare : matrix of the delta square values (size : nbStencilRow*nbStencilCol)
 * 	@param nbStencilRow : number of rows of the matrix matDeltaSquare
 * 	@param nbStencilCol : number of columns of the matrix matDeltaSquare
 * 	@param diffusionRateU : diffusion rate of the U specie
 * 	@param diffusionRateV : diffusion rate of the V specie
 * 	@param feedRate : rate of the process which feeds U and drains U, V and P
 * 	@param killRate : rate of the process which converts V into P
 * 	@param dt : time interval between two computation
 * 	@param maxNbThreadPerBlockX : maximum number of thread per block on X
 * 	@param maxNbBlockX : maximum number of block in the grid on X
 * 	@param totalGpuMemory : total available memory on GPU
*/
void gray_scott_nvcpp(float * matOutV, float * matInU, float * matInV, float * matTmpU, float * matTmpV,
			size_t nbImage, size_t nbExtraStep, size_t nbGpuCall, size_t nbRow, size_t nbCol,
			size_t paddedNbRow, size_t paddedNbCol,
			const float * matDeltaSquare, long nbStencilRow, long nbStencilCol,
			float diffusionRateU, float diffusionRateV, float feedRate, float killRate, float dt)
{
	
	size_t sizeMatFloat = nbRow*nbCol;

	size_t nbPixel(nbRow*nbCol);


On initialise le vecteur d'indices noteOn aurait pu utiliser un std::iota :
1
2
3
4
	std::vector<int> vecIndex;
	vecIndex.resize(nbPixel);
	for(size_t i(0lu); i < nbPixel; ++i){vecIndex[i] = (int)i;}
	


Enfin le calcul proprement dit, découpé en bloc afin d'utiliser au mieux la mémoire disponible sur le GPU :
1
2
3
4
5
6
7
8
9
10
11
12
13
14
	//let's call the computation
	for(size_t i = 0lu; i < nbGpuCall; ++i){
		std::cout << "gray_scott_nvcpp : bunch "<<(i+1)<<"/"<<nbGpuCall<<std::endl;
		for(size_t j = 0lu; j < nbImage; ++j){
			gray_scott_nvcpp_step(matTmpU, matTmpV, matInU, matInV, vecIndex, nbCol, paddedNbCol,
						matDeltaSquare, nbStencilRow, nbStencilCol,
						diffusionRateU, diffusionRateV, feedRate, killRate, dt, nbExtraStep);
			//Flush computed data to Host
			for(size_t k(0lu); k < nbRow; ++k){		//Do not forget the 1,1 shift
				memcpy(matOutV + i*sizeMatFloat*nbImage + j*sizeMatFloat + k*nbCol, matTmpV + (k+1lu)*paddedNbCol + 1lu, nbCol*sizeof(float));
			}
		}
	}
}


Le fichier gray_scott_cuda.cu complet :

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
/***************************************
	Auteur : Pierre Aubert
	Mail : pierre.aubert@lapp.in2p3.fr
	Licence : CeCILL-C
****************************************/

#include <string.h>
#include <vector>
#include <algorithm>
#include <execution>


#include "gray_scott_nvcpp.h"

///Swap pointer
/**	@param[out] ptr1 : pointer to be swaped
 * 	@param[out] ptr2 : pointer to be swaped
*/
void swapPointer(float *& ptr1, float *& ptr2){
	float *& tmp = ptr1;
	ptr1 = ptr2;
	ptr2 = tmp;
}

///Propagate the U and V species in the matU and matV
/**	@param[out] outMatU : updated matrix U version (with padding)
 * 	@param[out] outMatV : updated matrix V version (with padding)
 * 	@param matU : input of matrix U (with padding)
 * 	@param matV : input of matrix V (with padding)
 * 	@param vecIndex : vector of index to compute all pixels
 * 	@param nbCol : number of columns of the matrices (with padding)
 * 	@param paddedNbCol : number of columns of the images to be created
 * 	@param matDeltaSquare : matrix of the delta square values
 * 	@param nbStencilRow : number of rows of the matrix matDeltaSquare
 * 	@param nbStencilCol : number of columns of the matrix matDeltaSquare
 * 	@param diffusionRateU : diffusion rate of the U specie
 * 	@param diffudionRateV : diffusion rate of the V specie
 * 	@param feedRate : rate of the process which feeds U and drains U, V and P
 * 	@param killRate : rate of the process which converts V into P
 * 	@param dt : time interval between two steps
 * 	@param nbStep : number of steps to perform
*/
void gray_scott_nvcpp_step(float * outMatU, float * outMatV, float * matU, float * matV, const std::vector<int> & vecIndex,
			size_t nbCol, size_t paddedNbCol,
			const float * matDeltaSquare, long nbStencilRow, long nbStencilCol,
			float diffusionRateU, float diffusionRateV, float feedRate, float killRate, float dt, size_t nbStep)
{
	for(size_t iStep(0lu); iStep < nbStep; ++iStep){	//Perform all the steps
		std::for_each(std::execution::par_unseq, vecIndex.begin(), vecIndex.end(),
			[=](int indexPixel)
		{
			int indexElRow = indexPixel/nbCol;	//on rows
			int indexElCol = indexPixel - indexElRow*nbCol;	//on columns
			
			int i = indexElRow + 1;	//We have one padding top (and bottom)
			int j = indexElCol + 1;	//We have one padding left (and right)
			float u = matU[i*paddedNbCol + j], v = matV[i*paddedNbCol + j];
			float fullU = 0.0f, fullV = 0.0f;
			for(long k = 0l; k < nbStencilRow; ++k){
				for(long l = 0l; l < nbStencilCol; ++l){
					float deltaSquare = matDeltaSquare[k*nbStencilCol + l];
					
					long idxRow = i + k - 1;	//i shifted of -1, 0 or 1
					long idxCol = j + l - 1;	//j shifted of -1, 0 or 1
					
					fullU += (matU[idxRow*paddedNbCol + idxCol] - u)*deltaSquare;
					fullV += (matV[idxRow*paddedNbCol + idxCol] - v)*deltaSquare;
				}
			}
			float uvSquare = u*v*v;
			float du = diffusionRateU*fullU - uvSquare + feedRate*(1.0f - u);
			float dv = diffusionRateV*fullV + uvSquare - (feedRate + killRate)*v;
			outMatU[i*paddedNbCol + j] = u + du*dt;
			outMatV[i*paddedNbCol + j] = v + dv*dt;
		});
		//Now pointer swap
		swapPointer(outMatU, matU);
		swapPointer(outMatV, matV);
	}
}

///Interface of the Gray Scott reaction kernel
/**	@param[out] matOutV : result of the computing (size : nbImage*nbRow*nbCol)
 * 	@param[out] matInU : input matrix of U concentration (size : paddedNbRow*paddedNbCol)
 * 	@param[out] matInV : input matrix of V concentration (size : paddedNbRow*paddedNbCol)
 * 	@param[out] matTmpU : temporary matrix of U concentration (size : paddedNbRow*paddedNbCol)
 * 	@param[out] matTmpV : temporary matrix of V concentration (size : paddedNbRow*paddedNbCol)
 * 	@param nbImage : total number of images to be created
 * 	@param nbExtraStep : number of extra steps to be computed between images
 * 	@param nbGpuCall : number of time to call the GPU to create nbImage
 * 	@param nbRow : number of rows of the images to be created
 * 	@param nbCol : number of columns of the images to be created
 * 	@param paddedNbRow : padded number of rows of the images to be created
 * 	@param paddedNbCol : number of columns of the images to be created
 * 	@param matDeltaSquare : matrix of the delta square values (size : nbStencilRow*nbStencilCol)
 * 	@param nbStencilRow : number of rows of the matrix matDeltaSquare
 * 	@param nbStencilCol : number of columns of the matrix matDeltaSquare
 * 	@param diffusionRateU : diffusion rate of the U specie
 * 	@param diffusionRateV : diffusion rate of the V specie
 * 	@param feedRate : rate of the process which feeds U and drains U, V and P
 * 	@param killRate : rate of the process which converts V into P
 * 	@param dt : time interval between two computation
 * 	@param maxNbThreadPerBlockX : maximum number of thread per block on X
 * 	@param maxNbBlockX : maximum number of block in the grid on X
 * 	@param totalGpuMemory : total available memory on GPU
*/
void gray_scott_nvcpp(float * matOutV, float * matInU, float * matInV, float * matTmpU, float * matTmpV,
			size_t nbImage, size_t nbExtraStep, size_t nbGpuCall, size_t nbRow, size_t nbCol,
			size_t paddedNbRow, size_t paddedNbCol,
			const float * matDeltaSquare, long nbStencilRow, long nbStencilCol,
			float diffusionRateU, float diffusionRateV, float feedRate, float killRate, float dt)
{
	
	size_t sizeMatFloat = nbRow*nbCol;

	size_t nbPixel(nbRow*nbCol);
	std::vector<int> vecIndex;
	vecIndex.resize(nbPixel);
	for(size_t i(0lu); i < nbPixel; ++i){vecIndex[i] = (int)i;}
	
	//let's call the computation
	for(size_t i = 0lu; i < nbGpuCall; ++i){
		std::cout << "gray_scott_nvcpp : bunch "<<(i+1)<<"/"<<nbGpuCall<<std::endl;
		for(size_t j = 0lu; j < nbImage; ++j){
			gray_scott_nvcpp_step(matTmpU, matTmpV, matInU, matInV, vecIndex, nbCol, paddedNbCol,
						matDeltaSquare, nbStencilRow, nbStencilCol,
						diffusionRateU, diffusionRateV, feedRate, killRate, dt, nbExtraStep);
			//Flush computed data to Host
			for(size_t k(0lu); k < nbRow; ++k){		//Do not forget the 1,1 shift
				memcpy(matOutV + i*sizeMatFloat*nbImage + j*sizeMatFloat + k*nbCol, matTmpV + (k+1lu)*paddedNbCol + 1lu, nbCol*sizeof(float));
			}
		}
	}
}


Vous pouvez le télécharger ici.