Le fichier source gray_scott

5.8.6.2 : Le fichier source gray_scott_nvcpp.cpp

Écrivons le fichier gray_scott_nvcpp.cpp :

1 2	#include <string.h> #include <vector>

Nous aurons besoin de la fonction std::for_each qui est disponible dans algorithm et d'un mode d'execution qui est définit dans execution :

#include <algorithm>
#include <execution>


#include "gray_scott_nvcpp.h"

Nous aurons besoin d'échanger régulièrement les pointeurs de nos temporaires. Donc nous définissons une fonction pour cela :

///Swap pointer
/**	@param[out] ptr1 : pointer to be swaped
 * 	@param[out] ptr2 : pointer to be swaped
*/
void swapPointer(float *& ptr1, float *& ptr2){
	float *& tmp = ptr1;
	ptr1 = ptr2;
	ptr2 = tmp;
}

Puis nous ajoutons la fonction qui calcule une itération de la réaction de Gray Scott :

///Propagate the U and V species in the matU and matV
/**	@param[out] outMatU : updated matrix U version (with padding)
 * 	@param[out] outMatV : updated matrix V version (with padding)
 * 	@param matU : input of matrix U (with padding)
 * 	@param matV : input of matrix V (with padding)
 * 	@param vecIndex : vector of index to compute all pixels
 * 	@param nbCol : number of columns of the matrices (with padding)
 * 	@param paddedNbCol : number of columns of the images to be created
 * 	@param matDeltaSquare : matrix of the delta square values
 * 	@param nbStencilRow : number of rows of the matrix matDeltaSquare
 * 	@param nbStencilCol : number of columns of the matrix matDeltaSquare
 * 	@param diffusionRateU : diffusion rate of the U specie
 * 	@param diffudionRateV : diffusion rate of the V specie
 * 	@param feedRate : rate of the process which feeds U and drains U, V and P
 * 	@param killRate : rate of the process which converts V into P
 * 	@param dt : time interval between two steps
 * 	@param nbStep : number of steps to perform
*/
void gray_scott_nvcpp_step(float * outMatU, float * outMatV, float * matU, float * matV, const std::vector<int> & vecIndex,
			size_t nbCol, size_t paddedNbCol,
			const float * matDeltaSquare, long nbStencilRow, long nbStencilCol,
			float diffusionRateU, float diffusionRateV, float feedRate, float killRate, float dt, size_t nbStep)
{

On commence par itérer sur le nombre d'étapes de calcul (puisque chaque étape dépend de la précédente) :

1	for(size_t iStep(0lu); iStep < nbStep; ++iStep){ //Perform all the steps

Puis, nous utilisons un std::for_each qui nous permet de traiter chaque pixel de notre image :

1
2
3

		std::for_each(std::execution::par_unseq, vecIndex.begin(), vecIndex.end(),
			[=](int indexPixel)
		{

Le début de l'implémentation est très similaire à l'implémentation CUDA puisque nous avons aussi besoin de connaître les données dont le pixel courrant à besoin :

			int indexElRow = indexPixel/nbCol;	//on rows
			int indexElCol = indexPixel - indexElRow*nbCol;	//on columns
			
			int i = indexElRow + 1;	//We have one padding top (and bottom)
			int j = indexElCol + 1;	//We have one padding left (and right)

Ensuite, nous calculons la réaction de Gray Scott comme avant :

			float u = matU[i*paddedNbCol + j], v = matV[i*paddedNbCol + j];
			float fullU = 0.0f, fullV = 0.0f;
			for(long k = 0l; k < nbStencilRow; ++k){
				for(long l = 0l; l < nbStencilCol; ++l){
					float deltaSquare = matDeltaSquare[k*nbStencilCol + l];
					
					long idxRow = i + k - 1;	//i shifted of -1, 0 or 1
					long idxCol = j + l - 1;	//j shifted of -1, 0 or 1
					
					fullU += (matU[idxRow*paddedNbCol + idxCol] - u)*deltaSquare;
					fullV += (matV[idxRow*paddedNbCol + idxCol] - v)*deltaSquare;
				}
			}
			float uvSquare = u*v*v;
			float du = diffusionRateU*fullU - uvSquare + feedRate*(1.0f - u);
			float dv = diffusionRateV*fullV + uvSquare - (feedRate + killRate)*v;

On sauve les résultats :

1
2
3

			outMatU[i*paddedNbCol + j] = u + du*dt;
			outMatV[i*paddedNbCol + j] = v + dv*dt;
		});

On échange les pointeurs de nos temporaires :

		//Now pointer swap
		swapPointer(outMatU, matU);
		swapPointer(outMatV, matV);
	}
}

Finalement, la fonction qui calcule la totalité de la réaction de Gray Scott :

///Interface of the Gray Scott reaction kernel
/**	@param[out] matOutV : result of the computing (size : nbImage*nbRow*nbCol)
 * 	@param[out] matInU : input matrix of U concentration (size : paddedNbRow*paddedNbCol)
 * 	@param[out] matInV : input matrix of V concentration (size : paddedNbRow*paddedNbCol)
 * 	@param[out] matTmpU : temporary matrix of U concentration (size : paddedNbRow*paddedNbCol)
 * 	@param[out] matTmpV : temporary matrix of V concentration (size : paddedNbRow*paddedNbCol)
 * 	@param nbImage : total number of images to be created
 * 	@param nbExtraStep : number of extra steps to be computed between images
 * 	@param nbGpuCall : number of time to call the GPU to create nbImage
 * 	@param nbRow : number of rows of the images to be created
 * 	@param nbCol : number of columns of the images to be created
 * 	@param paddedNbRow : padded number of rows of the images to be created
 * 	@param paddedNbCol : number of columns of the images to be created
 * 	@param matDeltaSquare : matrix of the delta square values (size : nbStencilRow*nbStencilCol)
 * 	@param nbStencilRow : number of rows of the matrix matDeltaSquare
 * 	@param nbStencilCol : number of columns of the matrix matDeltaSquare
 * 	@param diffusionRateU : diffusion rate of the U specie
 * 	@param diffusionRateV : diffusion rate of the V specie
 * 	@param feedRate : rate of the process which feeds U and drains U, V and P
 * 	@param killRate : rate of the process which converts V into P
 * 	@param dt : time interval between two computation
 * 	@param maxNbThreadPerBlockX : maximum number of thread per block on X
 * 	@param maxNbBlockX : maximum number of block in the grid on X
 * 	@param totalGpuMemory : total available memory on GPU
*/
void gray_scott_nvcpp(float * matOutV, float * matInU, float * matInV, float * matTmpU, float * matTmpV,
			size_t nbImage, size_t nbExtraStep, size_t nbGpuCall, size_t nbRow, size_t nbCol,
			size_t paddedNbRow, size_t paddedNbCol,
			const float * matDeltaSquare, long nbStencilRow, long nbStencilCol,
			float diffusionRateU, float diffusionRateV, float feedRate, float killRate, float dt)
{
	
	size_t sizeMatFloat = nbRow*nbCol;

	size_t nbPixel(nbRow*nbCol);

On initialise le vecteur d'indices ^{noteOn aurait pu utiliser un std::iota} :

	std::vector<int> vecIndex;
	vecIndex.resize(nbPixel);
	for(size_t i(0lu); i < nbPixel; ++i){vecIndex[i] = (int)i;}

Enfin le calcul proprement dit, découpé en bloc afin d'utiliser au mieux la mémoire disponible sur le GPU :

	//let's call the computation
	for(size_t i = 0lu; i < nbGpuCall; ++i){
		std::cout << "gray_scott_nvcpp : bunch "<<(i+1)<<"/"<<nbGpuCall<<std::endl;
		for(size_t j = 0lu; j < nbImage; ++j){
			gray_scott_nvcpp_step(matTmpU, matTmpV, matInU, matInV, vecIndex, nbCol, paddedNbCol,
						matDeltaSquare, nbStencilRow, nbStencilCol,
						diffusionRateU, diffusionRateV, feedRate, killRate, dt, nbExtraStep);
			//Flush computed data to Host
			for(size_t k(0lu); k < nbRow; ++k){		//Do not forget the 1,1 shift
				memcpy(matOutV + i*sizeMatFloat*nbImage + j*sizeMatFloat + k*nbCol, matTmpV + (k+1lu)*paddedNbCol + 1lu, nbCol*sizeof(float));
			}
		}
	}
}

Le fichier gray_scott_cuda.cu complet :

/***************************************
	Auteur : Pierre Aubert
	Mail : pierre.aubert@lapp.in2p3.fr
	Licence : CeCILL-C
****************************************/

#include <string.h>
#include <vector>
#include <algorithm>
#include <execution>


#include "gray_scott_nvcpp.h"

///Swap pointer
/**	@param[out] ptr1 : pointer to be swaped
 * 	@param[out] ptr2 : pointer to be swaped
*/
void swapPointer(float *& ptr1, float *& ptr2){
	float *& tmp = ptr1;
	ptr1 = ptr2;
	ptr2 = tmp;
}

///Propagate the U and V species in the matU and matV
/**	@param[out] outMatU : updated matrix U version (with padding)
 * 	@param[out] outMatV : updated matrix V version (with padding)
 * 	@param matU : input of matrix U (with padding)
 * 	@param matV : input of matrix V (with padding)
 * 	@param vecIndex : vector of index to compute all pixels
 * 	@param nbCol : number of columns of the matrices (with padding)
 * 	@param paddedNbCol : number of columns of the images to be created
 * 	@param matDeltaSquare : matrix of the delta square values
 * 	@param nbStencilRow : number of rows of the matrix matDeltaSquare
 * 	@param nbStencilCol : number of columns of the matrix matDeltaSquare
 * 	@param diffusionRateU : diffusion rate of the U specie
 * 	@param diffudionRateV : diffusion rate of the V specie
 * 	@param feedRate : rate of the process which feeds U and drains U, V and P
 * 	@param killRate : rate of the process which converts V into P
 * 	@param dt : time interval between two steps
 * 	@param nbStep : number of steps to perform
*/
void gray_scott_nvcpp_step(float * outMatU, float * outMatV, float * matU, float * matV, const std::vector<int> & vecIndex,
			size_t nbCol, size_t paddedNbCol,
			const float * matDeltaSquare, long nbStencilRow, long nbStencilCol,
			float diffusionRateU, float diffusionRateV, float feedRate, float killRate, float dt, size_t nbStep)
{
	for(size_t iStep(0lu); iStep < nbStep; ++iStep){	//Perform all the steps
		std::for_each(std::execution::par_unseq, vecIndex.begin(), vecIndex.end(),
			[=](int indexPixel)
		{
			int indexElRow = indexPixel/nbCol;	//on rows
			int indexElCol = indexPixel - indexElRow*nbCol;	//on columns
			
			int i = indexElRow + 1;	//We have one padding top (and bottom)
			int j = indexElCol + 1;	//We have one padding left (and right)
			float u = matU[i*paddedNbCol + j], v = matV[i*paddedNbCol + j];
			float fullU = 0.0f, fullV = 0.0f;
			for(long k = 0l; k < nbStencilRow; ++k){
				for(long l = 0l; l < nbStencilCol; ++l){
					float deltaSquare = matDeltaSquare[k*nbStencilCol + l];
					
					long idxRow = i + k - 1;	//i shifted of -1, 0 or 1
					long idxCol = j + l - 1;	//j shifted of -1, 0 or 1
					
					fullU += (matU[idxRow*paddedNbCol + idxCol] - u)*deltaSquare;
					fullV += (matV[idxRow*paddedNbCol + idxCol] - v)*deltaSquare;
				}
			}
			float uvSquare = u*v*v;
			float du = diffusionRateU*fullU - uvSquare + feedRate*(1.0f - u);
			float dv = diffusionRateV*fullV + uvSquare - (feedRate + killRate)*v;
			outMatU[i*paddedNbCol + j] = u + du*dt;
			outMatV[i*paddedNbCol + j] = v + dv*dt;
		});
		//Now pointer swap
		swapPointer(outMatU, matU);
		swapPointer(outMatV, matV);
	}
}

///Interface of the Gray Scott reaction kernel
/**	@param[out] matOutV : result of the computing (size : nbImage*nbRow*nbCol)
 * 	@param[out] matInU : input matrix of U concentration (size : paddedNbRow*paddedNbCol)
 * 	@param[out] matInV : input matrix of V concentration (size : paddedNbRow*paddedNbCol)
 * 	@param[out] matTmpU : temporary matrix of U concentration (size : paddedNbRow*paddedNbCol)
 * 	@param[out] matTmpV : temporary matrix of V concentration (size : paddedNbRow*paddedNbCol)
 * 	@param nbImage : total number of images to be created
 * 	@param nbExtraStep : number of extra steps to be computed between images
 * 	@param nbGpuCall : number of time to call the GPU to create nbImage
 * 	@param nbRow : number of rows of the images to be created
 * 	@param nbCol : number of columns of the images to be created
 * 	@param paddedNbRow : padded number of rows of the images to be created
 * 	@param paddedNbCol : number of columns of the images to be created
 * 	@param matDeltaSquare : matrix of the delta square values (size : nbStencilRow*nbStencilCol)
 * 	@param nbStencilRow : number of rows of the matrix matDeltaSquare
 * 	@param nbStencilCol : number of columns of the matrix matDeltaSquare
 * 	@param diffusionRateU : diffusion rate of the U specie
 * 	@param diffusionRateV : diffusion rate of the V specie
 * 	@param feedRate : rate of the process which feeds U and drains U, V and P
 * 	@param killRate : rate of the process which converts V into P
 * 	@param dt : time interval between two computation
 * 	@param maxNbThreadPerBlockX : maximum number of thread per block on X
 * 	@param maxNbBlockX : maximum number of block in the grid on X
 * 	@param totalGpuMemory : total available memory on GPU
*/
void gray_scott_nvcpp(float * matOutV, float * matInU, float * matInV, float * matTmpU, float * matTmpV,
			size_t nbImage, size_t nbExtraStep, size_t nbGpuCall, size_t nbRow, size_t nbCol,
			size_t paddedNbRow, size_t paddedNbCol,
			const float * matDeltaSquare, long nbStencilRow, long nbStencilCol,
			float diffusionRateU, float diffusionRateV, float feedRate, float killRate, float dt)
{
	
	size_t sizeMatFloat = nbRow*nbCol;

	size_t nbPixel(nbRow*nbCol);
	std::vector<int> vecIndex;
	vecIndex.resize(nbPixel);
	for(size_t i(0lu); i < nbPixel; ++i){vecIndex[i] = (int)i;}
	
	//let's call the computation
	for(size_t i = 0lu; i < nbGpuCall; ++i){
		std::cout << "gray_scott_nvcpp : bunch "<<(i+1)<<"/"<<nbGpuCall<<std::endl;
		for(size_t j = 0lu; j < nbImage; ++j){
			gray_scott_nvcpp_step(matTmpU, matTmpV, matInU, matInV, vecIndex, nbCol, paddedNbCol,
						matDeltaSquare, nbStencilRow, nbStencilCol,
						diffusionRateU, diffusionRateV, feedRate, killRate, dt, nbExtraStep);
			//Flush computed data to Host
			for(size_t k(0lu); k < nbRow; ++k){		//Do not forget the 1,1 shift
				memcpy(matOutV + i*sizeMatFloat*nbImage + j*sizeMatFloat + k*nbCol, matTmpV + (k+1lu)*paddedNbCol + 1lu, nbCol*sizeof(float));
			}
		}
	}
}

Vous pouvez le télécharger ici.