5.8.6.2 : Le fichier source gray_scott_nvcpp.cpp
Écrivons le fichier gray_scott_nvcpp.cpp :
1 2 |
#include <string.h> #include <vector> |
Nous aurons besoin de la fonction std::for_each qui est disponible dans algorithm et d'un mode d'execution qui est définit dans execution :
1 2 3 4 5 |
#include <algorithm> #include <execution> #include "gray_scott_nvcpp.h" |
Nous aurons besoin d'échanger régulièrement les pointeurs de nos temporaires. Donc nous définissons une fonction pour cela :
1 2 3 4 5 6 7 8 9 |
///Swap pointer /** @param[out] ptr1 : pointer to be swaped * @param[out] ptr2 : pointer to be swaped */ void swapPointer(float *& ptr1, float *& ptr2){ float *& tmp = ptr1; ptr1 = ptr2; ptr2 = tmp; } |
Puis nous ajoutons la fonction qui calcule une itération de la réaction de Gray Scott :
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 |
///Propagate the U and V species in the matU and matV /** @param[out] outMatU : updated matrix U version (with padding) * @param[out] outMatV : updated matrix V version (with padding) * @param matU : input of matrix U (with padding) * @param matV : input of matrix V (with padding) * @param vecIndex : vector of index to compute all pixels * @param nbCol : number of columns of the matrices (with padding) * @param paddedNbCol : number of columns of the images to be created * @param matDeltaSquare : matrix of the delta square values * @param nbStencilRow : number of rows of the matrix matDeltaSquare * @param nbStencilCol : number of columns of the matrix matDeltaSquare * @param diffusionRateU : diffusion rate of the U specie * @param diffudionRateV : diffusion rate of the V specie * @param feedRate : rate of the process which feeds U and drains U, V and P * @param killRate : rate of the process which converts V into P * @param dt : time interval between two steps * @param nbStep : number of steps to perform */ void gray_scott_nvcpp_step(float * outMatU, float * outMatV, float * matU, float * matV, const std::vector<int> & vecIndex, size_t nbCol, size_t paddedNbCol, const float * matDeltaSquare, long nbStencilRow, long nbStencilCol, float diffusionRateU, float diffusionRateV, float feedRate, float killRate, float dt, size_t nbStep) { |
On commence par itérer sur le nombre d'étapes de calcul (puisque chaque étape dépend de la précédente) :
1 |
for(size_t iStep(0lu); iStep < nbStep; ++iStep){ //Perform all the steps |
Puis, nous utilisons un std::for_each qui nous permet de traiter chaque pixel de notre image :
1 2 3 |
std::for_each(std::execution::par_unseq, vecIndex.begin(), vecIndex.end(), [=](int indexPixel) { |
Le début de l'implémentation est très similaire à l'implémentation CUDA puisque nous avons aussi besoin de connaître les données dont le pixel courrant à besoin :
1 2 3 4 5 |
int indexElRow = indexPixel/nbCol; //on rows int indexElCol = indexPixel - indexElRow*nbCol; //on columns int i = indexElRow + 1; //We have one padding top (and bottom) int j = indexElCol + 1; //We have one padding left (and right) |
Ensuite, nous calculons la réaction de Gray Scott comme avant :
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 |
float u = matU[i*paddedNbCol + j], v = matV[i*paddedNbCol + j]; float fullU = 0.0f, fullV = 0.0f; for(long k = 0l; k < nbStencilRow; ++k){ for(long l = 0l; l < nbStencilCol; ++l){ float deltaSquare = matDeltaSquare[k*nbStencilCol + l]; long idxRow = i + k - 1; //i shifted of -1, 0 or 1 long idxCol = j + l - 1; //j shifted of -1, 0 or 1 fullU += (matU[idxRow*paddedNbCol + idxCol] - u)*deltaSquare; fullV += (matV[idxRow*paddedNbCol + idxCol] - v)*deltaSquare; } } float uvSquare = u*v*v; float du = diffusionRateU*fullU - uvSquare + feedRate*(1.0f - u); float dv = diffusionRateV*fullV + uvSquare - (feedRate + killRate)*v; |
On sauve les résultats :
1 2 3 |
outMatU[i*paddedNbCol + j] = u + du*dt; outMatV[i*paddedNbCol + j] = v + dv*dt; }); |
On échange les pointeurs de nos temporaires :
1 2 3 4 5 |
//Now pointer swap
swapPointer(outMatU, matU);
swapPointer(outMatV, matV);
}
}
|
Finalement, la fonction qui calcule la totalité de la réaction de Gray Scott :
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 |
///Interface of the Gray Scott reaction kernel /** @param[out] matOutV : result of the computing (size : nbImage*nbRow*nbCol) * @param[out] matInU : input matrix of U concentration (size : paddedNbRow*paddedNbCol) * @param[out] matInV : input matrix of V concentration (size : paddedNbRow*paddedNbCol) * @param[out] matTmpU : temporary matrix of U concentration (size : paddedNbRow*paddedNbCol) * @param[out] matTmpV : temporary matrix of V concentration (size : paddedNbRow*paddedNbCol) * @param nbImage : total number of images to be created * @param nbExtraStep : number of extra steps to be computed between images * @param nbGpuCall : number of time to call the GPU to create nbImage * @param nbRow : number of rows of the images to be created * @param nbCol : number of columns of the images to be created * @param paddedNbRow : padded number of rows of the images to be created * @param paddedNbCol : number of columns of the images to be created * @param matDeltaSquare : matrix of the delta square values (size : nbStencilRow*nbStencilCol) * @param nbStencilRow : number of rows of the matrix matDeltaSquare * @param nbStencilCol : number of columns of the matrix matDeltaSquare * @param diffusionRateU : diffusion rate of the U specie * @param diffusionRateV : diffusion rate of the V specie * @param feedRate : rate of the process which feeds U and drains U, V and P * @param killRate : rate of the process which converts V into P * @param dt : time interval between two computation * @param maxNbThreadPerBlockX : maximum number of thread per block on X * @param maxNbBlockX : maximum number of block in the grid on X * @param totalGpuMemory : total available memory on GPU */ void gray_scott_nvcpp(float * matOutV, float * matInU, float * matInV, float * matTmpU, float * matTmpV, size_t nbImage, size_t nbExtraStep, size_t nbGpuCall, size_t nbRow, size_t nbCol, size_t paddedNbRow, size_t paddedNbCol, const float * matDeltaSquare, long nbStencilRow, long nbStencilCol, float diffusionRateU, float diffusionRateV, float feedRate, float killRate, float dt) { size_t sizeMatFloat = nbRow*nbCol; size_t nbPixel(nbRow*nbCol); |
On initialise le vecteur d'indices noteOn aurait pu utiliser un std::iota :
1 2 3 4 |
std::vector<int> vecIndex; vecIndex.resize(nbPixel); for(size_t i(0lu); i < nbPixel; ++i){vecIndex[i] = (int)i;} |
Enfin le calcul proprement dit, découpé en bloc afin d'utiliser au mieux la mémoire disponible sur le GPU :
1 2 3 4 5 6 7 8 9 10 11 12 13 14 |
//let's call the computation for(size_t i = 0lu; i < nbGpuCall; ++i){ std::cout << "gray_scott_nvcpp : bunch "<<(i+1)<<"/"<<nbGpuCall<<std::endl; for(size_t j = 0lu; j < nbImage; ++j){ gray_scott_nvcpp_step(matTmpU, matTmpV, matInU, matInV, vecIndex, nbCol, paddedNbCol, matDeltaSquare, nbStencilRow, nbStencilCol, diffusionRateU, diffusionRateV, feedRate, killRate, dt, nbExtraStep); //Flush computed data to Host for(size_t k(0lu); k < nbRow; ++k){ //Do not forget the 1,1 shift memcpy(matOutV + i*sizeMatFloat*nbImage + j*sizeMatFloat + k*nbCol, matTmpV + (k+1lu)*paddedNbCol + 1lu, nbCol*sizeof(float)); } } } } |
Le fichier gray_scott_cuda.cu complet :
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 |
/*************************************** Auteur : Pierre Aubert Mail : pierre.aubert@lapp.in2p3.fr Licence : CeCILL-C ****************************************/ #include <string.h> #include <vector> #include <algorithm> #include <execution> #include "gray_scott_nvcpp.h" ///Swap pointer /** @param[out] ptr1 : pointer to be swaped * @param[out] ptr2 : pointer to be swaped */ void swapPointer(float *& ptr1, float *& ptr2){ float *& tmp = ptr1; ptr1 = ptr2; ptr2 = tmp; } ///Propagate the U and V species in the matU and matV /** @param[out] outMatU : updated matrix U version (with padding) * @param[out] outMatV : updated matrix V version (with padding) * @param matU : input of matrix U (with padding) * @param matV : input of matrix V (with padding) * @param vecIndex : vector of index to compute all pixels * @param nbCol : number of columns of the matrices (with padding) * @param paddedNbCol : number of columns of the images to be created * @param matDeltaSquare : matrix of the delta square values * @param nbStencilRow : number of rows of the matrix matDeltaSquare * @param nbStencilCol : number of columns of the matrix matDeltaSquare * @param diffusionRateU : diffusion rate of the U specie * @param diffudionRateV : diffusion rate of the V specie * @param feedRate : rate of the process which feeds U and drains U, V and P * @param killRate : rate of the process which converts V into P * @param dt : time interval between two steps * @param nbStep : number of steps to perform */ void gray_scott_nvcpp_step(float * outMatU, float * outMatV, float * matU, float * matV, const std::vector<int> & vecIndex, size_t nbCol, size_t paddedNbCol, const float * matDeltaSquare, long nbStencilRow, long nbStencilCol, float diffusionRateU, float diffusionRateV, float feedRate, float killRate, float dt, size_t nbStep) { for(size_t iStep(0lu); iStep < nbStep; ++iStep){ //Perform all the steps std::for_each(std::execution::par_unseq, vecIndex.begin(), vecIndex.end(), [=](int indexPixel) { int indexElRow = indexPixel/nbCol; //on rows int indexElCol = indexPixel - indexElRow*nbCol; //on columns int i = indexElRow + 1; //We have one padding top (and bottom) int j = indexElCol + 1; //We have one padding left (and right) float u = matU[i*paddedNbCol + j], v = matV[i*paddedNbCol + j]; float fullU = 0.0f, fullV = 0.0f; for(long k = 0l; k < nbStencilRow; ++k){ for(long l = 0l; l < nbStencilCol; ++l){ float deltaSquare = matDeltaSquare[k*nbStencilCol + l]; long idxRow = i + k - 1; //i shifted of -1, 0 or 1 long idxCol = j + l - 1; //j shifted of -1, 0 or 1 fullU += (matU[idxRow*paddedNbCol + idxCol] - u)*deltaSquare; fullV += (matV[idxRow*paddedNbCol + idxCol] - v)*deltaSquare; } } float uvSquare = u*v*v; float du = diffusionRateU*fullU - uvSquare + feedRate*(1.0f - u); float dv = diffusionRateV*fullV + uvSquare - (feedRate + killRate)*v; outMatU[i*paddedNbCol + j] = u + du*dt; outMatV[i*paddedNbCol + j] = v + dv*dt; }); //Now pointer swap swapPointer(outMatU, matU); swapPointer(outMatV, matV); } } ///Interface of the Gray Scott reaction kernel /** @param[out] matOutV : result of the computing (size : nbImage*nbRow*nbCol) * @param[out] matInU : input matrix of U concentration (size : paddedNbRow*paddedNbCol) * @param[out] matInV : input matrix of V concentration (size : paddedNbRow*paddedNbCol) * @param[out] matTmpU : temporary matrix of U concentration (size : paddedNbRow*paddedNbCol) * @param[out] matTmpV : temporary matrix of V concentration (size : paddedNbRow*paddedNbCol) * @param nbImage : total number of images to be created * @param nbExtraStep : number of extra steps to be computed between images * @param nbGpuCall : number of time to call the GPU to create nbImage * @param nbRow : number of rows of the images to be created * @param nbCol : number of columns of the images to be created * @param paddedNbRow : padded number of rows of the images to be created * @param paddedNbCol : number of columns of the images to be created * @param matDeltaSquare : matrix of the delta square values (size : nbStencilRow*nbStencilCol) * @param nbStencilRow : number of rows of the matrix matDeltaSquare * @param nbStencilCol : number of columns of the matrix matDeltaSquare * @param diffusionRateU : diffusion rate of the U specie * @param diffusionRateV : diffusion rate of the V specie * @param feedRate : rate of the process which feeds U and drains U, V and P * @param killRate : rate of the process which converts V into P * @param dt : time interval between two computation * @param maxNbThreadPerBlockX : maximum number of thread per block on X * @param maxNbBlockX : maximum number of block in the grid on X * @param totalGpuMemory : total available memory on GPU */ void gray_scott_nvcpp(float * matOutV, float * matInU, float * matInV, float * matTmpU, float * matTmpV, size_t nbImage, size_t nbExtraStep, size_t nbGpuCall, size_t nbRow, size_t nbCol, size_t paddedNbRow, size_t paddedNbCol, const float * matDeltaSquare, long nbStencilRow, long nbStencilCol, float diffusionRateU, float diffusionRateV, float feedRate, float killRate, float dt) { size_t sizeMatFloat = nbRow*nbCol; size_t nbPixel(nbRow*nbCol); std::vector<int> vecIndex; vecIndex.resize(nbPixel); for(size_t i(0lu); i < nbPixel; ++i){vecIndex[i] = (int)i;} //let's call the computation for(size_t i = 0lu; i < nbGpuCall; ++i){ std::cout << "gray_scott_nvcpp : bunch "<<(i+1)<<"/"<<nbGpuCall<<std::endl; for(size_t j = 0lu; j < nbImage; ++j){ gray_scott_nvcpp_step(matTmpU, matTmpV, matInU, matInV, vecIndex, nbCol, paddedNbCol, matDeltaSquare, nbStencilRow, nbStencilCol, diffusionRateU, diffusionRateV, feedRate, killRate, dt, nbExtraStep); //Flush computed data to Host for(size_t k(0lu); k < nbRow; ++k){ //Do not forget the 1,1 shift memcpy(matOutV + i*sizeMatFloat*nbImage + j*sizeMatFloat + k*nbCol, matTmpV + (k+1lu)*paddedNbCol + 1lu, nbCol*sizeof(float)); } } } } |
Vous pouvez le télécharger ici.