4.1.7.2 : Le fichier Cuda
Écrivons le fichier gray_scott_cuda.cu. Commençons par les includes :
1 2 3 4 |
#include <stdio.h> #include <stdlib.h> //pour avoir abort() #include <math.h> #include "gray_scott_cuda.h" |
Puis la fonction qui calcule la réaction de Gray Scott :
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 |
///Propagate the U and V species in the matU and matV with CUda /** @param[out] outMatU : updated matrix U version (with padding) * @param[out] outMatV : updated matrix V version (with padding) * @param matU : input of matrix U (with padding) * @param matV : input of matrix V (with padding) * @param nbRow : number of rows of the matrices (with padding) * @param nbCol : number of columns of the matrices (with padding) * @param matDeltaSquare : matrix of the delta square values * @param nbStencilRow : number of rows of the matrix matDeltaSquare * @param nbStencilCol : number of columns of the matrix matDeltaSquare * @param diffudionRateU : diffusion rate of the U specie * @param diffudionRateV : diffusion rate of the V specie * @param feedRate : rate of the process which feeds U and drains U, V and P * @param killRate : rate of the process which converts V into P * @param dt : time interval between two steps */ __global__ void gray_scott_cuda_kernel(float * outMatU, float * outMatV, const float * matU, const float * matV, size_t nbRow, size_t nbCol, const float * matDeltaSquare, long nbStencilRow, long nbStencilCol, float diffudionRateU, float diffusionRateV, float feedRate, float killRate, float dt) { |
Il faut calculer l'élément courant que l'on doit traiter avec la position du thread courant :
1 2 3 4 5 6 7 |
// identifiant de thread a deux dimensions, comme la matrice // La bonne nouvelle c'est que l'on a un thread par element int indexElRow = blockIdx.x*blockDim.x + threadIdx.x; //on rows int indexElCol = blockIdx.y*blockDim.y + threadIdx.y; //on columns int i = indexElRow + 1; //We have one padding top (and bottom) int j = indexElCol + 1; //We have one padding left (and right) |
On lit les données de la cellule courante :
1 |
float u = matU[i*nbCol + j], v = matV[i*nbCol + j];
|
On applique notre stencil :
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 |
float fullU = 0.0f, fullV = 0.0f; for(long k = 0l; k < nbStencilRow; ++k){ for(long l = 0l; l < nbStencilCol; ++l){ float deltaSquare = matDeltaSquare[k*nbStencilCol + l]; long idxRow = i + k - 1; //i shifted of -1, 0 or 1 long idxCol = j + l - 1; //j shifted of -1, 0 or 1 fullU += (matU[idxRow*nbCol + idxCol] - u)*deltaSquare; fullV += (matV[idxRow*nbCol + idxCol] - v)*deltaSquare; } } float uvSquare = u*v*v; float du = diffudionRateU*fullU - uvSquare + feedRate*(1.0f - u); float dv = diffusionRateV*fullV + uvSquare - (feedRate + killRate)*v; |
On sauve le résultat :
1 2 3 |
outMatU[i*nbCol + j] = u + du*dt; outMatV[i*nbCol + j] = v + dv*dt; } |
On implémente une petite fonction qui va nous permettre de calculer la taille d'un bloc en fonction de la taille de l'image en entrée :
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 |
///Get the size of a block (works for both X and Y dimensions)
/** @param maxBlockSize : maximum size of block on the given dimension
 * @param imageSize : size of the image on this dimension
 * @return matching size of the block for the corresponding dimension
*/
int getBlockSize(int maxBlockSize, size_t imageSize){
	//If the whole image fits in one block, the block is the image
	if((long)imageSize < (long)maxBlockSize){return (int)imageSize;}
	int blockSize = maxBlockSize;
	int restSize = imageSize % blockSize;
	//We reduce the block size until it evenly divides the image size
	//The loop always terminates with blockSize >= 1 since every size is a multiple of 1,
	//so no extra fallback on blockSize == 0 is needed (the previous check was dead code)
	while(restSize != 0 && blockSize > 1){
		--blockSize;
		restSize = imageSize % blockSize;
	}
	return blockSize;
}
Une fonction qui échange deux pointeurs de données :
1 2 3 4 5 6 7 8 9 |
///Exchange the addresses held by two float pointers (no data is copied)
/** @param[out] ptr1 : first pointer to be swapped
 * @param[out] ptr2 : second pointer to be swapped
*/
void swapPointer(float ** ptr1, float ** ptr2){
	float * saved = *ptr2;
	*ptr2 = *ptr1;
	*ptr1 = saved;
}
La fonction C qui appellera notre kernel de calcul :
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 |
///Interface of the Gray Scott reaction kernel in Cuda /** @param[out] matOutV : result of the computing (size : nbImage*nbRow*nbCol) * @param matInU : input matrix of U concentration (size : paddedNbRow*paddedNbCol) * @param matInV : input matrix of V concentration (size : paddedNbRow*paddedNbCol) * @param nbImage : total number of images to be created * @param nbExtraStep : number of extra steps to be computed between images * @param nbRow : number of rows of the images to be created * @param nbCol : number of columns of the images to be created * @param paddedNbRow : padded number of rows of the images to be created * @param paddedNbCol : number of columns of the images to be created * @param matDeltaSquare : matrix of the delta square values (size : nbStencilRow*nbStencilCol) * @param nbStencilRow : number of rows of the matrix matDeltaSquare * @param nbStencilCol : number of columns of the matrix matDeltaSquare * @param diffusionRateU : diffusion rate of the U specie * @param diffusionRateV : diffusion rate of the V specie * @param feedRate : rate of the process which feeds U and drains U, V and P * @param killRate : rate of the process which converts V into P * @param dt : time interval between two computation * @param maxNbThreadPerBlockX : maximum number of thread per block on X * @param maxNbBlockX : maximum number of block in the grid on X * @param totalGpuMemory : total available memory on GPU */ void gray_scott_cuda(float * matOutV, const float * matInU, const float * matInV, size_t nbImage, size_t nbExtraStep, size_t nbRow, size_t nbCol, size_t paddedNbRow, size_t paddedNbCol, const float * matDeltaSquare, long nbStencilRow, long nbStencilCol, float diffusionRateU, float diffusionRateV, float feedRate, float killRate, float dt, int maxNbThreadPerBlockX, int maxNbBlockX, size_t totalGpuMemory) { |
On commence par afficher des paramètres généraux sur le calcul :
1 2 3 4 5 6 7 |
size_t totalNeededMemory = (nbStencilRow*nbStencilCol + paddedNbRow*paddedNbCol*4lu + nbImage*nbRow*nbCol)*sizeof(float); size_t nbGpuCall = 1lu + totalNeededMemory/(totalGpuMemory - 290000000lu); //Combien de fois il faudra appeler le kernel pour traiter toutes les images printf("gray_scott_cuda : number of bunch %lu\n", nbGpuCall); size_t nbImagePerCall = nbImage/nbGpuCall; //Nombre d'images traitees par le GPU en une fois printf("gray_scott_cuda : nbImagePerCall = %lu\n", nbImagePerCall); size_t nbImageLastCall = nbImage - nbImagePerCall*nbGpuCall; //Dernières images a traiter printf("gray_scott_cuda : nbImageLastCall = %lu\n", nbImageLastCall); |
On alloue les données de notre stencil sur le GPU :
1 2 3 4 |
//Allocation des donnees sur GPU size_t sizeDeltaSquareByte = nbStencilRow*nbStencilCol*sizeof(float); float * dmatDeltaSquare = NULL; cudaMalloc((void**)&dmatDeltaSquare, sizeDeltaSquareByte); |
On vérifie que l'allocation s'est bien passée :
1 2 |
PLIB_CUDA_CHECK_FILE
|
Ensuite on alloue les matrices de concentration de nos produits U et V en entrée et sortie :
1 2 3 4 5 6 7 8 9 10 11 12 13 |
size_t sizeMatFloat = nbRow*nbCol; size_t sizeMatByte = sizeMatFloat*sizeof(float); size_t sizePaddedMatByte = paddedNbRow*paddedNbCol*sizeof(float); float *doutMatU = NULL, *doutMatV, *dmatU = NULL, *dmatV = NULL; //Le d est pour device, pour se rappeler que ce sont les donnees sur le GPU cudaMalloc((void**)&doutMatU, sizePaddedMatByte); PLIB_CUDA_CHECK_FILE cudaMalloc((void**)&doutMatV, sizePaddedMatByte); PLIB_CUDA_CHECK_FILE cudaMalloc((void**)&dmatU, sizePaddedMatByte); PLIB_CUDA_CHECK_FILE cudaMalloc((void**)&dmatV, sizePaddedMatByte); PLIB_CUDA_CHECK_FILE |
On déclare notre premier pointeur qui sera échangé par la suite :
1 |
float *dRefFirstOutputV = doutMatV;
|
On alloue le gros tableau de résultats sur le GPU :
1 2 3 4 5 6 7 8 |
size_t sizeMatOutputInByte = sizeMatByte*nbImagePerCall; //Le gros tableau de resultat, côte GPU float * dMatOutV = NULL; cudaError_t err = cudaMalloc((void**)&dMatOutV, sizeMatOutputInByte); if(err != cudaSuccess){ printf("gray_scott_cuda : Cannot allocate temporary of %lu, %lu bytes : %s \n", nbImagePerCall, sizeMatOutputInByte, cudaGetErrorString(err)); abort(); } |
On calcule la taille des blocs que nous utiliserons pour exécuter notre kernel de calcul :
1 2 3 4 5 6 7 |
int maxBlockSize = sqrt(maxNbThreadPerBlockX); //Convention X => row, Y => col int dimBlockX = getBlockSize(maxBlockSize, nbRow), dimBlockY = getBlockSize(maxBlockSize, nbCol); int dimGridX = nbRow/dimBlockX, dimGridY = nbCol/dimBlockY; printf("gray_scott_cuda : Grid size (%d, %d, 1)\n", dimGridX, dimGridY); printf("gray_scott_cuda : block size (%d, %d, 1)\n", dimBlockX, dimBlockY); |
On définit la taille de la grille et celle des blocs (il y aura aussi des clusters dans CUDA 12) :
1 2 3 |
dim3 dimGrid(dimGridX, dimGridY, 1); dim3 dimBlock(dimBlockX, dimBlockY, 1); |
On copie les données de l'hôte au GPU :
1 2 3 4 5 6 7 8 9 |
//Data transfert from host to device cudaMemcpy(dmatDeltaSquare, matDeltaSquare, sizeDeltaSquareByte, cudaMemcpyHostToDevice); PLIB_CUDA_CHECK_FILE cudaMemcpy(dmatU, matInU, sizePaddedMatByte, cudaMemcpyHostToDevice); PLIB_CUDA_CHECK_FILE cudaMemcpy(dmatV, matInV, sizePaddedMatByte, cudaMemcpyHostToDevice); PLIB_CUDA_CHECK_FILE |
On finalise les calculs pour prendre en compte le padding de nos tableaux :
1 2 |
size_t colPitch = paddedNbCol*sizeof(float), colShift = paddedNbCol + 1lu; //colShift is added with float, so there is no sizeof(float) needed size_t sizeCol = nbCol*sizeof(float); |
On boucle sur les bunchs (dans le cas où le GPU que l'on utilise a moins de mémoire que le nombre de résultats total que l'on veut calculer) :
1 2 |
//let's call the computation for(size_t i = 0lu; i < nbGpuCall; ++i){ |
On boucle sur toutes les images que l'on veut calculer par bunch :
1 2 |
printf("gray_scott_cuda : bunch %lu/%lu\n", (i+1), nbGpuCall); for(size_t j = 0lu; j < nbImagePerCall; ++j){ |
On boucle sur les étapes intermédiaires ce qui nous permet d'accélérer la vitesse de la réaction :
1 2 3 4 5 |
for(size_t k = 0lu; k < nbExtraStep; ++k){ //Let's call the kernel gray_scott_cuda_kernel<<<dimGrid, dimBlock>>>(doutMatU, doutMatV, dmatU, dmatV, paddedNbRow, paddedNbCol, dmatDeltaSquare, nbStencilRow, nbStencilCol, diffusionRateU, diffusionRateV, feedRate, killRate, dt); |
On échange nos pointeurs ce qui évite une copie :
1 2 3 4 |
//Now pointer swap
swapPointer(&doutMatU, &dmatU);
swapPointer(&doutMatV, &dmatV);
}
|
On copie nos résultats du temporaire au tableau de résultats (tous sur le GPU) en faisant attention d'utiliser les bons pointeurs :
1 2 3 4 5 6 7 8 9 10 11 |
//To deal with 2d copy : // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g3a58270f6775efe56c65ac47843e7cee size_t shifImage = sizeMatFloat*j; //shift is in float if(doutMatV == dRefFirstOutputV){ cudaMemcpy2D(dMatOutV + shifImage, sizeCol, doutMatV + colShift, colPitch, sizeCol, nbRow, cudaMemcpyDeviceToDevice); PLIB_CUDA_CHECK_FILE }else{ cudaMemcpy2D(dMatOutV + shifImage, sizeCol, dmatV + colShift, colPitch, sizeCol, nbRow, cudaMemcpyDeviceToDevice); PLIB_CUDA_CHECK_FILE } } |
On transfère nos résultats du GPU à l'hôte :
1 2 3 4 |
//Flush computed data to Host cudaMemcpy(matOutV + i*nbImagePerCall*sizeMatFloat, dMatOutV, sizeMatOutputInByte, cudaMemcpyDeviceToHost); PLIB_CUDA_CHECK_FILE } |
On traite le cas où il reste des images à traiter car la taille de tous les bunchs n'est pas forcément un multiple du nombre total d'images à calculer :
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 |
if(nbImageLastCall != 0lu){ //If there are remaning images to compute printf("gray_scott_cuda : last bunch of %lu image(s) after %lu bunche(s) of %lu images each\n", nbImageLastCall, nbGpuCall, nbImagePerCall); for(size_t j = 0lu; j < nbImageLastCall; ++j){ for(size_t k = 0lu; k < nbExtraStep; ++k){ //Let's call the kernel gray_scott_cuda_kernel<<<dimGrid, dimBlock>>>(doutMatU, doutMatV, dmatU, dmatV, paddedNbRow, paddedNbCol, dmatDeltaSquare, nbStencilRow, nbStencilCol, diffusionRateU, diffusionRateV, feedRate, killRate, dt); //Now pointer swap swapPointer(&doutMatU, &dmatU); swapPointer(&doutMatV, &dmatV); } //To deal with 2d copy : // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g3a58270f6775efe56c65ac47843e7cee size_t shifImage = sizeMatFloat*j; //shift is in float if(doutMatV == dRefFirstOutputV){ cudaMemcpy2D(dMatOutV + shifImage, sizeCol, doutMatV + colShift, colPitch, sizeCol, nbRow, cudaMemcpyDeviceToDevice); PLIB_CUDA_CHECK_FILE }else{ cudaMemcpy2D(dMatOutV + shifImage, sizeCol, dmatV + colShift, colPitch, sizeCol, nbRow, cudaMemcpyDeviceToDevice); PLIB_CUDA_CHECK_FILE } } //Flush computed data to Host cudaMemcpy(matOutV + nbGpuCall*nbImagePerCall*sizeMatFloat, dMatOutV, nbImageLastCall*sizeMatByte, cudaMemcpyDeviceToHost); PLIB_CUDA_CHECK_FILE } |
On libère la mémoire allouée sur le GPU :
1 2 3 4 5 6 7 8 |
//Free the temporaries cudaFree(doutMatU); cudaFree(doutMatV); cudaFree(dmatU); cudaFree(dmatV); //Free the big images result cudaFree(dMatOutV); } |
Appelons notre kernel de calcul sans utiliser de résultat temporaire sur le GPU. Notez que ce sera moins efficace, mais comme cela nous pourrons évaluer à quel point :
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 |
///Interface of the Gray Scott reaction kernel in Cuda /** @param[out] matOutV : result of the computing (size : nbImage*nbRow*nbCol) * @param matInU : input matrix of U concentration (size : paddedNbRow*paddedNbCol) * @param matInV : input matrix of V concentration (size : paddedNbRow*paddedNbCol) * @param nbImage : total number of images to be created * @param nbExtraStep : number of extra steps to be computed between images * @param nbRow : number of rows of the images to be created * @param nbCol : number of columns of the images to be created * @param paddedNbRow : padded number of rows of the images to be created * @param paddedNbCol : number of columns of the images to be created * @param matDeltaSquare : matrix of the delta square values (size : nbStencilRow*nbStencilCol) * @param nbStencilRow : number of rows of the matrix matDeltaSquare * @param nbStencilCol : number of columns of the matrix matDeltaSquare * @param diffusionRateU : diffusion rate of the U specie * @param diffusionRateV : diffusion rate of the V specie * @param feedRate : rate of the process which feeds U and drains U, V and P * @param killRate : rate of the process which converts V into P * @param dt : time interval between two computation * @param maxNbThreadPerBlockX : maximum number of thread per block on X * @param maxNbBlockX : maximum number of block in the grid on X * @param totalGpuMemory : total available memory on GPU */ void gray_scott_cuda_stupid(float * matOutV, const float * matInU, const float * matInV, size_t nbImage, size_t nbExtraStep, size_t nbRow, size_t nbCol, size_t paddedNbRow, size_t paddedNbCol, const float * matDeltaSquare, long nbStencilRow, long nbStencilCol, float diffusionRateU, float diffusionRateV, float feedRate, float killRate, float dt, int maxNbThreadPerBlockX, int maxNbBlockX, size_t totalGpuMemory) { |
On alloue les données de notre stencil sur le GPU :
1 2 3 4 |
//Allocation des donnees sur GPU size_t sizeDeltaSquareByte = nbStencilRow*nbStencilCol*sizeof(float); float * dmatDeltaSquare = NULL; cudaMalloc((void**)&dmatDeltaSquare, sizeDeltaSquareByte); |
On vérifie que l'allocation s'est bien passée :
1 2 |
PLIB_CUDA_CHECK_FILE
|
Ensuite on alloue les matrices de concentration de nos produits U et V en entrée et sortie :
1 2 3 4 5 6 7 8 9 10 11 12 |
size_t sizeMatFloat = nbRow*nbCol; size_t sizePaddedMatByte = paddedNbRow*paddedNbCol*sizeof(float); float *doutMatU = NULL, *doutMatV, *dmatU = NULL, *dmatV = NULL; //Le d est pour device, pour se rappeler que ce sont les donnees sur le GPU cudaMalloc((void**)&doutMatU, sizePaddedMatByte); PLIB_CUDA_CHECK_FILE cudaMalloc((void**)&doutMatV, sizePaddedMatByte); PLIB_CUDA_CHECK_FILE cudaMalloc((void**)&dmatU, sizePaddedMatByte); PLIB_CUDA_CHECK_FILE cudaMalloc((void**)&dmatV, sizePaddedMatByte); PLIB_CUDA_CHECK_FILE |
On déclare notre premier pointeur qui sera échangé par la suite :
1 2 |
float *dRefFirstOutputV = doutMatV;
|
On calcule la taille des blocs que nous utiliserons pour exécuter notre kernel de calcul :
1 2 3 4 5 6 7 |
int maxBlockSize = sqrt(maxNbThreadPerBlockX); //Convention X => row, Y => col int dimBlockX = getBlockSize(maxBlockSize, nbRow), dimBlockY = getBlockSize(maxBlockSize, nbCol); int dimGridX = nbRow/dimBlockX, dimGridY = nbCol/dimBlockY; printf("gray_scott_cuda_stupid : Grid size (%d, %d, 1)\n", dimGridX, dimGridY); printf("gray_scott_cuda_stupid : block size (%d, %d, 1)\n", dimBlockX, dimBlockY); |
On définit la taille de la grille et celle des blocs (il y aura aussi des clusters dans CUDA 12) :
1 2 3 |
dim3 dimGrid(dimGridX, dimGridY, 1); dim3 dimBlock(dimBlockX, dimBlockY, 1); |
On copie les données de l'hôte au GPU :
1 2 3 4 5 6 7 8 9 |
//Data transfert from host to device cudaMemcpy(dmatDeltaSquare, matDeltaSquare, sizeDeltaSquareByte, cudaMemcpyHostToDevice); PLIB_CUDA_CHECK_FILE cudaMemcpy(dmatU, matInU, sizePaddedMatByte, cudaMemcpyHostToDevice); PLIB_CUDA_CHECK_FILE cudaMemcpy(dmatV, matInV, sizePaddedMatByte, cudaMemcpyHostToDevice); PLIB_CUDA_CHECK_FILE |
On finalise les calculs pour prendre en compte le padding de nos tableaux :
1 2 |
size_t colPitch = paddedNbCol*sizeof(float), colShift = paddedNbCol + 1lu; //colShift is added with float, so there is no sizeof(float) needed size_t sizeCol = nbCol*sizeof(float); |
On boucle sur toutes les images à calculer car nous n'avons pas de problème de mémoire puisque nous n'utilisons pas de gros temporaire pour stocker nos résultats :
1 2 |
//let's call the computation for(size_t j = 0lu; j < nbImage; ++j){ |
On boucle sur les étapes intermédiaires ce qui nous permet d'accélérer la vitesse de la réaction :
1 2 3 4 5 |
for(size_t k = 0lu; k < nbExtraStep; ++k){ //Let's call the kernel gray_scott_cuda_kernel<<<dimGrid, dimBlock>>>(doutMatU, doutMatV, dmatU, dmatV, paddedNbRow, paddedNbCol, dmatDeltaSquare, nbStencilRow, nbStencilCol, diffusionRateU, diffusionRateV, feedRate, killRate, dt); |
On échange nos pointeurs ce qui évite une copie :
1 2 3 4 |
//Now pointer swap
swapPointer(&doutMatU, &dmatU);
swapPointer(&doutMatV, &dmatV);
}
|
On copie nos résultats du temporaire au tableau de résultats en faisant attention d'utiliser les bons pointeurs :
1 2 3 4 5 6 7 8 9 10 11 |
//To deal with 2d copy : // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g3a58270f6775efe56c65ac47843e7cee float * outputMatPtr = matOutV + sizeMatFloat*j; if(doutMatV == dRefFirstOutputV){ cudaMemcpy2D(outputMatPtr, sizeCol, doutMatV + colShift, colPitch, sizeCol, nbRow, cudaMemcpyDeviceToHost); PLIB_CUDA_CHECK_FILE }else{ cudaMemcpy2D(outputMatPtr, sizeCol, dmatV + colShift, colPitch, sizeCol, nbRow, cudaMemcpyDeviceToHost); PLIB_CUDA_CHECK_FILE } } |
On libère la mémoire allouée sur le GPU :
1 2 3 4 5 6 |
//Free the temporaries cudaFree(doutMatU); cudaFree(doutMatV); cudaFree(dmatU); cudaFree(dmatV); } |
Le fichier gray_scott_cuda.cu complet :
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 |
/***************************************
	Auteur : Pierre Aubert
	Mail : pierre.aubert@lapp.in2p3.fr
	Licence : CeCILL-C
****************************************/

#include <stdio.h>
#include <stdlib.h> //to get abort()
#include <math.h>
#include "gray_scott_cuda.h"

///Propagate the U and V species in the matU and matV with CUDA
/**	@param[out] outMatU : updated matrix U version (with padding)
 * 	@param[out] outMatV : updated matrix V version (with padding)
 * 	@param matU : input of matrix U (with padding)
 * 	@param matV : input of matrix V (with padding)
 * 	@param nbRow : number of rows of the matrices (with padding)
 * 	@param nbCol : number of columns of the matrices (with padding)
 * 	@param matDeltaSquare : matrix of the delta square values
 * 	@param nbStencilRow : number of rows of the matrix matDeltaSquare
 * 	@param nbStencilCol : number of columns of the matrix matDeltaSquare
 * 	@param diffusionRateU : diffusion rate of the U specie
 * 	@param diffusionRateV : diffusion rate of the V specie
 * 	@param feedRate : rate of the process which feeds U and drains U, V and P
 * 	@param killRate : rate of the process which converts V into P
 * 	@param dt : time interval between two steps
 * 
 * 	Expects a 2d launch with one thread per non-padded element (X => rows, Y => columns).
*/
__global__ void gray_scott_cuda_kernel(float * outMatU, float * outMatV, const float * matU, const float * matV,
	size_t nbRow, size_t nbCol,
	const float * matDeltaSquare, long nbStencilRow, long nbStencilCol,
	float diffusionRateU, float diffusionRateV, float feedRate, float killRate, float dt)
{
	//Two dimensional thread index, as the matrix : one thread per element
	int indexElRow = blockIdx.x*blockDim.x + threadIdx.x; //on rows
	int indexElCol = blockIdx.y*blockDim.y + threadIdx.y; //on columns
	//Guard : the grid is built to match the image exactly (see getBlockSize), but stay safe if it overshoots
	if((size_t)indexElRow >= nbRow - 2lu || (size_t)indexElCol >= nbCol - 2lu){return;}
	int i = indexElRow + 1; //We have one padding top (and bottom)
	int j = indexElCol + 1; //We have one padding left (and right)
	float u = matU[i*nbCol + j], v = matV[i*nbCol + j];
	//Diffusion term accumulated over the stencil
	float fullU = 0.0f, fullV = 0.0f;
	for(long k = 0l; k < nbStencilRow; ++k){
		for(long l = 0l; l < nbStencilCol; ++l){
			float deltaSquare = matDeltaSquare[k*nbStencilCol + l];
			long idxRow = i + k - 1l; //i shifted of -1, 0 or 1
			long idxCol = j + l - 1l; //j shifted of -1, 0 or 1
			fullU += (matU[idxRow*nbCol + idxCol] - u)*deltaSquare;
			fullV += (matV[idxRow*nbCol + idxCol] - v)*deltaSquare;
		}
	}
	//Gray-Scott reaction terms
	float uvSquare = u*v*v;
	float du = diffusionRateU*fullU - uvSquare + feedRate*(1.0f - u);
	float dv = diffusionRateV*fullV + uvSquare - (feedRate + killRate)*v;
	//Explicit Euler time step
	outMatU[i*nbCol + j] = u + du*dt;
	outMatV[i*nbCol + j] = v + dv*dt;
}

///Get the size of a block (works for both X and Y dimensions)
/**	@param maxBlockSize : maximum size of block on the given dimension
 * 	@param imageSize : size of the image on this dimension
 * 	@return matching size of the block for the corresponding dimension (always a divisor of imageSize)
*/
int getBlockSize(int maxBlockSize, size_t imageSize){
	if((long)imageSize < maxBlockSize){return imageSize;}
	int blockSize = maxBlockSize;
	int restSize = imageSize % blockSize;
	while(restSize != 0 && blockSize > 1){
		--blockSize;
		restSize = imageSize % blockSize; //We reduce until we find a divisor of imageSize
	}
	//Defensive only : the loop stops at blockSize == 1, which divides everything, so 0 cannot be reached
	if(blockSize == 0){
		blockSize = maxBlockSize;
	}
	return blockSize;
}

///Swap two pointers
/**	@param[out] ptr1 : pointer to be swaped
 * 	@param[out] ptr2 : pointer to be swaped
*/
void swapPointer(float ** ptr1, float ** ptr2){
	float * tmp = *ptr1;
	*ptr1 = *ptr2;
	*ptr2 = tmp;
}

///Interface of the Gray Scott reaction kernel in Cuda (bunched version : results are staged on the GPU)
/**	@param[out] matOutV : result of the computing (size : nbImage*nbRow*nbCol)
 * 	@param matInU : input matrix of U concentration (size : paddedNbRow*paddedNbCol)
 * 	@param matInV : input matrix of V concentration (size : paddedNbRow*paddedNbCol)
 * 	@param nbImage : total number of images to be created
 * 	@param nbExtraStep : number of extra steps to be computed between images
 * 	@param nbRow : number of rows of the images to be created
 * 	@param nbCol : number of columns of the images to be created
 * 	@param paddedNbRow : padded number of rows of the images to be created
 * 	@param paddedNbCol : padded number of columns of the images to be created
 * 	@param matDeltaSquare : matrix of the delta square values (size : nbStencilRow*nbStencilCol)
 * 	@param nbStencilRow : number of rows of the matrix matDeltaSquare
 * 	@param nbStencilCol : number of columns of the matrix matDeltaSquare
 * 	@param diffusionRateU : diffusion rate of the U specie
 * 	@param diffusionRateV : diffusion rate of the V specie
 * 	@param feedRate : rate of the process which feeds U and drains U, V and P
 * 	@param killRate : rate of the process which converts V into P
 * 	@param dt : time interval between two computation
 * 	@param maxNbThreadPerBlockX : maximum number of thread per block on X
 * 	@param maxNbBlockX : maximum number of block in the grid on X
 * 	@param totalGpuMemory : total available memory on GPU
*/
void gray_scott_cuda(float * matOutV, const float * matInU, const float * matInV, size_t nbImage, size_t nbExtraStep,
	size_t nbRow, size_t nbCol, size_t paddedNbRow, size_t paddedNbCol,
	const float * matDeltaSquare, long nbStencilRow, long nbStencilCol,
	float diffusionRateU, float diffusionRateV, float feedRate, float killRate, float dt,
	int maxNbThreadPerBlockX, int maxNbBlockX, size_t totalGpuMemory)
{
	//How many times the GPU has to be called to process all the images
	//NOTE(review) : assumes totalGpuMemory > 290000000 bytes (margin kept for the system), otherwise this underflows — confirm against callers
	size_t totalNeededMemory = (nbStencilRow*nbStencilCol + paddedNbRow*paddedNbCol*4lu + nbImage*nbRow*nbCol)*sizeof(float);
	size_t nbGpuCall = 1lu + totalNeededMemory/(totalGpuMemory - 290000000lu);
	printf("gray_scott_cuda : number of bunch %lu\n", nbGpuCall);
	size_t nbImagePerCall = nbImage/nbGpuCall; //Number of images processed by the GPU in one bunch
	printf("gray_scott_cuda : nbImagePerCall = %lu\n", nbImagePerCall);
	size_t nbImageLastCall = nbImage - nbImagePerCall*nbGpuCall; //Remaining images for the last bunch
	//NOTE(review) : dMatOutV below is sized for nbImagePerCall images ; if nbImageLastCall > nbImagePerCall
	//(possible when nbImage is small compared to nbGpuCall) the last bunch would overflow it — TODO confirm
	printf("gray_scott_cuda : nbImageLastCall = %lu\n", nbImageLastCall);
	//Data allocation on the GPU (the d prefix stands for device)
	size_t sizeDeltaSquareByte = nbStencilRow*nbStencilCol*sizeof(float);
	float * dmatDeltaSquare = NULL;
	cudaMalloc((void**)&dmatDeltaSquare, sizeDeltaSquareByte); PLIB_CUDA_CHECK_FILE
	size_t sizeMatFloat = nbRow*nbCol;
	size_t sizeMatByte = sizeMatFloat*sizeof(float);
	size_t sizePaddedMatByte = paddedNbRow*paddedNbCol*sizeof(float);
	float *doutMatU = NULL, *doutMatV = NULL, *dmatU = NULL, *dmatV = NULL;
	cudaMalloc((void**)&doutMatU, sizePaddedMatByte); PLIB_CUDA_CHECK_FILE
	cudaMalloc((void**)&doutMatV, sizePaddedMatByte); PLIB_CUDA_CHECK_FILE
	cudaMalloc((void**)&dmatU, sizePaddedMatByte); PLIB_CUDA_CHECK_FILE
	cudaMalloc((void**)&dmatV, sizePaddedMatByte); PLIB_CUDA_CHECK_FILE
	//Big result buffer, GPU side (one bunch of images)
	size_t sizeMatOutputInByte = sizeMatByte*nbImagePerCall;
	float * dMatOutV = NULL;
	cudaError_t err = cudaMalloc((void**)&dMatOutV, sizeMatOutputInByte);
	if(err != cudaSuccess){
		printf("gray_scott_cuda : Cannot allocate temporary of %lu, %lu bytes : %s \n", nbImagePerCall, sizeMatOutputInByte, cudaGetErrorString(err));
		abort();
	}
	//Convention X => row, Y => col ; blocks are chosen so the grid covers the image exactly
	int maxBlockSize = sqrt(maxNbThreadPerBlockX);
	int dimBlockX = getBlockSize(maxBlockSize, nbRow), dimBlockY = getBlockSize(maxBlockSize, nbCol);
	int dimGridX = nbRow/dimBlockX, dimGridY = nbCol/dimBlockY;
	printf("gray_scott_cuda : Grid size (%d, %d, 1)\n", dimGridX, dimGridY);
	printf("gray_scott_cuda : block size (%d, %d, 1)\n", dimBlockX, dimBlockY);
	dim3 dimGrid(dimGridX, dimGridY, 1);
	dim3 dimBlock(dimBlockX, dimBlockY, 1);
	//Data transfert from host to device
	cudaMemcpy(dmatDeltaSquare, matDeltaSquare, sizeDeltaSquareByte, cudaMemcpyHostToDevice); PLIB_CUDA_CHECK_FILE
	cudaMemcpy(dmatU, matInU, sizePaddedMatByte, cudaMemcpyHostToDevice); PLIB_CUDA_CHECK_FILE
	cudaMemcpy(dmatV, matInV, sizePaddedMatByte, cudaMemcpyHostToDevice); PLIB_CUDA_CHECK_FILE
	size_t colPitch = paddedNbCol*sizeof(float), colShift = paddedNbCol + 1lu; //colShift is added to a float pointer, so no sizeof(float) needed
	size_t sizeCol = nbCol*sizeof(float);
	//Let's call the computation
	for(size_t i = 0lu; i < nbGpuCall; ++i){
		printf("gray_scott_cuda : bunch %lu/%lu\n", (i+1), nbGpuCall);
		for(size_t j = 0lu; j < nbImagePerCall; ++j){
			for(size_t k = 0lu; k < nbExtraStep; ++k){
				//Let's call the kernel
				gray_scott_cuda_kernel<<<dimGrid, dimBlock>>>(doutMatU, doutMatV, dmatU, dmatV, paddedNbRow, paddedNbCol,
					dmatDeltaSquare, nbStencilRow, nbStencilCol,
					diffusionRateU, diffusionRateV, feedRate, killRate, dt);
				//Now pointer swap : the freshly computed step becomes the input of the next one
				swapPointer(&doutMatU, &dmatU);
				swapPointer(&doutMatV, &dmatV);
			}
			//After the swap the latest result is ALWAYS in dmatV. The previous version picked doutMatV
			//whenever it matched the first output buffer, which exported a one-step-stale image for even nbExtraStep.
			//To deal with 2d copy :
			//	https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g3a58270f6775efe56c65ac47843e7cee
			size_t shifImage = sizeMatFloat*j; //shift is in float
			cudaMemcpy2D(dMatOutV + shifImage, sizeCol, dmatV + colShift, colPitch, sizeCol, nbRow, cudaMemcpyDeviceToDevice); PLIB_CUDA_CHECK_FILE
		}
		//Flush computed data to Host
		cudaMemcpy(matOutV + i*nbImagePerCall*sizeMatFloat, dMatOutV, sizeMatOutputInByte, cudaMemcpyDeviceToHost); PLIB_CUDA_CHECK_FILE
	}
	if(nbImageLastCall != 0lu){ //If there are remaining images to compute
		printf("gray_scott_cuda : last bunch of %lu image(s) after %lu bunche(s) of %lu images each\n", nbImageLastCall, nbGpuCall, nbImagePerCall);
		for(size_t j = 0lu; j < nbImageLastCall; ++j){
			for(size_t k = 0lu; k < nbExtraStep; ++k){
				//Let's call the kernel
				gray_scott_cuda_kernel<<<dimGrid, dimBlock>>>(doutMatU, doutMatV, dmatU, dmatV, paddedNbRow, paddedNbCol,
					dmatDeltaSquare, nbStencilRow, nbStencilCol,
					diffusionRateU, diffusionRateV, feedRate, killRate, dt);
				//Now pointer swap
				swapPointer(&doutMatU, &dmatU);
				swapPointer(&doutMatV, &dmatV);
			}
			//Latest result is always in dmatV (see above)
			size_t shifImage = sizeMatFloat*j; //shift is in float
			cudaMemcpy2D(dMatOutV + shifImage, sizeCol, dmatV + colShift, colPitch, sizeCol, nbRow, cudaMemcpyDeviceToDevice); PLIB_CUDA_CHECK_FILE
		}
		//Flush computed data to Host
		cudaMemcpy(matOutV + nbGpuCall*nbImagePerCall*sizeMatFloat, dMatOutV, nbImageLastCall*sizeMatByte, cudaMemcpyDeviceToHost); PLIB_CUDA_CHECK_FILE
	}
	//Free the temporaries
	cudaFree(doutMatU);
	cudaFree(doutMatV);
	cudaFree(dmatU);
	cudaFree(dmatV);
	cudaFree(dmatDeltaSquare); //Was leaked in the previous version
	//Free the big images result
	cudaFree(dMatOutV);
}

///Interface of the Gray Scott reaction kernel in Cuda (simple version : one host copy per image)
/**	@param[out] matOutV : result of the computing (size : nbImage*nbRow*nbCol)
 * 	@param matInU : input matrix of U concentration (size : paddedNbRow*paddedNbCol)
 * 	@param matInV : input matrix of V concentration (size : paddedNbRow*paddedNbCol)
 * 	@param nbImage : total number of images to be created
 * 	@param nbExtraStep : number of extra steps to be computed between images
 * 	@param nbRow : number of rows of the images to be created
 * 	@param nbCol : number of columns of the images to be created
 * 	@param paddedNbRow : padded number of rows of the images to be created
 * 	@param paddedNbCol : padded number of columns of the images to be created
 * 	@param matDeltaSquare : matrix of the delta square values (size : nbStencilRow*nbStencilCol)
 * 	@param nbStencilRow : number of rows of the matrix matDeltaSquare
 * 	@param nbStencilCol : number of columns of the matrix matDeltaSquare
 * 	@param diffusionRateU : diffusion rate of the U specie
 * 	@param diffusionRateV : diffusion rate of the V specie
 * 	@param feedRate : rate of the process which feeds U and drains U, V and P
 * 	@param killRate : rate of the process which converts V into P
 * 	@param dt : time interval between two computation
 * 	@param maxNbThreadPerBlockX : maximum number of thread per block on X
 * 	@param maxNbBlockX : maximum number of block in the grid on X
 * 	@param totalGpuMemory : total available memory on GPU
*/
void gray_scott_cuda_stupid(float * matOutV, const float * matInU, const float * matInV, size_t nbImage, size_t nbExtraStep,
	size_t nbRow, size_t nbCol, size_t paddedNbRow, size_t paddedNbCol,
	const float * matDeltaSquare, long nbStencilRow, long nbStencilCol,
	float diffusionRateU, float diffusionRateV, float feedRate, float killRate, float dt,
	int maxNbThreadPerBlockX, int maxNbBlockX, size_t totalGpuMemory)
{
	//Data allocation on the GPU (the d prefix stands for device)
	size_t sizeDeltaSquareByte = nbStencilRow*nbStencilCol*sizeof(float);
	float * dmatDeltaSquare = NULL;
	cudaMalloc((void**)&dmatDeltaSquare, sizeDeltaSquareByte); PLIB_CUDA_CHECK_FILE
	size_t sizeMatFloat = nbRow*nbCol;
	size_t sizePaddedMatByte = paddedNbRow*paddedNbCol*sizeof(float);
	float *doutMatU = NULL, *doutMatV = NULL, *dmatU = NULL, *dmatV = NULL;
	cudaMalloc((void**)&doutMatU, sizePaddedMatByte); PLIB_CUDA_CHECK_FILE
	cudaMalloc((void**)&doutMatV, sizePaddedMatByte); PLIB_CUDA_CHECK_FILE
	cudaMalloc((void**)&dmatU, sizePaddedMatByte); PLIB_CUDA_CHECK_FILE
	cudaMalloc((void**)&dmatV, sizePaddedMatByte); PLIB_CUDA_CHECK_FILE
	//Convention X => row, Y => col ; blocks are chosen so the grid covers the image exactly
	int maxBlockSize = sqrt(maxNbThreadPerBlockX);
	int dimBlockX = getBlockSize(maxBlockSize, nbRow), dimBlockY = getBlockSize(maxBlockSize, nbCol);
	int dimGridX = nbRow/dimBlockX, dimGridY = nbCol/dimBlockY;
	printf("gray_scott_cuda_stupid : Grid size (%d, %d, 1)\n", dimGridX, dimGridY);
	printf("gray_scott_cuda_stupid : block size (%d, %d, 1)\n", dimBlockX, dimBlockY);
	dim3 dimGrid(dimGridX, dimGridY, 1);
	dim3 dimBlock(dimBlockX, dimBlockY, 1);
	//Data transfert from host to device
	cudaMemcpy(dmatDeltaSquare, matDeltaSquare, sizeDeltaSquareByte, cudaMemcpyHostToDevice); PLIB_CUDA_CHECK_FILE
	cudaMemcpy(dmatU, matInU, sizePaddedMatByte, cudaMemcpyHostToDevice); PLIB_CUDA_CHECK_FILE
	cudaMemcpy(dmatV, matInV, sizePaddedMatByte, cudaMemcpyHostToDevice); PLIB_CUDA_CHECK_FILE
	size_t colPitch = paddedNbCol*sizeof(float), colShift = paddedNbCol + 1lu; //colShift is added to a float pointer, so no sizeof(float) needed
	size_t sizeCol = nbCol*sizeof(float);
	//Let's call the computation
	for(size_t j = 0lu; j < nbImage; ++j){
		for(size_t k = 0lu; k < nbExtraStep; ++k){
			//Let's call the kernel
			gray_scott_cuda_kernel<<<dimGrid, dimBlock>>>(doutMatU, doutMatV, dmatU, dmatV, paddedNbRow, paddedNbCol,
				dmatDeltaSquare, nbStencilRow, nbStencilCol,
				diffusionRateU, diffusionRateV, feedRate, killRate, dt);
			//Now pointer swap : the freshly computed step becomes the input of the next one
			swapPointer(&doutMatU, &dmatU);
			swapPointer(&doutMatV, &dmatV);
		}
		//After the swap the latest result is ALWAYS in dmatV. The previous version picked doutMatV
		//whenever it matched the first output buffer, which exported a one-step-stale image for even nbExtraStep.
		//To deal with 2d copy :
		//	https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g3a58270f6775efe56c65ac47843e7cee
		float * outputMatPtr = matOutV + sizeMatFloat*j;
		cudaMemcpy2D(outputMatPtr, sizeCol, dmatV + colShift, colPitch, sizeCol, nbRow, cudaMemcpyDeviceToHost); PLIB_CUDA_CHECK_FILE
	}
	//Free the temporaries
	cudaFree(doutMatU);
	cudaFree(doutMatV);
	cudaFree(dmatU);
	cudaFree(dmatV);
	cudaFree(dmatDeltaSquare); //Was leaked in the previous version
}
Vous pouvez le télécharger ici.