3.3.2.1.2 : Le fichier hadamard_product_cuda.cu
Développons le fichier hadamard_product_cuda.cu :
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 |
#include "timer.h" #include "asterics_cuda.h" #include "hadamard_product_cuda.h" ///Compute a Hadamard Product with Cuda /** @param[out] vecResult : result of the computing * @param vectLeft : vector of values * @param vectRight : vector of values * @param nbElement : number of elements to be computed */ __global__ void hadamard_product_kernel(float * vecResult, const float * vectLeft, const float * vectRight, int nbElement){ // identifiant de thread à deux dimensions, comme la matrice int i = blockIdx.x*blockDim.x + threadIdx.x; // if(i < nbElement) vecResult[i] = vectLeft[i] * vectRight[i]; } ///Interface of the Hadamard product kernel /** @param[out] vecResult : result of the computing * @param vectLeft : vector of values * @param vectRight : vector of values * @param nbElement : number of elements to be computed * @param maxNbThreadPerBlockX : maximum number of thread per block on X * @param maxNbBlockX : maximum number of block in the grid on X * @param warpSize : number of thread per warp */ void hadamard_product_cuda(float * vecResult, const float * vectLeft, const float * vectRight, int nbElement, int maxNbThreadPerBlockX, int maxNbBlockX, int warpSize) { float *vecResultd = NULL, *vectLeftd = NULL, *vectRightd = NULL; int sizeByte = nbElement*sizeof(float); cudaMalloc((void**)&vecResultd, sizeByte); cudaMalloc((void**)&vectLeftd, sizeByte); cudaMalloc((void**)&vectRightd, sizeByte); int dimGridX = 1; int dimBlockX = 1; if(nbElement > maxNbThreadPerBlockX){ //au lieu d'utiliser plusieurs warps d'un block, on utilise plusieurs block dimBlockX = maxNbThreadPerBlockX; dimGridX = nbElement/dimBlockX; }else{ //sinon on utlise qu'un seul bloc dimBlockX = nbElement; } cudaMemcpy(vectLeftd, vectLeft, sizeByte, cudaMemcpyHostToDevice); PLIB_CUDA_CHECK_FILE cudaMemcpy(vectRightd, vectRight, sizeByte, cudaMemcpyHostToDevice); PLIB_CUDA_CHECK_FILE dim3 dimGrid(dimGridX, 1, 1); dim3 dimBlock(dimBlockX, 1, 1); hadamard_product_kernel<<<dimGrid, dimBlock>>>(vecResultd, 
vectLeftd, vectRightd, nbElement); PLIB_CUDA_CHECK_FILE cudaMemcpy(vecResult, vecResultd, sizeByte, cudaMemcpyDeviceToHost); PLIB_CUDA_CHECK_FILE cudaFree(vectRightd); cudaFree(vectLeftd); cudaFree(vecResultd); } ///Interface of the Hadamard product kernel /** @param[out] vecResult : result of the computing * @param vectLeft : vector of values * @param vectRight : vector of values * @param nbElement : number of elements to be computed * @param nbRepeat : number of repeat of the computation * @param maxNbThreadPerBlockX : maximum number of thread per block on X * @param maxNbBlockX : maximum number of block in the grid on X * @param warpSize : number of thread per warp */ void hadamard_product_cuda_clock(float * vecResult, const float * vectLeft, const float * vectRight, int nbElement, int nbRepeat, int maxNbThreadPerBlockX, int maxNbBlockX, int warpSize) { float *vecResultd = NULL, *vectLeftd = NULL, *vectRightd = NULL; int size = nbElement*sizeof(float); cudaMalloc((void**)&vecResultd, size); cudaMalloc((void**)&vectLeftd, size); cudaMalloc((void**)&vectRightd, size); int dimGridX = 1; int dimBlockX = 1; if(nbElement > maxNbThreadPerBlockX){ //au lieu d'utiliser plusieurs warps d'un block, on utilise plusieurs block dimBlockX = maxNbThreadPerBlockX; dimGridX = nbElement/dimBlockX; }else{ //sinon on utlise qu'un seul bloc dimBlockX = nbElement; } cudaMemcpy(vectLeftd, vectLeft, size, cudaMemcpyHostToDevice); PLIB_CUDA_CHECK_FILE cudaMemcpy(vectRightd, vectRight, size, cudaMemcpyHostToDevice); PLIB_CUDA_CHECK_FILE dim3 dimGrid(dimGridX, 1, 1); dim3 dimBlock(dimBlockX, 1, 1); float res = 0.0f; long unsigned int beginTime = rdtsc(); for(int i(0); i < nbRepeat; ++i){ hadamard_product_kernel<<<dimGrid, dimBlock>>>(vecResultd, vectLeftd, vectRightd, nbElement); |
Il ne faut pas oublier d'utiliser le résultat de notre calcul sinon le compilateur va le supprimer :
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 |
float val = 0.0; cudaMemcpy(&val, vecResultd, sizeof(float), cudaMemcpyDeviceToHost); res += val; } long unsigned int elapsedTime = (double)(rdtsc() - beginTime)/((double)nbRepeat); double cyclePerElement = ((double)elapsedTime)/((double)nbElement); PLIB_CUDA_CHECK_FILE cudaMemcpy(vecResult, vecResultd, size, cudaMemcpyDeviceToHost); res += vecResult[0]; printf("hadamard_product_cuda_clock : nbElement = %d, cyclePerElement = %lf cy/el, elapsedTime = %ld cy, res = %f\n", nbElement, cyclePerElement, elapsedTime, res); fprintf(stderr, "%d\t%f\t%ld\n", nbElement, cyclePerElement, elapsedTime); PLIB_CUDA_CHECK_FILE cudaFree(vectRightd); cudaFree(vectLeftd); cudaFree(vecResultd); } |
Le fichier hadamard_product_cuda.cu complet.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 |
/***************************************
	Auteur : Pierre Aubert
	Mail : pierre.aubert@lapp.in2p3.fr
	Licence : CeCILL-C
****************************************/
#include "timer.h"
#include "asterics_cuda.h"
#include "hadamard_product_cuda.h"

///Compute a Hadamard Product with Cuda
/**	@param[out] vecResult : result of the computing
 * 	@param vectLeft : vector of values
 * 	@param vectRight : vector of values
 * 	@param nbElement : number of elements to be computed
*/
__global__ void hadamard_product_kernel(float * vecResult, const float * vectLeft, const float * vectRight, int nbElement){
	//One dimensional global thread index
	int i = blockIdx.x*blockDim.x + threadIdx.x;
	//Guard: the last block may contain more threads than remaining elements
	if(i < nbElement){
		vecResult[i] = vectLeft[i] * vectRight[i];
	}
}

///Interface of the Hadamard product kernel
/**	@param[out] vecResult : result of the computing
 * 	@param vectLeft : vector of values
 * 	@param vectRight : vector of values
 * 	@param nbElement : number of elements to be computed
 * 	@param maxNbThreadPerBlockX : maximum number of thread per block on X
 * 	@param maxNbBlockX : maximum number of block in the grid on X (unused here)
 * 	@param warpSize : number of thread per warp (unused here)
*/
void hadamard_product_cuda(float * vecResult, const float * vectLeft, const float * vectRight, int nbElement,
	int maxNbThreadPerBlockX, int maxNbBlockX, int warpSize)
{
	float *vecResultd = NULL, *vectLeftd = NULL, *vectRightd = NULL;
	size_t sizeByte = nbElement*sizeof(float);
	cudaMalloc((void**)&vecResultd, sizeByte);
	cudaMalloc((void**)&vectLeftd, sizeByte);
	cudaMalloc((void**)&vectRightd, sizeByte);
	
	int dimGridX = 1;
	int dimBlockX = 1;
	if(nbElement > maxNbThreadPerBlockX){
		//Instead of using several warps of one block, use several blocks
		dimBlockX = maxNbThreadPerBlockX;
		//Ceiling division: without it the tail elements are silently skipped
		//when nbElement is not a multiple of dimBlockX (extra threads are
		//discarded by the guard in the kernel)
		dimGridX = (nbElement + dimBlockX - 1)/dimBlockX;
	}else{
		//Otherwise a single block is enough
		dimBlockX = nbElement;
	}
	cudaMemcpy(vectLeftd, vectLeft, sizeByte, cudaMemcpyHostToDevice);
	PLIB_CUDA_CHECK_FILE
	cudaMemcpy(vectRightd, vectRight, sizeByte, cudaMemcpyHostToDevice);
	PLIB_CUDA_CHECK_FILE
	dim3 dimGrid(dimGridX, 1, 1);
	dim3 dimBlock(dimBlockX, 1, 1);
	hadamard_product_kernel<<<dimGrid, dimBlock>>>(vecResultd, vectLeftd, vectRightd, nbElement);
	PLIB_CUDA_CHECK_FILE
	cudaMemcpy(vecResult, vecResultd, sizeByte, cudaMemcpyDeviceToHost);
	PLIB_CUDA_CHECK_FILE
	cudaFree(vectRightd);
	cudaFree(vectLeftd);
	cudaFree(vecResultd);
}

///Interface of the Hadamard product kernel, with timing
/**	@param[out] vecResult : result of the computing
 * 	@param vectLeft : vector of values
 * 	@param vectRight : vector of values
 * 	@param nbElement : number of elements to be computed
 * 	@param nbRepeat : number of repeat of the computation
 * 	@param maxNbThreadPerBlockX : maximum number of thread per block on X
 * 	@param maxNbBlockX : maximum number of block in the grid on X (unused here)
 * 	@param warpSize : number of thread per warp (unused here)
*/
void hadamard_product_cuda_clock(float * vecResult, const float * vectLeft, const float * vectRight, int nbElement, int nbRepeat,
	int maxNbThreadPerBlockX, int maxNbBlockX, int warpSize)
{
	float *vecResultd = NULL, *vectLeftd = NULL, *vectRightd = NULL;
	size_t size = nbElement*sizeof(float);
	cudaMalloc((void**)&vecResultd, size);
	cudaMalloc((void**)&vectLeftd, size);
	cudaMalloc((void**)&vectRightd, size);
	
	int dimGridX = 1;
	int dimBlockX = 1;
	if(nbElement > maxNbThreadPerBlockX){
		//Instead of using several warps of one block, use several blocks
		dimBlockX = maxNbThreadPerBlockX;
		//Ceiling division so the tail elements are also computed
		dimGridX = (nbElement + dimBlockX - 1)/dimBlockX;
	}else{
		//Otherwise a single block is enough
		dimBlockX = nbElement;
	}
	cudaMemcpy(vectLeftd, vectLeft, size, cudaMemcpyHostToDevice);
	PLIB_CUDA_CHECK_FILE
	cudaMemcpy(vectRightd, vectRight, size, cudaMemcpyHostToDevice);
	PLIB_CUDA_CHECK_FILE
	dim3 dimGrid(dimGridX, 1, 1);
	dim3 dimBlock(dimBlockX, 1, 1);
	float res = 0.0f;
	long unsigned int beginTime = rdtsc();
	for(int i(0); i < nbRepeat; ++i){
		hadamard_product_kernel<<<dimGrid, dimBlock>>>(vecResultd, vectLeftd, vectRightd, nbElement);
		//Read one element back so the result is actually used, otherwise
		//the compiler/driver is free to discard the whole computation.
		//Note: this blocking cudaMemcpy also synchronises with the kernel.
		float val = 0.0f;
		cudaMemcpy(&val, vecResultd, sizeof(float), cudaMemcpyDeviceToHost);
		res += val;
	}
	//Average number of cycles for one repetition of the computation
	long unsigned int elapsedTime = (long unsigned int)((double)(rdtsc() - beginTime)/((double)nbRepeat));
	double cyclePerElement = ((double)elapsedTime)/((double)nbElement);
	PLIB_CUDA_CHECK_FILE
	cudaMemcpy(vecResult, vecResultd, size, cudaMemcpyDeviceToHost);
	res += vecResult[0];
	//%lu matches the unsigned elapsedTime (%ld on an unsigned long is the wrong specifier)
	printf("hadamard_product_cuda_clock : nbElement = %d, cyclePerElement = %lf cy/el, elapsedTime = %lu cy, res = %f\n",
		nbElement, cyclePerElement, elapsedTime, res);
	fprintf(stderr, "%d\t%f\t%lu\n", nbElement, cyclePerElement, elapsedTime);
	PLIB_CUDA_CHECK_FILE
	cudaFree(vectRightd);
	cudaFree(vectLeftd);
	cudaFree(vecResultd);
}
Vous pouvez télécharger le fichier ici.