3.3.2.1.2 : Le fichier hadamard_product_cuda.cu
Développons le fichier hadamard_product_cuda.cu :
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 |
#include "timer.h" #include "asterics_cuda.h" #include "hadamard_product_cuda.h" ///Compute a Hadamard Product with Cuda /** @param[out] vecResult : result of the computing * @param vectLeft : vector of values * @param vectRight : vector of values * @param nbElement : number of elements to be computed */ __global__ void hadamard_product_kernel(float * vecResult, const float * vectLeft, const float * vectRight, int nbElement){ // identifiant de thread à deux dimensions, comme la matrice int i = blockIdx.x*blockDim.x + threadIdx.x; // if(i < nbElement) vecResult[i] = vectLeft[i] * vectRight[i]; } ///Interface of the Hadamard product kernel /** @param[out] vecResult : result of the computing * @param vectLeft : vector of values * @param vectRight : vector of values * @param nbElement : number of elements to be computed * @param maxNbThreadPerBlockX : maximum number of thread per block on X * @param maxNbBlockX : maximum number of block in the grid on X * @param warpSize : number of thread per warp */ void hadamard_product_cuda(float * vecResult, const float * vectLeft, const float * vectRight, int nbElement, int maxNbThreadPerBlockX, int maxNbBlockX, int warpSize) { float *vecResultd = NULL, *vectLeftd = NULL, *vectRightd = NULL; int sizeByte = nbElement*sizeof(float); cudaMalloc((void**)&vecResultd, sizeByte); cudaMalloc((void**)&vectLeftd, sizeByte); cudaMalloc((void**)&vectRightd, sizeByte); int dimGridX = 1; int dimBlockX = 1; if(nbElement > maxNbThreadPerBlockX){ //au lieu d'utiliser plusieurs warps d'un block, on utilise plusieurs block dimBlockX = maxNbThreadPerBlockX; dimGridX = nbElement/dimBlockX; }else{ //sinon on utlise qu'un seul bloc dimBlockX = nbElement; } cudaMemcpy(vectLeftd, vectLeft, sizeByte, cudaMemcpyHostToDevice); PLIB_CUDA_CHECK_FILE cudaMemcpy(vectRightd, vectRight, sizeByte, cudaMemcpyHostToDevice); PLIB_CUDA_CHECK_FILE dim3 dimGrid(dimGridX, 1, 1); dim3 dimBlock(dimBlockX, 1, 1); hadamard_product_kernel<<<dimGrid, dimBlock>>>(vecResultd, 
vectLeftd, vectRightd, nbElement); PLIB_CUDA_CHECK_FILE cudaMemcpy(vecResult, vecResultd, sizeByte, cudaMemcpyDeviceToHost); PLIB_CUDA_CHECK_FILE cudaFree(vectRightd); cudaFree(vectLeftd); cudaFree(vecResultd); } ///Interface of the Hadamard product kernel /** @param[out] vecResult : result of the computing * @param vectLeft : vector of values * @param vectRight : vector of values * @param nbElement : number of elements to be computed * @param nbRepeat : number of repeat of the computation * @param maxNbThreadPerBlockX : maximum number of thread per block on X * @param maxNbBlockX : maximum number of block in the grid on X * @param warpSize : number of thread per warp */ void hadamard_product_cuda_clock(float * vecResult, const float * vectLeft, const float * vectRight, int nbElement, int nbRepeat, int maxNbThreadPerBlockX, int maxNbBlockX, int warpSize) { float *vecResultd = NULL, *vectLeftd = NULL, *vectRightd = NULL; int size = nbElement*sizeof(float); cudaMalloc((void**)&vecResultd, size); cudaMalloc((void**)&vectLeftd, size); cudaMalloc((void**)&vectRightd, size); int dimGridX = 1; int dimBlockX = 1; if(nbElement > maxNbThreadPerBlockX){ //au lieu d'utiliser plusieurs warps d'un block, on utilise plusieurs block dimBlockX = maxNbThreadPerBlockX; dimGridX = nbElement/dimBlockX; }else{ //sinon on utlise qu'un seul bloc dimBlockX = nbElement; } cudaMemcpy(vectLeftd, vectLeft, size, cudaMemcpyHostToDevice); PLIB_CUDA_CHECK_FILE cudaMemcpy(vectRightd, vectRight, size, cudaMemcpyHostToDevice); PLIB_CUDA_CHECK_FILE dim3 dimGrid(dimGridX, 1, 1); dim3 dimBlock(dimBlockX, 1, 1); float res = 0.0f; long unsigned int beginTime = rdtsc(); for(int i(0); i < nbRepeat; ++i){ hadamard_product_kernel<<<dimGrid, dimBlock>>>(vecResultd, vectLeftd, vectRightd, nbElement); |
Il ne faut pas oublier d'utiliser le résultat de notre calcul sinon le compilateur va le supprimer :
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 |
float val = 0.0; cudaMemcpy(&val, vecResultd, sizeof(float), cudaMemcpyDeviceToHost); res += val; } long unsigned int elapsedTime = (double)(rdtsc() - beginTime)/((double)nbRepeat); double cyclePerElement = ((double)elapsedTime)/((double)nbElement); PLIB_CUDA_CHECK_FILE cudaMemcpy(vecResult, vecResultd, size, cudaMemcpyDeviceToHost); res += vecResult[0]; printf("hadamard_product_cuda_clock : nbElement = %d, cyclePerElement = %lf cy/el, elapsedTime = %ld cy, res = %f\n", nbElement, cyclePerElement, elapsedTime, res); fprintf(stderr, "%d\t%f\t%ld\n", nbElement, cyclePerElement, elapsedTime); PLIB_CUDA_CHECK_FILE cudaFree(vectRightd); cudaFree(vectLeftd); cudaFree(vecResultd); } |
Le fichier hadamard_product_cuda.cu complet.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 |
/***************************************
	Auteur : Pierre Aubert
	Mail : pierre.aubert@lapp.in2p3.fr
	Licence : CeCILL-C
****************************************/
#include "timer.h"
#include "asterics_cuda.h"
#include "hadamard_product_cuda.h"

///Compute a Hadamard Product with Cuda
/**	@param[out] vecResult : result of the computing
 * 	@param vectLeft : vector of values
 * 	@param vectRight : vector of values
 * 	@param nbElement : number of elements to be computed
*/
__global__ void hadamard_product_kernel(float * vecResult, const float * vectLeft, const float * vectRight, int nbElement){
	//One dimensional global thread index
	int i = blockIdx.x*blockDim.x + threadIdx.x;
	//Guard: the last block may contain more threads than remaining elements
	if(i < nbElement){
		vecResult[i] = vectLeft[i] * vectRight[i];
	}
}

///Interface of the Hadamard product kernel
/**	@param[out] vecResult : result of the computing
 * 	@param vectLeft : vector of values
 * 	@param vectRight : vector of values
 * 	@param nbElement : number of elements to be computed
 * 	@param maxNbThreadPerBlockX : maximum number of thread per block on X
 * 	@param maxNbBlockX : maximum number of block in the grid on X (unused here)
 * 	@param warpSize : number of thread per warp (unused here)
*/
void hadamard_product_cuda(float * vecResult, const float * vectLeft, const float * vectRight, int nbElement,
	int maxNbThreadPerBlockX, int maxNbBlockX, int warpSize)
{
	float *vecResultd = NULL, *vectLeftd = NULL, *vectRightd = NULL;
	size_t sizeByte = nbElement*sizeof(float);
	cudaMalloc((void**)&vecResultd, sizeByte);
	cudaMalloc((void**)&vectLeftd, sizeByte);
	cudaMalloc((void**)&vectRightd, sizeByte);
	
	int dimGridX = 1;
	int dimBlockX = 1;
	if(nbElement > maxNbThreadPerBlockX){
		//Instead of using several warps of one block, use several blocks
		dimBlockX = maxNbThreadPerBlockX;
		//Ceiling division: without it the tail elements are silently skipped
		//when nbElement is not a multiple of dimBlockX (extra threads are
		//discarded by the guard in the kernel)
		dimGridX = (nbElement + dimBlockX - 1)/dimBlockX;
	}else{
		//Otherwise a single block is enough
		dimBlockX = nbElement;
	}
	cudaMemcpy(vectLeftd, vectLeft, sizeByte, cudaMemcpyHostToDevice);
	PLIB_CUDA_CHECK_FILE
	cudaMemcpy(vectRightd, vectRight, sizeByte, cudaMemcpyHostToDevice);
	PLIB_CUDA_CHECK_FILE
	dim3 dimGrid(dimGridX, 1, 1);
	dim3 dimBlock(dimBlockX, 1, 1);
	hadamard_product_kernel<<<dimGrid, dimBlock>>>(vecResultd, vectLeftd, vectRightd, nbElement);
	PLIB_CUDA_CHECK_FILE
	cudaMemcpy(vecResult, vecResultd, sizeByte, cudaMemcpyDeviceToHost);
	PLIB_CUDA_CHECK_FILE
	cudaFree(vectRightd);
	cudaFree(vectLeftd);
	cudaFree(vecResultd);
}

///Interface of the Hadamard product kernel, with timing
/**	@param[out] vecResult : result of the computing
 * 	@param vectLeft : vector of values
 * 	@param vectRight : vector of values
 * 	@param nbElement : number of elements to be computed
 * 	@param nbRepeat : number of repeat of the computation
 * 	@param maxNbThreadPerBlockX : maximum number of thread per block on X
 * 	@param maxNbBlockX : maximum number of block in the grid on X (unused here)
 * 	@param warpSize : number of thread per warp (unused here)
*/
void hadamard_product_cuda_clock(float * vecResult, const float * vectLeft, const float * vectRight, int nbElement, int nbRepeat,
	int maxNbThreadPerBlockX, int maxNbBlockX, int warpSize)
{
	float *vecResultd = NULL, *vectLeftd = NULL, *vectRightd = NULL;
	size_t size = nbElement*sizeof(float);
	cudaMalloc((void**)&vecResultd, size);
	cudaMalloc((void**)&vectLeftd, size);
	cudaMalloc((void**)&vectRightd, size);
	
	int dimGridX = 1;
	int dimBlockX = 1;
	if(nbElement > maxNbThreadPerBlockX){
		//Instead of using several warps of one block, use several blocks
		dimBlockX = maxNbThreadPerBlockX;
		//Ceiling division so the tail elements are also computed
		dimGridX = (nbElement + dimBlockX - 1)/dimBlockX;
	}else{
		//Otherwise a single block is enough
		dimBlockX = nbElement;
	}
	cudaMemcpy(vectLeftd, vectLeft, size, cudaMemcpyHostToDevice);
	PLIB_CUDA_CHECK_FILE
	cudaMemcpy(vectRightd, vectRight, size, cudaMemcpyHostToDevice);
	PLIB_CUDA_CHECK_FILE
	dim3 dimGrid(dimGridX, 1, 1);
	dim3 dimBlock(dimBlockX, 1, 1);
	float res = 0.0f;
	long unsigned int beginTime = rdtsc();
	for(int i(0); i < nbRepeat; ++i){
		hadamard_product_kernel<<<dimGrid, dimBlock>>>(vecResultd, vectLeftd, vectRightd, nbElement);
		//Read one element back so the result is actually used, otherwise
		//the compiler/driver is free to discard the whole computation.
		//Note: this blocking cudaMemcpy also synchronises with the kernel.
		float val = 0.0f;
		cudaMemcpy(&val, vecResultd, sizeof(float), cudaMemcpyDeviceToHost);
		res += val;
	}
	//Average number of cycles for one repetition of the computation
	long unsigned int elapsedTime = (long unsigned int)((double)(rdtsc() - beginTime)/((double)nbRepeat));
	double cyclePerElement = ((double)elapsedTime)/((double)nbElement);
	PLIB_CUDA_CHECK_FILE
	cudaMemcpy(vecResult, vecResultd, size, cudaMemcpyDeviceToHost);
	res += vecResult[0];
	//%lu matches the unsigned elapsedTime (%ld on an unsigned long is the wrong specifier)
	printf("hadamard_product_cuda_clock : nbElement = %d, cyclePerElement = %lf cy/el, elapsedTime = %lu cy, res = %f\n",
		nbElement, cyclePerElement, elapsedTime, res);
	fprintf(stderr, "%d\t%f\t%lu\n", nbElement, cyclePerElement, elapsedTime);
	PLIB_CUDA_CHECK_FILE
	cudaFree(vectRightd);
	cudaFree(vectLeftd);
	cudaFree(vecResultd);
}
Vous pouvez télécharger le fichier ici.