1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
|
#include <immintrin.h>
#include "barycentre_intrinsics.h"
///Compute the barycentre of the x,y,a tables
/** @param[out] gx : barycentre on X axis
* @param[out] gy : barycentre on Y axis
* @param tabX : coordinates on X axis
* @param tabY : coordinates on Y axis
* @param tabA : signal amplitude
* @param nbElement : number of elements of the input tables
*/
void barycentre(float & gx, float & gy, const float * tabX, const float* tabY, const float* tabA, long unsigned int nbElement){
gx = 0.0f;
gy = 0.0f;
long unsigned int vecSize(VECTOR_ALIGNEMENT/sizeof(float));
long unsigned int nbVec(nbElement/(vecSize*2lu));
__m256 vecResX1 = _mm256_broadcast_ss(&gx);
__m256 vecResX2 = _mm256_broadcast_ss(&gx);
__m256 vecResY1 = _mm256_broadcast_ss(&gy);
__m256 vecResY2 = _mm256_broadcast_ss(&gy);
for(long unsigned int i(0lu); i < nbVec; ++i){
__m256 vecValueA1 = _mm256_load_ps(tabA + 2lu*i*vecSize);
__m256 vecValueX1 = _mm256_load_ps(tabX + 2lu*i*vecSize);
__m256 vecAX1 = _mm256_mul_ps(vecValueA1, vecValueX1);
vecResX1 = _mm256_add_ps(vecResX1, vecAX1);
__m256 vecValueY1 = _mm256_load_ps(tabY + 2lu*i*vecSize);
__m256 vecAY1 = _mm256_mul_ps(vecValueA1, vecValueY1);
vecResY1 = _mm256_add_ps(vecResY1, vecAY1);
__m256 vecValueA2 = _mm256_load_ps(tabA + (2lu*i + 1lu)*vecSize);
__m256 vecValueX2 = _mm256_load_ps(tabX + (2lu*i + 1lu)*vecSize);
__m256 vecAX2 = _mm256_mul_ps(vecValueA2, vecValueX2);
vecResX2 = _mm256_add_ps(vecResX2, vecAX2);
__m256 vecValueY2 = _mm256_load_ps(tabY + (2lu*i + 1lu)*vecSize);
__m256 vecAY2 = _mm256_mul_ps(vecValueA2, vecValueY2);
vecResY2 = _mm256_add_ps(vecResY2, vecAY2);
}
__m256 vecResX = _mm256_add_ps(vecResX1, vecResX2);
float tmp[8lu];
_mm256_storeu_ps(tmp, vecResX);
for(long unsigned int i(0lu); i < 8lu; ++i){
gx += tmp[i];
}
gx /= (float)nbElement;
__m256 vecResY = _mm256_add_ps(vecResY1, vecResY2);
_mm256_storeu_ps(tmp, vecResY);
for(long unsigned int i(0lu); i < 8lu; ++i){
gy += tmp[i];
}
gy /= (float)nbElement;
}
|