// NOTE(review): this appears to be a line-numbered extract of the body of
// ord_norm2x_cdotxy_kernel (its signature is on the final line). The leading
// integers look like original source line numbers; the gaps between them mean
// some lines (loop header, braces, declarations of low / count / xlow / xhi /
// ylow / yhi) are not visible here -- confirm against the real header.

// len is the range width (hi - lo) scaled by atom; atom is defined outside
// this view.
9 int len = atom*(hi-lo);
// Base pointers into the x and y arrays of the argument struct, offset by low.
11 REAL32* x_ptr = &(
a->x_ptr[low]);
12 REAL32* y_ptr = &(
a->y_ptr[low]);
// Zero-initialised scratch array; not referenced by any line visible here.
13 REAL64 norm_array[3] = {0,0,0};
// Double-precision accumulators, both lanes starting at zero.
15 __m128d
sum = _mm_set_pd((
double)0,(
double)0);
16 __m128d dotprod = _mm_set_pd((
double)0,(
double)0);
// _mm_set_pd takes (high, low): lane 0 holds +1, lane 1 holds -1.
17 __m128d mask = _mm_set_pd((
double)-1, (
double)1);
// Load four consecutive floats from x and from y at index count.
// _mm_load_ps requires the address to be 16-byte aligned.
23 __m128 xv = _mm_load_ps(&x_ptr[
count]);
24 __m128 yv = _mm_load_ps(&y_ptr[
count]);
// Widen the lower two floats of xv to doubles.
30 xlow = _mm_cvtps_pd(xv);
// 0x4e swaps the upper and lower 64-bit halves, so the former upper two
// floats become the lower two and can be widened next.
33 xv = _mm_shuffle_ps( xv,xv, 0x4e);
36 xhi = _mm_cvtps_pd(xv);
// Same widening sequence for the four y values.
39 ylow = _mm_cvtps_pd(yv);
40 yv = _mm_shuffle_ps( yv,yv, 0x4e);
41 yhi = _mm_cvtps_pd(yv);
// Accumulate the squares of all four x values (two per lane pair).
43 sum = _mm_add_pd(
sum,_mm_mul_pd(xlow,xlow));
44 sum = _mm_add_pd(
sum,_mm_mul_pd(xhi,xhi));
// First half of the product for the low pair:
//   t1 = (ylow[0], ylow[0]); t2 = (ylow[0], -ylow[0]);
//   t3 = (xlow[0]*ylow[0], -xlow[1]*ylow[0]).
// NOTE(review): presumably each pair of lanes is one complex number stored as
// (re, im), in which case the two halves together add conj(x)*y -- confirm.
57 __m128d t1 = _mm_shuffle_pd(ylow, ylow, 0x0);
58 __m128d t2 = _mm_mul_pd(mask, t1);
59 __m128d t3 = _mm_mul_pd(xlow, t2);
60 dotprod = _mm_add_pd(dotprod, t3);
// Second half for the low pair:
//   t1 = (ylow[1], ylow[1]); t2 = (xlow[1], xlow[0]) (lanes swapped);
//   t3 = (xlow[1]*ylow[1], xlow[0]*ylow[1]).
64 t1 = _mm_shuffle_pd(ylow, ylow, 0x3);
65 t2 = _mm_shuffle_pd(xlow, xlow, 0x1);
66 t3 = _mm_mul_pd(t1,t2);
67 dotprod = _mm_add_pd(dotprod,t3);
// Repeat the same two-step accumulation for the high pair (xhi, yhi).
72 t1 = _mm_shuffle_pd(yhi,yhi,0x0);
73 t2 = _mm_mul_pd(mask,t1);
74 t3 = _mm_mul_pd(xhi,t2);
75 dotprod = _mm_add_pd(dotprod,t3);
79 t1 = _mm_shuffle_pd(yhi,yhi, 0x3);
80 t2 = _mm_shuffle_pd(xhi,xhi,0x1);
81 t3 = _mm_mul_pd(t1, t2);
82 dotprod = _mm_add_pd(dotprod,t3);
// Write this worker's three results into its slot of a->norm_space:
// slot 3*my_id gets the horizontal sum of both lanes of sum; the lanes are
// read by viewing the __m128d through a double pointer.
86 a->norm_space[3*my_id]=((
double *)&
sum)[0] + ((
double *)&
sum)[1];
// Slots 3*my_id+1 and 3*my_id+2 get lane 0 and lane 1 of dotprod.
87 a->norm_space[3*my_id+1]=((
double *)&dotprod)[0];
88 a->norm_space[3*my_id+2]=((
double *)&dotprod)[1];
// Diagnostic message; the condition that guards it is not visible in this
// extract.
91 QDPIO::cout <<
"ord_norm2x_cdotxy_kernel_sse.h: len not divisible by 4" << std::endl;
// Signature of the kernel the fragments above appear to belong to: lo/hi bound
// the range, my_id selects the output slot in a->norm_space.
void ord_norm2x_cdotxy_kernel(int lo, int hi, int my_id, ord_norm2x_cdotxy_arg *a)