6 int len = atom*(hi-lo);
8 REAL32* x_ptr=&(
a->x_ptr[low]);
9 REAL32* y_ptr=&(
a->y_ptr[low]);
10 REAL32* z_ptr=&(
a->z_ptr[low]);
11 REAL32 a_re =
a->a_re;
12 REAL32 a_im =
a->a_im;
15 __m128d
sum = _mm_set_pd((
double)0,(
double)0);
16 __m128d dotprod = _mm_set_pd((
double)0,(
double)0);
17 __m128d mask = _mm_set_pd((
double)-1, (
double)1);
19 __m128 av_re = _mm_set_ps( a_re, a_re, a_re, a_re);
20 __m128 av_im = _mm_set_ps(-a_im, a_im, -a_im, a_im);
24 __m128 xv = _mm_load_ps(&x_ptr[
count]);
25 __m128 yv = _mm_load_ps(&y_ptr[
count]);
26 __m128 zv = _mm_load_ps(&z_ptr[
count]);
34 __m128 t1 = _mm_mul_ps(av_re, yv);
35 xv = _mm_sub_ps(xv,t1);
42 t1 = _mm_shuffle_ps(yv,yv,0xb1);
43 __m128 t2 = _mm_mul_ps(av_im, t1);
44 xv = _mm_add_ps(xv, t2);
46 _mm_store_ps(&x_ptr[
count], xv);
52 xlow = _mm_cvtps_pd(xv);
55 xv = _mm_shuffle_ps( xv,xv, 0x4e);
58 xhi = _mm_cvtps_pd(xv);
61 zlow = _mm_cvtps_pd(zv);
62 zv = _mm_shuffle_ps( zv,zv, 0x4e);
63 zhi = _mm_cvtps_pd(zv);
70 sum = _mm_add_pd(
sum,_mm_mul_pd(xlow,xlow));
71 sum = _mm_add_pd(
sum,_mm_mul_pd(xhi,xhi));
77 __m128d t1d = _mm_shuffle_pd(xlow, xlow, 0x0);
78 __m128d t2d = _mm_mul_pd(mask, t1d);
79 __m128d t3d = _mm_mul_pd(zlow, t2d);
81 dotprod = _mm_add_pd(dotprod, t3d);
86 t1d = _mm_shuffle_pd(xlow, xlow, 0x3);
87 t2d = _mm_shuffle_pd(zlow, zlow, 0x1);
88 t3d = _mm_mul_pd(t1d,t2d);
89 dotprod = _mm_add_pd(dotprod,t3d);
94 t1d = _mm_shuffle_pd(xhi,xhi,0x0);
95 t2d = _mm_mul_pd(mask,t1d);
96 t3d = _mm_mul_pd(zhi,t2d);
97 dotprod = _mm_add_pd(dotprod,t3d);
101 t1d = _mm_shuffle_pd(xhi,xhi, 0x3);
102 t2d = _mm_shuffle_pd(zhi,zhi,0x1);
103 t3d = _mm_mul_pd(t1d, t2d);
104 dotprod = _mm_add_pd(dotprod,t3d);
107 a->norm_space[3*my_id]=((
double *)&
sum)[0] + ((
double *)&
sum)[1];
108 a->norm_space[3*my_id+1]=((
double *)&dotprod)[0];
109 a->norm_space[3*my_id+2]=((
double *)&dotprod)[1];
112 QDPIO::cout <<
" ord_xmay_normx_cdotzx_kernel_sse.h: len not divisible by 4" << std::endl;
void ord_xmay_normx_cdotzx_kernel(int lo, int hi, int my_id, ord_xmay_normx_cdotzx_arg *a)