9 int len = atom*(hi - lo);
11 REAL32* x_ptr = &(
a->x_ptr[low]);
12 REAL32* y_ptr = &(
a->y_ptr[low]);
13 REAL32* z_ptr = &(
a->z_ptr[low]);
15 REAL32 a_re =
a->a_re;
16 REAL32 a_im =
a->a_im;
17 REAL32 b_re =
a->b_re;
18 REAL32 b_im =
a->b_im;
25 __m128 av_re = _mm_set_ps(a_re, a_re, a_re, a_re);
26 __m128 av_im = _mm_set_ps(a_im, -a_im, a_im,-a_im);
27 __m128 bv_re = _mm_set_ps(b_re, b_re, b_re, b_re);
28 __m128 bv_im = _mm_set_ps(-b_im, b_im, -b_im, b_im);
33 __m128 xv = _mm_load_ps(&x_ptr[
count]);
34 __m128 yv = _mm_load_ps(&y_ptr[
count]);
35 __m128 zv = _mm_load_ps(&z_ptr[
count]);
43 zv2 = _mm_shuffle_ps(zv2,zv2,0xb1);
47 __m128 t1 = _mm_mul_ps(bv_re, zv);
48 __m128 t2 = _mm_sub_ps(yv,t1);
50 __m128 t3 = _mm_mul_ps(bv_im, zv2);
51 t2 = _mm_add_ps(t2,t3);
56 zv2 = _mm_shuffle_ps(zv2,zv2,0xb1);
59 t1 = _mm_mul_ps(av_re, t2);
60 yv = _mm_add_ps(xv, t1);
61 t3 = _mm_mul_ps(av_im, zv2);
62 yv = _mm_add_ps(yv, t3);
64 _mm_store_ps(&y_ptr[
count], yv);
70 QDPIO::cout <<
"ord_yxpaymabz_kernel_sse.h: len not divisible by 4" << std::endl;
void ord_yxpaymabz_kernel(int lo, int hi, int my_id, ord_yxpaymabz_arg *a)