// NOTE(review): this is a fragmentary listing of the body of ord_xpaypbz_kernel
// (its signature is the last line of the excerpt). The integer that starts each
// statement appears to be the original source line number; the numbering has
// gaps, so loop headers, braces, and some statements are not visible here.
// Comments below describe only what the visible lines show.

// Length of the range handled by this call. `atom` is not defined in the
// visible lines.
9 int len = atom*(hi - lo);
// Base pointers into the three arrays held by the argument struct, offset by
// `low`. `low` is not defined in the visible lines.
// NOTE(review): presumably derived from `lo` -- confirm against the full file.
11 REAL32* x_ptr = &(
a->x_ptr[low]);
12 REAL32* y_ptr = &(
a->y_ptr[low]);
13 REAL32* z_ptr = &(
a->z_ptr[low]);
// Local copies of the four scalar coefficients stored in the argument struct.
15 REAL32 a_re =
a->a_re;
16 REAL32 a_im =
a->a_im;
17 REAL32 b_re =
a->b_re;
18 REAL32 b_im =
a->b_im;
// Coefficient vectors. _mm_set_ps takes its arguments from the highest lane
// down to the lowest, so the *_im vectors hold (-im, +im, -im, +im) in
// lanes 0..3, while the *_re vectors hold the same value in every lane.
20 __m128 bv_re = _mm_set_ps(b_re, b_re, b_re, b_re);
21 __m128 bv_im = _mm_set_ps(b_im, -b_im, b_im, -b_im);
23 __m128 av_re = _mm_set_ps(a_re, a_re, a_re, a_re);
24 __m128 av_im = _mm_set_ps(a_im, -a_im, a_im, -a_im);
// Load four consecutive floats from each array at index `count`.
// _mm_load_ps requires a 16-byte-aligned address. `count` is not declared in
// the visible lines (presumably the loop index -- confirm).
28 __m128 xv = _mm_load_ps(&x_ptr[
count]);
29 __m128 yv = _mm_load_ps(&y_ptr[
count]);
30 __m128 zv = _mm_load_ps(&z_ptr[
count]);
// xv += a_re*yv + av_im*swap(yv), where shuffle mask 0xb1 swaps the two
// elements of each adjacent pair. Per pair this gives
//   even lane: x + a_re*y[even] - a_im*y[odd]
//   odd  lane: x + a_re*y[odd]  + a_im*y[even]
// i.e. a complex multiply-accumulate if each even/odd pair is a (re, im)
// value -- layout presumed from the variable names; confirm.
35 __m128 t1 = _mm_shuffle_ps(yv,yv, 0xb1);
36 __m128 t2 = _mm_mul_ps(av_re, yv);
37 __m128 t3 = _mm_add_ps(xv,t2);
38 __m128 t4 = _mm_mul_ps(av_im, t1);
39 xv = _mm_add_ps(t3, t4);
// Same pattern with the b coefficients and zv, accumulated onto the updated xv.
41 t1 = _mm_shuffle_ps(zv,zv,0xb1);
42 t2 = _mm_mul_ps(bv_re, zv);
43 t3 = _mm_add_ps(xv, t2);
44 t4 = _mm_mul_ps(bv_im, t1);
45 xv = _mm_add_ps(t3,t4);
// Write the four results back over the x values that were loaded (aligned store).
47 _mm_store_ps(&x_ptr[
count], xv);
// Scalar arithmetic on elements count .. count+3.
// NOTE(review): how this section relates to the SSE section above (alternative
// path, disabled code, ...) cannot be determined from the visible lines.
52 REAL32 tmp_re, tmp_im;
53 REAL32 tmp_re2, tmp_im2;
// The initial assignments to tmp_re / tmp_im are not visible in this excerpt;
// only the a_im cross-term updates are shown.
56 tmp_re -= a_im*y_ptr[
count+1];
58 tmp_im += a_im*y_ptr[
count];
// Second pair (elements count+2, count+3): x plus the a_re and a_im terms of y.
62 tmp_re2 = x_ptr[
count+2] + a_re*y_ptr[
count+2];
63 tmp_re2 -= a_im*y_ptr[
count+3];
64 tmp_im2 = x_ptr[
count+3] + a_re*y_ptr[
count+3];
65 tmp_im2 += a_im*y_ptr[
count+2];
// Store the second pair with the b_re*z term added. Any b_im terms and the
// stores for elements count / count+1 are not visible in this excerpt.
74 x_ptr[
count+2] = tmp_re2 + b_re*z_ptr[
count+2] ;
76 x_ptr[
count+3] = tmp_im2 + b_re*z_ptr[
count+3];
// Diagnostic message about len not being divisible by 4; the condition that
// guards it is not visible here.
83 QDPIO::cout <<
"ord_xpaypbz_kernel_sse.h: len not divisible by 4" << std::endl;
// Signature of the kernel these statements belong to. `my_id` is not used in
// any visible line. The signature's position at the end of the listing looks
// like an extraction artifact.
void ord_xpaypbz_kernel(int lo, int hi, int my_id, ord_xpaypbz_arg *a)