// Number of REAL32 elements to process: `atom` elements for each index in
// [lo, hi). NOTE(review): `atom` and `low` are declared outside this excerpt.
10 int len = atom*(hi - lo);
// Local views into the x and y arrays of the argument struct, both starting
// at element offset `low`.
12 REAL32* x_ptr = &(arg->x_ptr[low]);
13 REAL32* y_ptr = &(arg->y_ptr[low]);
// Scalar coefficient taken from the argument struct (presumably the real and
// imaginary parts of a complex scalar `a`, going by the field names).
15 REAL32 a_re = arg->a_re;
16 REAL32 a_im = arg->a_im;
// a_re broadcast to all four lanes.
18 __m128 av_re = _mm_set_ps(a_re, a_re, a_re, a_re);
// _mm_set_ps lists lanes from highest to lowest, so lanes 0..3 hold
// { a_im, -a_im, a_im, -a_im }.
19 __m128 av_im = _mm_set_ps(-a_im,a_im,-a_im, a_im);
// Load four consecutive floats from x and from y at offset `count`.
// _mm_load_ps requires 16-byte-aligned addresses.
// NOTE(review): the loop that declares and advances `count` is not visible
// in this excerpt.
23 __m128 xv = _mm_load_ps(&x_ptr[
count]);
24 __m128 yv = _mm_load_ps(&y_ptr[
count]);
// 0xb1 swaps the two floats within each adjacent pair:
// { y0, y1, y2, y3 } -> { y1, y0, y3, y2 }.
25 __m128 yv2 = _mm_shuffle_ps( yv,yv, 0xb1);
// Lane-wise: xv = (xv - a_re*yv) + av_im*yv2.
// Assuming the data is stored as interleaved (re, im) pairs (TODO confirm),
// this gives, per complex element:
//   re: x_re - a_re*y_re + a_im*y_im
//   im: x_im - a_re*y_im - a_im*y_re
// i.e. x = x - a*y for two complex values per vector.
27 __m128 t1 = _mm_mul_ps(av_re, yv);
28 __m128 t2 = _mm_sub_ps(xv, t1);
29 __m128 t3 = _mm_mul_ps(av_im, yv2);
30 xv = _mm_add_ps(t2, t3);
// Write the four updated floats back over the x data (aligned store).
32 _mm_store_ps(&x_ptr[
count], xv);
// Diagnostic message. NOTE(review): the condition guarding this statement is
// not visible here; going by the message text it is presumably the case
// where `len` is not a multiple of 4 — confirm against the full file.
37 QDPIO::cout <<
"ord_cxmayf_kernel_sse.h: len not divisible by 4 " << std::endl;
void ord_cxmayf_kernel(int lo, int hi, int my_id, ord_cxmayf_arg *arg)