// Fragment of a single-precision (REAL32 / __m128) update kernel body.
// The leading number on each statement is a line number carried over from the
// original listing. The enclosing function header, the loop that advances
// `count`, and the definitions of `atom`, `low` and `count` are not visible here.
//
// Per 4-float vector starting at index `count`, the visible arithmetic is:
//   r = s - om_re*t + om_im_vec  * swap_pairs(t)
//   x = x + om_re*s + z + mom_im_vec * swap_pairs(s)
// NOTE(review): this matches complex r = s - omega*t and x += omega*s + z if the
// arrays hold interleaved (re, im) pairs -- presumably so; confirm against caller.

// Number of scalar elements spanned by [lo, hi), scaled by `atom`.
12 int len = atom*(hi - lo);
// Base pointers: each starts at element `low` of the matching array in *a.
14 REAL32*
s = &(
a->s_ptr[low]);
15 REAL32*
t = &(
a->t_ptr[low]);
16 REAL32*
z = &(
a->z_ptr[low]);
17 REAL32*
r = &(
a->r_ptr[low]);
18 REAL32*
x = &(
a->x_ptr[low]);
// Local copies of the two omega components read from the argument struct.
20 REAL32 om_re =
a->omega_re;
21 REAL32 om_im =
a->omega_im;
// Broadcast om_re into all four lanes.
23 __m128 om_re_vec = _mm_set_ps(om_re, om_re, om_re, om_re);
// _mm_set_ps lists lanes from highest to lowest, so lanes 0..3 hold
// (+om_im, -om_im, +om_im, -om_im).
24 __m128 om_im_vec = _mm_set_ps(-om_im, om_im, -om_im, om_im);
// Sign-flipped counterpart: lanes 0..3 hold (-om_im, +om_im, -om_im, +om_im).
25 __m128 mom_im_vec = _mm_set_ps(om_im, -om_im, om_im, -om_im);
// Aligned loads: &t[count] and &s[count] must be 16-byte aligned.
37 __m128 t_vec = _mm_load_ps(&
t[
count]);
38 __m128 s_vec = _mm_load_ps(&
s[
count]);
// om_re * t, lane by lane.
40 __m128 tmpv1 = _mm_mul_ps(om_re_vec, t_vec);
// 0xb1 swaps the two floats inside each adjacent pair: (t1, t0, t3, t2).
42 __m128 tmpv2 = _mm_shuffle_ps(t_vec, t_vec, 0xb1);
// r = s - om_re*t
43 __m128 r_vec = _mm_sub_ps(s_vec, tmpv1);
// Add the cross term: +om_im*t[odd] in even lanes, -om_im*t[even] in odd lanes.
45 tmpv1 = _mm_mul_ps(om_im_vec,tmpv2);
46 r_vec = _mm_add_ps(r_vec, tmpv1);
// Aligned store of the four results into r.
47 _mm_store_ps(&
r[
count], r_vec);
// Second update: accumulate into x using the s_vec loaded above.
58 __m128 xvec = _mm_load_ps(&
x[
count]);
61 __m128 zvec = _mm_load_ps(&
z[
count]);
// x += om_re * s
64 tmpv1 = _mm_mul_ps(om_re_vec, s_vec);
65 xvec = _mm_add_ps(xvec, tmpv1);
// Pair-swapped s for the cross term; x += z is interleaved between the steps.
67 tmpv2 = _mm_shuffle_ps(s_vec, s_vec, 0xb1);
68 xvec = _mm_add_ps(xvec,zvec);
// Cross term with the opposite sign pattern: -om_im*s[odd] in even lanes,
// +om_im*s[even] in odd lanes.
70 tmpv1 = _mm_mul_ps(mom_im_vec,tmpv2);
71 xvec = _mm_add_ps(xvec, tmpv1);
// Write the updated x back in place.
73 _mm_store_ps(&
x[
count], xvec);
// Diagnostic printed through QDPIO::cout. The condition guarding it is not
// visible here -- presumably taken when len % 4 != 0; confirm.
78 QDPIO::cout <<
"ord_ib_rxupdate_kernel_sse.h: len not divisible by 4" << std::endl;
// Fragment of a double-precision (REAL64 / __m128d) update kernel body.
// The leading number on each statement is a line number carried over from the
// original listing. The enclosing function header, the loop that advances
// `count`, and the definitions of `atom`, `low` and `count` are not visible here.
//
// Per 2-double vector starting at index `count`, the visible arithmetic is:
//   r = s - om_re*t + om_im_vec  * swap(t)
//   x = x + om_re*s + z + mom_im_vec * swap(s)
// NOTE(review): this matches complex r = s - omega*t and x += omega*s + z if each
// vector holds one (re, im) pair -- presumably so; confirm against caller.

// Number of scalar elements spanned by [lo, hi), scaled by `atom`.
// NOTE(review): no terminating ';' on this statement in this view -- possibly an
// artifact of how the listing was extracted; confirm against the real file.
95 int len = atom*(hi - lo)
// Base pointers: each starts at element `low` of the matching array in *a.
97 REAL64*
s = &(
a->s_ptr[low]);
98 REAL64*
t = &(
a->t_ptr[low]);
99 REAL64*
z = &(
a->z_ptr[low]);
100 REAL64*
r = &(
a->r_ptr[low]);
101 REAL64*
x = &(
a->x_ptr[low]);
// Local copies of the two omega components read from the argument struct.
103 REAL64 om_re =
a->omega_re;
104 REAL64 om_im =
a->omega_im;
// Broadcast om_re into both lanes.
108 __m128d om_re_vec = _mm_set_pd(om_re, om_re);
// _mm_set_pd lists the high lane first, so lane 0 = +om_im and lane 1 = -om_im.
109 __m128d om_im_vec = _mm_set_pd(-om_im, om_im);
// Sign-flipped counterpart: lane 0 = -om_im, lane 1 = +om_im.
110 __m128d mom_im_vec = _mm_set_pd(om_im, -om_im);
// Aligned loads: &s[count] and &t[count] must be 16-byte aligned.
119 __m128d svec = _mm_load_pd(&
s[
count]);
120 __m128d tvec = _mm_load_pd(&
t[
count]);
// r = s - om_re*t
122 __m128d tmpv2 = _mm_mul_pd(om_re_vec,tvec);
123 __m128d rvec = _mm_sub_pd(svec, tmpv2);
// Immediate 0x1 with both operands equal swaps the two doubles: (t1, t0).
126 __m128d tmpv1 = _mm_shuffle_pd(tvec,tvec,0x1);
// Cross term: +om_im*t1 in lane 0, -om_im*t0 in lane 1.
127 tmpv2 = _mm_mul_pd(om_im_vec,tmpv1);
128 rvec = _mm_add_pd(rvec, tmpv2);
// Aligned store of the two results into r.
129 _mm_store_pd(&
r[
count], rvec);
// Second update: accumulate into x using the svec loaded above.
138 __m128d xvec = _mm_load_pd(&
x[
count]);
139 __m128d zvec = _mm_load_pd(&
z[
count]);
// x += om_re * s
141 tmpv1 = _mm_mul_pd(om_re_vec, svec);
142 xvec = _mm_add_pd(xvec,tmpv1);
// Swapped s for the cross term; x += z is interleaved between the steps.
144 tmpv2 = _mm_shuffle_pd(svec,svec,0x1);
145 xvec = _mm_add_pd(xvec,zvec);
// Cross term with the opposite sign pattern: -om_im*s1 in lane 0,
// +om_im*s0 in lane 1.
147 tmpv1 = _mm_mul_pd(mom_im_vec, tmpv2);
148 xvec = _mm_add_pd(xvec,tmpv1);
// Write the updated x back in place.
150 _mm_store_pd(&
x[
count],xvec);
// Diagnostic printed through QDPIO::cout. The condition guarding it is not
// visible here -- presumably taken when len % 2 != 0; confirm.
155 QDPIO::cout <<
"ord_ib_rxupdate_kernel_sse.h: len not divisible by 2" << std::endl;
// Bare signatures with no body and no terminating ';' in this view.
// NOTE(review): these look like cross-reference residue from the listing rather
// than real declarations; presumably they are the headers of the two kernel
// bodies whose fragments appear earlier in this listing -- confirm against the real file.
// Double-precision kernel: takes a [lo, hi) range, a caller id, and a pointer to
// the REAL64 argument struct.
void ord_ib_rxupdate_kernel_real64(int lo, int hi, int my_id, ib_rxupdate_arg< REAL64 > *a)
// Single-precision kernel: same parameter list with the REAL32 argument struct.
void ord_ib_rxupdate_kernel_real32(int lo, int hi, int my_id, ib_rxupdate_arg< REAL32 > *a)
// Declares `r` as a multi1d<LatticeFermion> constructed with Ncb.
// NOTE(review): its relation to the kernels is not visible here -- TODO confirm
// where this declaration belongs.
multi1d< LatticeFermion > r(Ncb)