// Single-precision (REAL32 / __m128) fragment of the z/v update kernel.
// NOTE(review): this listing is incomplete -- the enclosing function header,
// the loop that advances `count`, the branch guarding the diagnostic at the
// end, and the declarations of `low`, `atom`, `ztmp` and `vtmp` are not
// visible here. Comments below describe only the statements that are shown.

// Number of REAL32 elements covered by [lo, hi), scaled by `atom`.
10 int len = atom*(hi - lo);
// Raw pointers into the five arrays held by the argument struct, each offset
// by `low`. z and v are both read and written below; r, u and q are only read.
12 REAL32*
r = &(
a->r_ptr[low]);
13 REAL32*
z = &(
a->z_ptr[low]);
14 REAL32* v = &(
a->v_ptr[low]);
15 REAL32*
u = &(
a->u_ptr[low]);
16 REAL32*
q = &(
a->q_ptr[low]);
// Local copies of the scalar coefficients, stored as separate _re/_im parts.
18 REAL32 a_re =
a->alpha_re;
19 REAL32 a_im =
a->alpha_im;
21 REAL32 arb_re =
a->alpha_rat_beta_re;
22 REAL32 arb_im =
a->alpha_rat_beta_im;
24 REAL32 ad_re =
a->alpha_delta_re;
25 REAL32 ad_im =
a->alpha_delta_im;
27 REAL32 b_re =
a->beta_re;
28 REAL32 b_im =
a->beta_im;
30 REAL32 d_re =
a->delta_re;
31 REAL32 d_im =
a->delta_im;
38 __m128 zvec, rvec, vvec, uvec, qvec, tmpshuf1, tmpshuf2, tmpshuf3,tmpshuf4;
// Coefficient vectors. The *_re_vec vectors hold the same value in all four
// lanes. _mm_set_ps takes its arguments from the highest lane to the lowest,
// so the *_im_vec vectors hold (-im, +im, -im, +im) in lanes 0..3.
// Presumably the data arrays are interleaved (re, im) pairs: with that layout
// x*re_vec + swap(x)*im_vec is a complex multiply by (re + i*im) -- TODO confirm.
39 const __m128 arb_re_vec = _mm_set_ps(arb_re,arb_re,arb_re,arb_re);
40 const __m128 arb_im_vec = _mm_set_ps(arb_im,-arb_im,arb_im,-arb_im);
41 const __m128 a_re_vec = _mm_set_ps(a_re,a_re,a_re,a_re);
42 const __m128 a_im_vec = _mm_set_ps(a_im,-a_im,a_im,-a_im);
43 const __m128 ad_re_vec = _mm_set_ps(ad_re,ad_re,ad_re,ad_re);
44 const __m128 ad_im_vec = _mm_set_ps(ad_im,-ad_im,ad_im,-ad_im);
45 const __m128 b_re_vec = _mm_set_ps(b_re,b_re,b_re,b_re);
46 const __m128 b_im_vec = _mm_set_ps(b_im,-b_im,b_im,-b_im);
47 const __m128 d_re_vec = _mm_set_ps(d_re,d_re,d_re,d_re);
48 const __m128 d_im_vec = _mm_set_ps(d_im,-d_im,d_im,-d_im);
// Load four floats from each array at offset `count`. _mm_load_ps is the
// aligned load, so each address must be 16-byte aligned.
54 ztmp = _mm_load_ps(&
z[
count]);
57 rvec = _mm_load_ps(&
r[
count]);
60 vtmp = _mm_load_ps(&v[
count]);
63 uvec = _mm_load_ps(&
u[
count]);
66 qvec = _mm_load_ps(&
q[
count]);
// Shuffle mask 0xb1 swaps the two floats within each adjacent pair:
// lanes (0,1,2,3) -> (1,0,3,2). No swapped copy of uvec is made because u is
// only added, never multiplied by a coefficient.
69 tmpshuf1 = _mm_shuffle_ps(ztmp,ztmp, 0xb1);
72 tmpshuf2 = _mm_shuffle_ps(rvec,rvec, 0xb1);
75 tmpshuf3 = _mm_shuffle_ps(vtmp,vtmp, 0xb1);
78 tmpshuf4 = _mm_shuffle_ps(qvec,qvec, 0xb1);
// New z = arb*z + a*r - ad*v, each product formed as
// re_vec*x + im_vec*swap(x). The v used here is vtmp, the value loaded above.
94 zvec = _mm_mul_ps(arb_re_vec,ztmp);
95 zvec = _mm_add_ps(zvec, _mm_mul_ps(arb_im_vec,tmpshuf1));
106 zvec = _mm_add_ps(zvec, _mm_mul_ps(a_re_vec,rvec));
116 zvec = _mm_add_ps(zvec, _mm_mul_ps(a_im_vec,tmpshuf2));
126 zvec = _mm_sub_ps(zvec, _mm_mul_ps(ad_re_vec, vtmp));
134 zvec = _mm_sub_ps(zvec, _mm_mul_ps(ad_im_vec, tmpshuf3));
// Aligned store of the updated z back over the elements that were loaded.
136 _mm_store_ps(&
z[
count],zvec);
// New v = u + b*v - d*q, built from the same vtmp/tmpshuf3 that fed the z
// update, so both results depend on the old v.
145 vvec = _mm_add_ps(uvec, _mm_mul_ps(b_re_vec,vtmp));
146 vvec = _mm_add_ps(vvec, _mm_mul_ps(b_im_vec,tmpshuf3));
155 vvec = _mm_sub_ps(vvec, _mm_mul_ps(d_re_vec,qvec));
162 vvec = _mm_sub_ps(vvec, _mm_mul_ps(d_im_vec,tmpshuf4));
// Aligned store of the updated v.
164 _mm_store_ps(&v[
count],vvec);
// Diagnostic for a length that is not a multiple of 4 (one __m128 holds four
// floats). NOTE(review): the condition selecting this path is not visible in
// this listing.
170 QDPIO::cout <<
"ord_ib_zvupdates_sse.h: len not divisible by 4" << std::endl;
// Double-precision (REAL64 / __m128d) fragment of the z/v update kernel.
// NOTE(review): this listing is incomplete -- the enclosing function header,
// the loop that advances `count`, the branch guarding the diagnostic at the
// end, and the declarations of `low`, `atom`, `ztmp` and `vtmp` are not
// visible here. Comments below describe only the statements that are shown.

// Number of REAL64 elements covered by [lo, hi), scaled by `atom`.
187 int len = atom*(hi - lo);
// Raw pointers into the five arrays held by the argument struct, each offset
// by `low`. z and v are both read and written below; r, u and q are only read.
189 REAL64*
r = &(
a->r_ptr[low]);
190 REAL64*
z = &(
a->z_ptr[low]);
191 REAL64* v = &(
a->v_ptr[low]);
192 REAL64*
u = &(
a->u_ptr[low]);
193 REAL64*
q = &(
a->q_ptr[low]);
// Local copies of the scalar coefficients, stored as separate _re/_im parts.
195 REAL64 a_re =
a->alpha_re;
196 REAL64 a_im =
a->alpha_im;
198 REAL64 arb_re =
a->alpha_rat_beta_re;
199 REAL64 arb_im =
a->alpha_rat_beta_im;
201 REAL64 ad_re =
a->alpha_delta_re;
202 REAL64 ad_im =
a->alpha_delta_im;
204 REAL64 b_re =
a->beta_re;
205 REAL64 b_im =
a->beta_im;
207 REAL64 d_re =
a->delta_re;
208 REAL64 d_im =
a->delta_im;
213 __m128d zvec, rvec, vvec, uvec, qvec, tmpshuf1, tmpshuf2, tmpshuf3,tmpshuf4;
// Coefficient vectors. The *_re_vec vectors hold the same value in both lanes.
// _mm_set_pd takes (high, low), so the *_im_vec vectors hold (-im, +im) in
// lanes 0..1.
// Presumably each __m128d holds one (re, im) pair: with that layout
// x*re_vec + swap(x)*im_vec is a complex multiply by (re + i*im) -- TODO confirm.
214 const __m128d arb_re_vec = _mm_set_pd(arb_re,arb_re);
215 const __m128d arb_im_vec = _mm_set_pd(arb_im,-arb_im);
216 const __m128d a_re_vec = _mm_set_pd(a_re,a_re);
217 const __m128d a_im_vec = _mm_set_pd(a_im,-a_im);
218 const __m128d ad_re_vec = _mm_set_pd(ad_re,ad_re);
219 const __m128d ad_im_vec = _mm_set_pd(ad_im,-ad_im);
220 const __m128d b_re_vec = _mm_set_pd(b_re,b_re);
221 const __m128d b_im_vec = _mm_set_pd(b_im,-b_im);
222 const __m128d d_re_vec = _mm_set_pd(d_re,d_re);
223 const __m128d d_im_vec = _mm_set_pd(d_im,-d_im);
// Load two doubles from each array at offset `count`. _mm_load_pd is the
// aligned load, so each address must be 16-byte aligned.
228 ztmp = _mm_load_pd(&
z[
count]);
229 vtmp = _mm_load_pd(&v[
count]);
230 rvec = _mm_load_pd(&
r[
count]);
231 uvec = _mm_load_pd(&
u[
count]);
232 qvec = _mm_load_pd(&
q[
count]);
// Shuffle mask 0x1 with the same vector as both operands swaps its two lanes.
// No swapped copy of uvec is made because u is only added, never multiplied
// by a coefficient.
235 tmpshuf1= _mm_shuffle_pd(ztmp,ztmp,0x1);
236 tmpshuf2 = _mm_shuffle_pd(rvec,rvec,0x1);
237 tmpshuf3 = _mm_shuffle_pd(vtmp,vtmp,0x1);
238 tmpshuf4 = _mm_shuffle_pd(qvec,qvec,0x1);
// New z = arb*z + a*r - ad*v, each product formed as
// re_vec*x + im_vec*swap(x). The v used here is vtmp, the value loaded above.
249 zvec = _mm_mul_pd(arb_re_vec, ztmp);
250 zvec = _mm_add_pd(zvec, _mm_mul_pd(arb_im_vec, tmpshuf1));
264 zvec = _mm_add_pd(zvec,_mm_mul_pd(a_re_vec,rvec));
265 zvec = _mm_add_pd(zvec,_mm_mul_pd(a_im_vec,tmpshuf2));
275 zvec = _mm_sub_pd(zvec, _mm_mul_pd(ad_re_vec,vtmp));
276 zvec = _mm_sub_pd(zvec, _mm_mul_pd(ad_im_vec, tmpshuf3));
// Aligned store of the updated z back over the elements that were loaded.
277 _mm_store_pd(&
z[
count], zvec);
// New v = u + b*v - d*q, built from the same vtmp/tmpshuf3 that fed the z
// update, so both results depend on the old v.
288 vvec = _mm_add_pd( uvec, _mm_mul_pd(b_re_vec,vtmp));
289 vvec = _mm_add_pd( vvec, _mm_mul_pd(b_im_vec,tmpshuf3));
295 vvec = _mm_sub_pd( vvec, _mm_mul_pd(d_re_vec,qvec));
301 vvec = _mm_sub_pd( vvec, _mm_mul_pd(d_im_vec, tmpshuf4));
// Aligned store of the updated v.
302 _mm_store_pd(&v[
count],vvec);
// Diagnostic for a length that is not a multiple of 2 (one __m128d holds two
// doubles). NOTE(review): the condition selecting this path is not visible in
// this listing.
306 QDPIO::cout <<
"ord_ib_zvupdates_sse.h: len not divisible by 2" << std::endl;
static multi1d< LatticeColorMatrix > u
void ord_ib_zvupdates_kernel_real32(int lo, int hi, int my_id, ib_zvupdates_arg< REAL32 > *a)
void ord_ib_zvupdates_kernel_real64(int lo, int hi, int my_id, ib_zvupdates_arg< REAL64 > *a)
multi1d< LatticeFermion > r(Ncb)