// Single-precision (REAL32) kernel fragment: working pointers and SIMD constants.
// NOTE(review): the leading integers are line numbers carried over from the original
// file, and several original lines are not shown here (the enclosing signature, the
// loop, and the definitions of low, atom, a_r, a_i and count) -- confirm against the
// full header before relying on these notes.
// Number of scalar elements handled by this call.
11 int len = atom*(hi-lo);
// Base pointers into the argument block's arrays, each starting at index `low`.
// r, u, v, q, r0 and f0 are only read in the visible code; s and t are written.
13 REAL32*
r = &(
a->r[low]);
14 REAL32*
u = &(
a->u[low]);
15 REAL32* v = &(
a->v[low]);
16 REAL32*
q = &(
a->q[low]);
17 REAL32*
r0 = &(
a->r0[low]);
18 REAL32* f0 = &(
a->f0[low]);
19 REAL32*
s = &(
a->s[low]);
20 REAL32*
t = &(
a->t[low]);
// Accumulators are kept in double precision; this caller's slice of norm_space
// starts at 12*my_id and the visible code uses its elements [0] through [11].
21 REAL64* norm_array = &(
a->norm_space[12*my_id]);
25 __m128 svec, rvec, vvec, tmpshuf1;
26 __m128 qvec, uvec, tvec, tmpshuf2;
// a_r broadcast to all four lanes.
27 __m128 ar_vec = _mm_set_ps(a_r,a_r,a_r,a_r);
// _mm_set_ps takes its arguments highest lane first, so the lanes are
// (-a_i, +a_i, -a_i, +a_i) from lane 0 upward.
28 __m128 ai_vec = _mm_set_ps(a_i,-a_i,a_i,-a_i);
// _mm_set_pd also lists the high lane first: lane 0 = +1, lane 1 = -1.
38 __m128d mask = _mm_set_pd((
double)-1,(
double)1);
// Compute s and t for four consecutive floats starting at `count`.
// NOTE(review): `count` is presumably the loop index of a loop whose header is not
// visible here. _mm_load_ps/_mm_store_ps are the aligned forms, so every address
// used below must be 16-byte aligned.
44 vvec = _mm_load_ps(&v[
count]);
45 qvec = _mm_load_ps(&
q[
count]);
47 rvec = _mm_load_ps(&
r[
count]);
48 uvec = _mm_load_ps(&
u[
count]);
// 0xb1 swaps the two floats inside each adjacent pair: (x0,x1,x2,x3) -> (x1,x0,x3,x2).
49 tmpshuf1 = _mm_shuffle_ps(vvec,vvec,0xb1);
50 tmpshuf2 = _mm_shuffle_ps(qvec, qvec,0xb1);
// With ai_vec = (-a_i,+a_i,...) the next two statements give, per pair of lanes:
//   s[even] = r[even] - a_r*v[even] + a_i*v[odd]
//   s[odd]  = r[odd]  - a_r*v[odd]  - a_i*v[even]
// which is s = r - (a_r + i*a_i)*v when each adjacent pair is read as (re, im).
59 svec = _mm_sub_ps(rvec, _mm_mul_ps(ar_vec,vvec));
67 svec = _mm_sub_ps(svec, _mm_mul_ps(ai_vec, tmpshuf1));
// Same update with u and q: t = u - (a_r + i*a_i)*q under the same pair layout.
70 tvec = _mm_sub_ps(uvec, _mm_mul_ps(ar_vec,qvec));
71 tvec = _mm_sub_ps(tvec, _mm_mul_ps(ai_vec,tmpshuf2));
73 _mm_store_ps(&
s[
count],svec);
74 _mm_store_ps(&
t[
count],tvec);
// Widen to double for the accumulations that follow. _mm_cvtps_pd converts the two
// low floats; the 0x4e shuffle swaps the low and high halves so the second
// conversion picks up the other two. svec, qvec and tvec are left half-swapped.
77 slo = _mm_cvtps_pd(svec);
78 svec = _mm_shuffle_ps(svec,svec,0x4e);
79 shi = _mm_cvtps_pd(svec);
81 qlo = _mm_cvtps_pd(qvec);
82 qvec = _mm_shuffle_ps(qvec,qvec,0x4e);
83 qhi = _mm_cvtps_pd(qvec);
85 tlo = _mm_cvtps_pd(tvec);
86 tvec = _mm_shuffle_ps(tvec,tvec,0x4e);
87 thi = _mm_cvtps_pd(tvec);
// Accumulate into norm_array[0..1] using r0 and the freshly computed s.
// Pattern used throughout: with x = (x_a, x_b) and s = (s_a, s_b) in a __m128d,
//   lane 0 += x_a*s_a + x_b*s_b
//   lane 1 += x_a*s_b - x_b*s_a
// i.e. conj(x)*s when (a, b) is read as (re, im).
105 lvec = _mm_load_ps(&
r0[
count]);
// The running total is reloaded from memory and written back for each accumulation.
108 dotprod = _mm_load_pd(&norm_array[0]);
// Widen the four r0 floats to two __m128d values (low pair, then high pair).
111 r0lo = _mm_cvtps_pd(lvec);
112 lvec = _mm_shuffle_ps(lvec,lvec,0x4e);
113 r0hi = _mm_cvtps_pd(lvec);
// Low pair: t1 = (s_a, s_a); mask flips the sign in lane 1 -> (x_a*s_a, -x_b*s_a).
126 t1 = _mm_shuffle_pd(slo,slo,0x0);
127 dotprod = _mm_add_pd(dotprod, _mm_mul_pd(r0lo, _mm_mul_pd(mask,t1)));
// t1 = (s_b, s_b); t2 = r0 with lanes swapped -> (x_b*s_b, x_a*s_b).
131 t1 = _mm_shuffle_pd(slo,slo,0x3);
132 t2 = _mm_shuffle_pd(r0lo,r0lo,0x1);
133 dotprod = _mm_add_pd(dotprod, _mm_mul_pd(t2,t1));
// High pair: identical steps on shi / r0hi.
137 t1 = _mm_shuffle_pd(shi,shi,0x0);
138 dotprod = _mm_add_pd(dotprod, _mm_mul_pd(r0hi, _mm_mul_pd(mask,t1)));
142 t1 = _mm_shuffle_pd(shi,shi,0x3);
143 t2 = _mm_shuffle_pd(r0hi,r0hi,0x1);
144 dotprod = _mm_add_pd(dotprod, _mm_mul_pd(t2,t1));
145 _mm_store_pd(&norm_array[0],dotprod);
// Same accumulation with f0 in place of r0, into norm_array[2..3].
168 lvec = _mm_load_ps(&f0[
count]);
171 dotprod = _mm_load_pd(&norm_array[2]);
174 f0lo = _mm_cvtps_pd(lvec);
175 lvec = _mm_shuffle_ps(lvec,lvec,0x4e);
176 f0hi = _mm_cvtps_pd(lvec);
189 t1 = _mm_shuffle_pd(slo,slo,0x0);
190 dotprod = _mm_add_pd(dotprod, _mm_mul_pd(f0lo, _mm_mul_pd(mask,t1)));
194 t1 = _mm_shuffle_pd(slo,slo,0x3);
195 t2 = _mm_shuffle_pd(f0lo,f0lo,0x1);
196 dotprod = _mm_add_pd(dotprod, _mm_mul_pd(t2,t1));
200 t1 = _mm_shuffle_pd(shi,shi,0x0);
201 dotprod = _mm_add_pd(dotprod, _mm_mul_pd(f0hi, _mm_mul_pd(mask,t1)));
205 t1 = _mm_shuffle_pd(shi,shi,0x3);
206 t2 = _mm_shuffle_pd(f0hi,f0hi,0x1);
207 dotprod = _mm_add_pd(dotprod, _mm_mul_pd(t2,t1));
208 _mm_store_pd(&norm_array[2],dotprod);
// Three more accumulations of the form
//   lane 0 += x_a*y_a + x_b*y_b,  lane 1 += x_a*y_b - x_b*y_a
// (conj(x)*y when each pair is read as (re, im)), reusing the widened values
// r0lo/r0hi, f0lo/f0hi, qlo/qhi, tlo/thi and slo/shi computed earlier.
// norm_array[4..5]: x = r0, y = q.
228 dotprod = _mm_load_pd(&norm_array[4]);
234 t1 = _mm_shuffle_pd(qlo,qlo,0x0);
235 dotprod = _mm_add_pd(dotprod, _mm_mul_pd(r0lo, _mm_mul_pd(mask,t1)));
239 t1 = _mm_shuffle_pd(qlo,qlo,0x3);
240 t2 = _mm_shuffle_pd(r0lo,r0lo,0x1);
241 dotprod = _mm_add_pd(dotprod, _mm_mul_pd(t2,t1));
245 t1 = _mm_shuffle_pd(qhi,qhi,0x0);
246 dotprod = _mm_add_pd(dotprod, _mm_mul_pd(r0hi, _mm_mul_pd(mask,t1)));
250 t1 = _mm_shuffle_pd(qhi,qhi,0x3);
251 t2 = _mm_shuffle_pd(r0hi,r0hi,0x1);
252 dotprod = _mm_add_pd(dotprod, _mm_mul_pd(t2,t1));
253 _mm_store_pd(&norm_array[4],dotprod);
// norm_array[6..7]: x = f0, y = t.
274 dotprod = _mm_load_pd(&norm_array[6]);
280 t1 = _mm_shuffle_pd(tlo,tlo,0x0);
281 dotprod = _mm_add_pd(dotprod, _mm_mul_pd(f0lo, _mm_mul_pd(mask,t1)));
285 t1 = _mm_shuffle_pd(tlo,tlo,0x3);
286 t2 = _mm_shuffle_pd(f0lo,f0lo,0x1);
287 dotprod = _mm_add_pd(dotprod, _mm_mul_pd(t2,t1));
291 t1 = _mm_shuffle_pd(thi,thi,0x0);
292 dotprod = _mm_add_pd(dotprod, _mm_mul_pd(f0hi, _mm_mul_pd(mask,t1)));
296 t1 = _mm_shuffle_pd(thi,thi,0x3);
297 t2 = _mm_shuffle_pd(f0hi,f0hi,0x1);
298 dotprod = _mm_add_pd(dotprod, _mm_mul_pd(t2,t1));
299 _mm_store_pd(&norm_array[6],dotprod);
// norm_array[8..9]: x = t, y = s.
318 dotprod = _mm_load_pd(&norm_array[8]);
324 t1 = _mm_shuffle_pd(slo,slo,0x0);
325 dotprod = _mm_add_pd(dotprod, _mm_mul_pd(tlo, _mm_mul_pd(mask,t1)));
329 t1 = _mm_shuffle_pd(slo,slo,0x3);
330 t2 = _mm_shuffle_pd(tlo,tlo,0x1);
331 dotprod = _mm_add_pd(dotprod, _mm_mul_pd(t2,t1));
335 t1 = _mm_shuffle_pd(shi,shi,0x0);
336 dotprod = _mm_add_pd(dotprod, _mm_mul_pd(thi, _mm_mul_pd(mask,t1)));
340 t1 = _mm_shuffle_pd(shi,shi,0x3);
341 t2 = _mm_shuffle_pd(thi,thi,0x1);
342 dotprod = _mm_add_pd(dotprod, _mm_mul_pd(t2,t1));
343 _mm_store_pd(&norm_array[8],dotprod);
// norm_array[10..11]: sums of squares. Lane 0 accumulates the squares of the t
// elements and lane 1 the squares of the r elements.
359 dotprod = _mm_load_pd(&norm_array[10]);
// r0lo/r0hi and f0lo/f0hi are reused as scratch from here on: r0lo/r0hi now hold
// the widened r values (not r0), and f0lo/f0hi hold mixed t/r lanes.
361 r0lo = _mm_cvtps_pd(rvec);
362 rvec = _mm_shuffle_ps(rvec,rvec,0x4e);
363 r0hi = _mm_cvtps_pd(rvec);
// Low pair: f0lo = (t_a, r_a), f0hi = (t_b, r_b).
366 f0lo = _mm_shuffle_pd(tlo,r0lo, 0x0);
369 f0hi = _mm_shuffle_pd(tlo,r0lo, 0x3);
373 dotprod = _mm_add_pd(dotprod,_mm_mul_pd(f0lo,f0lo));
374 dotprod = _mm_add_pd(dotprod,_mm_mul_pd(f0hi,f0hi));
// High pair: same layout built from thi and the high half of r.
377 f0lo = _mm_shuffle_pd(thi,r0hi, 0x0);
380 f0hi = _mm_shuffle_pd(thi,r0hi, 0x3);
384 dotprod = _mm_add_pd(dotprod,_mm_mul_pd(f0lo,f0lo));
385 dotprod = _mm_add_pd(dotprod,_mm_mul_pd(f0hi,f0hi));
387 _mm_store_pd(&norm_array[10],dotprod);
// Diagnostic for a length the four-wide path cannot handle.
// NOTE(review): the guarding condition is not visible in this fragment; presumably
// this is the branch taken when len is not a multiple of 4 -- confirm.
392 QDPIO::cout <<
"ord_ib_stupdates_kernel_sse.h: len not divisible by 4" << std::endl;
// Double-precision (REAL64) kernel fragment: working pointers and SIMD constants.
// NOTE(review): the leading integers are line numbers carried over from the original
// file, and several original lines are not shown here (the enclosing signature, the
// loop, and the definitions of low, atom, a_r, a_i and count) -- confirm against the
// full header before relying on these notes.
// Number of scalar elements handled by this call.
404 int len = atom*(hi-lo);
// Base pointers into the argument block's arrays, each starting at index `low`.
// r, u, v, q and f0 are read in the visible code; s and t are written.
406 REAL64*
r = &(
a->r[low]);
407 REAL64*
u = &(
a->u[low]);
408 REAL64* v = &(
a->v[low]);
409 REAL64*
q = &(
a->q[low]);
410 REAL64*
r0 = &(
a->r0[low]);
411 REAL64* f0 = &(
a->f0[low]);
412 REAL64*
s = &(
a->s[low]);
413 REAL64*
t = &(
a->t[low]);
// This caller's slice of the accumulator array starts at 12*my_id; the visible
// code uses its elements [0] through [11].
414 REAL64* norm_array = &(
a->norm_space[12*my_id]);
416 __m128d svec, rvec, vvec;
417 __m128d qvec, uvec, tvec;
// _mm_set_pd lists the high lane first: ar_vec = (a_r, a_r), ai_vec = (-a_i, +a_i)
// from lane 0 upward.
418 __m128d ar_vec = _mm_set_pd(a_r,a_r);
419 __m128d ai_vec = _mm_set_pd(a_i,-a_i);
// mask: lane 0 = +1, lane 1 = -1.
427 __m128d mask = _mm_set_pd((
double)-1,(
double)1);
// Compute s and t for the two doubles starting at `count`.
// NOTE(review): `count` is presumably the loop index of a loop whose header is not
// visible here. _mm_load_pd/_mm_store_pd are the aligned forms, so every address
// used below must be 16-byte aligned.
434 rvec = _mm_load_pd(&
r[
count]);
435 vvec = _mm_load_pd(&v[
count]);
// 0x1 swaps the two lanes: t1 = (v[1], v[0]).
436 t1 = _mm_shuffle_pd(vvec,vvec,0x1);
438 uvec = _mm_load_pd(&
u[
count]);
439 qvec = _mm_load_pd(&
q[
count]);
440 t2 = _mm_shuffle_pd(qvec,qvec,0x1);
// With ai_vec = (-a_i, +a_i):
//   s[0] = r[0] - a_r*v[0] + a_i*v[1]
//   s[1] = r[1] - a_r*v[1] - a_i*v[0]
// which is s = r - (a_r + i*a_i)*v when the pair is read as (re, im).
442 svec = _mm_sub_pd(rvec, _mm_mul_pd(ar_vec, vvec));
444 svec = _mm_sub_pd(svec, _mm_mul_pd(ai_vec, t1));
// Same update with u and q: t = u - (a_r + i*a_i)*q.
447 tvec = _mm_sub_pd(uvec, _mm_mul_pd(ar_vec, qvec));
// The f0 load sits between the two halves of the t computation; it does not depend
// on them and is consumed by the later accumulations.
448 f0v = _mm_load_pd(&f0[
count]);
449 tvec = _mm_sub_pd(tvec, _mm_mul_pd(ai_vec, t2));
451 _mm_store_pd(&
s[
count],svec);
452 _mm_store_pd(&
t[
count],tvec);
// Six accumulations into norm_array. The first five share one pattern: with
// x = (x_a, x_b) and y = (y_a, y_b),
//   lane 0 += x_a*y_a + x_b*y_b
//   lane 1 += x_a*y_b - x_b*y_a
// i.e. conj(x)*y when (a, b) is read as (re, im). Each running total is reloaded
// from memory and written back.
// NOTE(review): r0v is used below but is not assigned in the visible lines;
// presumably it is loaded from r0[count] in a line not shown -- confirm.
// norm_array[0..1]: x = r0v, y = s.
466 dotprod = _mm_load_pd(&norm_array[0]);
// t1 = (s_a, s_a); mask flips the sign in lane 1 -> (x_a*s_a, -x_b*s_a).
472 t1 = _mm_shuffle_pd(svec,svec,0x0);
473 dotprod = _mm_add_pd(dotprod, _mm_mul_pd(r0v, _mm_mul_pd(mask,t1)));
// t1 = (s_b, s_b); t2 = x with lanes swapped -> (x_b*s_b, x_a*s_b).
477 t1 = _mm_shuffle_pd(svec,svec,0x3);
478 t2 = _mm_shuffle_pd(r0v,r0v,0x1);
479 dotprod = _mm_add_pd(dotprod, _mm_mul_pd(t2,t1));
481 _mm_store_pd(&norm_array[0],dotprod);
// norm_array[2..3]: x = f0v, y = s.
493 dotprod = _mm_load_pd(&norm_array[2]);
499 t1 = _mm_shuffle_pd(svec,svec,0x0);
500 dotprod = _mm_add_pd(dotprod, _mm_mul_pd(f0v, _mm_mul_pd(mask,t1)));
504 t1 = _mm_shuffle_pd(svec,svec,0x3);
505 t2 = _mm_shuffle_pd(f0v,f0v,0x1);
506 dotprod = _mm_add_pd(dotprod, _mm_mul_pd(t2,t1));
508 _mm_store_pd(&norm_array[2],dotprod);
// norm_array[4..5]: x = r0v, y = q.
521 dotprod = _mm_load_pd(&norm_array[4]);
527 t1 = _mm_shuffle_pd(qvec,qvec,0x0);
528 dotprod = _mm_add_pd(dotprod, _mm_mul_pd(r0v, _mm_mul_pd(mask,t1)));
532 t1 = _mm_shuffle_pd(qvec,qvec,0x3);
533 t2 = _mm_shuffle_pd(r0v,r0v,0x1);
534 dotprod = _mm_add_pd(dotprod, _mm_mul_pd(t2,t1));
536 _mm_store_pd(&norm_array[4],dotprod);
// norm_array[6..7]: x = f0v, y = t.
550 dotprod = _mm_load_pd(&norm_array[6]);
556 t1 = _mm_shuffle_pd(tvec,tvec,0x0);
557 dotprod = _mm_add_pd(dotprod, _mm_mul_pd(f0v, _mm_mul_pd(mask,t1)));
561 t1 = _mm_shuffle_pd(tvec,tvec,0x3);
562 t2 = _mm_shuffle_pd(f0v,f0v,0x1);
563 dotprod = _mm_add_pd(dotprod, _mm_mul_pd(t2,t1));
565 _mm_store_pd(&norm_array[6],dotprod);
// norm_array[8..9]: x = t, y = s.
577 dotprod = _mm_load_pd(&norm_array[8]);
583 t1 = _mm_shuffle_pd(svec,svec,0x0);
584 dotprod = _mm_add_pd(dotprod, _mm_mul_pd(tvec, _mm_mul_pd(mask,t1)));
588 t1 = _mm_shuffle_pd(svec,svec,0x3);
589 t2 = _mm_shuffle_pd(tvec,tvec,0x1);
590 dotprod = _mm_add_pd(dotprod, _mm_mul_pd(t2,t1));
592 _mm_store_pd(&norm_array[8],dotprod);
// norm_array[10..11]: sums of squares. f0v and r0v are reused as scratch:
// f0v = (t_a, r_a), r0v = (t_b, r_b), so lane 0 gains t_a^2 + t_b^2 and lane 1
// gains r_a^2 + r_b^2.
604 dotprod = _mm_load_pd(&norm_array[10]);
607 f0v = _mm_shuffle_pd(tvec,rvec, 0x0);
610 r0v = _mm_shuffle_pd(tvec,rvec, 0x3);
614 dotprod = _mm_add_pd(dotprod,_mm_mul_pd(f0v,f0v));
615 dotprod = _mm_add_pd(dotprod,_mm_mul_pd(r0v,r0v));
618 _mm_store_pd(&norm_array[10],dotprod);
// Diagnostic for a length the two-wide double-precision path cannot handle.
// The header name in the message is spelled "stupdates" to match the kernel names
// (ord_ib_stupdates_kernel_*); the earlier "stubdates" spelling made the message
// impossible to grep back to this file.
// NOTE(review): the guarding condition is not visible in this fragment; presumably
// this is the branch taken when len is not a multiple of 2 -- confirm.
627 QDPIO::cout <<
"ord_ib_stupdates_kernel_sse.h: len not divisible by 2" << std::endl;
static multi1d< LatticeColorMatrix > u
void ord_ib_stupdates_kernel_real64(int lo, int hi, int my_id, ib_stupdate_arg< REAL64 > *a)
void ord_ib_stupdates_kernel_real32(int lo, int hi, int my_id, ib_stupdate_arg< REAL32 > *a)
multi1d< LatticeFermion > r(Ncb)