CHROMA
ord_ib_stupdates_kernel_sse.h
#include <xmmintrin.h>
#include <emmintrin.h>

inline
void ord_ib_stupdates_kernel_real32(int lo, int hi, int my_id, ib_stupdate_arg<REAL32>* a)
{
  REAL32 a_r = a->a_r;
  REAL32 a_i = a->a_i;
  int atom = a->atom;
  int low = atom*lo;
  int len = atom*(hi-lo);

  REAL32* r  = &(a->r[low]);
  REAL32* u  = &(a->u[low]);
  REAL32* v  = &(a->v[low]);
  REAL32* q  = &(a->q[low]);
  REAL32* r0 = &(a->r0[low]);
  REAL32* f0 = &(a->f0[low]);
  REAL32* s  = &(a->s[low]);
  REAL32* t  = &(a->t[low]);
  REAL64* norm_array = &(a->norm_space[12*my_id]);

  // Caller zeroed norm_space

  __m128 svec, rvec, vvec, tmpshuf1;
  __m128 qvec, uvec, tvec, tmpshuf2;
  __m128 ar_vec = _mm_set_ps(a_r,a_r,a_r,a_r);
  __m128 ai_vec = _mm_set_ps(a_i,-a_i,a_i,-a_i);
  __m128 lvec;

  __m128d dotprod;
  __m128d llo,lhi;
  __m128d slo,shi;
  __m128d tlo,thi;
  __m128d qlo,qhi;
  __m128d r0lo,r0hi;
  __m128d f0lo,f0hi;
  __m128d mask = _mm_set_pd((double)-1,(double)1);
  __m128d t1,t2;

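  /* Data layout: consecutive pairs of REAL32 values form one complex number
     (re,im), so each 4-wide SSE register holds two complex numbers.
     ar_vec and ai_vec implement s = r - alpha*v for alpha = a_r + i*a_i:
     the 0xb1 shuffle swaps re/im within each pair, and the sign pattern in
     ai_vec (-a_i in the re lanes, +a_i in the im lanes) supplies the cross
     terms   s_re = r_re - a_r*v_re + a_i*v_im
             s_im = r_im - a_r*v_im - a_i*v_re.
     norm_array holds this thread's 12 double-precision partial sums: the
     (re,im) parts of (r0,s), (f0,s), (r0,q), (f0,t), (t,s) followed by
     ||t||^2 and ||r||^2.  mask (+1 in the low lane, -1 in the high lane)
     lets the same multiply-add sequence accumulate the left-conjugated
     inner products below. */
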
  if( len % 4 == 0 ) {
    for(int count = 0; count < len; count+=4) {

      vvec = _mm_load_ps(&v[count]);
      qvec = _mm_load_ps(&q[count]);

      rvec = _mm_load_ps(&r[count]);
      uvec = _mm_load_ps(&u[count]);
      tmpshuf1 = _mm_shuffle_ps(vvec,vvec,0xb1);
      tmpshuf2 = _mm_shuffle_ps(qvec,qvec,0xb1);

      // First need s = r - alpha*v
      /*
        s[count  ] = r[count  ] - a_r*v[count  ];
        s[count+1] = r[count+1] - a_r*v[count+1];
        s[count+2] = r[count+2] - a_r*v[count+2];
        s[count+3] = r[count+3] - a_r*v[count+3];
      */
      svec = _mm_sub_ps(rvec, _mm_mul_ps(ar_vec,vvec));

      /*
        s[count  ] += a_i*v[count+1];
        s[count+1] -= a_i*v[count];
        s[count+2] += a_i*v[count+3];
        s[count+3] -= a_i*v[count+2];
      */
      svec = _mm_sub_ps(svec, _mm_mul_ps(ai_vec, tmpshuf1));

      // Second want t = u - alpha*q
      tvec = _mm_sub_ps(uvec, _mm_mul_ps(ar_vec,qvec));
      tvec = _mm_sub_ps(tvec, _mm_mul_ps(ai_vec,tmpshuf2));

      _mm_store_ps(&s[count],svec);
      _mm_store_ps(&t[count],tvec);

      slo = _mm_cvtps_pd(svec);
      svec = _mm_shuffle_ps(svec,svec,0x4e);
      shi = _mm_cvtps_pd(svec);

      qlo = _mm_cvtps_pd(qvec);
      qvec = _mm_shuffle_ps(qvec,qvec,0x4e);
      qhi = _mm_cvtps_pd(qvec);

      tlo = _mm_cvtps_pd(tvec);
      tvec = _mm_shuffle_ps(tvec,tvec,0x4e);
      thi = _mm_cvtps_pd(tvec);

#if 0
      // ** phi=(r0,s)
      norm_array[0] += r0[count]*s[count];
      norm_array[0] += r0[count+1]*s[count+1];
      norm_array[0] += r0[count+2]*s[count+2];
      norm_array[0] += r0[count+3]*s[count+3];

      norm_array[1] += r0[count]*s[count+1];
      norm_array[1] -= r0[count+1]*s[count];
      norm_array[1] += r0[count+2]*s[count+3];
      norm_array[1] -= r0[count+3]*s[count+2];
#else

      // ** phi=(r0,s)

      /* Left vector = r0 */
      lvec = _mm_load_ps(&r0[count]);

      /* Load dotprod accumulated so far */
      dotprod = _mm_load_pd(&norm_array[0]);

      /* Turn left vector into double precision */
      r0lo = _mm_cvtps_pd(lvec);
      lvec = _mm_shuffle_ps(lvec,lvec,0x4e);
      r0hi = _mm_cvtps_pd(lvec);

      /* So now: r0lo = [ r0[count+1], r0[count]   ]
                 r0hi = [ r0[count+3], r0[count+2] ]
                 slo  = [ s[count+1],  s[count]    ]
                 shi  = [ s[count+3],  s[count+2]  ]

         dotprod holds the current partial sum */

      /* dotprod[0] += r0lo[0]*slo[0]; */
      /* dotprod[1] -= r0lo[1]*slo[0]; */
      t1 = _mm_shuffle_pd(slo,slo,0x0);
      dotprod = _mm_add_pd(dotprod, _mm_mul_pd(r0lo, _mm_mul_pd(mask,t1)));

      /* dotprod[0] += r0lo[1]*slo[1]; */
      /* dotprod[1] += r0lo[0]*slo[1]; */
      t1 = _mm_shuffle_pd(slo,slo,0x3);
      t2 = _mm_shuffle_pd(r0lo,r0lo,0x1);
      dotprod = _mm_add_pd(dotprod, _mm_mul_pd(t2,t1));

      /* dotprod[0] += r0hi[0]*shi[0]; */
      /* dotprod[1] -= r0hi[1]*shi[0]; */
      t1 = _mm_shuffle_pd(shi,shi,0x0);
      dotprod = _mm_add_pd(dotprod, _mm_mul_pd(r0hi, _mm_mul_pd(mask,t1)));

      /* dotprod[0] += r0hi[1]*shi[1]; */
      /* dotprod[1] += r0hi[0]*shi[1]; */
      t1 = _mm_shuffle_pd(shi,shi,0x3);
      t2 = _mm_shuffle_pd(r0hi,r0hi,0x1);
      dotprod = _mm_add_pd(dotprod, _mm_mul_pd(t2,t1));
      _mm_store_pd(&norm_array[0],dotprod);

#endif
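      /* The remaining complex inner products -- (f0,s), (r0,q), (f0,t) and
         (t,s) -- repeat exactly the same shuffle/multiply/add pattern on the
         double-precision halves prepared above. */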

      // ** phi=(f0,s)

      /* slo and shi still hold s in double precision from above,
         so there is no need to convert the right-hand vector again */
#if 0
      norm_array[2] += f0[count]*s[count];
      norm_array[2] += f0[count+1]*s[count+1];
      norm_array[2] += f0[count+2]*s[count+2];
      norm_array[2] += f0[count+3]*s[count+3];

      norm_array[3] += f0[count]*s[count+1];
      norm_array[3] -= f0[count+1]*s[count];
      norm_array[3] += f0[count+2]*s[count+3];
      norm_array[3] -= f0[count+3]*s[count+2];
#else

      // ** phi=(f0,s)

      /* Left vector = f0 */
      lvec = _mm_load_ps(&f0[count]);

      /* Load dotprod accumulated so far */
      dotprod = _mm_load_pd(&norm_array[2]);

      /* Turn left vector into double precision */
      f0lo = _mm_cvtps_pd(lvec);
      lvec = _mm_shuffle_ps(lvec,lvec,0x4e);
      f0hi = _mm_cvtps_pd(lvec);

      /* So now: f0lo = [ f0[count+1], f0[count]   ]
                 f0hi = [ f0[count+3], f0[count+2] ]
                 slo  = [ s[count+1],  s[count]    ]
                 shi  = [ s[count+3],  s[count+2]  ]

         dotprod holds the current partial sum */

      /* dotprod[0] += f0lo[0]*slo[0]; */
      /* dotprod[1] -= f0lo[1]*slo[0]; */
      t1 = _mm_shuffle_pd(slo,slo,0x0);
      dotprod = _mm_add_pd(dotprod, _mm_mul_pd(f0lo, _mm_mul_pd(mask,t1)));

      /* dotprod[0] += f0lo[1]*slo[1]; */
      /* dotprod[1] += f0lo[0]*slo[1]; */
      t1 = _mm_shuffle_pd(slo,slo,0x3);
      t2 = _mm_shuffle_pd(f0lo,f0lo,0x1);
      dotprod = _mm_add_pd(dotprod, _mm_mul_pd(t2,t1));

      /* dotprod[0] += f0hi[0]*shi[0]; */
      /* dotprod[1] -= f0hi[1]*shi[0]; */
      t1 = _mm_shuffle_pd(shi,shi,0x0);
      dotprod = _mm_add_pd(dotprod, _mm_mul_pd(f0hi, _mm_mul_pd(mask,t1)));

      /* dotprod[0] += f0hi[1]*shi[1]; */
      /* dotprod[1] += f0hi[0]*shi[1]; */
      t1 = _mm_shuffle_pd(shi,shi,0x3);
      t2 = _mm_shuffle_pd(f0hi,f0hi,0x1);
      dotprod = _mm_add_pd(dotprod, _mm_mul_pd(t2,t1));
      _mm_store_pd(&norm_array[2],dotprod);
#endif

#if 0
      // Now inner products with q.
      // ** pi=(r0,q)
      norm_array[4] += r0[count]*q[count];
      norm_array[4] += r0[count+1]*q[count+1];
      norm_array[4] += r0[count+2]*q[count+2];
      norm_array[4] += r0[count+3]*q[count+3];

      norm_array[5] += r0[count]*q[count+1];
      norm_array[5] -= r0[count+1]*q[count];
      norm_array[5] += r0[count+2]*q[count+3];
      norm_array[5] -= r0[count+3]*q[count+2];
#else
      // ** pi=(r0,q)

      /* Load dotprod accumulated so far */
      dotprod = _mm_load_pd(&norm_array[4]);

      /* dotprod holds the current partial sum */

      /* dotprod[0] += r0lo[0]*qlo[0]; */
      /* dotprod[1] -= r0lo[1]*qlo[0]; */
      t1 = _mm_shuffle_pd(qlo,qlo,0x0);
      dotprod = _mm_add_pd(dotprod, _mm_mul_pd(r0lo, _mm_mul_pd(mask,t1)));

      /* dotprod[0] += r0lo[1]*qlo[1]; */
      /* dotprod[1] += r0lo[0]*qlo[1]; */
      t1 = _mm_shuffle_pd(qlo,qlo,0x3);
      t2 = _mm_shuffle_pd(r0lo,r0lo,0x1);
      dotprod = _mm_add_pd(dotprod, _mm_mul_pd(t2,t1));

      /* dotprod[0] += r0hi[0]*qhi[0]; */
      /* dotprod[1] -= r0hi[1]*qhi[0]; */
      t1 = _mm_shuffle_pd(qhi,qhi,0x0);
      dotprod = _mm_add_pd(dotprod, _mm_mul_pd(r0hi, _mm_mul_pd(mask,t1)));

      /* dotprod[0] += r0hi[1]*qhi[1]; */
      /* dotprod[1] += r0hi[0]*qhi[1]; */
      t1 = _mm_shuffle_pd(qhi,qhi,0x3);
      t2 = _mm_shuffle_pd(r0hi,r0hi,0x1);
      dotprod = _mm_add_pd(dotprod, _mm_mul_pd(t2,t1));
      _mm_store_pd(&norm_array[4],dotprod);
#endif

#if 0
      // Now inner products with t.
      // ** eta=(f0,t)
      norm_array[6] += f0[count]*t[count];
      norm_array[6] += f0[count+1]*t[count+1];
      norm_array[6] += f0[count+2]*t[count+2];
      norm_array[6] += f0[count+3]*t[count+3];

      norm_array[7] += f0[count]*t[count+1];
      norm_array[7] -= f0[count+1]*t[count];
      norm_array[7] += f0[count+2]*t[count+3];
      norm_array[7] -= f0[count+3]*t[count+2];
#else
      // ** eta=(f0,t)

      /* Load dotprod accumulated so far */
      dotprod = _mm_load_pd(&norm_array[6]);

      /* dotprod holds the current partial sum */

      /* dotprod[0] += f0lo[0]*tlo[0]; */
      /* dotprod[1] -= f0lo[1]*tlo[0]; */
      t1 = _mm_shuffle_pd(tlo,tlo,0x0);
      dotprod = _mm_add_pd(dotprod, _mm_mul_pd(f0lo, _mm_mul_pd(mask,t1)));

      /* dotprod[0] += f0lo[1]*tlo[1]; */
      /* dotprod[1] += f0lo[0]*tlo[1]; */
      t1 = _mm_shuffle_pd(tlo,tlo,0x3);
      t2 = _mm_shuffle_pd(f0lo,f0lo,0x1);
      dotprod = _mm_add_pd(dotprod, _mm_mul_pd(t2,t1));

      /* dotprod[0] += f0hi[0]*thi[0]; */
      /* dotprod[1] -= f0hi[1]*thi[0]; */
      t1 = _mm_shuffle_pd(thi,thi,0x0);
      dotprod = _mm_add_pd(dotprod, _mm_mul_pd(f0hi, _mm_mul_pd(mask,t1)));

      /* dotprod[0] += f0hi[1]*thi[1]; */
      /* dotprod[1] += f0hi[0]*thi[1]; */
      t1 = _mm_shuffle_pd(thi,thi,0x3);
      t2 = _mm_shuffle_pd(f0hi,f0hi,0x1);
      dotprod = _mm_add_pd(dotprod, _mm_mul_pd(t2,t1));
      _mm_store_pd(&norm_array[6],dotprod);
#endif

#if 0
      // ** theta=(t,s)
      norm_array[8] += t[count]*s[count];
      norm_array[8] += t[count+1]*s[count+1];
      norm_array[8] += t[count+2]*s[count+2];
      norm_array[8] += t[count+3]*s[count+3];

      norm_array[9] += t[count]*s[count+1];
      norm_array[9] -= t[count+1]*s[count];
      norm_array[9] += t[count+2]*s[count+3];
      norm_array[9] -= t[count+3]*s[count+2];
#else
      // ** theta=(t,s)

      /* Load dotprod accumulated so far */
      dotprod = _mm_load_pd(&norm_array[8]);

      /* dotprod holds the current partial sum */

      /* dotprod[0] += tlo[0]*slo[0]; */
      /* dotprod[1] -= tlo[1]*slo[0]; */
      t1 = _mm_shuffle_pd(slo,slo,0x0);
      dotprod = _mm_add_pd(dotprod, _mm_mul_pd(tlo, _mm_mul_pd(mask,t1)));

      /* dotprod[0] += tlo[1]*slo[1]; */
      /* dotprod[1] += tlo[0]*slo[1]; */
      t1 = _mm_shuffle_pd(slo,slo,0x3);
      t2 = _mm_shuffle_pd(tlo,tlo,0x1);
      dotprod = _mm_add_pd(dotprod, _mm_mul_pd(t2,t1));

      /* dotprod[0] += thi[0]*shi[0]; */
      /* dotprod[1] -= thi[1]*shi[0]; */
      t1 = _mm_shuffle_pd(shi,shi,0x0);
      dotprod = _mm_add_pd(dotprod, _mm_mul_pd(thi, _mm_mul_pd(mask,t1)));

      /* dotprod[0] += thi[1]*shi[1]; */
      /* dotprod[1] += thi[0]*shi[1]; */
      t1 = _mm_shuffle_pd(shi,shi,0x3);
      t2 = _mm_shuffle_pd(thi,thi,0x1);
      dotprod = _mm_add_pd(dotprod, _mm_mul_pd(t2,t1));
      _mm_store_pd(&norm_array[8],dotprod);
#endif

#if 0
      // ** kappa = || t ||^2
      norm_array[10] += t[count]*t[count];
      norm_array[10] += t[count+1]*t[count+1];
      norm_array[10] += t[count+2]*t[count+2];
      norm_array[10] += t[count+3]*t[count+3];

      // ** rnorm = || r ||^2
      norm_array[11] += r[count]*r[count];
      norm_array[11] += r[count+1]*r[count+1];
      norm_array[11] += r[count+2]*r[count+2];
      norm_array[11] += r[count+3]*r[count+3];
#else
      dotprod = _mm_load_pd(&norm_array[10]);

      r0lo = _mm_cvtps_pd(rvec);
      rvec = _mm_shuffle_ps(rvec,rvec,0x4e);
      r0hi = _mm_cvtps_pd(rvec);

      // [ti,tr] & [ri,rr] -> [tr, rr]
      f0lo = _mm_shuffle_pd(tlo,r0lo, 0x0);

      // [ti,tr] & [ri,rr] -> [ti, ri]
      f0hi = _mm_shuffle_pd(tlo,r0lo, 0x3);

      // [ tnorm, rnorm ] += [ tr^2, rr^2 ]
      // [ tnorm, rnorm ] += [ ti^2, ri^2 ]
      dotprod = _mm_add_pd(dotprod,_mm_mul_pd(f0lo,f0lo));
      dotprod = _mm_add_pd(dotprod,_mm_mul_pd(f0hi,f0hi));

      // [ti,tr] & [ri,rr] -> [tr, rr]
      f0lo = _mm_shuffle_pd(thi,r0hi, 0x0);

      // [ti,tr] & [ri,rr] -> [ti, ri]
      f0hi = _mm_shuffle_pd(thi,r0hi, 0x3);

      // [ tnorm, rnorm ] += [ tr^2, rr^2 ]
      // [ tnorm, rnorm ] += [ ti^2, ri^2 ]
      dotprod = _mm_add_pd(dotprod,_mm_mul_pd(f0lo,f0lo));
      dotprod = _mm_add_pd(dotprod,_mm_mul_pd(f0hi,f0hi));

      _mm_store_pd(&norm_array[10],dotprod);
#endif
    }
  }
  else {
    QDPIO::cout << "ord_ib_stupdates_kernel_sse.h: len not divisible by 4" << std::endl;
    QDP_abort(1);
  }
}

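/* Summary of what the kernel above accumulates, per complex pair
   (x[2j], x[2j+1]) = x_re + i*x_im over the assigned range:

     s = r - alpha*v,   t = u - alpha*q        (alpha = a_r + i*a_i)

     norm_array[0..1]  += conj(r0) * s      (phi)
     norm_array[2..3]  += conj(f0) * s
     norm_array[4..5]  += conj(r0) * q      (pi)
     norm_array[6..7]  += conj(f0) * t      (eta)
     norm_array[8..9]  += conj(t)  * s      (theta)
     norm_array[10]    += |t|^2             (kappa)
     norm_array[11]    += |r|^2             (rnorm)

   The double-precision kernel below performs the same updates, one complex
   number per SSE register. */
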
inline
void ord_ib_stupdates_kernel_real64(int lo, int hi, int my_id, ib_stupdate_arg<REAL64>* a)
{
  REAL64 a_r = a->a_r;
  REAL64 a_i = a->a_i;
  int atom = a->atom;
  int low = atom*lo;
  int len = atom*(hi-lo);

  REAL64* r  = &(a->r[low]);
  REAL64* u  = &(a->u[low]);
  REAL64* v  = &(a->v[low]);
  REAL64* q  = &(a->q[low]);
  REAL64* r0 = &(a->r0[low]);
  REAL64* f0 = &(a->f0[low]);
  REAL64* s  = &(a->s[low]);
  REAL64* t  = &(a->t[low]);
  REAL64* norm_array = &(a->norm_space[12*my_id]);

  __m128d svec, rvec, vvec;
  __m128d qvec, uvec, tvec;
  __m128d ar_vec = _mm_set_pd(a_r,a_r);
  __m128d ai_vec = _mm_set_pd(a_i,-a_i);

  __m128d dotprod;
  __m128d sv;
  __m128d tv;
  __m128d qv;
  __m128d r0v;
  __m128d f0v;
  __m128d mask = _mm_set_pd((double)-1,(double)1);
  __m128d t1,t2;

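  /* Double precision: each SSE register holds exactly one complex number
     (re,im).  ar_vec and ai_vec play the same role as in the real32 kernel:
     the 0x1 shuffle swaps re and im, and the sign pattern in ai_vec
     (-a_i in the re lane, +a_i in the im lane) supplies the cross terms of
     s = r - alpha*v and t = u - alpha*q.  The inner products are accumulated
     directly, with no single-to-double conversion step. */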
  // Caller zeroed norm_space
  if( len % 2 == 0){
    for(int count = 0; count < len; count+=2) {
      // First need s = r - alpha*v
      rvec = _mm_load_pd(&r[count]);
      vvec = _mm_load_pd(&v[count]);
      t1 = _mm_shuffle_pd(vvec,vvec,0x1);

      uvec = _mm_load_pd(&u[count]);
      qvec = _mm_load_pd(&q[count]);
      t2 = _mm_shuffle_pd(qvec,qvec,0x1);

      svec = _mm_sub_pd(rvec, _mm_mul_pd(ar_vec, vvec));
      r0v = _mm_load_pd(&r0[count]);
      svec = _mm_sub_pd(svec, _mm_mul_pd(ai_vec, t1));

      tvec = _mm_sub_pd(uvec, _mm_mul_pd(ar_vec, qvec));
      f0v = _mm_load_pd(&f0[count]);
      tvec = _mm_sub_pd(tvec, _mm_mul_pd(ai_vec, t2));

      _mm_store_pd(&s[count],svec);
      _mm_store_pd(&t[count],tvec);

      // Now inner products with s.
      // ** phi=(r0,s)
#if 0
      norm_array[0] += r0[count]*s[count];
      norm_array[0] += r0[count+1]*s[count+1];

      norm_array[1] += r0[count]*s[count+1];
      norm_array[1] -= r0[count+1]*s[count];
#else

      dotprod = _mm_load_pd(&norm_array[0]);

      /* dotprod holds the current partial sum */

      /* dotprod[0] += r0[0]*s[0]; */
      /* dotprod[1] -= r0[1]*s[0]; */
      t1 = _mm_shuffle_pd(svec,svec,0x0);
      dotprod = _mm_add_pd(dotprod, _mm_mul_pd(r0v, _mm_mul_pd(mask,t1)));

      /* dotprod[0] += r0[1]*s[1]; */
      /* dotprod[1] += r0[0]*s[1]; */
      t1 = _mm_shuffle_pd(svec,svec,0x3);
      t2 = _mm_shuffle_pd(r0v,r0v,0x1);
      dotprod = _mm_add_pd(dotprod, _mm_mul_pd(t2,t1));

      _mm_store_pd(&norm_array[0],dotprod);
#endif

#if 0
      // ** phi=(f0,s)
      norm_array[2] += f0[count]*s[count];
      norm_array[2] += f0[count+1]*s[count+1];

      norm_array[3] += f0[count]*s[count+1];
      norm_array[3] -= f0[count+1]*s[count];
#else

      dotprod = _mm_load_pd(&norm_array[2]);

      /* dotprod holds the current partial sum */

      /* dotprod[0] += f0[0]*s[0]; */
      /* dotprod[1] -= f0[1]*s[0]; */
      t1 = _mm_shuffle_pd(svec,svec,0x0);
      dotprod = _mm_add_pd(dotprod, _mm_mul_pd(f0v, _mm_mul_pd(mask,t1)));

      /* dotprod[0] += f0[1]*s[1]; */
      /* dotprod[1] += f0[0]*s[1]; */
      t1 = _mm_shuffle_pd(svec,svec,0x3);
      t2 = _mm_shuffle_pd(f0v,f0v,0x1);
      dotprod = _mm_add_pd(dotprod, _mm_mul_pd(t2,t1));

      _mm_store_pd(&norm_array[2],dotprod);
#endif

#if 0
      // Now inner products with q.
      // ** pi=(r0,q)
      norm_array[4] += r0[count]*q[count];
      norm_array[4] += r0[count+1]*q[count+1];

      norm_array[5] += r0[count]*q[count+1];
      norm_array[5] -= r0[count+1]*q[count];
#else
      dotprod = _mm_load_pd(&norm_array[4]);

      /* dotprod holds the current partial sum */

      /* dotprod[0] += r0[0]*q[0]; */
      /* dotprod[1] -= r0[1]*q[0]; */
      t1 = _mm_shuffle_pd(qvec,qvec,0x0);
      dotprod = _mm_add_pd(dotprod, _mm_mul_pd(r0v, _mm_mul_pd(mask,t1)));

      /* dotprod[0] += r0[1]*q[1]; */
      /* dotprod[1] += r0[0]*q[1]; */
      t1 = _mm_shuffle_pd(qvec,qvec,0x3);
      t2 = _mm_shuffle_pd(r0v,r0v,0x1);
      dotprod = _mm_add_pd(dotprod, _mm_mul_pd(t2,t1));

      _mm_store_pd(&norm_array[4],dotprod);
#endif

#if 0
      // Now inner products with t.
      // ** eta=(f0,t)
      norm_array[6] += f0[count]*t[count];
      norm_array[6] += f0[count+1]*t[count+1];

      norm_array[7] -= f0[count+1]*t[count];
      norm_array[7] += f0[count]*t[count+1];
#else
      dotprod = _mm_load_pd(&norm_array[6]);

      /* dotprod holds the current partial sum */

      /* dotprod[0] += f0[0]*t[0]; */
      /* dotprod[1] -= f0[1]*t[0]; */
      t1 = _mm_shuffle_pd(tvec,tvec,0x0);
      dotprod = _mm_add_pd(dotprod, _mm_mul_pd(f0v, _mm_mul_pd(mask,t1)));

      /* dotprod[0] += f0[1]*t[1]; */
      /* dotprod[1] += f0[0]*t[1]; */
      t1 = _mm_shuffle_pd(tvec,tvec,0x3);
      t2 = _mm_shuffle_pd(f0v,f0v,0x1);
      dotprod = _mm_add_pd(dotprod, _mm_mul_pd(t2,t1));

      _mm_store_pd(&norm_array[6],dotprod);
#endif

#if 0
      // ** theta=(t,s)
      norm_array[8] += t[count]*s[count];
      norm_array[8] += t[count+1]*s[count+1];

      norm_array[9] += t[count]*s[count+1];
      norm_array[9] -= t[count+1]*s[count];
#else
      dotprod = _mm_load_pd(&norm_array[8]);

      /* dotprod holds the current partial sum */

      /* dotprod[0] += t[0]*s[0]; */
      /* dotprod[1] -= t[1]*s[0]; */
      t1 = _mm_shuffle_pd(svec,svec,0x0);
      dotprod = _mm_add_pd(dotprod, _mm_mul_pd(tvec, _mm_mul_pd(mask,t1)));

      /* dotprod[0] += t[1]*s[1]; */
      /* dotprod[1] += t[0]*s[1]; */
      t1 = _mm_shuffle_pd(svec,svec,0x3);
      t2 = _mm_shuffle_pd(tvec,tvec,0x1);
      dotprod = _mm_add_pd(dotprod, _mm_mul_pd(t2,t1));

      _mm_store_pd(&norm_array[8],dotprod);
#endif

#if 0
      // ** kappa = || t ||^2
      norm_array[10] += t[count]*t[count];
      norm_array[10] += t[count+1]*t[count+1];

      // ** rnorm = || r ||^2
      norm_array[11] += r[count]*r[count];
      norm_array[11] += r[count+1]*r[count+1];
#else
      dotprod = _mm_load_pd(&norm_array[10]);

      // [ti,tr] & [ri,rr] -> [tr, rr]
      f0v = _mm_shuffle_pd(tvec,rvec, 0x0);

      // [ti,tr] & [ri,rr] -> [ti, ri]
      r0v = _mm_shuffle_pd(tvec,rvec, 0x3);

      // [ tnorm, rnorm ] += [ tr^2, rr^2 ]
      // [ tnorm, rnorm ] += [ ti^2, ri^2 ]
      dotprod = _mm_add_pd(dotprod,_mm_mul_pd(f0v,f0v));
      dotprod = _mm_add_pd(dotprod,_mm_mul_pd(r0v,r0v));

      _mm_store_pd(&norm_array[10],dotprod);
#endif

    }
  }
  else {
    QDPIO::cout << "ord_ib_stupdates_kernel_sse.h: len not divisible by 2" << std::endl;
    QDP_abort(1);
  }

}
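
Both kernels only produce per-thread partial sums: norm_array points at the
twelve doubles a->norm_space[12*my_id] ... a->norm_space[12*my_id+11], which
the caller zeroes before dispatching the kernel and reduces afterwards. A
minimal sketch of that reduction, assuming a hypothetical helper name and
that n_threads kernel instances have filled their slots (the actual Chroma
driver and its threading layer are not shown here):

// Illustration only: combine the per-thread partials into the five complex
// inner products and two norms.  Function and parameter names are assumed.
#include <complex>

inline void reduce_ib_stupdates(const double* norm_space, int n_threads,
                                std::complex<double>& ip_r0s,  // phi   = (r0,s)
                                std::complex<double>& ip_f0s,  //         (f0,s)
                                std::complex<double>& ip_r0q,  // pi    = (r0,q)
                                std::complex<double>& ip_f0t,  // eta   = (f0,t)
                                std::complex<double>& ip_ts,   // theta = (t,s)
                                double& kappa, double& rnorm)
{
  double acc[12] = {0.0};                    // running totals
  for (int id = 0; id < n_threads; id++)     // each thread owns 12 doubles
    for (int k = 0; k < 12; k++)
      acc[k] += norm_space[12*id + k];

  ip_r0s = std::complex<double>(acc[0], acc[1]);
  ip_f0s = std::complex<double>(acc[2], acc[3]);
  ip_r0q = std::complex<double>(acc[4], acc[5]);
  ip_f0t = std::complex<double>(acc[6], acc[7]);
  ip_ts  = std::complex<double>(acc[8], acc[9]);
  kappa  = acc[10];                          // ||t||^2
  rnorm  = acc[11];                          // ||r||^2
}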