CHROMA
bicgstab_kernels_scalarsite.h
Go to the documentation of this file.
1 #ifndef BICGSTAB_KERNELS_SCALARSITE_H
2 #define BICGSTAB_KERNELS_SCALARSITE_H
3 #include "chromabase.h"
4 #include "chroma_config.h"
5 
6 /* The funky kernels used by BiCGStab.
7  These versions should always work... */
8 
9 namespace Chroma {
10 
11  namespace BiCGStabKernels {
12  void initScalarSiteKernels();
14 
15  REAL64* getNormSpace();
16 
18  {
19  REAL64* x_ptr;
20  REAL64* y_ptr;
21  REAL64* z_ptr;
22  REAL64* norm_ptr;
23  int atom;
24  };
25 
27 
28 
29  template<>
30  inline
31  void xymz_normx(LatticeDiracFermionD& x,
32  const LatticeDiracFermionD& y,
33  const LatticeDiracFermionD& z,
34  Double& x_norm,
35  const Subset& s)
36  {
37  REAL64 norm = 0;
38 
39  if ( s.hasOrderedRep() ) {
40 
41  REAL64* x_ptr = (REAL64*)&(x.elem(s.start()).elem(0).elem(0).real());
42  REAL64* y_ptr = (REAL64*)&(y.elem(s.start()).elem(0).elem(0).real());
43  REAL64* z_ptr = (REAL64*)&(z.elem(s.start()).elem(0).elem(0).real());
44  REAL64* norms = getNormSpace();
45 
46  ord_xymz_normx_arg arg={x_ptr,y_ptr,z_ptr,norms,4*3*2};
47  int len = (s.end()-s.start()+1);
48 
49  dispatch_to_threads(len,arg,ord_xymz_normx_kernel);
50  norm = norms[0];
51  // Sum the norms...
52  for(int i=1 ; i < qdpNumThreads(); i++) {
53  norm += norms[i];
54  }
55  }
56  else {
57  QDPIO::cerr << "I only work for ordered subsets for now" << std::endl;
58  QDP_abort(1);
59  }
60 
61  QDPInternal::globalSum(norm);
62  x_norm = Double(norm);
63 
64  }
65 
66 
68  {
69  REAL32* x_ptr;
70  REAL32* y_ptr;
71  REAL32* z_ptr;
72  REAL32 a_re;
73  REAL32 a_im;
74  REAL32 b_re;
75  REAL32 b_im;
76  int atom;
77  };
78 
80  template<>
81  inline
82  void yxpaymabz(LatticeDiracFermionF& x,
83  LatticeDiracFermionF& y,
84  LatticeDiracFermionF& z,
85  const ComplexF& a, const ComplexF& b, const Subset& s)
86  {
87  //QDPIO::cout << "Using optimized yxpaymabz" << std::endl;
88 
89  if( s.hasOrderedRep() ) {
90  REAL32* x_ptr = (REAL32*)&(x.elem(s.start()).elem(0).elem(0).real());
91  REAL32* y_ptr = (REAL32*)&(y.elem(s.start()).elem(0).elem(0).real());
92  REAL32* z_ptr = (REAL32*)&(z.elem(s.start()).elem(0).elem(0).real());
93  REAL32 a_re = a.elem().elem().elem().real();
94  REAL32 a_im = a.elem().elem().elem().imag();
95  REAL32 b_re = b.elem().elem().elem().real();
96  REAL32 b_im = b.elem().elem().elem().imag();
97  ord_yxpaymabz_arg arg={x_ptr,y_ptr,z_ptr,a_re,a_im, b_re, b_im,4*3*2};
98 
99  int len = (s.end()-s.start()+1);
100  dispatch_to_threads(len,arg,ord_yxpaymabz_kernel);
101  }
102  else {
103  QDPIO::cerr << "I only work for ordered subsets for now" << std::endl;
104  QDP_abort(1);
105  }
106  }
107 
108 
110  {
111  REAL32* x_ptr;
112  REAL32* y_ptr;
113  REAL64* norm_space;
114  int atom;
115 
116  };
117 
118 
120 
121  template<>
122  inline
123  void norm2x_cdotxy(const LatticeDiracFermionF& x,
124  const LatticeDiracFermionF& y, Double& norm2x, DComplex& cdotxy, const Subset& s)
125  {
126 
127  REAL64 norm_array[3]={0,0,0};
128 
129 
130  if ( s.hasOrderedRep() ) {
131  REAL32* x_ptr = (REAL32*)&(x.elem(s.start()).elem(0).elem(0).real());
132  REAL32* y_ptr = (REAL32*)&(y.elem(s.start()).elem(0).elem(0).real());
133  REAL64* norm_space = getNormSpace();
134 
135  int len = (s.end()-s.start()+1);
136  ord_norm2x_cdotxy_arg arg={x_ptr,y_ptr,norm_space,4*3*2};
137  dispatch_to_threads(len,arg, ord_norm2x_cdotxy_kernel);
138 
139  for(int i=0; i < 3*qdpNumThreads(); i+=3) {
140  norm_array[0] += norm_space[i];
141  norm_array[1] += norm_space[i+1];
142  norm_array[2] += norm_space[i+2];
143  }
144  }
145  else {
146  QDPIO::cerr << "I only work for ordered subsets for now" << std::endl;
147  QDP_abort(1);
148  }
149 
150  QDPInternal::globalSumArray(norm_array,3);
151 
152  norm2x.elem().elem().elem().elem() =norm_array[0];
153  cdotxy.elem().elem().elem().real()= norm_array[1];
154  cdotxy.elem().elem().elem().imag() = norm_array[2];
155 
156  }
157 
158 
160  REAL32* x_ptr;
161  REAL32* y_ptr;
162  REAL32* z_ptr;
163  REAL32 a_re;
164  REAL32 a_im;
165  REAL32 b_re;
166  REAL32 b_im;
167  int atom;
168  };
169 
170 
172 
173  template<>
174  inline
175  void xpaypbz(LatticeDiracFermionF& x,
176  LatticeDiracFermionF& y,
177  LatticeDiracFermionF& z,
178  ComplexF& a,
179  ComplexF& b,
180  const Subset& s)
181 
182  {
183 
184  if( s.hasOrderedRep() ) {
185  REAL32* x_ptr = (REAL32*)&(x.elem(s.start()).elem(0).elem(0).real());
186  REAL32* y_ptr = (REAL32*)&(y.elem(s.start()).elem(0).elem(0).real());
187  REAL32* z_ptr = (REAL32*)&(z.elem(s.start()).elem(0).elem(0).real());
188  REAL32 a_re = a.elem().elem().elem().real();
189  REAL32 a_im = a.elem().elem().elem().imag();
190  REAL32 b_re = b.elem().elem().elem().real();
191  REAL32 b_im = b.elem().elem().elem().imag();
192  ord_xpaypbz_arg arg={x_ptr,y_ptr,z_ptr,a_re,a_im,b_re,b_im,4*3*2};
193 
194  int len=(s.end()-s.start()+1);
195  dispatch_to_threads(len,arg, ord_xpaypbz_kernel);
196  }
197  else {
198  QDPIO::cerr << "I only work for ordered subsets for now" << std::endl;
199  QDP_abort(1);
200  }
201 
202 
203  }
204 
205 
206 
208  REAL32* x_ptr;
209  REAL32* y_ptr;
210  REAL32* z_ptr;
211  REAL32 a_re;
212  REAL32 a_im;
213  REAL64* norm_space;
214  int atom;
215  };
216 
217 
219 
220 
221  template<>
222  inline
223  void xmay_normx_cdotzx(LatticeDiracFermionF& x,
224  const LatticeDiracFermionF& y,
225  const LatticeDiracFermionF& z,
226  ComplexF& a,
227  Double& normx,
228  DComplex& cdotzx, const Subset& s)
229  {
230 
231  REAL64 norm_array[3]={0,0,0};
232 
233  if( s.hasOrderedRep() ) {
234  REAL32* x_ptr = (REAL32*)&(x.elem(s.start()).elem(0).elem(0).real());
235  REAL32* y_ptr = (REAL32*)&(y.elem(s.start()).elem(0).elem(0).real());
236  REAL32* z_ptr = (REAL32*)&(z.elem(s.start()).elem(0).elem(0).real());
237 
238 
239  REAL32 a_re = a.elem().elem().elem().real();
240  REAL32 a_im = a.elem().elem().elem().imag();
241  REAL64* norm_space = getNormSpace();
242  ord_xmay_normx_cdotzx_arg arg = {x_ptr,y_ptr,z_ptr,a_re,a_im, norm_space,4*3*2};
243 
244 
245  int len =(s.end()-s.start()+1);
246  dispatch_to_threads(len,arg,ord_xmay_normx_cdotzx_kernel);
247 
248  for(int i=0; i < 3*qdpNumThreads(); i+=3) {
249  norm_array[0] += norm_space[i];
250  norm_array[1] += norm_space[i+1];
251  norm_array[2] += norm_space[i+2];
252  }
253 
254  }
255  else {
256  QDPIO::cerr << "I only work for ordered subsets for now" << std::endl;
257  QDP_abort(1);
258  }
259 
260  QDPInternal::globalSumArray(norm_array,3);
261  normx.elem().elem().elem().elem() =norm_array[0];
262  cdotzx.elem().elem().elem().real()= norm_array[1];
263  cdotzx.elem().elem().elem().imag() = norm_array[2];
264 
265  // x[s] -= a*y;
266  //normx = norm2(x,s);
267  // cdotzx = innerProduct(z,x,s);
268  }
269 
270 
271  struct ord_cxmayf_arg {
272  REAL32* x_ptr;
273  REAL32* y_ptr;
274  REAL32 a_re;
275  REAL32 a_im;
276  int atom;
277  };
278 
279 
281  template<>
282  inline
283  void cxmay(LatticeDiracFermionF& x,
284  const LatticeDiracFermionF& y,
285  const ComplexF& a,
286  const Subset& s)
287 
288  {
289 
290  if( s.hasOrderedRep() ) {
291  REAL32* x_ptr = (REAL32*)&(x.elem(s.start()).elem(0).elem(0).real());
292  REAL32* y_ptr = (REAL32*)&(y.elem(s.start()).elem(0).elem(0).real());
293  REAL32 a_re = a.elem().elem().elem().real();
294  REAL32 a_im = a.elem().elem().elem().imag();
295  ord_cxmayf_arg arg={x_ptr,y_ptr,a_re,a_im,4*3*2};
296 
297  int len=(s.end()-s.start()+1);
298  dispatch_to_threads(len,arg, ord_cxmayf_kernel);
299  }
300  else {
301  QDPIO::cerr << "I only work for ordered subsets for now" << std::endl;
302  QDP_abort(1);
303  }
304 
305 
306  }
307 
308 
309  /********************
310  * IBICGSTAB KERNELS
311  ******************** */
312 
313 
314  template<typename F>
321 
324 
327 
330 
333 
336  int atom;
337  };
338 
340 
341 #if 1
342  template<>
343  inline
344  void
345  ibicgstab_zvupdates(const LatticeDiracFermionF3& r,
346  LatticeDiracFermionF3& z,
347  LatticeDiracFermionF3& v,
348  const LatticeDiracFermionF3& u,
349  const LatticeDiracFermionF3& q,
350  const ComplexD& alpha,
351  const ComplexD& alpha_rat_beta,
352  const ComplexD& alpha_delta,
353  const ComplexD& beta,
354  const ComplexD& delta,
355  const Subset& sub)
356  {
357  if( sub.hasOrderedRep() ) {
358  REAL32* r_ptr = (REAL32*)&(r.elem(sub.start()).elem(0).elem(0).real());
359  REAL32* z_ptr = (REAL32*)&(z.elem(sub.start()).elem(0).elem(0).real());
360  REAL32* v_ptr = (REAL32*)&(v.elem(sub.start()).elem(0).elem(0).real());
361  REAL32* u_ptr = (REAL32*)&(u.elem(sub.start()).elem(0).elem(0).real());
362  REAL32* q_ptr = (REAL32*)&(q.elem(sub.start()).elem(0).elem(0).real());
363 
364  REAL32 a_re = (REAL32)alpha.elem().elem().elem().real();
365  REAL32 a_im = (REAL32)alpha.elem().elem().elem().imag();
366 
367  REAL32 arb_re = (REAL32)alpha_rat_beta.elem().elem().elem().real();
368  REAL32 arb_im = (REAL32)alpha_rat_beta.elem().elem().elem().imag();
369 
370  REAL32 ad_re = (REAL32)alpha_delta.elem().elem().elem().real();
371  REAL32 ad_im = (REAL32)alpha_delta.elem().elem().elem().imag();
372 
373  REAL32 beta_re = (REAL32)beta.elem().elem().elem().real();
374  REAL32 beta_im = (REAL32)beta.elem().elem().elem().imag();
375 
376  REAL32 delta_re = (REAL32)delta.elem().elem().elem().real();
377  REAL32 delta_im = (REAL32)delta.elem().elem().elem().imag();
378 
379  ib_zvupdates_arg<REAL32> arg={r_ptr,z_ptr,v_ptr,u_ptr,q_ptr,
380  a_re, a_im, arb_re, arb_im, ad_re, ad_im,
381  beta_re, beta_im, delta_re, delta_im,4*3*2};
382 
383  int len=(sub.end()-sub.start()+1);
384  dispatch_to_threads(len,arg, ord_ib_zvupdates_kernel_real32);
385  }
386  else {
387  QDPIO::cerr << "I only work for ordered subsets for now" << std::endl;
388  QDP_abort(1);
389  }
390 
391 
392 
393  }
394 #endif
395 
396 
397  template<>
398  inline
399  void
400  ibicgstab_zvupdates(const LatticeDiracFermionD3& r,
401  LatticeDiracFermionD3& z,
402  LatticeDiracFermionD3 &v,
403  const LatticeDiracFermionD3& u,
404  const LatticeDiracFermionD3& q,
405  const ComplexD& alpha,
406  const ComplexD& alpha_rat_beta,
407  const ComplexD& alpha_delta,
408  const ComplexD& beta,
409  const ComplexD& delta,
410  const Subset& sub)
411  {
412  if( sub.hasOrderedRep() ) {
413  REAL64* r_ptr = (REAL64*)&(r.elem(sub.start()).elem(0).elem(0).real());
414  REAL64* z_ptr = (REAL64*)&(z.elem(sub.start()).elem(0).elem(0).real());
415  REAL64* v_ptr = (REAL64*)&(v.elem(sub.start()).elem(0).elem(0).real());
416  REAL64* u_ptr = (REAL64*)&(u.elem(sub.start()).elem(0).elem(0).real());
417  REAL64* q_ptr = (REAL64*)&(q.elem(sub.start()).elem(0).elem(0).real());
418 
419  REAL64 a_re = alpha.elem().elem().elem().real();
420  REAL64 a_im = alpha.elem().elem().elem().imag();
421 
422  REAL64 arb_re = alpha_rat_beta.elem().elem().elem().real();
423  REAL64 arb_im = alpha_rat_beta.elem().elem().elem().imag();
424 
425  REAL64 ad_re = alpha_delta.elem().elem().elem().real();
426  REAL64 ad_im = alpha_delta.elem().elem().elem().imag();
427 
428  REAL64 beta_re = beta.elem().elem().elem().real();
429  REAL64 beta_im = beta.elem().elem().elem().imag();
430 
431  REAL64 delta_re = delta.elem().elem().elem().real();
432  REAL64 delta_im = delta.elem().elem().elem().imag();
433 
434  ib_zvupdates_arg<REAL64> arg={r_ptr,z_ptr,v_ptr,u_ptr,q_ptr,
435  a_re, a_im, arb_re, arb_im, ad_re, ad_im,
436  beta_re, beta_im, delta_re, delta_im,4*3*2};
437 
438  int len=(sub.end()-sub.start()+1);
439  dispatch_to_threads(len,arg, ord_ib_zvupdates_kernel_real64);
440  }
441  else {
442  QDPIO::cerr << "I only work for ordered subsets for now" << std::endl;
443  QDP_abort(1);
444  }
445  }
446 
447 
448 
449 
450  template<typename F>
451  struct ib_rxupdate_arg {
457 
458  F omega_re; // omega
460  int atom;
461  };
462 
464 
465  template<>
466  inline
467  void
468  ibicgstab_rxupdate(const ComplexD& omega,
469  const LatticeDiracFermionF3& s,
470  const LatticeDiracFermionF3& t,
471  const LatticeDiracFermionF3& z,
472  LatticeDiracFermionF3& r,
473  LatticeDiracFermionF3& x,
474  const Subset& sub)
475  {
476  if( sub.hasOrderedRep() ) {
477  REAL32* s_ptr = (REAL32*)&(s.elem(sub.start()).elem(0).elem(0).real());
478  REAL32* t_ptr = (REAL32*)&(t.elem(sub.start()).elem(0).elem(0).real());
479  REAL32* z_ptr = (REAL32*)&(z.elem(sub.start()).elem(0).elem(0).real());
480  REAL32* r_ptr = (REAL32*)&(r.elem(sub.start()).elem(0).elem(0).real());
481  REAL32* x_ptr = (REAL32*)&(x.elem(sub.start()).elem(0).elem(0).real());
482 
483  REAL32 omega_re = (REAL32)omega.elem().elem().elem().real();
484  REAL32 omega_im = (REAL32)omega.elem().elem().elem().imag();
485 
486  ib_rxupdate_arg<REAL32> arg={s_ptr,t_ptr,z_ptr,r_ptr,x_ptr,
487  omega_re, omega_im,4*3*2};
488 
489  int len=(sub.end()-sub.start()+1);
490  dispatch_to_threads(len,arg, ord_ib_rxupdate_kernel_real32);
491  }
492  else {
493  QDPIO::cerr << "I only work for ordered subsets for now" << std::endl;
494  QDP_abort(1);
495  }
496  }
497 
498 
499  template<>
500  inline
501  void
502  ibicgstab_rxupdate(const ComplexD& omega,
503  const LatticeDiracFermionD3& s,
504  const LatticeDiracFermionD3& t,
505  const LatticeDiracFermionD3& z,
506  LatticeDiracFermionD3& r,
507  LatticeDiracFermionD3& x,
508  const Subset& sub)
509  {
510  if( sub.hasOrderedRep() ) {
511  REAL64* s_ptr = (REAL64*)&(s.elem(sub.start()).elem(0).elem(0).real());
512  REAL64* t_ptr = (REAL64*)&(t.elem(sub.start()).elem(0).elem(0).real());
513  REAL64* z_ptr = (REAL64*)&(z.elem(sub.start()).elem(0).elem(0).real());
514  REAL64* r_ptr = (REAL64*)&(r.elem(sub.start()).elem(0).elem(0).real());
515  REAL64* x_ptr = (REAL64*)&(x.elem(sub.start()).elem(0).elem(0).real());
516 
517  REAL64 omega_re = omega.elem().elem().elem().real();
518 
519  REAL64 omega_im = omega.elem().elem().elem().imag();
520 
521  ib_rxupdate_arg<REAL64> arg={s_ptr,t_ptr,z_ptr,r_ptr,x_ptr,
522  omega_re, omega_im, 4*3*2};
523 
524  int len=(sub.end()-sub.start()+1);
525  dispatch_to_threads(len,arg, ord_ib_rxupdate_kernel_real64);
526  }
527  else {
528  QDPIO::cerr << "I only work for ordered subsets for now" << std::endl;
529  QDP_abort(1);
530  }
531  }
532 
533 
534  template<typename F>
535  struct ib_stupdate_arg {
536  F a_r; // Real + imaginary parts of alpha
537  F a_i;
538  F* r;
539  F* u;
540  F* v;
541  F* q;
542  F* r0;
543  F* f0;
544  F* s;
545  F* t;
546 
547  // Reduction results
548  REAL64* norm_space; // Space for 12 doubles
549  int atom;
550 
551  };
553 
554 
555  template<>
556  inline
557  void ibicgstab_stupdates_reduces(const ComplexD& alpha,
558  const LatticeDiracFermionF3& r,
559  const LatticeDiracFermionF3& u,
560  const LatticeDiracFermionF3& v,
561  const LatticeDiracFermionF3& q,
562  const LatticeDiracFermionF3& r0,
563  const LatticeDiracFermionF3& f0,
564  LatticeDiracFermionF3& s,
565  LatticeDiracFermionF3& t,
566  ComplexD& phi,
567  ComplexD& pi,
568  ComplexD& gamma,
569  ComplexD& eta,
570  ComplexD& theta,
571  Double& kappa,
572  Double& rnorm,
573  const Subset& sub)
574 
575  {
576 
577  REAL64 norm_array[12]={0,0,0,0,0,0,0,0,0,0,0,0};
578 
579  if( sub.hasOrderedRep() ) {
580  ib_stupdate_arg<REAL32> arg = {
581  (REAL32)alpha.elem().elem().elem().real(),
582  (REAL32)alpha.elem().elem().elem().imag(),
583  (REAL32*)&(r.elem(sub.start()).elem(0).elem(0).real()),
584  (REAL32*)&(u.elem(sub.start()).elem(0).elem(0).real()),
585  (REAL32*)&(v.elem(sub.start()).elem(0).elem(0).real()),
586  (REAL32*)&(q.elem(sub.start()).elem(0).elem(0).real()),
587  (REAL32*)&(r0.elem(sub.start()).elem(0).elem(0).real()),
588  (REAL32*)&(f0.elem(sub.start()).elem(0).elem(0).real()),
589  (REAL32*)&(s.elem(sub.start()).elem(0).elem(0).real()),
590  (REAL32*)&(t.elem(sub.start()).elem(0).elem(0).real()),
591  getNormSpace(),
592  4*3*2};
593 
594  for(int i=0; i < 12*qdpNumThreads(); i++) {
595  arg.norm_space[i] = (REAL64)0;
596  }
597 
598 
599  int len=(sub.end()-sub.start()+1);
600  dispatch_to_threads(len,arg,ord_ib_stupdates_kernel_real32);
601  for(int i=0; i < qdpNumThreads(); i++) {
602  norm_array[0] += arg.norm_space[12*i];
603  norm_array[1] += arg.norm_space[12*i+1];
604  norm_array[2] += arg.norm_space[12*i+2];
605  norm_array[3] += arg.norm_space[12*i+3];
606  norm_array[4] += arg.norm_space[12*i+4];
607  norm_array[5] += arg.norm_space[12*i+5];
608  norm_array[6] += arg.norm_space[12*i+6];
609  norm_array[7] += arg.norm_space[12*i+7];
610  norm_array[8] += arg.norm_space[12*i+8];
611  norm_array[9] += arg.norm_space[12*i+9];
612  norm_array[10] += arg.norm_space[12*i+10];
613  norm_array[11] += arg.norm_space[12*i+11];
614  }
615  QDPInternal::globalSumArray(norm_array,12);
616 
617  phi.elem().elem().elem().real() = norm_array[0];
618  phi.elem().elem().elem().imag() = norm_array[1];
619 
620  gamma.elem().elem().elem().real() = norm_array[2];
621  gamma.elem().elem().elem().imag() = norm_array[3];
622 
623  pi.elem().elem().elem().real() = norm_array[4];
624  pi.elem().elem().elem().imag() = norm_array[5];
625 
626  eta.elem().elem().elem().real() = norm_array[6];
627  eta.elem().elem().elem().imag() = norm_array[7];
628 
629  theta.elem().elem().elem().real() = norm_array[8];
630  theta.elem().elem().elem().imag() = norm_array[9];
631 
632  kappa.elem().elem().elem().elem() = norm_array[10];
633  rnorm.elem().elem().elem().elem() = norm_array[11];
634 
635  }
636  else {
637  QDPIO::cerr << "I only work for ordered subsets for now" << std::endl;
638  QDP_abort(1);
639  }
640  }
641 
642 
643  template<>
644  inline
645  void ibicgstab_stupdates_reduces(const ComplexD& alpha,
646  const LatticeDiracFermionD3& r,
647  const LatticeDiracFermionD3& u,
648  const LatticeDiracFermionD3& v,
649  const LatticeDiracFermionD3& q,
650  const LatticeDiracFermionD3& r0,
651  const LatticeDiracFermionD3& f0,
652  LatticeDiracFermionD3& s,
653  LatticeDiracFermionD3& t,
654  ComplexD& phi,
655  ComplexD& pi,
656  ComplexD& gamma,
657  ComplexD& eta,
658  ComplexD& theta,
659  Double& kappa,
660  Double& rnorm,
661  const Subset& sub)
662 
663  {
664 
665  REAL64 norm_array[12]={0,0,0,0,0,0,0,0,0,0,0,0};
666 
667  if( sub.hasOrderedRep() ) {
668 
669  ib_stupdate_arg<REAL64> arg = {
670  (REAL64)alpha.elem().elem().elem().real(),
671  (REAL64)alpha.elem().elem().elem().imag(),
672  (REAL64*)&(r.elem(sub.start()).elem(0).elem(0).real()),
673  (REAL64*)&(u.elem(sub.start()).elem(0).elem(0).real()),
674  (REAL64*)&(v.elem(sub.start()).elem(0).elem(0).real()),
675  (REAL64*)&(q.elem(sub.start()).elem(0).elem(0).real()),
676  (REAL64*)&(r0.elem(sub.start()).elem(0).elem(0).real()),
677  (REAL64*)&(f0.elem(sub.start()).elem(0).elem(0).real()),
678  (REAL64*)&(s.elem(sub.start()).elem(0).elem(0).real()),
679  (REAL64*)&(t.elem(sub.start()).elem(0).elem(0).real()),
680  getNormSpace(),
681  4*3*2};
682 
683  for(int i=0; i < 12*qdpNumThreads(); i++) {
684  arg.norm_space[i] = (REAL64)0;
685  }
686 
687  int len=(sub.end()-sub.start()+1);
688  dispatch_to_threads(len,arg,ord_ib_stupdates_kernel_real64);
689 
690 
691  for(int i=0; i < qdpNumThreads(); i++) {
692  norm_array[0] += arg.norm_space[12*i];
693  norm_array[1] += arg.norm_space[12*i+1];
694  norm_array[2] += arg.norm_space[12*i+2];
695  norm_array[3] += arg.norm_space[12*i+3];
696  norm_array[4] += arg.norm_space[12*i+4];
697  norm_array[5] += arg.norm_space[12*i+5];
698  norm_array[6] += arg.norm_space[12*i+6];
699  norm_array[7] += arg.norm_space[12*i+7];
700  norm_array[8] += arg.norm_space[12*i+8];
701  norm_array[9] += arg.norm_space[12*i+9];
702  norm_array[10] += arg.norm_space[12*i+10];
703  norm_array[11] += arg.norm_space[12*i+11];
704  }
705  QDPInternal::globalSumArray(norm_array,12);
706 
707  phi.elem().elem().elem().real() = norm_array[0];
708  phi.elem().elem().elem().imag() = norm_array[1];
709 
710  gamma.elem().elem().elem().real() = norm_array[2];
711  gamma.elem().elem().elem().imag() = norm_array[3];
712 
713  pi.elem().elem().elem().real() = norm_array[4];
714  pi.elem().elem().elem().imag() = norm_array[5];
715 
716  eta.elem().elem().elem().real() = norm_array[6];
717  eta.elem().elem().elem().imag() = norm_array[7];
718 
719  theta.elem().elem().elem().real() = norm_array[8];
720  theta.elem().elem().elem().imag() = norm_array[9];
721 
722  kappa.elem().elem().elem().elem() = norm_array[10];
723  rnorm.elem().elem().elem().elem() = norm_array[11];
724 
725  }
726  else {
727  QDPIO::cerr << "I only work for ordered subsets for now" << std::endl;
728  QDP_abort(1);
729  }
730 
731  }
732 
733 
734 
735 
736 
737  }
738 }
739 
740 
741 
742 #endif
Primary include file for CHROMA library code.
int y
Definition: meslate.cc:35
int z
Definition: meslate.cc:36
int x
Definition: meslate.cc:34
int t
Definition: meslate.cc:37
Double q
Definition: mesq.cc:17
LatticeReal phi
Definition: mesq.cc:27
void xpaypbz(T &x, T &y, T &z, C &a, C &b, const Subset &s)
void xmay_normx_cdotzx(T &x, const T &y, const T &z, C &a, Double &normx, DComplex &cdotzx, const Subset &s)
void xymz_normx(T &x, const T &y, const T &z, Double &x_norm, const Subset &s)
void norm2x_cdotxy(const T &x, const T &y, Double &norm2x, DComplex &cdotxy, const Subset &s)
void cxmay(T &x, const T &y, const C &a, const Subset &s)
void yxpaymabz(T &x, T &y, T &z, const C &a, const C &b, const Subset &s)
void ibicgstab_rxupdate(const C &omega, const T &s, const T &t, const T &z, T &r, T &x, const Subset &sub)
void ibicgstab_zvupdates(const T &r, T &z, T &v, const T &u, const T &q, const C &alpha, const C &alpha_rat_beta, const C &alpha_delta, const C &beta, const C &delta, const Subset &s)
void ibicgstab_stupdates_reduces(const C &alpha, const T &r, const T &u, const T &v, const T &q, const T &r0, const T &f0, T &s, T &t, C &phi, C &pi, C &gamma, C &eta, C &theta, F &kappa, F &rnorm, const Subset &sub)
static const LatticeInteger & beta(const int dim)
Definition: stag_phases_s.h:47
static const LatticeInteger & alpha(const int dim)
Definition: stag_phases_s.h:43
Asqtad Staggered-Dirac operator.
Definition: klein_gord.cc:10
static multi1d< LatticeColorMatrix > u
int i
Definition: pbg5p_w.cc:55
Complex a
Definition: invbicg.cc:95
Complex omega
Definition: invbicg.cc:97
LatticeFermion eta
Definition: mespbg5p_w.cc:37
Complex b
Definition: invbicg.cc:96
multi1d< LatticeFermion > s(Ncb)
FloatingPoint< double > Double
Definition: gtest.h:7351
void ord_cxmayf_kernel(int lo, int hi, int my_id, ord_cxmayf_arg *arg)
void ord_ib_rxupdate_kernel_real64(int lo, int hi, int my_id, ib_rxupdate_arg< REAL64 > *a)
void ord_ib_rxupdate_kernel_real32(int lo, int hi, int my_id, ib_rxupdate_arg< REAL32 > *a)
void ord_ib_stupdates_kernel_real64(int lo, int hi, int my_id, ib_stupdate_arg< REAL64 > *a)
void ord_ib_stupdates_kernel_real32(int lo, int hi, int my_id, ib_stupdate_arg< REAL32 > *a)
void ord_ib_zvupdates_kernel_real32(int lo, int hi, int my_id, ib_zvupdates_arg< REAL32 > *a)
void ord_ib_zvupdates_kernel_real64(int lo, int hi, int my_id, ib_zvupdates_arg< REAL64 > *a)
void ord_norm2x_cdotxy_kernel(int lo, int hi, int my_id, ord_norm2x_cdotxy_arg *a)
void ord_xmay_normx_cdotzx_kernel(int lo, int hi, int my_id, ord_xmay_normx_cdotzx_arg *a)
void ord_xymz_normx_kernel(int lo, int hi, int my_id, ord_xymz_normx_arg *a)
void ord_xpaypbz_kernel(int lo, int hi, int my_id, ord_xpaypbz_arg *a)
void ord_yxpaymabz_kernel(int lo, int hi, int my_id, ord_yxpaymabz_arg *a)
int kappa
Definition: pade_trln_w.cc:112
int norm
Definition: qtopcor.cc:35
int r0
Definition: qtopcor.cc:41
static INTERNAL_PRECISION F