1 #ifndef BICGSTAB_KERNELS_SCALARSITE_H
2 #define BICGSTAB_KERNELS_SCALARSITE_H
11 namespace BiCGStabKernels {
32 const LatticeDiracFermionD&
y,
33 const LatticeDiracFermionD&
z,
39 if (
s.hasOrderedRep() ) {
41 REAL64* x_ptr = (REAL64*)&(
x.elem(
s.start()).elem(0).elem(0).real());
42 REAL64* y_ptr = (REAL64*)&(
y.elem(
s.start()).elem(0).elem(0).real());
43 REAL64* z_ptr = (REAL64*)&(
z.elem(
s.start()).elem(0).elem(0).real());
47 int len = (
s.end()-
s.start()+1);
52 for(
int i=1 ;
i < qdpNumThreads();
i++) {
57 QDPIO::cerr <<
"I only work for ordered subsets for now" << std::endl;
61 QDPInternal::globalSum(
norm);
83 LatticeDiracFermionF&
y,
84 LatticeDiracFermionF&
z,
85 const ComplexF&
a,
const ComplexF&
b,
const Subset&
s)
89 if(
s.hasOrderedRep() ) {
90 REAL32* x_ptr = (REAL32*)&(
x.elem(
s.start()).elem(0).elem(0).real());
91 REAL32* y_ptr = (REAL32*)&(
y.elem(
s.start()).elem(0).elem(0).real());
92 REAL32* z_ptr = (REAL32*)&(
z.elem(
s.start()).elem(0).elem(0).real());
93 REAL32 a_re =
a.elem().elem().elem().real();
94 REAL32 a_im =
a.elem().elem().elem().imag();
95 REAL32 b_re =
b.elem().elem().elem().real();
96 REAL32 b_im =
b.elem().elem().elem().imag();
99 int len = (
s.end()-
s.start()+1);
103 QDPIO::cerr <<
"I only work for ordered subsets for now" << std::endl;
124 const LatticeDiracFermionF&
y,
Double& norm2x, DComplex& cdotxy,
const Subset&
s)
127 REAL64 norm_array[3]={0,0,0};
130 if (
s.hasOrderedRep() ) {
131 REAL32* x_ptr = (REAL32*)&(
x.elem(
s.start()).elem(0).elem(0).real());
132 REAL32* y_ptr = (REAL32*)&(
y.elem(
s.start()).elem(0).elem(0).real());
135 int len = (
s.end()-
s.start()+1);
139 for(
int i=0;
i < 3*qdpNumThreads();
i+=3) {
140 norm_array[0] += norm_space[
i];
141 norm_array[1] += norm_space[
i+1];
142 norm_array[2] += norm_space[
i+2];
146 QDPIO::cerr <<
"I only work for ordered subsets for now" << std::endl;
150 QDPInternal::globalSumArray(norm_array,3);
152 norm2x.elem().elem().elem().elem() =norm_array[0];
153 cdotxy.elem().elem().elem().real()= norm_array[1];
154 cdotxy.elem().elem().elem().imag() = norm_array[2];
176 LatticeDiracFermionF&
y,
177 LatticeDiracFermionF&
z,
184 if(
s.hasOrderedRep() ) {
185 REAL32* x_ptr = (REAL32*)&(
x.elem(
s.start()).elem(0).elem(0).real());
186 REAL32* y_ptr = (REAL32*)&(
y.elem(
s.start()).elem(0).elem(0).real());
187 REAL32* z_ptr = (REAL32*)&(
z.elem(
s.start()).elem(0).elem(0).real());
188 REAL32 a_re =
a.elem().elem().elem().real();
189 REAL32 a_im =
a.elem().elem().elem().imag();
190 REAL32 b_re =
b.elem().elem().elem().real();
191 REAL32 b_im =
b.elem().elem().elem().imag();
194 int len=(
s.end()-
s.start()+1);
198 QDPIO::cerr <<
"I only work for ordered subsets for now" << std::endl;
224 const LatticeDiracFermionF&
y,
225 const LatticeDiracFermionF&
z,
228 DComplex& cdotzx,
const Subset&
s)
231 REAL64 norm_array[3]={0,0,0};
233 if(
s.hasOrderedRep() ) {
234 REAL32* x_ptr = (REAL32*)&(
x.elem(
s.start()).elem(0).elem(0).real());
235 REAL32* y_ptr = (REAL32*)&(
y.elem(
s.start()).elem(0).elem(0).real());
236 REAL32* z_ptr = (REAL32*)&(
z.elem(
s.start()).elem(0).elem(0).real());
239 REAL32 a_re =
a.elem().elem().elem().real();
240 REAL32 a_im =
a.elem().elem().elem().imag();
245 int len =(
s.end()-
s.start()+1);
248 for(
int i=0;
i < 3*qdpNumThreads();
i+=3) {
249 norm_array[0] += norm_space[
i];
250 norm_array[1] += norm_space[
i+1];
251 norm_array[2] += norm_space[
i+2];
256 QDPIO::cerr <<
"I only work for ordered subsets for now" << std::endl;
260 QDPInternal::globalSumArray(norm_array,3);
261 normx.elem().elem().elem().elem() =norm_array[0];
262 cdotzx.elem().elem().elem().real()= norm_array[1];
263 cdotzx.elem().elem().elem().imag() = norm_array[2];
284 const LatticeDiracFermionF&
y,
290 if(
s.hasOrderedRep() ) {
291 REAL32* x_ptr = (REAL32*)&(
x.elem(
s.start()).elem(0).elem(0).real());
292 REAL32* y_ptr = (REAL32*)&(
y.elem(
s.start()).elem(0).elem(0).real());
293 REAL32 a_re =
a.elem().elem().elem().real();
294 REAL32 a_im =
a.elem().elem().elem().imag();
297 int len=(
s.end()-
s.start()+1);
301 QDPIO::cerr <<
"I only work for ordered subsets for now" << std::endl;
346 LatticeDiracFermionF3&
z,
347 LatticeDiracFermionF3& v,
348 const LatticeDiracFermionF3&
u,
349 const LatticeDiracFermionF3&
q,
350 const ComplexD&
alpha,
351 const ComplexD& alpha_rat_beta,
352 const ComplexD& alpha_delta,
353 const ComplexD&
beta,
354 const ComplexD& delta,
357 if( sub.hasOrderedRep() ) {
358 REAL32* r_ptr = (REAL32*)&(
r.elem(sub.start()).elem(0).elem(0).real());
359 REAL32* z_ptr = (REAL32*)&(
z.elem(sub.start()).elem(0).elem(0).real());
360 REAL32* v_ptr = (REAL32*)&(v.elem(sub.start()).elem(0).elem(0).real());
361 REAL32* u_ptr = (REAL32*)&(
u.elem(sub.start()).elem(0).elem(0).real());
362 REAL32* q_ptr = (REAL32*)&(
q.elem(sub.start()).elem(0).elem(0).real());
364 REAL32 a_re = (REAL32)
alpha.elem().elem().elem().real();
365 REAL32 a_im = (REAL32)
alpha.elem().elem().elem().imag();
367 REAL32 arb_re = (REAL32)alpha_rat_beta.elem().elem().elem().real();
368 REAL32 arb_im = (REAL32)alpha_rat_beta.elem().elem().elem().imag();
370 REAL32 ad_re = (REAL32)alpha_delta.elem().elem().elem().real();
371 REAL32 ad_im = (REAL32)alpha_delta.elem().elem().elem().imag();
373 REAL32 beta_re = (REAL32)
beta.elem().elem().elem().real();
374 REAL32 beta_im = (REAL32)
beta.elem().elem().elem().imag();
376 REAL32 delta_re = (REAL32)delta.elem().elem().elem().real();
377 REAL32 delta_im = (REAL32)delta.elem().elem().elem().imag();
380 a_re, a_im, arb_re, arb_im, ad_re, ad_im,
381 beta_re, beta_im, delta_re, delta_im,4*3*2};
383 int len=(sub.end()-sub.start()+1);
387 QDPIO::cerr <<
"I only work for ordered subsets for now" << std::endl;
401 LatticeDiracFermionD3&
z,
402 LatticeDiracFermionD3 &v,
403 const LatticeDiracFermionD3&
u,
404 const LatticeDiracFermionD3&
q,
405 const ComplexD&
alpha,
406 const ComplexD& alpha_rat_beta,
407 const ComplexD& alpha_delta,
408 const ComplexD&
beta,
409 const ComplexD& delta,
412 if( sub.hasOrderedRep() ) {
413 REAL64* r_ptr = (REAL64*)&(
r.elem(sub.start()).elem(0).elem(0).real());
414 REAL64* z_ptr = (REAL64*)&(
z.elem(sub.start()).elem(0).elem(0).real());
415 REAL64* v_ptr = (REAL64*)&(v.elem(sub.start()).elem(0).elem(0).real());
416 REAL64* u_ptr = (REAL64*)&(
u.elem(sub.start()).elem(0).elem(0).real());
417 REAL64* q_ptr = (REAL64*)&(
q.elem(sub.start()).elem(0).elem(0).real());
419 REAL64 a_re =
alpha.elem().elem().elem().real();
420 REAL64 a_im =
alpha.elem().elem().elem().imag();
422 REAL64 arb_re = alpha_rat_beta.elem().elem().elem().real();
423 REAL64 arb_im = alpha_rat_beta.elem().elem().elem().imag();
425 REAL64 ad_re = alpha_delta.elem().elem().elem().real();
426 REAL64 ad_im = alpha_delta.elem().elem().elem().imag();
428 REAL64 beta_re =
beta.elem().elem().elem().real();
429 REAL64 beta_im =
beta.elem().elem().elem().imag();
431 REAL64 delta_re = delta.elem().elem().elem().real();
432 REAL64 delta_im = delta.elem().elem().elem().imag();
435 a_re, a_im, arb_re, arb_im, ad_re, ad_im,
436 beta_re, beta_im, delta_re, delta_im,4*3*2};
438 int len=(sub.end()-sub.start()+1);
442 QDPIO::cerr <<
"I only work for ordered subsets for now" << std::endl;
469 const LatticeDiracFermionF3&
s,
470 const LatticeDiracFermionF3&
t,
471 const LatticeDiracFermionF3&
z,
472 LatticeDiracFermionF3&
r,
473 LatticeDiracFermionF3&
x,
476 if( sub.hasOrderedRep() ) {
477 REAL32* s_ptr = (REAL32*)&(
s.elem(sub.start()).elem(0).elem(0).real());
478 REAL32* t_ptr = (REAL32*)&(
t.elem(sub.start()).elem(0).elem(0).real());
479 REAL32* z_ptr = (REAL32*)&(
z.elem(sub.start()).elem(0).elem(0).real());
480 REAL32* r_ptr = (REAL32*)&(
r.elem(sub.start()).elem(0).elem(0).real());
481 REAL32* x_ptr = (REAL32*)&(
x.elem(sub.start()).elem(0).elem(0).real());
483 REAL32 omega_re = (REAL32)
omega.elem().elem().elem().real();
484 REAL32 omega_im = (REAL32)
omega.elem().elem().elem().imag();
487 omega_re, omega_im,4*3*2};
489 int len=(sub.end()-sub.start()+1);
493 QDPIO::cerr <<
"I only work for ordered subsets for now" << std::endl;
503 const LatticeDiracFermionD3&
s,
504 const LatticeDiracFermionD3&
t,
505 const LatticeDiracFermionD3&
z,
506 LatticeDiracFermionD3&
r,
507 LatticeDiracFermionD3&
x,
510 if( sub.hasOrderedRep() ) {
511 REAL64* s_ptr = (REAL64*)&(
s.elem(sub.start()).elem(0).elem(0).real());
512 REAL64* t_ptr = (REAL64*)&(
t.elem(sub.start()).elem(0).elem(0).real());
513 REAL64* z_ptr = (REAL64*)&(
z.elem(sub.start()).elem(0).elem(0).real());
514 REAL64* r_ptr = (REAL64*)&(
r.elem(sub.start()).elem(0).elem(0).real());
515 REAL64* x_ptr = (REAL64*)&(
x.elem(sub.start()).elem(0).elem(0).real());
517 REAL64 omega_re =
omega.elem().elem().elem().real();
519 REAL64 omega_im =
omega.elem().elem().elem().imag();
522 omega_re, omega_im, 4*3*2};
524 int len=(sub.end()-sub.start()+1);
528 QDPIO::cerr <<
"I only work for ordered subsets for now" << std::endl;
558 const LatticeDiracFermionF3&
r,
559 const LatticeDiracFermionF3&
u,
560 const LatticeDiracFermionF3& v,
561 const LatticeDiracFermionF3&
q,
562 const LatticeDiracFermionF3&
r0,
563 const LatticeDiracFermionF3& f0,
564 LatticeDiracFermionF3&
s,
565 LatticeDiracFermionF3&
t,
577 REAL64 norm_array[12]={0,0,0,0,0,0,0,0,0,0,0,0};
579 if( sub.hasOrderedRep() ) {
581 (REAL32)
alpha.elem().elem().elem().real(),
582 (REAL32)
alpha.elem().elem().elem().imag(),
583 (REAL32*)&(
r.elem(sub.start()).elem(0).elem(0).real()),
584 (REAL32*)&(
u.elem(sub.start()).elem(0).elem(0).real()),
585 (REAL32*)&(v.elem(sub.start()).elem(0).elem(0).real()),
586 (REAL32*)&(
q.elem(sub.start()).elem(0).elem(0).real()),
587 (REAL32*)&(
r0.elem(sub.start()).elem(0).elem(0).real()),
588 (REAL32*)&(f0.elem(sub.start()).elem(0).elem(0).real()),
589 (REAL32*)&(
s.elem(sub.start()).elem(0).elem(0).real()),
590 (REAL32*)&(
t.elem(sub.start()).elem(0).elem(0).real()),
594 for(
int i=0;
i < 12*qdpNumThreads();
i++) {
599 int len=(sub.end()-sub.start()+1);
601 for(
int i=0;
i < qdpNumThreads();
i++) {
615 QDPInternal::globalSumArray(norm_array,12);
617 phi.elem().elem().elem().real() = norm_array[0];
618 phi.elem().elem().elem().imag() = norm_array[1];
620 gamma.elem().elem().elem().real() = norm_array[2];
621 gamma.elem().elem().elem().imag() = norm_array[3];
623 pi.elem().elem().elem().real() = norm_array[4];
624 pi.elem().elem().elem().imag() = norm_array[5];
626 eta.elem().elem().elem().real() = norm_array[6];
627 eta.elem().elem().elem().imag() = norm_array[7];
629 theta.elem().elem().elem().real() = norm_array[8];
630 theta.elem().elem().elem().imag() = norm_array[9];
632 kappa.elem().elem().elem().elem() = norm_array[10];
633 rnorm.elem().elem().elem().elem() = norm_array[11];
637 QDPIO::cerr <<
"I only work for ordered subsets for now" << std::endl;
646 const LatticeDiracFermionD3&
r,
647 const LatticeDiracFermionD3&
u,
648 const LatticeDiracFermionD3& v,
649 const LatticeDiracFermionD3&
q,
650 const LatticeDiracFermionD3&
r0,
651 const LatticeDiracFermionD3& f0,
652 LatticeDiracFermionD3&
s,
653 LatticeDiracFermionD3&
t,
665 REAL64 norm_array[12]={0,0,0,0,0,0,0,0,0,0,0,0};
667 if( sub.hasOrderedRep() ) {
670 (REAL64)
alpha.elem().elem().elem().real(),
671 (REAL64)
alpha.elem().elem().elem().imag(),
672 (REAL64*)&(
r.elem(sub.start()).elem(0).elem(0).real()),
673 (REAL64*)&(
u.elem(sub.start()).elem(0).elem(0).real()),
674 (REAL64*)&(v.elem(sub.start()).elem(0).elem(0).real()),
675 (REAL64*)&(
q.elem(sub.start()).elem(0).elem(0).real()),
676 (REAL64*)&(
r0.elem(sub.start()).elem(0).elem(0).real()),
677 (REAL64*)&(f0.elem(sub.start()).elem(0).elem(0).real()),
678 (REAL64*)&(
s.elem(sub.start()).elem(0).elem(0).real()),
679 (REAL64*)&(
t.elem(sub.start()).elem(0).elem(0).real()),
683 for(
int i=0;
i < 12*qdpNumThreads();
i++) {
687 int len=(sub.end()-sub.start()+1);
691 for(
int i=0;
i < qdpNumThreads();
i++) {
705 QDPInternal::globalSumArray(norm_array,12);
707 phi.elem().elem().elem().real() = norm_array[0];
708 phi.elem().elem().elem().imag() = norm_array[1];
710 gamma.elem().elem().elem().real() = norm_array[2];
711 gamma.elem().elem().elem().imag() = norm_array[3];
713 pi.elem().elem().elem().real() = norm_array[4];
714 pi.elem().elem().elem().imag() = norm_array[5];
716 eta.elem().elem().elem().real() = norm_array[6];
717 eta.elem().elem().elem().imag() = norm_array[7];
719 theta.elem().elem().elem().real() = norm_array[8];
720 theta.elem().elem().elem().imag() = norm_array[9];
722 kappa.elem().elem().elem().elem() = norm_array[10];
723 rnorm.elem().elem().elem().elem() = norm_array[11];
727 QDPIO::cerr <<
"I only work for ordered subsets for now" << std::endl;
Primary include file for CHROMA library code.
void xpaypbz(T &x, T &y, T &z, C &a, C &b, const Subset &s)
void xmay_normx_cdotzx(T &x, const T &y, const T &z, C &a, Double &normx, DComplex &cdotzx, const Subset &s)
void xymz_normx(T &x, const T &y, const T &z, Double &x_norm, const Subset &s)
void norm2x_cdotxy(const T &x, const T &y, Double &norm2x, DComplex &cdotxy, const Subset &s)
void cxmay(T &x, const T &y, const C &a, const Subset &s)
void yxpaymabz(T &x, T &y, T &z, const C &a, const C &b, const Subset &s)
void ibicgstab_rxupdate(const C &omega, const T &s, const T &t, const T &z, T &r, T &x, const Subset &sub)
void ibicgstab_zvupdates(const T &r, T &z, T &v, const T &u, const T &q, const C &alpha, const C &alpha_rat_beta, const C &alpha_delta, const C &beta, const C &delta, const Subset &s)
void initScalarSiteKernels()
void ibicgstab_stupdates_reduces(const C &alpha, const T &r, const T &u, const T &v, const T &q, const T &r0, const T &f0, T &s, T &t, C &phi, C &pi, C &gamma, C &eta, C &theta, F &kappa, F &rnorm, const Subset &sub)
void finishScalarSiteKernels()
static const LatticeInteger & beta(const int dim)
static const LatticeInteger & alpha(const int dim)
Asqtad Staggered-Dirac operator.
static multi1d< LatticeColorMatrix > u
multi1d< LatticeFermion > s(Ncb)
FloatingPoint< double > Double
void ord_cxmayf_kernel(int lo, int hi, int my_id, ord_cxmayf_arg *arg)
void ord_ib_rxupdate_kernel_real64(int lo, int hi, int my_id, ib_rxupdate_arg< REAL64 > *a)
void ord_ib_rxupdate_kernel_real32(int lo, int hi, int my_id, ib_rxupdate_arg< REAL32 > *a)
void ord_ib_stupdates_kernel_real64(int lo, int hi, int my_id, ib_stupdate_arg< REAL64 > *a)
void ord_ib_stupdates_kernel_real32(int lo, int hi, int my_id, ib_stupdate_arg< REAL32 > *a)
void ord_ib_zvupdates_kernel_real32(int lo, int hi, int my_id, ib_zvupdates_arg< REAL32 > *a)
void ord_ib_zvupdates_kernel_real64(int lo, int hi, int my_id, ib_zvupdates_arg< REAL64 > *a)
void ord_norm2x_cdotxy_kernel(int lo, int hi, int my_id, ord_norm2x_cdotxy_arg *a)
void ord_xmay_normx_cdotzx_kernel(int lo, int hi, int my_id, ord_xmay_normx_cdotzx_arg *a)
void ord_xymz_normx_kernel(int lo, int hi, int my_id, ord_xymz_normx_arg *a)
void ord_xpaypbz_kernel(int lo, int hi, int my_id, ord_xpaypbz_arg *a)
void ord_yxpaymabz_kernel(int lo, int hi, int my_id, ord_yxpaymabz_arg *a)
static INTERNAL_PRECISION F