CHROMA
clover_term_qdp_w.h
Go to the documentation of this file.
1 // -*- C++ -*-
2 /*! \file
3  * \brief Clover term linear operator
4  */
5 
6 #ifndef __clover_term_qdp_w_h__
7 #define __clover_term_qdp_w_h__
8 
9 #include "state.h"
10 #include "qdp_allocator.h"
13 #include "meas/glue/mesfield.h"
14 #include <complex>
15 namespace Chroma
16 {
17 
18  //! Special structure used for triangular objects
19  template<typename R>
21  {
22  RScalar<R> diag[2][2*Nc];
23  RComplex<R> offd[2][2*Nc*Nc-Nc];
24  };
25 
26  template<typename R>
27  struct QUDAPackedClovSite {
28  R diag1[6];
29  R offDiag1[15][2];
30  R diag2[6];
31  R offDiag2[15][2];
32  };
33 
34  // Reader/writers
35  /*! \ingroup linop */
36 #if 0
37  void read(XMLReader& xml, const std::string& path, PrimitiveClovTriang& param);
38 
39  /*! \ingroup linop */
40  void write(XMLWriter& xml, const std::string& path, const PrimitiveClovTriang& param);
41 #endif
42 
43  //! Clover term
44  /*!
45  * \ingroup linop
46  *
47  */
48  template<typename T, typename U>
49  class QDPCloverTermT : public CloverTermBase<T, U>
50  {
51  public:
52  // Typedefs to save typing
53  typedef typename WordType<T>::Type_t REALT;
54 
55  typedef OLattice< PScalar< PScalar< RScalar< typename WordType<T>::Type_t> > > > LatticeREAL;
56  typedef OScalar< PScalar< PScalar< RScalar<REALT> > > > RealT;
57 
58  //! Empty constructor. Must use create later
60 
61  //! No real need for cleanup here
63  if ( tri != nullptr ) {
64  QDP::Allocator::theQDPAllocator::Instance().free(tri);
65  }
66  }
67 
68  //! Creation routine
69  void create(Handle< FermState<T, multi1d<U>, multi1d<U> > > fs,
70  const CloverFermActParams& param_);
71 
72  virtual void create(Handle< FermState<T, multi1d<U>, multi1d<U> > > fs,
73  const CloverFermActParams& param_,
74  const QDPCloverTermT<T,U>& from_);
75 
76  //! Computes the inverse of the term on cb using Cholesky
77  /*!
78  * \param cb checkerboard of work (Read)
79  */
80  void choles(int cb);
81 
82  //! Computes the inverse of the term on cb using Cholesky
83  /*!
84  * \param cb checkerboard of work (Read)
85  * \return logarithm of the determinant
86  */
87  Double cholesDet(int cb) const ;
88 
89  /**
90  * Apply a dslash
91  *
92  * Performs the operation
93  *
94  * chi <- (L + D + L^dag) . psi
95  *
96  * where
97  * L is a lower triangular matrix
98  * D is the real diagonal. (stored together in type TRIANG)
99  *
100  * Arguments:
101  * \param chi result (Write)
102  * \param psi source (Read)
103  * \param isign D'^dag or D' ( MINUS | PLUS ) resp. (Read)
104  * \param cb Checkerboard of OUTPUT std::vector (Read)
105  */
106  void apply (T& chi, const T& psi, enum PlusMinus isign, int cb) const;
107 
108 
109  void applySite(T& chi, const T& psi, enum PlusMinus isign, int site) const;
110 
111 #ifdef BUILD_QPHIX
112  // Access the clover tri-buffer for packing
113  const PrimitiveClovTriang<REALT>* getTriBuffer() const {
114  return tri;
115  }
116 #endif
117 
118  //! Calculates Tr_D ( Gamma_mat L )
119  void triacntr(U& B, int mat, int cb) const;
120 
121  //! Return the fermion BC object for this linear operator
122  const FermBC<T, multi1d<U>, multi1d<U> >& getFermBC() const {return *fbc;}
123 
124  //! PACK UP the Clover term for QUDA library:
125  void packForQUDA(multi1d<QUDAPackedClovSite<REALT> >& quda_pack, int cb) const;
126 
127 
128 
129  protected:
130  //! Create the clover term on cb
131  /*!
132  * \param f field strength tensor F(mu,nu) (Read)
133  * \param cb checkerboard (Read)
134  */
135  void makeClov(const multi1d<U>& f, const RealT& diag_mass);
136 
137  //! Invert the clover term on cb
138  void chlclovms(LatticeREAL& log_diag, int cb);
139  void ldagdlinv(LatticeREAL& tr_log_diag, int cb);
140 
141  //! Get the u field
142  const multi1d<U>& getU() const {return u;}
143 
144  //! Calculates Tr_D ( Gamma_mat L )
145  Real getCloverCoeff(int mu, int nu) const;
146 
147  private:
149  multi1d<U> u;
151  LatticeREAL tr_log_diag_; // Fill this out during create
152  // but save the global sum until needed.
153  multi1d<bool> choles_done; // Keep note of whether the decomposition has been done
154  // on a particular checkerboard.
155 
157 
158  };
159 
160 
161  // Empty constructor. Must use create later
162  template<typename T, typename U>
164  // Always allocate on construction
165  int nodeSites = Layout::sitesOnNode();
166  tri = (PrimitiveClovTriang<REALT>*)QDP::Allocator::theQDPAllocator::Instance().allocate(nodeSites*sizeof(PrimitiveClovTriang<REALT>),
168 
169  }
170 
171  // Now copy
172  template<typename T, typename U>
173  void QDPCloverTermT<T,U>::create(Handle< FermState<T,multi1d<U>,multi1d<U> > > fs,
174  const CloverFermActParams& param_,
175  const QDPCloverTermT<T,U>& from)
176  {
177 #ifndef QDP_IS_QDPJIT
178  START_CODE();
179  u.resize(Nd);
180 
181  u = fs->getLinks();
182  fbc = fs->getFermBC();
183  param = param_;
184 
185  // Sanity check
186  if (fbc.operator->() == 0) {
187  QDPIO::cerr << "QDPCloverTerm: error: fbc is null" << std::endl;
188  QDP_abort(1);
189  }
190 
191  {
192  RealT ff = where(param.anisoParam.anisoP, Real(1) / param.anisoParam.xi_0, Real(1));
193  param.clovCoeffR *= Real(0.5) * ff;
194  param.clovCoeffT *= Real(0.5);
195  }
196 
197  //
198  // Yuk. Some bits of knowledge of the dslash term are buried in the
199  // effective mass term. They show up here. If I wanted some more
200  // complicated dslash then this will have to be fixed/adjusted.
201  //
202  RealT diag_mass;
203  {
204  RealT ff = where(param.anisoParam.anisoP, param.anisoParam.nu / param.anisoParam.xi_0, Real(1));
205  diag_mass = 1 + (Nd-1)*ff + param.Mass;
206  }
207 
208 
209  /* Calculate F(mu,nu) */
210  //multi1d<LatticeColorMatrix> f;
211  //mesField(f, u);
212  //makeClov(f, diag_mass);
213 
214  choles_done.resize(rb.numSubsets());
215  for(int i=0; i < rb.numSubsets(); i++) {
216  choles_done[i] = from.choles_done[i];
217  }
218 
219  // This is for the whole lattice (LatticeReal)
220  tr_log_diag_ = from.tr_log_diag_;
221 
222 
223  int nodeSites = Layout::sitesOnNode();
224  // Deep copy.
225 #pragma omp parallel for
226  for(int site=0; site < nodeSites;++site) {
227  for(int block=0; block < 2; ++block) {
228  for(int d=0; d < 6; ++d) {
229  tri[site].diag[block][d] = from.tri[site].diag[block][d];
230  }
231  for(int od=0; od < 15; ++od) {
232  tri[site].offd[block][od] = from.tri[site].offd[block][od];
233  }
234  }
235  }
236 
237 
238  END_CODE();
239 #endif
240  }
241 
242 
243  //! Creation routine
244  template<typename T, typename U>
245  void QDPCloverTermT<T,U>::create(Handle< FermState<T,multi1d<U>,multi1d<U> > > fs,
246  const CloverFermActParams& param_)
247  {
248 #ifndef QDP_IS_QDPJIT
249  START_CODE();
250 
251  u.resize(Nd);
252 
253  u = fs->getLinks();
254  fbc = fs->getFermBC();
255  param = param_;
256 
257  // Sanity check
258  if (fbc.operator->() == 0) {
259  QDPIO::cerr << "QDPCloverTerm: error: fbc is null" << std::endl;
260  QDP_abort(1);
261  }
262 
263  {
264  RealT ff = where(param.anisoParam.anisoP, Real(1) / param.anisoParam.xi_0, Real(1));
265  param.clovCoeffR *= RealT(0.5) * ff;
266  param.clovCoeffT *= RealT(0.5);
267  }
268 
269  //
270  // Yuk. Some bits of knowledge of the dslash term are buried in the
271  // effective mass term. They show up here. If I wanted some more
272  // complicated dslash then this will have to be fixed/adjusted.
273  //
274  RealT diag_mass;
275  {
276  RealT ff = where(param.anisoParam.anisoP, param.anisoParam.nu / param.anisoParam.xi_0, Real(1));
277  diag_mass = 1 + (Nd-1)*ff + param.Mass;
278  }
279 
280  /* Calculate F(mu,nu) */
281  multi1d<U> f;
282  mesField(f, u);
283  makeClov(f, diag_mass);
284 
285 
286  choles_done.resize(rb.numSubsets());
287  for(int i=0; i < rb.numSubsets(); i++) {
288  choles_done[i] = false;
289  }
290 
291 
292  END_CODE();
293 #endif
294  }
295 
296 
297  /*
298  * MAKCLOV
299  *
300  * In this routine, MAKCLOV calculates
301 
302  * 1 - (1/4)*sigma(mu,nu) F(mu,nu)
303 
304  * using F from mesfield
305 
306  * F(mu,nu) = (1/4) sum_p (1/2) [ U_p(x) - U^dag_p(x) ]
307 
308  * using basis of SPPROD and stores in a lower triangular matrix
309  * (no diagonal) plus real diagonal
310 
311  * where
312  * U_1 = u(x,mu)*u(x+mu,nu)*u_dag(x+nu,mu)*u_dag(x,nu)
313  * U_2 = u(x,nu)*u_dag(x-mu+nu,mu)*u_dag(x-mu,nu)*u(x-mu,mu)
314  * U_3 = u_dag(x-mu,mu)*u_dag(x-mu-nu,nu)*u(x-mu-nu,mu)*u(x-nu,nu)
315  * U_4 = u_dag(x-nu,nu)*u(x-nu,mu)*u(x-nu+mu,nu)*u_dag(x,mu)
316 
317  * and
318 
319  * | sigF(1) sigF(3) 0 0 |
320  * sigF = | sigF(5) -sigF(1) 0 0 |
321  * | 0 0 -sigF(0) -sigF(2) |
322  * | 0 0 -sigF(4) sigF(0) |
323  * where
324  * sigF(i) is a color matrix
325 
326  * sigF(0) = i*(ClovT*E_z + ClovR*B_z)
327  * = i*(ClovT*F(3,2) + ClovR*F(1,0))
328  * sigF(1) = i*(ClovT*E_z - ClovR*B_z)
329  * = i*(ClovT*F(3,2) - ClovR*F(1,0))
330  * sigF(2) = i*(E_+ + B_+)
331  * sigF(3) = i*(E_+ - B_+)
332  * sigF(4) = i*(E_- + B_-)
333  * sigF(5) = i*(E_- - B_-)
334  * i*E_+ = (i*ClovT*E_x - ClovT*E_y)
335  * = (i*ClovT*F(3,0) - ClovT*F(3,1))
336  * i*E_- = (i*ClovT*E_x + ClovT*E_y)
337  * = (i*ClovT*F(3,0) + ClovT*F(3,1))
338  * i*B_+ = (i*ClovR*B_x - ClovR*B_y)
339  * = (i*ClovR*F(2,1) + ClovR*F(2,0))
340  * i*B_- = (i*ClovR*B_x + ClovR*B_y)
341  * = (i*ClovR*F(2,1) - ClovR*F(2,0))
342 
343  * NOTE: I am using i*F of the usual F defined by UKQCD, Heatlie et.al.
344 
345  * NOTE: the above definitions assume that the time direction, t_dir,
346  * is 3. In general F(k,j) is multiplied with ClovT if either
347  * k=t_dir or j=t_dir, and with ClovR otherwise.
348 
349  *+++
350  * Here are some notes on the origin of this routine. NOTE, ClovCoeff or u0
351  * are not actually used in MAKCLOV.
352  *
353  * The clover mass term is suppose to act on a std::vector like
354  *
355  * chi = (1 - (ClovCoeff/u0^3) * kappa/4 * sum_mu sum_nu F(mu,nu)*sigma(mu,nu)) * psi
356 
357  * Definitions used here (NOTE: no "i")
358  * sigma(mu,nu) = gamma(mu)*gamma(nu) - gamma(nu)*gamma(mu)
359  * = 2*gamma(mu)*gamma(nu) for mu != nu
360  *
361  * chi = sum_mu sum_nu F(mu,nu)*gamma(mu)*gamma(nu)*psi for mu < nu
362  * = (1/2) * sum_mu sum_nu F(mu,nu)*gamma(mu)*gamma(nu)*psi for mu != nu
363  * = (1/4) * sum_mu sum_nu F(mu,nu)*sigma(mu,nu)*psi
364  *
365  *
366  * chi = (1 - (ClovCoeff/u0^3) * kappa/4 * sum_mu sum_nu F(mu,nu)*sigma(mu,nu)) * psi
367  * = psi - (ClovCoeff/u0^3) * kappa * chi
368  * == psi - kappa * chi
369  *
370  * We have absorbed ClovCoeff/u0^3 into kappa. A u0 was previously absorbed into kappa
371  * for compatibility to ancient conventions.
372  *---
373 
374  * Arguments:
375  * \param f field strength tensor F(cb,mu,nu) (Read)
376  * \param diag_mass effective mass term (Read)
377  */
378 
379  /*Threaded this. It needs a QMT arg struct and I've extracted the site loop */
380  namespace QDPCloverEnv {
381 
382  template<typename U>
384  typedef typename WordType<U>::Type_t REALT;
385  typedef OScalar< PScalar< PScalar< RScalar<REALT> > > > RealT;
386  const RealT& diag_mass;
387  const U& f0;
388  const U& f1;
389  const U& f2;
390  const U& f3;
391  const U& f4;
392  const U& f5;
394  };
395 
396 
397  /* This is the extracted site loop for makeClover */
398  template<typename U>
399  inline
400  void makeClovSiteLoop(int lo, int hi, int myId, QDPCloverMakeClovArg<U> *a)
401  {
402 #ifndef QDP_IS_QDPJIT
403  typedef typename QDPCloverMakeClovArg<U>::RealT RealT;
404  typedef typename QDPCloverMakeClovArg<U>::REALT REALT;
405 
406  const RealT& diag_mass = a->diag_mass;
407  const U& f0=a->f0;
408  const U& f1=a->f1;
409  const U& f2=a->f2;
410  const U& f3=a->f3;
411  const U& f4=a->f4;
412  const U& f5=a->f5;
414 
415  // SITE LOOP STARTS HERE
416  for(int site = lo; site < hi; ++site) {
417  /*# Construct diagonal */
418 
419  for(int jj = 0; jj < 2; jj++) {
420 
421  for(int ii = 0; ii < 2*Nc; ii++) {
422 
423  tri[site].diag[jj][ii] = diag_mass.elem().elem().elem();
424  }
425  }
426 
427 
428 
429  RComplex<REALT> E_minus;
430  RComplex<REALT> B_minus;
431  RComplex<REALT> ctmp_0;
432  RComplex<REALT> ctmp_1;
433  RScalar<REALT> rtmp_0;
434  RScalar<REALT> rtmp_1;
435 
436  for(int i = 0; i < Nc; ++i) {
437 
438  /*# diag_L(i,0) = 1 - i*diag(E_z - B_z) */
439  /*# = 1 - i*diag(F(3,2) - F(1,0)) */
440  ctmp_0 = f5.elem(site).elem().elem(i,i);
441  ctmp_0 -= f0.elem(site).elem().elem(i,i);
442  rtmp_0 = imag(ctmp_0);
443  tri[site].diag[0][i] += rtmp_0;
444 
445  /*# diag_L(i+Nc,0) = 1 + i*diag(E_z - B_z) */
446  /*# = 1 + i*diag(F(3,2) - F(1,0)) */
447  tri[site].diag[0][i+Nc] -= rtmp_0;
448 
449  /*# diag_L(i,1) = 1 + i*diag(E_z + B_z) */
450  /*# = 1 + i*diag(F(3,2) + F(1,0)) */
451  ctmp_1 = f5.elem(site).elem().elem(i,i);
452  ctmp_1 += f0.elem(site).elem().elem(i,i);
453  rtmp_1 = imag(ctmp_1);
454  tri[site].diag[1][i] -= rtmp_1;
455 
456  /*# diag_L(i+Nc,1) = 1 - i*diag(E_z + B_z) */
457  /*# = 1 - i*diag(F(3,2) + F(1,0)) */
458  tri[site].diag[1][i+Nc] += rtmp_1;
459  }
460 
461  /*# Construct lower triangular portion */
462  /*# Block diagonal terms */
463  for(int i = 1; i < Nc; ++i) {
464 
465  for(int j = 0; j < i; ++j) {
466 
467  int elem_ij = i*(i-1)/2 + j;
468  int elem_tmp = (i+Nc)*(i+Nc-1)/2 + j+Nc;
469 
470  /*# L(i,j,0) = -i*(E_z - B_z)[i,j] */
471  /*# = -i*(F(3,2) - F(1,0)) */
472  ctmp_0 = f0.elem(site).elem().elem(i,j);
473  ctmp_0 -= f5.elem(site).elem().elem(i,j);
474  tri[site].offd[0][elem_ij] = timesI(ctmp_0);
475 
476  /*# L(i+Nc,j+Nc,0) = +i*(E_z - B_z)[i,j] */
477  /*# = +i*(F(3,2) - F(1,0)) */
478  tri[site].offd[0][elem_tmp] = -tri[site].offd[0][elem_ij];
479 
480  /*# L(i,j,1) = i*(E_z + B_z)[i,j] */
481  /*# = i*(F(3,2) + F(1,0)) */
482  ctmp_1 = f5.elem(site).elem().elem(i,j);
483  ctmp_1 += f0.elem(site).elem().elem(i,j);
484  tri[site].offd[1][elem_ij] = timesI(ctmp_1);
485 
486  /*# L(i+Nc,j+Nc,1) = -i*(E_z + B_z)[i,j] */
487  /*# = -i*(F(3,2) + F(1,0)) */
488  tri[site].offd[1][elem_tmp] = -tri[site].offd[1][elem_ij];
489  }
490  }
491 
492  /*# Off-diagonal */
493  for(int i = 0; i < Nc; ++i) {
494 
495  for(int j = 0; j < Nc; ++j) {
496 
497  // Flipped index
498  // by swapping i <-> j. In the past i would run slow
499  // and now j runs slow
500  int elem_ij = (i+Nc)*(i+Nc-1)/2 + j;
501 
502  /*# i*E_- = (i*E_x + E_y) */
503  /*# = (i*F(3,0) + F(3,1)) */
504  E_minus = timesI(f2.elem(site).elem().elem(i,j));
505  E_minus += f4.elem(site).elem().elem(i,j);
506 
507  /*# i*B_- = (i*B_x + B_y) */
508  /*# = (i*F(2,1) - F(2,0)) */
509  B_minus = timesI(f3.elem(site).elem().elem(i,j));
510  B_minus -= f1.elem(site).elem().elem(i,j);
511 
512  /*# L(i+Nc,j,0) = -i*(E_- - B_-) */
513  tri[site].offd[0][elem_ij] = B_minus - E_minus;
514 
515  /*# L(i+Nc,j,1) = +i*(E_- + B_-) */
516  tri[site].offd[1][elem_ij] = E_minus + B_minus;
517  }
518  }
519  } /* End Site loop */
520 #endif
521  } /* Function */
522  }
523 
524  /* This now just sets up and dispatches... */
525  template<typename T, typename U>
526  void QDPCloverTermT<T,U>::makeClov(const multi1d<U>& f, const RealT& diag_mass)
527  {
528  START_CODE();
529 
530  if ( Nd != 4 ){
531  QDPIO::cerr << __func__ << ": expecting Nd==4" << std::endl;
532  QDP_abort(1);
533  }
534 
535  if ( Ns != 4 ){
536  QDPIO::cerr << __func__ << ": expecting Ns==4" << std::endl;
537  QDP_abort(1);
538  }
539 
540  U f0 = f[0] * getCloverCoeff(0,1);
541  U f1 = f[1] * getCloverCoeff(0,2);
542  U f2 = f[2] * getCloverCoeff(0,3);
543  U f3 = f[3] * getCloverCoeff(1,2);
544  U f4 = f[4] * getCloverCoeff(1,3);
545  U f5 = f[5] * getCloverCoeff(2,3);
546 
547  const int nodeSites = QDP::Layout::sitesOnNode();
548  QDPCloverEnv::QDPCloverMakeClovArg<U> arg = {diag_mass, f0,f1,f2,f3,f4,f5,tri };
549  dispatch_to_threads(nodeSites, arg, QDPCloverEnv::makeClovSiteLoop<U>);
550 
551 
552  END_CODE();
553  }
554 
555 
556  //! Invert
557  /*!
558  * Computes the inverse of the term on cb using Cholesky
559  */
560  template<typename T, typename U>
562  {
563  START_CODE();
564 
565  // When you are doing the cholesky - also fill out the trace_log_diag piece)
566  // chlclovms(tr_log_diag_, cb);
567  // Switch to LDL^\dag inversion
568  ldagdlinv(tr_log_diag_,cb);
569 
570  END_CODE();
571  }
572 
573 
574  //! Invert
575  /*!
576  * Computes the inverse of the term on cb using Cholesky
577  *
578  * \return logarithm of the determinant
579  */
580  template<typename T, typename U>
582  {
583 #ifndef QDP_IS_QDPJIT
584  START_CODE();
585 
586  if( choles_done[cb] == false )
587  {
588  QDPIO::cout << __func__ << ": Error: you have not done the Cholesky.on this operator on this subset" << std::endl;
589  QDPIO::cout << "You sure you should not be asking invclov?" << std::endl;
590  QDP_abort(1);
591  }
592 
593 
594  END_CODE();
595 
596  // Need to thread generic sums in QDP++?
597  // Need to thread generic norm2() in QDP++?
598 
599 
600  return sum(tr_log_diag_, rb[cb]);
601 #else
602  assert(!"ni");
603  Double ret=0.;
604  return ret;
605 #endif
606  }
607 
608  namespace QDPCloverEnv {
609  template<typename U>
610  struct LDagDLInvArgs {
611  typedef typename WordType<U>::Type_t REALT;
612  typedef OScalar< PScalar< PScalar< RScalar<REALT> > > > RealT;
613  typedef OLattice< PScalar< PScalar< RScalar<REALT> > > > LatticeRealT;
616  int cb;
617  };
618 
619  template<typename U>
620  inline
621  void LDagDLInvSiteLoop(int lo, int hi, int myId, LDagDLInvArgs<U>* a)
622  {
623  typedef typename LDagDLInvArgs<U>::REALT REALT;
624  typedef typename LDagDLInvArgs<U>::RealT RealT;
625  typedef typename LDagDLInvArgs<U>::LatticeRealT LatticeRealT;
626 
627  LatticeRealT& tr_log_diag = a->tr_log_diag;
628  PrimitiveClovTriang < REALT>* tri = a->tri;
629  int cb = a->cb;
630 
631 
632  RScalar<REALT> zip=0;
633  int N = 2*Nc;
634 
635  // Loop through the sites.
636  for(int ssite=lo; ssite < hi; ++ssite) {
637 
638  int site = rb[cb].siteTable()[ssite];
639 
640  int site_neg_logdet=0;
641  // Loop through the blocks on the site.
642  for(int block=0; block < 2; block++) {
643 
644  // Triangular storage
645  RScalar<REALT> inv_d[6] QDP_ALIGN16;
646  RComplex<REALT> inv_offd[15] QDP_ALIGN16;
647  RComplex<REALT> v[6] QDP_ALIGN16;
648  RScalar<REALT> diag_g[6] QDP_ALIGN16;
649  // Algorithm 4.1.2 LDL^\dagger Decomposition
650  // From Golub, van Loan 3rd ed, page 139
651  for(int i=0; i < N; i++) {
652  inv_d[i] = tri[site].diag[block][i];
653  }
654 
655  for(int i=0; i < 15; i++) {
656  inv_offd[i] =tri[site].offd[block][i];
657  }
658 
659  for(int j=0; j < N; ++j) {
660 
661  // Compute v(0:j-1)
662  //
663  // for i=0:j-2
664  // v(i) = A(j,i) A(i,i)
665  // end
666 
667 
668  for(int i=0; i < j; i++) {
669  int elem_ji = j*(j-1)/2 + i;
670 
671  RComplex<REALT> A_ii = cmplx( inv_d[i], zip );
672  v[i] = A_ii*adj(inv_offd[elem_ji]);
673  }
674 
675  // v(j) = A(j,j) - A(j, 0:j-2) v(0:j-2)
676  // ^ This is done with a loop over k ie:
677  //
678  // v(j) = A(j,j) - sum_k A*(j,k) v(k) k=0...j-2
679  //
680  // = A(j,j) - sum_k A*(j,k) A(j,k) A(k,k)
681  // = A(j,j) - sum_k | A(j,k) |^2 A(k,k)
682 
683  v[j] = cmplx(inv_d[j],zip);
684 
685  for(int k=0; k < j; k++) {
686  int elem_jk = j*(j-1)/2 + k;
687  v[j] -= inv_offd[elem_jk]*v[k];
688  }
689 
690 
691  // At this point in time v[j] has to be real, since
692  // A(j,j) is from diag ie real and all | A(j,k) |^2 is real
693  // as is A(k,k)
694 
695  // A(j,j) is the diagonal element - so store it.
696  inv_d[j] = real( v[j] );
697 
698  // Last line of algorithm:
699  // A( j+1 : n, j) = ( A(j+1:n, j) - A(j+1:n, 1:j-1)v(1:k-1) ) / v(j)
700  //
701  // use k as first colon notation and l as second so
702  //
703  // for k=j+1 < n-1
704  // A(k,j) = A(k,j) ;
705  // for l=0 < j-1
706  // A(k,j) -= A(k, l) v(l)
707  // end
708  // A(k,j) /= v(j);
709  //
710  for(int k=j+1; k < N; k++) {
711  int elem_kj = k*(k-1)/2 + j;
712  for(int l=0; l < j; l++) {
713  int elem_kl = k*(k-1)/2 + l;
714  inv_offd[elem_kj] -= inv_offd[elem_kl] * v[l];
715  }
716  inv_offd[elem_kj] /= v[j];
717  }
718  }
719 
720  // Now fix up the inverse
721  RScalar<REALT> one;
722  one.elem() = (REALT)1;
723 
724  for(int i=0; i < N; i++) {
725  diag_g[i] = one/inv_d[i];
726 
727  // Compute the trace log
728  // NB we are always doing trace log | A |
729  // (because we are always working with actually A^\dagger A
730  // even in one flavour case where we square root)
731  tr_log_diag.elem(site).elem().elem().elem() += log(fabs(inv_d[i].elem()));
732  // However, it is worth counting just the no of negative logdets
733  // on site
734  if( inv_d[i].elem() < 0 ) {
735  site_neg_logdet++;
736  }
737  }
738  // Now we need to invert the L D L^\dagger
739  // We can do this by solving:
740  //
741  // L D L^\dagger M^{-1} = 1
742  //
743  // This can be done by solving L D X = 1 (X = L^\dagger M^{-1})
744  //
745  // Then solving L^\dagger M^{-1} = X
746  //
747  // LD is lower diagonal and so X will also be lower diagonal.
748  // LD X = 1 can be solved by forward substitution.
749  //
750  // Likewise L^\dagger is strictly upper triagonal and so
751  // L^\dagger M^{-1} = X can be solved by forward substitution.
752  RComplex<REALT> sum;
753  for(int k = 0; k < N; ++k) {
754 
755  for(int i = 0; i < k; ++i) {
756  zero_rep(v[i]);
757  }
758 
759  /*# Forward substitution */
760 
761  // The first element is the inverse of the diagonal
762  v[k] = cmplx(diag_g[k],zip);
763 
764  for(int i = k+1; i < N; ++i) {
765  zero_rep(v[i]);
766 
767  for(int j = k; j < i; ++j) {
768  int elem_ij = i*(i-1)/2+j;
769 
770  // subtract l_ij*d_j*x_{kj}
771  v[i] -= inv_offd[elem_ij] *inv_d[j]*v[j];
772 
773  }
774 
775  // scale out by 1/d_i
776  v[i] *= diag_g[i];
777  }
778 
779  /*# Backward substitution */
780  // V[N-1] remains unchanged
781  // Start from V[N-2]
782 
783  for(int i = N-2; (int)i >= (int)k; --i) {
784  for(int j = i+1; j < N; ++j) {
785  int elem_ji = j*(j-1)/2 + i;
786  // Subtract terms of typ (l_ji)*x_kj
787  v[i] -= adj(inv_offd[elem_ji]) * v[j];
788  }
789  }
790 
791  /*# Overwrite column k of invcl.offd */
792  inv_d[k] = real(v[k]);
793  for(int i = k+1; i < N; ++i) {
794 
795  int elem_ik = i*(i-1)/2+k;
796  inv_offd[elem_ik] = v[i];
797  }
798  }
799 
800 
801  // Overwrite original data
802  for(int i=0; i < N; i++) {
803  tri[site].diag[block][i] = inv_d[i];
804  }
805  for(int i=0; i < 15; i++) {
806  tri[site].offd[block][i] = inv_offd[i];
807  }
808  }
809 
810  if( site_neg_logdet != 0 ) {
811  // Report if site has any negative terms. (-ve def)
812  std::cout << "WARNING: found " << site_neg_logdet
813  << " negative eigenvalues in Clover DET at site: " << site << std::endl;
814  }
815  }/* End Site Loop */
816  } /* End Function */
817  } /* End Namespace */
818 
819 
820  /*! An LDL^\dag decomposition and inversion? */
821  template<typename T, typename U>
823  {
824 #ifndef QDP_IS_QDPJIT
825  START_CODE();
826 
827  if ( 2*Nc < 3 )
828  {
829  QDPIO::cerr << __func__ << ": Matrix is too small" << std::endl;
830  QDP_abort(1);
831  }
832 
833  // Zero trace log
834  tr_log_diag[rb[cb]] = zero;
835 
836  QDPCloverEnv::LDagDLInvArgs<U> a = { tr_log_diag, tri, cb };
837  int num_site_table = rb[cb].numSiteTable();
838  dispatch_to_threads(num_site_table, a, QDPCloverEnv::LDagDLInvSiteLoop<U>);
839 
840 
841  // This comes from the days when we used to do Cholesky
842  choles_done[cb] = true;
843 
844  END_CODE();
845 #endif
846  }
847 
848  /*! CHLCLOVMS - Cholesky decompose the clover mass term and uses it to
849  * compute lower(A^-1) = lower((L.L^dag)^-1)
850  * Adapted from Golub and Van Loan, Matrix Computations, 2nd, Sec 4.2.4
851  *
852  * Arguments:
853  *
854  * \param DetP flag whether to compute determinant (Read)
855  * \param logdet logarithm of the determinant (Write)
856  * \param cb checkerboard of work (Read)
857  */
858 
859  namespace QDPCloverEnv {
860 
861  template<typename U>
862  inline
863  void cholesSiteLoop(int lo, int hi, int myId, LDagDLInvArgs<U>* a)
864  {
865  typedef typename LDagDLInvArgs<U>::REALT REALT;
866  typedef typename LDagDLInvArgs<U>::RealT RealT;
867  typedef typename LDagDLInvArgs<U>::LatticeRealT LatticeRealT;
868 
869  LatticeRealT& tr_log_diag = a->tr_log_diag;
870  PrimitiveClovTriang <REALT>* tri = a->tri;
871  int cb = a->cb;
872 
873  int n = 2*Nc;
874  /*# Cholesky decompose A = L.L^dag */
875  /*# NOTE!!: I can store this matrix in invclov, but will need a */
876  /*# temporary diag */
877  for(int ssite=lo; ssite < hi; ++ssite) {
878  int site = rb[cb].siteTable()[ssite];
879 
881 
882  multi1d< RScalar<REALT> > diag_g(n);
883  multi1d< RComplex<REALT> > v1(n);
884  RComplex<REALT> sum;
885  RScalar<REALT> one;
886  RScalar<REALT> zero;
887  RScalar<REALT> lrtmp;
888 
889  one = 1;
890  zero = 0;
891 
892  for(int s = 0; s < 2; ++s) {
893 
894  int elem_jk = 0;
895  int elem_ij;
896 
897  for(int j = 0; j < n; ++j) {
898 
899  /*# Multiply clover mass term against basis std::vector. */
900  /*# Actually, I need a column of the lower triang matrix clov. */
901  v1[j] = cmplx(tri[site].diag[s][j],zero);
902 
903  elem_ij = elem_jk + 2*j;
904 
905  for(int i = j+1; i < n; ++i) {
906  v1[i] = tri[site].offd[s][elem_ij];
907  elem_ij += i;
908  }
909 
910  /*# Back to cholesky */
911  /*# forward substitute */
912  for(int k = 0; k < j; ++k) {
913 
914  int elem_ik = elem_jk;
915 
916  for(int i = j; i < n; ++i) {
917 
918  v1[i] -= adj(invcl.offd[s][elem_jk]) * invcl.offd[s][elem_ik];
919  elem_ik += i;
920  }
921  elem_jk++;
922  }
923 
924  /*# The diagonal is (should be!!) real and positive */
925  diag_g[j] = real(v1[j]);
926 
927  /*#+ */
928  /*# Squeeze in computation of the trace log of the diagonal term */
929  /*#- */
930  if ( diag_g[j].elem() > 0 ) {
931 
932  lrtmp = log(diag_g[j]);
933  }
934  else {
935 
936  // Make sure any node can print this message
937  std::cerr << "Clover term has negative diagonal element: "
938  << "diag_g[" << j << "]= " << diag_g[j]
939  << " at site: " << site << std::endl;
940  QDP_abort(1);
941  }
942 
943  tr_log_diag.elem(site).elem().elem() += lrtmp;
944 
945  diag_g[j] = sqrt(diag_g[j]);
946  diag_g[j] = one / diag_g[j];
947 
948  /*# backward substitute */
949  elem_ij = elem_jk + j;
950  for(int i = j+1; i < n; ++i) {
951 
952  invcl.offd[s][elem_ij] = v1[i] * diag_g[j];
953  elem_ij += i;
954  }
955  }
956 
957  /*# Use forward and back substitution to construct invcl.offd = lower(A^-1) */
958  for(int k = 0; k < n; ++k) {
959 
960  for(int i = 0; i < k; ++i)
961  zero_rep(v1[i]);
962 
963  /*# Forward substitution */
964  v1[k] = cmplx(diag_g[k],zero);
965 
966  for(int i = k+1; i < n; ++i) {
967 
968  zero_rep(sum);
969  elem_ij = i*(i-1)/2+k;
970  for(int j = k; j < i; ++j) {
971 
972  sum -= invcl.offd[s][elem_ij] * v1[j];
973  elem_ij++;
974  }
975 
976  v1[i] = sum * diag_g[i];
977  }
978 
979  /*# Backward substitution */
980  v1[n-1] = v1[n-1] * diag_g[n-1];
981 
982  for(int i = n-2; (int)i >= (int)k; --i) {
983 
984  sum = v1[i];
985 
986  int elem_ji = ((i+1)*i)/2+i;
987  for(int j = i+1; j < n; ++j) {
988 
989  sum -= adj(invcl.offd[s][elem_ji]) * v1[j];
990  elem_ji += j;
991  }
992  v1[i] = sum * diag_g[i];
993  }
994 
995  /*# Overwrite column k of invcl.offd */
996  invcl.diag[s][k] = real(v1[k]);
997 
998  int elem_ik = ((k+1)*k)/2+k;
999 
1000  for(int i = k+1; i < n; ++i) {
1001 
1002  invcl.offd[s][elem_ik] = v1[i];
1003  elem_ik += i;
1004  }
1005  }
1006  }
1007 
1008  // Overwrite original element
1009  tri[site] = invcl;
1010  }
1011  } // End function
1012 
1013  } // End Namespace
1014 
1015 
1016  template<typename T, typename U>
1018  {
1019 #ifndef QDP_IS_QDPJIT
1020  START_CODE();
1021 
1022  if ( 2*Nc < 3 )
1023  {
1024  QDPIO::cerr << __func__ << ": Matrix is too small" << std::endl;
1025  QDP_abort(1);
1026  }
1027 
1028  tr_log_diag = zero;
1029  QDPCloverEnv::LDagDLInvArgs<U> a = { tr_log_diag, tri, cb};
1030  dispatch_to_threads(rb[cb].numSiteTable(), a, QDPCloverEnv::cholesSiteLoop<U>);
1031 
1032  choles_done[cb] = true;
1033  END_CODE();
1034 #endif
1035  }
1036 
1037 
1038 
1039  /**
1040  * Apply a dslash
1041  *
1042  * Performs the operation
1043  *
1044  * chi <- (L + D + L^dag) . psi
1045  *
1046  * where
1047  * L is a lower triangular matrix
1048  * D is the real diagonal. (stored together in type TRIANG)
1049  *
1050  * Arguments:
1051  * \param chi result (Write)
1052  * \param psi source (Read)
1053  * \param isign D'^dag or D' ( MINUS | PLUS ) resp. (Read)
1054  * \param cb Checkerboard of OUTPUT std::vector (Read)
1055  */
1056  template<typename T, typename U>
1058  enum PlusMinus isign, int site) const
1059  {
1060 #ifndef QDP_IS_QDPJIT
1061  START_CODE();
1062 
1063  if ( Ns != 4 )
1064  {
1065  QDPIO::cerr << __func__ << ": CloverTerm::applySite requires Ns==4" << std::endl;
1066  QDP_abort(1);
1067  }
1068 
1069  int n = 2*Nc;
1070 
1071  RComplex<REALT>* cchi = (RComplex<REALT>*)&(chi.elem(site).elem(0).elem(0));
1072  const RComplex<REALT>* ppsi = (const RComplex<REALT>*)&(psi.elem(site).elem(0).elem(0));
1073 
1074 
1075  cchi[ 0] = tri[site].diag[0][ 0] * ppsi[ 0]
1076  + conj(tri[site].offd[0][ 0]) * ppsi[ 1]
1077  + conj(tri[site].offd[0][ 1]) * ppsi[ 2]
1078  + conj(tri[site].offd[0][ 3]) * ppsi[ 3]
1079  + conj(tri[site].offd[0][ 6]) * ppsi[ 4]
1080  + conj(tri[site].offd[0][10]) * ppsi[ 5];
1081 
1082  cchi[ 1] = tri[site].diag[0][ 1] * ppsi[ 1]
1083  + tri[site].offd[0][ 0] * ppsi[ 0]
1084  + conj(tri[site].offd[0][ 2]) * ppsi[ 2]
1085  + conj(tri[site].offd[0][ 4]) * ppsi[ 3]
1086  + conj(tri[site].offd[0][ 7]) * ppsi[ 4]
1087  + conj(tri[site].offd[0][11]) * ppsi[ 5];
1088 
1089  cchi[ 2] = tri[site].diag[0][ 2] * ppsi[ 2]
1090  + tri[site].offd[0][ 1] * ppsi[ 0]
1091  + tri[site].offd[0][ 2] * ppsi[ 1]
1092  + conj(tri[site].offd[0][ 5]) * ppsi[ 3]
1093  + conj(tri[site].offd[0][ 8]) * ppsi[ 4]
1094  + conj(tri[site].offd[0][12]) * ppsi[ 5];
1095 
1096  cchi[ 3] = tri[site].diag[0][ 3] * ppsi[ 3]
1097  + tri[site].offd[0][ 3] * ppsi[ 0]
1098  + tri[site].offd[0][ 4] * ppsi[ 1]
1099  + tri[site].offd[0][ 5] * ppsi[ 2]
1100  + conj(tri[site].offd[0][ 9]) * ppsi[ 4]
1101  + conj(tri[site].offd[0][13]) * ppsi[ 5];
1102 
1103  cchi[ 4] = tri[site].diag[0][ 4] * ppsi[ 4]
1104  + tri[site].offd[0][ 6] * ppsi[ 0]
1105  + tri[site].offd[0][ 7] * ppsi[ 1]
1106  + tri[site].offd[0][ 8] * ppsi[ 2]
1107  + tri[site].offd[0][ 9] * ppsi[ 3]
1108  + conj(tri[site].offd[0][14]) * ppsi[ 5];
1109 
1110  cchi[ 5] = tri[site].diag[0][ 5] * ppsi[ 5]
1111  + tri[site].offd[0][10] * ppsi[ 0]
1112  + tri[site].offd[0][11] * ppsi[ 1]
1113  + tri[site].offd[0][12] * ppsi[ 2]
1114  + tri[site].offd[0][13] * ppsi[ 3]
1115  + tri[site].offd[0][14] * ppsi[ 4];
1116 
1117  cchi[ 6] = tri[site].diag[1][ 0] * ppsi[ 6]
1118  + conj(tri[site].offd[1][ 0]) * ppsi[ 7]
1119  + conj(tri[site].offd[1][ 1]) * ppsi[ 8]
1120  + conj(tri[site].offd[1][ 3]) * ppsi[ 9]
1121  + conj(tri[site].offd[1][ 6]) * ppsi[10]
1122  + conj(tri[site].offd[1][10]) * ppsi[11];
1123 
1124  cchi[ 7] = tri[site].diag[1][ 1] * ppsi[ 7]
1125  + tri[site].offd[1][ 0] * ppsi[ 6]
1126  + conj(tri[site].offd[1][ 2]) * ppsi[ 8]
1127  + conj(tri[site].offd[1][ 4]) * ppsi[ 9]
1128  + conj(tri[site].offd[1][ 7]) * ppsi[10]
1129  + conj(tri[site].offd[1][11]) * ppsi[11];
1130 
1131  cchi[ 8] = tri[site].diag[1][ 2] * ppsi[ 8]
1132  + tri[site].offd[1][ 1] * ppsi[ 6]
1133  + tri[site].offd[1][ 2] * ppsi[ 7]
1134  + conj(tri[site].offd[1][ 5]) * ppsi[ 9]
1135  + conj(tri[site].offd[1][ 8]) * ppsi[10]
1136  + conj(tri[site].offd[1][12]) * ppsi[11];
1137 
1138  cchi[ 9] = tri[site].diag[1][ 3] * ppsi[ 9]
1139  + tri[site].offd[1][ 3] * ppsi[ 6]
1140  + tri[site].offd[1][ 4] * ppsi[ 7]
1141  + tri[site].offd[1][ 5] * ppsi[ 8]
1142  + conj(tri[site].offd[1][ 9]) * ppsi[10]
1143  + conj(tri[site].offd[1][13]) * ppsi[11];
1144 
1145  cchi[10] = tri[site].diag[1][ 4] * ppsi[10]
1146  + tri[site].offd[1][ 6] * ppsi[ 6]
1147  + tri[site].offd[1][ 7] * ppsi[ 7]
1148  + tri[site].offd[1][ 8] * ppsi[ 8]
1149  + tri[site].offd[1][ 9] * ppsi[ 9]
1150  + conj(tri[site].offd[1][14]) * ppsi[11];
1151 
1152  cchi[11] = tri[site].diag[1][ 5] * ppsi[11]
1153  + tri[site].offd[1][10] * ppsi[ 6]
1154  + tri[site].offd[1][11] * ppsi[ 7]
1155  + tri[site].offd[1][12] * ppsi[ 8]
1156  + tri[site].offd[1][13] * ppsi[ 9]
1157  + tri[site].offd[1][14] * ppsi[10];
1158 
1159 
1160  END_CODE();
1161 #endif
1162  }
1163 
1164 
1165 
1166 
1167  //! TRIACNTR
1168  /*!
1169  * \ingroup linop
1170  *
1171  * Calculates
1172  * Tr_D ( Gamma_mat L )
1173  *
1174  * This routine is specific to Wilson fermions!
1175  *
1176  * the trace over the Dirac indices for one of the 16 Gamma matrices
1177  * and a hermitian color x spin matrix A, stored as a block diagonal
1178  * complex lower triangular matrix L and a real diagonal diag_L.
1179 
1180  * Here 0 <= mat <= 15 and
1181  * if mat = mat_1 + mat_2 * 2 + mat_3 * 4 + mat_4 * 8
1182  *
1183  * Gamma(mat) = gamma(1)^(mat_1) * gamma(2)^(mat_2) * gamma(3)^(mat_3)
1184  * * gamma(4)^(mat_4)
1185  *
1186  * Further, in basis for the Gamma matrices used, A is of the form
1187  *
1188  * | A_0 | 0 |
1189  * A = | --------- |
1190  * | 0 | A_1 |
1191  *
1192  *
1193  * Arguments:
1194  *
1195  * \param B the resulting SU(N) color matrix (Write)
1196  * \param clov clover term (Read)
1197  * \param mat label of the Gamma matrix (Read)
1198  */
1199 
1200  namespace QDPCloverEnv {
1201  template<typename U>
1202  struct TriaCntrArgs {
1203  typedef typename WordType<U>::Type_t REALT;
1204 
1205  U& B;
1207  int mat;
1208  int cb;
1209  };
1210 
1211  template<typename U>
1212  inline
1213  void triaCntrSiteLoop(int lo, int hi, int myId, TriaCntrArgs<U>* a)
1214  {
1215  typedef typename WordType<U>::Type_t REALT;
1216  U& B = a->B;
1217  const PrimitiveClovTriang< REALT >* tri = a->tri;
1218  int mat = a->mat;
1219  int cb = a->cb;
1220 
1221  for(int ssite=lo; ssite < hi; ++ssite) {
1222 
1223  int site = rb[cb].siteTable()[ssite];
1224 
1225  switch( mat ){
1226 
1227  case 0:
1228  /*# gamma( 0) 1 0 0 0 # ( 0000 ) --> 0 */
1229  /*# 0 1 0 0 */
1230  /*# 0 0 1 0 */
1231  /*# 0 0 0 1 */
1232  /*# From diagonal part */
1233  {
1234  RComplex<REALT> lctmp0;
1235  RScalar<REALT> lr_zero0;
1236  RScalar<REALT> lrtmp0;
1237 
1238  lr_zero0 = 0;
1239 
1240  for(int i0 = 0; i0 < Nc; ++i0) {
1241 
1242  lrtmp0 = tri[site].diag[0][i0];
1243  lrtmp0 += tri[site].diag[0][i0+Nc];
1244  lrtmp0 += tri[site].diag[1][i0];
1245  lrtmp0 += tri[site].diag[1][i0+Nc];
1246  B.elem(site).elem().elem(i0,i0) = cmplx(lrtmp0,lr_zero0);
1247  }
1248 
1249  /*# From lower triangular portion */
1250  int elem_ij0 = 0;
1251  for(int i0 = 1; i0 < Nc; ++i0) {
1252 
1253  int elem_ijb0 = (i0+Nc)*(i0+Nc-1)/2 + Nc;
1254 
1255  for(int j0 = 0; j0 < i0; ++j0) {
1256 
1257  lctmp0 = tri[site].offd[0][elem_ij0];
1258  lctmp0 += tri[site].offd[0][elem_ijb0];
1259  lctmp0 += tri[site].offd[1][elem_ij0];
1260  lctmp0 += tri[site].offd[1][elem_ijb0];
1261 
1262  B.elem(site).elem().elem(j0,i0) = lctmp0;
1263  B.elem(site).elem().elem(i0,j0) = adj(lctmp0);
1264 
1265 
1266  elem_ij0++;
1267  elem_ijb0++;
1268  }
1269  }
1270  }
1271 
1272  break;
1273 
1274  case 3:
1275  /*# gamma( 12) -i 0 0 0 # ( 0011 ) --> 3 */
1276  /*# 0 i 0 0 */
1277  /*# 0 0 -i 0 */
1278  /*# 0 0 0 i */
1279  /*# From diagonal part */
1280 
1281  {
1282  RComplex<REALT> lctmp3;
1283  RScalar<REALT> lr_zero3;
1284  RScalar<REALT> lrtmp3;
1285 
1286  lr_zero3 = 0;
1287 
1288  for(int i3 = 0; i3 < Nc; ++i3) {
1289 
1290  lrtmp3 = tri[site].diag[0][i3+Nc];
1291  lrtmp3 -= tri[site].diag[0][i3];
1292  lrtmp3 -= tri[site].diag[1][i3];
1293  lrtmp3 += tri[site].diag[1][i3+Nc];
1294  B.elem(site).elem().elem(i3,i3) = cmplx(lr_zero3,lrtmp3);
1295  }
1296 
1297  /*# From lower triangular portion */
1298  int elem_ij3 = 0;
1299  for(int i3 = 1; i3 < Nc; ++i3) {
1300 
1301  int elem_ijb3 = (i3+Nc)*(i3+Nc-1)/2 + Nc;
1302 
1303  for(int j3 = 0; j3 < i3; ++j3) {
1304 
1305  lctmp3 = tri[site].offd[0][elem_ijb3];
1306  lctmp3 -= tri[site].offd[0][elem_ij3];
1307  lctmp3 -= tri[site].offd[1][elem_ij3];
1308  lctmp3 += tri[site].offd[1][elem_ijb3];
1309 
1310  B.elem(site).elem().elem(j3,i3) = timesI(adj(lctmp3));
1311  B.elem(site).elem().elem(i3,j3) = timesI(lctmp3);
1312 
1313  elem_ij3++;
1314  elem_ijb3++;
1315  }
1316  }
1317  }
1318  break;
1319 
1320  case 5:
1321  /*# gamma( 13) 0 -1 0 0 # ( 0101 ) --> 5 */
1322  /*# 1 0 0 0 */
1323  /*# 0 0 0 -1 */
1324  /*# 0 0 1 0 */
1325 
1326  {
1327 
1328  RComplex<REALT> lctmp5;
1329  RScalar<REALT> lrtmp5;
1330 
1331  for(int i5 = 0; i5 < Nc; ++i5) {
1332 
1333  int elem_ij5 = (i5+Nc)*(i5+Nc-1)/2;
1334 
1335  for(int j5 = 0; j5 < Nc; ++j5) {
1336 
1337  int elem_ji5 = (j5+Nc)*(j5+Nc-1)/2 + i5;
1338 
1339 
1340  lctmp5 = adj(tri[site].offd[0][elem_ji5]);
1341  lctmp5 -= tri[site].offd[0][elem_ij5];
1342  lctmp5 += adj(tri[site].offd[1][elem_ji5]);
1343  lctmp5 -= tri[site].offd[1][elem_ij5];
1344 
1345 
1346  B.elem(site).elem().elem(i5,j5) = lctmp5;
1347 
1348  elem_ij5++;
1349  }
1350  }
1351  }
1352  break;
1353 
1354  case 6:
1355  /*# gamma( 23) 0 -i 0 0 # ( 0110 ) --> 6 */
1356  /*# -i 0 0 0 */
1357  /*# 0 0 0 -i */
1358  /*# 0 0 -i 0 */
1359 
1360  {
1361  RComplex<REALT> lctmp6;
1362  RScalar<REALT> lrtmp6;
1363 
1364  for(int i6 = 0; i6 < Nc; ++i6) {
1365 
1366  int elem_ij6 = (i6+Nc)*(i6+Nc-1)/2;
1367 
1368  for(int j6 = 0; j6 < Nc; ++j6) {
1369 
1370  int elem_ji6 = (j6+Nc)*(j6+Nc-1)/2 + i6;
1371 
1372  lctmp6 = adj(tri[site].offd[0][elem_ji6]);
1373  lctmp6 += tri[site].offd[0][elem_ij6];
1374  lctmp6 += adj(tri[site].offd[1][elem_ji6]);
1375  lctmp6 += tri[site].offd[1][elem_ij6];
1376 
1377  B.elem(site).elem().elem(i6,j6) = timesMinusI(lctmp6);
1378 
1379  elem_ij6++;
1380  }
1381  }
1382  }
1383  break;
1384 
1385  case 9:
1386  /*# gamma( 14) 0 i 0 0 # ( 1001 ) --> 9 */
1387  /*# i 0 0 0 */
1388  /*# 0 0 0 -i */
1389  /*# 0 0 -i 0 */
1390 
1391  {
1392  RComplex<REALT> lctmp9;
1393  RScalar<REALT> lrtmp9;
1394 
1395  for(int i9 = 0; i9 < Nc; ++i9) {
1396 
1397  int elem_ij9 = (i9+Nc)*(i9+Nc-1)/2;
1398 
1399  for(int j9 = 0; j9 < Nc; ++j9) {
1400 
1401  int elem_ji9 = (j9+Nc)*(j9+Nc-1)/2 + i9;
1402 
1403  lctmp9 = adj(tri[site].offd[0][elem_ji9]);
1404  lctmp9 += tri[site].offd[0][elem_ij9];
1405  lctmp9 -= adj(tri[site].offd[1][elem_ji9]);
1406  lctmp9 -= tri[site].offd[1][elem_ij9];
1407 
1408  B.elem(site).elem().elem(i9,j9) = timesI(lctmp9);
1409 
1410  elem_ij9++;
1411  }
1412  }
1413  }
1414  break;
1415 
1416  case 10:
1417  /*# gamma( 24) 0 -1 0 0 # ( 1010 ) --> 10 */
1418  /*# 1 0 0 0 */
1419  /*# 0 0 0 1 */
1420  /*# 0 0 -1 0 */
1421  {
1422  RComplex<REALT> lctmp10;
1423  RScalar<REALT> lrtmp10;
1424 
1425  for(int i10 = 0; i10 < Nc; ++i10) {
1426 
1427  int elem_ij10 = (i10+Nc)*(i10+Nc-1)/2;
1428 
1429  for(int j10 = 0; j10 < Nc; ++j10) {
1430 
1431  int elem_ji10 = (j10+Nc)*(j10+Nc-1)/2 + i10;
1432 
1433  lctmp10 = adj(tri[site].offd[0][elem_ji10]);
1434  lctmp10 -= tri[site].offd[0][elem_ij10];
1435  lctmp10 -= adj(tri[site].offd[1][elem_ji10]);
1436  lctmp10 += tri[site].offd[1][elem_ij10];
1437 
1438  B.elem(site).elem().elem(i10,j10) = lctmp10;
1439 
1440  elem_ij10++;
1441  }
1442  }
1443  }
1444  break;
1445 
1446  case 12:
1447  /*# gamma( 34) i 0 0 0 # ( 1100 ) --> 12 */
1448  /*# 0 -i 0 0 */
1449  /*# 0 0 -i 0 */
1450  /*# 0 0 0 i */
1451  /*# From diagonal part */
1452  {
1453 
1454  RComplex<REALT> lctmp12;
1455  RScalar<REALT> lr_zero12;
1456  RScalar<REALT> lrtmp12;
1457 
1458  lr_zero12 = 0;
1459 
1460  for(int i12 = 0; i12 < Nc; ++i12) {
1461 
1462  lrtmp12 = tri[site].diag[0][i12];
1463  lrtmp12 -= tri[site].diag[0][i12+Nc];
1464  lrtmp12 -= tri[site].diag[1][i12];
1465  lrtmp12 += tri[site].diag[1][i12+Nc];
1466  B.elem(site).elem().elem(i12,i12) = cmplx(lr_zero12,lrtmp12);
1467  }
1468 
1469  /*# From lower triangular portion */
1470  int elem_ij12 = 0;
1471  for(int i12 = 1; i12 < Nc; ++i12) {
1472 
1473  int elem_ijb12 = (i12+Nc)*(i12+Nc-1)/2 + Nc;
1474 
1475  for(int j12 = 0; j12 < i12; ++j12) {
1476 
1477  lctmp12 = tri[site].offd[0][elem_ij12];
1478  lctmp12 -= tri[site].offd[0][elem_ijb12];
1479  lctmp12 -= tri[site].offd[1][elem_ij12];
1480  lctmp12 += tri[site].offd[1][elem_ijb12];
1481 
1482  B.elem(site).elem().elem(i12,j12) = timesI(lctmp12);
1483  B.elem(site).elem().elem(j12,i12) = timesI(adj(lctmp12));
1484 
1485  elem_ij12++;
1486  elem_ijb12++;
1487  }
1488  }
1489  }
1490  break;
1491 
1492  default:
1493  QDPIO::cout << __func__ << ": invalid Gamma matrix int" << std::endl;
1494  QDP_abort(1);
1495  }
1496 
1497  } // END Site Loop
1498  } // End Function
1499  } // End Namespace
1500 
1501 
1502  template<typename T, typename U>
1503  void QDPCloverTermT<T,U>::triacntr(U& B, int mat, int cb) const
1504  {
1505 #ifndef QDP_IS_QDPJIT
1506  START_CODE();
1507 
1508  B = zero;
1509 
1510  if ( mat < 0 || mat > 15 )
1511  {
1512  QDPIO::cerr << __func__ << ": Gamma out of range: mat = " << mat << std::endl;
1513  QDP_abort(1);
1514  }
1515 
1516  QDPCloverEnv::TriaCntrArgs<U> a = { B, tri, mat, cb };
1517  dispatch_to_threads(rb[cb].numSiteTable(), a,
1518  QDPCloverEnv::triaCntrSiteLoop<U>);
1519 
1520  END_CODE();
1521 #endif
1522  }
1523 
1524  //! Returns the appropriate clover coefficient for indices mu and nu
1525  template<typename T, typename U>
1526  Real
1528  {
1529  START_CODE();
1530 
1531  if( param.anisoParam.anisoP ) {
1532  if (mu==param.anisoParam.t_dir || nu == param.anisoParam.t_dir) {
1533  return param.clovCoeffT;
1534  }
1535  else {
1536  // Otherwise return the spatial coeff
1537  return param.clovCoeffR;
1538  }
1539  }
1540  else {
1541  // If there is no anisotropy just return the spatial one, it will
1542  // be the same as the temporal one
1543  return param.clovCoeffR;
1544  }
1545 
1546  END_CODE();
1547  }
1548 
1549 
1550  namespace QDPCloverEnv {
1551 
1552  template<typename T>
1553  struct ApplyArgs {
1554  typedef typename WordType<T>::Type_t REALT;
1555  T& chi;
1556  const T& psi;
1558  int cb;
1559  };
1560 
1561 
1562  template<typename T>
1563  void applySiteLoop(int lo, int hi, int MyId,
1564  ApplyArgs<T>* arg)
1565 
1566  {
1567 
1568  // This is essentially the body of the previous "Apply"
1569  // but now the args are handed in through user arg struct...
1570 
1571  START_CODE();
1572 
1573  typedef typename WordType<T>::Type_t REALT;
1574  // Unwrap the args...
1575  T& chi=arg->chi;
1576  const T& psi=arg->psi;
1577  const PrimitiveClovTriang<REALT>* tri = arg->tri;
1578  int cb = arg->cb;
1579 
1580 
1581  const int n = 2*Nc;
1582 
1583  const int* tab = rb[cb].siteTable().slice();
1584 
1585  // Now just loop from low to high sites...
1586  for(int ssite=lo; ssite < hi; ++ssite) {
1587 
1588  int site = tab[ssite];
1589 #define NEW
1590 #ifndef NEW
1591  RComplex<REALT>* cchi = (RComplex<REALT>*)&(chi.elem(site).elem(0).elem(0));
1592  const RComplex<REALT>* ppsi = (const RComplex<REALT>*)&(psi.elem(site).elem(0).elem(0));
1593 #else
1594  std::complex<REALT>* cchi = (std::complex<REALT>*)&(chi.elem(site).elem(0).elem(0));
1595  std::complex<REALT>* ppsi = (std::complex<REALT>*)&(psi.elem(site).elem(0).elem(0));
1596  const REALT* const diag0 = (const REALT* const)(&(tri[site].diag[0][0].elem()));
1597  const REALT* const diag1 = (const REALT* const)(&(tri[site].diag[1][0].elem()));
1598  const std::complex<REALT>* const offdiag0 =
1599  (const std::complex<REALT>* const)(&(tri[site].offd[0][0].real()));
1600  const std::complex<REALT>* const offdiag1 =
1601  (const std::complex<REALT>* const)(&(tri[site].offd[1][0].real()));
1602 
1603 #endif
1604 #if 1
1605 #warning "Using unrolled clover term"
1606  // Rolled version
1607  for(int i = 0; i < n; ++i) {
1608 #ifndef NEW
1609  cchi[0*n+i] = tri[site].diag[0][i] * ppsi[0*n+i];
1610  cchi[1*n+i] = tri[site].diag[1][i] * ppsi[1*n+i];
1611 #else
1612  cchi[0*n+i] = diag0[i] * ppsi[0*n+i];
1613  cchi[1*n+i] = diag1[i] * ppsi[1*n+i];
1614 
1615 #endif
1616  }
1617 
1618  int kij = 0;
1619  for(int i = 0; i < n; ++i) {
1620 
1621  for(int j = 0; j < i; j++) {
1622 #ifndef NEW
1623  cchi[0*n+i] += tri[site].offd[0][kij] * ppsi[0*n+j];
1624  cchi[0*n+j] += conj(tri[site].offd[0][kij]) * ppsi[0*n+i];
1625  cchi[1*n+i] += tri[site].offd[1][kij] * ppsi[1*n+j];
1626  cchi[1*n+j] += conj(tri[site].offd[1][kij]) * ppsi[1*n+i];
1627 #else
1628  cchi[0*n+i] += offdiag0[kij] * ppsi[0*n+j];
1629  cchi[0*n+j] += conj(offdiag0[kij]) * ppsi[0*n+i];
1630  cchi[1*n+i] += offdiag1[kij] * ppsi[1*n+j];
1631  cchi[1*n+j] += conj(offdiag1[kij]) * ppsi[1*n+i];
1632 #endif
1633  kij++;
1634  }
1635 
1636  }
1637 #elif 0
1638 #warning "Using unrolled clover term - version 1"
1639  // Unrolled version - basically copying the loop structure
1640  cchi[ 0] = tri[site].diag[0][0] * ppsi[ 0];
1641  cchi[ 1] = tri[site].diag[0][1] * ppsi[ 1];
1642  cchi[ 2] = tri[site].diag[0][2] * ppsi[ 2];
1643  cchi[ 3] = tri[site].diag[0][3] * ppsi[ 3];
1644  cchi[ 4] = tri[site].diag[0][4] * ppsi[ 4];
1645  cchi[ 5] = tri[site].diag[0][5] * ppsi[ 5];
1646  cchi[ 6] = tri[site].diag[1][0] * ppsi[ 6];
1647  cchi[ 7] = tri[site].diag[1][1] * ppsi[ 7];
1648  cchi[ 8] = tri[site].diag[1][2] * ppsi[ 8];
1649  cchi[ 9] = tri[site].diag[1][3] * ppsi[ 9];
1650  cchi[10] = tri[site].diag[1][4] * ppsi[10];
1651  cchi[11] = tri[site].diag[1][5] * ppsi[11];
1652 
1653  // cchi[0*n+i] += tri[site].offd[0][kij] * ppsi[0*n+j];
1654  cchi[ 1] += tri[site].offd[0][ 0] * ppsi[ 0];
1655  cchi[ 2] += tri[site].offd[0][ 1] * ppsi[ 0];
1656  cchi[ 2] += tri[site].offd[0][ 2] * ppsi[ 1];
1657  cchi[ 3] += tri[site].offd[0][ 3] * ppsi[ 0];
1658  cchi[ 3] += tri[site].offd[0][ 4] * ppsi[ 1];
1659  cchi[ 3] += tri[site].offd[0][ 5] * ppsi[ 2];
1660  cchi[ 4] += tri[site].offd[0][ 6] * ppsi[ 0];
1661  cchi[ 4] += tri[site].offd[0][ 7] * ppsi[ 1];
1662  cchi[ 4] += tri[site].offd[0][ 8] * ppsi[ 2];
1663  cchi[ 4] += tri[site].offd[0][ 9] * ppsi[ 3];
1664  cchi[ 5] += tri[site].offd[0][10] * ppsi[ 0];
1665  cchi[ 5] += tri[site].offd[0][11] * ppsi[ 1];
1666  cchi[ 5] += tri[site].offd[0][12] * ppsi[ 2];
1667  cchi[ 5] += tri[site].offd[0][13] * ppsi[ 3];
1668  cchi[ 5] += tri[site].offd[0][14] * ppsi[ 4];
1669 
1670  // cchi[0*n+j] += conj(tri[site].offd[0][kij]) * ppsi[0*n+i];
1671  cchi[ 0] += conj(tri[site].offd[0][ 0]) * ppsi[ 1];
1672  cchi[ 0] += conj(tri[site].offd[0][ 1]) * ppsi[ 2];
1673  cchi[ 1] += conj(tri[site].offd[0][ 2]) * ppsi[ 2];
1674  cchi[ 0] += conj(tri[site].offd[0][ 3]) * ppsi[ 3];
1675  cchi[ 1] += conj(tri[site].offd[0][ 4]) * ppsi[ 3];
1676  cchi[ 2] += conj(tri[site].offd[0][ 5]) * ppsi[ 3];
1677  cchi[ 0] += conj(tri[site].offd[0][ 6]) * ppsi[ 4];
1678  cchi[ 1] += conj(tri[site].offd[0][ 7]) * ppsi[ 4];
1679  cchi[ 2] += conj(tri[site].offd[0][ 8]) * ppsi[ 4];
1680  cchi[ 3] += conj(tri[site].offd[0][ 9]) * ppsi[ 4];
1681  cchi[ 0] += conj(tri[site].offd[0][10]) * ppsi[ 5];
1682  cchi[ 1] += conj(tri[site].offd[0][11]) * ppsi[ 5];
1683  cchi[ 2] += conj(tri[site].offd[0][12]) * ppsi[ 5];
1684  cchi[ 3] += conj(tri[site].offd[0][13]) * ppsi[ 5];
1685  cchi[ 4] += conj(tri[site].offd[0][14]) * ppsi[ 5];
1686 
1687  // cchi[1*n+i] += tri[site].offd[1][kij] * ppsi[1*n+j];
1688  cchi[ 7] += tri[site].offd[1][ 0] * ppsi[ 6];
1689  cchi[ 8] += tri[site].offd[1][ 1] * ppsi[ 6];
1690  cchi[ 8] += tri[site].offd[1][ 2] * ppsi[ 7];
1691  cchi[ 9] += tri[site].offd[1][ 3] * ppsi[ 6];
1692  cchi[ 9] += tri[site].offd[1][ 4] * ppsi[ 7];
1693  cchi[ 9] += tri[site].offd[1][ 5] * ppsi[ 8];
1694  cchi[10] += tri[site].offd[1][ 6] * ppsi[ 6];
1695  cchi[10] += tri[site].offd[1][ 7] * ppsi[ 7];
1696  cchi[10] += tri[site].offd[1][ 8] * ppsi[ 8];
1697  cchi[10] += tri[site].offd[1][ 9] * ppsi[ 9];
1698  cchi[11] += tri[site].offd[1][10] * ppsi[ 6];
1699  cchi[11] += tri[site].offd[1][11] * ppsi[ 7];
1700  cchi[11] += tri[site].offd[1][12] * ppsi[ 8];
1701  cchi[11] += tri[site].offd[1][13] * ppsi[ 9];
1702  cchi[11] += tri[site].offd[1][14] * ppsi[10];
1703 
1704  // cchi[1*n+j] += conj(tri[site].offd[1][kij]) * ppsi[1*n+i];
1705  cchi[ 6] += conj(tri[site].offd[1][ 0]) * ppsi[ 7];
1706  cchi[ 6] += conj(tri[site].offd[1][ 1]) * ppsi[ 8];
1707  cchi[ 7] += conj(tri[site].offd[1][ 2]) * ppsi[ 8];
1708  cchi[ 6] += conj(tri[site].offd[1][ 3]) * ppsi[ 9];
1709  cchi[ 7] += conj(tri[site].offd[1][ 4]) * ppsi[ 9];
1710  cchi[ 8] += conj(tri[site].offd[1][ 5]) * ppsi[ 9];
1711  cchi[ 6] += conj(tri[site].offd[1][ 6]) * ppsi[10];
1712  cchi[ 7] += conj(tri[site].offd[1][ 7]) * ppsi[10];
1713  cchi[ 8] += conj(tri[site].offd[1][ 8]) * ppsi[10];
1714  cchi[ 9] += conj(tri[site].offd[1][ 9]) * ppsi[10];
1715  cchi[ 6] += conj(tri[site].offd[1][10]) * ppsi[11];
1716  cchi[ 7] += conj(tri[site].offd[1][11]) * ppsi[11];
1717  cchi[ 8] += conj(tri[site].offd[1][12]) * ppsi[11];
1718  cchi[ 9] += conj(tri[site].offd[1][13]) * ppsi[11];
1719  cchi[10] += conj(tri[site].offd[1][14]) * ppsi[11];
1720 #elif 0
1721 #warning "Using unrolled clover term - version 2"
1722  // Unrolled version - collect all LHS terms into 1 expression
1723  cchi[ 0] = tri[site].diag[0][ 0] * ppsi[ 0];
1724  cchi[ 0] += conj(tri[site].offd[0][ 0]) * ppsi[ 1];
1725  cchi[ 0] += conj(tri[site].offd[0][ 1]) * ppsi[ 2];
1726  cchi[ 0] += conj(tri[site].offd[0][ 3]) * ppsi[ 3];
1727  cchi[ 0] += conj(tri[site].offd[0][ 6]) * ppsi[ 4];
1728  cchi[ 0] += conj(tri[site].offd[0][10]) * ppsi[ 5];
1729 
1730  cchi[ 1] = tri[site].diag[0][ 1] * ppsi[ 1];
1731  cchi[ 1] += tri[site].offd[0][ 0] * ppsi[ 0];
1732  cchi[ 1] += conj(tri[site].offd[0][ 2]) * ppsi[ 2];
1733  cchi[ 1] += conj(tri[site].offd[0][ 4]) * ppsi[ 3];
1734  cchi[ 1] += conj(tri[site].offd[0][ 7]) * ppsi[ 4];
1735  cchi[ 1] += conj(tri[site].offd[0][11]) * ppsi[ 5];
1736 
1737  cchi[ 2] = tri[site].diag[0][ 2] * ppsi[ 2];
1738  cchi[ 2] += tri[site].offd[0][ 1] * ppsi[ 0];
1739  cchi[ 2] += tri[site].offd[0][ 2] * ppsi[ 1];
1740  cchi[ 2] += conj(tri[site].offd[0][ 5]) * ppsi[ 3];
1741  cchi[ 2] += conj(tri[site].offd[0][ 8]) * ppsi[ 4];
1742  cchi[ 2] += conj(tri[site].offd[0][12]) * ppsi[ 5];
1743 
1744  cchi[ 3] = tri[site].diag[0][ 3] * ppsi[ 3];
1745  cchi[ 3] += tri[site].offd[0][ 3] * ppsi[ 0];
1746  cchi[ 3] += tri[site].offd[0][ 4] * ppsi[ 1];
1747  cchi[ 3] += tri[site].offd[0][ 5] * ppsi[ 2];
1748  cchi[ 3] += conj(tri[site].offd[0][ 9]) * ppsi[ 4];
1749  cchi[ 3] += conj(tri[site].offd[0][13]) * ppsi[ 5];
1750 
1751  cchi[ 4] = tri[site].diag[0][ 4] * ppsi[ 4];
1752  cchi[ 4] += tri[site].offd[0][ 6] * ppsi[ 0];
1753  cchi[ 4] += tri[site].offd[0][ 7] * ppsi[ 1];
1754  cchi[ 4] += tri[site].offd[0][ 8] * ppsi[ 2];
1755  cchi[ 4] += tri[site].offd[0][ 9] * ppsi[ 3];
1756  cchi[ 4] += conj(tri[site].offd[0][14]) * ppsi[ 5];
1757 
1758  cchi[ 5] = tri[site].diag[0][ 5] * ppsi[ 5];
1759  cchi[ 5] += tri[site].offd[0][10] * ppsi[ 0];
1760  cchi[ 5] += tri[site].offd[0][11] * ppsi[ 1];
1761  cchi[ 5] += tri[site].offd[0][12] * ppsi[ 2];
1762  cchi[ 5] += tri[site].offd[0][13] * ppsi[ 3];
1763  cchi[ 5] += tri[site].offd[0][14] * ppsi[ 4];
1764 
1765  cchi[ 6] = tri[site].diag[1][ 0] * ppsi[ 6];
1766  cchi[ 6] += conj(tri[site].offd[1][ 0]) * ppsi[ 7];
1767  cchi[ 6] += conj(tri[site].offd[1][ 1]) * ppsi[ 8];
1768  cchi[ 6] += conj(tri[site].offd[1][ 3]) * ppsi[ 9];
1769  cchi[ 6] += conj(tri[site].offd[1][ 6]) * ppsi[10];
1770  cchi[ 6] += conj(tri[site].offd[1][10]) * ppsi[11];
1771 
1772  cchi[ 7] = tri[site].diag[1][ 1] * ppsi[ 7];
1773  cchi[ 7] += tri[site].offd[1][ 0] * ppsi[ 6];
1774  cchi[ 7] += conj(tri[site].offd[1][ 2]) * ppsi[ 8];
1775  cchi[ 7] += conj(tri[site].offd[1][ 4]) * ppsi[ 9];
1776  cchi[ 7] += conj(tri[site].offd[1][ 7]) * ppsi[10];
1777  cchi[ 7] += conj(tri[site].offd[1][11]) * ppsi[11];
1778 
1779  cchi[ 8] = tri[site].diag[1][ 2] * ppsi[ 8];
1780  cchi[ 8] += tri[site].offd[1][ 1] * ppsi[ 6];
1781  cchi[ 8] += tri[site].offd[1][ 2] * ppsi[ 7];
1782  cchi[ 8] += conj(tri[site].offd[1][ 5]) * ppsi[ 9];
1783  cchi[ 8] += conj(tri[site].offd[1][ 8]) * ppsi[10];
1784  cchi[ 8] += conj(tri[site].offd[1][12]) * ppsi[11];
1785 
1786  cchi[ 9] = tri[site].diag[1][ 3] * ppsi[ 9];
1787  cchi[ 9] += tri[site].offd[1][ 3] * ppsi[ 6];
1788  cchi[ 9] += tri[site].offd[1][ 4] * ppsi[ 7];
1789  cchi[ 9] += tri[site].offd[1][ 5] * ppsi[ 8];
1790  cchi[ 9] += conj(tri[site].offd[1][ 9]) * ppsi[10];
1791  cchi[ 9] += conj(tri[site].offd[1][13]) * ppsi[11];
1792 
1793  cchi[10] = tri[site].diag[1][ 4] * ppsi[10];
1794  cchi[10] += tri[site].offd[1][ 6] * ppsi[ 6];
1795  cchi[10] += tri[site].offd[1][ 7] * ppsi[ 7];
1796  cchi[10] += tri[site].offd[1][ 8] * ppsi[ 8];
1797  cchi[10] += tri[site].offd[1][ 9] * ppsi[ 9];
1798  cchi[10] += conj(tri[site].offd[1][14]) * ppsi[11];
1799 
1800  cchi[11] = tri[site].diag[1][ 5] * ppsi[11];
1801  cchi[11] += tri[site].offd[1][10] * ppsi[ 6];
1802  cchi[11] += tri[site].offd[1][11] * ppsi[ 7];
1803  cchi[11] += tri[site].offd[1][12] * ppsi[ 8];
1804  cchi[11] += tri[site].offd[1][13] * ppsi[ 9];
1805  cchi[11] += tri[site].offd[1][14] * ppsi[10];
1806 #elif 0
1807  // Unrolled version 3.
1808  // Took unrolled version 2 and wrote out in real() and imag()
1809  // parts. Rearranged so that all the reals follow each other
1810  // in the output so that we can write linearly
1811 
1812 
1813  cchi[ 0].real() = tri[site].diag[0][0].elem() * ppsi[0].real();
1814  cchi[ 0].real() += tri[site].offd[0][0].real() * ppsi[1].real();
1815  cchi[ 0].real() += tri[site].offd[0][0].imag() * ppsi[1].imag();
1816  cchi[ 0].real() += tri[site].offd[0][1].real() * ppsi[2].real();
1817  cchi[ 0].real() += tri[site].offd[0][1].imag() * ppsi[2].imag();
1818  cchi[ 0].real() += tri[site].offd[0][3].real() * ppsi[3].real();
1819  cchi[ 0].real() += tri[site].offd[0][3].imag() * ppsi[3].imag();
1820  cchi[ 0].real() += tri[site].offd[0][6].real() * ppsi[4].real();
1821  cchi[ 0].real() += tri[site].offd[0][6].imag() * ppsi[4].imag();
1822  cchi[ 0].real() += tri[site].offd[0][10].real() * ppsi[5].real();
1823  cchi[ 0].real() += tri[site].offd[0][10].imag() * ppsi[5].imag();
1824 
1825  cchi[ 0].imag() = tri[site].diag[0][0].elem() * ppsi[ 0].imag();
1826  cchi[ 0].imag() += tri[site].offd[0][0].real() * ppsi[1].imag();
1827  cchi[ 0].imag() -= tri[site].offd[0][0].imag() * ppsi[1].real();
1828  cchi[ 0].imag() += tri[site].offd[0][3].real() * ppsi[3].imag();
1829  cchi[ 0].imag() -= tri[site].offd[0][3].imag() * ppsi[3].real();
1830  cchi[ 0].imag() += tri[site].offd[0][1].real() * ppsi[2].imag();
1831  cchi[ 0].imag() -= tri[site].offd[0][1].imag() * ppsi[2].real();
1832  cchi[ 0].imag() += tri[site].offd[0][6].real() * ppsi[4].imag();
1833  cchi[ 0].imag() -= tri[site].offd[0][6].imag() * ppsi[4].real();
1834  cchi[ 0].imag() += tri[site].offd[0][10].real() * ppsi[5].imag();
1835  cchi[ 0].imag() -= tri[site].offd[0][10].imag() * ppsi[5].real();
1836 
1837 
1838  cchi[ 1].real() = tri[site].diag[0][ 1].elem() * ppsi[ 1].real();
1839  cchi[ 1].real() += tri[site].offd[0][ 0].real() * ppsi[ 0].real();
1840  cchi[ 1].real() -= tri[site].offd[0][ 0].imag() * ppsi[ 0].imag();
1841  cchi[ 1].real() += tri[site].offd[0][ 2].real() * ppsi[ 2].real();
1842  cchi[ 1].real() += tri[site].offd[0][ 2].imag() * ppsi[ 2].imag();
1843  cchi[ 1].real() += tri[site].offd[0][ 4].real() * ppsi[ 3].real();
1844  cchi[ 1].real() += tri[site].offd[0][ 4].imag() * ppsi[ 3].imag();
1845  cchi[ 1].real() += tri[site].offd[0][ 7].real() * ppsi[ 4].real();
1846  cchi[ 1].real() += tri[site].offd[0][ 7].imag() * ppsi[ 4].imag();
1847  cchi[ 1].real() += tri[site].offd[0][11].real() * ppsi[ 5].real();
1848  cchi[ 1].real() += tri[site].offd[0][11].imag() * ppsi[ 5].imag();
1849 
1850 
1851  cchi[ 1].imag() = tri[site].diag[0][ 1].elem() * ppsi[ 1].imag();
1852  cchi[ 1].imag() += tri[site].offd[0][ 0].real() * ppsi[ 0].imag();
1853  cchi[ 1].imag() += tri[site].offd[0][ 0].imag() * ppsi[ 0].real();
1854  cchi[ 1].imag() += tri[site].offd[0][ 2].real() * ppsi[ 2].imag();
1855  cchi[ 1].imag() -= tri[site].offd[0][ 2].imag() * ppsi[ 2].real();
1856  cchi[ 1].imag() += tri[site].offd[0][ 4].real() * ppsi[ 3].imag();
1857  cchi[ 1].imag() -= tri[site].offd[0][ 4].imag() * ppsi[ 3].real();
1858  cchi[ 1].imag() += tri[site].offd[0][ 7].real() * ppsi[ 4].imag();
1859  cchi[ 1].imag() -= tri[site].offd[0][ 7].imag() * ppsi[ 4].real();
1860  cchi[ 1].imag() += tri[site].offd[0][11].real() * ppsi[ 5].imag();
1861  cchi[ 1].imag() -= tri[site].offd[0][11].imag() * ppsi[ 5].real();
1862 
1863 
1864  cchi[ 2].real() = tri[site].diag[0][ 2].elem() * ppsi[ 2].real();
1865  cchi[ 2].real() += tri[site].offd[0][ 1].real() * ppsi[ 0].real();
1866  cchi[ 2].real() -= tri[site].offd[0][ 1].imag() * ppsi[ 0].imag();
1867  cchi[ 2].real() += tri[site].offd[0][ 2].real() * ppsi[ 1].real();
1868  cchi[ 2].real() -= tri[site].offd[0][ 2].imag() * ppsi[ 1].imag();
1869  cchi[ 2].real() += tri[site].offd[0][5].real() * ppsi[ 3].real();
1870  cchi[ 2].real() += tri[site].offd[0][5].imag() * ppsi[ 3].imag();
1871  cchi[ 2].real() += tri[site].offd[0][8].real() * ppsi[ 4].real();
1872  cchi[ 2].real() += tri[site].offd[0][8].imag() * ppsi[ 4].imag();
1873  cchi[ 2].real() += tri[site].offd[0][12].real() * ppsi[ 5].real();
1874  cchi[ 2].real() += tri[site].offd[0][12].imag() * ppsi[ 5].imag();
1875 
1876 
1877  cchi[ 2].imag() = tri[site].diag[0][ 2].elem() * ppsi[ 2].imag();
1878  cchi[ 2].imag() += tri[site].offd[0][ 1].real() * ppsi[ 0].imag();
1879  cchi[ 2].imag() += tri[site].offd[0][ 1].imag() * ppsi[ 0].real();
1880  cchi[ 2].imag() += tri[site].offd[0][ 2].real() * ppsi[ 1].imag();
1881  cchi[ 2].imag() += tri[site].offd[0][ 2].imag() * ppsi[ 1].real();
1882  cchi[ 2].imag() += tri[site].offd[0][5].real() * ppsi[ 3].imag();
1883  cchi[ 2].imag() -= tri[site].offd[0][5].imag() * ppsi[ 3].real();
1884  cchi[ 2].imag() += tri[site].offd[0][8].real() * ppsi[ 4].imag();
1885  cchi[ 2].imag() -= tri[site].offd[0][8].imag() * ppsi[ 4].real();
1886  cchi[ 2].imag() += tri[site].offd[0][12].real() * ppsi[ 5].imag();
1887  cchi[ 2].imag() -= tri[site].offd[0][12].imag() * ppsi[ 5].real();
1888 
1889 
1890  cchi[ 3].real() = tri[site].diag[0][ 3].elem() * ppsi[ 3].real();
1891  cchi[ 3].real() += tri[site].offd[0][ 3].real() * ppsi[ 0].real();
1892  cchi[ 3].real() -= tri[site].offd[0][ 3].imag() * ppsi[ 0].imag();
1893  cchi[ 3].real() += tri[site].offd[0][ 4].real() * ppsi[ 1].real();
1894  cchi[ 3].real() -= tri[site].offd[0][ 4].imag() * ppsi[ 1].imag();
1895  cchi[ 3].real() += tri[site].offd[0][ 5].real() * ppsi[ 2].real();
1896  cchi[ 3].real() -= tri[site].offd[0][ 5].imag() * ppsi[ 2].imag();
1897  cchi[ 3].real() += tri[site].offd[0][ 9].real() * ppsi[ 4].real();
1898  cchi[ 3].real() += tri[site].offd[0][ 9].imag() * ppsi[ 4].imag();
1899  cchi[ 3].real() += tri[site].offd[0][13].real() * ppsi[ 5].real();
1900  cchi[ 3].real() += tri[site].offd[0][13].imag() * ppsi[ 5].imag();
1901 
1902 
1903  cchi[ 3].imag() = tri[site].diag[0][ 3].elem() * ppsi[ 3].imag();
1904  cchi[ 3].imag() += tri[site].offd[0][ 3].real() * ppsi[ 0].imag();
1905  cchi[ 3].imag() += tri[site].offd[0][ 3].imag() * ppsi[ 0].real();
1906  cchi[ 3].imag() += tri[site].offd[0][ 4].real() * ppsi[ 1].imag();
1907  cchi[ 3].imag() += tri[site].offd[0][ 4].imag() * ppsi[ 1].real();
1908  cchi[ 3].imag() += tri[site].offd[0][ 5].real() * ppsi[ 2].imag();
1909  cchi[ 3].imag() += tri[site].offd[0][ 5].imag() * ppsi[ 2].real();
1910  cchi[ 3].imag() += tri[site].offd[0][ 9].real() * ppsi[ 4].imag();
1911  cchi[ 3].imag() -= tri[site].offd[0][ 9].imag() * ppsi[ 4].real();
1912  cchi[ 3].imag() += tri[site].offd[0][13].real() * ppsi[ 5].imag();
1913  cchi[ 3].imag() -= tri[site].offd[0][13].imag() * ppsi[ 5].real();
1914 
1915 
1916  cchi[ 4].real() = tri[site].diag[0][ 4].elem() * ppsi[ 4].real();
1917  cchi[ 4].real() += tri[site].offd[0][ 6].real() * ppsi[ 0].real();
1918  cchi[ 4].real() -= tri[site].offd[0][ 6].imag() * ppsi[ 0].imag();
1919  cchi[ 4].real() += tri[site].offd[0][ 7].real() * ppsi[ 1].real();
1920  cchi[ 4].real() -= tri[site].offd[0][ 7].imag() * ppsi[ 1].imag();
1921  cchi[ 4].real() += tri[site].offd[0][ 8].real() * ppsi[ 2].real();
1922  cchi[ 4].real() -= tri[site].offd[0][ 8].imag() * ppsi[ 2].imag();
1923  cchi[ 4].real() += tri[site].offd[0][ 9].real() * ppsi[ 3].real();
1924  cchi[ 4].real() -= tri[site].offd[0][ 9].imag() * ppsi[ 3].imag();
1925  cchi[ 4].real() += tri[site].offd[0][14].real() * ppsi[ 5].real();
1926  cchi[ 4].real() += tri[site].offd[0][14].imag() * ppsi[ 5].imag();
1927 
1928 
1929  cchi[ 4].imag() = tri[site].diag[0][ 4].elem() * ppsi[ 4].imag();
1930  cchi[ 4].imag() += tri[site].offd[0][ 6].real() * ppsi[ 0].imag();
1931  cchi[ 4].imag() += tri[site].offd[0][ 6].imag() * ppsi[ 0].real();
1932  cchi[ 4].imag() += tri[site].offd[0][ 7].real() * ppsi[ 1].imag();
1933  cchi[ 4].imag() += tri[site].offd[0][ 7].imag() * ppsi[ 1].real();
1934  cchi[ 4].imag() += tri[site].offd[0][ 8].real() * ppsi[ 2].imag();
1935  cchi[ 4].imag() += tri[site].offd[0][ 8].imag() * ppsi[ 2].real();
1936  cchi[ 4].imag() += tri[site].offd[0][ 9].real() * ppsi[ 3].imag();
1937  cchi[ 4].imag() += tri[site].offd[0][ 9].imag() * ppsi[ 3].real();
1938  cchi[ 4].imag() += tri[site].offd[0][14].real() * ppsi[ 5].imag();
1939  cchi[ 4].imag() -= tri[site].offd[0][14].imag() * ppsi[ 5].real();
1940 
1941 
1942  cchi[ 5].real() = tri[site].diag[0][ 5].elem() * ppsi[ 5].real();
1943  cchi[ 5].real() += tri[site].offd[0][10].real() * ppsi[ 0].real();
1944  cchi[ 5].real() -= tri[site].offd[0][10].imag() * ppsi[ 0].imag();
1945  cchi[ 5].real() += tri[site].offd[0][11].real() * ppsi[ 1].real();
1946  cchi[ 5].real() -= tri[site].offd[0][11].imag() * ppsi[ 1].imag();
1947  cchi[ 5].real() += tri[site].offd[0][12].real() * ppsi[ 2].real();
1948  cchi[ 5].real() -= tri[site].offd[0][12].imag() * ppsi[ 2].imag();
1949  cchi[ 5].real() += tri[site].offd[0][13].real() * ppsi[ 3].real();
1950  cchi[ 5].real() -= tri[site].offd[0][13].imag() * ppsi[ 3].imag();
1951  cchi[ 5].real() += tri[site].offd[0][14].real() * ppsi[ 4].real();
1952  cchi[ 5].real() -= tri[site].offd[0][14].imag() * ppsi[ 4].imag();
1953 
1954 
1955  cchi[ 5].imag() = tri[site].diag[0][ 5].elem() * ppsi[ 5].imag();
1956  cchi[ 5].imag() += tri[site].offd[0][10].real() * ppsi[ 0].imag();
1957  cchi[ 5].imag() += tri[site].offd[0][10].imag() * ppsi[ 0].real();
1958  cchi[ 5].imag() += tri[site].offd[0][11].real() * ppsi[ 1].imag();
1959  cchi[ 5].imag() += tri[site].offd[0][11].imag() * ppsi[ 1].real();
1960  cchi[ 5].imag() += tri[site].offd[0][12].real() * ppsi[ 2].imag();
1961  cchi[ 5].imag() += tri[site].offd[0][12].imag() * ppsi[ 2].real();
1962  cchi[ 5].imag() += tri[site].offd[0][13].real() * ppsi[ 3].imag();
1963  cchi[ 5].imag() += tri[site].offd[0][13].imag() * ppsi[ 3].real();
1964  cchi[ 5].imag() += tri[site].offd[0][14].real() * ppsi[ 4].imag();
1965  cchi[ 5].imag() += tri[site].offd[0][14].imag() * ppsi[ 4].real();
1966 
1967 
1968  cchi[ 6].real() = tri[site].diag[1][0].elem() * ppsi[6].real();
1969  cchi[ 6].real() += tri[site].offd[1][0].real() * ppsi[7].real();
1970  cchi[ 6].real() += tri[site].offd[1][0].imag() * ppsi[7].imag();
1971  cchi[ 6].real() += tri[site].offd[1][1].real() * ppsi[8].real();
1972  cchi[ 6].real() += tri[site].offd[1][1].imag() * ppsi[8].imag();
1973  cchi[ 6].real() += tri[site].offd[1][3].real() * ppsi[9].real();
1974  cchi[ 6].real() += tri[site].offd[1][3].imag() * ppsi[9].imag();
1975  cchi[ 6].real() += tri[site].offd[1][6].real() * ppsi[10].real();
1976  cchi[ 6].real() += tri[site].offd[1][6].imag() * ppsi[10].imag();
1977  cchi[ 6].real() += tri[site].offd[1][10].real() * ppsi[11].real();
1978  cchi[ 6].real() += tri[site].offd[1][10].imag() * ppsi[11].imag();
1979 
1980  cchi[ 6].imag() = tri[site].diag[1][0].elem() * ppsi[6].imag();
1981  cchi[ 6].imag() += tri[site].offd[1][0].real() * ppsi[7].imag();
1982  cchi[ 6].imag() -= tri[site].offd[1][0].imag() * ppsi[7].real();
1983  cchi[ 6].imag() += tri[site].offd[1][1].real() * ppsi[8].imag();
1984  cchi[ 6].imag() -= tri[site].offd[1][1].imag() * ppsi[8].real();
1985  cchi[ 6].imag() += tri[site].offd[1][3].real() * ppsi[9].imag();
1986  cchi[ 6].imag() -= tri[site].offd[1][3].imag() * ppsi[9].real();
1987  cchi[ 6].imag() += tri[site].offd[1][6].real() * ppsi[10].imag();
1988  cchi[ 6].imag() -= tri[site].offd[1][6].imag() * ppsi[10].real();
1989  cchi[ 6].imag() += tri[site].offd[1][10].real() * ppsi[11].imag();
1990  cchi[ 6].imag() -= tri[site].offd[1][10].imag() * ppsi[11].real();
1991 
1992 
1993  cchi[ 7].real() = tri[site].diag[1][ 1].elem() * ppsi[ 7].real();
1994  cchi[ 7].real() += tri[site].offd[1][ 0].real() * ppsi[ 6].real();
1995  cchi[ 7].real() -= tri[site].offd[1][ 0].imag() * ppsi[ 6].imag();
1996  cchi[ 7].real() += tri[site].offd[1][ 2].real() * ppsi[ 8].real();
1997  cchi[ 7].real() += tri[site].offd[1][ 2].imag() * ppsi[ 8].imag();
1998  cchi[ 7].real() += tri[site].offd[1][ 4].real() * ppsi[ 9].real();
1999  cchi[ 7].real() += tri[site].offd[1][ 4].imag() * ppsi[ 9].imag();
2000  cchi[ 7].real() += tri[site].offd[1][ 7].real() * ppsi[10].real();
2001  cchi[ 7].real() += tri[site].offd[1][ 7].imag() * ppsi[10].imag();
2002  cchi[ 7].real() += tri[site].offd[1][11].real() * ppsi[11].real();
2003  cchi[ 7].real() += tri[site].offd[1][11].imag() * ppsi[11].imag();
2004 
2005  cchi[ 7].imag() = tri[site].diag[1][ 1].elem() * ppsi[ 7].imag();
2006  cchi[ 7].imag() += tri[site].offd[1][ 0].real() * ppsi[ 6].imag();
2007  cchi[ 7].imag() += tri[site].offd[1][ 0].imag() * ppsi[ 6].real();
2008  cchi[ 7].imag() += tri[site].offd[1][ 2].real() * ppsi[ 8].imag();
2009  cchi[ 7].imag() -= tri[site].offd[1][ 2].imag() * ppsi[ 8].real();
2010  cchi[ 7].imag() += tri[site].offd[1][ 4].real() * ppsi[ 9].imag();
2011  cchi[ 7].imag() -= tri[site].offd[1][ 4].imag() * ppsi[ 9].real();
2012  cchi[ 7].imag() += tri[site].offd[1][ 7].real() * ppsi[10].imag();
2013  cchi[ 7].imag() -= tri[site].offd[1][ 7].imag() * ppsi[10].real();
2014  cchi[ 7].imag() += tri[site].offd[1][11].real() * ppsi[11].imag();
2015  cchi[ 7].imag() -= tri[site].offd[1][11].imag() * ppsi[11].real();
2016 
2017 
2018  cchi[ 8].real() = tri[site].diag[1][ 2].elem() * ppsi[ 8].real();
2019  cchi[ 8].real() += tri[site].offd[1][ 1].real() * ppsi[ 6].real();
2020  cchi[ 8].real() -= tri[site].offd[1][ 1].imag() * ppsi[ 6].imag();
2021  cchi[ 8].real() += tri[site].offd[1][ 2].real() * ppsi[ 7].real();
2022  cchi[ 8].real() -= tri[site].offd[1][ 2].imag() * ppsi[ 7].imag();
2023  cchi[ 8].real() += tri[site].offd[1][5].real() * ppsi[ 9].real();
2024  cchi[ 8].real() += tri[site].offd[1][5].imag() * ppsi[ 9].imag();
2025  cchi[ 8].real() += tri[site].offd[1][8].real() * ppsi[10].real();
2026  cchi[ 8].real() += tri[site].offd[1][8].imag() * ppsi[10].imag();
2027  cchi[ 8].real() += tri[site].offd[1][12].real() * ppsi[11].real();
2028  cchi[ 8].real() += tri[site].offd[1][12].imag() * ppsi[11].imag();
2029 
2030  cchi[ 8].imag() = tri[site].diag[1][ 2].elem() * ppsi[ 8].imag();
2031  cchi[ 8].imag() += tri[site].offd[1][ 1].real() * ppsi[ 6].imag();
2032  cchi[ 8].imag() += tri[site].offd[1][ 1].imag() * ppsi[ 6].real();
2033  cchi[ 8].imag() += tri[site].offd[1][ 2].real() * ppsi[ 7].imag();
2034  cchi[ 8].imag() += tri[site].offd[1][ 2].imag() * ppsi[ 7].real();
2035  cchi[ 8].imag() += tri[site].offd[1][5].real() * ppsi[ 9].imag();
2036  cchi[ 8].imag() -= tri[site].offd[1][5].imag() * ppsi[ 9].real();
2037  cchi[ 8].imag() += tri[site].offd[1][8].real() * ppsi[10].imag();
2038  cchi[ 8].imag() -= tri[site].offd[1][8].imag() * ppsi[10].real();
2039  cchi[ 8].imag() += tri[site].offd[1][12].real() * ppsi[11].imag();
2040  cchi[ 8].imag() -= tri[site].offd[1][12].imag() * ppsi[11].real();
2041 
2042 
2043  cchi[ 9].real() = tri[site].diag[1][ 3].elem() * ppsi[ 9].real();
2044  cchi[ 9].real() += tri[site].offd[1][ 3].real() * ppsi[ 6].real();
2045  cchi[ 9].real() -= tri[site].offd[1][ 3].imag() * ppsi[ 6].imag();
2046  cchi[ 9].real() += tri[site].offd[1][ 4].real() * ppsi[ 7].real();
2047  cchi[ 9].real() -= tri[site].offd[1][ 4].imag() * ppsi[ 7].imag();
2048  cchi[ 9].real() += tri[site].offd[1][ 5].real() * ppsi[ 8].real();
2049  cchi[ 9].real() -= tri[site].offd[1][ 5].imag() * ppsi[ 8].imag();
2050  cchi[ 9].real() += tri[site].offd[1][ 9].real() * ppsi[10].real();
2051  cchi[ 9].real() += tri[site].offd[1][ 9].imag() * ppsi[10].imag();
2052  cchi[ 9].real() += tri[site].offd[1][13].real() * ppsi[11].real();
2053  cchi[ 9].real() += tri[site].offd[1][13].imag() * ppsi[11].imag();
2054 
2055  cchi[ 9].imag() = tri[site].diag[1][ 3].elem() * ppsi[ 9].imag();
2056  cchi[ 9].imag() += tri[site].offd[1][ 3].real() * ppsi[ 6].imag();
2057  cchi[ 9].imag() += tri[site].offd[1][ 3].imag() * ppsi[ 6].real();
2058  cchi[ 9].imag() += tri[site].offd[1][ 4].real() * ppsi[ 7].imag();
2059  cchi[ 9].imag() += tri[site].offd[1][ 4].imag() * ppsi[ 7].real();
2060  cchi[ 9].imag() += tri[site].offd[1][ 5].real() * ppsi[ 8].imag();
2061  cchi[ 9].imag() += tri[site].offd[1][ 5].imag() * ppsi[ 8].real();
2062  cchi[ 9].imag() += tri[site].offd[1][ 9].real() * ppsi[10].imag();
2063  cchi[ 9].imag() -= tri[site].offd[1][ 9].imag() * ppsi[10].real();
2064  cchi[ 9].imag() += tri[site].offd[1][13].real() * ppsi[11].imag();
2065  cchi[ 9].imag() -= tri[site].offd[1][13].imag() * ppsi[11].real();
2066 
2067 
2068  cchi[10].real() = tri[site].diag[1][ 4].elem() * ppsi[10].real();
2069  cchi[10].real() += tri[site].offd[1][ 6].real() * ppsi[ 6].real();
2070  cchi[10].real() -= tri[site].offd[1][ 6].imag() * ppsi[ 6].imag();
2071  cchi[10].real() += tri[site].offd[1][ 7].real() * ppsi[ 7].real();
2072  cchi[10].real() -= tri[site].offd[1][ 7].imag() * ppsi[ 7].imag();
2073  cchi[10].real() += tri[site].offd[1][ 8].real() * ppsi[ 8].real();
2074  cchi[10].real() -= tri[site].offd[1][ 8].imag() * ppsi[ 8].imag();
2075  cchi[10].real() += tri[site].offd[1][ 9].real() * ppsi[ 9].real();
2076  cchi[10].real() -= tri[site].offd[1][ 9].imag() * ppsi[ 9].imag();
2077  cchi[10].real() += tri[site].offd[1][14].real() * ppsi[11].real();
2078  cchi[10].real() += tri[site].offd[1][14].imag() * ppsi[11].imag();
2079 
2080  cchi[10].imag() = tri[site].diag[1][ 4].elem() * ppsi[10].imag();
2081  cchi[10].imag() += tri[site].offd[1][ 6].real() * ppsi[ 6].imag();
2082  cchi[10].imag() += tri[site].offd[1][ 6].imag() * ppsi[ 6].real();
2083  cchi[10].imag() += tri[site].offd[1][ 7].real() * ppsi[ 7].imag();
2084  cchi[10].imag() += tri[site].offd[1][ 7].imag() * ppsi[ 7].real();
2085  cchi[10].imag() += tri[site].offd[1][ 8].real() * ppsi[ 8].imag();
2086  cchi[10].imag() += tri[site].offd[1][ 8].imag() * ppsi[ 8].real();
2087  cchi[10].imag() += tri[site].offd[1][ 9].real() * ppsi[ 9].imag();
2088  cchi[10].imag() += tri[site].offd[1][ 9].imag() * ppsi[ 9].real();
2089  cchi[10].imag() += tri[site].offd[1][14].real() * ppsi[11].imag();
2090  cchi[10].imag() -= tri[site].offd[1][14].imag() * ppsi[11].real();
2091 
2092 
2093  cchi[11].real() = tri[site].diag[1][ 5].elem() * ppsi[11].real();
2094  cchi[11].real() += tri[site].offd[1][10].real() * ppsi[ 6].real();
2095  cchi[11].real() -= tri[site].offd[1][10].imag() * ppsi[ 6].imag();
2096  cchi[11].real() += tri[site].offd[1][11].real() * ppsi[ 7].real();
2097  cchi[11].real() -= tri[site].offd[1][11].imag() * ppsi[ 7].imag();
2098  cchi[11].real() += tri[site].offd[1][12].real() * ppsi[ 8].real();
2099  cchi[11].real() -= tri[site].offd[1][12].imag() * ppsi[ 8].imag();
2100  cchi[11].real() += tri[site].offd[1][13].real() * ppsi[ 9].real();
2101  cchi[11].real() -= tri[site].offd[1][13].imag() * ppsi[ 9].imag();
2102  cchi[11].real() += tri[site].offd[1][14].real() * ppsi[10].real();
2103  cchi[11].real() -= tri[site].offd[1][14].imag() * ppsi[10].imag();
2104 
2105  cchi[11].imag() = tri[site].diag[1][ 5].elem() * ppsi[11].imag();
2106  cchi[11].imag() += tri[site].offd[1][10].real() * ppsi[ 6].imag();
2107  cchi[11].imag() += tri[site].offd[1][10].imag() * ppsi[ 6].real();
2108  cchi[11].imag() += tri[site].offd[1][11].real() * ppsi[ 7].imag();
2109  cchi[11].imag() += tri[site].offd[1][11].imag() * ppsi[ 7].real();
2110  cchi[11].imag() += tri[site].offd[1][12].real() * ppsi[ 8].imag();
2111  cchi[11].imag() += tri[site].offd[1][12].imag() * ppsi[ 8].real();
2112  cchi[11].imag() += tri[site].offd[1][13].real() * ppsi[ 9].imag();
2113  cchi[11].imag() += tri[site].offd[1][13].imag() * ppsi[ 9].real();
2114  cchi[11].imag() += tri[site].offd[1][14].real() * ppsi[10].imag();
2115  cchi[11].imag() += tri[site].offd[1][14].imag() * ppsi[10].real();
2116 #endif
2117  }
2118 
2119  END_CODE();
2120  }// Function
2121  } // Namespace
2122 
2123  /**
2124  * Apply a dslash
2125  *
2126  * Performs the operation
2127  *
2128  * chi <- (L + D + L^dag) . psi
2129  *
2130  * where
2131  * L is a lower triangular matrix
2132  * D is the real diagonal. (stored together in type TRIANG)
2133  *
2134  * Arguments:
2135  * \param chi result (Write)
2136  * \param psi source (Read)
2137  * \param isign D'^dag or D' ( MINUS | PLUS ) resp. (Read)
2138  * \param cb Checkerboard of OUTPUT std::vector (Read)
2139  */
2140  template<typename T, typename U>
2142  enum PlusMinus isign, int cb) const
2143  {
2144 #ifndef QDP_IS_QDPJIT
2145  START_CODE();
2146 
2147  if ( Ns != 4 ) {
2148  QDPIO::cerr << __func__ << ": CloverTerm::apply requires Ns==4" << std::endl;
2149  QDP_abort(1);
2150  }
2151 
2152  QDPCloverEnv::ApplyArgs<T> arg = { chi,psi,tri,cb };
2153  int num_sites = rb[cb].siteTable().size();
2154 
2155  // The dispatch function is at the end of the file
2156  // ought to work for non-threaded targets too...
2157  dispatch_to_threads(num_sites, arg, QDPCloverEnv::applySiteLoop<T>);
2158  (*this).getFermBC().modifyF(chi, QDP::rb[cb]);
2159 
2160  END_CODE();
2161 #endif
2162  }
2163 
2164 
2165  namespace QDPCloverEnv {
2166  template<typename R>
2167  struct QUDAPackArgs {
2168  int cb;
2169  multi1d< QUDAPackedClovSite<R> >& quda_array;
2171  };
2172 
2173  template<typename R>
2174  void qudaPackSiteLoop(int lo, int hi, int myId, QUDAPackArgs<R>* a) {
2175  int cb = a->cb;
2176  int Ns2 = Ns/2;
2177 
2178  multi1d< QUDAPackedClovSite<R> >& quda_array = a->quda_array;
2179  const PrimitiveClovTriang< R >* tri=a->tri;
2180 
2181  const int idtab[15]={0,1,3,6,10,2,4,7,11,5,8,12,9,13,14};
2182 
2183  for(int ssite=lo; ssite < hi; ++ssite) {
2184  int site = rb[cb].siteTable()[ssite];
2185  // First Chiral Block
2186  for(int i=0; i < 6; i++) {
2187  quda_array[site].diag1[i] = tri[site].diag[0][i].elem();
2188  }
2189 
2190  int target_index=0;
2191 
2192  for(int col=0; col < Nc*Ns2-1; col++) {
2193  for(int row=col+1; row < Nc*Ns2; row++) {
2194 
2195  int source_index = row*(row-1)/2 + col;
2196 
2197  quda_array[site].offDiag1[target_index][0] = tri[site].offd[0][source_index].real();
2198  quda_array[site].offDiag1[target_index][1] = tri[site].offd[0][source_index].imag();
2199  target_index++;
2200  }
2201  }
2202  // Second Chiral Block
2203  for(int i=0; i < 6; i++) {
2204  quda_array[site].diag2[i] = tri[site].diag[1][i].elem();
2205  }
2206 
2207  target_index=0;
2208  for(int col=0; col < Nc*Ns2-1; col++) {
2209  for(int row=col+1; row < Nc*Ns2; row++) {
2210 
2211  int source_index = row*(row-1)/2 + col;
2212  quda_array[site].offDiag2[target_index][0] = tri[site].offd[1][source_index].real();
2213  quda_array[site].offDiag2[target_index][1] = tri[site].offd[1][source_index].imag();
2214  target_index++;
2215  }
2216  }
2217  }
2218  }
2219  }
2220 
2221  template<typename T, typename U>
2222  void QDPCloverTermT<T,U>::packForQUDA(multi1d<QUDAPackedClovSite<typename WordType<T>::Type_t> >& quda_array, int cb) const
2223  {
2224  typedef typename WordType<T>::Type_t REALT;
2225  int num_sites = rb[cb].siteTable().size();
2226 
2227  QDPCloverEnv::QUDAPackArgs<REALT> args = { cb, quda_array,tri };
2228  dispatch_to_threads(num_sites, args, QDPCloverEnv::qudaPackSiteLoop<REALT>);
2229 
2230 
2231 
2232  }
2233 
2234 
2235 
2239 } // End Namespace Chroma
2240 
2241 
2242 #endif
Base class for all fermion action boundary conditions.
Definition: fermbc.h:20
Support class for fermion actions and linear operators.
Definition: state.h:94
Class for counted reference semantics.
Definition: handle.h:33
Handle< FermBC< T, multi1d< U >, multi1d< U > > > fbc
CloverFermActParams param
multi1d< bool > choles_done
void create(Handle< FermState< T, multi1d< U >, multi1d< U > > > fs, const CloverFermActParams &param_)
Creation routine.
Double cholesDet(int cb) const
Computes the inverse of the term on cb using Cholesky.
~QDPCloverTermT()
No real need for cleanup here.
OLattice< PScalar< PScalar< RScalar< typename WordType< T >::Type_t > > > > LatticeREAL
void applySite(T &chi, const T &psi, enum PlusMinus isign, int site) const
OScalar< PScalar< PScalar< RScalar< REALT > > > > RealT
void packForQUDA(multi1d< QUDAPackedClovSite< REALT > > &quda_pack, int cb) const
PACK UP the Clover term for QUDA library:
const FermBC< T, multi1d< U >, multi1d< U > > & getFermBC() const
Return the fermion BC object for this linear operator.
void apply(T &chi, const T &psi, enum PlusMinus isign, int cb) const
void choles(int cb)
Computes the inverse of the term on cb using Cholesky.
void chlclovms(LatticeREAL &log_diag, int cb)
Invert the clover term on cb.
PrimitiveClovTriang< REALT > * tri
const multi1d< U > & getU() const
Get the u field.
void triacntr(U &B, int mat, int cb) const
Calculates Tr_D ( Gamma_mat L )
void ldagdlinv(LatticeREAL &tr_log_diag, int cb)
QDPCloverTermT()
Empty constructor. Must use create later.
void makeClov(const multi1d< U > &f, const RealT &diag_mass)
Create the clover term on cb.
WordType< T >::Type_t REALT
Real getCloverCoeff(int mu, int nu) const
Calculates Tr_D ( Gamma_mat L )
Parameters for Clover fermion action.
Clover term linear operator.
int mu
Definition: cool.cc:24
int nu
Definition: cool.cc:25
void read(XMLReader &xml, const std::string &path, AsqtadFermActParams &param)
Read parameters.
void write(XMLWriter &xml, const std::string &path, const AsqtadFermActParams &param)
Writer parameters.
void block(LatticeColorMatrix &u_block, const multi1d< LatticeColorMatrix > &u, int mu, int bl_level, const Real &BlkAccu, int BlkMax, int j_decay)
Construct block links.
Definition: block.cc:42
unsigned elem_ij
Definition: ldumul_w.cc:38
unsigned j
Definition: ldumul_w.cc:35
unsigned elem_ji
Definition: ldumul_w.cc:39
unsigned n
Definition: ldumul_w.cc:36
Calculates the antihermitian field strength tensor iF(mu,nu)
Nd
Definition: meslate.cc:74
void qudaPackSiteLoop(int lo, int hi, int myId, QUDAPackArgs< R, TD, TO > *a)
void LDagDLInvSiteLoop(int lo, int hi, int myId, LDagDLInvArgs< U > *a)
void applySiteLoop(int lo, int hi, int MyId, ApplyArgs< T > *arg)
void triaCntrSiteLoop(int lo, int hi, int myId, TriaCntrArgs< U > *a)
void cholesSiteLoop(int lo, int hi, int myId, LDagDLInvArgs< U > *a)
void makeClovSiteLoop(int lo, int hi, int myId, QDPCloverMakeClovArg< U > *a)
Asqtad Staggered-Dirac operator.
Definition: klein_gord.cc:10
static multi1d< LatticeColorMatrix > u
QDPCloverTermT< LatticeFermionF, LatticeColorMatrixF > QDPCloverTermF
Double one
Definition: invbicg.cc:105
LinOpSysSolverMGProtoClover::T T
int i
Definition: pbg5p_w.cc:55
QDPCloverTermT< LatticeFermionD, LatticeColorMatrixD > QDPCloverTermD
multi1d< LatticeFermion > chi(Ncb)
void mesField(multi1d< LatticeColorMatrixF > &f, const multi1d< LatticeColorMatrixF > &u)
Calculates the antihermitian field strength tensor iF(mu,nu)
Definition: mesfield.cc:80
Complex a
Definition: invbicg.cc:95
LatticeFermion psi
Definition: mespbg5p_w.cc:35
DComplex d
Definition: invbicg.cc:99
QDPCloverTermT< LatticeFermion, LatticeColorMatrix > QDPCloverTerm
START_CODE()
int cb
Definition: invbicg.cc:120
Double zero
Definition: invbicg.cc:106
int k
Definition: invbicg.cc:119
multi1d< LatticeFermion > s(Ncb)
const T1 const T2 const T3 & f3
Definition: gtest.h:1327
const T1 const T2 & f2
Definition: gtest.h:1321
const T1 & f1
Definition: gtest.h:1316
const T1 const T2 const T3 const T4 const T5 & f5
Definition: gtest.h:1339
const T1 const T2 const T3 const T4 & f4
Definition: gtest.h:1333
FloatingPoint< double > Double
Definition: gtest.h:7351
::std::string string
Definition: gtest.h:1979
int l
Definition: pade_trln_w.cc:111
Double sum
Definition: qtopcor.cc:37
Support class for fermion actions and linear operators.
Params for clover ferm acts.
Special structure used for triangular objects.
RScalar< R > diag[2][2 *Nc]
RComplex< R > offd[2][2 *Nc *Nc-Nc]
const PrimitiveClovTriang< REALT > * tri
OScalar< PScalar< PScalar< RScalar< REALT > > > > RealT
OLattice< PScalar< PScalar< RScalar< REALT > > > > LatticeRealT
PrimitiveClovTriang< REALT > * tri
OScalar< PScalar< PScalar< RScalar< REALT > > > > RealT
const PrimitiveClovTriang< R > * tri
multi1d< QUDAPackedClovSite< R > > & quda_array
const PrimitiveClovTriang< REALT > * tri
multi1d< LatticeColorMatrix > U