CHROMA
clover_term_llvm_w.h
Go to the documentation of this file.
1 // -*- C++ -*-
2 /*! \file
3  * \brief Clover term linear operator
4  */
5 
6 #ifndef __clover_term_llvm_w_h__
7 #define __clover_term_llvm_w_h__
8 
9 #warning "Using QPD-JIT/LLVM clover"
10 
11 #include "state.h"
14 #include "meas/glue/mesfield.h"
15 
16 
17 namespace QDP
18 {
20  double acc_time;
22  public:
24  static PackForQUDATimer singleton;
25  return singleton;
26  }
27 
28  double& get() { return acc_time; }
29  const double& get() const { return acc_time; }
30  };
31 
32 
33  template<typename T>
34  struct PComp
35  {
36  typedef T Sub_t;
37  enum { ThisSize = 2 };
38  T comp[2];
39  };
40 
41  template<class T> struct PCompREG;
42 
43  template<typename T>
44  struct PCompJIT: public BaseJIT<T,2>
45  {
46  template<class T1>
48  //std::cout << __PRETTY_FUNCTION__ << "\n";
49  elem(0) = rhs.elem(0);
50  elem(1) = rhs.elem(1);
51  return *this;
52  }
53 
54 
55  inline T& elem(int i) { return this->arrayF(i); }
56  inline const T& elem(int i) const { return this->arrayF(i); }
57  };
58 
59  template<class T>
60  struct PCompREG
61  {
62  T F[2];
63  void setup(const PCompJIT< typename JITType<T>::Type_t >& rhs ) {
64  F[0].setup( rhs.elem(0) );
65  F[1].setup( rhs.elem(1) );
66  }
67  inline T& elem(int i) { return F[i]; }
68  inline const T& elem(int i) const { return F[i]; }
69  };
70 
71  template<class T>
72  struct JITType<PComp<T> >
73  {
75  };
76 
77  template<class T>
78  struct JITType<PCompREG<T> >
79  {
81  };
82 
83  template<class T>
84  struct REGType<PCompJIT<T> >
85  {
87  };
88 
89  template<class T>
90  struct WordType<PComp<T> >
91  {
92  typedef typename WordType<T>::Type_t Type_t;
93  };
94 
95  template<class T>
96  struct WordType<PCompJIT<T> >
97  {
98  typedef typename WordType<T>::Type_t Type_t;
99  };
100 
101 
102 
103 
104 
105 
106  template<typename T>
107  struct PTriDia
108  {
109  typedef T Sub_t;
110  enum { ThisSize = 2*Nc };
111  T diag[2*Nc];
112  };
113 
114  template<class T> struct PTriDiaREG;
115 
116  template<typename T>
117  struct PTriDiaJIT: public BaseJIT<T,2*Nc>
118  {
119  template<class T1>
121  //std::cout << __PRETTY_FUNCTION__ << "\n";
122  for ( int i = 0 ; i < 2 * Nc ; i++ )
123  elem(i) = rhs.elem(i);
124  return *this;
125  }
126 
127  inline T& elem(int i) { return this->arrayF(i); }
128  inline const T& elem(int i) const { return this->arrayF(i); }
129  };
130 
131  template<class T>
132  struct PTriDiaREG
133  {
134  T F[2*Nc];
135  void setup(const PTriDiaJIT< typename JITType<T>::Type_t >& rhs ) {
136  for (int i=0;i<2*Nc;++i)
137  F[i].setup( rhs.elem(i) );
138  }
139  inline T& elem(int i) { return F[i]; }
140  inline const T& elem(int i) const { return F[i]; }
141  };
142 
143  template<class T>
144  struct JITType<PTriDia<T> >
145  {
147  };
148 
149  template<class T>
150  struct JITType<PTriDiaREG<T> >
151  {
153  };
154 
155  template<class T>
156  struct REGType<PTriDiaJIT<T> >
157  {
159  };
160 
161  template<class T>
162  struct WordType<PTriDia<T> >
163  {
164  typedef typename WordType<T>::Type_t Type_t;
165  };
166 
167  template<class T>
168  struct WordType<PTriDiaJIT<T> >
169  {
170  typedef typename WordType<T>::Type_t Type_t;
171  };
172 
173 
174 
175 
176 
177 
178  template<typename T>
179  struct PTriOff
180  {
181  typedef T Sub_t;
182  enum { ThisSize = 2*Nc*Nc-Nc };
183  T offd[2*Nc*Nc-Nc];
184  };
185 
186  template<class T> struct PTriOffREG;
187 
188  template<typename T>
189  struct PTriOffJIT: public BaseJIT<T,2*Nc*Nc-Nc>
190  {
191  template<class T1>
193  //std::cout << __PRETTY_FUNCTION__ << "\n";
194  for ( int i = 0 ; i < 2*Nc*Nc-Nc ; i++ )
195  elem(i) = rhs.elem(i);
196  return *this;
197  }
198 
199  inline T& elem(int i) { return this->arrayF(i); }
200  inline const T& elem(int i) const { return this->arrayF(i); }
201  };
202 
203  template<class T>
204  struct PTriOffREG
205  {
206  T F[2*Nc*Nc-Nc];
207  void setup(const PTriOffJIT< typename JITType<T>::Type_t >& rhs ) {
208  for (int i=0;i<2*Nc*Nc-Nc;++i)
209  F[i].setup( rhs.elem(i) );
210  }
211  inline T& elem(int i) { return F[i]; }
212  inline const T& elem(int i) const { return F[i]; }
213  };
214 
215  template<class T>
216  struct JITType<PTriOff<T> >
217  {
219  };
220 
221  template<class T>
222  struct JITType<PTriOffREG<T> >
223  {
225  };
226 
227  template<class T>
228  struct REGType<PTriOffJIT<T> >
229  {
231  };
232 
233  template<class T>
234  struct WordType<PTriOff<T> >
235  {
236  typedef typename WordType<T>::Type_t Type_t;
237  };
238 
239  template<class T>
240  struct WordType<PTriOffJIT<T> >
241  {
242  typedef typename WordType<T>::Type_t Type_t;
243  };
244 
245 
246 #if defined(QDP_USE_PROFILING)
247 template<class T>
248 struct LeafFunctor<PComp<T>, PrintTag>
249 {
250  typedef int Type_t;
251  static int apply(const PComp<T> &s, const PrintTag &f)
252  {
253  //LeafFunctor<T,PrintTag>::apply(s.elem(),f);
254  return 0;
255  }
256 };
257 template<class T>
258 struct LeafFunctor<PTriDia<T>, PrintTag>
259 {
260  typedef int Type_t;
261  static int apply(const PTriDia<T> &s, const PrintTag &f)
262  {
263  //LeafFunctor<T,PrintTag>::apply(s.elem(),f);
264  return 0;
265  }
266 };
267 template<class T>
268 struct LeafFunctor<PTriOff<T>, PrintTag>
269 {
270  typedef int Type_t;
271  static int apply(const PTriOff<T> &s, const PrintTag &f)
272  {
273  //LeafFunctor<T,PrintTag>::apply(s.elem(),f);
274  return 0;
275  }
276 };
277 #endif
278 
279 
280 
281 }
282 
283 
284 
285 namespace Chroma
286 {
287  template<typename R>
289  R diag1[6];
290  R offDiag1[15][2];
291  R diag2[6];
292  R offDiag2[15][2];
293  };
294 
295 
296  template<typename T, typename U>
297  class LLVMCloverTermT : public CloverTermBase<T, U>
298  {
299  public:
300  // Typedefs to save typing
301  typedef typename WordType<T>::Type_t REALT;
302 
303  typedef OLattice< PScalar< PScalar< RScalar< Word< REALT> > > > > LatticeREAL;
304  typedef OScalar< PScalar< PScalar< RScalar< Word< REALT> > > > > RealT;
305 
306  //! Empty constructor. Must use create later
307  LLVMCloverTermT();
308 
309  //! No real need for cleanup here
311 
312  //! Creation routine
313  void create(Handle< FermState<T, multi1d<U>, multi1d<U> > > fs,
314  const CloverFermActParams& param_);
315 
316  virtual void create(Handle< FermState<T, multi1d<U>, multi1d<U> > > fs,
317  const CloverFermActParams& param_,
318  const LLVMCloverTermT<T,U>& from_);
319 
320  //! Computes the inverse of the term on cb using Cholesky
321  /*!
322  * \param cb checkerboard of work (Read)
323  */
324  void choles(int cb);
325 
326  //! Computes the inverse of the term on cb using Cholesky
327  /*!
328  * \param cb checkerboard of work (Read)
329  * \return logarithm of the determinant
330  */
331  Double cholesDet(int cb) const ;
332 
333  /**
334  * Apply a dslash
335  *
336  * Performs the operation
337  *
338  * chi <- (L + D + L^dag) . psi
339  *
340  * where
341  * L is a lower triangular matrix
342  * D is the real diagonal. (stored together in type TRIANG)
343  *
344  * Arguments:
345  * \param chi result (Write)
346  * \param psi source (Read)
347  * \param isign D'^dag or D' ( MINUS | PLUS ) resp. (Read)
348  * \param cb Checkerboard of OUTPUT std::vector (Read)
349  */
350  void apply (T& chi, const T& psi, enum PlusMinus isign, int cb) const;
351 
352 
353  void applySite(T& chi, const T& psi, enum PlusMinus isign, int site) const;
354 
355  //! Calculates Tr_D ( Gamma_mat L )
356  void triacntr(U& B, int mat, int cb) const;
357 
358  //! Return the fermion BC object for this linear operator
359  const FermBC<T, multi1d<U>, multi1d<U> >& getFermBC() const {return *fbc;}
360 
361  //! PACK UP the Clover term for QUDA library:
362  void packForQUDA(multi1d<QUDAPackedClovSite<REALT> >& quda_pack, int cb) const;
363 
364  int getDiaId() const { return tri_dia.getId(); }
365  int getOffId() const { return tri_off.getId(); }
366 
367  using DiagType = OLattice<PComp<PTriDia<RScalar <Word<REALT> > > > >;
368  using OffDiagType = OLattice<PComp<PTriOff<RComplex<Word<REALT> > > > >;
369 
370  const DiagType& getDiagBuffer() const { return tri_dia ; }
371  const OffDiagType& getOffDiagBuffer() const { return tri_off; }
372 
373  protected:
374  //! Create the clover term on cb
375  /*!
376  * \param f field strength tensor F(mu,nu) (Read)
377  * \param cb checkerboard (Read)
378  */
379  void makeClov(const multi1d<U>& f, const RealT& diag_mass);
380 
381  //! Invert the clover term on cb
382  //void chlclovms(LatticeREAL& log_diag, int cb);
383  void ldagdlinv(LatticeREAL& tr_log_diag, int cb);
384 
385  //! Get the u field
386  const multi1d<U>& getU() const {return u;}
387 
388  //! Calculates Tr_D ( Gamma_mat L )
389  Real getCloverCoeff(int mu, int nu) const;
390 
391 
392  private:
394  multi1d<U> u;
396  LatticeREAL tr_log_diag_; // Fill this out during create
397  // but save the global sum until needed.
398  multi1d<bool> choles_done; // Keep note of whether the decomposition has been done
399  // on a particular checkerboard.
400 
401  OLattice<PComp<PTriDia<RScalar <Word<REALT> > > > > tri_dia;
402  OLattice<PComp<PTriOff<RComplex<Word<REALT> > > > > tri_off;
403 
404  };
405 
406 
407  // Empty constructor. Must use create later
408  template<typename T, typename U>
410 
411  // Now copy
412  template<typename T, typename U>
413  void LLVMCloverTermT<T,U>::create(Handle< FermState<T,multi1d<U>,multi1d<U> > > fs,
414  const CloverFermActParams& param_,
415  const LLVMCloverTermT<T,U>& from)
416  {
417  START_CODE();
418 
419  //std::cout << "LLVM Clover create from other " << (void*)this << "\n";
420 
421  u.resize(Nd);
422 
423  u = fs->getLinks();
424  fbc = fs->getFermBC();
425  param = param_;
426 
427  // Sanity check
428  if (fbc.operator->() == 0) {
429  QDPIO::cerr << "LLVMCloverTerm: error: fbc is null" << std::endl;
430  QDP_abort(1);
431  }
432 
433  {
434  RealT ff = where(param.anisoParam.anisoP, Real(1) / param.anisoParam.xi_0, Real(1));
435  param.clovCoeffR *= Real(0.5) * ff;
436  param.clovCoeffT *= Real(0.5);
437  }
438 
439  //
440  // Yuk. Some bits of knowledge of the dslash term are buried in the
441  // effective mass term. They show up here. If I wanted some more
442  // complicated dslash then this will have to be fixed/adjusted.
443  //
444  RealT diag_mass;
445  {
446  RealT ff = where(param.anisoParam.anisoP, param.anisoParam.nu / param.anisoParam.xi_0, Real(1));
447  diag_mass = 1 + (Nd-1)*ff + param.Mass;
448  }
449 
450 
451  /* Calculate F(mu,nu) */
452  //multi1d<LatticeColorMatrix> f;
453  //mesField(f, u);
454  //makeClov(f, diag_mass);
455 
456  choles_done.resize(rb.numSubsets());
457  for(int i=0; i < rb.numSubsets(); i++) {
458  choles_done[i] = from.choles_done[i];
459  }
460 
461  tr_log_diag_ = from.tr_log_diag_;
462 
463  tri_dia = from.tri_dia;
464  tri_off = from.tri_off;
465 
466  END_CODE();
467  }
468 
469 
470  //! Creation routine
471  template<typename T, typename U>
472  void LLVMCloverTermT<T,U>::create(Handle< FermState<T,multi1d<U>,multi1d<U> > > fs,
473  const CloverFermActParams& param_)
474  {
475  START_CODE();
476 
477  //std::cout << "LLVM Clover create " << (void*)this << "\n";
478 
479  u.resize(Nd);
480 
481  u = fs->getLinks();
482  fbc = fs->getFermBC();
483  param = param_;
484 
485  // Sanity check
486  if (fbc.operator->() == 0) {
487  QDPIO::cerr << "LLVMCloverTerm: error: fbc is null" << std::endl;
488  QDP_abort(1);
489  }
490 
491  {
492  RealT ff = where(param.anisoParam.anisoP, Real(1) / param.anisoParam.xi_0, Real(1));
493  param.clovCoeffR *= RealT(0.5) * ff;
494  param.clovCoeffT *= RealT(0.5);
495  }
496 
497  //
498  // Yuk. Some bits of knowledge of the dslash term are buried in the
499  // effective mass term. They show up here. If I wanted some more
500  // complicated dslash then this will have to be fixed/adjusted.
501  //
502  RealT diag_mass;
503  {
504  RealT ff = where(param.anisoParam.anisoP, param.anisoParam.nu / param.anisoParam.xi_0, Real(1));
505  diag_mass = 1 + (Nd-1)*ff + param.Mass;
506  }
507 
508  /* Calculate F(mu,nu) */
509  multi1d<U> f;
510  mesField(f, u);
511  makeClov(f, diag_mass);
512 
513  choles_done.resize(rb.numSubsets());
514  for(int i=0; i < rb.numSubsets(); i++) {
515  choles_done[i] = false;
516  }
517 
518  END_CODE();
519  }
520 
521 
522  /*
523  * MAKCLOV
524  *
525  * In this routine, MAKCLOV calculates
526 
527  * 1 - (1/4)*sigma(mu,nu) F(mu,nu)
528 
529  * using F from mesfield
530 
531  * F(mu,nu) = (1/4) sum_p (1/2) [ U_p(x) - U^dag_p(x) ]
532 
533  * using basis of SPPROD and stores in a lower triangular matrix
534  * (no diagonal) plus real diagonal
535 
536  * where
537  * U_1 = u(x,mu)*u(x+mu,nu)*u_dag(x+nu,mu)*u_dag(x,nu)
538  * U_2 = u(x,nu)*u_dag(x-mu+nu,mu)*u_dag(x-mu,nu)*u(x-mu,mu)
539  * U_3 = u_dag(x-mu,mu)*u_dag(x-mu-nu,nu)*u(x-mu-nu,mu)*u(x-nu,nu)
540  * U_4 = u_dag(x-nu,nu)*u(x-nu,mu)*u(x-nu+mu,nu)*u_dag(x,mu)
541 
542  * and
543 
544  * | sigF(1) sigF(3) 0 0 |
545  * sigF = | sigF(5) -sigF(1) 0 0 |
546  * | 0 0 -sigF(0) -sigF(2) |
547  * | 0 0 -sigF(4) sigF(0) |
548  * where
549  * sigF(i) is a color matrix
550 
551  * sigF(0) = i*(ClovT*E_z + ClovR*B_z)
552  * = i*(ClovT*F(3,2) + ClovR*F(1,0))
553  * sigF(1) = i*(ClovT*E_z - ClovR*B_z)
554  * = i*(ClovT*F(3,2) - ClovR*F(1,0))
555  * sigF(2) = i*(E_+ + B_+)
556  * sigF(3) = i*(E_+ - B_+)
557  * sigF(4) = i*(E_- + B_-)
558  * sigF(5) = i*(E_- - B_-)
559  * i*E_+ = (i*ClovT*E_x - ClovT*E_y)
560  * = (i*ClovT*F(3,0) - ClovT*F(3,1))
561  * i*E_- = (i*ClovT*E_x + ClovT*E_y)
562  * = (i*ClovT*F(3,0) + ClovT*F(3,1))
563  * i*B_+ = (i*ClovR*B_x - ClovR*B_y)
564  * = (i*ClovR*F(2,1) + ClovR*F(2,0))
565  * i*B_- = (i*ClovR*B_x + ClovR*B_y)
566  * = (i*ClovR*F(2,1) - ClovR*F(2,0))
567 
568  * NOTE: I am using i*F of the usual F defined by UKQCD, Heatlie et.al.
569 
570  * NOTE: the above definitions assume that the time direction, t_dir,
571  * is 3. In general F(k,j) is multiplied with ClovT if either
572  * k=t_dir or j=t_dir, and with ClovR otherwise.
573 
574  *+++
575  * Here are some notes on the origin of this routine. NOTE, ClovCoeff or u0
576  * are not actually used in MAKCLOV.
577  *
578  * The clover mass term is suppose to act on a std::vector like
579  *
580  * chi = (1 - (ClovCoeff/u0^3) * kappa/4 * sum_mu sum_nu F(mu,nu)*sigma(mu,nu)) * psi
581 
582  * Definitions used here (NOTE: no "i")
583  * sigma(mu,nu) = gamma(mu)*gamma(nu) - gamma(nu)*gamma(mu)
584  * = 2*gamma(mu)*gamma(nu) for mu != nu
585  *
586  * chi = sum_mu sum_nu F(mu,nu)*gamma(mu)*gamma(nu)*psi for mu < nu
587  * = (1/2) * sum_mu sum_nu F(mu,nu)*gamma(mu)*gamma(nu)*psi for mu != nu
588  * = (1/4) * sum_mu sum_nu F(mu,nu)*sigma(mu,nu)*psi
589  *
590  *
591  * chi = (1 - (ClovCoeff/u0^3) * kappa/4 * sum_mu sum_nu F(mu,nu)*sigma(mu,nu)) * psi
592  * = psi - (ClovCoeff/u0^3) * kappa * chi
593  * == psi - kappa * chi
594  *
595  * We have absorbed ClovCoeff/u0^3 into kappa. A u0 was previously absorbed into kappa
596  * for compatibility to ancient conventions.
597  *---
598 
599  * Arguments:
600  * \param f field strength tensor F(cb,mu,nu) (Read)
601  * \param diag_mass effective mass term (Read)
602  */
603 
604  template<typename RealT,typename U,typename X,typename Y>
605  void function_make_clov_exec(const JitFunction& function,
606  const RealT& diag_mass,
607  const U& f0,
608  const U& f1,
609  const U& f2,
610  const U& f3,
611  const U& f4,
612  const U& f5,
613  X& tri_dia,
614  Y& tri_off)
615  {
616  AddressLeaf addr_leaf(all);
617 
618  forEach(diag_mass, addr_leaf, NullCombine());
619  forEach(f0, addr_leaf, NullCombine());
620  forEach(f1, addr_leaf, NullCombine());
621  forEach(f2, addr_leaf, NullCombine());
622  forEach(f3, addr_leaf, NullCombine());
623  forEach(f4, addr_leaf, NullCombine());
624  forEach(f5, addr_leaf, NullCombine());
625  forEach(tri_dia, addr_leaf, NullCombine());
626  forEach(tri_off, addr_leaf, NullCombine());
627 
628  jit_dispatch(function.func().at(0),Layout::sitesOnNode(),getDataLayoutInnerSize(),true,0,addr_leaf);
629  }
630 
631 
632 
633  template<typename RealT,typename U,typename X,typename Y>
634  void function_make_clov_build( JitFunction& func,
635  const RealT& diag_mass,
636  const U& f0,
637  const U& f1,
638  const U& f2,
639  const U& f3,
640  const U& f4,
641  const U& f5,
642  const X& tri_dia,
643  const Y& tri_off)
644  {
645  //std::cout << __PRETTY_FUNCTION__ << ": entering\n";
646 
647  JitMainLoop loop;
648 
649  ParamLeaf param_leaf;
650 
651  typedef typename WordType<RealT>::Type_t REALT;
652 
653  typedef typename LeafFunctor<RealT, ParamLeaf>::Type_t RealTJIT;
654  RealTJIT diag_mass_jit(forEach(diag_mass, param_leaf, TreeCombine()));
655 
656  typedef typename LeafFunctor<U, ParamLeaf>::Type_t UJIT;
657  UJIT f0_jit(forEach(f0, param_leaf, TreeCombine()));
658  UJIT f1_jit(forEach(f1, param_leaf, TreeCombine()));
659  UJIT f2_jit(forEach(f2, param_leaf, TreeCombine()));
660  UJIT f3_jit(forEach(f3, param_leaf, TreeCombine()));
661  UJIT f4_jit(forEach(f4, param_leaf, TreeCombine()));
662  UJIT f5_jit(forEach(f5, param_leaf, TreeCombine()));
663 
664  typedef typename LeafFunctor<X, ParamLeaf>::Type_t XJIT;
665  XJIT tri_dia_jit(forEach(tri_dia, param_leaf, TreeCombine()));
666 
667  typedef typename LeafFunctor<Y, ParamLeaf>::Type_t YJIT;
668  YJIT tri_off_jit(forEach(tri_off, param_leaf, TreeCombine()));
669 
670  IndexDomainVector idx = loop.getIdx();
671 
672  typename UJIT::Subtype_t& f0_j = f0_jit.elem(JitDeviceLayout::LayoutCoalesced , idx );
673  typename UJIT::Subtype_t& f1_j = f1_jit.elem(JitDeviceLayout::LayoutCoalesced , idx );
674  typename UJIT::Subtype_t& f2_j = f2_jit.elem(JitDeviceLayout::LayoutCoalesced , idx );
675  typename UJIT::Subtype_t& f3_j = f3_jit.elem(JitDeviceLayout::LayoutCoalesced , idx );
676  typename UJIT::Subtype_t& f4_j = f4_jit.elem(JitDeviceLayout::LayoutCoalesced , idx );
677  typename UJIT::Subtype_t& f5_j = f5_jit.elem(JitDeviceLayout::LayoutCoalesced , idx );
678 
679  typename XJIT::Subtype_t& tri_dia_j = tri_dia_jit.elem(JitDeviceLayout::LayoutCoalesced , idx );
680  typename YJIT::Subtype_t& tri_off_j = tri_off_jit.elem(JitDeviceLayout::LayoutCoalesced , idx );
681 
682  typename REGType< typename RealTJIT::Subtype_t >::Type_t diag_mass_reg;
683  diag_mass_reg.setup( diag_mass_jit.elem() );
684 
685 
686 
687 
688  for(int jj = 0; jj < 2; jj++) {
689  for(int ii = 0; ii < 2*Nc; ii++) {
690  tri_dia_j.elem(jj).elem(ii) = diag_mass_reg.elem().elem();
691  //tri[site].diag[jj][ii] = diag_mass.elem().elem().elem();
692  }
693  }
694 
695 
696  RComplexREG<WordREG<REALT> > E_minus;
697  RComplexREG<WordREG<REALT> > B_minus;
698  RComplexREG<WordREG<REALT> > ctmp_0;
699  RComplexREG<WordREG<REALT> > ctmp_1;
700  RScalarREG<WordREG<REALT> > rtmp_0;
701  RScalarREG<WordREG<REALT> > rtmp_1;
702 
703 
704  for(int i = 0; i < Nc; ++i) {
705  ctmp_0 = f5_j.elem().elem(i,i);
706  ctmp_0 -= f0_j.elem().elem(i,i);
707  rtmp_0 = imag(ctmp_0);
708  tri_dia_j.elem(0).elem(i) += rtmp_0;
709 
710  tri_dia_j.elem(0).elem(i+Nc) -= rtmp_0;
711 
712  ctmp_1 = f5_j.elem().elem(i,i);
713  ctmp_1 += f0_j.elem().elem(i,i);
714  rtmp_1 = imag(ctmp_1);
715  tri_dia_j.elem(1).elem(i) -= rtmp_1;
716 
717  tri_dia_j.elem(1).elem(i+Nc) += rtmp_1;
718  }
719 
720  for(int i = 1; i < Nc; ++i) {
721  for(int j = 0; j < i; ++j) {
722 
723  int elem_ij = i*(i-1)/2 + j;
724  int elem_tmp = (i+Nc)*(i+Nc-1)/2 + j+Nc;
725 
726  ctmp_0 = f0_j.elem().elem(i,j);
727  ctmp_0 -= f5_j.elem().elem(i,j);
728  tri_off_j.elem(0).elem(elem_ij) = timesI(ctmp_0);
729 
730  zero_rep( tri_off_j.elem(0).elem(elem_tmp) );
731  tri_off_j.elem(0).elem(elem_tmp) -= tri_off_j.elem(0).elem(elem_ij);// * -1.0;
732 
733  ctmp_1 = f5_j.elem().elem(i,j);
734  ctmp_1 += f0_j.elem().elem(i,j);
735  tri_off_j.elem(1).elem(elem_ij) = timesI(ctmp_1);
736 
737  zero_rep( tri_off_j.elem(1).elem(elem_tmp) );
738  tri_off_j.elem(1).elem(elem_tmp) -= tri_off_j.elem(1).elem(elem_ij);
739  }
740  }
741 
742  for(int i = 0; i < Nc; ++i) {
743  for(int j = 0; j < Nc; ++j) {
744 
745  int elem_ij = (i+Nc)*(i+Nc-1)/2 + j;
746 
747  //E_minus = timesI(f2_j.elem().elem(i,j));
748  E_minus = f2_j.elem().elem(i,j);
749  E_minus = timesI( E_minus );
750 
751  E_minus += f4_j.elem().elem(i,j);
752 
753  //B_minus = timesI(f3_j.elem().elem(i,j));
754  B_minus = f3_j.elem().elem(i,j);
755  B_minus = timesI( B_minus );
756 
757  B_minus -= f1_j.elem().elem(i,j);
758 
759  tri_off_j.elem(0).elem(elem_ij) = B_minus - E_minus;
760 
761  tri_off_j.elem(1).elem(elem_ij) = E_minus + B_minus;
762  }
763  }
764 
765  loop.done();
766 
767  // std::cout << __PRETTY_FUNCTION__ << ": leaving\n";
768 
769  func.func().push_back( jit_function_epilogue_get("jit_make_clov.ptx") );
770  }
771 
772 
773 
774 
775  /* This now just sets up and dispatches... */
776  template<typename T, typename U>
777  void LLVMCloverTermT<T,U>::makeClov(const multi1d<U>& f, const RealT& diag_mass)
778  {
779  START_CODE();
780 
781  if ( Nd != 4 ){
782  QDPIO::cerr << __func__ << ": expecting Nd==4" << std::endl;
783  QDP_abort(1);
784  }
785 
786  if ( Ns != 4 ){
787  QDPIO::cerr << __func__ << ": expecting Ns==4" << std::endl;
788  QDP_abort(1);
789  }
790 
791  U f0 = f[0] * getCloverCoeff(0,1);
792  U f1 = f[1] * getCloverCoeff(0,2);
793  U f2 = f[2] * getCloverCoeff(0,3);
794  U f3 = f[3] * getCloverCoeff(1,2);
795  U f4 = f[4] * getCloverCoeff(1,3);
796  U f5 = f[5] * getCloverCoeff(2,3);
797 
798  //QDPIO::cout << "LLVM Clover make " << (void*)this << "\n";
799  //std::cout << "LLVM Clover make " << (void*)this << "\n";
800  static JitFunction function;
801 
802  if (!function.built()) {
803  QDPIO::cout << "Building JIT make clover function\n";
804  function_make_clov_build(function, diag_mass, f0,f1,f2,f3,f4,f5, tri_dia , tri_off );
805  }
806 
807  // Execute the function
808  function_make_clov_exec(function, diag_mass, f0,f1,f2,f3,f4,f5,tri_dia, tri_off);
809 
810  END_CODE();
811  }
812 
813 
814  //! Invert
815  /*!
816  * Computes the inverse of the term on cb using Cholesky
817  */
818  template<typename T, typename U>
820  {
821  START_CODE();
822 
823  // When you are doing the cholesky - also fill out the trace_log_diag piece)
824  // chlclovms(tr_log_diag_, cb);
825  // Switch to LDL^\dag inversion
826  ldagdlinv(tr_log_diag_,cb);
827 
828  END_CODE();
829  }
830 
831 
832  //! Invert
833  /*!
834  * Computes the inverse of the term on cb using Cholesky
835  *
836  * \return logarithm of the determinant
837  */
838  template<typename T, typename U>
840  {
841  START_CODE();
842 
843  if( choles_done[cb] == false )
844  {
845  QDPIO::cout << __func__ << ": Error: you have not done the Cholesky.on this operator on this subset" << std::endl;
846  QDPIO::cout << "You sure you should not be asking invclov?" << std::endl;
847  QDP_abort(1);
848  }
849 
850 
851  END_CODE();
852 
853  // Need to thread generic sums in QDP++?
854  // Need to thread generic norm2() in QDP++?
855  return sum(tr_log_diag_, rb[cb]);
856  }
857 
858 
859 
860  template<typename T,typename X,typename Y>
861  void function_ldagdlinv_exec( const JitFunction& function,
862  T& tr_log_diag,
863  X& tri_dia,
864  Y& tri_off,
865  const Subset& s)
866  {
867  if (!s.hasOrderedRep())
868  QDP_error_exit("ldagdlinv on subset with unordered representation not implemented");
869 
870  AddressLeaf addr_leaf(s);
871 
872  forEach(tr_log_diag, addr_leaf, NullCombine());
873  forEach(tri_dia, addr_leaf, NullCombine());
874  forEach(tri_off, addr_leaf, NullCombine());
875 
876  jit_dispatch(function.func().at(0),s.numSiteTable(),getDataLayoutInnerSize(),s.hasOrderedRep(),s.start(),addr_leaf);
877  }
878 
879 
880 
881 
882 
883  template<typename U,typename T,typename X,typename Y>
884  void function_ldagdlinv_build( JitFunction& func,
885  const T& tr_log_diag,
886  const X& tri_dia,
887  const Y& tri_off,
888  const Subset& s)
889  {
890  typedef typename WordType<U>::Type_t REALT;
891 
892  //std::cout << __PRETTY_FUNCTION__ << " entering\n";
893 
894  JitMainLoop loop;
895 
896  ParamLeaf param_leaf;
897 
898  typedef typename LeafFunctor<T, ParamLeaf>::Type_t TJIT;
899  TJIT tr_log_diag_jit(forEach(tr_log_diag, param_leaf, TreeCombine()));
900 
901  typedef typename LeafFunctor<X, ParamLeaf>::Type_t XJIT;
902  XJIT tri_dia_jit(forEach(tri_dia, param_leaf, TreeCombine()));
903 
904  typedef typename LeafFunctor<Y, ParamLeaf>::Type_t YJIT;
905  YJIT tri_off_jit(forEach(tri_off, param_leaf, TreeCombine()));
906 
907  IndexDomainVector idx = loop.getIdx();
908 
909  typename TJIT::Subtype_t& tr_log_diag_j = tr_log_diag_jit.elem(JitDeviceLayout::LayoutCoalesced,idx);
910  typename XJIT::Subtype_t& tri_dia_j = tri_dia_jit.elem(JitDeviceLayout::LayoutCoalesced,idx);
911  typename YJIT::Subtype_t& tri_off_j = tri_off_jit.elem(JitDeviceLayout::LayoutCoalesced,idx);
912 
913  typename REGType< typename XJIT::Subtype_t >::Type_t tri_dia_r;
914  typename REGType< typename YJIT::Subtype_t >::Type_t tri_off_r;
915 
916  tri_dia_r.setup( tri_dia_j );
917  tri_off_r.setup( tri_off_j );
918 
919 
920  RScalarREG<WordREG<REALT> > zip;
921  zero_rep(zip);
922  int N = 2*Nc;
923 
924  //int site_neg_logdet=0;
925 
926  for(int block=0; block < 2; block++) {
927 
928  RScalarREG<WordREG<REALT> > inv_d[6] ;
929  RComplexREG<WordREG<REALT> > inv_offd[15] ;
930  RComplexREG<WordREG<REALT> > v[6] ;
931  RScalarREG<WordREG<REALT> > diag_g[6] ;
932 
933  for(int i=0; i < N; i++) {
934  inv_d[i] = tri_dia_r.elem(block).elem(i);
935  }
936 
937  for(int i=0; i < 15; i++) {
938  inv_offd[i] = tri_off_r.elem(block).elem(i);
939  }
940 
941 
942  for(int j=0; j < N; ++j) {
943 
944  for(int i=0; i < j; i++) {
945  int elem_ji = j*(j-1)/2 + i;
946 
947  RComplexREG<WordREG<REALT> > A_ii = cmplx( inv_d[i], zip );
948  v[i] = A_ii*adj(inv_offd[elem_ji]);
949  }
950 
951 
952  v[j] = cmplx(inv_d[j],zip);
953 
954  for(int k=0; k < j; k++) {
955  int elem_jk = j*(j-1)/2 + k;
956  v[j] -= inv_offd[elem_jk]*v[k];
957  }
958 
959  inv_d[j] = real( v[j] );
960 
961  for(int k=j+1; k < N; k++) {
962  int elem_kj = k*(k-1)/2 + j;
963  for(int l=0; l < j; l++) {
964  int elem_kl = k*(k-1)/2 + l;
965  inv_offd[elem_kj] -= inv_offd[elem_kl] * v[l];
966  }
967  inv_offd[elem_kj] /= v[j];
968  }
969  }
970 
971 
972  // Now fix up the inverse
973  RScalarREG<WordREG<REALT> > one(1.0);
974  //one.elem() = (REALT)1;
975 
976  for(int i=0; i < N; i++) {
977  diag_g[i] = one/inv_d[i];
978 
979  // Compute the trace log
980  // NB we are always doing trace log | A |
981  // (because we are always working with actually A^\dagger A
982  // even in one flavour case where we square root)
983  tr_log_diag_j.elem().elem() += log(fabs(inv_d[i]));
984  // However, it is worth counting just the no of negative logdets
985  // on site
986 #if 0
987  if( inv_d[i].elem() < 0 ) {
988  site_neg_logdet++;
989  }
990 #endif
991  }
992 
993  // Now we need to invert the L D L^\dagger
994  // We can do this by solving:
995  //
996  // L D L^\dagger M^{-1} = 1
997  //
998  // This can be done by solving L D X = 1 (X = L^\dagger M^{-1})
999  //
1000  // Then solving L^\dagger M^{-1} = X
1001  //
1002  // LD is lower diagonal and so X will also be lower diagonal.
1003  // LD X = 1 can be solved by forward substitution.
1004  //
1005  // Likewise L^\dagger is strictly upper triagonal and so
1006  // L^\dagger M^{-1} = X can be solved by forward substitution.
1007  RComplexREG<WordREG<REALT> > sum;
1008  for(int k = 0; k < N; ++k) {
1009 
1010  for(int i = 0; i < k; ++i) {
1011  zero_rep(v[i]);
1012  }
1013 
1014  /*# Forward substitution */
1015 
1016  // The first element is the inverse of the diagonal
1017  v[k] = cmplx(diag_g[k],zip);
1018 
1019  for(int i = k+1; i < N; ++i) {
1020  zero_rep(v[i]);
1021 
1022  for(int j = k; j < i; ++j) {
1023  int elem_ij = i*(i-1)/2+j;
1024 
1025  // subtract l_ij*d_j*x_{kj}
1026  v[i] -= inv_offd[elem_ij] *inv_d[j]*v[j];
1027 
1028  }
1029 
1030  // scale out by 1/d_i
1031  v[i] *= diag_g[i];
1032  }
1033 
1034  /*# Backward substitution */
1035  // V[N-1] remains unchanged
1036  // Start from V[N-2]
1037 
1038  for(int i = N-2; (int)i >= (int)k; --i) {
1039  for(int j = i+1; j < N; ++j) {
1040  int elem_ji = j*(j-1)/2 + i;
1041  // Subtract terms of typ (l_ji)*x_kj
1042  v[i] -= adj(inv_offd[elem_ji]) * v[j];
1043  }
1044  }
1045 
1046  /*# Overwrite column k of invcl.offd */
1047  inv_d[k] = real(v[k]);
1048  for(int i = k+1; i < N; ++i) {
1049 
1050  int elem_ik = i*(i-1)/2+k;
1051  inv_offd[elem_ik] = v[i];
1052  }
1053  }
1054 
1055  // Overwrite original data
1056  for(int i=0; i < N; i++) {
1057  tri_dia_j.elem(block).elem(i) = inv_d[i];
1058  }
1059  for(int i=0; i < 15; i++) {
1060  tri_off_j.elem(block).elem(i) = inv_offd[i];
1061  }
1062  }
1063 
1064  // std::cout << __PRETTY_FUNCTION__ << " leaving\n";
1065  loop.done();
1066 
1067  func.func().push_back( jit_function_epilogue_get("jit_ldagdlinv.ptx") );
1068  }
1069 
1070 
1071 
1072 
1073 
1074  /*! An LDL^\dag decomposition and inversion? */
1075  template<typename T, typename U>
1077  {
1078  START_CODE();
1079 
1080  if ( 2*Nc < 3 )
1081  {
1082  QDPIO::cerr << __func__ << ": Matrix is too small" << std::endl;
1083  QDP_abort(1);
1084  }
1085 
1086  // Zero trace log
1087  tr_log_diag[rb[cb]] = zero;
1088 
1089  //QDPIO::cout << "LLVM Clover ldagdlinv " << (void*)this << "\n";
1090  //std::cout << "LLVM Clover ldagdlinv " << (void*)this << "\n";
1091  static JitFunction function;
1092 
1093  if (!function.built()) {
1094  QDPIO::cout << "Building JIT ldagdlinv\n";
1095  function_ldagdlinv_build<U>(function, tr_log_diag, tri_dia, tri_off, rb[cb] );
1096  }
1097 
1098  // Execute the function
1099  function_ldagdlinv_exec(function, tr_log_diag, tri_dia, tri_off, rb[cb] );
1100 
1101  // This comes from the days when we used to do Cholesky
1102  choles_done[cb] = true;
1103  END_CODE();
1104  }
1105 
1106  /*! CHLCLOVMS - Cholesky decompose the clover mass term and uses it to
1107  * compute lower(A^-1) = lower((L.L^dag)^-1)
1108  * Adapted from Golub and Van Loan, Matrix Computations, 2nd, Sec 4.2.4
1109  *
1110  * Arguments:
1111  *
1112  * \param DetP flag whether to compute determinant (Read)
1113  * \param logdet logarithm of the determinant (Write)
1114  * \param cb checkerboard of work (Read)
1115  */
1116 
1117 
1118 
1119 
1120 
1121 
1122  //! TRIACNTR
1123  /*!
1124  * \ingroup linop
1125  *
1126  * Calculates
1127  * Tr_D ( Gamma_mat L )
1128  *
1129  * This routine is specific to Wilson fermions!
1130  *
1131  * the trace over the Dirac indices for one of the 16 Gamma matrices
1132  * and a hermitian color x spin matrix A, stored as a block diagonal
1133  * complex lower triangular matrix L and a real diagonal diag_L.
1134 
1135  * Here 0 <= mat <= 15 and
1136  * if mat = mat_1 + mat_2 * 2 + mat_3 * 4 + mat_4 * 8
1137  *
1138  * Gamma(mat) = gamma(1)^(mat_1) * gamma(2)^(mat_2) * gamma(3)^(mat_3)
1139  * * gamma(4)^(mat_4)
1140  *
1141  * Further, in basis for the Gamma matrices used, A is of the form
1142  *
1143  * | A_0 | 0 |
1144  * A = | --------- |
1145  * | 0 | A_1 |
1146  *
1147  *
1148  * Arguments:
1149  *
1150  * \param B the resulting SU(N) color matrix (Write)
1151  * \param clov clover term (Read)
1152  * \param mat label of the Gamma matrix (Read)
1153  */
1154 
1155 
1156  template<typename U,typename X,typename Y>
1157  void function_triacntr_exec( const JitFunction& function,
1158  U& B,
1159  const X& tri_dia,
1160  const Y& tri_off,
1161  int mat,
1162  const Subset& s)
1163  {
1164  if (!s.hasOrderedRep())
1165  QDP_error_exit("triacntr on subset with unordered representation not implemented");
1166 
1167  AddressLeaf addr_leaf(s);
1168 
1169  addr_leaf.setLit( mat );
1170 
1171  forEach(B, addr_leaf, NullCombine());
1172  forEach(tri_dia, addr_leaf, NullCombine());
1173  forEach(tri_off, addr_leaf, NullCombine());
1174 
1175  jit_dispatch(function.func().at(0),s.numSiteTable(),getDataLayoutInnerSize(),s.hasOrderedRep(),s.start(),addr_leaf);
1176  }
1177 
1178 
1179 
1180 
1181  template<typename U,typename X,typename Y>
1182  void function_triacntr_build( JitFunction& func,
1183  const U& B,
1184  const X& tri_dia,
1185  const Y& tri_off,
1186  int mat,
1187  const Subset& s)
1188  {
1189  //std::cout << __PRETTY_FUNCTION__ << ": entering\n";
1190 
1191  typedef typename WordType<U>::Type_t REALT;
1192 
1193  JitMainLoop loop;
1194 
1195  ParamRef p_mat = llvm_add_param<int>();
1196 
1197  ParamLeaf param_leaf;
1198 
1199  typedef typename LeafFunctor<U, ParamLeaf>::Type_t UJIT;
1200  UJIT B_jit(forEach(B, param_leaf, TreeCombine()));
1201 
1202  typedef typename LeafFunctor<X, ParamLeaf>::Type_t XJIT;
1203  XJIT tri_dia_jit(forEach(tri_dia, param_leaf, TreeCombine()));
1204 
1205  typedef typename LeafFunctor<Y, ParamLeaf>::Type_t YJIT;
1206  YJIT tri_off_jit(forEach(tri_off, param_leaf, TreeCombine()));
1207 
1208  llvm::Value * r_mat = llvm_derefParam( p_mat );
1209 
1210  IndexDomainVector idx = loop.getIdx();
1211 
1212  typename UJIT::Subtype_t& B_j = B_jit.elem(JitDeviceLayout::LayoutCoalesced,idx);
1213  typename XJIT::Subtype_t& tri_dia_j = tri_dia_jit.elem(JitDeviceLayout::LayoutCoalesced,idx);
1214  typename YJIT::Subtype_t& tri_off_j = tri_off_jit.elem(JitDeviceLayout::LayoutCoalesced,idx);
1215 
1216  typename REGType< typename XJIT::Subtype_t >::Type_t tri_dia_r;
1217  typename REGType< typename YJIT::Subtype_t >::Type_t tri_off_r;
1218 
1219  tri_dia_r.setup( tri_dia_j );
1220  tri_off_r.setup( tri_off_j );
1221 
1222  llvm::BasicBlock * case_0 = llvm_new_basic_block();
1223  llvm::BasicBlock * case_3 = llvm_new_basic_block();
1224  llvm::BasicBlock * case_5 = llvm_new_basic_block();
1225  llvm::BasicBlock * case_6 = llvm_new_basic_block();
1226  llvm::BasicBlock * case_9 = llvm_new_basic_block();
1227  llvm::BasicBlock * case_10 = llvm_new_basic_block();
1228  llvm::BasicBlock * case_12 = llvm_new_basic_block();
1229  llvm::BasicBlock * case_default = llvm_new_basic_block();
1230 
1231  llvm::SwitchInst * mat_sw = llvm_switch( r_mat , case_default );
1232 
1233  mat_sw->addCase( llvm_create_const_int(0) , case_0 );
1234  mat_sw->addCase( llvm_create_const_int(3) , case_3 );
1235  mat_sw->addCase( llvm_create_const_int(5) , case_5 );
1236  mat_sw->addCase( llvm_create_const_int(6) , case_6 );
1237  mat_sw->addCase( llvm_create_const_int(9) , case_9 );
1238  mat_sw->addCase( llvm_create_const_int(10) , case_10 );
1239  mat_sw->addCase( llvm_create_const_int(12) , case_12 );
1240 
1241  /*# gamma( 0) 1 0 0 0 # ( 0000 ) --> 0 */
1242  /*# 0 1 0 0 */
1243  /*# 0 0 1 0 */
1244  /*# 0 0 0 1 */
1245  /*# From diagonal part */
1246  llvm_set_insert_point( case_0 );
1247  {
1248  RComplexREG<WordREG<REALT> > lctmp0;
1249  RScalarREG< WordREG<REALT> > lr_zero0;
1250  RScalarREG< WordREG<REALT> > lrtmp0;
1251 
1252  zero_rep(lr_zero0);
1253 
1254  for(int i0 = 0; i0 < Nc; ++i0) {
1255 
1256  lrtmp0 = tri_dia_r.elem(0).elem(i0);
1257  lrtmp0 += tri_dia_r.elem(0).elem(i0+Nc);
1258  lrtmp0 += tri_dia_r.elem(1).elem(i0);
1259  lrtmp0 += tri_dia_r.elem(1).elem(i0+Nc);
1260  B_j.elem().elem(i0,i0) = cmplx(lrtmp0,lr_zero0);
1261  }
1262 
1263  /*# From lower triangular portion */
1264  int elem_ij0 = 0;
1265  for(int i0 = 1; i0 < Nc; ++i0) {
1266 
1267  int elem_ijb0 = (i0+Nc)*(i0+Nc-1)/2 + Nc;
1268 
1269  for(int j0 = 0; j0 < i0; ++j0) {
1270 
1271  lctmp0 = tri_off_r.elem(0).elem(elem_ij0);
1272  lctmp0 += tri_off_r.elem(0).elem(elem_ijb0);
1273  lctmp0 += tri_off_r.elem(1).elem(elem_ij0);
1274  lctmp0 += tri_off_r.elem(1).elem(elem_ijb0);
1275 
1276  B_j.elem().elem(j0,i0) = lctmp0;
1277  B_j.elem().elem(i0,j0) = adj(lctmp0);
1278 
1279  elem_ij0++;
1280  elem_ijb0++;
1281  }
1282  }
1283  llvm_branch(case_default);
1284  }
1285 
1286 
1287  llvm_set_insert_point( case_3 );
1288  {
1289  /*# gamma( 12) -i 0 0 0 # ( 0011 ) --> 3 */
1290  /*# 0 i 0 0 */
1291  /*# 0 0 -i 0 */
1292  /*# 0 0 0 i */
1293  /*# From diagonal part */
1294 
1295  RComplexREG<WordREG<REALT> > lctmp3;
1296  RScalarREG<WordREG<REALT> > lr_zero3;
1297  RScalarREG<WordREG<REALT> > lrtmp3;
1298 
1299  lr_zero3 = 0;
1300 
1301  for(int i3 = 0; i3 < Nc; ++i3) {
1302 
1303  lrtmp3 = tri_dia_r.elem(0).elem(i3+Nc);
1304  lrtmp3 -= tri_dia_r.elem(0).elem(i3);
1305  lrtmp3 -= tri_dia_r.elem(1).elem(i3);
1306  lrtmp3 += tri_dia_r.elem(1).elem(i3+Nc);
1307  B_j.elem().elem(i3,i3) = cmplx(lr_zero3,lrtmp3);
1308  }
1309 
1310  /*# From lower triangular portion */
1311  int elem_ij3 = 0;
1312  for(int i3 = 1; i3 < Nc; ++i3) {
1313 
1314  int elem_ijb3 = (i3+Nc)*(i3+Nc-1)/2 + Nc;
1315 
1316  for(int j3 = 0; j3 < i3; ++j3) {
1317 
1318  lctmp3 = tri_off_r.elem(0).elem(elem_ijb3);
1319  lctmp3 -= tri_off_r.elem(0).elem(elem_ij3);
1320  lctmp3 -= tri_off_r.elem(1).elem(elem_ij3);
1321  lctmp3 += tri_off_r.elem(1).elem(elem_ijb3);
1322 
1323  B_j.elem().elem(j3,i3) = timesI(adj(lctmp3));
1324  B_j.elem().elem(i3,j3) = timesI(lctmp3);
1325 
1326  elem_ij3++;
1327  elem_ijb3++;
1328  }
1329  }
1330  llvm_branch(case_default);
1331  }
1332 
1333 
1334  llvm_set_insert_point( case_5 );
1335  {
1336  /*# gamma( 13) 0 -1 0 0 # ( 0101 ) --> 5 */
1337  /*# 1 0 0 0 */
1338  /*# 0 0 0 -1 */
1339  /*# 0 0 1 0 */
1340 
1341  RComplexREG<WordREG<REALT> > lctmp5;
1342  RScalarREG<WordREG<REALT> > lrtmp5;
1343 
1344  for(int i5 = 0; i5 < Nc; ++i5) {
1345 
1346  int elem_ij5 = (i5+Nc)*(i5+Nc-1)/2;
1347 
1348  for(int j5 = 0; j5 < Nc; ++j5) {
1349 
1350  int elem_ji5 = (j5+Nc)*(j5+Nc-1)/2 + i5;
1351 
1352 
1353  lctmp5 = adj(tri_off_r.elem(0).elem(elem_ji5));
1354  lctmp5 -= tri_off_r.elem(0).elem(elem_ij5);
1355  lctmp5 += adj(tri_off_r.elem(1).elem(elem_ji5));
1356  lctmp5 -= tri_off_r.elem(1).elem(elem_ij5);
1357 
1358  B_j.elem().elem(i5,j5) = lctmp5;
1359 
1360  elem_ij5++;
1361  }
1362  }
1363  llvm_branch(case_default);
1364  }
1365 
1366 
1367  llvm_set_insert_point( case_6 );
1368  {
1369  /*# gamma( 23) 0 -i 0 0 # ( 0110 ) --> 6 */
1370  /*# -i 0 0 0 */
1371  /*# 0 0 0 -i */
1372  /*# 0 0 -i 0 */
1373 
1374  RComplexREG<WordREG<REALT> > lctmp6;
1375  RScalarREG<WordREG<REALT> > lrtmp6;
1376 
1377  for(int i6 = 0; i6 < Nc; ++i6) {
1378 
1379  int elem_ij6 = (i6+Nc)*(i6+Nc-1)/2;
1380 
1381  for(int j6 = 0; j6 < Nc; ++j6) {
1382 
1383  int elem_ji6 = (j6+Nc)*(j6+Nc-1)/2 + i6;
1384 
1385  lctmp6 = adj(tri_off_r.elem(0).elem(elem_ji6));
1386  lctmp6 += tri_off_r.elem(0).elem(elem_ij6);
1387  lctmp6 += adj(tri_off_r.elem(1).elem(elem_ji6));
1388  lctmp6 += tri_off_r.elem(1).elem(elem_ij6);
1389 
1390  B_j.elem().elem(i6,j6) = timesMinusI(lctmp6);
1391 
1392  elem_ij6++;
1393  }
1394  }
1395  llvm_branch(case_default);
1396  }
1397 
1398 
1399  llvm_set_insert_point( case_9 );
1400  {
1401  /*# gamma( 14) 0 i 0 0 # ( 1001 ) --> 9 */
1402  /*# i 0 0 0 */
1403  /*# 0 0 0 -i */
1404  /*# 0 0 -i 0 */
1405 
1406  RComplexREG<WordREG<REALT> > lctmp9;
1407  RScalarREG<WordREG<REALT> > lrtmp9;
1408 
1409  for(int i9 = 0; i9 < Nc; ++i9) {
1410 
1411  int elem_ij9 = (i9+Nc)*(i9+Nc-1)/2;
1412 
1413  for(int j9 = 0; j9 < Nc; ++j9) {
1414 
1415  int elem_ji9 = (j9+Nc)*(j9+Nc-1)/2 + i9;
1416 
1417  lctmp9 = adj(tri_off_r.elem(0).elem(elem_ji9));
1418  lctmp9 += tri_off_r.elem(0).elem(elem_ij9);
1419  lctmp9 -= adj(tri_off_r.elem(1).elem(elem_ji9));
1420  lctmp9 -= tri_off_r.elem(1).elem(elem_ij9);
1421 
1422  B_j.elem().elem(i9,j9) = timesI(lctmp9);
1423 
1424  elem_ij9++;
1425  }
1426  }
1427  llvm_branch(case_default);
1428  }
1429 
1430 
1431  llvm_set_insert_point( case_10 );
1432  {
1433  /*# gamma( 24) 0 -1 0 0 # ( 1010 ) --> 10 */
1434  /*# 1 0 0 0 */
1435  /*# 0 0 0 1 */
1436  /*# 0 0 -1 0 */
1437 
1438  RComplexREG<WordREG<REALT> > lctmp10;
1439  RScalarREG<WordREG<REALT> > lrtmp10;
1440 
1441  for(int i10 = 0; i10 < Nc; ++i10) {
1442 
1443  int elem_ij10 = (i10+Nc)*(i10+Nc-1)/2;
1444 
1445  for(int j10 = 0; j10 < Nc; ++j10) {
1446 
1447  int elem_ji10 = (j10+Nc)*(j10+Nc-1)/2 + i10;
1448 
1449  lctmp10 = adj(tri_off_r.elem(0).elem(elem_ji10));
1450  lctmp10 -= tri_off_r.elem(0).elem(elem_ij10);
1451  lctmp10 -= adj(tri_off_r.elem(1).elem(elem_ji10));
1452  lctmp10 += tri_off_r.elem(1).elem(elem_ij10);
1453 
1454  B_j.elem().elem(i10,j10) = lctmp10;
1455 
1456  elem_ij10++;
1457  }
1458  }
1459  llvm_branch(case_default);
1460  }
1461 
1462 
1463  llvm_set_insert_point( case_12 );
1464  {
1465  /*# gamma( 34) i 0 0 0 # ( 1100 ) --> 12 */
1466  /*# 0 -i 0 0 */
1467  /*# 0 0 -i 0 */
1468  /*# 0 0 0 i */
1469  /*# From diagonal part */
1470 
1471  RComplexREG<WordREG<REALT> > lctmp12;
1472  RScalarREG<WordREG<REALT> > lr_zero12;
1473  RScalarREG<WordREG<REALT> > lrtmp12;
1474 
1475  lr_zero12 = 0;
1476 
1477  for(int i12 = 0; i12 < Nc; ++i12) {
1478 
1479  lrtmp12 = tri_dia_r.elem(0).elem(i12);
1480  lrtmp12 -= tri_dia_r.elem(0).elem(i12+Nc);
1481  lrtmp12 -= tri_dia_r.elem(1).elem(i12);
1482  lrtmp12 += tri_dia_r.elem(1).elem(i12+Nc);
1483  B_j.elem().elem(i12,i12) = cmplx(lr_zero12,lrtmp12);
1484  }
1485 
1486  /*# From lower triangular portion */
1487  int elem_ij12 = 0;
1488  for(int i12 = 1; i12 < Nc; ++i12) {
1489 
1490  int elem_ijb12 = (i12+Nc)*(i12+Nc-1)/2 + Nc;
1491 
1492  for(int j12 = 0; j12 < i12; ++j12) {
1493 
1494  lctmp12 = tri_off_r.elem(0).elem(elem_ij12);
1495  lctmp12 -= tri_off_r.elem(0).elem(elem_ijb12);
1496  lctmp12 -= tri_off_r.elem(1).elem(elem_ij12);
1497  lctmp12 += tri_off_r.elem(1).elem(elem_ijb12);
1498 
1499  B_j.elem().elem(i12,j12) = timesI(lctmp12);
1500  B_j.elem().elem(j12,i12) = timesI(adj(lctmp12));
1501 
1502  elem_ij12++;
1503  elem_ijb12++;
1504  }
1505  }
1506  llvm_branch(case_default);
1507  }
1508 
1509  llvm_set_insert_point( case_default );
1510 
1511  loop.done();
1512 
1513  func.func().push_back( jit_function_epilogue_get("jit_triacntr.ptx") );
1514  }
1515 
1516 
1517 
1518 
1519  template<typename T, typename U>
1520  void LLVMCloverTermT<T,U>::triacntr(U& B, int mat, int cb) const
1521  {
1522  START_CODE();
1523 
1524  B = zero;
1525 
1526  if ( mat < 0 || mat > 15 )
1527  {
1528  QDPIO::cerr << __func__ << ": Gamma out of range: mat = " << mat << std::endl;
1529  QDP_abort(1);
1530  }
1531 
1532  //QDPIO::cout << "LLVM Clover triacntr " << (void*)this << "\n";
1533  //std::cout << "LLVM Clover triacntr " << (void*)this << "\n";
1534  static JitFunction function;
1535 
1536  if (!function.built()) {
1537  QDPIO::cout << "Building JIT triacntr\n";
1538  function_triacntr_build<U>(function, B, tri_dia, tri_off, mat, rb[cb] );
1539  }
1540 
1541  // Execute the function
1542  function_triacntr_exec(function, B, tri_dia, tri_off, mat, rb[cb] );
1543 
1544  END_CODE();
1545  }
1546 
1547  //! Returns the appropriate clover coefficient for indices mu and nu
1548  template<typename T, typename U>
1549  Real
1551  {
1552  START_CODE();
1553 
1554  if( param.anisoParam.anisoP ) {
1555  if (mu==param.anisoParam.t_dir || nu == param.anisoParam.t_dir) {
1556  return param.clovCoeffT;
1557  }
1558  else {
1559  // Otherwise return the spatial coeff
1560  return param.clovCoeffR;
1561  }
1562  }
1563  else {
1564  // If there is no anisotropy just return the spatial one, it will
1565  // be the same as the temporal one
1566  return param.clovCoeffR;
1567  }
1568 
1569  END_CODE();
1570  }
1571 
1572 
1573 
1574  template<typename T,typename X,typename Y>
1575  void function_apply_clov_exec(const JitFunction& function,
1576  T& chi,
1577  const T& psi,
1578  const X& tri_dia,
1579  const Y& tri_off,
1580  const Subset& s)
1581  {
1582  if (!s.hasOrderedRep())
1583  QDP_error_exit("clover on subset with unordered representation not implemented");
1584 
1585  //std::cout << __PRETTY_FUNCTION__ << ": entering\n";
1586 
1587  AddressLeaf addr_leaf(s);
1588 
1589  forEach(chi, addr_leaf, NullCombine());
1590  forEach(psi, addr_leaf, NullCombine());
1591  forEach(tri_dia, addr_leaf, NullCombine());
1592  forEach(tri_off, addr_leaf, NullCombine());
1593 
1594  jit_dispatch(function.func().at(0),s.numSiteTable(),getDataLayoutInnerSize(),s.hasOrderedRep(),s.start(),addr_leaf);
1595  }
1596 
1597 
1598 
1599 
1600  template<typename T,typename X,typename Y>
1601  void function_apply_clov_build(JitFunction& func,
1602  const T& chi,
1603  const T& psi,
1604  const X& tri_dia,
1605  const Y& tri_off,
1606  const Subset& s)
1607  {
1608  //std::cout << __PRETTY_FUNCTION__ << ": entering\n";
1609  //typedef typename WordType<RealT>::Type_t REALT;
1610 
1611  JitMainLoop loop;
1612 
1613  ParamLeaf param_leaf;
1614 
1615  typedef typename LeafFunctor<T, ParamLeaf>::Type_t TJIT;
1616  TJIT chi_jit(forEach(chi, param_leaf, TreeCombine()));
1617  TJIT psi_jit(forEach(psi, param_leaf, TreeCombine()));
1618  typename REGType< typename TJIT::Subtype_t >::Type_t psi_r;
1619  typename REGType< typename TJIT::Subtype_t >::Type_t chi_r;
1620 
1621  typedef typename LeafFunctor<X, ParamLeaf>::Type_t XJIT;
1622  XJIT tri_dia_jit(forEach(tri_dia, param_leaf, TreeCombine()));
1623  typename REGType< typename XJIT::Subtype_t >::Type_t tri_dia_r;
1624 
1625  typedef typename LeafFunctor<Y, ParamLeaf>::Type_t YJIT;
1626  YJIT tri_off_jit(forEach(tri_off, param_leaf, TreeCombine()));
1627  typename REGType< typename YJIT::Subtype_t >::Type_t tri_off_r;
1628 
1629  IndexDomainVector idx = loop.getIdx();
1630 
1631  typename TJIT::Subtype_t& chi_j = chi_jit.elem(JitDeviceLayout::LayoutCoalesced,idx);
1632  psi_r.setup( psi_jit.elem(JitDeviceLayout::LayoutCoalesced,idx) );
1633  tri_dia_r.setup( tri_dia_jit.elem(JitDeviceLayout::LayoutCoalesced,idx) );
1634  tri_off_r.setup( tri_off_jit.elem(JitDeviceLayout::LayoutCoalesced,idx) );
1635 
1636  // RComplex<REALT>* cchi = (RComplex<REALT>*)&(chi.elem(site).elem(0).elem(0));
1637  // const RComplex<REALT>* ppsi = (const RComplex<REALT>*)&(psi.elem(site).elem(0).elem(0));
1638 
1639  int n = 2*Nc;
1640 
1641  for(int i = 0; i < n; ++i)
1642  {
1643  chi_r.elem((0*n+i)/3).elem((0*n+i)%3) = tri_dia_r.elem(0).elem(i) * psi_r.elem((0*n+i)/3).elem((0*n+i)%3);
1644  // cchi[0*n+i] = tri[site].diag[0][i] * ppsi[0*n+i];
1645 
1646  chi_r.elem((1*n+i)/3).elem((1*n+i)%3) = tri_dia_r.elem(1).elem(i) * psi_r.elem((1*n+i)/3).elem((1*n+i)%3);
1647  // cchi[1*n+i] = tri[site].diag[1][i] * ppsi[1*n+i];
1648  }
1649 
1650  int kij = 0;
1651  for(int i = 0; i < n; ++i)
1652  {
1653  for(int j = 0; j < i; j++)
1654  {
1655  chi_r.elem((0*n+i)/3).elem((0*n+i)%3) += tri_off_r.elem(0).elem(kij) * psi_r.elem((0*n+j)/3).elem((0*n+j)%3);
1656  // cchi[0*n+i] += tri[site].offd[0][kij] * ppsi[0*n+j];
1657 
1658  chi_r.elem((0*n+j)/3).elem((0*n+j)%3) += conj(tri_off_r.elem(0).elem(kij)) * psi_r.elem((0*n+i)/3).elem((0*n+i)%3);
1659  // cchi[0*n+j] += conj(tri[site].offd[0][kij]) * ppsi[0*n+i];
1660 
1661  chi_r.elem((1*n+i)/3).elem((1*n+i)%3) += tri_off_r.elem(1).elem(kij) * psi_r.elem((1*n+j)/3).elem((1*n+j)%3);
1662  // cchi[1*n+i] += tri[site].offd[1][kij] * ppsi[1*n+j];
1663 
1664  chi_r.elem((1*n+j)/3).elem((1*n+j)%3) += conj(tri_off_r.elem(1).elem(kij)) * psi_r.elem((1*n+i)/3).elem((1*n+i)%3);
1665  // cchi[1*n+j] += conj(tri[site].offd[1][kij]) * ppsi[1*n+i];
1666 
1667  kij++;
1668  }
1669  }
1670 
1671  chi_j = chi_r;
1672 
1673  loop.done();
1674 
1675  func.func().push_back( jit_function_epilogue_get("jit_apply_clov.ptx") );
1676  }
1677 
1678 
1679 
1680 
1681 
1682 
1683 
1684  /**
1685  * Apply a dslash
1686  *
1687  * Performs the operation
1688  *
1689  * chi <- (L + D + L^dag) . psi
1690  *
1691  * where
1692  * L is a lower triangular matrix
1693  * D is the real diagonal. (stored together in type TRIANG)
1694  *
1695  * Arguments:
1696  * \param chi result (Write)
1697  * \param psi source (Read)
1698  * \param isign D'^dag or D' ( MINUS | PLUS ) resp. (Read)
1699  * \param cb Checkerboard of OUTPUT std::vector (Read)
1700  */
1701  template<typename T, typename U>
1703  enum PlusMinus isign, int cb) const
1704  {
1705  START_CODE();
1706 
1707  if ( Ns != 4 ) {
1708  QDPIO::cerr << __func__ << ": CloverTerm::apply requires Ns==4" << std::endl;
1709  QDP_abort(1);
1710  }
1711 
1712  //QDPIO::cout << "LLVM Clover apply" << (void*)this << "\n";
1713  //std::cout << "LLVM Clover apply" << (void*)this << "\n";
1714  static JitFunction function;
1715 
1716  if (!function.built()) {
1717  QDPIO::cout << "Building JIT clover apply function\n";
1718  function_apply_clov_build(function, chi, psi, tri_dia, tri_off, rb[cb] );
1719  }
1720 
1721  // Execute the function
1722  function_apply_clov_exec(function, chi, psi, tri_dia, tri_off, rb[cb] );
1723 
1724  (*this).getFermBC().modifyF(chi, QDP::rb[cb]);
1725 
1726  END_CODE();
1727  }
1728 
1729 
1730 
1731 
1732 
1733  namespace QDPCloverEnv {
1734  template<typename R,typename TD,typename TO>
1735  struct QUDAPackArgs {
1736  int cb;
1737  multi1d<QUDAPackedClovSite<R> >& quda_array;
1738  const TD& tri_dia;
1739  const TO& tri_off;
1740  };
1741 
1742  template<typename R,typename TD,typename TO>
1743  void qudaPackSiteLoop(int lo, int hi, int myId, QUDAPackArgs<R,TD,TO>* a) {
1744  int cb = a->cb;
1745  int Ns2 = Ns/2;
1746 
1747  multi1d<QUDAPackedClovSite<R> >& quda_array = a->quda_array;
1748 
1749  const TD& tri_dia = a->tri_dia;
1750  const TO& tri_off = a->tri_off;
1751 
1752  const int idtab[15]={0,1,3,6,10,2,4,7,11,5,8,12,9,13,14};
1753 
1754  for(int ssite=lo; ssite < hi; ++ssite) {
1755  int site = rb[cb].siteTable()[ssite];
1756  // First Chiral Block
1757  for(int i=0; i < 6; i++) {
1758  quda_array[site].diag1[i] = tri_dia.elem(site).comp[0].diag[i].elem().elem();
1759  }
1760 
1761  int target_index=0;
1762 
1763  for(int col=0; col < Nc*Ns2-1; col++) {
1764  for(int row=col+1; row < Nc*Ns2; row++) {
1765 
1766  int source_index = row*(row-1)/2 + col;
1767 
1768  quda_array[site].offDiag1[target_index][0] = tri_off.elem(site).comp[0].offd[source_index].real().elem();
1769  quda_array[site].offDiag1[target_index][1] = tri_off.elem(site).comp[0].offd[source_index].imag().elem();
1770  target_index++;
1771  }
1772  }
1773  // Second Chiral Block
1774  for(int i=0; i < 6; i++) {
1775  quda_array[site].diag2[i] = tri_dia.elem(site).comp[1].diag[i].elem().elem();
1776  }
1777 
1778  target_index=0;
1779  for(int col=0; col < Nc*Ns2-1; col++) {
1780  for(int row=col+1; row < Nc*Ns2; row++) {
1781 
1782  int source_index = row*(row-1)/2 + col;
1783 
1784  quda_array[site].offDiag2[target_index][0] = tri_off.elem(site).comp[1].offd[source_index].real().elem();
1785  quda_array[site].offDiag2[target_index][1] = tri_off.elem(site).comp[1].offd[source_index].imag().elem();
1786  target_index++;
1787  }
1788  }
1789  }
1790  QDPIO::cout << "\n";
1791  }
1792  }
1793 
1794  template<typename T, typename U>
1795  void LLVMCloverTermT<T,U>::packForQUDA(multi1d<QUDAPackedClovSite<typename WordType<T>::Type_t> >& quda_array, int cb) const
1796  {
1797  typedef typename WordType<T>::Type_t REALT;
1798  int num_sites = rb[cb].siteTable().size();
1799 
1800  typedef OLattice<PComp<PTriDia<RScalar <Word<REALT> > > > > TD;
1801  typedef OLattice<PComp<PTriOff<RComplex<Word<REALT> > > > > TO;
1802 
1803  StopWatch watch;
1804  watch.start();
1805 
1806  QDPCloverEnv::QUDAPackArgs<REALT,TD,TO> args = { cb, quda_array , tri_dia , tri_off };
1807  dispatch_to_threads(num_sites, args, QDPCloverEnv::qudaPackSiteLoop<REALT,TD,TO>);
1808 
1809  watch.stop();
1810  PackForQUDATimer::Instance().get() += watch.getTimeInMicroseconds();
1811  }
1812 
1813 
1814 
1815  template<typename T, typename U>
1817  enum PlusMinus isign, int site) const
1818  {
1819  QDP_error_exit("LLVMCloverTermT<T,U>::applySite(T& chi, const T& psi,..) not implemented ");
1820 #if 0
1821 #endif
1822  }
1823 
1824 
1828 } // End Namespace Chroma
1829 
1830 
1831 #endif
Base class for all fermion action boundary conditions.
Definition: fermbc.h:20
Support class for fermion actions and linear operators.
Definition: state.h:94
Class for counted reference semantics.
Definition: handle.h:33
OScalar< PScalar< PScalar< RScalar< Word< REALT > > > > > RealT
~LLVMCloverTermT()
No real need for cleanup here.
OLattice< PComp< PTriDia< RScalar< Word< REALT > > > > > DiagType
const FermBC< T, multi1d< U >, multi1d< U > > & getFermBC() const
Return the fermion BC object for this linear operator.
void ldagdlinv(LatticeREAL &tr_log_diag, int cb)
Invert the clover term on cb.
void makeClov(const multi1d< U > &f, const RealT &diag_mass)
Create the clover term on cb.
void apply(T &chi, const T &psi, enum PlusMinus isign, int cb) const
OLattice< PComp< PTriDia< RScalar< Word< REALT > > > > > tri_dia
OLattice< PScalar< PScalar< RScalar< Word< REALT > > > > > LatticeREAL
CloverFermActParams param
OLattice< PComp< PTriOff< RComplex< Word< REALT > > > > > OffDiagType
void create(Handle< FermState< T, multi1d< U >, multi1d< U > > > fs, const CloverFermActParams &param_)
Creation routine.
Handle< FermBC< T, multi1d< U >, multi1d< U > > > fbc
const OffDiagType & getOffDiagBuffer() const
void applySite(T &chi, const T &psi, enum PlusMinus isign, int site) const
OLattice< PComp< PTriOff< RComplex< Word< REALT > > > > > tri_off
void choles(int cb)
Computes the inverse of the term on cb using Cholesky.
const DiagType & getDiagBuffer() const
void triacntr(U &B, int mat, int cb) const
Calculates Tr_D ( Gamma_mat L )
Double cholesDet(int cb) const
Computes the inverse of the term on cb using Cholesky.
Real getCloverCoeff(int mu, int nu) const
Calculates Tr_D ( Gamma_mat L )
LLVMCloverTermT()
Empty constructor. Must use create later.
WordType< T >::Type_t REALT
const multi1d< U > & getU() const
Get the u field.
void packForQUDA(multi1d< QUDAPackedClovSite< REALT > > &quda_pack, int cb) const
PACK UP the Clover term for QUDA library:
const double & get() const
static PackForQUDATimer & Instance()
Parameters for Clover fermion action.
Clover term linear operator.
int mu
Definition: cool.cc:24
int nu
Definition: cool.cc:25
void block(LatticeColorMatrix &u_block, const multi1d< LatticeColorMatrix > &u, int mu, int bl_level, const Real &BlkAccu, int BlkMax, int j_decay)
Construct block links.
Definition: block.cc:42
void function_triacntr_exec(const JitFunction &function, U &B, const X &tri_dia, const Y &tri_off, int mat, const Subset &s)
TRIACNTR.
unsigned elem_ij
Definition: ldumul_w.cc:38
unsigned s
Definition: ldumul_w.cc:37
unsigned j
Definition: ldumul_w.cc:35
unsigned elem_ji
Definition: ldumul_w.cc:39
unsigned i
Definition: ldumul_w.cc:34
unsigned n
Definition: ldumul_w.cc:36
Calculates the antihermitian field strength tensor iF(mu,nu)
Nd
Definition: meslate.cc:74
void qudaPackSiteLoop(int lo, int hi, int myId, QUDAPackArgs< R, TD, TO > *a)
Asqtad Staggered-Dirac operator.
Definition: klein_gord.cc:10
LLVMCloverTermT< LatticeFermionF, LatticeColorMatrixF > LLVMCloverTermF
QDP_error_exit("too many BiCG iterations", n_count, rsd_sq, cp, c, re_rvr, im_rvr, re_a, im_a, re_b, im_b)
void function_make_clov_exec(const JitFunction &function, const RealT &diag_mass, const U &f0, const U &f1, const U &f2, const U &f3, const U &f4, const U &f5, X &tri_dia, Y &tri_off)
static multi1d< LatticeColorMatrix > u
void function_ldagdlinv_exec(const JitFunction &function, T &tr_log_diag, X &tri_dia, Y &tri_off, const Subset &s)
void function_apply_clov_exec(const JitFunction &function, T &chi, const T &psi, const X &tri_dia, const Y &tri_off, const Subset &s)
Double one
Definition: invbicg.cc:105
LinOpSysSolverMGProtoClover::T T
int i
Definition: pbg5p_w.cc:55
void function_triacntr_build(JitFunction &func, const U &B, const X &tri_dia, const Y &tri_off, int mat, const Subset &s)
LLVMCloverTermT< LatticeFermionD, LatticeColorMatrixD > LLVMCloverTermD
void function_ldagdlinv_build(JitFunction &func, const T &tr_log_diag, const X &tri_dia, const Y &tri_off, const Subset &s)
multi1d< LatticeFermion > chi(Ncb)
void mesField(multi1d< LatticeColorMatrixF > &f, const multi1d< LatticeColorMatrixF > &u)
Calculates the antihermitian field strength tensor iF(mu,nu)
Definition: mesfield.cc:80
Complex a
Definition: invbicg.cc:95
LatticeFermion psi
Definition: mespbg5p_w.cc:35
void function_make_clov_build(JitFunction &func, const RealT &diag_mass, const U &f0, const U &f1, const U &f2, const U &f3, const U &f4, const U &f5, const X &tri_dia, const Y &tri_off)
START_CODE()
void function_apply_clov_build(JitFunction &func, const T &chi, const T &psi, const X &tri_dia, const Y &tri_off, const Subset &s)
int cb
Definition: invbicg.cc:120
LLVMCloverTermT< LatticeFermion, LatticeColorMatrix > LLVMCloverTerm
Double zero
Definition: invbicg.cc:106
int k
Definition: invbicg.cc:119
multi1d< LatticeFermion > s(Ncb)
const T1 const T2 const T3 & f3
Definition: gtest.h:1327
const T1 const T2 & f2
Definition: gtest.h:1321
const T1 & f1
Definition: gtest.h:1316
const T1 const T2 const T3 const T4 const T5 & f5
Definition: gtest.h:1339
const T1 const T2 const T3 const T4 & f4
Definition: gtest.h:1333
FloatingPoint< double > Double
Definition: gtest.h:7351
int l
Definition: pade_trln_w.cc:111
Double sum
Definition: qtopcor.cc:37
Support class for fermion actions and linear operators.
Params for clover ferm acts.
multi1d< QUDAPackedClovSite< R > > & quda_array
PCompJIT< typename JITType< T >::Type_t > Type_t
PCompJIT< typename JITType< T >::Type_t > Type_t
PTriDiaJIT< typename JITType< T >::Type_t > Type_t
PTriDiaJIT< typename JITType< T >::Type_t > Type_t
PTriOffJIT< typename JITType< T >::Type_t > Type_t
PTriOffJIT< typename JITType< T >::Type_t > Type_t
PCompJIT & operator=(const PCompREG< T1 > &rhs)
const T & elem(int i) const
void setup(const PCompJIT< typename JITType< T >::Type_t > &rhs)
const T & elem(int i) const
const T & elem(int i) const
PTriDiaJIT & operator=(const PTriDiaREG< T1 > &rhs)
const T & elem(int i) const
void setup(const PTriDiaJIT< typename JITType< T >::Type_t > &rhs)
PTriOffJIT & operator=(const PTriOffREG< T1 > &rhs)
const T & elem(int i) const
T F[2 *Nc *Nc-Nc]
const T & elem(int i) const
void setup(const PTriOffJIT< typename JITType< T >::Type_t > &rhs)
T offd[2 *Nc *Nc-Nc]
PCompREG< typename REGType< T >::Type_t > Type_t
PTriDiaREG< typename REGType< T >::Type_t > Type_t
PTriOffREG< typename REGType< T >::Type_t > Type_t
WordType< T >::Type_t Type_t
multi1d< LatticeColorMatrix > U
LatticeFermion T
Definition: t_clover.cc:11
func
Definition: t_preccfz.cc:17
LatticeFermionD TD
Definition: t_quda_tprec.cc:22