CHROMA
clover_term_nvvm_w.h
Go to the documentation of this file.
1 // -*- C++ -*-
2 /*! \file
3  * \brief Clover term linear operator
4  */
5 
6 #ifndef __clover_term_nvvm_w_h__
7 #define __clover_term_nvvm_w_h__
8 
9 #warning "Using QDP-JIT/NVVM clover"
10 
11 #include "state.h"
14 #include "meas/glue/mesfield.h"
15 
16 //#define QDP_JIT_NVVM_USE_LEGACY_LAUNCH
17 
18 namespace QDP
19 {
20  class PackForQUDATimer {
21  double acc_time;
23  public:
25  static PackForQUDATimer singleton;
26  return singleton;
27  }
28 
29  double& get() { return acc_time; }
30  const double& get() const { return acc_time; }
31  };
32 
33 
34  template<typename T>
35  struct PComp
36  {
37  typedef T Sub_t;
38  enum { ThisSize = 2 };
39  T comp[2];
40  };
41 
42  template<class T> struct PCompREG;
43 
44  template<typename T>
45  struct PCompJIT: public BaseJIT<T,2>
46  {
47  template<class T1>
49  //std::cout << __PRETTY_FUNCTION__ << "\n";
50  elem(0) = rhs.elem(0);
51  elem(1) = rhs.elem(1);
52  return *this;
53  }
54 
55 
56  inline T& elem(int i) { return this->arrayF(i); }
57  inline const T& elem(int i) const { return this->arrayF(i); }
58  };
59 
60  template<class T>
61  struct PCompREG
62  {
63  T F[2];
64  void setup(const PCompJIT< typename JITType<T>::Type_t >& rhs ) {
65  F[0].setup( rhs.elem(0) );
66  F[1].setup( rhs.elem(1) );
67  }
68  inline T& elem(int i) { return F[i]; }
69  inline const T& elem(int i) const { return F[i]; }
70  };
71 
72  template<class T>
73  struct JITType<PComp<T> >
74  {
76  };
77 
78  template<class T>
79  struct JITType<PCompREG<T> >
80  {
82  };
83 
84  template<class T>
85  struct REGType<PCompJIT<T> >
86  {
88  };
89 
90  template<class T>
91  struct WordType<PComp<T> >
92  {
93  typedef typename WordType<T>::Type_t Type_t;
94  };
95 
96  template<class T>
97  struct WordType<PCompJIT<T> >
98  {
99  typedef typename WordType<T>::Type_t Type_t;
100  };
101 
102 
103 
104 
105 
106 
107  template<typename T>
108  struct PTriDia
109  {
110  typedef T Sub_t;
111  enum { ThisSize = 2*Nc };
112  T diag[2*Nc];
113  };
114 
115  template<class T> struct PTriDiaREG;
116 
117  template<typename T>
118  struct PTriDiaJIT: public BaseJIT<T,2*Nc>
119  {
120  template<class T1>
122  //std::cout << __PRETTY_FUNCTION__ << "\n";
123  for ( int i = 0 ; i < 2 * Nc ; i++ )
124  elem(i) = rhs.elem(i);
125  return *this;
126  }
127 
128  inline T& elem(int i) { return this->arrayF(i); }
129  inline const T& elem(int i) const { return this->arrayF(i); }
130  };
131 
132  template<class T>
133  struct PTriDiaREG
134  {
135  T F[2*Nc];
136  void setup(const PTriDiaJIT< typename JITType<T>::Type_t >& rhs ) {
137  for (int i=0;i<2*Nc;++i)
138  F[i].setup( rhs.elem(i) );
139  }
140  inline T& elem(int i) { return F[i]; }
141  inline const T& elem(int i) const { return F[i]; }
142  };
143 
144  template<class T>
145  struct JITType<PTriDia<T> >
146  {
148  };
149 
150  template<class T>
151  struct JITType<PTriDiaREG<T> >
152  {
154  };
155 
156  template<class T>
157  struct REGType<PTriDiaJIT<T> >
158  {
160  };
161 
162  template<class T>
163  struct WordType<PTriDia<T> >
164  {
165  typedef typename WordType<T>::Type_t Type_t;
166  };
167 
168  template<class T>
169  struct WordType<PTriDiaJIT<T> >
170  {
171  typedef typename WordType<T>::Type_t Type_t;
172  };
173 
174 
175 
176 
177 
178 
179  template<typename T>
180  struct PTriOff
181  {
182  typedef T Sub_t;
183  enum { ThisSize = 2*Nc*Nc-Nc };
184  T offd[2*Nc*Nc-Nc];
185  };
186 
187  template<class T> struct PTriOffREG;
188 
189  template<typename T>
190  struct PTriOffJIT: public BaseJIT<T,2*Nc*Nc-Nc>
191  {
192  template<class T1>
194  //std::cout << __PRETTY_FUNCTION__ << "\n";
195  for ( int i = 0 ; i < 2*Nc*Nc-Nc ; i++ )
196  elem(i) = rhs.elem(i);
197  return *this;
198  }
199 
200  inline T& elem(int i) { return this->arrayF(i); }
201  inline const T& elem(int i) const { return this->arrayF(i); }
202  };
203 
204  template<class T>
205  struct PTriOffREG
206  {
207  T F[2*Nc*Nc-Nc];
208  void setup(const PTriOffJIT< typename JITType<T>::Type_t >& rhs ) {
209  for (int i=0;i<2*Nc*Nc-Nc;++i)
210  F[i].setup( rhs.elem(i) );
211  }
212  inline T& elem(int i) { return F[i]; }
213  inline const T& elem(int i) const { return F[i]; }
214  };
215 
216  template<class T>
217  struct JITType<PTriOff<T> >
218  {
220  };
221 
222  template<class T>
223  struct JITType<PTriOffREG<T> >
224  {
226  };
227 
228  template<class T>
229  struct REGType<PTriOffJIT<T> >
230  {
232  };
233 
234  template<class T>
235  struct WordType<PTriOff<T> >
236  {
237  typedef typename WordType<T>::Type_t Type_t;
238  };
239 
240  template<class T>
241  struct WordType<PTriOffJIT<T> >
242  {
243  typedef typename WordType<T>::Type_t Type_t;
244  };
245 
246 
247 #if defined(QDP_USE_PROFILING)
248 template<class T>
249 struct LeafFunctor<PComp<T>, PrintTag>
250 {
251  typedef int Type_t;
252  static int apply(const PComp<T> &s, const PrintTag &f)
253  {
254  //LeafFunctor<T,PrintTag>::apply(s.elem(),f);
255  return 0;
256  }
257 };
258 template<class T>
259 struct LeafFunctor<PTriDia<T>, PrintTag>
260 {
261  typedef int Type_t;
262  static int apply(const PTriDia<T> &s, const PrintTag &f)
263  {
264  //LeafFunctor<T,PrintTag>::apply(s.elem(),f);
265  return 0;
266  }
267 };
268 template<class T>
269 struct LeafFunctor<PTriOff<T>, PrintTag>
270 {
271  typedef int Type_t;
272  static int apply(const PTriOff<T> &s, const PrintTag &f)
273  {
274  //LeafFunctor<T,PrintTag>::apply(s.elem(),f);
275  return 0;
276  }
277 };
278 #endif
279 
280 
281 
282 }
283 
284 
285 
286 namespace Chroma
287 {
288  template<typename R>
289  struct QUDAPackedClovSite {
290  R diag1[6];
291  R offDiag1[15][2];
292  R diag2[6];
293  R offDiag2[15][2];
294  };
295 
296 
297  template<typename T, typename U>
298  class NVVMCloverTermT : public CloverTermBase<T, U>
299  {
300  public:
301  // Typedefs to save typing
302  typedef typename WordType<T>::Type_t REALT;
303 
304  typedef OLattice< PScalar< PScalar< RScalar< Word< REALT> > > > > LatticeREAL;
305  typedef OScalar< PScalar< PScalar< RScalar< Word< REALT> > > > > RealT;
306 
307  //! Empty constructor. Must use create later
308  NVVMCloverTermT();
309 
310  //! No real need for cleanup here
312 
313  //! Creation routine
314  void create(Handle< FermState<T, multi1d<U>, multi1d<U> > > fs,
315  const CloverFermActParams& param_);
316 
317  virtual void create(Handle< FermState<T, multi1d<U>, multi1d<U> > > fs,
318  const CloverFermActParams& param_,
319  const NVVMCloverTermT<T,U>& from_);
320 
321  //! Computes the inverse of the term on cb using Cholesky
322  /*!
323  * \param cb checkerboard of work (Read)
324  */
325  void choles(int cb);
326 
327  //! Computes the inverse of the term on cb using Cholesky
328  /*!
329  * \param cb checkerboard of work (Read)
330  * \return logarithm of the determinant
331  */
332  Double cholesDet(int cb) const ;
333 
334  /**
335  * Apply a dslash
336  *
337  * Performs the operation
338  *
339  * chi <- (L + D + L^dag) . psi
340  *
341  * where
342  * L is a lower triangular matrix
343  * D is the real diagonal. (stored together in type TRIANG)
344  *
345  * Arguments:
346  * \param chi result (Write)
347  * \param psi source (Read)
348  * \param isign D'^dag or D' ( MINUS | PLUS ) resp. (Read)
349  * \param cb Checkerboard of OUTPUT std::vector (Read)
350  */
351  void apply (T& chi, const T& psi, enum PlusMinus isign, int cb) const;
352 
353 
354  void applySite(T& chi, const T& psi, enum PlusMinus isign, int site) const;
355 
356  //! Calculates Tr_D ( Gamma_mat L )
357  void triacntr(U& B, int mat, int cb) const;
358 
359  //! Return the fermion BC object for this linear operator
360  const FermBC<T, multi1d<U>, multi1d<U> >& getFermBC() const {return *fbc;}
361 
362  //! PACK UP the Clover term for QUDA library:
363  void packForQUDA(multi1d<QUDAPackedClovSite<REALT> >& quda_pack, int cb) const;
364 
365  int getDiaId() const { return tri_dia.getId(); }
366  int getOffId() const { return tri_off.getId(); }
367 
368 
369  protected:
370  //! Create the clover term on cb
371  /*!
372  * \param f field strength tensor F(mu,nu) (Read)
373  * \param cb checkerboard (Read)
374  */
375  void makeClov(const multi1d<U>& f, const RealT& diag_mass);
376 
377  //! Invert the clover term on cb
378  //void chlclovms(LatticeREAL& log_diag, int cb);
379  void ldagdlinv(LatticeREAL& tr_log_diag, int cb);
380 
381  //! Get the u field
382  const multi1d<U>& getU() const {return u;}
383 
384  //! Calculates Tr_D ( Gamma_mat L )
385  Real getCloverCoeff(int mu, int nu) const;
386 
387 
388  private:
390  multi1d<U> u;
392  LatticeREAL tr_log_diag_; // Fill this out during create
393  // but save the global sum until needed.
394  multi1d<bool> choles_done; // Keep note of whether the decomposition has been done
395  // on a particular checkerboard.
396 
397  OLattice<PComp<PTriDia<RScalar <Word<REALT> > > > > tri_dia;
398  OLattice<PComp<PTriOff<RComplex<Word<REALT> > > > > tri_off;
399 
400  };
401 
402 
403  // Empty constructor. Must use create later
404  template<typename T, typename U>
406 
407  // Now copy
408  template<typename T, typename U>
409  void NVVMCloverTermT<T,U>::create(Handle< FermState<T,multi1d<U>,multi1d<U> > > fs,
410  const CloverFermActParams& param_,
411  const NVVMCloverTermT<T,U>& from)
412  {
413  START_CODE();
414 
415  //std::cout << "PTX Clover create from other " << (void*)this << "\n";
416 
417  u.resize(Nd);
418 
419  u = fs->getLinks();
420  fbc = fs->getFermBC();
421  param = param_;
422 
423  // Sanity check
424  if (fbc.operator->() == 0) {
425  QDPIO::cerr << "NVVMCloverTerm: error: fbc is null" << std::endl;
426  QDP_abort(1);
427  }
428 
429  {
430  RealT ff = where(param.anisoParam.anisoP, Real(1) / param.anisoParam.xi_0, Real(1));
431  param.clovCoeffR *= Real(0.5) * ff;
432  param.clovCoeffT *= Real(0.5);
433  }
434 
435  //
436  // Yuk. Some bits of knowledge of the dslash term are buried in the
437  // effective mass term. They show up here. If I wanted some more
438  // complicated dslash then this will have to be fixed/adjusted.
439  //
440  RealT diag_mass;
441  {
442  RealT ff = where(param.anisoParam.anisoP, param.anisoParam.nu / param.anisoParam.xi_0, Real(1));
443  diag_mass = 1 + (Nd-1)*ff + param.Mass;
444  }
445 
446 
447  /* Calculate F(mu,nu) */
448  //multi1d<LatticeColorMatrix> f;
449  //mesField(f, u);
450  //makeClov(f, diag_mass);
451 
452  choles_done.resize(rb.numSubsets());
453  for(int i=0; i < rb.numSubsets(); i++) {
454  choles_done[i] = from.choles_done[i];
455  }
456 
457  tr_log_diag_ = from.tr_log_diag_;
458 
459  tri_dia = from.tri_dia;
460  tri_off = from.tri_off;
461 
462  END_CODE();
463  }
464 
465 
466  //! Creation routine
467  template<typename T, typename U>
468  void NVVMCloverTermT<T,U>::create(Handle< FermState<T,multi1d<U>,multi1d<U> > > fs,
469  const CloverFermActParams& param_)
470  {
471  START_CODE();
472 
473  //std::cout << "PTX Clover create " << (void*)this << "\n";
474 
475  u.resize(Nd);
476 
477  u = fs->getLinks();
478  fbc = fs->getFermBC();
479  param = param_;
480 
481  // Sanity check
482  if (fbc.operator->() == 0) {
483  QDPIO::cerr << "NVVMCloverTerm: error: fbc is null" << std::endl;
484  QDP_abort(1);
485  }
486 
487  {
488  RealT ff = where(param.anisoParam.anisoP, Real(1) / param.anisoParam.xi_0, Real(1));
489  param.clovCoeffR *= RealT(0.5) * ff;
490  param.clovCoeffT *= RealT(0.5);
491  }
492 
493  //
494  // Yuk. Some bits of knowledge of the dslash term are buried in the
495  // effective mass term. They show up here. If I wanted some more
496  // complicated dslash then this will have to be fixed/adjusted.
497  //
498  RealT diag_mass;
499  {
500  RealT ff = where(param.anisoParam.anisoP, param.anisoParam.nu / param.anisoParam.xi_0, Real(1));
501  diag_mass = 1 + (Nd-1)*ff + param.Mass;
502  }
503 
504  /* Calculate F(mu,nu) */
505  multi1d<U> f;
506  mesField(f, u);
507  makeClov(f, diag_mass);
508 
509  choles_done.resize(rb.numSubsets());
510  for(int i=0; i < rb.numSubsets(); i++) {
511  choles_done[i] = false;
512  }
513 
514  END_CODE();
515  }
516 
517 
518  /*
519  * MAKCLOV
520  *
521  * In this routine, MAKCLOV calculates
522 
523  * 1 - (1/4)*sigma(mu,nu) F(mu,nu)
524 
525  * using F from mesfield
526 
527  * F(mu,nu) = (1/4) sum_p (1/2) [ U_p(x) - U^dag_p(x) ]
528 
529  * using basis of SPPROD and stores in a lower triangular matrix
530  * (no diagonal) plus real diagonal
531 
532  * where
533  * U_1 = u(x,mu)*u(x+mu,nu)*u_dag(x+nu,mu)*u_dag(x,nu)
534  * U_2 = u(x,nu)*u_dag(x-mu+nu,mu)*u_dag(x-mu,nu)*u(x-mu,mu)
535  * U_3 = u_dag(x-mu,mu)*u_dag(x-mu-nu,nu)*u(x-mu-nu,mu)*u(x-nu,nu)
536  * U_4 = u_dag(x-nu,nu)*u(x-nu,mu)*u(x-nu+mu,nu)*u_dag(x,mu)
537 
538  * and
539 
540  * | sigF(1) sigF(3) 0 0 |
541  * sigF = | sigF(5) -sigF(1) 0 0 |
542  * | 0 0 -sigF(0) -sigF(2) |
543  * | 0 0 -sigF(4) sigF(0) |
544  * where
545  * sigF(i) is a color matrix
546 
547  * sigF(0) = i*(ClovT*E_z + ClovR*B_z)
548  * = i*(ClovT*F(3,2) + ClovR*F(1,0))
549  * sigF(1) = i*(ClovT*E_z - ClovR*B_z)
550  * = i*(ClovT*F(3,2) - ClovR*F(1,0))
551  * sigF(2) = i*(E_+ + B_+)
552  * sigF(3) = i*(E_+ - B_+)
553  * sigF(4) = i*(E_- + B_-)
554  * sigF(5) = i*(E_- - B_-)
555  * i*E_+ = (i*ClovT*E_x - ClovT*E_y)
556  * = (i*ClovT*F(3,0) - ClovT*F(3,1))
557  * i*E_- = (i*ClovT*E_x + ClovT*E_y)
558  * = (i*ClovT*F(3,0) + ClovT*F(3,1))
559  * i*B_+ = (i*ClovR*B_x - ClovR*B_y)
560  * = (i*ClovR*F(2,1) + ClovR*F(2,0))
561  * i*B_- = (i*ClovR*B_x + ClovR*B_y)
562  * = (i*ClovR*F(2,1) - ClovR*F(2,0))
563 
564  * NOTE: I am using i*F of the usual F defined by UKQCD, Heatlie et.al.
565 
566  * NOTE: the above definitions assume that the time direction, t_dir,
567  * is 3. In general F(k,j) is multiplied with ClovT if either
568  * k=t_dir or j=t_dir, and with ClovR otherwise.
569 
570  *+++
571  * Here are some notes on the origin of this routine. NOTE, ClovCoeff or u0
572  * are not actually used in MAKCLOV.
573  *
574  * The clover mass term is suppose to act on a std::vector like
575  *
576  * chi = (1 - (ClovCoeff/u0^3) * kappa/4 * sum_mu sum_nu F(mu,nu)*sigma(mu,nu)) * psi
577 
578  * Definitions used here (NOTE: no "i")
579  * sigma(mu,nu) = gamma(mu)*gamma(nu) - gamma(nu)*gamma(mu)
580  * = 2*gamma(mu)*gamma(nu) for mu != nu
581  *
582  * chi = sum_mu sum_nu F(mu,nu)*gamma(mu)*gamma(nu)*psi for mu < nu
583  * = (1/2) * sum_mu sum_nu F(mu,nu)*gamma(mu)*gamma(nu)*psi for mu != nu
584  * = (1/4) * sum_mu sum_nu F(mu,nu)*sigma(mu,nu)*psi
585  *
586  *
587  * chi = (1 - (ClovCoeff/u0^3) * kappa/4 * sum_mu sum_nu F(mu,nu)*sigma(mu,nu)) * psi
588  * = psi - (ClovCoeff/u0^3) * kappa * chi
589  * == psi - kappa * chi
590  *
591  * We have absorbed ClovCoeff/u0^3 into kappa. A u0 was previously absorbed into kappa
592  * for compatibility to ancient conventions.
593  *---
594 
595  * Arguments:
596  * \param f field strength tensor F(cb,mu,nu) (Read)
597  * \param diag_mass effective mass term (Read)
598  */
599 
600  template<typename RealT,typename U,typename X,typename Y>
601  void function_make_clov_exec(CUfunction function,
602  const RealT& diag_mass,
603  const U& f0,
604  const U& f1,
605  const U& f2,
606  const U& f3,
607  const U& f4,
608  const U& f5,
609  X& tri_dia,
610  Y& tri_off)
611  {
612  AddressLeaf addr_leaf(all);
613 
614  forEach(diag_mass, addr_leaf, NullCombine());
615  forEach(f0, addr_leaf, NullCombine());
616  forEach(f1, addr_leaf, NullCombine());
617  forEach(f2, addr_leaf, NullCombine());
618  forEach(f3, addr_leaf, NullCombine());
619  forEach(f4, addr_leaf, NullCombine());
620  forEach(f5, addr_leaf, NullCombine());
621  forEach(tri_dia, addr_leaf, NullCombine());
622  forEach(tri_off, addr_leaf, NullCombine());
623 
624  // lo <= idx < hi
625  int lo = 0;
626  int hi = Layout::sitesOnNode();
627 
628 #ifndef QDP_JIT_NVVM_USE_LEGACY_LAUNCH
629  JitParam jit_lo( QDP_get_global_cache().addJitParamInt( lo ) );
630  JitParam jit_hi( QDP_get_global_cache().addJitParamInt( hi ) );
631 
632  std::vector<QDPCache::ArgKey> ids;
633  ids.push_back( jit_lo.get_id() );
634  ids.push_back( jit_hi.get_id() );
635  for(unsigned i=0; i < addr_leaf.ids.size(); ++i)
636  ids.push_back( addr_leaf.ids[i] );
637  jit_launch(function,Layout::sitesOnNode(),ids);
638 #else
639  std::vector<void*> addr;
640  addr.push_back( &lo );
641  addr.push_back( &hi );
642  for(unsigned i=0; i < addr_leaf.addr.size(); ++i) {
643  addr.push_back( &addr_leaf.addr[i] );
644  }
645  jit_launch(function,Layout::sitesOnNode(),addr);
646 #endif
647  }
648 
649 
650 
651  template<typename RealT,typename U,typename X,typename Y>
652  CUfunction function_make_clov_build(const RealT& diag_mass,
653  const U& f0,
654  const U& f1,
655  const U& f2,
656  const U& f3,
657  const U& f4,
658  const U& f5,
659  const X& tri_dia,
660  const Y& tri_off)
661  {
662  //std::cout << __PRETTY_FUNCTION__ << ": entering\n";
663  if (ptx_db::db_enabled) {
664  CUfunction func = llvm_ptx_db( __PRETTY_FUNCTION__ );
665  if (func)
666  return func;
667  }
668 
669  typedef typename WordType<RealT>::Type_t REALT;
670 
671  llvm_start_new_function();
672 
673  ParamRef p_lo = llvm_add_param<int>();
674  ParamRef p_hi = llvm_add_param<int>();
675 
676  ParamLeaf param_leaf;
677 
678  typedef typename LeafFunctor<RealT, ParamLeaf>::Type_t RealTJIT;
679  RealTJIT diag_mass_jit(forEach(diag_mass, param_leaf, TreeCombine()));
680 
681  typedef typename LeafFunctor<U, ParamLeaf>::Type_t UJIT;
682  UJIT f0_jit(forEach(f0, param_leaf, TreeCombine()));
683  UJIT f1_jit(forEach(f1, param_leaf, TreeCombine()));
684  UJIT f2_jit(forEach(f2, param_leaf, TreeCombine()));
685  UJIT f3_jit(forEach(f3, param_leaf, TreeCombine()));
686  UJIT f4_jit(forEach(f4, param_leaf, TreeCombine()));
687  UJIT f5_jit(forEach(f5, param_leaf, TreeCombine()));
688 
689  typedef typename LeafFunctor<X, ParamLeaf>::Type_t XJIT;
690  XJIT tri_dia_jit(forEach(tri_dia, param_leaf, TreeCombine()));
691 
692  typedef typename LeafFunctor<Y, ParamLeaf>::Type_t YJIT;
693  YJIT tri_off_jit(forEach(tri_off, param_leaf, TreeCombine()));
694 
695 
696  llvm_derefParam( p_lo );
697  llvm::Value * r_hi = llvm_derefParam( p_hi );
698  llvm::Value * r_idx = llvm_thread_idx();
699 
700  llvm_cond_exit( llvm_ge( r_idx , r_hi ) );
701 
702  auto& f0_j = f0_jit.elem(JitDeviceLayout::Coalesced , r_idx );
703  auto& f1_j = f1_jit.elem(JitDeviceLayout::Coalesced , r_idx );
704  auto& f2_j = f2_jit.elem(JitDeviceLayout::Coalesced , r_idx );
705  auto& f3_j = f3_jit.elem(JitDeviceLayout::Coalesced , r_idx );
706  auto& f4_j = f4_jit.elem(JitDeviceLayout::Coalesced , r_idx );
707  auto& f5_j = f5_jit.elem(JitDeviceLayout::Coalesced , r_idx );
708 
709  auto& tri_dia_j = tri_dia_jit.elem(JitDeviceLayout::Coalesced , r_idx );
710  auto& tri_off_j = tri_off_jit.elem(JitDeviceLayout::Coalesced , r_idx );
711 
712  typename REGType< typename RealTJIT::Subtype_t >::Type_t diag_mass_reg;
713  diag_mass_reg.setup( diag_mass_jit.elem() );
714 
715 
716 
717 
718  for(int jj = 0; jj < 2; jj++) {
719  for(int ii = 0; ii < 2*Nc; ii++) {
720  tri_dia_j.elem(jj).elem(ii) = diag_mass_reg.elem().elem();
721  //tri[site].diag[jj][ii] = diag_mass.elem().elem().elem();
722  }
723  }
724 
725 
726  RComplexREG<WordREG<REALT> > E_minus;
727  RComplexREG<WordREG<REALT> > B_minus;
728  RComplexREG<WordREG<REALT> > ctmp_0;
729  RComplexREG<WordREG<REALT> > ctmp_1;
730  RScalarREG<WordREG<REALT> > rtmp_0;
731  RScalarREG<WordREG<REALT> > rtmp_1;
732 
733 
734  for(int i = 0; i < Nc; ++i) {
735  ctmp_0 = f5_j.elem().elem(i,i);
736  ctmp_0 -= f0_j.elem().elem(i,i);
737  rtmp_0 = imag(ctmp_0);
738  tri_dia_j.elem(0).elem(i) += rtmp_0;
739 
740  tri_dia_j.elem(0).elem(i+Nc) -= rtmp_0;
741 
742  ctmp_1 = f5_j.elem().elem(i,i);
743  ctmp_1 += f0_j.elem().elem(i,i);
744  rtmp_1 = imag(ctmp_1);
745  tri_dia_j.elem(1).elem(i) -= rtmp_1;
746 
747  tri_dia_j.elem(1).elem(i+Nc) += rtmp_1;
748  }
749 
750  for(int i = 1; i < Nc; ++i) {
751  for(int j = 0; j < i; ++j) {
752 
753  int elem_ij = i*(i-1)/2 + j;
754  int elem_tmp = (i+Nc)*(i+Nc-1)/2 + j+Nc;
755 
756  ctmp_0 = f0_j.elem().elem(i,j);
757  ctmp_0 -= f5_j.elem().elem(i,j);
758  tri_off_j.elem(0).elem(elem_ij) = timesI(ctmp_0);
759 
760  zero_rep( tri_off_j.elem(0).elem(elem_tmp) );
761  tri_off_j.elem(0).elem(elem_tmp) -= tri_off_j.elem(0).elem(elem_ij);// * -1.0;
762 
763  ctmp_1 = f5_j.elem().elem(i,j);
764  ctmp_1 += f0_j.elem().elem(i,j);
765  tri_off_j.elem(1).elem(elem_ij) = timesI(ctmp_1);
766 
767  zero_rep( tri_off_j.elem(1).elem(elem_tmp) );
768  tri_off_j.elem(1).elem(elem_tmp) -= tri_off_j.elem(1).elem(elem_ij);
769  }
770  }
771 
772  for(int i = 0; i < Nc; ++i) {
773  for(int j = 0; j < Nc; ++j) {
774 
775  int elem_ij = (i+Nc)*(i+Nc-1)/2 + j;
776 
777  //E_minus = timesI(f2_j.elem().elem(i,j));
778  E_minus = f2_j.elem().elem(i,j);
779  E_minus = timesI( E_minus );
780 
781  E_minus += f4_j.elem().elem(i,j);
782 
783  //B_minus = timesI(f3_j.elem().elem(i,j));
784  B_minus = f3_j.elem().elem(i,j);
785  B_minus = timesI( B_minus );
786 
787  B_minus -= f1_j.elem().elem(i,j);
788 
789  tri_off_j.elem(0).elem(elem_ij) = B_minus - E_minus;
790 
791  tri_off_j.elem(1).elem(elem_ij) = E_minus + B_minus;
792  }
793  }
794 
795  // std::cout << __PRETTY_FUNCTION__ << ": leaving\n";
796 
797  return jit_function_epilogue_get_cuf("jit_make_clov.ptx" , __PRETTY_FUNCTION__ );
798  }
799 
800 
801 
802 
803  /* This now just sets up and dispatches... */
804  template<typename T, typename U>
805  void NVVMCloverTermT<T,U>::makeClov(const multi1d<U>& f, const RealT& diag_mass)
806  {
807  START_CODE();
808 
809  if ( Nd != 4 ){
810  QDPIO::cerr << __func__ << ": expecting Nd==4" << std::endl;
811  QDP_abort(1);
812  }
813 
814  if ( Ns != 4 ){
815  QDPIO::cerr << __func__ << ": expecting Ns==4" << std::endl;
816  QDP_abort(1);
817  }
818 
819  U f0 = f[0] * getCloverCoeff(0,1);
820  U f1 = f[1] * getCloverCoeff(0,2);
821  U f2 = f[2] * getCloverCoeff(0,3);
822  U f3 = f[3] * getCloverCoeff(1,2);
823  U f4 = f[4] * getCloverCoeff(1,3);
824  U f5 = f[5] * getCloverCoeff(2,3);
825 
826  //QDPIO::cout << "PTX Clover make " << (void*)this << "\n";
827  //std::cout << "PTX Clover make " << (void*)this << "\n";
828  static CUfunction function;
829 
830  if (function == NULL)
831  function = function_make_clov_build(diag_mass, f0,f1,f2,f3,f4,f5, tri_dia , tri_off );
832 
833  // Execute the function
834  function_make_clov_exec(function, diag_mass, f0,f1,f2,f3,f4,f5,tri_dia, tri_off);
835 
836  END_CODE();
837  }
838 
839 
840  //! Invert
841  /*!
842  * Computes the inverse of the term on cb using Cholesky
843  */
844  template<typename T, typename U>
846  {
847  START_CODE();
848 
849  // When you are doing the cholesky - also fill out the trace_log_diag piece)
850  // chlclovms(tr_log_diag_, cb);
851  // Switch to LDL^\dag inversion
852  ldagdlinv(tr_log_diag_,cb);
853 
854  END_CODE();
855  }
856 
857 
858  //! Invert
859  /*!
860  * Computes the inverse of the term on cb using Cholesky
861  *
862  * \return logarithm of the determinant
863  */
864  template<typename T, typename U>
866  {
867  START_CODE();
868 
869  if( choles_done[cb] == false )
870  {
871  QDPIO::cout << __func__ << ": Error: you have not done the Cholesky.on this operator on this subset" << std::endl;
872  QDPIO::cout << "You sure you should not be asking invclov?" << std::endl;
873  QDP_abort(1);
874  }
875 
876  LatticeREAL ff=tr_log_diag_;
877 
878 
879  END_CODE();
880 
881  // Need to thread generic sums in QDP++?
882  // Need to thread generic norm2() in QDP++?
883  return sum(tr_log_diag_, rb[cb]);
884  }
885 
886 
887 
888  template<typename T,typename X,typename Y>
889  void function_ldagdlinv_exec( CUfunction function,
890  T& tr_log_diag,
891  X& tri_dia,
892  Y& tri_off,
893  const Subset& s)
894  {
895  if (!s.hasOrderedRep())
896  QDP_error_exit("ldagdlinv on subset with unordered representation not implemented");
897 
898  AddressLeaf addr_leaf(s);
899 
900  forEach(tr_log_diag, addr_leaf, NullCombine());
901  forEach(tri_dia, addr_leaf, NullCombine());
902  forEach(tri_off, addr_leaf, NullCombine());
903 
904  // lo <= idx < hi
905  int lo = s.start();
906  int hi = s.end();
907 
908 #ifndef QDP_JIT_NVVM_USE_LEGACY_LAUNCH
909  JitParam jit_lo( QDP_get_global_cache().addJitParamInt( lo ) );
910  JitParam jit_hi( QDP_get_global_cache().addJitParamInt( hi ) );
911  std::vector<QDPCache::ArgKey> ids;
912  ids.push_back( jit_lo.get_id() );
913  ids.push_back( jit_hi.get_id() );
914  for(unsigned i=0; i < addr_leaf.ids.size(); ++i)
915  ids.push_back( addr_leaf.ids[i] );
916  jit_launch(function,s.numSiteTable(),ids);
917 #else
918  std::vector<void*> addr;
919  addr.push_back( &lo );
920  addr.push_back( &hi );
921  for(unsigned i=0; i < addr_leaf.addr.size(); ++i) {
922  addr.push_back( &addr_leaf.addr[i] );
923  }
924  jit_launch(function,s.numSiteTable(),addr);
925 #endif
926  }
927 
928 
929 
930 
931 
932  template<typename U,typename T,typename X,typename Y>
933  CUfunction function_ldagdlinv_build( const T& tr_log_diag,
934  const X& tri_dia,
935  const Y& tri_off,
936  const Subset& s)
937  {
938  typedef typename WordType<U>::Type_t REALT;
939 
940  if (ptx_db::db_enabled) {
941  CUfunction func = llvm_ptx_db( __PRETTY_FUNCTION__ );
942  if (func)
943  return func;
944  }
945 
946 
947  //std::cout << __PRETTY_FUNCTION__ << " entering\n";
948 
949  llvm_start_new_function();
950 
951  ParamRef p_lo = llvm_add_param<int>();
952  ParamRef p_hi = llvm_add_param<int>();
953 
954  ParamLeaf param_leaf;
955 
956  typedef typename LeafFunctor<T, ParamLeaf>::Type_t TJIT;
957  TJIT tr_log_diag_jit(forEach(tr_log_diag, param_leaf, TreeCombine()));
958 
959  typedef typename LeafFunctor<X, ParamLeaf>::Type_t XJIT;
960  XJIT tri_dia_jit(forEach(tri_dia, param_leaf, TreeCombine()));
961 
962  typedef typename LeafFunctor<Y, ParamLeaf>::Type_t YJIT;
963  YJIT tri_off_jit(forEach(tri_off, param_leaf, TreeCombine()));
964 
965  llvm::Value * r_lo = llvm_derefParam( p_lo );
966  llvm::Value * r_hi = llvm_derefParam( p_hi );
967  llvm::Value * r_idx_thread = llvm_thread_idx();
968 
969  llvm_cond_exit( llvm_gt( r_idx_thread , llvm_sub( r_hi , r_lo ) ) );
970 
971  llvm::Value * r_idx = llvm_add( r_lo , r_idx_thread );
972 
973  auto& tr_log_diag_j = tr_log_diag_jit.elem(JitDeviceLayout::Coalesced,r_idx);
974  auto& tri_dia_j = tri_dia_jit.elem(JitDeviceLayout::Coalesced,r_idx);
975  auto& tri_off_j = tri_off_jit.elem(JitDeviceLayout::Coalesced,r_idx);
976 
977  typename REGType< typename XJIT::Subtype_t >::Type_t tri_dia_r;
978  typename REGType< typename YJIT::Subtype_t >::Type_t tri_off_r;
979 
980  tri_dia_r.setup( tri_dia_j );
981  tri_off_r.setup( tri_off_j );
982 
983 
984  RScalarREG<WordREG<REALT> > zip;
985  zero_rep(zip);
986  int N = 2*Nc;
987 
988  //int site_neg_logdet=0;
989 
990  for(int block=0; block < 2; block++) {
991 
992  RScalarREG<WordREG<REALT> > inv_d[6] ;
993  RComplexREG<WordREG<REALT> > inv_offd[15] ;
994  RComplexREG<WordREG<REALT> > v[6] ;
995  RScalarREG<WordREG<REALT> > diag_g[6] ;
996 
997  for(int i=0; i < N; i++) {
998  inv_d[i] = tri_dia_r.elem(block).elem(i);
999  }
1000 
1001  for(int i=0; i < 15; i++) {
1002  inv_offd[i] = tri_off_r.elem(block).elem(i);
1003  }
1004 
1005 
1006  for(int j=0; j < N; ++j) {
1007 
1008  for(int i=0; i < j; i++) {
1009  int elem_ji = j*(j-1)/2 + i;
1010 
1011  RComplexREG<WordREG<REALT> > A_ii = cmplx( inv_d[i], zip );
1012  v[i] = A_ii*adj(inv_offd[elem_ji]);
1013  }
1014 
1015 
1016  v[j] = cmplx(inv_d[j],zip);
1017 
1018  for(int k=0; k < j; k++) {
1019  int elem_jk = j*(j-1)/2 + k;
1020  v[j] -= inv_offd[elem_jk]*v[k];
1021  }
1022 
1023  inv_d[j] = real( v[j] );
1024 
1025  for(int k=j+1; k < N; k++) {
1026  int elem_kj = k*(k-1)/2 + j;
1027  for(int l=0; l < j; l++) {
1028  int elem_kl = k*(k-1)/2 + l;
1029  inv_offd[elem_kj] -= inv_offd[elem_kl] * v[l];
1030  }
1031  inv_offd[elem_kj] /= v[j];
1032  }
1033  }
1034 
1035 
1036  // Now fix up the inverse
1037  RScalarREG<WordREG<REALT> > one(1.0);
1038  //one.elem() = (REALT)1;
1039 
1040  for(int i=0; i < N; i++) {
1041  diag_g[i] = one/inv_d[i];
1042 
1043  // Compute the trace log
1044  // NB we are always doing trace log | A |
1045  // (because we are always working with actually A^\dagger A
1046  // even in one flavour case where we square root)
1047  tr_log_diag_j.elem().elem() += log(fabs(inv_d[i]));
1048  // However, it is worth counting just the no of negative logdets
1049  // on site
1050 #if 0
1051  if( inv_d[i].elem() < 0 ) {
1052  site_neg_logdet++;
1053  }
1054 #endif
1055  }
1056 
1057  // Now we need to invert the L D L^\dagger
1058  // We can do this by solving:
1059  //
1060  // L D L^\dagger M^{-1} = 1
1061  //
1062  // This can be done by solving L D X = 1 (X = L^\dagger M^{-1})
1063  //
1064  // Then solving L^\dagger M^{-1} = X
1065  //
1066  // LD is lower diagonal and so X will also be lower diagonal.
1067  // LD X = 1 can be solved by forward substitution.
1068  //
1069  // Likewise L^\dagger is strictly upper triagonal and so
1070  // L^\dagger M^{-1} = X can be solved by forward substitution.
1071  RComplexREG<WordREG<REALT> > sum;
1072  for(int k = 0; k < N; ++k) {
1073 
1074  for(int i = 0; i < k; ++i) {
1075  zero_rep(v[i]);
1076  }
1077 
1078  /*# Forward substitution */
1079 
1080  // The first element is the inverse of the diagonal
1081  v[k] = cmplx(diag_g[k],zip);
1082 
1083  for(int i = k+1; i < N; ++i) {
1084  zero_rep(v[i]);
1085 
1086  for(int j = k; j < i; ++j) {
1087  int elem_ij = i*(i-1)/2+j;
1088 
1089  // subtract l_ij*d_j*x_{kj}
1090  v[i] -= inv_offd[elem_ij] *inv_d[j]*v[j];
1091 
1092  }
1093 
1094  // scale out by 1/d_i
1095  v[i] *= diag_g[i];
1096  }
1097 
1098  /*# Backward substitution */
1099  // V[N-1] remains unchanged
1100  // Start from V[N-2]
1101 
1102  for(int i = N-2; (int)i >= (int)k; --i) {
1103  for(int j = i+1; j < N; ++j) {
1104  int elem_ji = j*(j-1)/2 + i;
1105  // Subtract terms of typ (l_ji)*x_kj
1106  v[i] -= adj(inv_offd[elem_ji]) * v[j];
1107  }
1108  }
1109 
1110  /*# Overwrite column k of invcl.offd */
1111  inv_d[k] = real(v[k]);
1112  for(int i = k+1; i < N; ++i) {
1113 
1114  int elem_ik = i*(i-1)/2+k;
1115  inv_offd[elem_ik] = v[i];
1116  }
1117  }
1118 
1119  // Overwrite original data
1120  for(int i=0; i < N; i++) {
1121  tri_dia_j.elem(block).elem(i) = inv_d[i];
1122  }
1123  for(int i=0; i < 15; i++) {
1124  tri_off_j.elem(block).elem(i) = inv_offd[i];
1125  }
1126  }
1127 
1128  // std::cout << __PRETTY_FUNCTION__ << " leaving\n";
1129 
1130  return jit_function_epilogue_get_cuf("jit_ldagdlinv.ptx" , __PRETTY_FUNCTION__ );
1131  }
1132 
1133 
1134 
1135 
1136 
1137  /*! An LDL^\dag decomposition and inversion? */
1138  template<typename T, typename U>
1140  {
1141  START_CODE();
1142 
1143  if ( 2*Nc < 3 )
1144  {
1145  QDPIO::cerr << __func__ << ": Matrix is too small" << std::endl;
1146  QDP_abort(1);
1147  }
1148 
1149  // Zero trace log
1150  tr_log_diag[rb[cb]] = zero;
1151 
1152  //QDPIO::cout << "PTX Clover ldagdlinv " << (void*)this << "\n";
1153  //std::cout << "PTX Clover ldagdlinv " << (void*)this << "\n";
1154  static CUfunction function;
1155 
1156  if (function == NULL)
1157  function = function_ldagdlinv_build<U>(tr_log_diag, tri_dia, tri_off, rb[cb] );
1158 
1159  // Execute the function
1160  function_ldagdlinv_exec(function, tr_log_diag, tri_dia, tri_off, rb[cb] );
1161 
1162  // This comes from the days when we used to do Cholesky
1163  choles_done[cb] = true;
1164  END_CODE();
1165  }
1166 
1167  /*! CHLCLOVMS - Cholesky decompose the clover mass term and uses it to
1168  * compute lower(A^-1) = lower((L.L^dag)^-1)
1169  * Adapted from Golub and Van Loan, Matrix Computations, 2nd, Sec 4.2.4
1170  *
1171  * Arguments:
1172  *
1173  * \param DetP flag whether to compute determinant (Read)
1174  * \param logdet logarithm of the determinant (Write)
1175  * \param cb checkerboard of work (Read)
1176  */
1177 
1178 
1179 
1180 
1181 
1182 
1183  //! TRIACNTR
1184  /*!
1185  * \ingroup linop
1186  *
1187  * Calculates
1188  * Tr_D ( Gamma_mat L )
1189  *
1190  * This routine is specific to Wilson fermions!
1191  *
1192  * the trace over the Dirac indices for one of the 16 Gamma matrices
1193  * and a hermitian color x spin matrix A, stored as a block diagonal
1194  * complex lower triangular matrix L and a real diagonal diag_L.
1195 
1196  * Here 0 <= mat <= 15 and
1197  * if mat = mat_1 + mat_2 * 2 + mat_3 * 4 + mat_4 * 8
1198  *
1199  * Gamma(mat) = gamma(1)^(mat_1) * gamma(2)^(mat_2) * gamma(3)^(mat_3)
1200  * * gamma(4)^(mat_4)
1201  *
1202  * Further, in basis for the Gamma matrices used, A is of the form
1203  *
1204  * | A_0 | 0 |
1205  * A = | --------- |
1206  * | 0 | A_1 |
1207  *
1208  *
1209  * Arguments:
1210  *
1211  * \param B the resulting SU(N) color matrix (Write)
1212  * \param clov clover term (Read)
1213  * \param mat label of the Gamma matrix (Read)
1214  */
1215 
1216 
1217  template<typename U,typename X,typename Y>
1218  void function_triacntr_exec( CUfunction function,
1219  U& B,
1220  const X& tri_dia,
1221  const Y& tri_off,
1222  int mat,
1223  const Subset& s)
1224  {
1225  if (!s.hasOrderedRep())
1226  QDP_error_exit("triacntr on subset with unordered representation not implemented");
1227 
1228  AddressLeaf addr_leaf(s);
1229 
1230  forEach(B, addr_leaf, NullCombine());
1231  forEach(tri_dia, addr_leaf, NullCombine());
1232  forEach(tri_off, addr_leaf, NullCombine());
1233 
1234  // lo <= idx < hi
1235  int lo = s.start();
1236  int hi = s.end();
1237 
1238 #ifndef QDP_JIT_NVVM_USE_LEGACY_LAUNCH
1239  JitParam jit_lo( QDP_get_global_cache().addJitParamInt( lo ) );
1240  JitParam jit_hi( QDP_get_global_cache().addJitParamInt( hi ) );
1241  JitParam jit_mat( QDP_get_global_cache().addJitParamInt( mat ) );
1242 
1243  std::vector<QDPCache::ArgKey> ids;
1244  ids.push_back( jit_lo.get_id() );
1245  ids.push_back( jit_hi.get_id() );
1246  ids.push_back( jit_mat.get_id() );
1247  for(unsigned i=0; i < addr_leaf.ids.size(); ++i)
1248  ids.push_back( addr_leaf.ids[i] );
1249  jit_launch(function,s.numSiteTable(),ids);
1250 #else
1251  std::vector<void*> addr;
1252  addr.push_back( &lo );
1253  addr.push_back( &hi );
1254  addr.push_back( &mat );
1255  for(unsigned i=0; i < addr_leaf.addr.size(); ++i) {
1256  addr.push_back( &addr_leaf.addr[i] );
1257  }
1258  jit_launch(function,s.numSiteTable(),addr);
1259 #endif
1260  }
1261 
1262 
1263 
1264 
1265  template<typename U,typename X,typename Y>
1266  CUfunction function_triacntr_build( const U& B,
1267  const X& tri_dia,
1268  const Y& tri_off,
1269  int mat,
1270  const Subset& s)
1271  {
1272  if (ptx_db::db_enabled) {
1273  CUfunction func = llvm_ptx_db( __PRETTY_FUNCTION__ );
1274  if (func)
1275  return func;
1276  }
1277 
1278 
1279  //std::cout << __PRETTY_FUNCTION__ << ": entering\n";
1280 
1281  typedef typename WordType<U>::Type_t REALT;
1282 
1283  llvm_start_new_function();
1284 
1285  ParamRef p_lo = llvm_add_param<int>();
1286  ParamRef p_hi = llvm_add_param<int>();
1287 
1288  ParamRef p_mat = llvm_add_param<int>();
1289 
1290  ParamLeaf param_leaf;
1291 
1292 
1293  typedef typename LeafFunctor<U, ParamLeaf>::Type_t UJIT;
1294  UJIT B_jit(forEach(B, param_leaf, TreeCombine()));
1295 
1296  typedef typename LeafFunctor<X, ParamLeaf>::Type_t XJIT;
1297  XJIT tri_dia_jit(forEach(tri_dia, param_leaf, TreeCombine()));
1298 
1299  typedef typename LeafFunctor<Y, ParamLeaf>::Type_t YJIT;
1300  YJIT tri_off_jit(forEach(tri_off, param_leaf, TreeCombine()));
1301 
1302  llvm::Value * r_mat = llvm_derefParam( p_mat );
1303  llvm::Value * r_lo = llvm_derefParam( p_lo );
1304  llvm::Value * r_hi = llvm_derefParam( p_hi );
1305  llvm::Value * r_idx_thread = llvm_thread_idx();
1306 
1307  llvm_cond_exit( llvm_gt( r_idx_thread , llvm_sub( r_hi , r_lo ) ) );
1308 
1309  llvm::Value * r_idx = llvm_add( r_lo , r_idx_thread );
1310 
1311  auto& B_j = B_jit.elem(JitDeviceLayout::Coalesced,r_idx);
1312  auto& tri_dia_j = tri_dia_jit.elem(JitDeviceLayout::Coalesced,r_idx);
1313  auto& tri_off_j = tri_off_jit.elem(JitDeviceLayout::Coalesced,r_idx);
1314 
1315  typename REGType< typename XJIT::Subtype_t >::Type_t tri_dia_r;
1316  typename REGType< typename YJIT::Subtype_t >::Type_t tri_off_r;
1317 
1318  tri_dia_r.setup( tri_dia_j );
1319  tri_off_r.setup( tri_off_j );
1320 
1321 
1322  llvm::BasicBlock * case_0 = llvm_new_basic_block();
1323  llvm::BasicBlock * case_3 = llvm_new_basic_block();
1324  llvm::BasicBlock * case_5 = llvm_new_basic_block();
1325  llvm::BasicBlock * case_6 = llvm_new_basic_block();
1326  llvm::BasicBlock * case_9 = llvm_new_basic_block();
1327  llvm::BasicBlock * case_10 = llvm_new_basic_block();
1328  llvm::BasicBlock * case_12 = llvm_new_basic_block();
1329  llvm::BasicBlock * case_default = llvm_new_basic_block();
1330 
1331  llvm::SwitchInst * mat_sw = llvm_switch( r_mat , case_default );
1332 
1333  mat_sw->addCase( llvm_create_const_int(0) , case_0 );
1334  mat_sw->addCase( llvm_create_const_int(3) , case_3 );
1335  mat_sw->addCase( llvm_create_const_int(5) , case_5 );
1336  mat_sw->addCase( llvm_create_const_int(6) , case_6 );
1337  mat_sw->addCase( llvm_create_const_int(9) , case_9 );
1338  mat_sw->addCase( llvm_create_const_int(10) , case_10 );
1339  mat_sw->addCase( llvm_create_const_int(12) , case_12 );
1340 
1341  /*# gamma( 0) 1 0 0 0 # ( 0000 ) --> 0 */
1342  /*# 0 1 0 0 */
1343  /*# 0 0 1 0 */
1344  /*# 0 0 0 1 */
1345  /*# From diagonal part */
1346  llvm_set_insert_point( case_0 );
1347  {
1348  RComplexREG<WordREG<REALT> > lctmp0;
1349  RScalarREG< WordREG<REALT> > lr_zero0;
1350  RScalarREG< WordREG<REALT> > lrtmp0;
1351 
1352  zero_rep(lr_zero0);
1353 
1354  for(int i0 = 0; i0 < Nc; ++i0) {
1355 
1356  lrtmp0 = tri_dia_r.elem(0).elem(i0);
1357  lrtmp0 += tri_dia_r.elem(0).elem(i0+Nc);
1358  lrtmp0 += tri_dia_r.elem(1).elem(i0);
1359  lrtmp0 += tri_dia_r.elem(1).elem(i0+Nc);
1360  B_j.elem().elem(i0,i0) = cmplx(lrtmp0,lr_zero0);
1361  }
1362 
1363  /*# From lower triangular portion */
1364  int elem_ij0 = 0;
1365  for(int i0 = 1; i0 < Nc; ++i0) {
1366 
1367  int elem_ijb0 = (i0+Nc)*(i0+Nc-1)/2 + Nc;
1368 
1369  for(int j0 = 0; j0 < i0; ++j0) {
1370 
1371  lctmp0 = tri_off_r.elem(0).elem(elem_ij0);
1372  lctmp0 += tri_off_r.elem(0).elem(elem_ijb0);
1373  lctmp0 += tri_off_r.elem(1).elem(elem_ij0);
1374  lctmp0 += tri_off_r.elem(1).elem(elem_ijb0);
1375 
1376  B_j.elem().elem(j0,i0) = lctmp0;
1377  B_j.elem().elem(i0,j0) = adj(lctmp0);
1378 
1379  elem_ij0++;
1380  elem_ijb0++;
1381  }
1382  }
1383  llvm_exit( );
1384  }
1385 
1386 
1387  llvm_set_insert_point( case_3 );
1388  {
1389  /*# gamma( 12) -i 0 0 0 # ( 0011 ) --> 3 */
1390  /*# 0 i 0 0 */
1391  /*# 0 0 -i 0 */
1392  /*# 0 0 0 i */
1393  /*# From diagonal part */
1394 
1395  RComplexREG<WordREG<REALT> > lctmp3;
1396  RScalarREG<WordREG<REALT> > lr_zero3;
1397  RScalarREG<WordREG<REALT> > lrtmp3;
1398 
1399  lr_zero3 = 0;
1400 
1401  for(int i3 = 0; i3 < Nc; ++i3) {
1402 
1403  lrtmp3 = tri_dia_r.elem(0).elem(i3+Nc);
1404  lrtmp3 -= tri_dia_r.elem(0).elem(i3);
1405  lrtmp3 -= tri_dia_r.elem(1).elem(i3);
1406  lrtmp3 += tri_dia_r.elem(1).elem(i3+Nc);
1407  B_j.elem().elem(i3,i3) = cmplx(lr_zero3,lrtmp3);
1408  }
1409 
1410  /*# From lower triangular portion */
1411  int elem_ij3 = 0;
1412  for(int i3 = 1; i3 < Nc; ++i3) {
1413 
1414  int elem_ijb3 = (i3+Nc)*(i3+Nc-1)/2 + Nc;
1415 
1416  for(int j3 = 0; j3 < i3; ++j3) {
1417 
1418  lctmp3 = tri_off_r.elem(0).elem(elem_ijb3);
1419  lctmp3 -= tri_off_r.elem(0).elem(elem_ij3);
1420  lctmp3 -= tri_off_r.elem(1).elem(elem_ij3);
1421  lctmp3 += tri_off_r.elem(1).elem(elem_ijb3);
1422 
1423  B_j.elem().elem(j3,i3) = timesI(adj(lctmp3));
1424  B_j.elem().elem(i3,j3) = timesI(lctmp3);
1425 
1426  elem_ij3++;
1427  elem_ijb3++;
1428  }
1429  }
1430  llvm_exit( );
1431  }
1432 
1433 
1434  llvm_set_insert_point( case_5 );
1435  {
1436  /*# gamma( 13) 0 -1 0 0 # ( 0101 ) --> 5 */
1437  /*# 1 0 0 0 */
1438  /*# 0 0 0 -1 */
1439  /*# 0 0 1 0 */
1440 
1441  RComplexREG<WordREG<REALT> > lctmp5;
1442  RScalarREG<WordREG<REALT> > lrtmp5;
1443 
1444  for(int i5 = 0; i5 < Nc; ++i5) {
1445 
1446  int elem_ij5 = (i5+Nc)*(i5+Nc-1)/2;
1447 
1448  for(int j5 = 0; j5 < Nc; ++j5) {
1449 
1450  int elem_ji5 = (j5+Nc)*(j5+Nc-1)/2 + i5;
1451 
1452 
1453  lctmp5 = adj(tri_off_r.elem(0).elem(elem_ji5));
1454  lctmp5 -= tri_off_r.elem(0).elem(elem_ij5);
1455  lctmp5 += adj(tri_off_r.elem(1).elem(elem_ji5));
1456  lctmp5 -= tri_off_r.elem(1).elem(elem_ij5);
1457 
1458  B_j.elem().elem(i5,j5) = lctmp5;
1459 
1460  elem_ij5++;
1461  }
1462  }
1463  llvm_exit( );
1464  }
1465 
1466 
1467  llvm_set_insert_point( case_6 );
1468  {
1469  /*# gamma( 23) 0 -i 0 0 # ( 0110 ) --> 6 */
1470  /*# -i 0 0 0 */
1471  /*# 0 0 0 -i */
1472  /*# 0 0 -i 0 */
1473 
1474  RComplexREG<WordREG<REALT> > lctmp6;
1475  RScalarREG<WordREG<REALT> > lrtmp6;
1476 
1477  for(int i6 = 0; i6 < Nc; ++i6) {
1478 
1479  int elem_ij6 = (i6+Nc)*(i6+Nc-1)/2;
1480 
1481  for(int j6 = 0; j6 < Nc; ++j6) {
1482 
1483  int elem_ji6 = (j6+Nc)*(j6+Nc-1)/2 + i6;
1484 
1485  lctmp6 = adj(tri_off_r.elem(0).elem(elem_ji6));
1486  lctmp6 += tri_off_r.elem(0).elem(elem_ij6);
1487  lctmp6 += adj(tri_off_r.elem(1).elem(elem_ji6));
1488  lctmp6 += tri_off_r.elem(1).elem(elem_ij6);
1489 
1490  B_j.elem().elem(i6,j6) = timesMinusI(lctmp6);
1491 
1492  elem_ij6++;
1493  }
1494  }
1495  llvm_exit( );
1496  }
1497 
1498 
1499  llvm_set_insert_point( case_9 );
1500  {
1501  /*# gamma( 14) 0 i 0 0 # ( 1001 ) --> 9 */
1502  /*# i 0 0 0 */
1503  /*# 0 0 0 -i */
1504  /*# 0 0 -i 0 */
1505 
1506  RComplexREG<WordREG<REALT> > lctmp9;
1507  RScalarREG<WordREG<REALT> > lrtmp9;
1508 
1509  for(int i9 = 0; i9 < Nc; ++i9) {
1510 
1511  int elem_ij9 = (i9+Nc)*(i9+Nc-1)/2;
1512 
1513  for(int j9 = 0; j9 < Nc; ++j9) {
1514 
1515  int elem_ji9 = (j9+Nc)*(j9+Nc-1)/2 + i9;
1516 
1517  lctmp9 = adj(tri_off_r.elem(0).elem(elem_ji9));
1518  lctmp9 += tri_off_r.elem(0).elem(elem_ij9);
1519  lctmp9 -= adj(tri_off_r.elem(1).elem(elem_ji9));
1520  lctmp9 -= tri_off_r.elem(1).elem(elem_ij9);
1521 
1522  B_j.elem().elem(i9,j9) = timesI(lctmp9);
1523 
1524  elem_ij9++;
1525  }
1526  }
1527  llvm_exit( );
1528  }
1529 
1530 
1531  llvm_set_insert_point( case_10 );
1532  {
1533  /*# gamma( 24) 0 -1 0 0 # ( 1010 ) --> 10 */
1534  /*# 1 0 0 0 */
1535  /*# 0 0 0 1 */
1536  /*# 0 0 -1 0 */
1537 
1538  RComplexREG<WordREG<REALT> > lctmp10;
1539  RScalarREG<WordREG<REALT> > lrtmp10;
1540 
1541  for(int i10 = 0; i10 < Nc; ++i10) {
1542 
1543  int elem_ij10 = (i10+Nc)*(i10+Nc-1)/2;
1544 
1545  for(int j10 = 0; j10 < Nc; ++j10) {
1546 
1547  int elem_ji10 = (j10+Nc)*(j10+Nc-1)/2 + i10;
1548 
1549  lctmp10 = adj(tri_off_r.elem(0).elem(elem_ji10));
1550  lctmp10 -= tri_off_r.elem(0).elem(elem_ij10);
1551  lctmp10 -= adj(tri_off_r.elem(1).elem(elem_ji10));
1552  lctmp10 += tri_off_r.elem(1).elem(elem_ij10);
1553 
1554  B_j.elem().elem(i10,j10) = lctmp10;
1555 
1556  elem_ij10++;
1557  }
1558  }
1559  llvm_exit( );
1560  }
1561 
1562 
1563  llvm_set_insert_point( case_12 );
1564  {
1565  /*# gamma( 34) i 0 0 0 # ( 1100 ) --> 12 */
1566  /*# 0 -i 0 0 */
1567  /*# 0 0 -i 0 */
1568  /*# 0 0 0 i */
1569  /*# From diagonal part */
1570 
1571  RComplexREG<WordREG<REALT> > lctmp12;
1572  RScalarREG<WordREG<REALT> > lr_zero12;
1573  RScalarREG<WordREG<REALT> > lrtmp12;
1574 
1575  lr_zero12 = 0;
1576 
1577  for(int i12 = 0; i12 < Nc; ++i12) {
1578 
1579  lrtmp12 = tri_dia_r.elem(0).elem(i12);
1580  lrtmp12 -= tri_dia_r.elem(0).elem(i12+Nc);
1581  lrtmp12 -= tri_dia_r.elem(1).elem(i12);
1582  lrtmp12 += tri_dia_r.elem(1).elem(i12+Nc);
1583  B_j.elem().elem(i12,i12) = cmplx(lr_zero12,lrtmp12);
1584  }
1585 
1586  /*# From lower triangular portion */
1587  int elem_ij12 = 0;
1588  for(int i12 = 1; i12 < Nc; ++i12) {
1589 
1590  int elem_ijb12 = (i12+Nc)*(i12+Nc-1)/2 + Nc;
1591 
1592  for(int j12 = 0; j12 < i12; ++j12) {
1593 
1594  lctmp12 = tri_off_r.elem(0).elem(elem_ij12);
1595  lctmp12 -= tri_off_r.elem(0).elem(elem_ijb12);
1596  lctmp12 -= tri_off_r.elem(1).elem(elem_ij12);
1597  lctmp12 += tri_off_r.elem(1).elem(elem_ijb12);
1598 
1599  B_j.elem().elem(i12,j12) = timesI(lctmp12);
1600  B_j.elem().elem(j12,i12) = timesI(adj(lctmp12));
1601 
1602  elem_ij12++;
1603  elem_ijb12++;
1604  }
1605  }
1606  llvm_exit( );
1607  }
1608 
1609  llvm_set_insert_point( case_default );
1610 
1611  return jit_function_epilogue_get_cuf("jit_triacntr.ptx" , __PRETTY_FUNCTION__ );
1612 
1613  }
1614 
1615 
1616 
1617 
1618  template<typename T, typename U>
1619  void NVVMCloverTermT<T,U>::triacntr(U& B, int mat, int cb) const
1620  {
1621  START_CODE();
1622 
1623  B = zero;
1624 
1625  if ( mat < 0 || mat > 15 )
1626  {
1627  QDPIO::cerr << __func__ << ": Gamma out of range: mat = " << mat << std::endl;
1628  QDP_abort(1);
1629  }
1630 
1631  //QDPIO::cout << "PTX Clover triacntr " << (void*)this << "\n";
1632  //std::cout << "PTX Clover triacntr " << (void*)this << "\n";
1633  static CUfunction function;
1634 
1635  if (function == NULL)
1636  function = function_triacntr_build<U>( B, tri_dia, tri_off, mat, rb[cb] );
1637 
1638  // Execute the function
1639  function_triacntr_exec(function, B, tri_dia, tri_off, mat, rb[cb] );
1640 
1641  END_CODE();
1642  }
1643 
1644  //! Returns the appropriate clover coefficient for indices mu and nu
1645  template<typename T, typename U>
1646  Real
1648  {
1649  START_CODE();
1650 
1651  if( param.anisoParam.anisoP ) {
1652  if (mu==param.anisoParam.t_dir || nu == param.anisoParam.t_dir) {
1653  return param.clovCoeffT;
1654  }
1655  else {
1656  // Otherwise return the spatial coeff
1657  return param.clovCoeffR;
1658  }
1659  }
1660  else {
1661  // If there is no anisotropy just return the spatial one, it will
1662  // be the same as the temporal one
1663  return param.clovCoeffR;
1664  }
1665 
1666  END_CODE();
1667  }
1668 
1669 
1670 
1671  template<typename T,typename X,typename Y>
1672  void function_apply_clov_exec(CUfunction function,
1673  T& chi,
1674  const T& psi,
1675  const X& tri_dia,
1676  const Y& tri_off,
1677  const Subset& s)
1678  {
1679  if (!s.hasOrderedRep())
1680  QDP_error_exit("clover on subset with unordered representation not implemented");
1681 
1682  //std::cout << __PRETTY_FUNCTION__ << ": entering\n";
1683 
1684  AddressLeaf addr_leaf(s);
1685 
1686  forEach(chi, addr_leaf, NullCombine());
1687  forEach(psi, addr_leaf, NullCombine());
1688  forEach(tri_dia, addr_leaf, NullCombine());
1689  forEach(tri_off, addr_leaf, NullCombine());
1690 
1691  // lo <= idx < hi
1692  int lo = s.start();
1693  int hi = s.end();
1694 
1695 #ifndef QDP_JIT_NVVM_USE_LEGACY_LAUNCH
1696  JitParam jit_lo( QDP_get_global_cache().addJitParamInt( lo ) );
1697  JitParam jit_hi( QDP_get_global_cache().addJitParamInt( hi ) );
1698  std::vector<QDPCache::ArgKey> ids;
1699  ids.push_back( jit_lo.get_id() );
1700  ids.push_back( jit_hi.get_id() );
1701  for(unsigned i=0; i < addr_leaf.ids.size(); ++i)
1702  ids.push_back( addr_leaf.ids[i] );
1703  jit_launch(function,s.numSiteTable(),ids);
1704 #else
1705  std::vector<void*> addr;
1706  addr.push_back( &lo );
1707  addr.push_back( &hi );
1708  for(unsigned i=0; i < addr_leaf.addr.size(); ++i) {
1709  addr.push_back( &addr_leaf.addr[i] );
1710  }
1711  jit_launch(function,s.numSiteTable(),addr);
1712 #endif
1713  }
1714 
1715 
1716 
1717 
1718  template<typename T,typename X,typename Y>
1719  CUfunction function_apply_clov_build(const T& chi,
1720  const T& psi,
1721  const X& tri_dia,
1722  const Y& tri_off,
1723  const Subset& s)
1724  {
1725  if (ptx_db::db_enabled) {
1726  CUfunction func = llvm_ptx_db( __PRETTY_FUNCTION__ );
1727  if (func)
1728  return func;
1729  }
1730 
1731  //std::cout << __PRETTY_FUNCTION__ << ": entering\n";
1732  //typedef typename WordType<RealT>::Type_t REALT;
1733 
1734  //CUfunction func;
1735 
1736  llvm_start_new_function();
1737 
1738  //std::vector<ParamRef> params = jit_function_preamble_param();
1739 
1740  ParamRef p_lo = llvm_add_param<int>();
1741  ParamRef p_hi = llvm_add_param<int>();
1742 
1743 
1744 
1745 
1746  ParamLeaf param_leaf;
1747 
1748  typedef typename LeafFunctor<T, ParamLeaf>::Type_t TJIT;
1749  TJIT chi_jit(forEach(chi, param_leaf, TreeCombine()));
1750  TJIT psi_jit(forEach(psi, param_leaf, TreeCombine()));
1751  typename REGType< typename TJIT::Subtype_t >::Type_t psi_r;
1752  typename REGType< typename TJIT::Subtype_t >::Type_t chi_r;
1753 
1754  typedef typename LeafFunctor<X, ParamLeaf>::Type_t XJIT;
1755  XJIT tri_dia_jit(forEach(tri_dia, param_leaf, TreeCombine()));
1756  typename REGType< typename XJIT::Subtype_t >::Type_t tri_dia_r;
1757 
1758  typedef typename LeafFunctor<Y, ParamLeaf>::Type_t YJIT;
1759  YJIT tri_off_jit(forEach(tri_off, param_leaf, TreeCombine()));
1760  typename REGType< typename YJIT::Subtype_t >::Type_t tri_off_r;
1761 
1762  //llvm::Value * r_idx = jit_function_preamble_get_idx( params );
1763 
1764  llvm::Value * r_lo = llvm_derefParam( p_lo );
1765  llvm::Value * r_hi = llvm_derefParam( p_hi );
1766  llvm::Value * r_idx_thread = llvm_thread_idx();
1767 
1768  llvm_cond_exit( llvm_gt( r_idx_thread , llvm_sub( r_hi , r_lo ) ) );
1769 
1770  llvm::Value * r_idx = llvm_add( r_lo , r_idx_thread );
1771 
1772  auto& chi_j = chi_jit.elem(JitDeviceLayout::Coalesced,r_idx);
1773  psi_r.setup( psi_jit.elem(JitDeviceLayout::Coalesced,r_idx) );
1774  tri_dia_r.setup( tri_dia_jit.elem(JitDeviceLayout::Coalesced,r_idx) );
1775  tri_off_r.setup( tri_off_jit.elem(JitDeviceLayout::Coalesced,r_idx) );
1776 
1777  // RComplex<REALT>* cchi = (RComplex<REALT>*)&(chi.elem(site).elem(0).elem(0));
1778  // const RComplex<REALT>* ppsi = (const RComplex<REALT>*)&(psi.elem(site).elem(0).elem(0));
1779 
1780  int n = 2*Nc;
1781 
1782  for(int i = 0; i < n; ++i)
1783  {
1784  chi_r.elem((0*n+i)/3).elem((0*n+i)%3) = tri_dia_r.elem(0).elem(i) * psi_r.elem((0*n+i)/3).elem((0*n+i)%3);
1785  // cchi[0*n+i] = tri[site].diag[0][i] * ppsi[0*n+i];
1786 
1787  chi_r.elem((1*n+i)/3).elem((1*n+i)%3) = tri_dia_r.elem(1).elem(i) * psi_r.elem((1*n+i)/3).elem((1*n+i)%3);
1788  // cchi[1*n+i] = tri[site].diag[1][i] * ppsi[1*n+i];
1789  }
1790 
1791  int kij = 0;
1792  for(int i = 0; i < n; ++i)
1793  {
1794  for(int j = 0; j < i; j++)
1795  {
1796  chi_r.elem((0*n+i)/3).elem((0*n+i)%3) += tri_off_r.elem(0).elem(kij) * psi_r.elem((0*n+j)/3).elem((0*n+j)%3);
1797  // cchi[0*n+i] += tri[site].offd[0][kij] * ppsi[0*n+j];
1798 
1799  chi_r.elem((0*n+j)/3).elem((0*n+j)%3) += conj(tri_off_r.elem(0).elem(kij)) * psi_r.elem((0*n+i)/3).elem((0*n+i)%3);
1800  // cchi[0*n+j] += conj(tri[site].offd[0][kij]) * ppsi[0*n+i];
1801 
1802  chi_r.elem((1*n+i)/3).elem((1*n+i)%3) += tri_off_r.elem(1).elem(kij) * psi_r.elem((1*n+j)/3).elem((1*n+j)%3);
1803  // cchi[1*n+i] += tri[site].offd[1][kij] * ppsi[1*n+j];
1804 
1805  chi_r.elem((1*n+j)/3).elem((1*n+j)%3) += conj(tri_off_r.elem(1).elem(kij)) * psi_r.elem((1*n+i)/3).elem((1*n+i)%3);
1806  // cchi[1*n+j] += conj(tri[site].offd[1][kij]) * ppsi[1*n+i];
1807 
1808  kij++;
1809  }
1810  }
1811 
1812  chi_j = chi_r;
1813 
1814  return jit_function_epilogue_get_cuf("jit_apply_clov.ptx" , __PRETTY_FUNCTION__ );
1815  }
1816 
1817 
1818 
1819 
1820 
1821 
1822 
1823  /**
1824  * Apply a dslash
1825  *
1826  * Performs the operation
1827  *
1828  * chi <- (L + D + L^dag) . psi
1829  *
1830  * where
1831  * L is a lower triangular matrix
1832  * D is the real diagonal. (stored together in type TRIANG)
1833  *
1834  * Arguments:
1835  * \param chi result (Write)
1836  * \param psi source (Read)
1837  * \param isign D'^dag or D' ( MINUS | PLUS ) resp. (Read)
1838  * \param cb Checkerboard of OUTPUT std::vector (Read)
1839  */
1840  template<typename T, typename U>
1842  enum PlusMinus isign, int cb) const
1843  {
1844  START_CODE();
1845 
1846  if ( Ns != 4 ) {
1847  QDPIO::cerr << __func__ << ": CloverTerm::apply requires Ns==4" << std::endl;
1848  QDP_abort(1);
1849  }
1850 
1851  //QDPIO::cout << "PTX Clover apply" << (void*)this << "\n";
1852  //std::cout << "PTX Clover apply" << (void*)this << "\n";
1853  static CUfunction function;
1854 
1855  if (function == NULL)
1856  function = function_apply_clov_build(chi, psi, tri_dia, tri_off, rb[cb] );
1857 
1858  // Execute the function
1859  function_apply_clov_exec(function, chi, psi, tri_dia, tri_off, rb[cb] );
1860 
1861  (*this).getFermBC().modifyF(chi, QDP::rb[cb]);
1862 
1863  END_CODE();
1864  }
1865 
1866 
1867 
1868 
1869 
1870  namespace QDPCloverEnv {
1871  template<typename R,typename TD,typename TO>
1872  struct QUDAPackArgs {
1873  int cb;
1874  multi1d<QUDAPackedClovSite<R> >& quda_array;
1875  const TD& tri_dia;
1876  const TO& tri_off;
1877  };
1878 
1879  template<typename R,typename TD,typename TO>
1880  void qudaPackSiteLoop(int lo, int hi, int myId, QUDAPackArgs<R,TD,TO>* a) {
1881  int cb = a->cb;
1882  int Ns2 = Ns/2;
1883 
1884  multi1d<QUDAPackedClovSite<R> >& quda_array = a->quda_array;
1885 
1886  const TD& tri_dia = a->tri_dia;
1887  const TO& tri_off = a->tri_off;
1888 
1889  const int idtab[15]={0,1,3,6,10,2,4,7,11,5,8,12,9,13,14};
1890 
1891  for(int ssite=lo; ssite < hi; ++ssite) {
1892  int site = rb[cb].siteTable()[ssite];
1893  // First Chiral Block
1894  for(int i=0; i < 6; i++) {
1895  quda_array[site].diag1[i] = tri_dia.elem(site).comp[0].diag[i].elem().elem();
1896  }
1897 
1898  int target_index=0;
1899 
1900  for(int col=0; col < Nc*Ns2-1; col++) {
1901  for(int row=col+1; row < Nc*Ns2; row++) {
1902 
1903  int source_index = row*(row-1)/2 + col;
1904 
1905  quda_array[site].offDiag1[target_index][0] = tri_off.elem(site).comp[0].offd[source_index].real().elem();
1906  quda_array[site].offDiag1[target_index][1] = tri_off.elem(site).comp[0].offd[source_index].imag().elem();
1907  target_index++;
1908  }
1909  }
1910  // Second Chiral Block
1911  for(int i=0; i < 6; i++) {
1912  quda_array[site].diag2[i] = tri_dia.elem(site).comp[1].diag[i].elem().elem();
1913  }
1914 
1915  target_index=0;
1916  for(int col=0; col < Nc*Ns2-1; col++) {
1917  for(int row=col+1; row < Nc*Ns2; row++) {
1918 
1919  int source_index = row*(row-1)/2 + col;
1920 
1921  quda_array[site].offDiag2[target_index][0] = tri_off.elem(site).comp[1].offd[source_index].real().elem();
1922  quda_array[site].offDiag2[target_index][1] = tri_off.elem(site).comp[1].offd[source_index].imag().elem();
1923  target_index++;
1924  }
1925  }
1926  }
1927  QDPIO::cout << "\n";
1928  }
1929  }
1930 
1931  template<typename T, typename U>
1932  void NVVMCloverTermT<T,U>::packForQUDA(multi1d<QUDAPackedClovSite<typename WordType<T>::Type_t> >& quda_array, int cb) const
1933  {
1934  typedef typename WordType<T>::Type_t REALT;
1935  int num_sites = rb[cb].siteTable().size();
1936 
1937  typedef OLattice<PComp<PTriDia<RScalar <Word<REALT> > > > > TD;
1938  typedef OLattice<PComp<PTriOff<RComplex<Word<REALT> > > > > TO;
1939 
1940  StopWatch watch;
1941  watch.start();
1942 
1943  QDPCloverEnv::QUDAPackArgs<REALT,TD,TO> args = { cb, quda_array , tri_dia , tri_off };
1944  dispatch_to_threads(num_sites, args, QDPCloverEnv::qudaPackSiteLoop<REALT,TD,TO>);
1945 
1946  watch.stop();
1947  PackForQUDATimer::Instance().get() += watch.getTimeInMicroseconds();
1948  }
1949 
1950 
1951 
1952  template<typename T, typename U>
1954  enum PlusMinus isign, int site) const
1955  {
1956  QDP_error_exit("NVVMCloverTermT<T,U>::applySite(T& chi, const T& psi,..) not implemented ");
1957 #if 0
1958 #endif
1959  }
1960 
1961 
1965 } // End Namespace Chroma
1966 
1967 
1968 #undef QDP_JIT_NVVM_USE_LEGACY_LAUNCH
1969 
1970 #endif
Base class for all fermion action boundary conditions.
Definition: fermbc.h:20
Support class for fermion actions and linear operators.
Definition: state.h:94
Class for counted reference semantics.
Definition: handle.h:33
void create(Handle< FermState< T, multi1d< U >, multi1d< U > > > fs, const CloverFermActParams &param_)
Creation routine.
Double cholesDet(int cb) const
Computes the inverse of the term on cb using Cholesky.
OLattice< PComp< PTriDia< RScalar< Word< REALT > > > > > tri_dia
~NVVMCloverTermT()
No real need for cleanup here.
Real getCloverCoeff(int mu, int nu) const
Calculates Tr_D ( Gamma_mat L )
const multi1d< U > & getU() const
Get the u field.
Handle< FermBC< T, multi1d< U >, multi1d< U > > > fbc
void triacntr(U &B, int mat, int cb) const
Calculates Tr_D ( Gamma_mat L )
WordType< T >::Type_t REALT
void choles(int cb)
Computes the inverse of the term on cb using Cholesky.
OLattice< PComp< PTriOff< RComplex< Word< REALT > > > > > tri_off
OLattice< PScalar< PScalar< RScalar< Word< REALT > > > > > LatticeREAL
void applySite(T &chi, const T &psi, enum PlusMinus isign, int site) const
void apply(T &chi, const T &psi, enum PlusMinus isign, int cb) const
void packForQUDA(multi1d< QUDAPackedClovSite< REALT > > &quda_pack, int cb) const
PACK UP the Clover term for QUDA library:
NVVMCloverTermT()
Empty constructor. Must use create later.
void makeClov(const multi1d< U > &f, const RealT &diag_mass)
Create the clover term on cb.
const FermBC< T, multi1d< U >, multi1d< U > > & getFermBC() const
Return the fermion BC object for this linear operator.
CloverFermActParams param
OScalar< PScalar< PScalar< RScalar< Word< REALT > > > > > RealT
void ldagdlinv(LatticeREAL &tr_log_diag, int cb)
Invert the clover term on cb.
const double & get() const
static PackForQUDATimer & Instance()
Parameters for Clover fermion action.
Clover term linear operator.
int mu
Definition: cool.cc:24
int nu
Definition: cool.cc:25
void block(LatticeColorMatrix &u_block, const multi1d< LatticeColorMatrix > &u, int mu, int bl_level, const Real &BlkAccu, int BlkMax, int j_decay)
Construct block links.
Definition: block.cc:42
void function_triacntr_exec(const JitFunction &function, U &B, const X &tri_dia, const Y &tri_off, int mat, const Subset &s)
TRIACNTR.
unsigned elem_ij
Definition: ldumul_w.cc:38
unsigned s
Definition: ldumul_w.cc:37
unsigned j
Definition: ldumul_w.cc:35
unsigned elem_ji
Definition: ldumul_w.cc:39
unsigned i
Definition: ldumul_w.cc:34
unsigned n
Definition: ldumul_w.cc:36
Calculates the antihermitian field strength tensor iF(mu,nu)
Nd
Definition: meslate.cc:74
void qudaPackSiteLoop(int lo, int hi, int myId, QUDAPackArgs< R, TD, TO > *a)
Asqtad Staggered-Dirac operator.
Definition: klein_gord.cc:10
QDP_error_exit("too many BiCG iterations", n_count, rsd_sq, cp, c, re_rvr, im_rvr, re_a, im_a, re_b, im_b)
void function_make_clov_exec(const JitFunction &function, const RealT &diag_mass, const U &f0, const U &f1, const U &f2, const U &f3, const U &f4, const U &f5, X &tri_dia, Y &tri_off)
static multi1d< LatticeColorMatrix > u
void function_ldagdlinv_exec(const JitFunction &function, T &tr_log_diag, X &tri_dia, Y &tri_off, const Subset &s)
void function_apply_clov_exec(const JitFunction &function, T &chi, const T &psi, const X &tri_dia, const Y &tri_off, const Subset &s)
Double one
Definition: invbicg.cc:105
NVVMCloverTermT< LatticeFermionF, LatticeColorMatrixF > NVVMCloverTermF
LinOpSysSolverMGProtoClover::T T
int i
Definition: pbg5p_w.cc:55
void function_triacntr_build(JitFunction &func, const U &B, const X &tri_dia, const Y &tri_off, int mat, const Subset &s)
NVVMCloverTermT< LatticeFermion, LatticeColorMatrix > NVVMCloverTerm
void function_ldagdlinv_build(JitFunction &func, const T &tr_log_diag, const X &tri_dia, const Y &tri_off, const Subset &s)
multi1d< LatticeFermion > chi(Ncb)
void mesField(multi1d< LatticeColorMatrixF > &f, const multi1d< LatticeColorMatrixF > &u)
Calculates the antihermitian field strength tensor iF(mu,nu)
Definition: mesfield.cc:80
Complex a
Definition: invbicg.cc:95
LatticeFermion psi
Definition: mespbg5p_w.cc:35
NVVMCloverTermT< LatticeFermionD, LatticeColorMatrixD > NVVMCloverTermD
void function_make_clov_build(JitFunction &func, const RealT &diag_mass, const U &f0, const U &f1, const U &f2, const U &f3, const U &f4, const U &f5, const X &tri_dia, const Y &tri_off)
START_CODE()
void function_apply_clov_build(JitFunction &func, const T &chi, const T &psi, const X &tri_dia, const Y &tri_off, const Subset &s)
int cb
Definition: invbicg.cc:120
Double zero
Definition: invbicg.cc:106
int k
Definition: invbicg.cc:119
multi1d< LatticeFermion > s(Ncb)
const T1 const T2 const T3 & f3
Definition: gtest.h:1327
const T1 const T2 & f2
Definition: gtest.h:1321
const T1 & f1
Definition: gtest.h:1316
const T1 const T2 const T3 const T4 const T5 & f5
Definition: gtest.h:1339
const T1 const T2 const T3 const T4 & f4
Definition: gtest.h:1333
FloatingPoint< double > Double
Definition: gtest.h:7351
int l
Definition: pade_trln_w.cc:111
Double sum
Definition: qtopcor.cc:37
Support class for fermion actions and linear operators.
Params for clover ferm acts.
multi1d< QUDAPackedClovSite< R > > & quda_array
PCompJIT< typename JITType< T >::Type_t > Type_t
PCompJIT< typename JITType< T >::Type_t > Type_t
PTriDiaJIT< typename JITType< T >::Type_t > Type_t
PTriDiaJIT< typename JITType< T >::Type_t > Type_t
PTriOffJIT< typename JITType< T >::Type_t > Type_t
PTriOffJIT< typename JITType< T >::Type_t > Type_t
PCompJIT & operator=(const PCompREG< T1 > &rhs)
const T & elem(int i) const
void setup(const PCompJIT< typename JITType< T >::Type_t > &rhs)
const T & elem(int i) const
const T & elem(int i) const
PTriDiaJIT & operator=(const PTriDiaREG< T1 > &rhs)
const T & elem(int i) const
void setup(const PTriDiaJIT< typename JITType< T >::Type_t > &rhs)
PTriOffJIT & operator=(const PTriOffREG< T1 > &rhs)
const T & elem(int i) const
T F[2 *Nc *Nc-Nc]
const T & elem(int i) const
void setup(const PTriOffJIT< typename JITType< T >::Type_t > &rhs)
T offd[2 *Nc *Nc-Nc]
PCompREG< typename REGType< T >::Type_t > Type_t
PTriDiaREG< typename REGType< T >::Type_t > Type_t
PTriOffREG< typename REGType< T >::Type_t > Type_t
WordType< T >::Type_t Type_t
multi1d< LatticeColorMatrix > U
LatticeFermion T
Definition: t_clover.cc:11
func
Definition: t_preccfz.cc:17
LatticeFermionD TD
Definition: t_quda_tprec.cc:22