CHROMA
clover_term_ptx_w.h
Go to the documentation of this file.
1 // -*- C++ -*-
2 /*! \file
3  * \brief Clover term linear operator
4  */
5 
6 #ifndef __clover_term_ptx_w_h__
7 #define __clover_term_ptx_w_h__
8 
9 #warning "Using QPD-JIT/PTX clover"
10 
11 #include "state.h"
14 #include "meas/glue/mesfield.h"
15 
16 
17 namespace QDP
18 {
19  class PackForQUDATimer {
20  double acc_time;
22  public:
24  static PackForQUDATimer singleton;
25  return singleton;
26  }
27 
28  double& get() { return acc_time; }
29  const double& get() const { return acc_time; }
30  };
31 
32 
33  template<typename T>
34  struct PComp
35  {
36  typedef T Sub_t;
37  enum { ThisSize = 2 };
38  T comp[2];
39  };
40 
41  template<class T> struct PCompREG;
42 
43  template<typename T>
44  struct PCompJIT: public BaseJIT<T,2>
45  {
46  template<class T1>
48  //std::cout << __PRETTY_FUNCTION__ << "\n";
49  elem(0) = rhs.elem(0);
50  elem(1) = rhs.elem(1);
51  return *this;
52  }
53 
54 
55  inline T& elem(int i) { return this->arrayF(i); }
56  inline const T& elem(int i) const { return this->arrayF(i); }
57  };
58 
59  template<class T>
60  struct PCompREG
61  {
62  T F[2];
63  void setup(const PCompJIT< typename JITType<T>::Type_t >& rhs ) {
64  F[0].setup( rhs.elem(0) );
65  F[1].setup( rhs.elem(1) );
66  }
67  inline T& elem(int i) { return F[i]; }
68  inline const T& elem(int i) const { return F[i]; }
69  };
70 
71  template<class T>
72  struct JITType<PComp<T> >
73  {
75  };
76 
77  template<class T>
78  struct JITType<PCompREG<T> >
79  {
81  };
82 
83  template<class T>
84  struct REGType<PCompJIT<T> >
85  {
87  };
88 
89  template<class T>
90  struct WordType<PComp<T> >
91  {
92  typedef typename WordType<T>::Type_t Type_t;
93  };
94 
95  template<class T>
96  struct WordType<PCompJIT<T> >
97  {
98  typedef typename WordType<T>::Type_t Type_t;
99  };
100 
101 
102 
103 
104 
105 
106  template<typename T>
107  struct PTriDia
108  {
109  typedef T Sub_t;
110  enum { ThisSize = 2*Nc };
111  T diag[2*Nc];
112  };
113 
114  template<class T> struct PTriDiaREG;
115 
116  template<typename T>
117  struct PTriDiaJIT: public BaseJIT<T,2*Nc>
118  {
119  template<class T1>
121  //std::cout << __PRETTY_FUNCTION__ << "\n";
122  for ( int i = 0 ; i < 2 * Nc ; i++ )
123  elem(i) = rhs.elem(i);
124  return *this;
125  }
126 
127  inline T& elem(int i) { return this->arrayF(i); }
128  inline const T& elem(int i) const { return this->arrayF(i); }
129  };
130 
131  template<class T>
132  struct PTriDiaREG
133  {
134  T F[2*Nc];
135  void setup(const PTriDiaJIT< typename JITType<T>::Type_t >& rhs ) {
136  for (int i=0;i<2*Nc;++i)
137  F[i].setup( rhs.elem(i) );
138  }
139  inline T& elem(int i) { return F[i]; }
140  inline const T& elem(int i) const { return F[i]; }
141  };
142 
143  template<class T>
144  struct JITType<PTriDia<T> >
145  {
147  };
148 
149  template<class T>
150  struct JITType<PTriDiaREG<T> >
151  {
153  };
154 
155  template<class T>
156  struct REGType<PTriDiaJIT<T> >
157  {
159  };
160 
161  template<class T>
162  struct WordType<PTriDia<T> >
163  {
164  typedef typename WordType<T>::Type_t Type_t;
165  };
166 
167  template<class T>
168  struct WordType<PTriDiaJIT<T> >
169  {
170  typedef typename WordType<T>::Type_t Type_t;
171  };
172 
173 
174 
175 
176 
177 
178  template<typename T>
179  struct PTriOff
180  {
181  typedef T Sub_t;
182  enum { ThisSize = 2*Nc*Nc-Nc };
183  T offd[2*Nc*Nc-Nc];
184  };
185 
186  template<class T> struct PTriOffREG;
187 
188  template<typename T>
189  struct PTriOffJIT: public BaseJIT<T,2*Nc*Nc-Nc>
190  {
191  template<class T1>
193  //std::cout << __PRETTY_FUNCTION__ << "\n";
194  for ( int i = 0 ; i < 2*Nc*Nc-Nc ; i++ )
195  elem(i) = rhs.elem(i);
196  return *this;
197  }
198 
199  inline T& elem(int i) { return this->arrayF(i); }
200  inline const T& elem(int i) const { return this->arrayF(i); }
201  };
202 
203  template<class T>
204  struct PTriOffREG
205  {
206  T F[2*Nc*Nc-Nc];
207  void setup(const PTriOffJIT< typename JITType<T>::Type_t >& rhs ) {
208  for (int i=0;i<2*Nc*Nc-Nc;++i)
209  F[i].setup( rhs.elem(i) );
210  }
211  inline T& elem(int i) { return F[i]; }
212  inline const T& elem(int i) const { return F[i]; }
213  };
214 
215  template<class T>
216  struct JITType<PTriOff<T> >
217  {
219  };
220 
221  template<class T>
222  struct JITType<PTriOffREG<T> >
223  {
225  };
226 
227  template<class T>
228  struct REGType<PTriOffJIT<T> >
229  {
231  };
232 
233  template<class T>
234  struct WordType<PTriOff<T> >
235  {
236  typedef typename WordType<T>::Type_t Type_t;
237  };
238 
239  template<class T>
240  struct WordType<PTriOffJIT<T> >
241  {
242  typedef typename WordType<T>::Type_t Type_t;
243  };
244 
245 
246 #if defined(QDP_USE_PROFILING)
247 template<class T>
248 struct LeafFunctor<PComp<T>, PrintTag>
249 {
250  typedef int Type_t;
251  static int apply(const PComp<T> &s, const PrintTag &f)
252  {
253  //LeafFunctor<T,PrintTag>::apply(s.elem(),f);
254  return 0;
255  }
256 };
257 template<class T>
258 struct LeafFunctor<PTriDia<T>, PrintTag>
259 {
260  typedef int Type_t;
261  static int apply(const PTriDia<T> &s, const PrintTag &f)
262  {
263  //LeafFunctor<T,PrintTag>::apply(s.elem(),f);
264  return 0;
265  }
266 };
267 template<class T>
268 struct LeafFunctor<PTriOff<T>, PrintTag>
269 {
270  typedef int Type_t;
271  static int apply(const PTriOff<T> &s, const PrintTag &f)
272  {
273  //LeafFunctor<T,PrintTag>::apply(s.elem(),f);
274  return 0;
275  }
276 };
277 #endif
278 
279 
280 
281 }
282 
283 
284 
285 namespace Chroma
286 {
287  template<typename R>
288  struct QUDAPackedClovSite {
289  R diag1[6];
290  R offDiag1[15][2];
291  R diag2[6];
292  R offDiag2[15][2];
293  };
294 
295 
296  template<typename T, typename U>
297  class PTXCloverTermT : public CloverTermBase<T, U>
298  {
299  public:
300  // Typedefs to save typing
301  typedef typename WordType<T>::Type_t REALT;
302 
303  typedef OLattice< PScalar< PScalar< RScalar< Word< REALT> > > > > LatticeREAL;
304  typedef OScalar< PScalar< PScalar< RScalar< Word< REALT> > > > > RealT;
305 
306  //! Empty constructor. Must use create later
307  PTXCloverTermT();
308 
309  //! No real need for cleanup here
311 
312  //! Creation routine
313  void create(Handle< FermState<T, multi1d<U>, multi1d<U> > > fs,
314  const CloverFermActParams& param_);
315 
316  virtual void create(Handle< FermState<T, multi1d<U>, multi1d<U> > > fs,
317  const CloverFermActParams& param_,
318  const PTXCloverTermT<T,U>& from_);
319 
320  //! Computes the inverse of the term on cb using Cholesky
321  /*!
322  * \param cb checkerboard of work (Read)
323  */
324  void choles(int cb);
325 
326  //! Computes the inverse of the term on cb using Cholesky
327  /*!
328  * \param cb checkerboard of work (Read)
329  * \return logarithm of the determinant
330  */
331  Double cholesDet(int cb) const ;
332 
333  /**
334  * Apply a dslash
335  *
336  * Performs the operation
337  *
338  * chi <- (L + D + L^dag) . psi
339  *
340  * where
341  * L is a lower triangular matrix
342  * D is the real diagonal. (stored together in type TRIANG)
343  *
344  * Arguments:
345  * \param chi result (Write)
346  * \param psi source (Read)
347  * \param isign D'^dag or D' ( MINUS | PLUS ) resp. (Read)
348  * \param cb Checkerboard of OUTPUT std::vector (Read)
349  */
350  void apply (T& chi, const T& psi, enum PlusMinus isign, int cb) const;
351 
352 
353  void applySite(T& chi, const T& psi, enum PlusMinus isign, int site) const;
354 
355  //! Calculates Tr_D ( Gamma_mat L )
356  void triacntr(U& B, int mat, int cb) const;
357 
358  //! Return the fermion BC object for this linear operator
359  const FermBC<T, multi1d<U>, multi1d<U> >& getFermBC() const {return *fbc;}
360 
361  //! PACK UP the Clover term for QUDA library:
362  void packForQUDA(multi1d<QUDAPackedClovSite<REALT> >& quda_pack, int cb) const;
363 
364  int getDiaId() const { return tri_dia.getId(); }
365  int getOffId() const { return tri_off.getId(); }
366 
367 
368  protected:
369  //! Create the clover term on cb
370  /*!
371  * \param f field strength tensor F(mu,nu) (Read)
372  * \param cb checkerboard (Read)
373  */
374  void makeClov(const multi1d<U>& f, const RealT& diag_mass);
375 
376  //! Invert the clover term on cb
377  //void chlclovms(LatticeREAL& log_diag, int cb);
378  void ldagdlinv(LatticeREAL& tr_log_diag, int cb);
379 
380  //! Get the u field
381  const multi1d<U>& getU() const {return u;}
382 
383  //! Calculates Tr_D ( Gamma_mat L )
384  Real getCloverCoeff(int mu, int nu) const;
385 
386 
387  private:
389  multi1d<U> u;
391  LatticeREAL tr_log_diag_; // Fill this out during create
392  // but save the global sum until needed.
393  multi1d<bool> choles_done; // Keep note of whether the decomposition has been done
394  // on a particular checkerboard.
395 
396  OLattice<PComp<PTriDia<RScalar <Word<REALT> > > > > tri_dia;
397  OLattice<PComp<PTriOff<RComplex<Word<REALT> > > > > tri_off;
398 
399  };
400 
401 
402  // Empty constructor. Must use create later
403  template<typename T, typename U>
405 
406  // Now copy
407  template<typename T, typename U>
408  void PTXCloverTermT<T,U>::create(Handle< FermState<T,multi1d<U>,multi1d<U> > > fs,
409  const CloverFermActParams& param_,
410  const PTXCloverTermT<T,U>& from)
411  {
412  START_CODE();
413 
414  //std::cout << "PTX Clover create from other " << (void*)this << "\n";
415 
416  u.resize(Nd);
417 
418  u = fs->getLinks();
419  fbc = fs->getFermBC();
420  param = param_;
421 
422  // Sanity check
423  if (fbc.operator->() == 0) {
424  QDPIO::cerr << "PTXCloverTerm: error: fbc is null" << std::endl;
425  QDP_abort(1);
426  }
427 
428  {
429  RealT ff = where(param.anisoParam.anisoP, Real(1) / param.anisoParam.xi_0, Real(1));
430  param.clovCoeffR *= Real(0.5) * ff;
431  param.clovCoeffT *= Real(0.5);
432  }
433 
434  //
435  // Yuk. Some bits of knowledge of the dslash term are buried in the
436  // effective mass term. They show up here. If I wanted some more
437  // complicated dslash then this will have to be fixed/adjusted.
438  //
439  RealT diag_mass;
440  {
441  RealT ff = where(param.anisoParam.anisoP, param.anisoParam.nu / param.anisoParam.xi_0, Real(1));
442  diag_mass = 1 + (Nd-1)*ff + param.Mass;
443  }
444 
445 
446  /* Calculate F(mu,nu) */
447  //multi1d<LatticeColorMatrix> f;
448  //mesField(f, u);
449  //makeClov(f, diag_mass);
450 
451  choles_done.resize(rb.numSubsets());
452  for(int i=0; i < rb.numSubsets(); i++) {
453  choles_done[i] = from.choles_done[i];
454  }
455 
456  tr_log_diag_ = from.tr_log_diag_;
457 
458  tri_dia = from.tri_dia;
459  tri_off = from.tri_off;
460 
461  END_CODE();
462  }
463 
464 
465  //! Creation routine
466  template<typename T, typename U>
467  void PTXCloverTermT<T,U>::create(Handle< FermState<T,multi1d<U>,multi1d<U> > > fs,
468  const CloverFermActParams& param_)
469  {
470  START_CODE();
471 
472  //std::cout << "PTX Clover create " << (void*)this << "\n";
473 
474  u.resize(Nd);
475 
476  u = fs->getLinks();
477  fbc = fs->getFermBC();
478  param = param_;
479 
480  // Sanity check
481  if (fbc.operator->() == 0) {
482  QDPIO::cerr << "PTXCloverTerm: error: fbc is null" << std::endl;
483  QDP_abort(1);
484  }
485 
486  {
487  RealT ff = where(param.anisoParam.anisoP, Real(1) / param.anisoParam.xi_0, Real(1));
488  param.clovCoeffR *= RealT(0.5) * ff;
489  param.clovCoeffT *= RealT(0.5);
490  }
491 
492  //
493  // Yuk. Some bits of knowledge of the dslash term are buried in the
494  // effective mass term. They show up here. If I wanted some more
495  // complicated dslash then this will have to be fixed/adjusted.
496  //
497  RealT diag_mass;
498  {
499  RealT ff = where(param.anisoParam.anisoP, param.anisoParam.nu / param.anisoParam.xi_0, Real(1));
500  diag_mass = 1 + (Nd-1)*ff + param.Mass;
501  }
502 
503  /* Calculate F(mu,nu) */
504  multi1d<U> f;
505  mesField(f, u);
506  makeClov(f, diag_mass);
507 
508  choles_done.resize(rb.numSubsets());
509  for(int i=0; i < rb.numSubsets(); i++) {
510  choles_done[i] = false;
511  }
512 
513  END_CODE();
514  }
515 
516 
517  /*
518  * MAKCLOV
519  *
520  * In this routine, MAKCLOV calculates
521 
522  * 1 - (1/4)*sigma(mu,nu) F(mu,nu)
523 
524  * using F from mesfield
525 
526  * F(mu,nu) = (1/4) sum_p (1/2) [ U_p(x) - U^dag_p(x) ]
527 
528  * using basis of SPPROD and stores in a lower triangular matrix
529  * (no diagonal) plus real diagonal
530 
531  * where
532  * U_1 = u(x,mu)*u(x+mu,nu)*u_dag(x+nu,mu)*u_dag(x,nu)
533  * U_2 = u(x,nu)*u_dag(x-mu+nu,mu)*u_dag(x-mu,nu)*u(x-mu,mu)
534  * U_3 = u_dag(x-mu,mu)*u_dag(x-mu-nu,nu)*u(x-mu-nu,mu)*u(x-nu,nu)
535  * U_4 = u_dag(x-nu,nu)*u(x-nu,mu)*u(x-nu+mu,nu)*u_dag(x,mu)
536 
537  * and
538 
539  * | sigF(1) sigF(3) 0 0 |
540  * sigF = | sigF(5) -sigF(1) 0 0 |
541  * | 0 0 -sigF(0) -sigF(2) |
542  * | 0 0 -sigF(4) sigF(0) |
543  * where
544  * sigF(i) is a color matrix
545 
546  * sigF(0) = i*(ClovT*E_z + ClovR*B_z)
547  * = i*(ClovT*F(3,2) + ClovR*F(1,0))
548  * sigF(1) = i*(ClovT*E_z - ClovR*B_z)
549  * = i*(ClovT*F(3,2) - ClovR*F(1,0))
550  * sigF(2) = i*(E_+ + B_+)
551  * sigF(3) = i*(E_+ - B_+)
552  * sigF(4) = i*(E_- + B_-)
553  * sigF(5) = i*(E_- - B_-)
554  * i*E_+ = (i*ClovT*E_x - ClovT*E_y)
555  * = (i*ClovT*F(3,0) - ClovT*F(3,1))
556  * i*E_- = (i*ClovT*E_x + ClovT*E_y)
557  * = (i*ClovT*F(3,0) + ClovT*F(3,1))
558  * i*B_+ = (i*ClovR*B_x - ClovR*B_y)
559  * = (i*ClovR*F(2,1) + ClovR*F(2,0))
560  * i*B_- = (i*ClovR*B_x + ClovR*B_y)
561  * = (i*ClovR*F(2,1) - ClovR*F(2,0))
562 
563  * NOTE: I am using i*F of the usual F defined by UKQCD, Heatlie et.al.
564 
565  * NOTE: the above definitions assume that the time direction, t_dir,
566  * is 3. In general F(k,j) is multiplied with ClovT if either
567  * k=t_dir or j=t_dir, and with ClovR otherwise.
568 
569  *+++
570  * Here are some notes on the origin of this routine. NOTE, ClovCoeff or u0
571  * are not actually used in MAKCLOV.
572  *
573  * The clover mass term is suppose to act on a std::vector like
574  *
575  * chi = (1 - (ClovCoeff/u0^3) * kappa/4 * sum_mu sum_nu F(mu,nu)*sigma(mu,nu)) * psi
576 
577  * Definitions used here (NOTE: no "i")
578  * sigma(mu,nu) = gamma(mu)*gamma(nu) - gamma(nu)*gamma(mu)
579  * = 2*gamma(mu)*gamma(nu) for mu != nu
580  *
581  * chi = sum_mu sum_nu F(mu,nu)*gamma(mu)*gamma(nu)*psi for mu < nu
582  * = (1/2) * sum_mu sum_nu F(mu,nu)*gamma(mu)*gamma(nu)*psi for mu != nu
583  * = (1/4) * sum_mu sum_nu F(mu,nu)*sigma(mu,nu)*psi
584  *
585  *
586  * chi = (1 - (ClovCoeff/u0^3) * kappa/4 * sum_mu sum_nu F(mu,nu)*sigma(mu,nu)) * psi
587  * = psi - (ClovCoeff/u0^3) * kappa * chi
588  * == psi - kappa * chi
589  *
590  * We have absorbed ClovCoeff/u0^3 into kappa. A u0 was previously absorbed into kappa
591  * for compatibility to ancient conventions.
592  *---
593 
594  * Arguments:
595  * \param f field strength tensor F(cb,mu,nu) (Read)
596  * \param diag_mass effective mass term (Read)
597  */
598 
599  template<typename RealT,typename U,typename X,typename Y>
600  CUfunction function_make_clov_exec(CUfunction function,
601  const RealT& diag_mass,
602  const U& f0,
603  const U& f1,
604  const U& f2,
605  const U& f3,
606  const U& f4,
607  const U& f5,
608  X& tri_dia,
609  Y& tri_off)
610  {
611  AddressLeaf addr_leaf;
612 
613  int junk_0 = forEach(diag_mass, addr_leaf, NullCombine());
614  int junk_1 = forEach(f0, addr_leaf, NullCombine());
615  int junk_2 = forEach(f1, addr_leaf, NullCombine());
616  int junk_3 = forEach(f2, addr_leaf, NullCombine());
617  int junk_4 = forEach(f3, addr_leaf, NullCombine());
618  int junk_5 = forEach(f4, addr_leaf, NullCombine());
619  int junk_6 = forEach(f5, addr_leaf, NullCombine());
620  int junk_7 = forEach(tri_dia, addr_leaf, NullCombine());
621  int junk_8 = forEach(tri_off, addr_leaf, NullCombine());
622 
623  // lo <= idx < hi
624  int lo = 0;
625  int hi = Layout::sitesOnNode();
626 
627  std::vector<void*> addr;
628 
629  addr.push_back( &lo );
630  //std::cout << "addr lo = " << addr[0] << " lo=" << lo << "\n";
631 
632  addr.push_back( &hi );
633  //std::cout << "addr hi = " << addr[1] << " hi=" << hi << "\n";
634 
635  int addr_dest=addr.size();
636  for(int i=0; i < addr_leaf.addr.size(); ++i) {
637  addr.push_back( &addr_leaf.addr[i] );
638  //std::cout << "addr = " << addr_leaf.addr[i] << "\n";
639  }
640 
641  jit_launch(function,Layout::sitesOnNode(),addr);
642  }
643 
644 
645 
646  template<typename RealT,typename U,typename X,typename Y>
647  CUfunction function_make_clov_build(const RealT& diag_mass,
648  const U& f0,
649  const U& f1,
650  const U& f2,
651  const U& f3,
652  const U& f4,
653  const U& f5,
654  const X& tri_dia,
655  const Y& tri_off)
656  {
657  //std::cout << __PRETTY_FUNCTION__ << ": entering\n";
658 
659  typedef typename WordType<RealT>::Type_t REALT;
660 
661  CUfunction func;
662 
663  jit_start_new_function();
664 
665  jit_value r_lo = jit_add_param( jit_ptx_type::s32 );
666  jit_value r_hi = jit_add_param( jit_ptx_type::s32 );
667 
668  jit_value r_idx = jit_geom_get_linear_th_idx();
669 
670  jit_value r_out_of_range = jit_ins_ge( r_idx , r_hi );
671  jit_ins_exit( r_out_of_range );
672 
673  ParamLeaf param_leaf( r_idx );
674 
675  typedef typename LeafFunctor<RealT, ParamLeaf>::Type_t RealTJIT;
676  RealTJIT diag_mass_jit(forEach(diag_mass, param_leaf, TreeCombine()));
677 
678  typedef typename LeafFunctor<U, ParamLeaf>::Type_t UJIT;
679  UJIT f0_jit(forEach(f0, param_leaf, TreeCombine()));
680  UJIT f1_jit(forEach(f1, param_leaf, TreeCombine()));
681  UJIT f2_jit(forEach(f2, param_leaf, TreeCombine()));
682  UJIT f3_jit(forEach(f3, param_leaf, TreeCombine()));
683  UJIT f4_jit(forEach(f4, param_leaf, TreeCombine()));
684  UJIT f5_jit(forEach(f5, param_leaf, TreeCombine()));
685  auto& f0_j = f0_jit.elem(JitDeviceLayout::Coalesced);
686  auto& f1_j = f1_jit.elem(JitDeviceLayout::Coalesced);
687  auto& f2_j = f2_jit.elem(JitDeviceLayout::Coalesced);
688  auto& f3_j = f3_jit.elem(JitDeviceLayout::Coalesced);
689  auto& f4_j = f4_jit.elem(JitDeviceLayout::Coalesced);
690  auto& f5_j = f5_jit.elem(JitDeviceLayout::Coalesced);
691 
692  typedef typename LeafFunctor<X, ParamLeaf>::Type_t XJIT;
693  XJIT tri_dia_jit(forEach(tri_dia, param_leaf, TreeCombine()));
694  auto& tri_dia_j = tri_dia_jit.elem(JitDeviceLayout::Coalesced);
695 
696  typedef typename LeafFunctor<Y, ParamLeaf>::Type_t YJIT;
697  YJIT tri_off_jit(forEach(tri_off, param_leaf, TreeCombine()));
698  auto& tri_off_j = tri_off_jit.elem(JitDeviceLayout::Coalesced);
699 
700  typename REGType< typename RealTJIT::Subtype_t >::Type_t diag_mass_reg;
701  diag_mass_reg.setup( diag_mass_jit.elem( JitDeviceLayout::Scalar ) );
702 
703  for(int jj = 0; jj < 2; jj++) {
704  for(int ii = 0; ii < 2*Nc; ii++) {
705  tri_dia_j.elem(jj).elem(ii) = diag_mass_reg.elem().elem();
706  //tri[site].diag[jj][ii] = diag_mass.elem().elem().elem();
707  }
708  }
709 
710 
711  RComplexREG<WordREG<REALT> > E_minus;
712  RComplexREG<WordREG<REALT> > B_minus;
713  RComplexREG<WordREG<REALT> > ctmp_0;
714  RComplexREG<WordREG<REALT> > ctmp_1;
715  RScalarREG<WordREG<REALT> > rtmp_0;
716  RScalarREG<WordREG<REALT> > rtmp_1;
717 
718 
719  for(int i = 0; i < Nc; ++i) {
720  ctmp_0 = f5_j.elem().elem(i,i);
721  ctmp_0 -= f0_j.elem().elem(i,i);
722  rtmp_0 = imag(ctmp_0);
723  tri_dia_j.elem(0).elem(i) += rtmp_0;
724 
725  tri_dia_j.elem(0).elem(i+Nc) -= rtmp_0;
726 
727  ctmp_1 = f5_j.elem().elem(i,i);
728  ctmp_1 += f0_j.elem().elem(i,i);
729  rtmp_1 = imag(ctmp_1);
730  tri_dia_j.elem(1).elem(i) -= rtmp_1;
731 
732  tri_dia_j.elem(1).elem(i+Nc) += rtmp_1;
733  }
734 
735  for(int i = 1; i < Nc; ++i) {
736  for(int j = 0; j < i; ++j) {
737 
738  int elem_ij = i*(i-1)/2 + j;
739  int elem_tmp = (i+Nc)*(i+Nc-1)/2 + j+Nc;
740 
741  ctmp_0 = f0_j.elem().elem(i,j);
742  ctmp_0 -= f5_j.elem().elem(i,j);
743  tri_off_j.elem(0).elem(elem_ij) = timesI(ctmp_0);
744 
745  zero_rep( tri_off_j.elem(0).elem(elem_tmp) );
746  tri_off_j.elem(0).elem(elem_tmp) -= tri_off_j.elem(0).elem(elem_ij);// * -1.0;
747 
748  ctmp_1 = f5_j.elem().elem(i,j);
749  ctmp_1 += f0_j.elem().elem(i,j);
750  tri_off_j.elem(1).elem(elem_ij) = timesI(ctmp_1);
751 
752  zero_rep( tri_off_j.elem(1).elem(elem_tmp) );
753  tri_off_j.elem(1).elem(elem_tmp) -= tri_off_j.elem(1).elem(elem_ij);
754  }
755  }
756 
757  for(int i = 0; i < Nc; ++i) {
758  for(int j = 0; j < Nc; ++j) {
759 
760  int elem_ij = (i+Nc)*(i+Nc-1)/2 + j;
761 
762  //E_minus = timesI(f2_j.elem().elem(i,j));
763  E_minus = f2_j.elem().elem(i,j);
764  E_minus = timesI( E_minus );
765 
766  E_minus += f4_j.elem().elem(i,j);
767 
768  //B_minus = timesI(f3_j.elem().elem(i,j));
769  B_minus = f3_j.elem().elem(i,j);
770  B_minus = timesI( B_minus );
771 
772  B_minus -= f1_j.elem().elem(i,j);
773 
774  tri_off_j.elem(0).elem(elem_ij) = B_minus - E_minus;
775 
776  tri_off_j.elem(1).elem(elem_ij) = E_minus + B_minus;
777  }
778  }
779 
780  return jit_get_cufunction("ptx_make_clov.ptx");
781  }
782 
783 
784 
785 
786  /* This now just sets up and dispatches... */
787  template<typename T, typename U>
788  void PTXCloverTermT<T,U>::makeClov(const multi1d<U>& f, const RealT& diag_mass)
789  {
790  START_CODE();
791 
792  if ( Nd != 4 ){
793  QDPIO::cerr << __func__ << ": expecting Nd==4" << std::endl;
794  QDP_abort(1);
795  }
796 
797  if ( Ns != 4 ){
798  QDPIO::cerr << __func__ << ": expecting Ns==4" << std::endl;
799  QDP_abort(1);
800  }
801 
802  U f0 = f[0] * getCloverCoeff(0,1);
803  U f1 = f[1] * getCloverCoeff(0,2);
804  U f2 = f[2] * getCloverCoeff(0,3);
805  U f3 = f[3] * getCloverCoeff(1,2);
806  U f4 = f[4] * getCloverCoeff(1,3);
807  U f5 = f[5] * getCloverCoeff(2,3);
808 
809  //QDPIO::cout << "PTX Clover make " << (void*)this << "\n";
810  //std::cout << "PTX Clover make " << (void*)this << "\n";
811  static CUfunction function;
812 
813  if (function == NULL)
814  function = function_make_clov_build(diag_mass, f0,f1,f2,f3,f4,f5, tri_dia , tri_off );
815 
816  // Execute the function
817  function_make_clov_exec(function, diag_mass, f0,f1,f2,f3,f4,f5,tri_dia, tri_off);
818 
819  END_CODE();
820  }
821 
822 
823  //! Invert
824  /*!
825  * Computes the inverse of the term on cb using Cholesky
826  */
827  template<typename T, typename U>
829  {
830  START_CODE();
831 
832  // When you are doing the cholesky - also fill out the trace_log_diag piece)
833  // chlclovms(tr_log_diag_, cb);
834  // Switch to LDL^\dag inversion
835  ldagdlinv(tr_log_diag_,cb);
836 
837  END_CODE();
838  }
839 
840 
841  //! Invert
842  /*!
843  * Computes the inverse of the term on cb using Cholesky
844  *
845  * \return logarithm of the determinant
846  */
847  template<typename T, typename U>
849  {
850  START_CODE();
851 
852  if( choles_done[cb] == false )
853  {
854  QDPIO::cout << __func__ << ": Error: you have not done the Cholesky.on this operator on this subset" << std::endl;
855  QDPIO::cout << "You sure you should not be asking invclov?" << std::endl;
856  QDP_abort(1);
857  }
858 
859 
860  END_CODE();
861 
862  // Need to thread generic sums in QDP++?
863  // Need to thread generic norm2() in QDP++?
864  return sum(tr_log_diag_, rb[cb]);
865  }
866 
867 
868 
869  template<typename T,typename X,typename Y>
870  CUfunction function_ldagdlinv_exec( CUfunction function,
871  T& tr_log_diag,
872  X& tri_dia,
873  Y& tri_off,
874  const Subset& s)
875  {
876  if (!s.hasOrderedRep())
877  QDP_error_exit("ldagdlinv on subset with unordered representation not implemented");
878 
879  AddressLeaf addr_leaf;
880 
881  int junk_0 = forEach(tr_log_diag, addr_leaf, NullCombine());
882  int junk_2 = forEach(tri_dia, addr_leaf, NullCombine());
883  int junk_3 = forEach(tri_off, addr_leaf, NullCombine());
884 
885  // lo <= idx < hi
886  int lo = s.start();
887  int hi = s.end();
888 
889  std::vector<void*> addr;
890 
891  addr.push_back( &lo );
892  //std::cout << "addr lo = " << addr[0] << " lo=" << lo << "\n";
893 
894  addr.push_back( &hi );
895  //std::cout << "addr hi = " << addr[1] << " hi=" << hi << "\n";
896 
897  int addr_dest=addr.size();
898  for(int i=0; i < addr_leaf.addr.size(); ++i) {
899  addr.push_back( &addr_leaf.addr[i] );
900  //std::cout << "addr = " << addr_leaf.addr[i] << "\n";
901  }
902 
903  jit_launch(function,s.numSiteTable(),addr);
904  }
905 
906 
907 
908 
909 
910  template<typename U,typename T,typename X,typename Y>
911  CUfunction function_ldagdlinv_build( const T& tr_log_diag,
912  const X& tri_dia,
913  const Y& tri_off,
914  const Subset& s)
915  {
916  typedef typename WordType<U>::Type_t REALT;
917 
918  //std::cout << __PRETTY_FUNCTION__ << "\n";
919 
920  CUfunction func;
921 
922  jit_start_new_function();
923 
924  jit_value r_lo = jit_add_param( jit_ptx_type::s32 );
925  jit_value r_hi = jit_add_param( jit_ptx_type::s32 );
926 
927  jit_value r_idx_thread = jit_geom_get_linear_th_idx();
928 
929  jit_value r_out_of_range = jit_ins_gt( r_idx_thread , jit_ins_sub( r_hi , r_lo ) );
930  jit_ins_exit( r_out_of_range );
931 
932  jit_value r_idx = jit_ins_add( r_lo , r_idx_thread );
933 
934  ParamLeaf param_leaf( r_idx );
935 
936 
937  typedef typename LeafFunctor<T, ParamLeaf>::Type_t TJIT;
938  TJIT tr_log_diag_jit(forEach(tr_log_diag, param_leaf, TreeCombine()));
939  auto& tr_log_diag_j = tr_log_diag_jit.elem(JitDeviceLayout::Coalesced);
940 
941  typedef typename LeafFunctor<X, ParamLeaf>::Type_t XJIT;
942  XJIT tri_dia_jit(forEach(tri_dia, param_leaf, TreeCombine()));
943  auto& tri_dia_j = tri_dia_jit.elem(JitDeviceLayout::Coalesced);
944 
945  typename REGType< typename XJIT::Subtype_t >::Type_t tri_dia_r;
946  tri_dia_r.setup( tri_dia_j );
947  // tri_dia_r.setup( tri_dia_jit.elem(JitDeviceLayout::Coalesced) );
948 
949  typedef typename LeafFunctor<Y, ParamLeaf>::Type_t YJIT;
950  YJIT tri_off_jit(forEach(tri_off, param_leaf, TreeCombine()));
951  auto& tri_off_j = tri_off_jit.elem(JitDeviceLayout::Coalesced);
952 
953  typename REGType< typename YJIT::Subtype_t >::Type_t tri_off_r;
954  tri_off_r.setup( tri_off_j );
955  // tri_off_r.setup( tri_off_jit.elem(JitDeviceLayout::Coalesced) );
956 
957 
958  RScalarREG<WordREG<REALT> > zip;
959  zero_rep(zip);
960  int N = 2*Nc;
961 
962  int site_neg_logdet=0;
963 
964  for(int block=0; block < 2; block++) {
965 
966  RScalarREG<WordREG<REALT> > inv_d[6] ;
967  RComplexREG<WordREG<REALT> > inv_offd[15] ;
968  RComplexREG<WordREG<REALT> > v[6] ;
969  RScalarREG<WordREG<REALT> > diag_g[6] ;
970 
971  for(int i=0; i < N; i++) {
972  inv_d[i] = tri_dia_r.elem(block).elem(i);
973  }
974 
975  for(int i=0; i < 15; i++) {
976  inv_offd[i] = tri_off_r.elem(block).elem(i);
977  }
978 
979 
980  for(int j=0; j < N; ++j) {
981 
982  for(int i=0; i < j; i++) {
983  int elem_ji = j*(j-1)/2 + i;
984 
985  RComplexREG<WordREG<REALT> > A_ii = cmplx( inv_d[i], zip );
986  v[i] = A_ii*adj(inv_offd[elem_ji]);
987  }
988 
989 
990  v[j] = cmplx(inv_d[j],zip);
991 
992  for(int k=0; k < j; k++) {
993  int elem_jk = j*(j-1)/2 + k;
994  v[j] -= inv_offd[elem_jk]*v[k];
995  }
996 
997  inv_d[j] = real( v[j] );
998 
999  for(int k=j+1; k < N; k++) {
1000  int elem_kj = k*(k-1)/2 + j;
1001  for(int l=0; l < j; l++) {
1002  int elem_kl = k*(k-1)/2 + l;
1003  inv_offd[elem_kj] -= inv_offd[elem_kl] * v[l];
1004  }
1005  inv_offd[elem_kj] /= v[j];
1006  }
1007  }
1008 
1009 
1010  // Now fix up the inverse
1011  RScalarREG<WordREG<REALT> > one(1.0);
1012  //one.elem() = (REALT)1;
1013 
1014  for(int i=0; i < N; i++) {
1015  diag_g[i] = one/inv_d[i];
1016 
1017  // Compute the trace log
1018  // NB we are always doing trace log | A |
1019  // (because we are always working with actually A^\dagger A
1020  // even in one flavour case where we square root)
1021  tr_log_diag_j.elem().elem() += log(fabs(inv_d[i]));
1022  // However, it is worth counting just the no of negative logdets
1023  // on site
1024 #if 0
1025  if( inv_d[i].elem() < 0 ) {
1026  site_neg_logdet++;
1027  }
1028 #endif
1029  }
1030 
1031  // Now we need to invert the L D L^\dagger
1032  // We can do this by solving:
1033  //
1034  // L D L^\dagger M^{-1} = 1
1035  //
1036  // This can be done by solving L D X = 1 (X = L^\dagger M^{-1})
1037  //
1038  // Then solving L^\dagger M^{-1} = X
1039  //
1040  // LD is lower diagonal and so X will also be lower diagonal.
1041  // LD X = 1 can be solved by forward substitution.
1042  //
1043  // Likewise L^\dagger is strictly upper triagonal and so
1044  // L^\dagger M^{-1} = X can be solved by forward substitution.
1045  RComplexREG<WordREG<REALT> > sum;
1046  for(int k = 0; k < N; ++k) {
1047 
1048  for(int i = 0; i < k; ++i) {
1049  zero_rep(v[i]);
1050  }
1051 
1052  /*# Forward substitution */
1053 
1054  // The first element is the inverse of the diagonal
1055  v[k] = cmplx(diag_g[k],zip);
1056 
1057  for(int i = k+1; i < N; ++i) {
1058  zero_rep(v[i]);
1059 
1060  for(int j = k; j < i; ++j) {
1061  int elem_ij = i*(i-1)/2+j;
1062 
1063  // subtract l_ij*d_j*x_{kj}
1064  v[i] -= inv_offd[elem_ij] *inv_d[j]*v[j];
1065 
1066  }
1067 
1068  // scale out by 1/d_i
1069  v[i] *= diag_g[i];
1070  }
1071 
1072  /*# Backward substitution */
1073  // V[N-1] remains unchanged
1074  // Start from V[N-2]
1075 
1076  for(int i = N-2; (int)i >= (int)k; --i) {
1077  for(int j = i+1; j < N; ++j) {
1078  int elem_ji = j*(j-1)/2 + i;
1079  // Subtract terms of typ (l_ji)*x_kj
1080  v[i] -= adj(inv_offd[elem_ji]) * v[j];
1081  }
1082  }
1083 
1084  /*# Overwrite column k of invcl.offd */
1085  inv_d[k] = real(v[k]);
1086  for(int i = k+1; i < N; ++i) {
1087 
1088  int elem_ik = i*(i-1)/2+k;
1089  inv_offd[elem_ik] = v[i];
1090  }
1091  }
1092 
1093  // Overwrite original data
1094  for(int i=0; i < N; i++) {
1095  tri_dia_j.elem(block).elem(i) = inv_d[i];
1096  }
1097  for(int i=0; i < 15; i++) {
1098  tri_off_j.elem(block).elem(i) = inv_offd[i];
1099  }
1100  }
1101 
1102  return jit_get_cufunction("ptx_ldagdlinv.ptx");
1103  }
1104 
1105 
1106 
1107 
1108 
1109  /*! An LDL^\dag decomposition and inversion? */
1110  template<typename T, typename U>
1112  {
1113  START_CODE();
1114 
1115  if ( 2*Nc < 3 )
1116  {
1117  QDPIO::cerr << __func__ << ": Matrix is too small" << std::endl;
1118  QDP_abort(1);
1119  }
1120 
1121  // Zero trace log
1122  tr_log_diag[rb[cb]] = zero;
1123 
1124  //QDPIO::cout << "PTX Clover ldagdlinv " << (void*)this << "\n";
1125  //std::cout << "PTX Clover ldagdlinv " << (void*)this << "\n";
1126  static CUfunction function;
1127 
1128  if (function == NULL)
1129  function = function_ldagdlinv_build<U>(tr_log_diag, tri_dia, tri_off, rb[cb] );
1130 
1131  // Execute the function
1132  function_ldagdlinv_exec(function, tr_log_diag, tri_dia, tri_off, rb[cb] );
1133 
1134  // This comes from the days when we used to do Cholesky
1135  choles_done[cb] = true;
1136  END_CODE();
1137  }
1138 
1139  /*! CHLCLOVMS - Cholesky decompose the clover mass term and uses it to
1140  * compute lower(A^-1) = lower((L.L^dag)^-1)
1141  * Adapted from Golub and Van Loan, Matrix Computations, 2nd, Sec 4.2.4
1142  *
1143  * Arguments:
1144  *
1145  * \param DetP flag whether to compute determinant (Read)
1146  * \param logdet logarithm of the determinant (Write)
1147  * \param cb checkerboard of work (Read)
1148  */
1149 
1150 
1151 
1152 
1153 
1154 
1155  //! TRIACNTR
1156  /*!
1157  * \ingroup linop
1158  *
1159  * Calculates
1160  * Tr_D ( Gamma_mat L )
1161  *
1162  * This routine is specific to Wilson fermions!
1163  *
1164  * the trace over the Dirac indices for one of the 16 Gamma matrices
1165  * and a hermitian color x spin matrix A, stored as a block diagonal
1166  * complex lower triangular matrix L and a real diagonal diag_L.
1167 
1168  * Here 0 <= mat <= 15 and
1169  * if mat = mat_1 + mat_2 * 2 + mat_3 * 4 + mat_4 * 8
1170  *
1171  * Gamma(mat) = gamma(1)^(mat_1) * gamma(2)^(mat_2) * gamma(3)^(mat_3)
1172  * * gamma(4)^(mat_4)
1173  *
1174  * Further, in basis for the Gamma matrices used, A is of the form
1175  *
1176  * | A_0 | 0 |
1177  * A = | --------- |
1178  * | 0 | A_1 |
1179  *
1180  *
1181  * Arguments:
1182  *
1183  * \param B the resulting SU(N) color matrix (Write)
1184  * \param clov clover term (Read)
1185  * \param mat label of the Gamma matrix (Read)
1186  */
1187 
1188 
1189  template<typename U,typename X,typename Y>
1190  CUfunction function_triacntr_exec( CUfunction function,
1191  U& B,
1192  const X& tri_dia,
1193  const Y& tri_off,
1194  int mat,
1195  const Subset& s)
1196  {
1197  if (!s.hasOrderedRep())
1198  QDP_error_exit("triacntr on subset with unordered representation not implemented");
1199 
1200  AddressLeaf addr_leaf;
1201 
1202  int junk_0 = forEach(B, addr_leaf, NullCombine());
1203  int junk_2 = forEach(tri_dia, addr_leaf, NullCombine());
1204  int junk_3 = forEach(tri_off, addr_leaf, NullCombine());
1205 
1206  // lo <= idx < hi
1207  int lo = s.start();
1208  int hi = s.end();
1209 
1210  std::vector<void*> addr;
1211 
1212  addr.push_back( &lo );
1213  //std::cout << "addr lo = " << addr[0] << " lo=" << lo << "\n";
1214 
1215  addr.push_back( &hi );
1216  //std::cout << "addr hi = " << addr[1] << " hi=" << hi << "\n";
1217 
1218  addr.push_back( &mat );
1219  //std::cout << "addr hi = " << addr[1] << " hi=" << hi << "\n";
1220 
1221  int addr_dest=addr.size();
1222  for(int i=0; i < addr_leaf.addr.size(); ++i) {
1223  addr.push_back( &addr_leaf.addr[i] );
1224  //std::cout << "addr = " << addr_leaf.addr[i] << "\n";
1225  }
1226 
1227  jit_launch(function,s.numSiteTable(),addr);
1228  }
1229 
1230 
1231 
1232 
1233  template<typename U,typename X,typename Y>
1234  CUfunction function_triacntr_build( const U& B,
1235  const X& tri_dia,
1236  const Y& tri_off,
1237  int mat,
1238  const Subset& s)
1239  {
1240  //std::cout << __PRETTY_FUNCTION__ << ": entering\n";
1241 
1242  typedef typename WordType<U>::Type_t REALT;
1243 
1244  //std::cout << __PRETTY_FUNCTION__ << "\n";
1245 
1246  CUfunction func;
1247 
1248  jit_start_new_function();
1249 
1250  jit_value r_lo = jit_add_param( jit_ptx_type::s32 );
1251  jit_value r_hi = jit_add_param( jit_ptx_type::s32 );
1252  jit_value r_mat = jit_add_param( jit_ptx_type::s32 );
1253 
1254  jit_value r_idx_thread = jit_geom_get_linear_th_idx();
1255 
1256  jit_value r_out_of_range = jit_ins_gt( r_idx_thread , jit_ins_sub( r_hi , r_lo ) );
1257  jit_ins_exit( r_out_of_range );
1258 
1259  jit_value r_idx = jit_ins_add( r_lo , r_idx_thread );
1260 
1261  ParamLeaf param_leaf( r_idx );
1262 
1263 
1264  typedef typename LeafFunctor<U, ParamLeaf>::Type_t UJIT;
1265  UJIT B_jit(forEach(B, param_leaf, TreeCombine()));
1266  auto& B_j = B_jit.elem(JitDeviceLayout::Coalesced);
1267 
1268  typedef typename LeafFunctor<X, ParamLeaf>::Type_t XJIT;
1269  XJIT tri_dia_jit(forEach(tri_dia, param_leaf, TreeCombine()));
1270  auto& tri_dia_j = tri_dia_jit.elem(JitDeviceLayout::Coalesced);
1271 
1272  typename REGType< typename XJIT::Subtype_t >::Type_t tri_dia_r;
1273  tri_dia_r.setup( tri_dia_j );
1274  // tri_dia_r.setup( tri_dia_jit.elem(JitDeviceLayout::Coalesced) );
1275 
1276  typedef typename LeafFunctor<Y, ParamLeaf>::Type_t YJIT;
1277  YJIT tri_off_jit(forEach(tri_off, param_leaf, TreeCombine()));
1278  auto& tri_off_j = tri_off_jit.elem(JitDeviceLayout::Coalesced);
1279 
1280  typename REGType< typename YJIT::Subtype_t >::Type_t tri_off_r;
1281  tri_off_r.setup( tri_off_j );
1282  // tri_off_r.setup( tri_off_jit.elem(JitDeviceLayout::Coalesced) );
1283 
1284 
1285  jit_label_t case_0;
1286  jit_label_t case_3;
1287  jit_label_t case_5;
1288  jit_label_t case_6;
1289  jit_label_t case_9;
1290  jit_label_t case_10;
1291  jit_label_t case_12;
1292 
1293 
1294  jit_ins_branch( case_0 , jit_ins_eq( r_mat , jit_value(0) ) );
1295  jit_ins_branch( case_3 , jit_ins_eq( r_mat , jit_value(3) ) );
1296  jit_ins_branch( case_5 , jit_ins_eq( r_mat , jit_value(5) ) );
1297  jit_ins_branch( case_6 , jit_ins_eq( r_mat , jit_value(6) ) );
1298  jit_ins_branch( case_9 , jit_ins_eq( r_mat , jit_value(9) ) );
1299  jit_ins_branch( case_10 , jit_ins_eq( r_mat , jit_value(10) ) );
1300  jit_ins_branch( case_12 , jit_ins_eq( r_mat , jit_value(12) ) );
1301  jit_ins_exit();
1302 
1303 
1304  /*# gamma( 0) 1 0 0 0 # ( 0000 ) --> 0 */
1305  /*# 0 1 0 0 */
1306  /*# 0 0 1 0 */
1307  /*# 0 0 0 1 */
1308  /*# From diagonal part */
1309  jit_ins_label( case_0 );
1310  {
1311  RComplexREG<WordREG<REALT> > lctmp0;
1312  RScalarREG< WordREG<REALT> > lr_zero0;
1313  RScalarREG< WordREG<REALT> > lrtmp0;
1314 
1315  zero_rep(lr_zero0);
1316 
1317  for(int i0 = 0; i0 < Nc; ++i0) {
1318 
1319  lrtmp0 = tri_dia_r.elem(0).elem(i0);
1320  lrtmp0 += tri_dia_r.elem(0).elem(i0+Nc);
1321  lrtmp0 += tri_dia_r.elem(1).elem(i0);
1322  lrtmp0 += tri_dia_r.elem(1).elem(i0+Nc);
1323  B_j.elem().elem(i0,i0) = cmplx(lrtmp0,lr_zero0);
1324  }
1325 
1326  /*# From lower triangular portion */
1327  int elem_ij0 = 0;
1328  for(int i0 = 1; i0 < Nc; ++i0) {
1329 
1330  int elem_ijb0 = (i0+Nc)*(i0+Nc-1)/2 + Nc;
1331 
1332  for(int j0 = 0; j0 < i0; ++j0) {
1333 
1334  lctmp0 = tri_off_r.elem(0).elem(elem_ij0);
1335  lctmp0 += tri_off_r.elem(0).elem(elem_ijb0);
1336  lctmp0 += tri_off_r.elem(1).elem(elem_ij0);
1337  lctmp0 += tri_off_r.elem(1).elem(elem_ijb0);
1338 
1339  B_j.elem().elem(j0,i0) = lctmp0;
1340  B_j.elem().elem(i0,j0) = adj(lctmp0);
1341 
1342  elem_ij0++;
1343  elem_ijb0++;
1344  }
1345  }
1346  jit_ins_exit( );
1347  }
1348 
1349 
1350  jit_ins_label( case_3 );
1351  {
1352  /*# gamma( 12) -i 0 0 0 # ( 0011 ) --> 3 */
1353  /*# 0 i 0 0 */
1354  /*# 0 0 -i 0 */
1355  /*# 0 0 0 i */
1356  /*# From diagonal part */
1357 
1358  RComplexREG<WordREG<REALT> > lctmp3;
1359  RScalarREG<WordREG<REALT> > lr_zero3;
1360  RScalarREG<WordREG<REALT> > lrtmp3;
1361 
1362  lr_zero3 = 0;
1363 
1364  for(int i3 = 0; i3 < Nc; ++i3) {
1365 
1366  lrtmp3 = tri_dia_r.elem(0).elem(i3+Nc);
1367  lrtmp3 -= tri_dia_r.elem(0).elem(i3);
1368  lrtmp3 -= tri_dia_r.elem(1).elem(i3);
1369  lrtmp3 += tri_dia_r.elem(1).elem(i3+Nc);
1370  B_j.elem().elem(i3,i3) = cmplx(lr_zero3,lrtmp3);
1371  }
1372 
1373  /*# From lower triangular portion */
1374  int elem_ij3 = 0;
1375  for(int i3 = 1; i3 < Nc; ++i3) {
1376 
1377  int elem_ijb3 = (i3+Nc)*(i3+Nc-1)/2 + Nc;
1378 
1379  for(int j3 = 0; j3 < i3; ++j3) {
1380 
1381  lctmp3 = tri_off_r.elem(0).elem(elem_ijb3);
1382  lctmp3 -= tri_off_r.elem(0).elem(elem_ij3);
1383  lctmp3 -= tri_off_r.elem(1).elem(elem_ij3);
1384  lctmp3 += tri_off_r.elem(1).elem(elem_ijb3);
1385 
1386  B_j.elem().elem(j3,i3) = timesI(adj(lctmp3));
1387  B_j.elem().elem(i3,j3) = timesI(lctmp3);
1388 
1389  elem_ij3++;
1390  elem_ijb3++;
1391  }
1392  }
1393  jit_ins_exit( );
1394  }
1395 
1396 
1397  jit_ins_label( case_5 );
1398  {
1399  /*# gamma( 13) 0 -1 0 0 # ( 0101 ) --> 5 */
1400  /*# 1 0 0 0 */
1401  /*# 0 0 0 -1 */
1402  /*# 0 0 1 0 */
1403 
1404  RComplexREG<WordREG<REALT> > lctmp5;
1405  RScalarREG<WordREG<REALT> > lrtmp5;
1406 
1407  for(int i5 = 0; i5 < Nc; ++i5) {
1408 
1409  int elem_ij5 = (i5+Nc)*(i5+Nc-1)/2;
1410 
1411  for(int j5 = 0; j5 < Nc; ++j5) {
1412 
1413  int elem_ji5 = (j5+Nc)*(j5+Nc-1)/2 + i5;
1414 
1415 
1416  lctmp5 = adj(tri_off_r.elem(0).elem(elem_ji5));
1417  lctmp5 -= tri_off_r.elem(0).elem(elem_ij5);
1418  lctmp5 += adj(tri_off_r.elem(1).elem(elem_ji5));
1419  lctmp5 -= tri_off_r.elem(1).elem(elem_ij5);
1420 
1421  B_j.elem().elem(i5,j5) = lctmp5;
1422 
1423  elem_ij5++;
1424  }
1425  }
1426  jit_ins_exit( );
1427  }
1428 
1429 
1430  jit_ins_label( case_6 );
1431  {
1432  /*# gamma( 23) 0 -i 0 0 # ( 0110 ) --> 6 */
1433  /*# -i 0 0 0 */
1434  /*# 0 0 0 -i */
1435  /*# 0 0 -i 0 */
1436 
1437  RComplexREG<WordREG<REALT> > lctmp6;
1438  RScalarREG<WordREG<REALT> > lrtmp6;
1439 
1440  for(int i6 = 0; i6 < Nc; ++i6) {
1441 
1442  int elem_ij6 = (i6+Nc)*(i6+Nc-1)/2;
1443 
1444  for(int j6 = 0; j6 < Nc; ++j6) {
1445 
1446  int elem_ji6 = (j6+Nc)*(j6+Nc-1)/2 + i6;
1447 
1448  lctmp6 = adj(tri_off_r.elem(0).elem(elem_ji6));
1449  lctmp6 += tri_off_r.elem(0).elem(elem_ij6);
1450  lctmp6 += adj(tri_off_r.elem(1).elem(elem_ji6));
1451  lctmp6 += tri_off_r.elem(1).elem(elem_ij6);
1452 
1453  B_j.elem().elem(i6,j6) = timesMinusI(lctmp6);
1454 
1455  elem_ij6++;
1456  }
1457  }
1458  jit_ins_exit( );
1459  }
1460 
1461 
1462  jit_ins_label( case_9 );
1463  {
1464  /*# gamma( 14) 0 i 0 0 # ( 1001 ) --> 9 */
1465  /*# i 0 0 0 */
1466  /*# 0 0 0 -i */
1467  /*# 0 0 -i 0 */
1468 
1469  RComplexREG<WordREG<REALT> > lctmp9;
1470  RScalarREG<WordREG<REALT> > lrtmp9;
1471 
1472  for(int i9 = 0; i9 < Nc; ++i9) {
1473 
1474  int elem_ij9 = (i9+Nc)*(i9+Nc-1)/2;
1475 
1476  for(int j9 = 0; j9 < Nc; ++j9) {
1477 
1478  int elem_ji9 = (j9+Nc)*(j9+Nc-1)/2 + i9;
1479 
1480  lctmp9 = adj(tri_off_r.elem(0).elem(elem_ji9));
1481  lctmp9 += tri_off_r.elem(0).elem(elem_ij9);
1482  lctmp9 -= adj(tri_off_r.elem(1).elem(elem_ji9));
1483  lctmp9 -= tri_off_r.elem(1).elem(elem_ij9);
1484 
1485  B_j.elem().elem(i9,j9) = timesI(lctmp9);
1486 
1487  elem_ij9++;
1488  }
1489  }
1490  jit_ins_exit( );
1491  }
1492 
1493 
1494  jit_ins_label( case_10 );
1495  {
1496  /*# gamma( 24) 0 -1 0 0 # ( 1010 ) --> 10 */
1497  /*# 1 0 0 0 */
1498  /*# 0 0 0 1 */
1499  /*# 0 0 -1 0 */
1500 
1501  RComplexREG<WordREG<REALT> > lctmp10;
1502  RScalarREG<WordREG<REALT> > lrtmp10;
1503 
1504  for(int i10 = 0; i10 < Nc; ++i10) {
1505 
1506  int elem_ij10 = (i10+Nc)*(i10+Nc-1)/2;
1507 
1508  for(int j10 = 0; j10 < Nc; ++j10) {
1509 
1510  int elem_ji10 = (j10+Nc)*(j10+Nc-1)/2 + i10;
1511 
1512  lctmp10 = adj(tri_off_r.elem(0).elem(elem_ji10));
1513  lctmp10 -= tri_off_r.elem(0).elem(elem_ij10);
1514  lctmp10 -= adj(tri_off_r.elem(1).elem(elem_ji10));
1515  lctmp10 += tri_off_r.elem(1).elem(elem_ij10);
1516 
1517  B_j.elem().elem(i10,j10) = lctmp10;
1518 
1519  elem_ij10++;
1520  }
1521  }
1522  jit_ins_exit( );
1523  }
1524 
1525 
1526  jit_ins_label( case_12 );
1527  {
1528  /*# gamma( 34) i 0 0 0 # ( 1100 ) --> 12 */
1529  /*# 0 -i 0 0 */
1530  /*# 0 0 -i 0 */
1531  /*# 0 0 0 i */
1532  /*# From diagonal part */
1533 
1534  RComplexREG<WordREG<REALT> > lctmp12;
1535  RScalarREG<WordREG<REALT> > lr_zero12;
1536  RScalarREG<WordREG<REALT> > lrtmp12;
1537 
1538  lr_zero12 = 0;
1539 
1540  for(int i12 = 0; i12 < Nc; ++i12) {
1541 
1542  lrtmp12 = tri_dia_r.elem(0).elem(i12);
1543  lrtmp12 -= tri_dia_r.elem(0).elem(i12+Nc);
1544  lrtmp12 -= tri_dia_r.elem(1).elem(i12);
1545  lrtmp12 += tri_dia_r.elem(1).elem(i12+Nc);
1546  B_j.elem().elem(i12,i12) = cmplx(lr_zero12,lrtmp12);
1547  }
1548 
1549  /*# From lower triangular portion */
1550  int elem_ij12 = 0;
1551  for(int i12 = 1; i12 < Nc; ++i12) {
1552 
1553  int elem_ijb12 = (i12+Nc)*(i12+Nc-1)/2 + Nc;
1554 
1555  for(int j12 = 0; j12 < i12; ++j12) {
1556 
1557  lctmp12 = tri_off_r.elem(0).elem(elem_ij12);
1558  lctmp12 -= tri_off_r.elem(0).elem(elem_ijb12);
1559  lctmp12 -= tri_off_r.elem(1).elem(elem_ij12);
1560  lctmp12 += tri_off_r.elem(1).elem(elem_ijb12);
1561 
1562  B_j.elem().elem(i12,j12) = timesI(lctmp12);
1563  B_j.elem().elem(j12,i12) = timesI(adj(lctmp12));
1564 
1565  elem_ij12++;
1566  elem_ijb12++;
1567  }
1568  }
1569  jit_ins_exit( );
1570  }
1571 
1572  return jit_get_cufunction("ptx_triacntr.ptx");
1573  }
1574 
1575 
1576 
1577 
1578  template<typename T, typename U>
1579  void PTXCloverTermT<T,U>::triacntr(U& B, int mat, int cb) const
1580  {
1581  START_CODE();
1582 
1583  B = zero;
1584 
1585  if ( mat < 0 || mat > 15 )
1586  {
1587  QDPIO::cerr << __func__ << ": Gamma out of range: mat = " << mat << std::endl;
1588  QDP_abort(1);
1589  }
1590 
1591  //QDPIO::cout << "PTX Clover triacntr " << (void*)this << "\n";
1592  //std::cout << "PTX Clover triacntr " << (void*)this << "\n";
1593  static CUfunction function;
1594 
1595  if (function == NULL)
1596  function = function_triacntr_build<U>( B, tri_dia, tri_off, mat, rb[cb] );
1597 
1598  // Execute the function
1599  function_triacntr_exec(function, B, tri_dia, tri_off, mat, rb[cb] );
1600 
1601  END_CODE();
1602  }
1603 
1604  //! Returns the appropriate clover coefficient for indices mu and nu
1605  template<typename T, typename U>
1606  Real
1608  {
1609  START_CODE();
1610 
1611  if( param.anisoParam.anisoP ) {
1612  if (mu==param.anisoParam.t_dir || nu == param.anisoParam.t_dir) {
1613  return param.clovCoeffT;
1614  }
1615  else {
1616  // Otherwise return the spatial coeff
1617  return param.clovCoeffR;
1618  }
1619  }
1620  else {
1621  // If there is no anisotropy just return the spatial one, it will
1622  // be the same as the temporal one
1623  return param.clovCoeffR;
1624  }
1625 
1626  END_CODE();
1627  }
1628 
1629 
1630 
1631  template<typename T,typename X,typename Y>
1632  void function_apply_clov_exec(CUfunction function,
1633  T& chi,
1634  const T& psi,
1635  const X& tri_dia,
1636  const Y& tri_off,
1637  const Subset& s)
1638  {
1639  if (!s.hasOrderedRep())
1640  QDP_error_exit("clover on subset with unordered representation not implemented");
1641 
1642  //std::cout << __PRETTY_FUNCTION__ << ": entering\n";
1643 
1644  AddressLeaf addr_leaf;
1645 
1646  int junk_0 = forEach(chi, addr_leaf, NullCombine());
1647  int junk_1 = forEach(psi, addr_leaf, NullCombine());
1648  int junk_2 = forEach(tri_dia, addr_leaf, NullCombine());
1649  int junk_3 = forEach(tri_off, addr_leaf, NullCombine());
1650 
1651  // lo <= idx < hi
1652  int lo = s.start();
1653  int hi = s.end();
1654 
1655  std::vector<void*> addr;
1656 
1657  addr.push_back( &lo );
1658  //std::cout << "addr lo = " << addr[0] << " lo=" << lo << "\n";
1659 
1660  addr.push_back( &hi );
1661  //std::cout << "addr hi = " << addr[1] << " hi=" << hi << "\n";
1662 
1663  int addr_dest=addr.size();
1664  for(int i=0; i < addr_leaf.addr.size(); ++i) {
1665  addr.push_back( &addr_leaf.addr[i] );
1666  //std::cout << "addr = " << addr_leaf.addr[i] << "\n";
1667  }
1668 
1669  jit_launch(function,s.numSiteTable(),addr);
1670  }
1671 
1672 
1673 
1674 
1675  template<typename T,typename X,typename Y>
1676  CUfunction function_apply_clov_build(const T& chi,
1677  const T& psi,
1678  const X& tri_dia,
1679  const Y& tri_off,
1680  const Subset& s)
1681  {
1682  //std::cout << __PRETTY_FUNCTION__ << ": entering\n";
1683  //typedef typename WordType<RealT>::Type_t REALT;
1684 
1685  CUfunction func;
1686 
1687  jit_start_new_function();
1688 
1689  jit_value r_lo = jit_add_param( jit_ptx_type::s32 );
1690  jit_value r_hi = jit_add_param( jit_ptx_type::s32 );
1691 
1692  jit_value r_idx_thread = jit_geom_get_linear_th_idx();
1693 
1694  jit_value r_out_of_range = jit_ins_gt( r_idx_thread , jit_ins_sub( r_hi , r_lo ) );
1695  jit_ins_exit( r_out_of_range );
1696 
1697  jit_value r_idx = jit_ins_add( r_lo , r_idx_thread );
1698 
1699  ParamLeaf param_leaf( r_idx );
1700 
1701 
1702  typedef typename LeafFunctor<T, ParamLeaf>::Type_t TJIT;
1703  TJIT chi_jit(forEach(chi, param_leaf, TreeCombine()));
1704  TJIT psi_jit(forEach(psi, param_leaf, TreeCombine()));
1705  auto& chi_j = chi_jit.elem(JitDeviceLayout::Coalesced);
1706 
1707  typename REGType< typename TJIT::Subtype_t >::Type_t psi_r;
1708  psi_r.setup( psi_jit.elem(JitDeviceLayout::Coalesced) );
1709 
1710  typename REGType< typename TJIT::Subtype_t >::Type_t chi_r;
1711 
1712 
1713  typedef typename LeafFunctor<X, ParamLeaf>::Type_t XJIT;
1714  XJIT tri_dia_jit(forEach(tri_dia, param_leaf, TreeCombine()));
1715 
1716  typename REGType< typename XJIT::Subtype_t >::Type_t tri_dia_r;
1717  tri_dia_r.setup( tri_dia_jit.elem(JitDeviceLayout::Coalesced) );
1718 
1719 
1720 
1721 
1722  typedef typename LeafFunctor<Y, ParamLeaf>::Type_t YJIT;
1723  YJIT tri_off_jit(forEach(tri_off, param_leaf, TreeCombine()));
1724  auto& tri_off_j = tri_off_jit.elem(JitDeviceLayout::Coalesced);
1725 
1726  typename REGType< typename YJIT::Subtype_t >::Type_t tri_off_r;
1727  tri_off_r.setup( tri_off_jit.elem(JitDeviceLayout::Coalesced) );
1728 
1729 
1730 
1731 
1732  // RComplex<REALT>* cchi = (RComplex<REALT>*)&(chi.elem(site).elem(0).elem(0));
1733  // const RComplex<REALT>* ppsi = (const RComplex<REALT>*)&(psi.elem(site).elem(0).elem(0));
1734 
1735  int n = 2*Nc;
1736 
1737  for(int i = 0; i < n; ++i)
1738  {
1739  chi_r.elem((0*n+i)/3).elem((0*n+i)%3) = tri_dia_r.elem(0).elem(i) * psi_r.elem((0*n+i)/3).elem((0*n+i)%3);
1740  // cchi[0*n+i] = tri[site].diag[0][i] * ppsi[0*n+i];
1741 
1742  chi_r.elem((1*n+i)/3).elem((1*n+i)%3) = tri_dia_r.elem(1).elem(i) * psi_r.elem((1*n+i)/3).elem((1*n+i)%3);
1743  // cchi[1*n+i] = tri[site].diag[1][i] * ppsi[1*n+i];
1744  }
1745 
1746  int kij = 0;
1747  for(int i = 0; i < n; ++i)
1748  {
1749  for(int j = 0; j < i; j++)
1750  {
1751  chi_r.elem((0*n+i)/3).elem((0*n+i)%3) += tri_off_r.elem(0).elem(kij) * psi_r.elem((0*n+j)/3).elem((0*n+j)%3);
1752  // cchi[0*n+i] += tri[site].offd[0][kij] * ppsi[0*n+j];
1753 
1754  chi_r.elem((0*n+j)/3).elem((0*n+j)%3) += conj(tri_off_r.elem(0).elem(kij)) * psi_r.elem((0*n+i)/3).elem((0*n+i)%3);
1755  // cchi[0*n+j] += conj(tri[site].offd[0][kij]) * ppsi[0*n+i];
1756 
1757  chi_r.elem((1*n+i)/3).elem((1*n+i)%3) += tri_off_r.elem(1).elem(kij) * psi_r.elem((1*n+j)/3).elem((1*n+j)%3);
1758  // cchi[1*n+i] += tri[site].offd[1][kij] * ppsi[1*n+j];
1759 
1760  chi_r.elem((1*n+j)/3).elem((1*n+j)%3) += conj(tri_off_r.elem(1).elem(kij)) * psi_r.elem((1*n+i)/3).elem((1*n+i)%3);
1761  // cchi[1*n+j] += conj(tri[site].offd[1][kij]) * ppsi[1*n+i];
1762 
1763  kij++;
1764  }
1765  }
1766 
1767  chi_j = chi_r;
1768 
1769  return jit_get_cufunction("ptx_apply_clov.ptx");
1770  }
1771 
1772 
1773 
1774 
1775 
1776 
1777 
1778  /**
1779  * Apply a dslash
1780  *
1781  * Performs the operation
1782  *
1783  * chi <- (L + D + L^dag) . psi
1784  *
1785  * where
1786  * L is a lower triangular matrix
1787  * D is the real diagonal. (stored together in type TRIANG)
1788  *
1789  * Arguments:
1790  * \param chi result (Write)
1791  * \param psi source (Read)
1792  * \param isign D'^dag or D' ( MINUS | PLUS ) resp. (Read)
1793  * \param cb Checkerboard of OUTPUT std::vector (Read)
1794  */
1795  template<typename T, typename U>
1797  enum PlusMinus isign, int cb) const
1798  {
1799  START_CODE();
1800 
1801  if ( Ns != 4 ) {
1802  QDPIO::cerr << __func__ << ": CloverTerm::apply requires Ns==4" << std::endl;
1803  QDP_abort(1);
1804  }
1805 
1806  //QDPIO::cout << "PTX Clover apply" << (void*)this << "\n";
1807  //std::cout << "PTX Clover apply" << (void*)this << "\n";
1808  static CUfunction function;
1809 
1810  if (function == NULL)
1811  function = function_apply_clov_build(chi, psi, tri_dia, tri_off, rb[cb] );
1812 
1813  // Execute the function
1814  function_apply_clov_exec(function, chi, psi, tri_dia, tri_off, rb[cb] );
1815 
1816  (*this).getFermBC().modifyF(chi, QDP::rb[cb]);
1817 
1818  END_CODE();
1819  }
1820 
1821 
1822 
1823 
1824 
1825  namespace QDPCloverEnv {
1826  template<typename R,typename TD,typename TO>
1827  struct QUDAPackArgs {
1828  int cb;
1829  multi1d<QUDAPackedClovSite<R> >& quda_array;
1830  const TD& tri_dia;
1831  const TO& tri_off;
1832  };
1833 
1834  template<typename R,typename TD,typename TO>
1835  void qudaPackSiteLoop(int lo, int hi, int myId, QUDAPackArgs<R,TD,TO>* a) {
1836  int cb = a->cb;
1837  int Ns2 = Ns/2;
1838 
1839  multi1d<QUDAPackedClovSite<R> >& quda_array = a->quda_array;
1840 
1841  const TD& tri_dia = a->tri_dia;
1842  const TO& tri_off = a->tri_off;
1843 
1844  const int idtab[15]={0,1,3,6,10,2,4,7,11,5,8,12,9,13,14};
1845 
1846  for(int ssite=lo; ssite < hi; ++ssite) {
1847  int site = rb[cb].siteTable()[ssite];
1848  // First Chiral Block
1849  for(int i=0; i < 6; i++) {
1850  quda_array[site].diag1[i] = tri_dia.elem(site).comp[0].diag[i].elem().elem();
1851  }
1852 
1853  int target_index=0;
1854 
1855  for(int col=0; col < Nc*Ns2-1; col++) {
1856  for(int row=col+1; row < Nc*Ns2; row++) {
1857 
1858  int source_index = row*(row-1)/2 + col;
1859 
1860  quda_array[site].offDiag1[target_index][0] = tri_off.elem(site).comp[0].offd[source_index].real().elem();
1861  quda_array[site].offDiag1[target_index][1] = tri_off.elem(site).comp[0].offd[source_index].imag().elem();
1862  target_index++;
1863  }
1864  }
1865  // Second Chiral Block
1866  for(int i=0; i < 6; i++) {
1867  quda_array[site].diag2[i] = tri_dia.elem(site).comp[1].diag[i].elem().elem();
1868  }
1869 
1870  target_index=0;
1871  for(int col=0; col < Nc*Ns2-1; col++) {
1872  for(int row=col+1; row < Nc*Ns2; row++) {
1873 
1874  int source_index = row*(row-1)/2 + col;
1875 
1876  quda_array[site].offDiag2[target_index][0] = tri_off.elem(site).comp[1].offd[source_index].real().elem();
1877  quda_array[site].offDiag2[target_index][1] = tri_off.elem(site).comp[1].offd[source_index].imag().elem();
1878  target_index++;
1879  }
1880  }
1881  }
1882  QDPIO::cout << "\n";
1883  }
1884  }
1885 
1886  template<typename T, typename U>
1887  void PTXCloverTermT<T,U>::packForQUDA(multi1d<QUDAPackedClovSite<typename WordType<T>::Type_t> >& quda_array, int cb) const
1888  {
1889  typedef typename WordType<T>::Type_t REALT;
1890  int num_sites = rb[cb].siteTable().size();
1891 
1892  typedef OLattice<PComp<PTriDia<RScalar <Word<REALT> > > > > TD;
1893  typedef OLattice<PComp<PTriOff<RComplex<Word<REALT> > > > > TO;
1894 
1895  StopWatch watch;
1896  watch.start();
1897 
1898  QDPCloverEnv::QUDAPackArgs<REALT,TD,TO> args = { cb, quda_array , tri_dia , tri_off };
1899  dispatch_to_threads(num_sites, args, QDPCloverEnv::qudaPackSiteLoop<REALT,TD,TO>);
1900 
1901  watch.stop();
1902  PackForQUDATimer::Instance().get() += watch.getTimeInMicroseconds();
1903  }
1904 
1905 
1906 
1907  template<typename T, typename U>
1909  enum PlusMinus isign, int site) const
1910  {
1911  QDP_error_exit("PTXCloverTermT<T,U>::applySite(T& chi, const T& psi,..) not implemented ");
1912 #if 0
1913 #endif
1914  }
1915 
1916 
1920 } // End Namespace Chroma
1921 
1922 
1923 #endif
Base class for all fermion action boundary conditions.
Definition: fermbc.h:20
Support class for fermion actions and linear operators.
Definition: state.h:94
Class for counted reference semantics.
Definition: handle.h:33
void choles(int cb)
Computes the inverse of the term on cb using Cholesky.
OLattice< PScalar< PScalar< RScalar< Word< REALT > > > > > LatticeREAL
const multi1d< U > & getU() const
Get the u field.
multi1d< bool > choles_done
CloverFermActParams param
void applySite(T &chi, const T &psi, enum PlusMinus isign, int site) const
void makeClov(const multi1d< U > &f, const RealT &diag_mass)
Create the clover term on cb.
Real getCloverCoeff(int mu, int nu) const
Calculates Tr_D ( Gamma_mat L )
Handle< FermBC< T, multi1d< U >, multi1d< U > > > fbc
OLattice< PComp< PTriDia< RScalar< Word< REALT > > > > > tri_dia
const FermBC< T, multi1d< U >, multi1d< U > > & getFermBC() const
Return the fermion BC object for this linear operator.
Double cholesDet(int cb) const
Computes the inverse of the term on cb using Cholesky.
WordType< T >::Type_t REALT
void ldagdlinv(LatticeREAL &tr_log_diag, int cb)
Invert the clover term on cb.
OScalar< PScalar< PScalar< RScalar< Word< REALT > > > > > RealT
void packForQUDA(multi1d< QUDAPackedClovSite< REALT > > &quda_pack, int cb) const
PACK UP the Clover term for QUDA library:
~PTXCloverTermT()
No real need for cleanup here.
OLattice< PComp< PTriOff< RComplex< Word< REALT > > > > > tri_off
void create(Handle< FermState< T, multi1d< U >, multi1d< U > > > fs, const CloverFermActParams &param_)
Creation routine.
void apply(T &chi, const T &psi, enum PlusMinus isign, int cb) const
void triacntr(U &B, int mat, int cb) const
Calculates Tr_D ( Gamma_mat L )
PTXCloverTermT()
Empty constructor. Must use create later.
const double & get() const
static PackForQUDATimer & Instance()
Parameters for Clover fermion action.
Clover term linear operator.
int mu
Definition: cool.cc:24
int nu
Definition: cool.cc:25
void block(LatticeColorMatrix &u_block, const multi1d< LatticeColorMatrix > &u, int mu, int bl_level, const Real &BlkAccu, int BlkMax, int j_decay)
Construct block links.
Definition: block.cc:42
void function_triacntr_exec(const JitFunction &function, U &B, const X &tri_dia, const Y &tri_off, int mat, const Subset &s)
TRIACNTR.
unsigned elem_ij
Definition: ldumul_w.cc:38
unsigned s
Definition: ldumul_w.cc:37
unsigned j
Definition: ldumul_w.cc:35
unsigned elem_ji
Definition: ldumul_w.cc:39
unsigned i
Definition: ldumul_w.cc:34
unsigned n
Definition: ldumul_w.cc:36
Calculates the antihermitian field strength tensor iF(mu,nu)
Nd
Definition: meslate.cc:74
void qudaPackSiteLoop(int lo, int hi, int myId, QUDAPackArgs< R, TD, TO > *a)
Asqtad Staggered-Dirac operator.
Definition: klein_gord.cc:10
QDP_error_exit("too many BiCG iterations", n_count, rsd_sq, cp, c, re_rvr, im_rvr, re_a, im_a, re_b, im_b)
void function_make_clov_exec(const JitFunction &function, const RealT &diag_mass, const U &f0, const U &f1, const U &f2, const U &f3, const U &f4, const U &f5, X &tri_dia, Y &tri_off)
static multi1d< LatticeColorMatrix > u
void function_ldagdlinv_exec(const JitFunction &function, T &tr_log_diag, X &tri_dia, Y &tri_off, const Subset &s)
void function_apply_clov_exec(const JitFunction &function, T &chi, const T &psi, const X &tri_dia, const Y &tri_off, const Subset &s)
Double one
Definition: invbicg.cc:105
LinOpSysSolverMGProtoClover::T T
int i
Definition: pbg5p_w.cc:55
PTXCloverTermT< LatticeFermion, LatticeColorMatrix > PTXCloverTerm
void function_triacntr_build(JitFunction &func, const U &B, const X &tri_dia, const Y &tri_off, int mat, const Subset &s)
void function_ldagdlinv_build(JitFunction &func, const T &tr_log_diag, const X &tri_dia, const Y &tri_off, const Subset &s)
multi1d< LatticeFermion > chi(Ncb)
PTXCloverTermT< LatticeFermionD, LatticeColorMatrixD > PTXCloverTermD
void mesField(multi1d< LatticeColorMatrixF > &f, const multi1d< LatticeColorMatrixF > &u)
Calculates the antihermitian field strength tensor iF(mu,nu)
Definition: mesfield.cc:80
Complex a
Definition: invbicg.cc:95
LatticeFermion psi
Definition: mespbg5p_w.cc:35
void function_make_clov_build(JitFunction &func, const RealT &diag_mass, const U &f0, const U &f1, const U &f2, const U &f3, const U &f4, const U &f5, const X &tri_dia, const Y &tri_off)
PTXCloverTermT< LatticeFermionF, LatticeColorMatrixF > PTXCloverTermF
START_CODE()
void function_apply_clov_build(JitFunction &func, const T &chi, const T &psi, const X &tri_dia, const Y &tri_off, const Subset &s)
int cb
Definition: invbicg.cc:120
Double zero
Definition: invbicg.cc:106
int k
Definition: invbicg.cc:119
multi1d< LatticeFermion > s(Ncb)
const T1 const T2 const T3 & f3
Definition: gtest.h:1327
const T1 const T2 & f2
Definition: gtest.h:1321
const T1 & f1
Definition: gtest.h:1316
const T1 const T2 const T3 const T4 const T5 & f5
Definition: gtest.h:1339
const T1 const T2 const T3 const T4 & f4
Definition: gtest.h:1333
FloatingPoint< double > Double
Definition: gtest.h:7351
int l
Definition: pade_trln_w.cc:111
Double sum
Definition: qtopcor.cc:37
Support class for fermion actions and linear operators.
Params for clover ferm acts.
multi1d< QUDAPackedClovSite< R > > & quda_array
PCompJIT< typename JITType< T >::Type_t > Type_t
PCompJIT< typename JITType< T >::Type_t > Type_t
PTriDiaJIT< typename JITType< T >::Type_t > Type_t
PTriDiaJIT< typename JITType< T >::Type_t > Type_t
PTriOffJIT< typename JITType< T >::Type_t > Type_t
PTriOffJIT< typename JITType< T >::Type_t > Type_t
PCompJIT & operator=(const PCompREG< T1 > &rhs)
const T & elem(int i) const
void setup(const PCompJIT< typename JITType< T >::Type_t > &rhs)
const T & elem(int i) const
const T & elem(int i) const
PTriDiaJIT & operator=(const PTriDiaREG< T1 > &rhs)
const T & elem(int i) const
void setup(const PTriDiaJIT< typename JITType< T >::Type_t > &rhs)
PTriOffJIT & operator=(const PTriOffREG< T1 > &rhs)
const T & elem(int i) const
T F[2 *Nc *Nc-Nc]
const T & elem(int i) const
void setup(const PTriOffJIT< typename JITType< T >::Type_t > &rhs)
T offd[2 *Nc *Nc-Nc]
PCompREG< typename REGType< T >::Type_t > Type_t
PTriDiaREG< typename REGType< T >::Type_t > Type_t
PTriOffREG< typename REGType< T >::Type_t > Type_t
WordType< T >::Type_t Type_t
WordType< T >::Type_t Type_t
WordType< T >::Type_t Type_t
WordType< T >::Type_t Type_t
multi1d< LatticeColorMatrix > U
LatticeFermion T
Definition: t_clover.cc:11
func
Definition: t_preccfz.cc:17
LatticeFermionD TD
Definition: t_quda_tprec.cc:22