CHROMA
stout_utils.cc
Go to the documentation of this file.
1 // -*- C++ -*-
2 /*! \file
3  * \brief Stout utilities
4  */
5 
6 #include "chroma_config.h"
7 #include "chromabase.h"
9 
10 //#if defined(BUILD_JIT_CLOVER_TERM)
11 //#include "util/gauge/stout_utils_ptx.h"
12 //#endif
13 
14 #if defined(BUILD_JIT_CLOVER_TERM)
15 #ifdef QDPJIT_IS_QDPJITPTX
16 CUfunction function_get_fs_bs_exec(CUfunction function,
17  const LatticeColorMatrix& Q,
18  const LatticeColorMatrix& QQ,
19  multi1d<LatticeComplex>& f,
20  multi1d<LatticeComplex>& b1,
21  multi1d<LatticeComplex>& b2,
22  bool dobs);
23 CUfunction function_get_fs_bs_build(const LatticeColorMatrix& Q,
24  const LatticeColorMatrix& QQ,
25  multi1d<LatticeComplex>& f,
26  multi1d<LatticeComplex>& b1,
27  multi1d<LatticeComplex>& b2,
28  bool dobs);
29 #elif defined(QDPJIT_IS_QDPJITNVVM)
30 CUfunction function_get_fs_bs_exec(CUfunction function,
31  const LatticeColorMatrix& Q,
32  const LatticeColorMatrix& QQ,
33  multi1d<LatticeComplex>& f,
34  multi1d<LatticeComplex>& b1,
35  multi1d<LatticeComplex>& b2,
36  bool dobs);
37 CUfunction function_get_fs_bs_build(const LatticeColorMatrix& Q,
38  const LatticeColorMatrix& QQ,
39  multi1d<LatticeComplex>& f,
40  multi1d<LatticeComplex>& b1,
41  multi1d<LatticeComplex>& b2);
42 #else
43 void function_get_fs_bs_exec(const JitFunction& func,
44  const LatticeColorMatrix& Q,
45  const LatticeColorMatrix& QQ,
46  multi1d<LatticeComplex>& f,
47  multi1d<LatticeComplex>& b1,
48  multi1d<LatticeComplex>& b2,
49  bool dobs);
50 void *function_get_fs_bs_build(JitFunction& function,
51  const LatticeColorMatrix& Q,
52  const LatticeColorMatrix& QQ,
53  multi1d<LatticeComplex>& f,
54  multi1d<LatticeComplex>& b1,
55  multi1d<LatticeComplex>& b2);
56 #endif
57 #endif
58 
59 
60 
61 namespace Chroma
62 {
63 
64  //! Timings
65  /*! \ingroup gauge */
66  namespace StoutLinkTimings {
67  static double smearing_secs = 0;
68  double getSmearingTime() {
69  return smearing_secs;
70  }
71 
72  static double force_secs = 0;
73  double getForceTime() {
74  return force_secs;
75  }
76 
77  static double functions_secs = 0;
78  double getFunctionsTime() {
79  return functions_secs;
80  }
81  }
82 
83  //! Utilities
84  /*! \ingroup gauge */
85  namespace Stouting
86  {
87 
88  /*! \ingroup gauge */
89  void getQs(const multi1d<LatticeColorMatrix>& u, LatticeColorMatrix& Q,
90  LatticeColorMatrix& QQ,
91  int mu,
92  const multi1d<bool>& smear_in_this_dirP,
93  const multi2d<Real>& rho)
94  {
95  START_CODE();
96 
97  LatticeColorMatrix C;
98  getQsandCs(u, Q, QQ, C, mu, smear_in_this_dirP,rho);
99 
100  END_CODE();
101  }
102 
103 
104  /*! \ingroup gauge */
105  void getQsandCs(const multi1d<LatticeColorMatrix>& u, LatticeColorMatrix& Q,
106  LatticeColorMatrix& QQ,
107  LatticeColorMatrix& C,
108  int mu,
109  const multi1d<bool>& smear_in_this_dirP,
110  const multi2d<Real>& rho)
111  {
112  START_CODE();
113 
114  C = zero;
115 
116  // If rho is nonzero in this direction then accumulate the staples
117  for(int nu=0; nu < Nd; nu++)
118  {
119  // Accumulate mu-nu staple
120  if( (mu != nu) && smear_in_this_dirP[nu] )
121  {
122  LatticeColorMatrix U_nu_plus_mu = shift(u[nu], FORWARD, mu);
123  LatticeColorMatrix tmp_mat;
124  LatticeColorMatrix tmp_mat2;
125 
126 
127  // Forward staple
128  // 2
129  // ^ --------->
130  // | |
131  // 1 | | 3
132  // | |
133  // | V
134  // x x + mu
135  //
136  tmp_mat = shift(u[mu],FORWARD, nu);
137  tmp_mat2 = u[nu]*tmp_mat;
138  tmp_mat = tmp_mat2*adj(U_nu_plus_mu);
139 
140 
141  // Backward staple
142  //
143  // | ^
144  // | |
145  // 1 | | 3
146  // | 2 |
147  // V--------->|
148  // x-nu x - nu + mu
149  //
150  //
151  // we construct it on x-nu and shift it up to x,
152  // (with a backward shift)
153 
154  // This is the staple on x-nu:
155  // tmp_1(x) = u_dag(x,nu)*u(x,mu)*u(x+mu,nu)
156  {
157  LatticeColorMatrix tmp_mat3;
158  tmp_mat3 = adj(u[nu])*u[mu];
159  tmp_mat2 = tmp_mat3*U_nu_plus_mu;
160  tmp_mat += shift(tmp_mat2, BACKWARD, nu);
161  tmp_mat *= rho(mu,nu);
162  }
163  C += tmp_mat;
164 
165  }
166  }
167 
168  // Now I can form the Q
169  LatticeColorMatrix Omega;
170  Omega = C*adj(u[mu]); // Q_mu is Omega mu here (eq 2 part 2)
171 
172  LatticeColorMatrix tmp2 = adj(Omega) - Omega;
173  LatticeColorMatrix tmp3 = trace(tmp2);
174  tmp3 *= Real(1)/Real(Nc);
175  tmp2 -= tmp3;
176  tmp2 *= Real(0.5);
177  Q = timesI(tmp2);
178  QQ = Q*Q;
179 
180  END_CODE();
181  }
182 
183  /*! \ingroup gauge */
184  // Do the force recursion from level i+1, to level i
185  // The input fat_force F is modified.
186  void deriv_recurse(multi1d<LatticeColorMatrix>& F,
187  const multi1d<bool>& smear_in_this_dirP,
188  const multi2d<Real>& rho,
189  const multi1d<LatticeColorMatrix>& u)
190  {
191  START_CODE();
192 
193  // Things I need
194  // C_{\mu} = staple multiplied appropriately by the rho
195  // Lambda matrices asper eq(73)
196  multi1d<LatticeColorMatrix> F_plus(Nd);
197 
198  // Save the fat force
199  F_plus = F;
200 
201 
202  multi1d<LatticeColorMatrix> Lambda(Nd);
203  multi1d<LatticeColorMatrix> C(Nd);
204 
205  // The links at this level (unprimed in the paper).
206  //const multi1d<LatticeColorMatrix>& u = smeared_links[level];
207 
208  for(int mu=0; mu < Nd; mu++)
209  {
210  if( smear_in_this_dirP[mu] )
211  {
212  LatticeColorMatrix Q,QQ; // This is the C U^{dag}_mu suitably antisymmetrized
213 
214  // Get Q, Q^2, C, c0 and c1 -- this code is the same as used in stout_smear()
215  getQsandCs(u, Q, QQ, C[mu], mu, smear_in_this_dirP,rho);
216 
217  // Now work the f-s and b-s
218  multi1d<LatticeComplex> f;
219  multi1d<LatticeComplex> b_1;
220  multi1d<LatticeComplex> b_2;
221 
222  // Get the fs and bs -- does internal resize to make them arrays of length 3
223  // QDPIO::cout << __func__ << ": mu=" << mu << std::endl;
224  getFsAndBs(Q,QQ, f, b_1, b_2, true);
225 
226 
227  LatticeColorMatrix B_1 = b_1[0] + b_1[1]*Q + b_1[2]*QQ;
228  LatticeColorMatrix B_2 = b_2[0] + b_2[1]*Q + b_2[2]*QQ;
229 
230 
231  // Construct the Gamma ( eq 74 and 73 )
232  LatticeColorMatrix USigma = u[mu]*F_plus[mu];
233  LatticeColorMatrix Gamma = f[1]*USigma + f[2]*(USigma*Q + Q*USigma)
234  + trace(B_1*USigma)*Q
235  + trace(B_2*USigma)*QQ;
236 
237  // Take the traceless hermitian part to form Lambda_mu (eq 72)
238  Lambda[mu] = Gamma + adj(Gamma); // Make it hermitian
239  LatticeColorMatrix tmp3 = (Double(1)/Double(Nc))*trace(Lambda[mu]); // Subtract off the trace
240  Lambda[mu] -= tmp3;
241  Lambda[mu] *= Double(0.5); // overall factor of 1/2
242 
243  // The first 3 terms of eq 75
244  // Now the Fat force * the exp(iQ)
245  F[mu] = F_plus[mu]*(f[0] + f[1]*Q + f[2]*QQ);
246 
247 #if 0
248  QDPIO::cout << __func__ << ": F[" << mu << "]= " << norm2(F[mu])
249  << " F_plus[mu]=" << norm2(F_plus[mu])
250  << " f[0]=" << norm2(f[0])
251  << " f[1]=" << norm2(f[1])
252  << " f[2]=" << norm2(f[2])
253  << " B_1=" << norm2(B_1)
254  << " B_2=" << norm2(B_2)
255  << " Q=" << norm2(Q)
256  << " QQ=" << norm2(QQ)
257  << std::endl;
258 #endif
259 
260  } // End of if( smear_in_this_dirP[mu] )
261  // else what is in F_mu is the right force
262  }
263 
264  // At this point we should have
265  //
266  // F[mu] = F_plus[mu]*exp(iQ)
267  //
268  // We need the 8 staple terms left in dOmega/dU (last 6 terms in eq 75 + the iC{+}Lambda
269  // term in eq 75 which in reality just covers 2 staples.
270 
271  // We have to make this a separate loop from the above, because we need to know the
272  // Lambda[mu] and [nu] for all the avaliable mu-nu combinations
273  for(int mu = 0; mu < Nd; mu++)
274  {
275  if( smear_in_this_dirP[mu] )
276  {
277  LatticeColorMatrix staple_sum = zero;
278  // LatticeColorMatrix staple_sum_dag = adj(C[mu])*Lambda[mu];
279  for(int nu = 0; nu < Nd; nu++) {
280  if((mu != nu) && smear_in_this_dirP[nu] ) {
281  LatticeColorMatrix U_nu_plus_mu = shift(u[nu],FORWARD, mu);
282  LatticeColorMatrix U_mu_plus_nu = shift(u[mu],FORWARD, nu);
283  LatticeColorMatrix Lambda_nu_plus_mu = shift(Lambda[nu], FORWARD, mu);
284  LatticeColorMatrix tmp_mat;
285  LatticeColorMatrix tmp_mat2;
286 
287 
288  // THe three upward staples
289  // Staples 1 5 and 6 in the paper
290  //
291  // Staple 1
292  // rho(nu,mu) * ( [ U_nu(x+mu) U^+_mu(x+nu) ] U^+_nu(x) ) Lambda_nu(x)
293  // Staple 5
294  // - rho(nu,mu) * Lambda_nu(x+mu)*( [ U_nu(x+mu) U^+_mu(x+nu) ] U^+_nu(x) )
295  // Staple 6
296  // rho(mu,nu) * [ U_nu(x+mu) U^+_mu(x+nu) ] Lambda_mu(x + nu) U^{+}_nu(x)
297  //
298  //
299  // Here the suggestive [] are common to all three terms.
300  // Also Staple 1 and 5 share the additional U^+_nu(x) as indicated by suggestive ()
301  // Staple 1 and 5 also share the same rho(nu,mu) but have different sign
302 
303  {
304  LatticeColorMatrix tmp_mat3;
305  LatticeColorMatrix tmp_mat4;
306 
307  tmp_mat = U_nu_plus_mu*adj(U_mu_plus_nu); // Term in square brackets common to all
308  tmp_mat2 = tmp_mat*adj(u[nu]); // Term in round brackets common to staples 1 and 5
309  tmp_mat3 = tmp_mat2*Lambda[nu]; // Staple 1
310  tmp_mat4 = Lambda_nu_plus_mu*tmp_mat2;
311  tmp_mat3 -= tmp_mat4; // Staple 5 and minus sign
312  tmp_mat3 *= rho(nu,mu); // Common factor on staple 1 and 5
313 
314  tmp_mat4 = shift(Lambda[mu],FORWARD,nu);
315  tmp_mat2 = tmp_mat*tmp_mat4; // Staple 6
316  tmp_mat = tmp_mat2*adj(u[nu]); // and again
317  tmp_mat *= rho(mu,nu); // rho(mu, nu) factor on staple 6
318  tmp_mat += tmp_mat3; // collect staples 1 5 and 6 onto staple sum
319  staple_sum += tmp_mat; // slap onto the staple sum
320 
321  // Tmp 3 disappears here
322  }
323 
324  // The three downward staples
325  //
326  // Paper Staples 2, 3, and 4;
327  //
328  // Staple 2:
329  // rho_mu_nu * U^{+}_nu(x-nu+mu) [ U^+_mu(x-nu) Lambda_mu(x-nu) ] U_nu(x-nu)
330  // Staple 3
331  // rho_nu_mu * U^{+}_nu(x-nu+mu) [ Lambda_nu(x-nu+mu) U^{+}_mu(x-nu) ] U_nu(x-nu)
332  // Staple 4
333  // - rho_nu_mu * U^{+}_nu(x-nu+mu) [ U^{+}_mu(x-nu) Lambda_nu(x-nu) ] U_nu(x-nu)
334  //
335  // I have suggestively placed brackets to show that all these staples share a common
336  // first and last term. Secondly staples 3 and 4 share the same value of rho (but opposite sign)
337  //
338  // Finally this can all be communicated on site and then shifted altoghether to x-nu
339  {
340  LatticeColorMatrix tmp_mat4;
341 
342  tmp_mat = Lambda_nu_plus_mu*adj(u[mu]); // Staple 3 term in brackets
343  tmp_mat4 = adj(u[mu])*Lambda[nu];
344  tmp_mat -= tmp_mat4; // Staple 4 term in brackets and -ve sign
345  tmp_mat *= rho(nu,mu); // Staple 3 & 4 common rho value
346  tmp_mat2 = adj(u[mu])*Lambda[mu]; // Staple 2 term in brackets
347  tmp_mat2 *= rho(mu,nu); // Staple 2 rho factor
348  tmp_mat += tmp_mat2; // Combine terms in brackets, signs and rho factors
349 
350  tmp_mat2 = adj(U_nu_plus_mu)*tmp_mat; // Common first matrix
351  tmp_mat = tmp_mat2*u[nu]; // Common last matrix
352 
353  staple_sum += shift(tmp_mat, BACKWARD, nu); // Shift it all back to x-nu
354  }
355 
356  } // end of if mu != nu
357  } // end nu loop
358 
359  // Add on this term - there is a relative minus sign which will be corrected by the sign on
360  // on accumulation to F
361  staple_sum -= adj(C[mu])*Lambda[mu];
362 
363  F[mu] -= timesI(staple_sum);
364 
365 #if 0
366  QDPIO::cout << __func__ << ":b, F[" << mu << "]= " << norm2(F[mu])
367  << " staple=" << norm2(staple_sum)
368  << " Lambda=" << norm2(Lambda[mu])
369  << " C=" << norm2(C[mu])
370  << std::endl;
371 #endif
372  } // End of if(smear_in_this_dirP[mu]
373  // Else nothing needs done to the force
374  } // end mu loop
375 
376 
377 
378  // Done
379  END_CODE();
380  }
381 
382  /*! \ingroup gauge */
383  void getFs(const LatticeColorMatrix& Q,
384  const LatticeColorMatrix& QQ,
385  multi1d<LatticeComplex>& f)
386  {
387  START_CODE();
388 
389  // Now compute the f's -- use the same function as for computing the fs, bs etc in derivative
390  // but don't compute the b-'s
391  multi1d<LatticeComplex> b_1; // Dummy - not used -- throwaway -- won't even get resized
392  multi1d<LatticeComplex> b_2; // Dummy - not used here -- throwaway -- won't even get resized
393 
394  getFsAndBs(Q,QQ,f,b_1,b_2,false); // This routine computes the f-s
395 
396  END_CODE();
397  }
398 
399 
400  /* A namespace to hide the thread dispatcher in */
401  namespace StoutUtils {
402  struct GetFsAndBsArgs {
403  const LatticeColorMatrix& Q;
404  const LatticeColorMatrix& QQ;
405  multi1d<LatticeComplex>& f;
406  multi1d<LatticeComplex>& b1;
407  multi1d<LatticeComplex>& b2;
408  bool dobs;
409  };
410 
411 
412 
413 
414  inline
415  void getFsAndBsSiteLoop(int lo, int hi, int myId,
416  GetFsAndBsArgs* arg)
417  {
418 #ifndef QDP_IS_QDPJIT
419  const LatticeColorMatrix& Q = arg->Q;
420  const LatticeColorMatrix& QQ = arg->QQ;
421  multi1d<LatticeComplex>& f = arg->f;
422  multi1d<LatticeComplex>& b1 = arg->b1;
423  multi1d<LatticeComplex>& b2 = arg->b2;
424  bool dobs=arg->dobs;
425 
426  for(int site=lo; site < hi; site++)
427  {
428  // Get the traces
429  PColorMatrix<QDP::RComplex<REAL>, Nc> Q_site = Q.elem(site).elem();
430  PColorMatrix<QDP::RComplex<REAL>, Nc> QQ_site = QQ.elem(site).elem();
431  PColorMatrix<QDP::RComplex<REAL>, Nc> QQQ = QQ_site*Q_site;
432 
433  Real trQQQ;
434  trQQQ.elem() = realTrace(QQQ);
435  Real trQQ;
436  trQQ.elem() = realTrace(QQ_site);
437 
438  REAL c0 = ((REAL)1/(REAL)3) * trQQQ.elem().elem().elem().elem(); // eq 13
439  REAL c1 = ((REAL)1/(REAL)2) * trQQ.elem().elem().elem().elem(); // eq 15
440 
441 
442  if( c1 < 4.0e-3 )
443  { // RGE: set to 4.0e-3 (CM uses this value). I ran into nans with 1.0e-4
444  // ================================================================================
445  //
446  // Corner Case 1: if c1 < 1.0e-4 this implies c0max ~ 3x10^-7
447  // and in this case the division c0/c0max in arccos c0/c0max can be undefined
448  // and produce NaN's
449 
450  // In this case what we can do is get the f-s a different way. We go back to basics:
451  //
452  // We solve (using std::maple) the matrix equations using the eigenvalues
453  //
454  // [ 1, q_1, q_1^2 ] [ f_0 ] [ exp( iq_1 ) ]
455  // [ 1, q_2, q_2^2 ] [ f_1 ] = [ exp( iq_2 ) ]
456  // [ 1, q_3, q_3^2 ] [ f_2 ] [ exp( iq_3 ) ]
457  //
458  // with q_1 = 2 u w, q_2 = -u + w, q_3 = - u - w
459  //
460  // with u and w defined as u = sqrt( c_1/ 3 ) cos (theta/3)
461  // and w = sqrt( c_1 ) sin (theta/3)
462  // theta = arccos ( c0 / c0max )
463  // leaving c0max as a symbol.
464  //
465  // we then expand the resulting f_i as a series around c0 = 0 and c1 = 0
466  // and then substitute in c0max = 2 ( c_1/ 3)^(3/2)
467  //
468  // we then convert the results to polynomials and take the real and imaginary parts:
469  // we get at the end of the day (to low order)
470 
471  // 1 2
472  // f0[re] := 1 - --- c0 + h.o.t
473  // 720
474  //
475  // 1 1 1 2
476  // f0[im] := - - c0 + --- c0 c1 - ---- c0 c1 + h.o.t
477  // 6 120 5040
478  //
479  //
480  // 1 1 1 2
481  // f1[re] := -- c0 - --- c0 c1 + ----- c0 c1 + h.o.t
482  // 24 360 13440 f
483  //
484  // 1 1 2 1 3 1 2
485  // f1[im] := 1 - - c1 + --- c1 - ---- c1 - ---- c0 + h.o.t
486  // 6 120 5040 5040
487  //
488  // 1 1 1 2 1 3 1 2
489  // f2[re] := - - + -- c1 - --- c1 + ----- c1 + ----- c0 + h.o.t
490  // 2 24 720 40320 40320
491  //
492  // 1 1 1 2
493  // f2[im] := --- c0 - ---- c0 c1 + ------ c0 c1 + h.o.t
494  // 120 2520 120960
495 
496  // We then express these using Horner's rule for more stable evaluation.
497  //
498  // to get the b-s we use the fact that
499  // b2_i = d f_i / d c0
500  // and b1_i = d f_i / d c1
501  //
502  // where the derivatives are partial derivativs
503  //
504  // And we just differentiate the polynomials above (keeping the same level
505  // of truncation) and reexpress that as Horner's rule
506  //
507  // This clearly also handles the case of a unit gauge as no c1, u etc appears in the
508  // denominator and the arccos is never taken. In this case, we have the results in
509  // the raw c0, c1 form and we don't need to flip signs and take complex conjugates.
510  //
511  // I checked the expressions below by taking the difference between the Horner forms
512  // below from the expanded forms (and their derivatives) above and checking for the
513  // differences to be zero. At this point in time std::maple seems happy.
514  // ==================================================================================
515 
516  f[0].elem(site).elem().elem().real() = 1.0-c0*c0/720.0;
517  f[0].elem(site).elem().elem().imag() = -(c0/6.0)*(1.0-(c1/20.0)*(1.0-(c1/42.0))) ;
518 
519  f[1].elem(site).elem().elem().real() = c0/24.0*(1.0-c1/15.0*(1.0-3.0*c1/112.0)) ;
520  f[1].elem(site).elem().elem().imag() = 1.0-c1/6.0*(1.0-c1/20.0*(1.0-c1/42.0))-c0*c0/5040.0 ;
521 
522  f[2].elem(site).elem().elem().real() = 0.5*(-1.0+c1/12.0*(1.0-c1/30.0*(1.0-c1/56.0))+c0*c0/20160.0);
523  f[2].elem(site).elem().elem().imag() = 0.5*(c0/60.0*(1.0-c1/21.0*(1.0-c1/48.0)));
524 
525  if( dobs == true ) {
526  // partial f0/ partial c0
527  b2[0].elem(site).elem().elem().real() = -c0/360.0;
528  b2[0].elem(site).elem().elem().imag() = -(1.0/6.0)*(1.0-(c1/20.0)*(1.0-c1/42.0));
529 
530  // partial f0 / partial c1
531  //
532  b1[0].elem(site).elem().elem().real() = 0;
533  b1[0].elem(site).elem().elem().imag() = (c0/120.0)*(1.0-c1/21.0);
534 
535  // partial f1 / partial c0
536  //
537  b2[1].elem(site).elem().elem().real() = (1.0/24.0)*(1.0-c1/15.0*(1.0-3.0*c1/112.0));
538  b2[1].elem(site).elem().elem().imag() = -c0/2520.0;
539 
540 
541  // partial f1 / partial c1
542  b1[1].elem(site).elem().elem().real() = -c0/360.0*(1.0 - 3.0*c1/56.0 );
543  b1[1].elem(site).elem().elem().imag() = -1.0/6.0*(1.0-c1/10.0*(1.0-c1/28.0));
544 
545  // partial f2/ partial c0
546  b2[2].elem(site).elem().elem().real() = 0.5*c0/10080.0;
547  b2[2].elem(site).elem().elem().imag() = 0.5*( 1.0/60.0*(1.0-c1/21.0*(1.0-c1/48.0)) );
548 
549  // partial f2/ partial c1
550  b1[2].elem(site).elem().elem().real() = 0.5*( 1.0/12.0*(1.0-(2.0*c1/30.0)*(1.0-3.0*c1/112.0)) );
551  b1[2].elem(site).elem().elem().imag() = 0.5*( -c0/1260.0*(1.0-c1/24.0) );
552 
553 #if 0
554  {
555  multi1d<int> coord = Layout::siteCoords(Layout::nodeNumber(), site);
556 
557  QMP_fprintf(stdout,
558  "%s: corner; site=%d coord=[%d,%d,%d,%d] f[0]=%g f[1]=%g f[2]=%g b1[0]=%g b1[1]=%g b1[2]=%g b2[0]=%g b2[1]=%g b2[2]=%g c0=%g c1=%g",
559 
560  __func__, site, coord[0], coord[1], coord[2], coord[3],
561  toDouble(localNorm2(f[0].elem(site))),
562  toDouble(localNorm2(f[1].elem(site))),
563  toDouble(localNorm2(f[2].elem(site))),
564  toDouble(localNorm2(b1[0].elem(site))),
565  toDouble(localNorm2(b1[1].elem(site))),
566  toDouble(localNorm2(b1[2].elem(site))),
567  toDouble(localNorm2(b2[0].elem(site))),
568  toDouble(localNorm2(b2[1].elem(site))),
569  toDouble(localNorm2(b2[2].elem(site))),
570  c0, c1);
571  }
572 #endif
573  } // Dobs==true
574  }
575  else
576  {
577  // ===================================================================================
578  // Normal case: Do as per paper
579  // ===================================================================================
580  bool c0_negativeP = c0 < 0;
581  REAL c0abs = fabs((double)c0);
582  REAL c0max = 2*pow( (double)(c1/(double)3), (double)1.5);
583  REAL theta;
584 
585  // ======================================================================================
586  // Now work out theta. In the paper the case where c0 -> c0max even when c1 is reasonable
587  // Has never been considered, even though it can arise and can cause the arccos function
588  // to fail
589  // Here we handle it with series expansion
590  // =====================================================================================
591  REAL eps = (c0max - c0abs)/c0max;
592 
593  if( eps < 0 ) {
594  // ===============================================================================
595  // Corner Case 2: Handle case when c0abs is bigger than c0max.
596  // This can happen only when there is a rounding error in the ratio, and that the
597  // ratio is really 1. This implies theta = 0 which we'll just set.
598  // ===============================================================================
599  {
600  multi1d<int> coord = Layout::siteCoords(Layout::nodeNumber(), site);
601 
602 // fprintf(stdout,
603 // "%s: corner2; site=%d coord=[%d,%d,%d,%d] c0max=%g c0abs=%d eps=%g\n Setting theta=0",
604 // __func__, site, coord[0], coord[1], coord[2], coord[3],
605 // c0abs, c0max,eps);
606  }
607  theta = 0;
608  }
609  else if ( eps < 1.0e-3 ) {
610  // ===============================================================================
611  // Corner Case 3: c0->c0max even though c1 may be actually quite reasonable.
612  // The ratio |c0|/c0max -> 1 but is still less than one, so that a
613  // series expansion is possible.
614  // SERIES of acos(1-epsilon): Good to O(eps^6) or with this cutoff to O(10^{-18}) Computed with Maple.
615  // BTW: 1-epsilon = 1 - (c0max-c0abs)/c0max = 1-(1 - c0abs/c0max) = +c0abs/c0max
616  //
617  // ===============================================================================
618  REAL sqtwo = sqrt((REAL)2);
619 
620  theta = sqtwo*sqrt(eps)*( 1.0 + ( (1/(REAL)12) + ( (3/(REAL)160) + ( (5/(REAL)896) + ( (35/(REAL)18432) + (63/(REAL)90112)*eps ) *eps) *eps) *eps) *eps);
621 
622  }
623  else {
624  //
625  theta = acos( c0abs/c0max );
626  }
627 
628  multi1d<REAL> f_site_re(3);
629  multi1d<REAL> f_site_im(3);
630 
631  multi1d<REAL> b1_site_re(3);
632  multi1d<REAL> b1_site_im(3);
633 
634  multi1d<REAL> b2_site_re(3);
635  multi1d<REAL> b2_site_im(3);
636 
637 
638 
639  REAL u = sqrt(c1/3)*cos(theta/3);
640  REAL w = sqrt(c1)*sin(theta/3);
641 
642  REAL u_sq = u*u;
643  REAL w_sq = w*w;
644 
645  REAL xi0,xi1;
646  {
647  bool w_smallP = fabs(w) < 0.05;
648  if( w_smallP ) {
649  xi0 = (REAL)1 - ((REAL)1/(REAL)6)*w_sq*( 1 - ((REAL)1/(REAL)20)*w_sq*( (REAL)1 - ((REAL)1/(REAL)42)*w_sq ) );
650  }
651  else {
652  xi0 = sin(w)/w;
653  }
654 
655  if( dobs==true) {
656 
657  if( w_smallP ) {
658  xi1 = -1*( ((REAL)1/(REAL)3) - ((REAL)1/(REAL)30)*w_sq*( (REAL)1 - ((REAL)1/(REAL)28)*w_sq*( (REAL)1 - ((REAL)1/(REAL)54)*w_sq ) ) );
659  }
660  else {
661  xi1 = cos(w)/w_sq - sin(w)/(w_sq*w);
662  }
663  }
664  }
665 
666  REAL cosu = cos(u);
667  REAL sinu = sin(u);
668  REAL cosw = cos(w);
669  REAL sinw = sin(w);
670  REAL sin2u = sin(2*u);
671  REAL cos2u = cos(2*u);
672  REAL ucosu = u*cosu;
673  REAL usinu = u*sinu;
674  REAL ucos2u = u*cos2u;
675  REAL usin2u = u*sin2u;
676 
677  REAL denum = (REAL)9*u_sq - w_sq;
678 
679  {
680  REAL subexp1 = u_sq - w_sq;
681  REAL subexp2 = 8*u_sq*cosw;
682  REAL subexp3 = (3*u_sq + w_sq)*xi0;
683 
684  f_site_re[0] = ( (subexp1)*cos2u + cosu*subexp2 + 2*usinu*subexp3 ) / denum ;
685  f_site_im[0] = ( (subexp1)*sin2u - sinu*subexp2 + 2*ucosu*subexp3 ) / denum ;
686  }
687  {
688  REAL subexp = (3*u_sq -w_sq)*xi0;
689 
690  f_site_re[1] = (2*(ucos2u - ucosu*cosw)+subexp*sinu)/denum;
691  f_site_im[1] = (2*(usin2u + usinu*cosw)+subexp*cosu)/denum;
692  }
693 
694 
695  {
696  REAL subexp=3*xi0;
697 
698  f_site_re[2] = (cos2u - cosu*cosw -usinu*subexp) /denum ;
699  f_site_im[2] = (sin2u + sinu*cosw -ucosu*subexp) /denum ;
700  }
701 
702  if( dobs == true )
703  {
704  multi1d<REAL> r_1_re(3);
705  multi1d<REAL> r_1_im(3);
706  multi1d<REAL> r_2_re(3);
707  multi1d<REAL> r_2_im(3);
708 
709  // r_1[0]=Double(2)*cmplx(u, u_sq-w_sq)*exp2iu
710  // + 2.0*expmiu*( cmplx(8.0*u*cosw, -4.0*u_sq*cosw)
711  // + cmplx(u*(3.0*u_sq+w_sq),9.0*u_sq+w_sq)*xi0 );
712  {
713  REAL subexp1 = u_sq - w_sq;
714  REAL subexp2 = 8*cosw + (3*u_sq + w_sq)*xi0 ;
715  REAL subexp3 = 4*u_sq*cosw - (9*u_sq + w_sq)*xi0 ;
716 
717  r_1_re[0] = 2*(ucos2u - sin2u *(subexp1)+ucosu*( subexp2 )- sinu*( subexp3 ) );
718  r_1_im[0] = 2*(usin2u + cos2u *(subexp1)-usinu*( subexp2 )- cosu*( subexp3 ) );
719  }
720 
721  // r_1[1]=cmplx(2.0, 4.0*u)*exp2iu + expmiu*cmplx(-2.0*cosw-(w_sq-3.0*u_sq)*xi0,2.0*u*cosw+6.0*u*xi0);
722  {
723  REAL subexp1 = cosw+3*xi0;
724  REAL subexp2 = 2*cosw + xi0*(w_sq - 3*u_sq);
725 
726  r_1_re[1] = 2*((cos2u - 2*usin2u) + usinu*( subexp1 )) - cosu*( subexp2 );
727  r_1_im[1] = 2*((sin2u + 2*ucos2u) + ucosu*( subexp1 )) + sinu*( subexp2 );
728  }
729 
730 
731  // r_1[2]=2.0*timesI(exp2iu) +expmiu*cmplx(-3.0*u*xi0, cosw-3*xi0);
732  {
733  REAL subexp = cosw - 3*xi0;
734  r_1_re[2] = -2*sin2u -3*ucosu*xi0 + sinu*( subexp );
735  r_1_im[2] = 2*cos2u +3*usinu*xi0 + cosu*( subexp );
736  }
737 
738 
739  //r_2[0]=-2.0*exp2iu + 2*cmplx(0,u)*expmiu*cmplx(cosw+xi0+3*u_sq*xi1,
740  // 4*u*xi0);
741  {
742  REAL subexp = cosw + xi0 + 3*u_sq*xi1;
743  r_2_re[0] = -2*(cos2u + u*( 4*ucosu*xi0 - sinu*(subexp )) );
744  r_2_im[0] = -2*(sin2u - u*( 4*usinu*xi0 + cosu*(subexp )) );
745  }
746 
747 
748  // r_2[1]= expmiu*cmplx(cosw+xi0-3.0*u_sq*xi1, 2.0*u*xi0);
749  // r_2[1] = timesMinusI(r_2[1]);
750  {
751  REAL subexp = cosw + xi0 - 3*u_sq*xi1;
752  r_2_re[1] = 2*ucosu*xi0 - sinu*( subexp ) ;
753  r_2_im[1] = -2*usinu*xi0 - cosu*( subexp ) ;
754  }
755 
756  //r_2[2]=expmiu*cmplx(xi0, -3.0*u*xi1);
757  {
758  REAL subexp = 3*xi1;
759 
760  r_2_re[2] = cosu*xi0 - usinu*subexp ;
761  r_2_im[2] = -( sinu*xi0 + ucosu*subexp ) ;
762  }
763 
764  REAL b_denum=2*denum*denum;
765 
766 
767  for(int j=0; j < 3; j++) {
768 
769  {
770  REAL subexp1 = 2*u;
771  REAL subexp2 = 3*u_sq - w_sq;
772  REAL subexp3 = 2*(15*u_sq + w_sq);
773 
774  b1_site_re[j]=( subexp1*r_1_re[j] + subexp2*r_2_re[j] - subexp3*f_site_re[j] )/b_denum;
775  b1_site_im[j]=( subexp1*r_1_im[j] + subexp2*r_2_im[j] - subexp3*f_site_im[j] )/b_denum;
776  }
777 
778  {
779  REAL subexp1 = 3*u;
780  REAL subexp2 = 24*u;
781 
782  b2_site_re[j]=( r_1_re[j]- subexp1*r_2_re[j] - subexp2 * f_site_re[j] )/b_denum;
783  b2_site_im[j]=( r_1_im[j] -subexp1*r_2_im[j] - subexp2 * f_site_im[j] )/b_denum;
784  }
785  }
786 
787  // Now flip the coefficients of the b-s
788  if( c0_negativeP )
789  {
790  //b1_site[0] = conj(b1_site[0]);
791  b1_site_im[0] *= -1;
792 
793  //b1_site[1] = -conj(b1_site[1]);
794  b1_site_re[1] *= -1;
795 
796  //b1_site[2] = conj(b1_site[2]);
797  b1_site_im[2] *= -1;
798 
799  //b2_site[0] = -conj(b2_site[0]);
800  b2_site_re[0] *= -1;
801 
802  //b2_site[1] = conj(b2_site[1]);
803  b2_site_im[1] *= -1;
804 
805  //b2_site[2] = -conj(b2_site[2]);
806  b2_site_re[2] *= -1;
807  }
808 
809  // Load back into the lattice sized object
810  for(int j=0; j < 3; j++) {
811 
812  b1[j].elem(site).elem().elem().real() = b1_site_re[j];
813  b1[j].elem(site).elem().elem().imag() = b1_site_im[j];
814 
815  b2[j].elem(site).elem().elem().real() = b2_site_re[j];
816  b2[j].elem(site).elem().elem().imag() = b2_site_im[j];
817  }
818 #if 0
819  {
820  multi1d<int> coord = Layout::siteCoords(Layout::nodeNumber(), site);
821  REAL rat = c0abs/c0max;
822 
823  QMP_fprintf(stdout,
824  "%s: site=%d coord=[%d,%d,%d,%d] f_site[0]=%g f_site[1]=%g f_site[2]=%g 1[0]=%g b1[1]=%g b1[2]=%g b2[0]=%g b2[1]=%g b2[2]=%g denum=%g c0=%g c1=%g c0max=%g rat=%g theta=%g",
825 
826  __func__, site, coord[0], coord[1], coord[2], coord[3],
827  toDouble(localNorm2(cmplx(Real(f_site_re[0]),Real(f_site_im[0])))),
828  toDouble(localNorm2(cmplx(Real(f_site_re[1]),Real(f_site_im[1])))),
829  toDouble(localNorm2(cmplx(Real(f_site_re[2]),Real(f_site_im[2])))),
830  toDouble(localNorm2(b1[0].elem(site))),
831  toDouble(localNorm2(b1[1].elem(site))),
832  toDouble(localNorm2(b1[2].elem(site))),
833  toDouble(localNorm2(b2[0].elem(site))),
834  toDouble(localNorm2(b2[1].elem(site))),
835  toDouble(localNorm2(b2[2].elem(site))),
836  denum,
837  c0, c1, c0max,
838  rat, theta);
839  }
840 #endif
841 
842  } // end of if (dobs==true)
843 
844  // Now when everything is done flip signs of the b-s (can't do this before
845  // as the unflipped f-s are needed to find the b-s
846 
847  if( c0_negativeP ) {
848 
849  // f_site[0] = conj(f_site[0]);
850  f_site_im[0] *= -1;
851 
852  //f_site[1] = -conj(f_site[1]);
853  f_site_re[1] *= -1;
854 
855  //f_site[2] = conj(f_site[2]);
856  f_site_im[2] *= -1;
857 
858  }
859 
860  // Load back into the lattice sized object
861  for(int j=0; j < 3; j++) {
862  f[j].elem(site).elem().elem().real() = f_site_re[j];
863  f[j].elem(site).elem().elem().imag() = f_site_im[j];
864  }
865 
866  } // End of if( corner_caseP ) else {}
867  } // End site loop
868 #endif
869  } // End Function
870 
871  } // End Namespace
872 
873  /*! \ingroup gauge */
874  void getFsAndBs(const LatticeColorMatrix& Q,
875  const LatticeColorMatrix& QQ,
876  multi1d<LatticeComplex>& f,
877  multi1d<LatticeComplex>& b1,
878  multi1d<LatticeComplex>& b2,
879  bool dobs)
880  {
881  START_CODE();
882  QDP::StopWatch swatch;
883  swatch.reset();
884  swatch.start();
885 
886  f.resize(3);
887 
888  b1.resize(3);
889  b2.resize(3);
890 
891 
892  int num_sites = Layout::sitesOnNode();
893  StoutUtils::GetFsAndBsArgs args={Q,QQ,f,b1,b2,dobs};
894 
895 
896 #if !defined(BUILD_JIT_CLOVER_TERM)
897 #warning "Using QDP++ stouting"
898  dispatch_to_threads(num_sites, args, StoutUtils::getFsAndBsSiteLoop);
899 #else
900 #if defined(QDPJIT_IS_QDPJITPTX)
901  #warning "Using QDP-JIT/PTX stouting"
902  //QDPIO::cout << "PTX getFsAndBs dobs = " << dobs << "\n";
903  static CUfunction function;
904 
905  if (function == NULL)
906  function = function_get_fs_bs_build( Q,QQ,f,b1,b2,dobs );
907 
908  // Execute the function
909  function_get_fs_bs_exec(function, Q,QQ,f,b1,b2,dobs );
910 #elif defined(QDPJIT_IS_QDPJITNVVM)
911  #warning "Using QDP-JIT/NVVM stouting"
912  static CUfunction function;
913  if (function == NULL)
914  function = function_get_fs_bs_build( Q,QQ,f,b1,b2 );
915  function_get_fs_bs_exec(function, Q,QQ,f,b1,b2,dobs );
916 #else
917  #warning "Using QDP-JIT/LLVM stouting"
918  static JitFunction function;
919 
920  if (!function.built()) {
921  QDPIO::cout << "Building JIT stouting function\n";
922  function_get_fs_bs_build( function,Q,QQ,f,b1,b2 );
923  }
924 
925  // Execute the function
926  function_get_fs_bs_exec(function, Q,QQ,f,b1,b2,dobs );
927 #endif
928 #endif
929 
930  swatch.stop();
931  StoutLinkTimings::functions_secs += swatch.getTimeInSeconds();
932  END_CODE();
933  }
934 
935  /*! \ingroup gauge */
936  void smear_links(const multi1d<LatticeColorMatrix>& current,
937  multi1d<LatticeColorMatrix>& next,
938  const multi1d<bool>& smear_in_this_dirP,
939  const multi2d<Real>& rho)
940  {
941  START_CODE();
942 
943  for(int mu = 0; mu < Nd; mu++)
944  {
945  if( smear_in_this_dirP[mu] )
946  {
947  LatticeColorMatrix Q, QQ;
948 
949  // Q contains the staple term. C is a throwaway
950  getQs(current, Q, QQ, mu, smear_in_this_dirP, rho);
951 
952  // Now compute the f's
953  multi1d<LatticeComplex> f; // routine will resize these
954  getFs(Q,QQ,f); // This routine computes the f-s
955 
956  // Assemble the stout links exp(iQ)U_{mu}
957  next[mu]=(f[0] + f[1]*Q + f[2]*QQ)*current[mu];
958  }
959  else {
960  next[mu]=current[mu]; // Unsmeared
961  }
962 
963  }
964 
965  END_CODE();
966  }
967 
968 
969  /*! \ingroup gauge */
970  void stout_smear(LatticeColorMatrix& next,
971  const multi1d<LatticeColorMatrix>& current,
972  int mu,
973  const multi1d<bool>& smear_in_this_dirP,
974  const multi2d<Real>& rho)
975  {
976  START_CODE();
977 
978  LatticeColorMatrix Q, QQ;
979 
980  // Q contains the staple term. C is a throwaway
981  getQs(current, Q, QQ, mu, smear_in_this_dirP, rho);
982 
983  // Now compute the f's
984  multi1d<LatticeComplex> f; // routine will resize these
985  getFs(Q,QQ,f); // This routine computes the f-s
986 
987  // Assemble the stout links exp(iQ)U_{mu}
988  next = (f[0] + f[1]*Q + f[2]*QQ)*current[mu];
989 
990  END_CODE();
991  }
992 
993  }
994 
995 }
Primary include file for CHROMA library code.
int mu
Definition: cool.cc:24
int nu
Definition: cool.cc:25
void getFs(const LatticeColorMatrix &Q, const LatticeColorMatrix &QQ, multi1d< LatticeComplex > &f)
Given c0 and c1 compute the f-s and b-s.
Definition: stout_utils.cc:383
void smear_links(const multi1d< LatticeColorMatrix > &current, multi1d< LatticeColorMatrix > &next, const multi1d< bool > &smear_in_this_dirP, const multi2d< Real > &rho)
Do the smearing from level i to level i+1.
Definition: stout_utils.cc:936
void getQs(const multi1d< LatticeColorMatrix > &u, LatticeColorMatrix &Q, LatticeColorMatrix &QQ, int mu, const multi1d< bool > &smear_in_this_dirP, const multi2d< Real > &rho)
Given field U, form Q and Q^2.
Definition: stout_utils.cc:89
void deriv_recurse(multi1d< LatticeColorMatrix > &F, const multi1d< bool > &smear_in_this_dirP, const multi2d< Real > &rho, const multi1d< LatticeColorMatrix > &u)
Do the force recursion from level i+1, to level i.
Definition: stout_utils.cc:186
void getFsAndBs(const LatticeColorMatrix &Q, const LatticeColorMatrix &QQ, multi1d< LatticeComplex > &f, multi1d< LatticeComplex > &b1, multi1d< LatticeComplex > &b2, bool dobs)
Given c0 and c1 compute the f-s and b-s.
Definition: stout_utils.cc:874
void getQsandCs(const multi1d< LatticeColorMatrix > &u, LatticeColorMatrix &Q, LatticeColorMatrix &QQ, LatticeColorMatrix &C, int mu, const multi1d< bool > &smear_in_this_dirP, const multi2d< Real > &rho)
Given field U, construct the staples into C, form Q and Q^2 and compute c0 and c1.
Definition: stout_utils.cc:105
void stout_smear(LatticeColorMatrix &next, const multi1d< LatticeColorMatrix > &current, int mu, const multi1d< bool > &smear_in_this_dirP, const multi2d< Real > &rho)
Stout smear in a specific link direction.
Definition: stout_utils.cc:970
unsigned j
Definition: ldumul_w.cc:35
multi1d< int > coord(Nd)
Nd
Definition: meslate.cc:74
Double tmp2
Definition: mesq.cc:30
Double tmp3
Definition: mesq.cc:31
SpinMatrix C()
C = Gamma(10)
Definition: barspinmat_w.cc:29
static double functions_secs
Definition: stout_utils.cc:77
static double smearing_secs
Definition: stout_utils.cc:67
static double force_secs
Definition: stout_utils.cc:72
void getFsAndBsSiteLoop(int lo, int hi, int myId, GetFsAndBsArgs *arg)
Definition: stout_utils.cc:415
Asqtad Staggered-Dirac operator.
Definition: klein_gord.cc:10
static multi1d< LatticeColorMatrix > u
LinOpSysSolverMGProtoClover::Q Q
START_CODE()
Double zero
Definition: invbicg.cc:106
FloatingPoint< double > Double
Definition: gtest.h:7351
#define FORWARD
Definition: primitives.h:82
#define BACKWARD
Definition: primitives.h:83
Stout utilities.
multi1d< LatticeColorMatrix > Q
Definition: t_clover.cc:12
func
Definition: t_preccfz.cc:17
static INTERNAL_PRECISION F