CHROMA
invibicgstab.cc
Go to the documentation of this file.
1 /*! \file
2  * \brief Conjugate-Gradient algorithm for a generic Linear Operator
3  */
4 
5 #include "chromabase.h"
8 
9 namespace Chroma {
10 
11  using namespace BiCGStabKernels;
12 
13  template<typename T, typename CR>
14 SystemSolverResults_t
16  const T& chi,
17  T& psi,
18  const Real& RsdBiCGStab,
19  int MaxBiCGStab,
20  enum PlusMinus isign)
21 
22 {
24  initKernels();
25  StopWatch swatch;
26  FlopCounter flopcount;
27  flopcount.reset();
28  const Subset& sub = A.subset();
29  bool convP = false;
30  ret.n_count = MaxBiCGStab;
31 
32  swatch.reset();
33  swatch.start();
34 
35  Double chi_sq = norm2(chi,sub);
36  flopcount.addSiteFlops(4*Nc*Ns,sub);
37 
38 
39  Double rsd_sq = RsdBiCGStab*RsdBiCGStab*chi_sq;
40 
41  // First get r = r0 = chi - A psi
42  T r;
43  T r0;
44  T v;
45  T tmp;
46  T t;
47  T s;
48  T u;
49  T q;
50  T f0;
51  T z;
52  ComplexD rhon, rhon_1; // rho_n, rho_{n-1}
53  ComplexD alphan, alphan_1; // alpha_{n}, alpha_{n-1}
54  ComplexD omega; // omega_{n} AND omega_{n-1} (before omega update)
55  ComplexD taun,taun_1; // tau_{n}, tau_{n-1}
56  RealD kappa;
57  RealD rnorm_prev; // lagged residuum
58  ComplexD theta;
59  ComplexD sigman_2, sigman_1;
60  ComplexD phi; // phi_{n} AND phi_{n-1} before phi update
61  ComplexD pi_c; // pi_{n} and pi_{n-1} before pi update
62  ComplexD gamma, eta;
63 
64  // r = r0 = chi - A psi_0
65  A(r0, psi, isign);
66  r[sub] = chi - r0;
67  r0[sub] = r;
68 
69  flopcount.addFlops(A.nFlops());
70  flopcount.addSiteFlops(2*Nc*Ns,sub);
71 
72 
73  // f0 = A^\dag r_0 and plays a role like r_0 in inner products
74  if(isign == PLUS) {
75  A(f0,r0,MINUS);
76  }
77  else {
78  A(f0,r0, PLUS);
79  }
80  flopcount.addFlops(A.nFlops());
81 
82  // Now initialise v_0 = q_0 = z_0 = 0
83  v[sub] = zero;
84  q[sub] = zero;
85  z[sub] = zero;
86 
87  // u_0 = A r_0
88  A(u,r,isign);
89  flopcount.addFlops(A.nFlops());
90 
91 
92  // rho_0 := alpha_0 := omega_0 = 1
93  rhon_1 = Double(1);
94  alphan_1 = Double(1);
95  omega = Double(1);
96 
97  // tau_0 = <r0,v0> = 0 since v0 = 0
98  taun_1 = Double(0);
99 
100  // sigma_{-1} = 0
101  sigman_2 = Double(0);
102 
103  // pi_0 = <r0,q_0> = 0
104  pi_c = Double(0);
105 
106  // sigma_{0} = <r0, Ar_0> = <r0, u_0>
107  sigman_1 = innerProduct(r0,u,sub);
108 
109  //**** NB: Paper says phi_0 = 0.
110  //**** PETSc code claims it is <r0,r0>
111  //**** Using phi_0 = 0 leads to rho_1 = 0
112  //**** which leads to breakdown in iteration 2
113  //**** So I will use the PETSc version which appears to work
114 
115  // phi_0 = <r0,r0>
116  phi = innerProduct(r0,r0,sub);
117  flopcount.addSiteFlops(16*Nc*Ns,sub); // (for sigma & phi)
118 
119 
120  // OK The big for loop
121  for(int n = 1; n <= MaxBiCGStab && !convP ; n++) {
122 
123  // Regular BiCGStab: rho = <r0,r>
124  // For IBiCGStab this has been unwound into
125  // the recurrencce:
126  //
127  // rho = phi_{n-1} - omega_{n-1}*( sigma_{n-2} - alpha_{n-1}*pi_{n-1} )
128  //
129  // Check for rho=0. If it is it will lead to breakdown in next iteration
130  rhon = phi - omega*(sigman_2 - alphan_1*pi_c); // 16 flops
131 
132 
133  if( toBool( real(rhon) == 0 ) && toBool( imag(rhon) == 0 ) ) {
134  QDPIO::cout << "BiCGStab breakdown: rho = 0" << std::endl;
135  QDP_abort(1);
136  }
137 
138 
139  // Regular BiCGStab: beta = ( rho_{n}/rho_{n-1})(alpha_{n-1}/omega_{n-1})
140  //
141  // For IBiCGStab where one can use delta_n = beta*omega_{n-1}
142  // it is useful to compute:
143  //
144  // delta_n = (rho_{n}/rho_{n-1})*alpha_{n-1}
145  // beta_n = delta_n/ omega_{n-1}
146  ComplexD beta;
147  ComplexD delta;
148  delta =( rhon / rhon_1 ) * alphan_1; // 15 flops
149  beta = delta/omega; // 9 flops
150 
151 
152  // tau_n = <r0, v> needed for denominator of alpha
153  // but can be updated by recurrance
154  taun = sigman_1 + beta*(taun_1- omega*pi_c); // 16 flops
155 
156  if( toBool( real(taun) == 0 ) && toBool( imag(taun) == 0 ) ) {
157  QDPIO::cout << "BiCGStab breakdown: <r_0|v> = 0" << std::endl;
158  QDP_abort(1);
159  }
160 
161  // form alpha = rho/tau
162  alphan = rhon / taun; // 9 flops
163 
164 
165  // z_n plays role of alpha_n p_n in normal BiCGstab
166  // it is only used to update the solution.
167  //
168  // NB one needs alpha_n p_{-1}
169  // = (alpha_n/alpha_{n-1}) (alpha_{n-1} p_{n-1})
170  // = (alpha_n/alpha_{n-1}) z_{n-1}
171  //
172  // The Paper in line (12) of the algorithm leaves out this
173  // (alpha_n/alpha_{n-1}) factor.
174  //
175  // Also z update needs to be pulled before the v update (line 8) of paper
176  // otherwise a shadow copy of v_{n-1} needs to be kept,
177  //
178  // z = alphan r_n-1 + (beta*alphan/alpha_{n-1})*z_{n-1}
179  // - (beta*alphan*omegan_1)*v_{n-1}
180  ComplexD bar = beta*alphan/alphan_1; // 15 flops
181  ComplexD alphadelta = alphan*delta; // 6 flops
182 
183 #if 0
184  tmp[sub] = bar*z;
185  z[sub] = alphan*r+tmp;
186  z[sub] -= alphadelta*v; // 22 Nc*Ns flops/site
187 
188 
189  // v = u_{n-1} + beta*v_{n-1} - beta*omegan_1*q_n_1
190  tmp[sub] = beta*v; // 6Nc Ns
191  v[sub] = u + tmp; // 2Nc Ns
192  v[sub] -= delta*q; // 8Nc Ns
193 #else
194  ibicgstab_zvupdates(r,z,v,u,q,alphan, bar, alphadelta, beta, delta, sub);
195 #endif
196 
197 
198  // q = Av
199  A(q,v,isign);
200 
201 #if 0
202  // t = u - alpha q
203  t[sub] = u - alphan * q; // 8 Nc Ns
204 
205  // s = r - alpha v
206  s[sub] = r - alphan*v; // 8 Nc Ns
207 
208 
209  // This should all be done with one sync point
210  // BIG ALLREDUCE
211  phi = innerProduct(r0,s,sub); // 8 Nc Ns flops/site
212  gamma = innerProduct(f0,s,sub); // 8 Nc Ns flops/site
213  pi_c = innerProduct(r0,q,sub); // 8 Nc Ns flops/site
214  eta = innerProduct(f0,t,sub); // 8 Nc Ns flops/site
215  theta = innerProduct(t,s,sub); // 8 Nc Ns flops/site
216  kappa = norm2(t,sub); // 4 Nc Ns flops/site
217  rnorm_prev = norm2(r,sub); // 4 Nc Ns flops/site
218 #else
220  r,
221  u,
222  v,
223  q,
224  r0,
225  f0,
226  s,
227  t,
228  phi,
229  pi_c,
230  gamma,
231  eta,
232  theta,
233  kappa,
234  rnorm_prev,
235  sub);
236 #endif
237 
238  // Collected flopcounts
239  // coefficient recurrences: flopcount.addFlops(86);
240  // z & v updates flopcount.addSiteFlops(38*Nc*Ns, sub);
241  // q = Av flopcount.addFlops(A.nFlops());
242  // s & t updates: flopcount.addSiteFlops(16*Nc*Ns, sub);
243  // 5 inner products, 2 norms: flopcount.addSiteFlops(48*Nc*Ns, sub);
244  flopcount.addFlops(A.nFlops() + 86);
245  flopcount.addSiteFlops(102*Nc*Ns, sub);
246 
247  // Check Norm: NB this is a lagged norm.
248  // So we're checking whether we converged last iteration
249  // This is the price of having only 1 allreduce
250  if( toBool(rnorm_prev < rsd_sq ) ) {
251  // Yes we've converged
252  convP = true;
253  ret.resid = sqrt(rnorm_prev);
254  ret.n_count = n;
255 
256  }
257  else {
258 
259  // No we haven't converged ... continue...
260  convP = false;
261 
262  // Check kappa for breakdown
263  if( toBool(kappa == 0) ) {
264  QDPIO::cerr << "Breakdown || Ms || = || t || = 0 " << std::endl;
265  QDP_abort(1);
266  }
267 
268  // Regular BiCGStab omega_n = <t,s> / <t,t> = theta/kappa
269  omega = theta/kappa; // 9 flops
270 
271  // sigma_n = <r0, A u_n> = gamma_n - omega_n * eta_n;
272  //
273  // NB: sigma_n is never explicitly used only sigma_{n-1}, sigma_{n-2}
274  // So if I stuck this in sigma_n, I'd just end up moving it to sigma_{n-1}
275  // So I'll just stick it straight into sigma_{n-1}
276  sigman_2 = sigman_1; // Preserve
277  sigman_1 = gamma - omega*eta; // 8 flops
278 
279 
280 #if 0
281  // Update r, psi
282  // r = s - omega t
283  r[sub] = s - omega*t;
284 
285  // psi = psi + omega s + z
286  tmp[sub] = psi + omega*s;
287  psi[sub] = tmp + z;
288 #else
290 #endif
291 
292  // Recompute next u = A r
293  A(u,r,isign);
294 
295 
296  // Update past values: Some of this could be saved I am sure
297  rhon_1 = rhon;
298  alphan_1 = alphan;
299  taun_1 = taun;
300 
301  // Collected Flops
302  // Omega + Sigma Updates: flopcount.addFlops(17);
303  // r & x updates: flopcount.addSiteFlops(18*Nc*Ns, sub)
304  // u update flopcount.addFlops(A.NFlops)
305  flopcount.addFlops(A.nFlops()+17);
306  flopcount.addSiteFlops(18*Nc*Ns,sub);
307  }
308 
309 
310  }
311 
312  swatch.stop();
313 
314  QDPIO::cout << "InvIBiCGStab: n = " << ret.n_count << " resid = " << ret.resid << std::endl;
315  flopcount.report("invibicgstab", swatch.getTimeInSeconds());
316 
317  if ( ret.n_count == MaxBiCGStab ) {
318  QDPIO::cerr << "Nonconvergence of IBiCGStab. MaxIters reached " << std::endl;
319  }
320 
321  finishKernels();
322  return ret;
323 }
324 
325 
326 template<>
327 SystemSolverResults_t
329  const LatticeFermionF& chi,
330  LatticeFermionF& psi,
331  const Real& RsdBiCGStab,
332  int MaxBiCGStab,
333  enum PlusMinus isign)
334 
335 {
336  return InvIBiCGStab_a<LatticeFermionF, ComplexF>(A, chi, psi, RsdBiCGStab, MaxBiCGStab, isign);
337 }
338 
339 template<>
340 SystemSolverResults_t
342  const LatticeFermionD& chi,
343  LatticeFermionD& psi,
344  const Real& RsdBiCGStab,
345  int MaxBiCGStab,
346  enum PlusMinus isign)
347 
348 {
349  return InvIBiCGStab_a<LatticeFermionD, ComplexD>(A, chi, psi, RsdBiCGStab, MaxBiCGStab, isign);
350 }
351 
352 // Staggered
353 template<>
354 SystemSolverResults_t
356  const LatticeStaggeredFermion& chi,
357  LatticeStaggeredFermion& psi,
358  const Real& RsdBiCGStab,
359  int MaxBiCGStab,
360  enum PlusMinus isign)
361 
362 {
363  return InvIBiCGStab_a<LatticeStaggeredFermion, Complex>(A, chi, psi, RsdBiCGStab, MaxBiCGStab, isign);
364 }
365 
366 } // end namespace Chroma
Primary include file for CHROMA library code.
Linear Operator.
Definition: linearop.h:27
Conjugate-Gradient algorithm for a generic Linear Operator.
unsigned n
Definition: ldumul_w.cc:36
int z
Definition: meslate.cc:36
int t
Definition: meslate.cc:37
Double q
Definition: mesq.cc:17
LatticeReal phi
Definition: mesq.cc:27
void ibicgstab_rxupdate(const C &omega, const T &s, const T &t, const T &z, T &r, T &x, const Subset &sub)
void ibicgstab_zvupdates(const T &r, T &z, T &v, const T &u, const T &q, const C &alpha, const C &alpha_rat_beta, const C &alpha_delta, const C &beta, const C &delta, const Subset &s)
void ibicgstab_stupdates_reduces(const C &alpha, const T &r, const T &u, const T &v, const T &q, const T &r0, const T &f0, T &s, T &t, C &phi, C &pi, C &gamma, C &eta, C &theta, F &kappa, F &rnorm, const Subset &sub)
BinaryReturn< C1, C2, FnInnerProduct >::Type_t innerProduct(const QDPSubType< T1, C1 > &s1, const QDPType< T2, C2 > &s2)
static const LatticeInteger & beta(const int dim)
Definition: stag_phases_s.h:47
Asqtad Staggered-Dirac operator.
Definition: klein_gord.cc:10
static multi1d< LatticeColorMatrix > u
SystemSolverResults_t InvIBiCGStab(const LinearOperator< LatticeFermionF > &A, const LatticeFermionF &chi, LatticeFermionF &psi, const Real &RsdBiCGStab, int MaxBiCGStab, enum PlusMinus isign)
LatticeFermion tmp
Definition: mespbg5p_w.cc:36
LinOpSysSolverMGProtoClover::T T
Real rsd_sq
Definition: invbicg.cc:121
@ MINUS
Definition: chromabase.h:45
@ PLUS
Definition: chromabase.h:45
multi1d< LatticeFermion > chi(Ncb)
Complex omega
Definition: invbicg.cc:97
LatticeFermion psi
Definition: mespbg5p_w.cc:35
LatticeFermion eta
Definition: mespbg5p_w.cc:37
SystemSolverResults_t InvIBiCGStab_a(const LinearOperator< T > &A, const T &chi, T &psi, const Real &RsdBiCGStab, int MaxBiCGStab, enum PlusMinus isign)
Definition: invibicgstab.cc:15
A(A, psi, r, Ncb, PLUS)
Double zero
Definition: invbicg.cc:106
multi1d< LatticeFermion > s(Ncb)
FloatingPoint< double > Double
Definition: gtest.h:7351
int kappa
Definition: pade_trln_w.cc:112
int r0
Definition: qtopcor.cc:41
Holds return info from SystemSolver call.
Definition: syssolver.h:17