CHROMA
minvcg_accumulate_array.cc
Go to the documentation of this file.
1 
2 /*! \file
3  * \brief Multishift Conjugate-Gradient algorithm for a Linear Operator
4  */
5 
6 #include "linearop.h"
8 
9 using namespace QDP::Hints;
10 
11 namespace Chroma
12 {
13 
14 
15  //! Multishift Conjugate-Gradient (CG1) algorithm for a Linear Operator
16  /*! \ingroup invert
17  *
18  * This subroutine uses the Conjugate Gradient (CG) algorithm to find
19  * the solution of the set of linear equations
20  * Method used is described in Jegerlehner, hep-lat/9708029
21 
22  * We are searching in a subspace orthogonal to the eigenvectors EigVec
23  * of A. The source chi is assumed to already be orthogonal!
24 
25  * Chi = A . Psi
26 
27  * Algorithm:
28 
29  * Psi[0] := 0; Zeroed
30  * r[0] := Chi; Initial residual
31  * p[1] := Chi ; Initial direction
32  * b[0] := |r[0]|**2 / <p[0],Ap[0]> ;
33  * z[0] := 1 / (1 - (shift - shift(0))*b)
34  * bs[0] := b[0] * z[0]
35  * r[1] += b[k] A . p[0] ; New residual
36  * Psi[1] = - b[k] p[k] ; Starting solution std::vector
37  * IF |r[0]| <= RsdCG |Chi| THEN RETURN; Converged?
38  * FOR k FROM 1 TO MaxCG DO CG iterations
39  * a[k] := |r[k]|**2 / |r[k-1]|**2 ;
40  * p[k] := r[k] + a[k] p[k-1]; New direction
41  * b[k+1] := |r[k]|**2 / <p[k],Ap[k]> ;
42  * r[k+1] += b[k+1] A . p[k] ; New residual
43  * Psi[k+1] -= b[k+1] p[k] ; New solution std::vector
44  * IF |[k+1]| <= RsdCG |Chi| THEN RETURN; Converged?
45 
46  * Arguments:
47 
48  * A Hermitian linear operator (Read)
49  * Chi Source (Read)
50  * Psi array of solutions (Write)
51  * shifts shifts of form A + mass (Read)
52  * RsdCG residual accuracy (Read/Write)
53  * n_count Number of CG iteration (Write)
54 
55  * Local Variables:
56 
57  * p Direction std::vector
58  * r Residual std::vector
59  * cp | r[k] |**2
60  * c | r[k-1] |**2
61  * k CG iteration counter
62  * a a[k]
63  * b b[k+1]
64  * d < p[k], A.p[k] >
65  * Ap Temporary for M.p
66 
67  * MaxCG Maximum number of CG iterations allowed
68 
69  * Subroutines:
70  * A Apply matrix hermitian A to std::vector
71  */
72 
73  template<typename T, typename C>
74  void MInvCGAccum_a(const C& A,
75  const multi1d<T>& chi,
76  multi1d<T>& psi,
77  const Real& norm,
78  const multi1d<Real>& residues,
79  const multi1d<Real>& poles,
80  const Real& RsdCG,
81  const int MaxCG,
82  int& n_count)
83  {
84  START_CODE();
85 
86  // Setup
87  const Subset& sub = A.subset();
88  int n_shift = poles.size();
89  int N = A.size();
90 
91  FlopCounter flopcount;
92  StopWatch swatch;
93 
94 
95  if (n_shift == 0) {
96  QDPIO::cerr << "MinvCG: You must supply at least 1 shift. shift.size() = " << n_shift << std::endl;
97  QDP_abort(1);
98  }
99 
100  /* Now find the smallest mass */
101  int isz = 0;
102  for(int findit=1; findit < n_shift; ++findit) {
103  if ( toBool( poles[findit] < poles[isz]) ) {
104  isz = findit;
105  }
106  }
107 
108  /* Keep the solutions here */
109  multi1d< multi1d<T> > X(n_shift);
110 
111  // Now arrange the memory
112  // The outer size of the multi1d is n_shift
113  X.resize(n_shift);
114 
115  // For this algorithm, all the X have to be 0 to start
116  // Only is that way the initial residuum r = chi
117  for(int i= 0; i < n_shift; ++i) {
118  X[i].resize(N);
119  }
120 
121  psi.resize(N);
122  for(int i=0; i < N; i++) {
123  psi[i] = zero;
124  }
125 
126 
127  multi1d<T> r(N);
128  multi1d< multi1d<T> > p(n_shift);
129 
130  for(int s = 0; s < n_shift; ++s) {
131  p[s].resize(N);
132  }
133 
134  multi1d<T> Ap(N);
135  multi1d<T> chi_internal(N);
136 
137  // Now arrange the memory:
138  // These guys get most hits so locate them first
139  moveToFastMemoryHint(Ap);
140  moveToFastMemoryHint(p[isz]);
141  moveToFastMemoryHint(r);
142  moveToFastMemoryHint(X[isz]);
143  moveToFastMemoryHint(chi_internal);
144 
145  {
146  multi1d<T> blocker1(N); moveToFastMemoryHint(blocker1);
147  multi1d<T> blocker2(N); moveToFastMemoryHint(blocker2);
148 
149  // Now the rest of the p-s
150  for(int i=0; i < n_shift; i++) {
151  if( i!=isz ) {
152  moveToFastMemoryHint(p[i]);
153  moveToFastMemoryHint(X[i]);
154  }
155  }
156 
157  // Blockers get freed here
158  }
159 
160  moveToFastMemoryHint(psi);
161 
162  // initialise X-s
163  for(int i=0; i < n_shift; i++) {
164  for(int n=0; n < N; ++n) {
165  X[i][n][sub] = zero;
166  }
167  }
168 
169  // initialise chi internal
170  for(int i=0; i < N; i++) {
171  chi_internal[i][sub] = chi[i];
172  }
173 
174  // -------- All memory setup and copies done. Timer starts here
175  QDPIO::cout << "MinvCG starting" << std::endl;
176  flopcount.reset();
177  swatch.reset();
178  swatch.start();
179 
180  // If chi has zero norm then the result is zero
181  Double chi_norm_sq = norm2(chi_internal,sub);
182  flopcount.addSiteFlops(4*Nc*Ns*N,sub);
183 
184  Double chi_norm = sqrt(chi_norm_sq);
185 
186  if( toBool( chi_norm < fuzz )) {
187  n_count = 0;
188  swatch.stop();
189  QDPIO::cout << "MinvCG: Finished. Iters taken = " << n_count << std::endl;
190  flopcount.report("MinvCGArray", swatch.getTimeInSeconds());
191 
192 
193  // Accumulate psi
194  for(int n=0; n < N; n++) {
195  psi[n][sub]=norm*chi_internal[n];
196  }
197  for(int s=0; s < n_shift; s++) {
198  for(int n=0; n < N; n++) {
199  psi[n][sub] += residues[s]*X[s][n];
200  }
201  }
202 
203  // Revert X-s
204  for(int i=0; i < n_shift; i++) {
205  revertFromFastMemoryHint(X[i], false);
206  }
207 
208  revertFromFastMemoryHint(psi, true);
209 
210 
211  END_CODE();
212  return;
213  }
214 
215  multi1d<Double> rsd_sq(n_shift);
216  multi1d<Double> rsdcg_sq(n_shift);
217 
218  Double cp = chi_norm_sq;
219  for(int s = 0; s < n_shift; ++s) {
220  rsdcg_sq[s] = RsdCG * RsdCG; // RsdCG^2
221  rsd_sq[s] = Real(cp) * rsdcg_sq[s]; // || chi ||^2 RsdCG^2
222  }
223 
224 
225  // r[0] := p[0] := Chi
226  for(int n=0; n < N; ++n) {
227  r[n][sub] = chi_internal[n];
228 
229  for(int s=0; s < n_shift; s++) {
230  p[s][n][sub] = chi_internal[n];
231  }
232  }
233 
234 
235 
236  // b[0] := - | r[0] |**2 / < p[0], Ap[0] > ;/
237  // First compute d = < p, A.p >
238  // Ap = A . p */
239  A(Ap, p[isz], PLUS);
240  flopcount.addFlops(A.nFlops());
241 
242  for(int n=0; n < N; ++n) {
243  Ap[n][sub] += poles[isz] * p[isz][n];
244  }
245  flopcount.addSiteFlops(4*Nc*Ns*N, sub);
246 
247  /* d = < p, A.p > */
248  Double d = innerProductReal(p[isz], Ap, sub);
249  flopcount.addSiteFlops(4*Nc*Ns*N ,sub);
250 
251  Double b = -cp/d;
252 
253  /* Compute the shifted bs and z */
254  multi1d<Double> bs(n_shift);
255  multi2d<Double> z(2, n_shift);
256  int iz;
257 
258  z[0][isz] = Double(1);
259  z[1][isz] = Double(1);
260  bs[isz] = b;
261  iz = 1;
262 
263  for(int s = 0; s < n_shift; ++s)
264  {
265  if( s != isz ) {
266  z[1-iz][s] = Double(1);
267  z[iz][s] = Double(1) / (Double(1) - (Double(poles[s])-Double(poles[isz]))*b);
268  bs[s] = b * z[iz][s];
269  }
270  }
271 
272  // r[1] += b[0] A . p[0];
273  for(int n=0; n < N; ++n) {
274  r[n][sub] += Real(b)* Ap[n];
275  }
276  flopcount.addSiteFlops(4*Nc*Ns*N, sub);
277 
278  // X[1] -= b[0] p[0] = - b[0] chi;
279  // X[0] are all 0 so I can write this as a -= bs*chi
280  for(int s = 0; s < n_shift; ++s) {
281  for(int n=0; n < N; ++n) {
282  X[s][n][sub] -= Real(bs[s])*chi_internal[n];
283  }
284  }
285  flopcount.addSiteFlops(4*Nc*Ns*N*n_shift, sub);
286 
287 
288  // c = |r[1]|^2
289  Double c = norm2(r,sub);
290  flopcount.addSiteFlops(4*Nc*Ns*N,sub);
291 
292 
293  // Check convergence of first solution
294  multi1d<bool> convsP(n_shift);
295  for(int s = 0; s < n_shift; ++s) {
296  convsP[s] = false;
297  }
298 
299  bool convP = toBool( c < rsd_sq[isz] );
300 
301 
302  // FOR k FROM 1 TO MaxCG DO
303  // IF |X[k+1] - X[k]| <= RsdCG |X[k+1]| THEN RETURN;
304  Double z0, z1;
305  Double ztmp;
306  Double cs;
307  Double a;
308  Double as;
309  Double bp;
310  int k;
311 
312 
313  for(k = 1; k <= MaxCG && !convP ; ++k)
314  {
315  // a[k+1] := |r[k]|**2 / |r[k-1]|**2 ;
316  a = c/cp;
317 
318  // p[k+1] := r[k+1] + a[k+1] p[k];
319  // Compute the shifted as */
320  // ps[k+1] := zs[k+1] r[k+1] + a[k+1] ps[k];
321  for(int s = 0; s < n_shift; ++s) {
322 
323  // Always update p[isz] even if isz is converged
324  // since the other p-s depend on it.
325  if (s == isz) {
326 
327  for(int n=0; n < N; ++n) {
328  p[s][n][sub] = r[n] + Real(a)*p[s][n];
329  // p[s][n][sub] *= Real(a);
330  // p[s][n][sub] += r[n];
331  }
332  flopcount.addSiteFlops(4*Nc*Ns*N,sub);
333 
334  }
335  else {
336  // Don't update other p-s if converged.
337  if( ! convsP[s] ) {
338  as = a * z[iz][s]*bs[s] / (z[1-iz][s]*b);
339 
340  for(int n=0; n < N; ++n) {
341  p[s][n][sub] = Real(as)*p[s][n] + Real(z[iz][s])*r[n];
342  //p[s][n][sub] *= Real(as);
343  //p[s][n][sub] += Real(z[iz][s])*r[n];
344  }
345  flopcount.addSiteFlops(6*Nc*Ns*N, sub);
346  }
347  }
348 
349  }
350 
351  // cp = | r[k] |**2
352  cp = c;
353 
354  // b[k] := | r[k] |**2 / < p[k], Ap[k] > ;
355  // First compute d = < p, A.p >
356  // Ap = A . p
357  A(Ap, p[isz], PLUS);
358  flopcount.addFlops(A.nFlops());
359 
360  for(int n=0; n < N; ++n) {
361  Ap[n][sub] += poles[isz] *p[isz][n];
362  }
363  flopcount.addSiteFlops(4*Nc*Ns*N, sub);
364 
365  /* d = < p, A.p > */
366  d = innerProductReal(p[isz], Ap, sub);
367  flopcount.addSiteFlops(4*Nc*Ns*N, sub);
368 
369  bp = b;
370  b = -cp/d;
371 
372  // Compute the shifted bs and z
373  bs[isz] = b;
374  iz = 1 - iz;
375  for(int s = 0; s < n_shift; s++) {
376 
377  if (s != isz && !convsP[s] ) {
378  z0 = z[1-iz][s];
379  z1 = z[iz][s];
380  z[iz][s] = z0*z1*bp;
381  z[iz][s] /= b*a*(z1-z0) + z1*bp*(Double(1) - (poles[s] - poles[isz])*b);
382  bs[s] = b*z[iz][s]/z0;
383  }
384  }
385 
386  // r[k+1] += b[k] A . p[k] ;
387  for(int n=0; n < N; ++n) {
388  r[n][sub] += Real(b)*Ap[n];
389  }
390  flopcount.addSiteFlops(4*Nc*Ns*N, sub);
391 
392 
393  // Check convergence
394  // c = | r[k] |**2
395  c = norm2(r,sub);
396  flopcount.addSiteFlops(4*Nc*Ns*N, sub);
397 
398  // IF |psi[k+1] - psi[k]| <= RsdCG |psi[k+1]| THEN RETURN;
399  // or IF |r[k+1]| <= RsdCG |chi| THEN RETURN;
400  convP = true;
401  for(int s = 0; s < n_shift; s++) {
402  if (! convsP[s] ) {
403 
404 
405  // Convergence methods
406  // Check norm of shifted residuals
407  Double css = c * z[iz][s]* z[iz][s];
408  convsP[s] = toBool( css < rsd_sq[s] );
409 
410  }
411  convP &= convsP[s];
412  }
413 
414  // Check accumulated result -- modify X-s in the same loop
415  multi1d<T> Delta(N);
416  for(int n=0; n < N; n++) {
417  Delta[n][sub] = zero;
418  psi[n][sub] = norm*chi_internal[n];
419  }
420  for(int s=0 ; s < n_shift; s++) {
421  if ( !convsP[s] ) {
422  multi1d<T> tmp(N);
423  for(int n=0; n < N; n++) {
424  tmp[n][sub] = Real(bs[s])*p[s][n];
425  X[s][n][sub] -= tmp[n];
426  Delta[n][sub] += residues[s]*tmp[n];
427  }
428  }
429  // Always use the all the new X-s to update psi.
430  for(int n=0; n < N; n++) {
431  psi[n][sub] += residues[s]*X[s][n];
432  }
433  }
434 
435  Double delta_norm = norm2(Delta, sub);
436  Double psi_norm = norm2(psi,sub);
437  convP |= toBool( delta_norm < rsdcg_sq[0]*psi_norm);
438  n_count = k;
439  }
440 
441  swatch.stop();
442  QDPIO::cout << "MinvCGAccumArray finished: " << n_count << " iterations " << std::endl;
443  flopcount.report("MinvCGAccumArray", swatch.getTimeInSeconds());
444 
445  for(int i=0; i < n_shift; i++) {
446  revertFromFastMemoryHint(psi[i],true);
447  }
448 
449  if (n_count == MaxCG) {
450  QDPIO::cout << "too many CG iterationns: " << n_count << std::endl;
451  QDP_abort(1);
452 
453  }
454  END_CODE();
455  return;
456  }
457 
458 
459  /*! \ingroup invert */
460  template<>
462  const multi1d<LatticeFermion>& chi,
463  multi1d<LatticeFermion>& psi,
464  const Real& norm,
465  const multi1d<Real>& residues,
466  const multi1d<Real>& poles,
467  const Real& RsdCG,
468  const int MaxCG,
469  int &n_count)
470  {
471  MInvCGAccum_a(M, chi, psi, norm, residues, poles, RsdCG, MaxCG, n_count);
472  }
473 
474 
475  /*! \ingroup invert */
476  template<>
477  void MInvCGAccum(const DiffLinearOperatorArray<LatticeFermion,
478  multi1d<LatticeColorMatrix>,
479  multi1d<LatticeColorMatrix> >& M,
480  const multi1d<LatticeFermion>& chi,
481  multi1d<LatticeFermion>& psi,
482  const Real& norm,
483  const multi1d<Real>& residues,
484  const multi1d<Real>& poles,
485  const Real& RsdCG,
486  const int MaxCG,
487  int &n_count)
488  {
489  MInvCGAccum_a(M, chi, psi, norm, residues, poles, RsdCG, MaxCG, n_count);
490  }
491 
492 } // end namespace Chroma
#define END_CODE()
Definition: chromabase.h:65
#define START_CODE()
Definition: chromabase.h:64
Differentiable Linear Operator.
Definition: linearop.h:150
Linear Operator to arrays.
Definition: linearop.h:61
EXTERN int MaxCG
void MInvCGAccum(const DiffLinearOperatorArray< LatticeFermion, multi1d< LatticeColorMatrix >, multi1d< LatticeColorMatrix > > &M, const multi1d< LatticeFermion > &chi, multi1d< LatticeFermion > &psi, const Real &norm, const multi1d< Real > &residues, const multi1d< Real > &poles, const Real &RsdCG, const int MaxCG, int &n_count)
void MInvCGAccum_a(const C &A, const multi1d< T > &chi, multi1d< T > &psi, const Real &norm, const multi1d< Real > &residues, const multi1d< Real > &poles, const Real &RsdCG, const int MaxCG, int &n_count)
Multishift Conjugate-Gradient (CG1) algorithm for a Linear Operator.
unsigned s
Definition: ldumul_w.cc:37
unsigned i
Definition: ldumul_w.cc:34
unsigned n
Definition: ldumul_w.cc:36
Linear Operators.
Double tmp
Definition: meslate.cc:60
int z
Definition: meslate.cc:36
int c
Definition: meslate.cc:61
int iz
Definition: meslate.cc:32
Multishift Conjugate-Gradient algorithm for a Linear Operator.
SpinMatrix C()
C = Gamma(10)
Definition: barspinmat_w.cc:29
Asqtad Staggered-Dirac operator.
Definition: klein_gord.cc:10
const WilsonTypeFermAct< multi1d< LatticeFermion > > Handle< const ConnectState > const multi1d< Real > enum InvType invType const multi1d< Real > & RsdCG
Definition: pbg5p_w.cc:30
Real rsd_sq
Definition: invbicg.cc:121
@ PLUS
Definition: chromabase.h:45
Double cp
Definition: invbicg.cc:107
Complex a
Definition: invbicg.cc:95
DComplex d
Definition: invbicg.cc:99
A(A, psi, r, Ncb, PLUS)
Complex b
Definition: invbicg.cc:96
Double zero
Definition: invbicg.cc:106
int k
Definition: invbicg.cc:119
FloatingPoint< double > Double
Definition: gtest.h:7351
Double chi_norm
Definition: pade_trln_w.cc:85
int isz
Definition: pade_trln_w.cc:151
multi1d< LatticeFermion > r(Ncb)
chi
Definition: pade_trln_w.cc:24
psi
Definition: pade_trln_w.cc:191
int n_count
Definition: pade_trln_w.cc:69
int norm
Definition: qtopcor.cc:35