CHROMA
invcg2_timing_hacks_3.h
Go to the documentation of this file.
1 // -*- C++ -*-
2 /*! \file
3  * \brief Conjugate-Gradient algorithm for a generic Linear Operator
4  */
5 
6 #ifndef __invcg2_timing_hacks_2_h__
7 #define __invcg2_timing_hacks_2_h__
8 
9 #include "chromabase.h"
10 #include "linearop.h"
12 
13 //! Highly optimised Conjugate-Gradient (CGNE) algorithm for an Even-Odd Preconditioned Wilson linear operator
14 
15 // Perversely, these are the types used in our axpys.
16 typedef OLattice< PSpinVector< PColorVector< RComplex< PScalar<REAL> >, Nc>, Ns> > LFerm;
17 
18 typedef OScalar< PScalar < PScalar < RScalar< PScalar < REAL > > > > > LScal;
19 typedef OScalar< PScalar < PScalar < RScalar< PScalar < DOUBLE > > > > > LDble;
20 // Get at the REAL embedded in an LScal
21 #define AT_REAL(a) (a.elem().elem().elem().elem().elem())
22 
23 // Get a pointer to the first REAL element of a lattice vector over a subset
24 #define FIRST_ELEM(a,s) (&(a.elem(s.start()).elem(0).elem(0).real().elem()))
25 
27  const LFerm& chi,
28  LFerm& psi,
29  const LScal& mass,
30  const LScal& RsdCG,
31  int MaxCG,
32  int& n_count);
33 
34 
35 // GNU C vector type: four packed single-precision floats, 16-byte aligned
36 typedef float v4sf __attribute__((mode(V4SF),aligned(16)));
37 
38 // vaxpy3 and norm put together
39 inline
40 void vaxpy3_norm(REAL *Out,REAL *scalep,REAL *InScale, REAL *Add,int n_3vec,
41  REAL* dsum)
42 {
43 #ifdef DEBUG_BLAS
44  QDPIO::cout << "SSE_TEST: vaxpy3_norm" << std::endl;
45 #endif
46 
47  int n_loops = n_3vec;
48 
49  v4sf vscalep = __builtin_ia32_loadss(scalep);
50  asm("shufps\t$0,%0,%0" : "+x" (vscalep));
51 
52  REAL fzero = 0.0;
53  register v4sf vsum = __builtin_ia32_loadss(&fzero);
54  asm("shufps\t$0,%0,%0" : "+x" (vsum));
55 
56  for (; n_loops-- > 0; )
57  {
58  register v4sf vtmp;
59 
60  vtmp = __builtin_ia32_addps(__builtin_ia32_mulps(vscalep, __builtin_ia32_loadaps(InScale+ 0)), __builtin_ia32_loadaps(Add+ 0));
61  vsum = __builtin_ia32_addps(vsum, __builtin_ia32_mulps(vtmp, vtmp));
62  __builtin_ia32_storeaps(Out+ 0, vtmp);
63 
64  vtmp = __builtin_ia32_addps(__builtin_ia32_mulps(vscalep, __builtin_ia32_loadaps(InScale+ 4)), __builtin_ia32_loadaps(Add+ 4));
65  vsum = __builtin_ia32_addps(vsum, __builtin_ia32_mulps(vtmp, vtmp));
66  __builtin_ia32_storeaps(Out+ 4, vtmp);
67 
68  vtmp = __builtin_ia32_addps(__builtin_ia32_mulps(vscalep, __builtin_ia32_loadaps(InScale+ 8)), __builtin_ia32_loadaps(Add+ 8));
69  vsum = __builtin_ia32_addps(vsum, __builtin_ia32_mulps(vtmp, vtmp));
70  __builtin_ia32_storeaps(Out+ 8, vtmp);
71 
72  vtmp = __builtin_ia32_addps(__builtin_ia32_mulps(vscalep, __builtin_ia32_loadaps(InScale+12)), __builtin_ia32_loadaps(Add+12));
73  vsum = __builtin_ia32_addps(vsum, __builtin_ia32_mulps(vtmp, vtmp));
74  __builtin_ia32_storeaps(Out+12, vtmp);
75 
76  vtmp = __builtin_ia32_addps(__builtin_ia32_mulps(vscalep, __builtin_ia32_loadaps(InScale+16)), __builtin_ia32_loadaps(Add+16));
77  vsum = __builtin_ia32_addps(vsum, __builtin_ia32_mulps(vtmp, vtmp));
78  __builtin_ia32_storeaps(Out+16, vtmp);
79 
80  vtmp = __builtin_ia32_addps(__builtin_ia32_mulps(vscalep, __builtin_ia32_loadaps(InScale+20)), __builtin_ia32_loadaps(Add+20));
81  vsum = __builtin_ia32_addps(vsum, __builtin_ia32_mulps(vtmp, vtmp));
82  __builtin_ia32_storeaps(Out+20, vtmp);
83 
84  Out += 24; InScale += 24; Add += 24;
85  }
86 
87  REAL fsum[4];
88  __builtin_ia32_storeaps(fsum, vsum);
89  *dsum = (REAL)(fsum[0] + fsum[1] + fsum[2] + fsum[3]);
90 }
91 
92 
93 
94 #endif
Primary include file for CHROMA library code.
EXTERN int MaxCG
Include possibly optimized Wilson dslash.
void InvCG2EvenOddPrecWilsLinOpTHack(const WilsonDslash &D, const LFerm &chi, LFerm &psi, const LScal &mass, const LScal &RsdCG, int MaxCG, int &n_count)
Conjugate-Gradient (CGNE) algorithm for a generic Linear Operator.
OLattice< PSpinVector< PColorVector< RComplex< PScalar< REAL > >, Nc >, Ns > > LFerm
Highly optimised Conjugate-Gradient (CGNE) algorithm for a Even Odd Preconditioned.
OScalar< PScalar< PScalar< RScalar< PScalar< REAL > > > > > LScal
OLattice< PSpinVector< PColorVector< RComplex< PScalar< REAL > >, Nc >, Ns > > LFerm
Highly optimised Conjugate-Gradient (CGNE) algorithm for a Even Odd Preconditioned.
void vaxpy3_norm(REAL *Out, REAL *scalep, REAL *InScale, REAL *Add, int n_3vec, REAL *dsum)
OScalar< PScalar< PScalar< RScalar< PScalar< DOUBLE > > > > > LDble
float v4sf __attribute__((mode(V4SF), aligned(16)))
OScalar< PScalar< PScalar< RScalar< PScalar< REAL > > > > > LScal
Linear Operators.
QDPWilsonDslash WilsonDslash
Definition: dslash_w.h:132
const WilsonTypeFermAct< multi1d< LatticeFermion > > Handle< const ConnectState > const multi1d< Real > enum InvType invType const multi1d< Real > & RsdCG
Definition: pbg5p_w.cc:30
Double mass
Definition: pbg5p_w.cc:54
chi
Definition: pade_trln_w.cc:24
psi
Definition: pade_trln_w.cc:191
int n_count
Definition: pade_trln_w.cc:69