6 #ifndef __invcg2_timing_hacks_2_h__
7 #define __invcg2_timing_hacks_2_h__
// Convenience aliases over QDP++ expression-template types used by the
// timing-hack CG solver in this header.
// LFerm: a lattice fermion field — per site, Ns spin components of
// Nc-color vectors with complex single-precision (REAL) entries.
16 typedef OLattice< PSpinVector< PColorVector< RComplex< PScalar<REAL> >, Nc>, Ns> >
LFerm;
// LScal: a global (lattice-wide) real scalar in single precision (REAL).
18 typedef OScalar< PScalar < PScalar < RScalar< PScalar < REAL > > > > >
LScal;
// LDble: a global real scalar in double precision (DOUBLE), used for
// norm/residuum accumulation where single precision would lose bits.
19 typedef OScalar< PScalar < PScalar < RScalar< PScalar < DOUBLE > > > > >
LDble;
// AT_REAL(a): peel every QDP++ wrapper layer (five nested .elem() calls)
// to reach the raw REAL stored inside a scalar object `a`.
21 #define AT_REAL(a) (a.elem().elem().elem().elem().elem())
// FIRST_ELEM(a,s): address of the real part of the first (spin 0,
// color 0) component at the first site of subset `s` in field `a` —
// i.e. a raw float* into the field's storage, for handing to the
// SSE kernels below.
24 #define FIRST_ELEM(a,s) (&(a.elem(s.start()).elem(0).elem(0).real().elem()))
// vaxpy3_norm: fused SSE kernel computing, element-wise,
//   Out[i] = (*scalep) * InScale[i] + Add[i]
// while simultaneously accumulating the squared norm sum(Out[i]^2)
// into the output pointed to by dsum (see the declaration echoed later
// in this file: the final parameter is REAL *dsum).
//
// NOTE(review): the text visible here is missing several lines,
// presumably dropped in extraction: the `REAL *dsum)` tail of the
// parameter list, the opening brace, and the declarations of
// n_loops, fzero, fsum and vtmp. Confirm against the full source
// before editing this function.
40 void vaxpy3_norm(REAL *Out,REAL *scalep,REAL *InScale, REAL *Add,
int n_3vec,
// Debug trace identifying which vaxpy3_norm implementation is active.
44 QDPIO::cout <<
"SSE_TEST: vaxpy3_norm" << std::endl;
// Load the scalar into lane 0, then shufps with imm8 = 0 broadcasts
// lane 0 across all four lanes of the vector register.
49 v4sf vscalep = __builtin_ia32_loadss(scalep);
50 asm(
"shufps\t$0,%0,%0" :
"+x" (vscalep));
// Zero the 4-lane norm accumulator the same way (fzero assumed 0.0f).
53 register v4sf vsum = __builtin_ia32_loadss(&fzero);
54 asm(
"shufps\t$0,%0,%0" :
"+x" (vsum));
// Main loop, unrolled 6x: each iteration handles 6 vectors of 4
// floats = 24 REALs. The loadaps/storeaps intrinsics are the ALIGNED
// forms, so Out/InScale/Add must be 16-byte aligned.
56 for (; n_loops-- > 0; )
// Each triple below is: vtmp = vscalep*InScale + Add;
// vsum += vtmp*vtmp; store vtmp to Out.
60 vtmp = __builtin_ia32_addps(__builtin_ia32_mulps(vscalep, __builtin_ia32_loadaps(InScale+ 0)), __builtin_ia32_loadaps(Add+ 0));
61 vsum = __builtin_ia32_addps(vsum, __builtin_ia32_mulps(vtmp, vtmp));
62 __builtin_ia32_storeaps(Out+ 0, vtmp);
64 vtmp = __builtin_ia32_addps(__builtin_ia32_mulps(vscalep, __builtin_ia32_loadaps(InScale+ 4)), __builtin_ia32_loadaps(Add+ 4));
65 vsum = __builtin_ia32_addps(vsum, __builtin_ia32_mulps(vtmp, vtmp));
66 __builtin_ia32_storeaps(Out+ 4, vtmp);
68 vtmp = __builtin_ia32_addps(__builtin_ia32_mulps(vscalep, __builtin_ia32_loadaps(InScale+ 8)), __builtin_ia32_loadaps(Add+ 8));
69 vsum = __builtin_ia32_addps(vsum, __builtin_ia32_mulps(vtmp, vtmp));
70 __builtin_ia32_storeaps(Out+ 8, vtmp);
72 vtmp = __builtin_ia32_addps(__builtin_ia32_mulps(vscalep, __builtin_ia32_loadaps(InScale+12)), __builtin_ia32_loadaps(Add+12));
73 vsum = __builtin_ia32_addps(vsum, __builtin_ia32_mulps(vtmp, vtmp));
74 __builtin_ia32_storeaps(Out+12, vtmp);
76 vtmp = __builtin_ia32_addps(__builtin_ia32_mulps(vscalep, __builtin_ia32_loadaps(InScale+16)), __builtin_ia32_loadaps(Add+16));
77 vsum = __builtin_ia32_addps(vsum, __builtin_ia32_mulps(vtmp, vtmp));
78 __builtin_ia32_storeaps(Out+16, vtmp);
80 vtmp = __builtin_ia32_addps(__builtin_ia32_mulps(vscalep, __builtin_ia32_loadaps(InScale+20)), __builtin_ia32_loadaps(Add+20));
81 vsum = __builtin_ia32_addps(vsum, __builtin_ia32_mulps(vtmp, vtmp));
82 __builtin_ia32_storeaps(Out+20, vtmp);
// Advance all three streams past the 24 REALs just processed.
84 Out += 24; InScale += 24; Add += 24;
// Horizontal reduction: spill the 4-lane accumulator to memory
// (fsum assumed to be a 16-byte-aligned float[4]) and sum the lanes
// into the scalar norm result.
88 __builtin_ia32_storeaps(fsum, vsum);
89 *dsum = (REAL)(fsum[0] + fsum[1] + fsum[2] + fsum[3]);
Primary include file for CHROMA library code.
Include possibly optimized Wilson dslash.
void InvCG2EvenOddPrecWilsLinOpTHack(const WilsonDslash &D, const LFerm &chi, LFerm &psi, const LScal &mass, const LScal &RsdCG, int MaxCG, int &n_count)
Conjugate-Gradient (CGNE) algorithm for a generic Linear Operator.
OLattice< PSpinVector< PColorVector< RComplex< PScalar< REAL > >, Nc >, Ns > > LFerm
Highly optimised Conjugate-Gradient (CGNE) algorithm for an even-odd preconditioned operator.
OScalar< PScalar< PScalar< RScalar< PScalar< REAL > > > > > LScal
OLattice< PSpinVector< PColorVector< RComplex< PScalar< REAL > >, Nc >, Ns > > LFerm
Highly optimised Conjugate-Gradient (CGNE) algorithm for an even-odd preconditioned operator.
void vaxpy3_norm(REAL *Out, REAL *scalep, REAL *InScale, REAL *Add, int n_3vec, REAL *dsum)
OScalar< PScalar< PScalar< RScalar< PScalar< DOUBLE > > > > > LDble
float v4sf __attribute__((mode(V4SF), aligned(16)))
OScalar< PScalar< PScalar< RScalar< PScalar< REAL > > > > > LScal
QDPWilsonDslash WilsonDslash
const WilsonTypeFermAct< multi1d< LatticeFermion > > Handle< const ConnectState > const multi1d< Real > enum InvType invType const multi1d< Real > & RsdCG