1 #include "qdp_config.h"
2 #include "qdp_precision.h"
7 #include "scalarsite_sse/sse_dcomplex_mult_macros.h"
/*
 * ssed_clover_apply
 *
 * Double-precision SSE kernel.  Each loop iteration reads 12 doubles from psi
 * (six two-double loads), 6 doubles from diag and 30 doubles from offd, and
 * writes 12 doubles to chi through chi_p.  Every two-double vector is treated
 * as one complex number by the CMADD/CONJMADD macros (presumably re,im order -
 * confirm against sse_dcomplex_mult_macros.h).
 *
 * Parameters:
 *   diag    - read two doubles at a time at offsets 0, 2 and 4; each double is
 *             broadcast and used as a real scale factor for one psi vector.
 *   offd    - read two doubles at a time at even offsets 0..28.
 *   psiptr  - input; read through psi_p at offsets 0..10.
 *   chiptr  - presumably the output written through chi_p; the declaration of
 *             chi_p is not part of this listing - TODO confirm.
 *   n_sites - the loop body runs 2*n_sites times.
 *
 * The same offd element is applied with CMADD when it contributes to a
 * higher-numbered output component and with CONJMADD when it contributes to a
 * lower-numbered one.  The offsets used match a row-packed strict lower
 * triangle of a 6x6 matrix (element (i,j), i>j, at complex index
 * i*(i-1)/2 + j), which is consistent with applying a Hermitian 6x6 block.
 * NOTE(review): this relies on CMADD/CONJMADD being complex multiply-add and
 * conjugated multiply-add accumulating into their first argument - verify in
 * the macro header.
 *
 * _mm_load_pd/_mm_store_pd are the aligned load/store intrinsics, so every
 * address used below must be 16-byte aligned.
 *
 * NOTE(review): this listing has gaps (function braces, the declarations of
 * chi_p, psi0..psi5, chi0, chi1, tmp0, tmp1, and whatever advances the
 * pointers between iterations are not shown).  Consult the full file before
 * changing anything here.
 */
8 void ssed_clover_apply(REAL64* diag, REAL64* offd, REAL64* psiptr, REAL64* chiptr,
int n_sites)
/* Local working pointers over the caller's arrays. */
27 REAL64* psi_p=(REAL64 *)psiptr;
28 REAL64* offd_p = (REAL64 *)offd;
29 REAL64* diag_p = (REAL64 *)diag;
/* 2*n_sites iterations; presumably two 6x6 blocks per site - TODO confirm. */
32 for(
int site=0; site < 2*n_sites; site++) {
/* Load the six input vectors (two doubles each) for this iteration. */
38 psi0 = _mm_load_pd(psi_p);
39 psi1 = _mm_load_pd(psi_p+2);
40 psi2 = _mm_load_pd(psi_p+4);
41 psi3 = _mm_load_pd(psi_p+6);
42 psi4 = _mm_load_pd(psi_p+8);
43 psi5 = _mm_load_pd(psi_p+10);
/* ---- Output components 0 and 1 ---- */
/* Load diag[0], diag[1] as one vector. */
49 tmp0 = _mm_load_pd(diag_p);
/* Swap halves: tmp1 = (diag[1], diag[0]). */
52 tmp1 = _mm_shuffle_pd(tmp0,tmp0,0x1);
/* Broadcast the low half: tmp0 = (diag[0], diag[0]). */
54 tmp0 = _mm_shuffle_pd(tmp0,tmp0, 0x0);
/* Broadcast the low half of the swapped vector: tmp1 = (diag[1], diag[1]). */
56 tmp1 = _mm_shuffle_pd(tmp1,tmp1, 0x0);
/* Diagonal contribution: element-wise scale of psi0 / psi1. */
59 chi0 = _mm_mul_pd(tmp0,psi0);
62 chi1 = _mm_mul_pd(tmp1,psi1);
/* offd element 0 couples components 0 and 1 (conjugated form for chi0). */
66 tmp0 = _mm_load_pd(offd_p);
67 CONJMADD(chi0,psi1,tmp0);
68 CMADD(chi1, psi0, tmp0);
/* psi2 contributions, offd elements 1 and 2. */
72 tmp0 = _mm_load_pd(offd_p+2);
73 tmp1 = _mm_load_pd(offd_p+4);
74 CONJMADD(chi0, psi2, tmp0);
75 CONJMADD(chi1, psi2, tmp1);
/* psi3 contributions, offd elements 3 and 4. */
79 tmp0 = _mm_load_pd(offd_p+6);
80 tmp1 = _mm_load_pd(offd_p+8);
81 CONJMADD(chi0, psi3, tmp0);
82 CONJMADD(chi1, psi3, tmp1);
/* psi4 contributions, offd elements 6 and 7. */
86 tmp0 = _mm_load_pd(offd_p+12);
87 tmp1 = _mm_load_pd(offd_p+14);
88 CONJMADD(chi0, psi4,tmp0);
89 CONJMADD(chi1, psi4,tmp1);
/* psi5 contributions, offd elements 10 and 11. */
93 tmp0 = _mm_load_pd(offd_p+20);
94 tmp1 = _mm_load_pd(offd_p+22);
95 CONJMADD(chi0, psi5,tmp0);
96 CONJMADD(chi1, psi5,tmp1);
/* Store output components 0 and 1. */
98 _mm_store_pd(chi_p, chi0);
99 _mm_store_pd(chi_p+2, chi1);
/* ---- Output components 2 and 3 (chi0/chi1 are reused as accumulators) ---- */
/* Load diag[2], diag[3] and broadcast each, same shuffle sequence as above. */
105 tmp0 = _mm_load_pd(diag_p+2);
109 tmp1 = _mm_shuffle_pd(tmp0,tmp0,0x1);
111 tmp0 = _mm_shuffle_pd(tmp0,tmp0, 0x0);
113 tmp1 = _mm_shuffle_pd(tmp1,tmp1, 0x0);
/* Diagonal contribution: scale psi2 / psi3. */
116 chi0 = _mm_mul_pd(tmp0,psi2);
119 chi1 = _mm_mul_pd(tmp1,psi3);
/* Component 2: psi0 and psi1 contributions, offd elements 1 and 2. */
123 tmp0 = _mm_load_pd(offd_p+2);
124 tmp1 = _mm_load_pd(offd_p+4);
125 CMADD(chi0,psi0,tmp0);
126 CMADD(chi0,psi1,tmp1);
/* Component 3: psi0 and psi1 contributions, offd elements 3 and 4. */
130 tmp0 = _mm_load_pd(offd_p+6);
131 tmp1 = _mm_load_pd(offd_p+8);
132 CMADD(chi1,psi0,tmp0);
133 CMADD(chi1,psi1,tmp1);
/* offd element 5 couples components 2 and 3 (conjugated form for chi0). */
137 tmp0 = _mm_load_pd(offd_p+10);
138 CONJMADD(chi0,psi3,tmp0);
139 CMADD(chi1,psi2, tmp0);
/* psi4 contributions, offd elements 8 and 9. */
143 tmp0 = _mm_load_pd(offd_p+16);
144 tmp1 = _mm_load_pd(offd_p+18);
145 CONJMADD(chi0,psi4,tmp0);
146 CONJMADD(chi1,psi4,tmp1);
/* psi5 contributions, offd elements 12 and 13. */
150 tmp0 = _mm_load_pd(offd_p+24);
151 tmp1 = _mm_load_pd(offd_p+26);
152 CONJMADD(chi0,psi5,tmp0);
153 CONJMADD(chi1,psi5,tmp1);
/* Store output components 2 and 3. */
155 _mm_store_pd(chi_p+4, chi0);
156 _mm_store_pd(chi_p+6, chi1);
/* ---- Output components 4 and 5 ---- */
/* Load diag[4], diag[5] and broadcast each, same shuffle sequence as above. */
163 tmp0 = _mm_load_pd(diag_p+4);
167 tmp1 = _mm_shuffle_pd(tmp0,tmp0,0x1);
169 tmp0 = _mm_shuffle_pd(tmp0,tmp0, 0x0);
171 tmp1 = _mm_shuffle_pd(tmp1,tmp1, 0x0);
/* Diagonal contribution: scale psi4 / psi5. */
174 chi0 = _mm_mul_pd(tmp0,psi4);
177 chi1 = _mm_mul_pd(tmp1,psi5);
/* Component 4: psi0 and psi1 contributions, offd elements 6 and 7. */
181 tmp0 = _mm_load_pd(offd_p+12);
182 tmp1 = _mm_load_pd(offd_p+14);
183 CMADD(chi0,psi0,tmp0);
184 CMADD(chi0,psi1,tmp1);
/* Component 4: psi2 and psi3 contributions, offd elements 8 and 9. */
188 tmp0 = _mm_load_pd(offd_p+16);
189 tmp1 = _mm_load_pd(offd_p+18);
190 CMADD(chi0,psi2,tmp0);
191 CMADD(chi0,psi3,tmp1);
/* Component 5: psi0 and psi1 contributions, offd elements 10 and 11. */
195 tmp0 = _mm_load_pd(offd_p+20);
196 tmp1 = _mm_load_pd(offd_p+22);
197 CMADD(chi1,psi0,tmp0);
198 CMADD(chi1,psi1,tmp1);
/* Component 5: psi2 and psi3 contributions, offd elements 12 and 13. */
202 tmp0 = _mm_load_pd(offd_p+24);
203 tmp1 = _mm_load_pd(offd_p+26);
204 CMADD(chi1,psi2,tmp0);
205 CMADD(chi1,psi3,tmp1);
/* offd element 14 couples components 4 and 5 (conjugated form for chi0). */
209 tmp0 = _mm_load_pd(offd_p+28);
210 CONJMADD(chi0,psi5,tmp0);
211 CMADD(chi1,psi4,tmp0);
/* Store output components 4 and 5. */
213 _mm_store_pd(chi_p+8, chi0);
214 _mm_store_pd(chi_p+10,chi1);
Asqtad Staggered-Dirac operator.
void ssed_clover_apply(REAL64 *diag, REAL64 *offd, REAL64 *psiptr, REAL64 *chiptr, int n_sites)