CHROMA
ord_ib_zvupdates_kernel_sse.h
Go to the documentation of this file.
1 // 32 BIT Version: Use std::vector length of 4 for easy std::vectorization.
2 // This is guaranteed good for LatticeDiracFermions
3 
4 inline
5 void ord_ib_zvupdates_kernel_real32(int lo, int hi, int my_id, ib_zvupdates_arg<REAL32>* a)
6 {
7 
8  int atom=a->atom;
9  int low = atom*lo;
10  int len = atom*(hi - lo);
11 
12  REAL32* r = &(a->r_ptr[low]);
13  REAL32* z = &(a->z_ptr[low]);
14  REAL32* v = &(a->v_ptr[low]);
15  REAL32* u = &(a->u_ptr[low]);
16  REAL32* q = &(a->q_ptr[low]);
17 
18  REAL32 a_re = a->alpha_re;
19  REAL32 a_im = a->alpha_im;
20 
21  REAL32 arb_re = a->alpha_rat_beta_re;
22  REAL32 arb_im = a->alpha_rat_beta_im;
23 
24  REAL32 ad_re = a->alpha_delta_re;
25  REAL32 ad_im = a->alpha_delta_im;
26 
27  REAL32 b_re = a->beta_re;
28  REAL32 b_im = a->beta_im;
29 
30  REAL32 d_re = a->delta_re;
31  REAL32 d_im = a->delta_im;
32 
33 
34 
35 
36  __m128 ztmp, vtmp;
37 
38  __m128 zvec, rvec, vvec, uvec, qvec, tmpshuf1, tmpshuf2, tmpshuf3,tmpshuf4;
39  const __m128 arb_re_vec = _mm_set_ps(arb_re,arb_re,arb_re,arb_re);
40  const __m128 arb_im_vec = _mm_set_ps(arb_im,-arb_im,arb_im,-arb_im);
41  const __m128 a_re_vec = _mm_set_ps(a_re,a_re,a_re,a_re);
42  const __m128 a_im_vec = _mm_set_ps(a_im,-a_im,a_im,-a_im);
43  const __m128 ad_re_vec = _mm_set_ps(ad_re,ad_re,ad_re,ad_re);
44  const __m128 ad_im_vec = _mm_set_ps(ad_im,-ad_im,ad_im,-ad_im);
45  const __m128 b_re_vec = _mm_set_ps(b_re,b_re,b_re,b_re);
46  const __m128 b_im_vec = _mm_set_ps(b_im,-b_im,b_im,-b_im);
47  const __m128 d_re_vec = _mm_set_ps(d_re,d_re,d_re,d_re);
48  const __m128 d_im_vec = _mm_set_ps(d_im,-d_im,d_im,-d_im);
49 
50  if( len % 4 == 0 ) {
51  for(int count = 0; count < len; count+=4) {
52 
53  /* ztmp = { z[count], z[count+1], z[count+2], z[count+3] } */
54  ztmp = _mm_load_ps(&z[count]);
55 
56  /* rvec = { r[count], r[count+1], r[count+2], r[count+3] } */
57  rvec = _mm_load_ps(&r[count]);
58 
59  /* vtmp = { v[count], v[count+1], v[count+2], v[count+3] } */
60  vtmp = _mm_load_ps(&v[count]);
61 
62  /* uvec = { u[count], u[count+1], u[count+2], u[count+3] } */
63  uvec = _mm_load_ps(&u[count]);
64 
65  /* qvec = { q[count], q[count+1], q[count+2], q[count+3] } */
66  qvec = _mm_load_ps(&q[count]);
67 
68  /* tmpshuf1 = { z[count+1], z[count], z[count+3], z[count+2] } */
69  tmpshuf1 = _mm_shuffle_ps(ztmp,ztmp, 0xb1);
70 
71  /* tmpshuf2 = { r[count+1], r[count], r[count+3], r[count+2] } */
72  tmpshuf2 = _mm_shuffle_ps(rvec,rvec, 0xb1);
73 
74  /* tmpshuf3 = { v[count+1], v[count], v[count+3], v[count+2] } */
75  tmpshuf3 = _mm_shuffle_ps(vtmp,vtmp, 0xb1);
76 
77  /* tmpshuf3 = { q[count+1], q[count], q[count+3], q[count+2] } */
78  tmpshuf4 = _mm_shuffle_ps(qvec,qvec, 0xb1);
79 
80  /******* z = (alpha_n/alpha_n-1)*beta z *******/
81  /*
82  * arb =(alpha_n/alpha_n-1)*beta
83  *
84  * ztmp = { z[count], z[count+1], z[count+2], z[count+3] }
85  * tmpshuf1 = { z[count+1], z[count], z[count+3], z[count+2] }
86  * arb_re_vec = { arb_re, arb_re, arb_re, arb_re }
87  * arb_im_vec = {-arb_im, arb_im,-arb_im, arb_im }
88  *
89  * z[count] = arb_re * ztmp[0] - arb_im * ztmp[1];
90  * z[count+1] = arb_re * ztmp[1] + arb_im * ztmp[0];
91  * z[count+2] = arb_re * ztmp[2] - arb_im * ztmp[3];
92  * z[count+3] = arb_re * ztmp[3] + arb_im * ztmp[2];
93  */
94  zvec = _mm_mul_ps(arb_re_vec,ztmp);
95  zvec = _mm_add_ps(zvec, _mm_mul_ps(arb_im_vec,tmpshuf1));
96 
97  /********** z += alpha * r ***********/
98  /*
99  * a_re_vec = [ a_re,a_re,a_re,a_re ]
100 
101  z[count ] += a_re * r[count];
102  z[count+1] += a_re * r[count+1];
103  z[count+2] += a_re * r[count+2];
104  z[count+3] += a_re * r[count+3];
105  */
106  zvec = _mm_add_ps(zvec, _mm_mul_ps(a_re_vec,rvec));
107 
108  /*
109  * a_im_vec = [ -a_im, a_im, -a_im, a_im ]
110  z[count ] -= a_im * r[count+1];
111  z[count+1] += a_im * r[count];
112  z[count+2] -= a_im * r[count+3];
113  z[count+3] += a_im * r[count+2];
114  */
115 
116  zvec = _mm_add_ps(zvec, _mm_mul_ps(a_im_vec,tmpshuf2));
117 
118  /***** z -= (alpha*delta) v **********/
119  /*
120  z[count ] -= ad_re * v[count] ;
121  z[count+1] -= ad_re * v[count+1];
122  z[count+2] -= ad_re * v[count+2] ;
123  z[count+3] -= ad_re * v[count+3];
124  */
125 
126  zvec = _mm_sub_ps(zvec, _mm_mul_ps(ad_re_vec, vtmp));
127 
128  /*
129  z[count ] += ad_im * v[count+1];
130  z[count+1] -= ad_im * v[count];
131  z[count+2] += ad_im * v[count+3];
132  z[count+3] -= ad_im * v[count+2];
133  */
134  zvec = _mm_sub_ps(zvec, _mm_mul_ps(ad_im_vec, tmpshuf3));
135 
136  _mm_store_ps(&z[count],zvec);
137 
138  /************* v = u + b v ****************/
139  /*
140  v[count] = u[count] + b_re*vtmp[0] - b_im*vtmp[1];
141  v[count+1] = u[count+1] + b_re*vtmp[1] + b_im*vtmp[0];
142  v[count+2] = u[count+2] + b_re*vtmp[2] - b_im*vtmp[3];
143  v[count+3] = u[count+3] + b_re*vtmp[3] + b_im*vtmp[2];
144  */
145  vvec = _mm_add_ps(uvec, _mm_mul_ps(b_re_vec,vtmp));
146  vvec = _mm_add_ps(vvec, _mm_mul_ps(b_im_vec,tmpshuf3));
147 
148  /***************** v -= d*q ******************/
149  /*
150  v[count] -= d_re*q[count];
151  v[count+1] -= d_re*q[count+1];
152  v[count+2] -= d_re*q[count+2];
153  v[count+3] -= d_re*q[count+3];
154  */
155  vvec = _mm_sub_ps(vvec, _mm_mul_ps(d_re_vec,qvec));
156  /*
157  v[count] += d_im*q[count+1];
158  v[count+1] -= d_im*q[count];
159  v[count+2] += d_im*q[count+3];
160  v[count+3] -= d_im*q[count+2];
161  */
162  vvec = _mm_sub_ps(vvec, _mm_mul_ps(d_im_vec,tmpshuf4));
163 
164  _mm_store_ps(&v[count],vvec);
165 
166 
167  }
168  }
169  else {
170  QDPIO::cout << "ord_ib_zvupdates_sse.h: len not divisible by 4" << std::endl;
171  QDP_abort(1);
172 
173 
174  }
175 }
176 
177 // 64 BIT Version: Use std::vector length of 2 for easy std::vectorization.
178 // This is guaranteed good for LatticeDiracFermions
179 
180 
181 inline
182 void ord_ib_zvupdates_kernel_real64(int lo, int hi, int my_id, ib_zvupdates_arg<REAL64>* a)
183 {
184 
185  int atom=a->atom;
186  int low = atom*lo;
187  int len = atom*(hi - lo);
188 
189  REAL64* r = &(a->r_ptr[low]);
190  REAL64* z = &(a->z_ptr[low]);
191  REAL64* v = &(a->v_ptr[low]);
192  REAL64* u = &(a->u_ptr[low]);
193  REAL64* q = &(a->q_ptr[low]);
194 
195  REAL64 a_re = a->alpha_re;
196  REAL64 a_im = a->alpha_im;
197 
198  REAL64 arb_re = a->alpha_rat_beta_re;
199  REAL64 arb_im = a->alpha_rat_beta_im;
200 
201  REAL64 ad_re = a->alpha_delta_re;
202  REAL64 ad_im = a->alpha_delta_im;
203 
204  REAL64 b_re = a->beta_re;
205  REAL64 b_im = a->beta_im;
206 
207  REAL64 d_re = a->delta_re;
208  REAL64 d_im = a->delta_im;
209 
210 
211  __m128d ztmp, vtmp;
212 
213  __m128d zvec, rvec, vvec, uvec, qvec, tmpshuf1, tmpshuf2, tmpshuf3,tmpshuf4;
214  const __m128d arb_re_vec = _mm_set_pd(arb_re,arb_re);
215  const __m128d arb_im_vec = _mm_set_pd(arb_im,-arb_im);
216  const __m128d a_re_vec = _mm_set_pd(a_re,a_re);
217  const __m128d a_im_vec = _mm_set_pd(a_im,-a_im);
218  const __m128d ad_re_vec = _mm_set_pd(ad_re,ad_re);
219  const __m128d ad_im_vec = _mm_set_pd(ad_im,-ad_im);
220  const __m128d b_re_vec = _mm_set_pd(b_re,b_re);
221  const __m128d b_im_vec = _mm_set_pd(b_im,-b_im);
222  const __m128d d_re_vec = _mm_set_pd(d_re,d_re);
223  const __m128d d_im_vec = _mm_set_pd(d_im,-d_im);
224 
225  if( len % 2 == 0 ) {
226 
227  for(int count = 0; count < len; count+=2) {
228  ztmp = _mm_load_pd(&z[count]);
229  vtmp = _mm_load_pd(&v[count]);
230  rvec = _mm_load_pd(&r[count]);
231  uvec = _mm_load_pd(&u[count]);
232  qvec = _mm_load_pd(&q[count]);
233 
234 
235  tmpshuf1= _mm_shuffle_pd(ztmp,ztmp,0x1);
236  tmpshuf2 = _mm_shuffle_pd(rvec,rvec,0x1);
237  tmpshuf3 = _mm_shuffle_pd(vtmp,vtmp,0x1);
238  tmpshuf4 = _mm_shuffle_pd(qvec,qvec,0x1);
239 
240  /* z = (alpha_n/alpha_n-1)*beta z */
241  /*
242  ztmp[0] = z[count];
243  ztmp[1] = z[count+1];
244  z[count] = arb_re * ztmp[0] - arb_im * ztmp[1];
245  z[count+1] = arb_re * ztmp[1] + arb_im * ztmp[0];
246 
247  */
248 
249  zvec = _mm_mul_pd(arb_re_vec, ztmp);
250  zvec = _mm_add_pd(zvec, _mm_mul_pd(arb_im_vec, tmpshuf1));
251 
252 
253 
254  /* z += alpha*r */
255  /*
256  z[count ] += a_re * r[count];
257  z[count+1] += a_re * r[count+1];
258 
259  z[count ] -= a_im * r[count+1];
260  z[count+1] += a_im * r[count];
261 
262 
263  */
264  zvec = _mm_add_pd(zvec,_mm_mul_pd(a_re_vec,rvec));
265  zvec = _mm_add_pd(zvec,_mm_mul_pd(a_im_vec,tmpshuf2));
266 
267  /* z -= alpha*delta*v */
268  /*
269  z[count ] -= ad_re * v[count] ;
270  z[count+1] -= ad_re * v[count+1];
271 
272  z[count ] += ad_im * v[count+1];
273  z[count+1] -= ad_im * v[count];
274  */
275  zvec = _mm_sub_pd(zvec, _mm_mul_pd(ad_re_vec,vtmp));
276  zvec = _mm_sub_pd(zvec, _mm_mul_pd(ad_im_vec, tmpshuf3));
277  _mm_store_pd(&z[count], zvec);
278 
279 
280 
281 
282  /* v = u + b*v */
283  /*
284  v[count] = u[count] + b_re*vtmp[0] - b_im*vtmp[1];
285  v[count+1] = u[count+1] + b_re*vtmp[1] + b_im*vtmp[0];
286 
287  */
288  vvec = _mm_add_pd( uvec, _mm_mul_pd(b_re_vec,vtmp));
289  vvec = _mm_add_pd( vvec, _mm_mul_pd(b_im_vec,tmpshuf3));
290 
291  /*
292  v[count] -= d_re*q[count];
293  v[count+1] -= d_re*q[count+1];
294  */
295  vvec = _mm_sub_pd( vvec, _mm_mul_pd(d_re_vec,qvec));
296 
297  /*
298  v[count] += d_im*q[count+1];
299  v[count+1] -= d_im*q[count];
300  */
301  vvec = _mm_sub_pd( vvec, _mm_mul_pd(d_im_vec, tmpshuf4));
302  _mm_store_pd(&v[count],vvec);
303  }
304  }
305  else {
306  QDPIO::cout << "ord_ib_zvupdates_sse.h: len not divisible by 2" << std::endl;
307  QDP_abort(1);
308  }
309 
310 }
311 
int z
Definition: meslate.cc:36
Double q
Definition: mesq.cc:17
static multi1d< LatticeColorMatrix > u
Complex a
Definition: invbicg.cc:95
int count
Definition: octave.h:14
void ord_ib_zvupdates_kernel_real32(int lo, int hi, int my_id, ib_zvupdates_arg< REAL32 > *a)
void ord_ib_zvupdates_kernel_real64(int lo, int hi, int my_id, ib_zvupdates_arg< REAL64 > *a)
multi1d< LatticeFermion > r(Ncb)