CHROMA
ord_ib_rxupdate_kernel_sse.h
Go to the documentation of this file.
1 // 32 BIT Version. Use std::vector length of 4 (guaranteed OK for LatticeDirac Fermion)
2 // for easy std::vectorization (with suitably good compiler).
3 
4 #include <xmmintrin.h>
5 #include <emmintrin.h>
6 
7 inline
8 void ord_ib_rxupdate_kernel_real32(int lo, int hi, int my_id, ib_rxupdate_arg<REAL32>* a)
9 {
10  int atom=a->atom;
11  int low = atom*lo;
12  int len = atom*(hi - lo);
13 
14  REAL32* s = &(a->s_ptr[low]);
15  REAL32* t = &(a->t_ptr[low]);
16  REAL32* z = &(a->z_ptr[low]);
17  REAL32* r = &(a->r_ptr[low]);
18  REAL32* x = &(a->x_ptr[low]);
19 
20  REAL32 om_re = a->omega_re;
21  REAL32 om_im = a->omega_im;
22 
23  __m128 om_re_vec = _mm_set_ps(om_re, om_re, om_re, om_re);
24  __m128 om_im_vec = _mm_set_ps(-om_im, om_im, -om_im, om_im);
25  __m128 mom_im_vec = _mm_set_ps(om_im, -om_im, om_im, -om_im);
26 
27  if( len % 4 == 0 ) {
28  for(int count = 0; count < len; count+=4) {
29  /*
30  * r[count] = s[count] - om_re*t[count] + om_im*t[count+1];
31  * r[count+1] = s[count+1] - om_re*t[count+1] - om_im*t[count];
32  * r[count+2] = s[count+2] - om_re*t[count+2] + om_im*t[count+3];
33  * r[count+3] = s[count+3] - om_re*t[count+3] - om_im*t[count+2];
34  */
35 
36 
37  __m128 t_vec = _mm_load_ps(&t[count]);
38  __m128 s_vec = _mm_load_ps(&s[count]);
39 
40  __m128 tmpv1 = _mm_mul_ps(om_re_vec, t_vec);
41 
42  __m128 tmpv2 = _mm_shuffle_ps(t_vec, t_vec, 0xb1);
43  __m128 r_vec = _mm_sub_ps(s_vec, tmpv1);
44 
45  tmpv1 = _mm_mul_ps(om_im_vec,tmpv2);
46  r_vec = _mm_add_ps(r_vec, tmpv1);
47  _mm_store_ps(&r[count], r_vec);
48 
49 
50  /*
51  x[count] += om_re*s[count] - om_im*s[count+1] + z[count];
52  x[count+1] += om_re*s[count+1] + om_im*s[count] + z[count+1];
53  x[count+2] += om_re*s[count+2] - om_im*s[count+3] + z[count+2];
54  x[count+3] += om_re*s[count+3] + om_im*s[count+2] + z[count+3];
55  */
56 
57  /* x[ ... ] */
58  __m128 xvec = _mm_load_ps(&x[count]);
59 
60  /* z[ ... ] */
61  __m128 zvec = _mm_load_ps(&z[count]);
62 
63 
64  tmpv1 = _mm_mul_ps(om_re_vec, s_vec);
65  xvec = _mm_add_ps(xvec, tmpv1);
66 
67  tmpv2 = _mm_shuffle_ps(s_vec, s_vec, 0xb1);
68  xvec = _mm_add_ps(xvec,zvec);
69 
70  tmpv1 = _mm_mul_ps(mom_im_vec,tmpv2);
71  xvec = _mm_add_ps(xvec, tmpv1);
72 
73  _mm_store_ps(&x[count], xvec);
74 
75  }
76  }
77  else {
78  QDPIO::cout << "ord_ib_rxupdate_kernel_sse.h: len not divisible by 4" << std::endl;
79  QDP_abort(1);
80  }
81 }
82 
83 
84 
85 // 64 BIT Version. Use std::vector length of 2 (guaranteed OK for Complex Numbers)
86 // for easy std::vectorization (with suitably good compiler). Can do function
87 // overloading so no need to change kernel name
88 
89 inline
90 void ord_ib_rxupdate_kernel_real64(int lo, int hi, int my_id, ib_rxupdate_arg<REAL64>* a)
91 {
92 
93  int atom=a->atom;
94  int low = atom*lo;
95  int len = atom*(hi - lo)
96 ;
97  REAL64* s = &(a->s_ptr[low]);
98  REAL64* t = &(a->t_ptr[low]);
99  REAL64* z = &(a->z_ptr[low]);
100  REAL64* r = &(a->r_ptr[low]);
101  REAL64* x = &(a->x_ptr[low]);
102 
103  REAL64 om_re = a->omega_re;
104  REAL64 om_im = a->omega_im;
105 
106 
107 
108  __m128d om_re_vec = _mm_set_pd(om_re, om_re);
109  __m128d om_im_vec = _mm_set_pd(-om_im, om_im);
110  __m128d mom_im_vec = _mm_set_pd(om_im, -om_im);
111 
112  if(len % 2 == 0) {
113  for(int count = 0; count < len; count+=2) {
114 
115  /*
116  r[count] = s[count] - om_re*t[count] + om_im*t[count+1];
117  r[count+1] = s[count+1] - om_re*t[count+1] - om_im*t[count];
118  */
119  __m128d svec = _mm_load_pd(&s[count]);
120  __m128d tvec = _mm_load_pd(&t[count]);
121 
122  __m128d tmpv2 = _mm_mul_pd(om_re_vec,tvec);
123  __m128d rvec = _mm_sub_pd(svec, tmpv2);
124 
125 
126  __m128d tmpv1 = _mm_shuffle_pd(tvec,tvec,0x1);
127  tmpv2 = _mm_mul_pd(om_im_vec,tmpv1);
128  rvec = _mm_add_pd(rvec, tmpv2);
129  _mm_store_pd(&r[count], rvec);
130 
131 
132 
133  /*
134  x[count] += om_re*s[count] - om_im*s[count+1] + z[count];
135  x[count+1] += om_re*s[count+1] + om_im*s[count] + z[count+1];
136 
137  */
138  __m128d xvec = _mm_load_pd(&x[count]);
139  __m128d zvec = _mm_load_pd(&z[count]);
140 
141  tmpv1 = _mm_mul_pd(om_re_vec, svec);
142  xvec = _mm_add_pd(xvec,tmpv1);
143 
144  tmpv2 = _mm_shuffle_pd(svec,svec,0x1);
145  xvec = _mm_add_pd(xvec,zvec);
146 
147  tmpv1 = _mm_mul_pd(mom_im_vec, tmpv2);
148  xvec = _mm_add_pd(xvec,tmpv1);
149 
150  _mm_store_pd(&x[count],xvec);
151 
152  }
153  }
154  else {
155  QDPIO::cout << "ord_ib_rxupdate_kernel_sse.h: len not divisible by 2" << std::endl;
156  QDP_abort(1);
157  }
158 
159 }
160 
161 
162 
unsigned s
Definition: ldumul_w.cc:37
int z
Definition: meslate.cc:36
int x
Definition: meslate.cc:34
int t
Definition: meslate.cc:37
Complex a
Definition: invbicg.cc:95
int count
Definition: octave.h:14
void ord_ib_rxupdate_kernel_real64(int lo, int hi, int my_id, ib_rxupdate_arg< REAL64 > *a)
void ord_ib_rxupdate_kernel_real32(int lo, int hi, int my_id, ib_rxupdate_arg< REAL32 > *a)
multi1d< LatticeFermion > r(Ncb)