CHROMA
ord_xpaypbz_kernel_sse.h
Go to the documentation of this file.
1 #include <xmmintrin.h>
2 #include <emmintrin.h>
3 
4 inline
5 void ord_xpaypbz_kernel(int lo, int hi, int my_id, ord_xpaypbz_arg* a)
6 {
7  int atom = a->atom;
8  int low = atom*lo;
9  int len = atom*(hi - lo);
10 
11  REAL32* x_ptr = &(a->x_ptr[low]);
12  REAL32* y_ptr = &(a->y_ptr[low]);
13  REAL32* z_ptr = &(a->z_ptr[low]);
14 
15  REAL32 a_re = a->a_re;
16  REAL32 a_im = a->a_im;
17  REAL32 b_re = a->b_re;
18  REAL32 b_im = a->b_im;
19 
20  __m128 bv_re = _mm_set_ps(b_re, b_re, b_re, b_re);
21  __m128 bv_im = _mm_set_ps(b_im, -b_im, b_im, -b_im);
22 
23  __m128 av_re = _mm_set_ps(a_re, a_re, a_re, a_re);
24  __m128 av_im = _mm_set_ps(a_im, -a_im, a_im, -a_im);
25 
26  if( len % 4 == 0) {
27  for(int count = 0; count < len; count+=4) {
28  __m128 xv = _mm_load_ps(&x_ptr[count]);
29  __m128 yv = _mm_load_ps(&y_ptr[count]);
30  __m128 zv = _mm_load_ps(&z_ptr[count]);
31 
32  // yv = yv_4 yv_3 yv_2 yv_1 = im | re | im | re
33  // t1 = yv_3 yv_4 yv_1 yv_2 = re | im | re | im
34 
35  __m128 t1 = _mm_shuffle_ps(yv,yv, 0xb1);
36  __m128 t2 = _mm_mul_ps(av_re, yv);
37  __m128 t3 = _mm_add_ps(xv,t2);
38  __m128 t4 = _mm_mul_ps(av_im, t1);
39  xv = _mm_add_ps(t3, t4);
40 
41  t1 = _mm_shuffle_ps(zv,zv,0xb1);
42  t2 = _mm_mul_ps(bv_re, zv);
43  t3 = _mm_add_ps(xv, t2);
44  t4 = _mm_mul_ps(bv_im, t1);
45  xv = _mm_add_ps(t3,t4);
46 
47  _mm_store_ps(&x_ptr[count], xv);
48  // _mm_stream_ps(&x_ptr[count], xv);
49 
50 #if 0
51 
52  REAL32 tmp_re, tmp_im;
53  REAL32 tmp_re2, tmp_im2;
54 
55  tmp_re = x_ptr[count] + a_re*y_ptr[count];
56  tmp_re -= a_im*y_ptr[count+1];
57  tmp_im = x_ptr[count+1] + a_re*y_ptr[count+1];
58  tmp_im += a_im*y_ptr[count];
59 
60 
61 
62  tmp_re2 = x_ptr[count+2] + a_re*y_ptr[count+2];
63  tmp_re2 -= a_im*y_ptr[count+3];
64  tmp_im2 = x_ptr[count+3] + a_re*y_ptr[count+3];
65  tmp_im2 += a_im*y_ptr[count+2];
66 
67 
68  x_ptr[count] = tmp_re + b_re*z_ptr[count] ;
69  x_ptr[count] -= b_im*z_ptr[count+1];
70  x_ptr[count+1] = tmp_im + b_re*z_ptr[count+1];
71  x_ptr[count+1]+= b_im *z_ptr[count];
72 
73 
74  x_ptr[count+2] = tmp_re2 + b_re*z_ptr[count+2] ;
75  x_ptr[count+2] -= b_im*z_ptr[count+3];
76  x_ptr[count+3] = tmp_im2 + b_re*z_ptr[count+3];
77  x_ptr[count+3]+= b_im *z_ptr[count+2];
78 #endif
79 
80  }
81  }
82  else {
83  QDPIO::cout << "ord_xpaypbz_kernel_sse.h: len not divisible by 4" << std::endl;
84  QDP_abort(1);
85  }
86 }
Complex a
Definition: invbicg.cc:95
int count
Definition: octave.h:14
void ord_xpaypbz_kernel(int lo, int hi, int my_id, ord_xpaypbz_arg *a)