CHROMA
ord_yxpaymabz_kernel_sse.h
Go to the documentation of this file.
1 #include <xmmintrin.h>
2 #include <emmintrin.h>
3 
4 inline
5 void ord_yxpaymabz_kernel(int lo, int hi, int my_id, ord_yxpaymabz_arg* a)
6 {
7  int atom = a->atom;
8  int low = atom*lo;
9  int len = atom*(hi - lo);
10 
11  REAL32* x_ptr = &(a->x_ptr[low]);
12  REAL32* y_ptr = &(a->y_ptr[low]);
13  REAL32* z_ptr = &(a->z_ptr[low]);
14 
15  REAL32 a_re = a->a_re;
16  REAL32 a_im = a->a_im;
17  REAL32 b_re = a->b_re;
18  REAL32 b_im = a->b_im;
19 
20  typedef union {
21  float v[4];
22  __m128 vec;
23  } Vec4;
24 
25  __m128 av_re = _mm_set_ps(a_re, a_re, a_re, a_re);
26  __m128 av_im = _mm_set_ps(a_im, -a_im, a_im,-a_im);
27  __m128 bv_re = _mm_set_ps(b_re, b_re, b_re, b_re);
28  __m128 bv_im = _mm_set_ps(-b_im, b_im, -b_im, b_im);
29 
30  if( len % 4 == 0) {
31  for(int count = 0; count < len; count+=4) {
32  // Load x, y, z
33  __m128 xv = _mm_load_ps(&x_ptr[count]);
34  __m128 yv = _mm_load_ps(&y_ptr[count]);
35  __m128 zv = _mm_load_ps(&z_ptr[count]);
36 
37  // Step1: t = y - b*z
38  // = yv - bv_re zv + bv_im zv2
39 
40  // zv = [ re | im | re | im ]
41  // zv2 = [ im | re | im | re ]
42  __m128 zv2 = zv;
43  zv2 = _mm_shuffle_ps(zv2,zv2,0xb1);
44 
45 
46  // tmp = y - bv_re zv + bv_im zv2
47  __m128 t1 = _mm_mul_ps(bv_re, zv);
48  __m128 t2 = _mm_sub_ps(yv,t1);
49 
50  __m128 t3 = _mm_mul_ps(bv_im, zv2);
51  t2 = _mm_add_ps(t2,t3);
52 
53 
54  // zv2 holds t2
55  zv2 = t2;
56  zv2 = _mm_shuffle_ps(zv2,zv2,0xb1);
57  // yv= x + av_re t2 + av_im zv2
58 
59  t1 = _mm_mul_ps(av_re, t2);
60  yv = _mm_add_ps(xv, t1);
61  t3 = _mm_mul_ps(av_im, zv2);
62  yv = _mm_add_ps(yv, t3);
63 
64  _mm_store_ps(&y_ptr[count], yv);
65 
66 
67  }
68  }
69  else {
70  QDPIO::cout << "ord_yxpaymabz_kernel_sse.h: len not divisible by 4" << std::endl;
71  QDP_abort(1);
72  }
73 
74 }
Complex a
Definition: invbicg.cc:95
int count
Definition: octave.h:14
void ord_yxpaymabz_kernel(int lo, int hi, int my_id, ord_yxpaymabz_arg *a)