CHROMA
ord_xmay_normx_cdotzx_kernel_sse.h
Go to the documentation of this file.
1 inline
2 void ord_xmay_normx_cdotzx_kernel(int lo, int hi, int my_id, ord_xmay_normx_cdotzx_arg *a)
3 {
4  int atom = a->atom;
5  int low = atom*lo;
6  int len = atom*(hi-lo);
7 
8  REAL32* x_ptr=&(a->x_ptr[low]);
9  REAL32* y_ptr=&(a->y_ptr[low]);
10  REAL32* z_ptr=&(a->z_ptr[low]);
11  REAL32 a_re = a->a_re;
12  REAL32 a_im = a->a_im;
13 
14 
15  __m128d sum = _mm_set_pd((double)0,(double)0);
16  __m128d dotprod = _mm_set_pd((double)0,(double)0);
17  __m128d mask = _mm_set_pd((double)-1, (double)1);
18 
19  __m128 av_re = _mm_set_ps( a_re, a_re, a_re, a_re);
20  __m128 av_im = _mm_set_ps(-a_im, a_im, -a_im, a_im);
21 
22  if( len % 4 == 0 ) {
23  for(int count = 0; count < len; count+=4) {
24  __m128 xv = _mm_load_ps(&x_ptr[count]);
25  __m128 yv = _mm_load_ps(&y_ptr[count]);
26  __m128 zv = _mm_load_ps(&z_ptr[count]);
27 
28 
29  //x_ptr[count] -= a_re*y_ptr[count];
30  //x_ptr[count+1] -= a_re*y_ptr[count+1];
31  //x_ptr[count+2] -= a_re*y_ptr[count+2];
32  //x_ptr[count+3] -= a_re*y_ptr[count+3];
33 
34  __m128 t1 = _mm_mul_ps(av_re, yv);
35  xv = _mm_sub_ps(xv,t1);
36 
37  // x_ptr[count] += a_im*y_ptr[count+1];
38  // x_ptr[count+1] -= a_im*y_ptr[count];
39  // x_ptr[count+2] += a_im*y_ptr[count+3];
40  // x_ptr[count+3] -= a_im*y_ptr[count+2];
41 
42  t1 = _mm_shuffle_ps(yv,yv,0xb1);
43  __m128 t2 = _mm_mul_ps(av_im, t1);
44  xv = _mm_add_ps(xv, t2);
45 
46  _mm_store_ps(&x_ptr[count], xv);
47 
48 
49  __m128d xlow,xhi;
50  __m128d zlow,zhi;
51 
52  xlow = _mm_cvtps_pd(xv);
53 
54  // Swap low and high pairs
55  xv = _mm_shuffle_ps( xv,xv, 0x4e);
56 
57  // Take hi part of x into xhi
58  xhi = _mm_cvtps_pd(xv);
59 
60  // Now samething with y
61  zlow = _mm_cvtps_pd(zv);
62  zv = _mm_shuffle_ps( zv,zv, 0x4e);
63  zhi = _mm_cvtps_pd(zv);
64 
65  //norm_array[0] += x_ptr[count]*x_ptr[count];
66  // norm_array[0] += x_ptr[count+1]*x_ptr[count+1];
67  //norm_array[0] += x_ptr[count+2]*x_ptr[count+2];
68  //norm_array[0] += x_ptr[count+3]*x_ptr[count+3];
69 
70  sum = _mm_add_pd(sum,_mm_mul_pd(xlow,xlow));
71  sum = _mm_add_pd(sum,_mm_mul_pd(xhi,xhi));
72 
73 
74  // dotprod[0] += z_ptr[count]*x_ptr[count];
75  // dotprod[1] -= z_ptr[count+1]*x_ptr[count];
76 
77  __m128d t1d = _mm_shuffle_pd(xlow, xlow, 0x0);
78  __m128d t2d = _mm_mul_pd(mask, t1d);
79  __m128d t3d = _mm_mul_pd(zlow, t2d);
80 
81  dotprod = _mm_add_pd(dotprod, t3d);
82 
83 
84  //dotprod[0] += z_ptr[count+1]*x_ptr[count+1];
85  //dotprod[1] += z_ptr[count]*x_ptr[count+1];
86  t1d = _mm_shuffle_pd(xlow, xlow, 0x3);
87  t2d = _mm_shuffle_pd(zlow, zlow, 0x1);
88  t3d = _mm_mul_pd(t1d,t2d);
89  dotprod = _mm_add_pd(dotprod,t3d);
90 
91 
92  // dotprod[0] += z_ptr[count+2]*x_ptr[count+2];
93  // dotprod[1] -= z_ptr[count+3]*x_ptr[count+2];
94  t1d = _mm_shuffle_pd(xhi,xhi,0x0);
95  t2d = _mm_mul_pd(mask,t1d);
96  t3d = _mm_mul_pd(zhi,t2d);
97  dotprod = _mm_add_pd(dotprod,t3d);
98 
99  // dotprod[0] += z_ptr[count+3]*x_ptr[count+3];
100  // dotprod[1] += z_ptr[count+2]*x_ptr[count+3];
101  t1d = _mm_shuffle_pd(xhi,xhi, 0x3);
102  t2d = _mm_shuffle_pd(zhi,zhi,0x1);
103  t3d = _mm_mul_pd(t1d, t2d);
104  dotprod = _mm_add_pd(dotprod,t3d);
105 
106  }
107  a->norm_space[3*my_id]=((double *)&sum)[0] + ((double *)&sum)[1];
108  a->norm_space[3*my_id+1]=((double *)&dotprod)[0];
109  a->norm_space[3*my_id+2]=((double *)&dotprod)[1];
110  }
111  else {
112  QDPIO::cout << " ord_xmay_normx_cdotzx_kernel_sse.h: len not divisible by 4" << std::endl;
113  QDP_abort(1);
114  }
115 }
116 
Complex a
Definition: invbicg.cc:95
int count
Definition: octave.h:14
void ord_xmay_normx_cdotzx_kernel(int lo, int hi, int my_id, ord_xmay_normx_cdotzx_arg *a)
Double sum
Definition: qtopcor.cc:37