CHROMA
ord_norm2x_cdotxy_kernel_sse.h
Go to the documentation of this file.
1 #include <xmmintrin.h>
2 #include <emmintrin.h>
3 
4 inline
5 void ord_norm2x_cdotxy_kernel(int lo, int hi, int my_id, ord_norm2x_cdotxy_arg* a)
6 {
7  int atom = a->atom;
8  int low = atom*lo;
9  int len = atom*(hi-lo);
10 
11  REAL32* x_ptr = &(a->x_ptr[low]);
12  REAL32* y_ptr = &(a->y_ptr[low]);
13  REAL64 norm_array[3] = {0,0,0};
14 
15  __m128d sum = _mm_set_pd((double)0,(double)0);
16  __m128d dotprod = _mm_set_pd((double)0,(double)0);
17  __m128d mask = _mm_set_pd((double)-1, (double)1);
18 
19  if( atom % 4 == 0) {
20  for(int count = 0; count < len; count+=4) {
21 
22  //
23  __m128 xv = _mm_load_ps(&x_ptr[count]);
24  __m128 yv = _mm_load_ps(&y_ptr[count]);
25 
26  __m128d xlow,xhi;
27  __m128d ylow,yhi;
28 
29  // Convert low 2 in X to be doubles in xlow
30  xlow = _mm_cvtps_pd(xv);
31 
32  // Swap low and high pairs
33  xv = _mm_shuffle_ps( xv,xv, 0x4e);
34 
35  // Take hi part of x into xhi
36  xhi = _mm_cvtps_pd(xv);
37 
38  // Now samething with y
39  ylow = _mm_cvtps_pd(yv);
40  yv = _mm_shuffle_ps( yv,yv, 0x4e);
41  yhi = _mm_cvtps_pd(yv);
42 
43  sum = _mm_add_pd(sum,_mm_mul_pd(xlow,xlow));
44  sum = _mm_add_pd(sum,_mm_mul_pd(xhi,xhi));
45 
46 
47  // norm
48  // norm_array[0] += x_ptr[count]*x_ptr[count];
49  // norm_array[1] += x_ptr[count+1]*x_ptr[count+1];
50  // norm_array[0] += x_ptr[count+2]*x_ptr[count+2];
51  // norm_array[1] += x_ptr[count+3]*x_ptr[count+3];
52 
53  // Need tmp to hold (y0, y_0)
54  // dotprod[0] += x_ptr[count]*y_ptr[count];
55  // dotprod[1] -= x_ptr[count+1]*y_ptr[count];
56 
57  __m128d t1 = _mm_shuffle_pd(ylow, ylow, 0x0);
58  __m128d t2 = _mm_mul_pd(mask, t1);
59  __m128d t3 = _mm_mul_pd(xlow, t2);
60  dotprod = _mm_add_pd(dotprod, t3);
61 
62  //dotprod[0] += x_ptr[count+1]*y_ptr[count+1];
63  //dotprod[1] += x_ptr[count]*y_ptr[count+1];
64  t1 = _mm_shuffle_pd(ylow, ylow, 0x3);
65  t2 = _mm_shuffle_pd(xlow, xlow, 0x1);
66  t3 = _mm_mul_pd(t1,t2);
67  dotprod = _mm_add_pd(dotprod,t3);
68 
69 
70  // dotprod[0] += x_ptr[count+2]*y_ptr[count+2];
71  // dotprod[1] -= x_ptr[count+3]*y_ptr[count+2]
72  t1 = _mm_shuffle_pd(yhi,yhi,0x0);
73  t2 = _mm_mul_pd(mask,t1);
74  t3 = _mm_mul_pd(xhi,t2);
75  dotprod = _mm_add_pd(dotprod,t3);
76 
77  // dotprod[0] += x_ptr[count+3]*y_ptr[count+3];
78  // dotprod[1] += x_ptr[count+2]*y_ptr[count+3];
79  t1 = _mm_shuffle_pd(yhi,yhi, 0x3);
80  t2 = _mm_shuffle_pd(xhi,xhi,0x1);
81  t3 = _mm_mul_pd(t1, t2);
82  dotprod = _mm_add_pd(dotprod,t3);
83  }
84 
85 
86  a->norm_space[3*my_id]=((double *)&sum)[0] + ((double *)&sum)[1];
87  a->norm_space[3*my_id+1]=((double *)&dotprod)[0];
88  a->norm_space[3*my_id+2]=((double *)&dotprod)[1];
89  }
90  else {
91  QDPIO::cout << "ord_norm2x_cdotxy_kernel_sse.h: len not divisible by 4" << std::endl;
92  QDP_abort(1);
93  }
94 }
Complex a
Definition: invbicg.cc:95
int count
Definition: octave.h:14
void ord_norm2x_cdotxy_kernel(int lo, int hi, int my_id, ord_norm2x_cdotxy_arg *a)
Double sum
Definition: qtopcor.cc:37