CHROMA
ord_xmyz_normx_kernel_sse.h
Go to the documentation of this file.
1 #include <xmmintrin.h>
2 #include <emmintrin.h>
3 #include <pmmintrin.h>
4 
5 inline
6 void ord_xymz_normx_kernel(int lo, int hi, int my_id, ord_xymz_normx_arg* a)
7 {
8  REAL64* x_ptr;
9  REAL64* y_ptr;
10  REAL64* z_ptr;
11  REAL64 norm=0;
12  int atom = a->atom;
13  int low = atom*lo;
14  int len = atom*(hi-lo);
15 
16  x_ptr = &(a->x_ptr[low]);
17  y_ptr = &(a->y_ptr[low]);
18  z_ptr = &(a->z_ptr[low]);
19 
20  __m128d norm_vec1 = _mm_set_pd((double)0,(double)0);
21  __m128d norm_vec2 = _mm_set_pd((double)0,(double)0);
22 #if 0
23  __m128d norm_vec3 = _mm_set_pd((double)0,(double)0);
24  __m128d norm_vec4 = _mm_set_pd((double)0,(double)0);
25 #endif
26 
27  if( len % 4 == 0) {
28  for(int count = 0; count < len; count+=4) {
29 
30  __m128d xvec1,xvec2;
31 #if 0
32  __m128d xvec3,xvec4;
33 #endif
34 
35  __m128d yvec1, yvec2;
36 
37 #if 0
38  __m128d yvec3,yvec4;
39 #endif
40 
41  __m128d zvec1, zvec2;
42 
43 #if 0
44  __m128d zvec3,zvec4;
45 #endif
46 
47  yvec1 = _mm_load_pd(&y_ptr[count]);
48  zvec1 = _mm_load_pd(&z_ptr[count]);
49  xvec1 = _mm_sub_pd(yvec1,zvec1);
50  _mm_store_pd(&x_ptr[count], xvec1);
51 
52  yvec2 = _mm_load_pd(&y_ptr[count+2]);
53  zvec2 = _mm_load_pd(&z_ptr[count+2]);
54  xvec2 = _mm_sub_pd(yvec2,zvec2);
55  _mm_store_pd(&x_ptr[count+2], xvec2);
56 
57 #if 0
58  yvec3 = _mm_load_pd(&y_ptr[count+4]);
59  zvec3 = _mm_load_pd(&z_ptr[count+4]);
60  xvec3 = _mm_sub_pd(yvec3,zvec3);
61  // _mm_store_pd(&x_ptr[count+4], xvec3);
62  _mm_stream_pd(&x_ptr[count+4], xvec3);
63 
64  yvec4 = _mm_load_pd(&y_ptr[count+6]);
65  zvec4 = _mm_load_pd(&z_ptr[count+6]);
66  xvec4 = _mm_sub_pd(yvec4,zvec4);
67  //_mm_store_pd(&x_ptr[count+6], xvec4);
68  _mm_stream_pd(&x_ptr[count+6], xvec4);
69 #endif
70 
71  yvec1 = _mm_mul_pd(xvec1,xvec1);
72  norm_vec1 = _mm_add_pd(norm_vec1,yvec1);
73  yvec2 = _mm_mul_pd(xvec2,xvec2);
74  norm_vec2 = _mm_add_pd(norm_vec2,yvec2);
75 
76 #if 0
77  yvec3 = _mm_mul_pd(xvec3,xvec3);
78  norm_vec3 = _mm_add_pd(norm_vec3,yvec3);
79 
80  yvec4 = _mm_mul_pd(xvec4,xvec4);
81  norm_vec4 = _mm_add_pd(norm_vec4,yvec4);
82 #endif
83  }
84 
85  norm_vec1 = _mm_add_pd(norm_vec1, norm_vec2);
86 #if 0
87  norm_vec3 = _mm_add_pd(norm_vec3, norm_vec4);
88  norm_vec1 = _mm_add_pd(norm_vec1, norm_vec3);
89 #endif
90 
91  a->norm_ptr[my_id] = ((double *)&norm_vec1)[0] + ((double *)&norm_vec1)[1];
92  }
93  else {
94  QDPIO::cout << "ord_xmyz_normx_kernel_sse.h: len not divisible by 4" << std::endl;
95  QDP_abort(1);
96  }
97 }
Complex a
Definition: invbicg.cc:95
int count
Definition: octave.h:14
void ord_xymz_normx_kernel(int lo, int hi, int my_id, ord_xymz_normx_arg *a)
int norm
Definition: qtopcor.cc:35