// Fragment of an SSE2 kernel body: writes x = y - z element-wise and
// accumulates the sum of squares of the stored differences into
// a->norm_ptr[my_id]. The enclosing loop header, the declarations of the
// xvec*/yvec*/zvec* temporaries, and the definitions of `atom`, `low` and
// `count` are not visible in this excerpt.

// Span length derived from `atom` and the (lo, hi) pair.
// NOTE(review): `atom` is defined outside the visible lines -- presumably the
// number of doubles per site; confirm.
14 int len = atom*(hi-lo);
// Base pointers into the three arrays carried by the argument struct, each
// offset by `low`.
// NOTE(review): `low` is defined outside the visible lines (this excerpt only
// shows `lo`); confirm it is the intended starting offset.
16 x_ptr = &(
a->x_ptr[low]);
17 y_ptr = &(
a->y_ptr[low]);
18 z_ptr = &(
a->z_ptr[low]);
// Four independent two-lane accumulators, all zero-initialised. Each one
// collects the squared differences of one of the four pairs handled below.
20 __m128d norm_vec1 = _mm_set_pd((
double)0,(
double)0);
21 __m128d norm_vec2 = _mm_set_pd((
double)0,(
double)0);
23 __m128d norm_vec3 = _mm_set_pd((
double)0,(
double)0);
24 __m128d norm_vec4 = _mm_set_pd((
double)0,(
double)0);
// Pair 1 (elements count, count+1): load y and z, form y - z, store into x.
// _mm_load_pd / _mm_store_pd are the aligned variants, so these addresses
// must be 16-byte aligned.
47 yvec1 = _mm_load_pd(&y_ptr[
count]);
48 zvec1 = _mm_load_pd(&z_ptr[
count]);
49 xvec1 = _mm_sub_pd(yvec1,zvec1);
50 _mm_store_pd(&x_ptr[
count], xvec1);
// Pair 2 (elements count+2, count+3): same pattern.
52 yvec2 = _mm_load_pd(&y_ptr[
count+2]);
53 zvec2 = _mm_load_pd(&z_ptr[
count+2]);
54 xvec2 = _mm_sub_pd(yvec2,zvec2);
55 _mm_store_pd(&x_ptr[
count+2], xvec2);
// Pair 3 (elements count+4, count+5): same arithmetic, but the result is
// written with _mm_stream_pd (non-temporal store) instead of _mm_store_pd.
// NOTE(review): pairs 1-2 use regular stores while pairs 3-4 use streaming
// stores -- confirm the mix is intentional.
58 yvec3 = _mm_load_pd(&y_ptr[
count+4]);
59 zvec3 = _mm_load_pd(&z_ptr[
count+4]);
60 xvec3 = _mm_sub_pd(yvec3,zvec3);
62 _mm_stream_pd(&x_ptr[
count+4], xvec3);
// Pair 4 (elements count+6, count+7): as pair 3, also a streaming store.
64 yvec4 = _mm_load_pd(&y_ptr[
count+6]);
65 zvec4 = _mm_load_pd(&z_ptr[
count+6]);
66 xvec4 = _mm_sub_pd(yvec4,zvec4);
68 _mm_stream_pd(&x_ptr[
count+6], xvec4);
// Square each difference and add it to its own accumulator. The yvec*
// registers are reused here as scratch for the squares; their loaded y
// values are no longer needed at this point.
71 yvec1 = _mm_mul_pd(xvec1,xvec1);
72 norm_vec1 = _mm_add_pd(norm_vec1,yvec1);
73 yvec2 = _mm_mul_pd(xvec2,xvec2);
74 norm_vec2 = _mm_add_pd(norm_vec2,yvec2);
77 yvec3 = _mm_mul_pd(xvec3,xvec3);
78 norm_vec3 = _mm_add_pd(norm_vec3,yvec3);
80 yvec4 = _mm_mul_pd(xvec4,xvec4);
81 norm_vec4 = _mm_add_pd(norm_vec4,yvec4);
// Fold the four accumulators into norm_vec1 (lane-wise adds).
85 norm_vec1 = _mm_add_pd(norm_vec1, norm_vec2);
87 norm_vec3 = _mm_add_pd(norm_vec3, norm_vec4);
88 norm_vec1 = _mm_add_pd(norm_vec1, norm_vec3);
// Horizontal sum: add the two lanes of norm_vec1 and write the scalar into
// the slot of a->norm_ptr indexed by my_id. The lanes are read by viewing the
// __m128d object through a double pointer.
91 a->norm_ptr[my_id] = ((
double *)&norm_vec1)[0] + ((
double *)&norm_vec1)[1];
// Diagnostic printed through QDPIO::cout; the condition that guards it is not
// visible in this excerpt.
// NOTE(review): the message names "ord_xmyz_..." whereas the function is
// "ord_xymz_...", and it says "divisible by 4" while the visible statements
// touch 8 doubles (count .. count+7) -- confirm both against the full file.
94 QDPIO::cout <<
"ord_xmyz_normx_kernel_sse.h: len not divisible by 4" << std::endl;
void ord_xymz_normx_kernel(int lo, int hi, int my_id, ord_xymz_normx_arg *a)