4 #ifdef QDPJIT_IS_QDPJITNVVM
10 void function_get_fs_bs_exec(CUfunction
function,
11 const LatticeColorMatrix&
Q,
12 const LatticeColorMatrix& QQ,
13 multi1d<LatticeComplex>& f,
14 multi1d<LatticeComplex>& b1,
15 multi1d<LatticeComplex>& b2,
20 AddressLeaf addr_leaf(all);
22 int junk_0 = forEach(
Q, addr_leaf, NullCombine());
23 int junk_1 = forEach(QQ, addr_leaf, NullCombine());
24 int junk_2 = forEach(f[0], addr_leaf, NullCombine());
25 int junk_3 = forEach(f[1], addr_leaf, NullCombine());
26 int junk_4 = forEach(f[2], addr_leaf, NullCombine());
27 int junk_5 = forEach(b1[0], addr_leaf, NullCombine());
28 int junk_6 = forEach(b1[1], addr_leaf, NullCombine());
29 int junk_7 = forEach(b1[2], addr_leaf, NullCombine());
30 int junk_8 = forEach(b2[0], addr_leaf, NullCombine());
31 int junk_9 = forEach(b2[1], addr_leaf, NullCombine());
32 int junk_10= forEach(b2[2], addr_leaf, NullCombine());
35 int hi = Layout::sitesOnNode();
36 #ifndef QDP_JIT_NVVM_USE_LEGACY_LAUNCH
37 JitParam jit_lo( QDP_get_global_cache().addJitParamInt( lo ) );
38 JitParam jit_hi( QDP_get_global_cache().addJitParamInt( hi ) );
39 JitParam jit_dobs( QDP_get_global_cache().addJitParamBool( dobs ) );
40 std::vector<QDPCache::ArgKey> ids;
41 ids.push_back( jit_lo.get_id() );
42 ids.push_back( jit_hi.get_id() );
43 ids.push_back( jit_dobs.get_id() );
44 for(
unsigned i=0;
i < addr_leaf.ids.size(); ++
i)
45 ids.push_back( addr_leaf.ids[
i] );
46 jit_launch(
function,hi-lo,ids);
48 unsigned char dobs_u8 = dobs ? 1 : 0;
49 std::vector<void*> addr;
50 addr.push_back( &lo );
51 addr.push_back( &hi );
52 addr.push_back( &dobs_u8 );
53 int addr_dest=addr.size();
54 for(
int i=0;
i < addr_leaf.addr.size(); ++
i) {
55 addr.push_back( &addr_leaf.addr[
i] );
57 jit_launch(
function,hi-lo,addr);
63 WordREG<REAL> jit_constant(
double f )
65 return WordREG<REAL>(f);
69 CUfunction function_get_fs_bs_build(
const LatticeColorMatrix&
Q,
70 const LatticeColorMatrix& QQ,
71 multi1d<LatticeComplex>& f,
72 multi1d<LatticeComplex>& b1,
73 multi1d<LatticeComplex>& b2)
75 if (ptx_db::db_enabled) {
76 CUfunction
func = llvm_ptx_db( __PRETTY_FUNCTION__ );
83 llvm_start_new_function();
85 ParamRef p_lo = llvm_add_param<int>();
86 ParamRef p_hi = llvm_add_param<int>();
87 ParamRef p_dobs = llvm_add_param<bool>();
91 typedef typename LeafFunctor<LatticeColorMatrix, ParamLeaf>::Type_t LCMJIT;
92 typedef typename LeafFunctor<LatticeComplex , ParamLeaf>::Type_t LCJIT;
94 LCMJIT Q_jit(forEach(
Q, param_leaf, TreeCombine()));
95 LCMJIT QQ_jit(forEach(QQ, param_leaf, TreeCombine()));
96 LCJIT f0_jit(forEach(f[0], param_leaf, TreeCombine()));
97 LCJIT f1_jit(forEach(f[1], param_leaf, TreeCombine()));
98 LCJIT f2_jit(forEach(f[2], param_leaf, TreeCombine()));
99 LCJIT b10_jit(forEach(b1[0], param_leaf, TreeCombine()));
100 LCJIT b11_jit(forEach(b1[1], param_leaf, TreeCombine()));
101 LCJIT b12_jit(forEach(b1[2], param_leaf, TreeCombine()));
102 LCJIT b20_jit(forEach(b2[0], param_leaf, TreeCombine()));
103 LCJIT b21_jit(forEach(b2[1], param_leaf, TreeCombine()));
104 LCJIT b22_jit(forEach(b2[2], param_leaf, TreeCombine()));
106 llvm::Value* r_lo = llvm_derefParam( p_lo );
107 llvm::Value* r_hi = llvm_derefParam( p_hi );
108 llvm::Value* r_dobs = llvm_derefParam( p_dobs );
111 llvm::Value* r_idx = llvm_thread_idx();
113 llvm_cond_exit( llvm_ge( r_idx , r_hi ) );
116 auto& Q_j = Q_jit.elem(JitDeviceLayout::Coalesced,r_idx);
117 auto& QQ_j = QQ_jit.elem(JitDeviceLayout::Coalesced,r_idx);
119 auto& f0_j = f0_jit.elem(JitDeviceLayout::Coalesced,r_idx);
120 auto& f1_j = f1_jit.elem(JitDeviceLayout::Coalesced,r_idx);
121 auto& f2_j = f2_jit.elem(JitDeviceLayout::Coalesced,r_idx);
123 auto& b10_j = b10_jit.elem(JitDeviceLayout::Coalesced,r_idx);
124 auto& b11_j = b11_jit.elem(JitDeviceLayout::Coalesced,r_idx);
125 auto& b12_j = b12_jit.elem(JitDeviceLayout::Coalesced,r_idx);
126 auto& b20_j = b20_jit.elem(JitDeviceLayout::Coalesced,r_idx);
127 auto& b21_j = b21_jit.elem(JitDeviceLayout::Coalesced,r_idx);
128 auto& b22_j = b22_jit.elem(JitDeviceLayout::Coalesced,r_idx);
130 llvm::BasicBlock * block_exit = llvm_new_basic_block();
133 PColorMatrixREG< RComplexREG< WordREG<REAL> >, Nc> Q_site = Q_j.elem();
134 PColorMatrixREG< RComplexREG< WordREG<REAL> >, Nc> QQ_site = QQ_j.elem();
135 PColorMatrixREG< RComplexREG< WordREG<REAL> >, Nc> QQQ = QQ_site*Q_site;
145 PScalarREG< RScalarREG< WordREG<REAL> > > trQQQ = realTrace(QQQ);
146 PScalarREG< RScalarREG< WordREG<REAL> > > trQQ = realTrace(QQ_site);
148 WordREG<REAL> c0 = jit_constant((REAL)1/(REAL)3) * trQQQ.elem().elem();
149 WordREG<REAL> c1 = jit_constant((REAL)1/(REAL)2) * trQQ.elem().elem();
151 llvm::BasicBlock * block_c1_lt = llvm_new_basic_block();
152 llvm::BasicBlock * block_c1_nlt = llvm_new_basic_block();
153 llvm_cond_branch( llvm_lt( c1.get_val() , llvm_create_value( 4.0e-3 ) ) , block_c1_lt , block_c1_nlt );
155 llvm_set_insert_point( block_c1_lt );
157 f0_j.elem().elem().real() = jit_constant(1.0) - c0 * c0 / jit_constant(720.0);
158 f0_j.elem().elem().imag() = -( c0 / jit_constant(6.0) )*( jit_constant(1.0) -(c1/jit_constant(20.0))*(jit_constant(1.0)-(c1/jit_constant(42.0)))) ;
160 f1_j.elem().elem().real() = c0/jit_constant(24.0)*(jit_constant(1.0)-c1/jit_constant(15.0)*(jit_constant(1.0)-jit_constant(3.0)*c1/jit_constant(112.0))) ;
161 f1_j.elem().elem().imag() = jit_constant(1.0)-c1/jit_constant(6.0)*(jit_constant(1.0)-c1/jit_constant(20.0)*(jit_constant(1.0)-c1/jit_constant(42.0)))-c0*c0/jit_constant(5040.0);
163 f2_j.elem().elem().real() = jit_constant(0.5)*(jit_constant(-1.0)+c1/jit_constant(12.0)*(jit_constant(1.0)-c1/jit_constant(30.0)*(jit_constant(1.0)-c1/jit_constant(56.0)))+c0*c0/jit_constant(20160.0));
164 f2_j.elem().elem().imag() = jit_constant(0.5)*(c0/jit_constant(60.0)*(jit_constant(1.0)-c1/jit_constant(21.0)*(jit_constant(1.0)-c1/jit_constant(48.0))));
167 llvm::BasicBlock * block_dobs_0 = llvm_new_basic_block();
168 llvm_cond_branch( r_dobs , block_dobs_0 , block_exit );
171 llvm_set_insert_point( block_dobs_0 );
173 b20_j.elem().elem().real() = -c0/jit_constant(360.0);
174 b20_j.elem().elem().imag() = -jit_constant(1.0/6.0)*(jit_constant(1.0)-(c1/jit_constant(20.0))*(jit_constant(1.0)-c1/jit_constant(42.0)));
178 b10_j.elem().elem().real() = jit_constant(0);
179 b10_j.elem().elem().imag() = (c0/jit_constant(120.0))*(jit_constant(1.0)-c1/jit_constant(21.0));
183 b21_j.elem().elem().real() = jit_constant(1.0/24.0)*(jit_constant(1.0)-c1/jit_constant(15.0)*(jit_constant(1.0)-jit_constant(3.0)*c1/jit_constant(112.0)));
184 b21_j.elem().elem().imag() = -c0/jit_constant(2520.0);
188 b11_j.elem().elem().real() = -c0/jit_constant(360.0)*(jit_constant(1.0) - jit_constant(3.0)*c1/jit_constant(56.0) );
189 b11_j.elem().elem().imag() = -jit_constant(1.0/6.0)*(jit_constant(1.0)-c1/jit_constant(10.0)*(jit_constant(1.0)-c1/jit_constant(28.0)));
192 b22_j.elem().elem().real() = jit_constant(0.5)*c0/jit_constant(10080.0);
193 b22_j.elem().elem().imag() = jit_constant(0.5)*( jit_constant(1.0/60.0)*(jit_constant(1.0)-c1/jit_constant(21.0)*(jit_constant(1.0)-c1/jit_constant(48.0))) );
196 b12_j.elem().elem().real() = jit_constant(0.5)*( jit_constant(1.0/12.0)*(jit_constant(1.0)-(jit_constant(2.0)*c1/jit_constant(30.0))*(jit_constant(1.0)-jit_constant(3.0)*c1/jit_constant(112.0))) );
197 b12_j.elem().elem().imag() = jit_constant(0.5)*( -c0/jit_constant(1260.0)*(jit_constant(1.0)-c1/jit_constant(24.0)) );
199 llvm_branch( block_exit );
205 llvm_set_insert_point( block_c1_nlt );
208 llvm::Value* c0_negativeP = llvm_lt( c0.get_val() , llvm_create_value(0.0) );
209 WordREG<REAL> c0abs = fabs(c0);
210 WordREG<REAL> c0max = jit_constant(2.0) * pow( c1 / jit_constant(3.0) , jit_constant(1.5) );
212 WordREG<REAL>
eps = (c0max - c0abs)/c0max;
218 llvm::Value* theta_phi0;
219 llvm::Value* theta_phi1;
220 llvm::Value* theta_phi2;
222 llvm::BasicBlock * block_eps_0 = llvm_new_basic_block();
223 llvm::BasicBlock * block_eps_1 = llvm_new_basic_block();
224 llvm::BasicBlock * block_eps_2 = llvm_new_basic_block();
225 llvm::BasicBlock * block_eps_3 = llvm_new_basic_block();
226 llvm::BasicBlock * block_eps_exit = llvm_new_basic_block();
228 llvm_cond_branch( llvm_lt(
eps.get_val() , llvm_create_value(0.0) ) , block_eps_0 , block_eps_1 );
230 llvm_set_insert_point( block_eps_0 );
231 theta_phi0 = llvm_create_value(0.0);
232 llvm_branch(block_eps_exit);
234 llvm_set_insert_point( block_eps_1 );
235 llvm_cond_branch( llvm_lt(
eps.get_val() , llvm_create_value(1.0e-3) ) , block_eps_2 , block_eps_3 );
237 llvm_set_insert_point( block_eps_2 );
238 WordREG<REAL> sqtwo = sqrt( jit_constant(2.0) );
239 WordREG<REAL> theta_tmp;
243 ( jit_constant(1.0) +
244 ( jit_constant(1/(REAL)12) +
245 ( jit_constant(3/(REAL)160) +
246 ( jit_constant(5/(REAL)896) +
247 ( jit_constant(35/(REAL)18432) +
248 jit_constant(63/(REAL)90112) *
eps ) *
253 theta_phi1 = theta_tmp.get_val();
254 llvm_branch(block_eps_exit);
256 llvm_set_insert_point( block_eps_3 );
257 WordREG<REAL> theta_tmp2;
258 theta_tmp2 = acos( c0abs/c0max );
259 theta_phi2 = theta_tmp2.get_val();
260 llvm_branch(block_eps_exit);
262 llvm_set_insert_point( block_eps_exit );
264 llvm::PHINode* theta_phi = llvm_phi( llvm_type<REAL>::value , 3 );
265 theta_phi->addIncoming( theta_phi0 , block_eps_0 );
266 theta_phi->addIncoming( theta_phi1 , block_eps_2 );
267 theta_phi->addIncoming( theta_phi2 , block_eps_3 );
269 theta.setup( theta_phi );
273 multi1d<WordREG<REAL> > f_site_re(3);
274 multi1d<WordREG<REAL> > f_site_im(3);
276 multi1d<WordREG<REAL> > b1_site_re_phi0(3);
277 multi1d<WordREG<REAL> > b1_site_im_phi0(3);
279 multi1d<WordREG<REAL> > b2_site_re_phi0(3);
280 multi1d<WordREG<REAL> > b2_site_im_phi0(3);
284 WordREG<REAL>
u = sqrt(c1/jit_constant(3.0))*cos(theta/jit_constant(3.0));
285 WordREG<REAL> w = sqrt(c1)*sin(theta/jit_constant(3.0));
287 WordREG<REAL> u_sq =
u*
u;
288 WordREG<REAL> w_sq = w*w;
290 WordREG<REAL> xi0,xi1;
292 llvm::Value *w_smallP = llvm_lt( (fabs( w )).get_val() , llvm_create_value( 0.05 ) );
293 llvm::BasicBlock * block_xi0_exit = llvm_new_basic_block();
295 llvm::BasicBlock * block_xi0_small = llvm_new_basic_block();
296 llvm::BasicBlock * block_xi0_not_small = llvm_new_basic_block();
297 llvm::Value* xi0_phi0;
298 llvm::Value* xi0_phi1;
300 llvm_cond_branch( w_smallP , block_xi0_small , block_xi0_not_small );
302 llvm_set_insert_point( block_xi0_small );
303 WordREG<REAL> xi0_tmp0 =
305 (jit_constant(1.0/6.0)*w_sq*( jit_constant(1.0) -
306 (jit_constant(1.0/20.)*w_sq*( jit_constant(1.0) -
307 (jit_constant(1.0/42.0)*w_sq ) ))));
308 xi0_phi0 = xi0_tmp0.get_val();
310 llvm_branch( block_xi0_exit );
312 llvm_set_insert_point( block_xi0_not_small );
313 WordREG<REAL> xi0_tmp1 = sin(w)/w;
314 xi0_phi1 = xi0_tmp1.get_val();
315 llvm_branch( block_xi0_exit );
317 llvm_set_insert_point( block_xi0_exit );
319 llvm::PHINode* xi0_phi = llvm_phi( llvm_type<REAL>::value , 2 );
320 xi0_phi->addIncoming( xi0_phi0 , block_xi0_small );
321 xi0_phi->addIncoming( xi0_phi1 , block_xi0_not_small );
322 xi0.setup( xi0_phi );
325 llvm::BasicBlock * block_dobs1 = llvm_new_basic_block();
326 llvm::BasicBlock * block_dobs1_exit = llvm_new_basic_block();
327 llvm_cond_branch( r_dobs , block_dobs1 , block_dobs1_exit );
328 llvm_set_insert_point( block_dobs1 );
330 llvm::BasicBlock * block_xi1_small = llvm_new_basic_block();
331 llvm::BasicBlock * block_xi1_not_small = llvm_new_basic_block();
332 llvm::Value* xi1_phi0;
333 llvm::Value* xi1_phi1;
336 llvm_cond_branch( w_smallP , block_xi1_small , block_xi1_not_small );
338 llvm_set_insert_point( block_xi1_small );
339 WordREG<REAL> xi1_tmp0 =
341 ( jit_constant((REAL)1/(REAL)3) -
342 jit_constant((REAL)1/(REAL)30)*w_sq*( jit_constant((REAL)1) -
343 jit_constant((REAL)1/(REAL)28)*w_sq*( jit_constant((REAL)1) -
344 jit_constant((REAL)1/(REAL)54)*w_sq ) ) );
345 xi1_phi0 = xi1_tmp0.get_val();
347 llvm_branch( block_dobs1_exit );
349 llvm_set_insert_point( block_xi1_not_small );
350 WordREG<REAL> xi1_tmp1 = cos(w)/w_sq - sin(w)/(w_sq*w);
351 xi1_phi1 = xi1_tmp1.get_val();
353 llvm_branch( block_dobs1_exit );
359 llvm_set_insert_point( block_dobs1_exit );
361 llvm::PHINode* xi1_phi = llvm_phi( llvm_type<REAL>::value , 3 );
362 xi1_phi->addIncoming( xi1_phi0 , block_xi1_small );
363 xi1_phi->addIncoming( xi1_phi1 , block_xi1_not_small );
364 xi1_phi->addIncoming( llvm_cast( llvm_type<REAL>::value , llvm_create_value(0.0) ) , block_xi0_exit );
365 xi1.setup( xi1_phi );
367 WordREG<REAL> cosu = cos(
u);
368 WordREG<REAL> sinu = sin(
u);
369 WordREG<REAL> cosw = cos(w);
370 WordREG<REAL> sinw = sin(w);
371 WordREG<REAL> sin2u = sin(jit_constant(2.0)*
u);
372 WordREG<REAL> cos2u = cos(jit_constant(2.0)*
u);
373 WordREG<REAL> ucosu =
u*cosu;
374 WordREG<REAL> usinu =
u*sinu;
375 WordREG<REAL> ucos2u =
u*cos2u;
376 WordREG<REAL> usin2u =
u*sin2u;
378 WordREG<REAL> denum = jit_constant(9.0) * u_sq - w_sq;
382 WordREG<REAL> subexp1 = u_sq - w_sq;
383 WordREG<REAL> subexp2 = jit_constant(8.0)*u_sq*cosw;
384 WordREG<REAL> subexp3 = (jit_constant(3.0)*u_sq + w_sq)*xi0;
386 f_site_re[0] = ( (subexp1)*cos2u + cosu*subexp2 + jit_constant(2.0)*usinu*subexp3 ) / denum ;
387 f_site_im[0] = ( (subexp1)*sin2u - sinu*subexp2 + jit_constant(2.0)*ucosu*subexp3 ) / denum ;
391 WordREG<REAL> subexp = (jit_constant(3.0)*u_sq -w_sq)*xi0;
393 f_site_re[1] = (jit_constant(2.0)*(ucos2u - ucosu*cosw)+subexp*sinu)/denum;
394 f_site_im[1] = (jit_constant(2.0)*(usin2u + usinu*cosw)+subexp*cosu)/denum;
399 WordREG<REAL> subexp=jit_constant(3.0)*xi0;
401 f_site_re[2] = (cos2u - cosu*cosw -usinu*subexp) /denum ;
402 f_site_im[2] = (sin2u + sinu*cosw -ucosu*subexp) /denum ;
405 llvm::BasicBlock * block_dobs2 = llvm_new_basic_block();
406 llvm::BasicBlock * block_dobs2_exit = llvm_new_basic_block();
407 llvm_cond_branch( r_dobs , block_dobs2 , block_dobs2_exit );
408 llvm_set_insert_point( block_dobs2 );
411 multi1d<WordREG<REAL> > r_1_re(3);
412 multi1d<WordREG<REAL> > r_1_im(3);
413 multi1d<WordREG<REAL> > r_2_re(3);
414 multi1d<WordREG<REAL> > r_2_im(3);
420 WordREG<REAL> subexp1 = u_sq - w_sq;
421 WordREG<REAL> subexp2 = jit_constant(8.0)*cosw + (jit_constant(3.0)*u_sq + w_sq)*xi0 ;
422 WordREG<REAL> subexp3 = jit_constant(4.0)*u_sq*cosw - (jit_constant(9.0)*u_sq + w_sq)*xi0 ;
424 r_1_re[0] = jit_constant(2.0)*(ucos2u - sin2u *(subexp1)+ucosu*( subexp2 )- sinu*( subexp3 ) );
425 r_1_im[0] = jit_constant(2.0)*(usin2u + cos2u *(subexp1)-usinu*( subexp2 )- cosu*( subexp3 ) );
430 WordREG<REAL> subexp1 = cosw + jit_constant(3.0) * xi0;
431 WordREG<REAL> subexp2 = jit_constant(2.0)*cosw + xi0*(w_sq - jit_constant(3.0)*u_sq);
433 r_1_re[1] = jit_constant(2.0)*((cos2u - jit_constant(2.0)*usin2u) + usinu*( subexp1 )) - cosu*( subexp2 );
434 r_1_im[1] = jit_constant(2.0)*((sin2u + jit_constant(2.0)*ucos2u) + ucosu*( subexp1 )) + sinu*( subexp2 );
440 WordREG<REAL> subexp = cosw - jit_constant(3.0)*xi0;
441 r_1_re[2] = -jit_constant(2.0)*sin2u -jit_constant(3.0)*ucosu*xi0 + sinu*( subexp );
442 r_1_im[2] = jit_constant(2.0)*cos2u +jit_constant(3.0)*usinu*xi0 + cosu*( subexp );
449 WordREG<REAL> subexp = cosw + xi0 + jit_constant(3.0)*u_sq*xi1;
450 r_2_re[0] = -jit_constant(2.0)*(cos2u +
u*( jit_constant(4.0)*ucosu*xi0 - sinu*(subexp )) );
451 r_2_im[0] = -jit_constant(2.0)*(sin2u -
u*( jit_constant(4.0)*usinu*xi0 + cosu*(subexp )) );
458 WordREG<REAL> subexp = cosw + xi0 - jit_constant(3.0)*u_sq*xi1;
459 r_2_re[1] = jit_constant(2.0)*ucosu*xi0 - sinu*( subexp ) ;
460 r_2_im[1] = jit_constant(-2.0)*usinu*xi0 - cosu*( subexp ) ;
465 WordREG<REAL> subexp = jit_constant(3.0)*xi1;
467 r_2_re[2] = cosu*xi0 - usinu*subexp ;
468 r_2_im[2] = -( sinu*xi0 + ucosu*subexp ) ;
471 WordREG<REAL> b_denum=jit_constant(2.0)*denum*denum;
474 for(
int j=0;
j < 3;
j++) {
477 WordREG<REAL> subexp1 = jit_constant(2.0)*
u;
478 WordREG<REAL> subexp2 = jit_constant(3.0)*u_sq - w_sq;
479 WordREG<REAL> subexp3 = jit_constant(2.0)*(jit_constant(15.0)*u_sq + w_sq);
481 b1_site_re_phi0[
j]=( subexp1*r_1_re[
j] + subexp2*r_2_re[
j] - subexp3*f_site_re[
j] )/b_denum;
482 b1_site_im_phi0[
j]=( subexp1*r_1_im[
j] + subexp2*r_2_im[
j] - subexp3*f_site_im[
j] )/b_denum;
486 WordREG<REAL> subexp1 = jit_constant(3.0)*
u;
487 WordREG<REAL> subexp2 = jit_constant(24.0)*
u;
489 b2_site_re_phi0[
j]=( r_1_re[
j]- subexp1*r_2_re[
j] - subexp2 * f_site_re[
j] )/b_denum;
490 b2_site_im_phi0[
j]=( r_1_im[
j] -subexp1*r_2_im[
j] - subexp2 * f_site_im[
j] )/b_denum;
497 llvm::BasicBlock * block_c0neg = llvm_new_basic_block();
498 llvm::BasicBlock * block_c0neg_exit = llvm_new_basic_block();
499 llvm_cond_branch( c0_negativeP , block_c0neg , block_c0neg_exit );
500 llvm_set_insert_point( block_c0neg );
502 multi1d<WordREG<REAL> > b1_site_re_phi1(3);
503 multi1d<WordREG<REAL> > b1_site_im_phi1(3);
505 multi1d<WordREG<REAL> > b2_site_re_phi1(3);
506 multi1d<WordREG<REAL> > b2_site_im_phi1(3);
511 b1_site_im_phi1[0] = b1_site_im_phi0[0] * jit_constant(-1.0);
514 b1_site_re_phi1[1] = b1_site_re_phi0[1] * jit_constant(-1.0);
517 b1_site_im_phi1[2] = b1_site_im_phi0[2] * jit_constant(-1.0);
520 b2_site_re_phi1[0] = b2_site_re_phi0[0] * jit_constant(-1.0);
523 b2_site_im_phi1[1] = b2_site_im_phi0[1] * jit_constant(-1.0);
526 b2_site_re_phi1[2] = b2_site_re_phi0[2] * jit_constant(-1.0);
528 llvm_branch( block_c0neg_exit );
530 llvm_set_insert_point( block_c0neg_exit );
535 multi1d<WordREG<REAL> > f_site_re(3);
536 multi1d<WordREG<REAL> > f_site_im(3);
538 multi1d<WordREG<REAL> > b1_site_re(3);
539 multi1d<WordREG<REAL> > b1_site_im(3);
541 multi1d<WordREG<REAL> > b2_site_re(3);
542 multi1d<WordREG<REAL> > b2_site_im(3);
544 b1_site_im[1] = b1_site_im_phi0[1];
545 b1_site_re[0] = b1_site_re_phi0[0];
546 b1_site_re[2] = b1_site_re_phi0[2];
548 b2_site_re[1] = b2_site_re_phi0[1];
549 b2_site_im[0] = b2_site_im_phi0[0];
550 b2_site_im[2] = b2_site_im_phi0[2];
553 qdpPHI( b1_site_im[0] , b1_site_im_phi1[0] , block_c0neg ,
554 b1_site_im_phi0[0] , block_dobs2 );
555 qdpPHI( b1_site_re[1] , b1_site_re_phi1[1] , block_c0neg ,
556 b1_site_re_phi0[1] , block_dobs2 );
557 qdpPHI( b1_site_im[2] , b1_site_im_phi1[2] , block_c0neg ,
558 b1_site_im_phi0[2] , block_dobs2 );
559 qdpPHI( b2_site_re[0] , b2_site_re_phi1[0] , block_c0neg ,
560 b2_site_re_phi0[0] , block_dobs2 );
561 qdpPHI( b2_site_im[1] , b2_site_im_phi1[1] , block_c0neg ,
562 b2_site_im_phi0[1] , block_dobs2 );
563 qdpPHI( b2_site_re[2] , b2_site_re_phi1[2] , block_c0neg ,
564 b2_site_re_phi0[2] , block_dobs2 );
570 b10_j.elem().elem().real() = b1_site_re[0];
571 b10_j.elem().elem().imag() = b1_site_im[0];
573 b20_j.elem().elem().real() = b2_site_re[0];
574 b20_j.elem().elem().imag() = b2_site_im[0];
576 b11_j.elem().elem().real() = b1_site_re[1];
577 b11_j.elem().elem().imag() = b1_site_im[1];
579 b21_j.elem().elem().real() = b2_site_re[1];
580 b21_j.elem().elem().imag() = b2_site_im[1];
582 b12_j.elem().elem().real() = b1_site_re[2];
583 b12_j.elem().elem().imag() = b1_site_im[2];
585 b22_j.elem().elem().real() = b2_site_re[2];
586 b22_j.elem().elem().imag() = b2_site_im[2];
588 llvm_branch( block_dobs2_exit );
593 llvm_set_insert_point( block_dobs2_exit );
601 f0_j.elem().elem().real() = f_site_re[0];
602 f0_j.elem().elem().imag() = f_site_im[0];
604 f1_j.elem().elem().real() = f_site_re[1];
605 f1_j.elem().elem().imag() = f_site_im[1];
607 f2_j.elem().elem().real() = f_site_re[2];
608 f2_j.elem().elem().imag() = f_site_im[2];
612 llvm::BasicBlock * block_c0neg1 = llvm_new_basic_block();
613 llvm::BasicBlock * block_c0neg1_exit = llvm_new_basic_block();
614 llvm_cond_branch( c0_negativeP , block_c0neg1 , block_c0neg1_exit );
615 llvm_set_insert_point( block_c0neg1 );
620 f0_j.elem().elem().imag() *= jit_constant(-1.0);
624 f1_j.elem().elem().real() *= jit_constant(-1.0);
628 f2_j.elem().elem().imag() *= jit_constant(-1.0);
630 llvm_branch( block_c0neg1_exit );
632 llvm_set_insert_point( block_c0neg1_exit );
638 llvm_branch( block_exit );
639 llvm_set_insert_point( block_exit );
641 return jit_function_epilogue_get_cuf(
"jit_get_fs_bs.ptx" , __PRETTY_FUNCTION__ );
static multi1d< LatticeColorMatrix > u
multi1d< LatticeColorMatrix > Q