CHROMA
stout_utils_nvvm.cc
Go to the documentation of this file.
1 #include "qdp.h"
2 
3 #ifdef QDP_IS_QDPJIT
4 #ifdef QDPJIT_IS_QDPJITNVVM
5 
6 using namespace QDP;
7 
8 //#define QDP_JIT_NVVM_USE_LEGACY_LAUNCH
9 
10 void function_get_fs_bs_exec(CUfunction function,
11  const LatticeColorMatrix& Q,
12  const LatticeColorMatrix& QQ,
13  multi1d<LatticeComplex>& f,
14  multi1d<LatticeComplex>& b1,
15  multi1d<LatticeComplex>& b2,
16  bool dobs)
17 {
18  //QDPIO::cout << __FILE__ << ":" << __LINE__ << "\n";
19 
20  AddressLeaf addr_leaf(all);
21 
22  int junk_0 = forEach(Q, addr_leaf, NullCombine());
23  int junk_1 = forEach(QQ, addr_leaf, NullCombine());
24  int junk_2 = forEach(f[0], addr_leaf, NullCombine());
25  int junk_3 = forEach(f[1], addr_leaf, NullCombine());
26  int junk_4 = forEach(f[2], addr_leaf, NullCombine());
27  int junk_5 = forEach(b1[0], addr_leaf, NullCombine());
28  int junk_6 = forEach(b1[1], addr_leaf, NullCombine());
29  int junk_7 = forEach(b1[2], addr_leaf, NullCombine());
30  int junk_8 = forEach(b2[0], addr_leaf, NullCombine());
31  int junk_9 = forEach(b2[1], addr_leaf, NullCombine());
32  int junk_10= forEach(b2[2], addr_leaf, NullCombine());
33 
34  int lo = 0;
35  int hi = Layout::sitesOnNode();
36 #ifndef QDP_JIT_NVVM_USE_LEGACY_LAUNCH
37  JitParam jit_lo( QDP_get_global_cache().addJitParamInt( lo ) );
38  JitParam jit_hi( QDP_get_global_cache().addJitParamInt( hi ) );
39  JitParam jit_dobs( QDP_get_global_cache().addJitParamBool( dobs ) );
40  std::vector<QDPCache::ArgKey> ids;
41  ids.push_back( jit_lo.get_id() );
42  ids.push_back( jit_hi.get_id() );
43  ids.push_back( jit_dobs.get_id() );
44  for(unsigned i=0; i < addr_leaf.ids.size(); ++i)
45  ids.push_back( addr_leaf.ids[i] );
46  jit_launch(function,hi-lo,ids);
47 #else
48  unsigned char dobs_u8 = dobs ? 1 : 0;
49  std::vector<void*> addr;
50  addr.push_back( &lo );
51  addr.push_back( &hi );
52  addr.push_back( &dobs_u8 );
53  int addr_dest=addr.size();
54  for(int i=0; i < addr_leaf.addr.size(); ++i) {
55  addr.push_back( &addr_leaf.addr[i] );
56  }
57  jit_launch(function,hi-lo,addr);
58 #endif
59 }
60 
61 
62 
63 WordREG<REAL> jit_constant( double f )
64 {
65  return WordREG<REAL>(f);
66 }
67 
68 
69 CUfunction function_get_fs_bs_build(const LatticeColorMatrix& Q,
70  const LatticeColorMatrix& QQ,
71  multi1d<LatticeComplex>& f,
72  multi1d<LatticeComplex>& b1,
73  multi1d<LatticeComplex>& b2)
74 {
75  if (ptx_db::db_enabled) {
76  CUfunction func = llvm_ptx_db( __PRETTY_FUNCTION__ );
77  if (func)
78  return func;
79  }
80 
81  //std::cout << __PRETTY_FUNCTION__ << ": entering\n";
82 
83  llvm_start_new_function();
84 
85  ParamRef p_lo = llvm_add_param<int>();
86  ParamRef p_hi = llvm_add_param<int>();
87  ParamRef p_dobs = llvm_add_param<bool>();
88 
89  ParamLeaf param_leaf;
90 
91  typedef typename LeafFunctor<LatticeColorMatrix, ParamLeaf>::Type_t LCMJIT;
92  typedef typename LeafFunctor<LatticeComplex , ParamLeaf>::Type_t LCJIT;
93 
94  LCMJIT Q_jit(forEach(Q, param_leaf, TreeCombine()));
95  LCMJIT QQ_jit(forEach(QQ, param_leaf, TreeCombine()));
96  LCJIT f0_jit(forEach(f[0], param_leaf, TreeCombine()));
97  LCJIT f1_jit(forEach(f[1], param_leaf, TreeCombine()));
98  LCJIT f2_jit(forEach(f[2], param_leaf, TreeCombine()));
99  LCJIT b10_jit(forEach(b1[0], param_leaf, TreeCombine()));
100  LCJIT b11_jit(forEach(b1[1], param_leaf, TreeCombine()));
101  LCJIT b12_jit(forEach(b1[2], param_leaf, TreeCombine()));
102  LCJIT b20_jit(forEach(b2[0], param_leaf, TreeCombine()));
103  LCJIT b21_jit(forEach(b2[1], param_leaf, TreeCombine()));
104  LCJIT b22_jit(forEach(b2[2], param_leaf, TreeCombine()));
105 
106  llvm::Value* r_lo = llvm_derefParam( p_lo );
107  llvm::Value* r_hi = llvm_derefParam( p_hi );
108  llvm::Value* r_dobs = llvm_derefParam( p_dobs );
109  //llvm::Value* r_nobs = llvm_not( r_dobs );
110 
111  llvm::Value* r_idx = llvm_thread_idx();
112 
113  llvm_cond_exit( llvm_ge( r_idx , r_hi ) );
114 
115 
116  auto& Q_j = Q_jit.elem(JitDeviceLayout::Coalesced,r_idx);
117  auto& QQ_j = QQ_jit.elem(JitDeviceLayout::Coalesced,r_idx);
118 
119  auto& f0_j = f0_jit.elem(JitDeviceLayout::Coalesced,r_idx);
120  auto& f1_j = f1_jit.elem(JitDeviceLayout::Coalesced,r_idx);
121  auto& f2_j = f2_jit.elem(JitDeviceLayout::Coalesced,r_idx);
122 
123  auto& b10_j = b10_jit.elem(JitDeviceLayout::Coalesced,r_idx);
124  auto& b11_j = b11_jit.elem(JitDeviceLayout::Coalesced,r_idx);
125  auto& b12_j = b12_jit.elem(JitDeviceLayout::Coalesced,r_idx);
126  auto& b20_j = b20_jit.elem(JitDeviceLayout::Coalesced,r_idx);
127  auto& b21_j = b21_jit.elem(JitDeviceLayout::Coalesced,r_idx);
128  auto& b22_j = b22_jit.elem(JitDeviceLayout::Coalesced,r_idx);
129 
130  llvm::BasicBlock * block_exit = llvm_new_basic_block();
131  {
132  // Get the traces
133  PColorMatrixREG< RComplexREG< WordREG<REAL> >, Nc> Q_site = Q_j.elem();
134  PColorMatrixREG< RComplexREG< WordREG<REAL> >, Nc> QQ_site = QQ_j.elem();
135  PColorMatrixREG< RComplexREG< WordREG<REAL> >, Nc> QQQ = QQ_site*Q_site;
136 
137  // Real trQQQ;
138  // trQQQ.elem() = realTrace(QQQ);
139  // Real trQQ;
140  // trQQ.elem() = realTrace(QQ_site);
141 
142  // auto trQQQ = realTrace(QQQ);
143  // auto trQQ = realTrace(QQ_site);
144 
145  PScalarREG< RScalarREG< WordREG<REAL> > > trQQQ = realTrace(QQQ);
146  PScalarREG< RScalarREG< WordREG<REAL> > > trQQ = realTrace(QQ_site);
147 
148  WordREG<REAL> c0 = jit_constant((REAL)1/(REAL)3) * trQQQ.elem().elem(); // eq 13
149  WordREG<REAL> c1 = jit_constant((REAL)1/(REAL)2) * trQQ.elem().elem(); // eq 15
150 
151  llvm::BasicBlock * block_c1_lt = llvm_new_basic_block();
152  llvm::BasicBlock * block_c1_nlt = llvm_new_basic_block();
153  llvm_cond_branch( llvm_lt( c1.get_val() , llvm_create_value( 4.0e-3 ) ) , block_c1_lt , block_c1_nlt );
154 
155  llvm_set_insert_point( block_c1_lt );
156  { // if( c1 < 4.0e-3 )
157  f0_j.elem().elem().real() = jit_constant(1.0) - c0 * c0 / jit_constant(720.0);
158  f0_j.elem().elem().imag() = -( c0 / jit_constant(6.0) )*( jit_constant(1.0) -(c1/jit_constant(20.0))*(jit_constant(1.0)-(c1/jit_constant(42.0)))) ;
159 
160  f1_j.elem().elem().real() = c0/jit_constant(24.0)*(jit_constant(1.0)-c1/jit_constant(15.0)*(jit_constant(1.0)-jit_constant(3.0)*c1/jit_constant(112.0))) ;
161  f1_j.elem().elem().imag() = jit_constant(1.0)-c1/jit_constant(6.0)*(jit_constant(1.0)-c1/jit_constant(20.0)*(jit_constant(1.0)-c1/jit_constant(42.0)))-c0*c0/jit_constant(5040.0);
162 
163  f2_j.elem().elem().real() = jit_constant(0.5)*(jit_constant(-1.0)+c1/jit_constant(12.0)*(jit_constant(1.0)-c1/jit_constant(30.0)*(jit_constant(1.0)-c1/jit_constant(56.0)))+c0*c0/jit_constant(20160.0));
164  f2_j.elem().elem().imag() = jit_constant(0.5)*(c0/jit_constant(60.0)*(jit_constant(1.0)-c1/jit_constant(21.0)*(jit_constant(1.0)-c1/jit_constant(48.0))));
165 
166 
167  llvm::BasicBlock * block_dobs_0 = llvm_new_basic_block();
168  llvm_cond_branch( r_dobs , block_dobs_0 , block_exit );
169 
170  { // dobs
171  llvm_set_insert_point( block_dobs_0 );
172 
173  b20_j.elem().elem().real() = -c0/jit_constant(360.0);
174  b20_j.elem().elem().imag() = -jit_constant(1.0/6.0)*(jit_constant(1.0)-(c1/jit_constant(20.0))*(jit_constant(1.0)-c1/jit_constant(42.0)));
175 
176  // partial f0 / partial c1
177  //
178  b10_j.elem().elem().real() = jit_constant(0);
179  b10_j.elem().elem().imag() = (c0/jit_constant(120.0))*(jit_constant(1.0)-c1/jit_constant(21.0));
180 
181  // partial f1 / partial c0
182  //
183  b21_j.elem().elem().real() = jit_constant(1.0/24.0)*(jit_constant(1.0)-c1/jit_constant(15.0)*(jit_constant(1.0)-jit_constant(3.0)*c1/jit_constant(112.0)));
184  b21_j.elem().elem().imag() = -c0/jit_constant(2520.0);
185 
186 
187  // partial f1 / partial c1
188  b11_j.elem().elem().real() = -c0/jit_constant(360.0)*(jit_constant(1.0) - jit_constant(3.0)*c1/jit_constant(56.0) );
189  b11_j.elem().elem().imag() = -jit_constant(1.0/6.0)*(jit_constant(1.0)-c1/jit_constant(10.0)*(jit_constant(1.0)-c1/jit_constant(28.0)));
190 
191  // partial f2/ partial c0
192  b22_j.elem().elem().real() = jit_constant(0.5)*c0/jit_constant(10080.0);
193  b22_j.elem().elem().imag() = jit_constant(0.5)*( jit_constant(1.0/60.0)*(jit_constant(1.0)-c1/jit_constant(21.0)*(jit_constant(1.0)-c1/jit_constant(48.0))) );
194 
195  // partial f2/ partial c1
196  b12_j.elem().elem().real() = jit_constant(0.5)*( jit_constant(1.0/12.0)*(jit_constant(1.0)-(jit_constant(2.0)*c1/jit_constant(30.0))*(jit_constant(1.0)-jit_constant(3.0)*c1/jit_constant(112.0))) );
197  b12_j.elem().elem().imag() = jit_constant(0.5)*( -c0/jit_constant(1260.0)*(jit_constant(1.0)-c1/jit_constant(24.0)) );
198 
199  llvm_branch( block_exit );
200 
201  } // Dobs==true
202 
203  } // if (c1 < 4.0e-3 )
204 
205  llvm_set_insert_point( block_c1_nlt );
206 
207 
208  llvm::Value* c0_negativeP = llvm_lt( c0.get_val() , llvm_create_value(0.0) );
209  WordREG<REAL> c0abs = fabs(c0);
210  WordREG<REAL> c0max = jit_constant(2.0) * pow( c1 / jit_constant(3.0) , jit_constant(1.5) );
211  WordREG<REAL> theta;
212  WordREG<REAL> eps = (c0max - c0abs)/c0max;
213 
214 
215  // llvm::BasicBlock * block_dobs_0 = llvm_new_basic_block();
216  // llvm_cond_branch( r_dobs , block_dobs_0 , block_exit );
217 
218  llvm::Value* theta_phi0;
219  llvm::Value* theta_phi1;
220  llvm::Value* theta_phi2;
221 
222  llvm::BasicBlock * block_eps_0 = llvm_new_basic_block();
223  llvm::BasicBlock * block_eps_1 = llvm_new_basic_block();
224  llvm::BasicBlock * block_eps_2 = llvm_new_basic_block();
225  llvm::BasicBlock * block_eps_3 = llvm_new_basic_block();
226  llvm::BasicBlock * block_eps_exit = llvm_new_basic_block();
227 
228  llvm_cond_branch( llvm_lt( eps.get_val() , llvm_create_value(0.0) ) , block_eps_0 , block_eps_1 );
229 
230  llvm_set_insert_point( block_eps_0 ); // epsilon < 0
231  theta_phi0 = llvm_create_value(0.0);
232  llvm_branch(block_eps_exit);
233 
234  llvm_set_insert_point( block_eps_1 );
235  llvm_cond_branch( llvm_lt( eps.get_val() , llvm_create_value(1.0e-3) ) , block_eps_2 , block_eps_3 );
236 
237  llvm_set_insert_point( block_eps_2 ); // epsilon < 1.e-3
238  WordREG<REAL> sqtwo = sqrt( jit_constant(2.0) );
239  WordREG<REAL> theta_tmp;
240  theta_tmp =
241  sqtwo *
242  sqrt(eps) *
243  ( jit_constant(1.0) +
244  ( jit_constant(1/(REAL)12) +
245  ( jit_constant(3/(REAL)160) +
246  ( jit_constant(5/(REAL)896) +
247  ( jit_constant(35/(REAL)18432) +
248  jit_constant(63/(REAL)90112) * eps ) *
249  eps) *
250  eps) *
251  eps) *
252  eps);
253  theta_phi1 = theta_tmp.get_val();
254  llvm_branch(block_eps_exit);
255 
256  llvm_set_insert_point( block_eps_3 ); // else
257  WordREG<REAL> theta_tmp2;
258  theta_tmp2 = acos( c0abs/c0max );
259  theta_phi2 = theta_tmp2.get_val();
260  llvm_branch(block_eps_exit);
261 
262  llvm_set_insert_point( block_eps_exit ); // exit
263 
264  llvm::PHINode* theta_phi = llvm_phi( llvm_type<REAL>::value , 3 );
265  theta_phi->addIncoming( theta_phi0 , block_eps_0 );
266  theta_phi->addIncoming( theta_phi1 , block_eps_2 );
267  theta_phi->addIncoming( theta_phi2 , block_eps_3 );
268 
269  theta.setup( theta_phi );
270 
271 
272 
273  multi1d<WordREG<REAL> > f_site_re(3);
274  multi1d<WordREG<REAL> > f_site_im(3);
275 
276  multi1d<WordREG<REAL> > b1_site_re_phi0(3);
277  multi1d<WordREG<REAL> > b1_site_im_phi0(3);
278 
279  multi1d<WordREG<REAL> > b2_site_re_phi0(3);
280  multi1d<WordREG<REAL> > b2_site_im_phi0(3);
281 
282 
283 
284  WordREG<REAL> u = sqrt(c1/jit_constant(3.0))*cos(theta/jit_constant(3.0));
285  WordREG<REAL> w = sqrt(c1)*sin(theta/jit_constant(3.0));
286 
287  WordREG<REAL> u_sq = u*u;
288  WordREG<REAL> w_sq = w*w;
289 
290  WordREG<REAL> xi0,xi1;
291 
292  llvm::Value *w_smallP = llvm_lt( (fabs( w )).get_val() , llvm_create_value( 0.05 ) );
293  llvm::BasicBlock * block_xi0_exit = llvm_new_basic_block();
294  { // xi0
295  llvm::BasicBlock * block_xi0_small = llvm_new_basic_block();
296  llvm::BasicBlock * block_xi0_not_small = llvm_new_basic_block();
297  llvm::Value* xi0_phi0;
298  llvm::Value* xi0_phi1;
299 
300  llvm_cond_branch( w_smallP , block_xi0_small , block_xi0_not_small );
301 
302  llvm_set_insert_point( block_xi0_small );
303  WordREG<REAL> xi0_tmp0 =
304  jit_constant(1.0) -
305  (jit_constant(1.0/6.0)*w_sq*( jit_constant(1.0) -
306  (jit_constant(1.0/20.)*w_sq*( jit_constant(1.0) -
307  (jit_constant(1.0/42.0)*w_sq ) ))));
308  xi0_phi0 = xi0_tmp0.get_val();
309 
310  llvm_branch( block_xi0_exit );
311 
312  llvm_set_insert_point( block_xi0_not_small );
313  WordREG<REAL> xi0_tmp1 = sin(w)/w;
314  xi0_phi1 = xi0_tmp1.get_val();
315  llvm_branch( block_xi0_exit );
316 
317  llvm_set_insert_point( block_xi0_exit );
318 
319  llvm::PHINode* xi0_phi = llvm_phi( llvm_type<REAL>::value , 2 );
320  xi0_phi->addIncoming( xi0_phi0 , block_xi0_small );
321  xi0_phi->addIncoming( xi0_phi1 , block_xi0_not_small );
322  xi0.setup( xi0_phi );
323  } // xi0
324 
325  llvm::BasicBlock * block_dobs1 = llvm_new_basic_block();
326  llvm::BasicBlock * block_dobs1_exit = llvm_new_basic_block();
327  llvm_cond_branch( r_dobs , block_dobs1 , block_dobs1_exit );
328  llvm_set_insert_point( block_dobs1 );
329  //llvm::BasicBlock * block_xi1_exit = llvm_new_basic_block();
330  llvm::BasicBlock * block_xi1_small = llvm_new_basic_block();
331  llvm::BasicBlock * block_xi1_not_small = llvm_new_basic_block();
332  llvm::Value* xi1_phi0;
333  llvm::Value* xi1_phi1;
334  { // xi1
335 
336  llvm_cond_branch( w_smallP , block_xi1_small , block_xi1_not_small );
337 
338  llvm_set_insert_point( block_xi1_small );
339  WordREG<REAL> xi1_tmp0 =
340  jit_constant(-1.0)*
341  ( jit_constant((REAL)1/(REAL)3) -
342  jit_constant((REAL)1/(REAL)30)*w_sq*( jit_constant((REAL)1) -
343  jit_constant((REAL)1/(REAL)28)*w_sq*( jit_constant((REAL)1) -
344  jit_constant((REAL)1/(REAL)54)*w_sq ) ) );
345  xi1_phi0 = xi1_tmp0.get_val();
346  //llvm_branch( block_xi1_exit );
347  llvm_branch( block_dobs1_exit );
348 
349  llvm_set_insert_point( block_xi1_not_small );
350  WordREG<REAL> xi1_tmp1 = cos(w)/w_sq - sin(w)/(w_sq*w);
351  xi1_phi1 = xi1_tmp1.get_val();
352  //llvm_branch( block_xi1_exit );
353  llvm_branch( block_dobs1_exit );
354 
355  //llvm_set_insert_point( block_xi1_exit );
356 
357  //llvm_branch( block_dobs1_exit );
358  } // xi1
359  llvm_set_insert_point( block_dobs1_exit );
360 
361  llvm::PHINode* xi1_phi = llvm_phi( llvm_type<REAL>::value , 3 );
362  xi1_phi->addIncoming( xi1_phi0 , block_xi1_small );
363  xi1_phi->addIncoming( xi1_phi1 , block_xi1_not_small );
364  xi1_phi->addIncoming( llvm_cast( llvm_type<REAL>::value , llvm_create_value(0.0) ) , block_xi0_exit );
365  xi1.setup( xi1_phi );
366 
367  WordREG<REAL> cosu = cos(u);
368  WordREG<REAL> sinu = sin(u);
369  WordREG<REAL> cosw = cos(w);
370  WordREG<REAL> sinw = sin(w);
371  WordREG<REAL> sin2u = sin(jit_constant(2.0)*u);
372  WordREG<REAL> cos2u = cos(jit_constant(2.0)*u);
373  WordREG<REAL> ucosu = u*cosu;
374  WordREG<REAL> usinu = u*sinu;
375  WordREG<REAL> ucos2u = u*cos2u;
376  WordREG<REAL> usin2u = u*sin2u;
377 
378  WordREG<REAL> denum = jit_constant(9.0) * u_sq - w_sq;
379 
380 
381  {
382  WordREG<REAL> subexp1 = u_sq - w_sq;
383  WordREG<REAL> subexp2 = jit_constant(8.0)*u_sq*cosw;
384  WordREG<REAL> subexp3 = (jit_constant(3.0)*u_sq + w_sq)*xi0;
385 
386  f_site_re[0] = ( (subexp1)*cos2u + cosu*subexp2 + jit_constant(2.0)*usinu*subexp3 ) / denum ;
387  f_site_im[0] = ( (subexp1)*sin2u - sinu*subexp2 + jit_constant(2.0)*ucosu*subexp3 ) / denum ;
388  }
389 
390  {
391  WordREG<REAL> subexp = (jit_constant(3.0)*u_sq -w_sq)*xi0;
392 
393  f_site_re[1] = (jit_constant(2.0)*(ucos2u - ucosu*cosw)+subexp*sinu)/denum;
394  f_site_im[1] = (jit_constant(2.0)*(usin2u + usinu*cosw)+subexp*cosu)/denum;
395  }
396 
397 
398  {
399  WordREG<REAL> subexp=jit_constant(3.0)*xi0;
400 
401  f_site_re[2] = (cos2u - cosu*cosw -usinu*subexp) /denum ;
402  f_site_im[2] = (sin2u + sinu*cosw -ucosu*subexp) /denum ;
403  }
404 
405  llvm::BasicBlock * block_dobs2 = llvm_new_basic_block();
406  llvm::BasicBlock * block_dobs2_exit = llvm_new_basic_block();
407  llvm_cond_branch( r_dobs , block_dobs2 , block_dobs2_exit );
408  llvm_set_insert_point( block_dobs2 );
409  //if( dobs == true )
410  {
411  multi1d<WordREG<REAL> > r_1_re(3);
412  multi1d<WordREG<REAL> > r_1_im(3);
413  multi1d<WordREG<REAL> > r_2_re(3);
414  multi1d<WordREG<REAL> > r_2_im(3);
415 
416  // r_1[0]=Double(2)*cmplx(u, u_sq-w_sq)*exp2iu
417  // + 2.0*expmiu*( cmplx(8.0*u*cosw, -4.0*u_sq*cosw)
418  // + cmplx(u*(3.0*u_sq+w_sq),9.0*u_sq+w_sq)*xi0 );
419  {
420  WordREG<REAL> subexp1 = u_sq - w_sq;
421  WordREG<REAL> subexp2 = jit_constant(8.0)*cosw + (jit_constant(3.0)*u_sq + w_sq)*xi0 ;
422  WordREG<REAL> subexp3 = jit_constant(4.0)*u_sq*cosw - (jit_constant(9.0)*u_sq + w_sq)*xi0 ;
423 
424  r_1_re[0] = jit_constant(2.0)*(ucos2u - sin2u *(subexp1)+ucosu*( subexp2 )- sinu*( subexp3 ) );
425  r_1_im[0] = jit_constant(2.0)*(usin2u + cos2u *(subexp1)-usinu*( subexp2 )- cosu*( subexp3 ) );
426  }
427 
428  // r_1[1]=cmplx(2.0, 4.0*u)*exp2iu + expmiu*cmplx(-2.0*cosw-(w_sq-3.0*u_sq)*xi0,2.0*u*cosw+6.0*u*xi0);
429  {
430  WordREG<REAL> subexp1 = cosw + jit_constant(3.0) * xi0;
431  WordREG<REAL> subexp2 = jit_constant(2.0)*cosw + xi0*(w_sq - jit_constant(3.0)*u_sq);
432 
433  r_1_re[1] = jit_constant(2.0)*((cos2u - jit_constant(2.0)*usin2u) + usinu*( subexp1 )) - cosu*( subexp2 );
434  r_1_im[1] = jit_constant(2.0)*((sin2u + jit_constant(2.0)*ucos2u) + ucosu*( subexp1 )) + sinu*( subexp2 );
435  }
436 
437 
438  // r_1[2]=2.0*timesI(exp2iu) +expmiu*cmplx(-3.0*u*xi0, cosw-3*xi0);
439  {
440  WordREG<REAL> subexp = cosw - jit_constant(3.0)*xi0;
441  r_1_re[2] = -jit_constant(2.0)*sin2u -jit_constant(3.0)*ucosu*xi0 + sinu*( subexp );
442  r_1_im[2] = jit_constant(2.0)*cos2u +jit_constant(3.0)*usinu*xi0 + cosu*( subexp );
443  }
444 
445 
446  //r_2[0]=-2.0*exp2iu + 2*cmplx(0,u)*expmiu*cmplx(cosw+xi0+3*u_sq*xi1,
447  // 4*u*xi0);
448  {
449  WordREG<REAL> subexp = cosw + xi0 + jit_constant(3.0)*u_sq*xi1;
450  r_2_re[0] = -jit_constant(2.0)*(cos2u + u*( jit_constant(4.0)*ucosu*xi0 - sinu*(subexp )) );
451  r_2_im[0] = -jit_constant(2.0)*(sin2u - u*( jit_constant(4.0)*usinu*xi0 + cosu*(subexp )) );
452  }
453 
454 
455  // r_2[1]= expmiu*cmplx(cosw+xi0-3.0*u_sq*xi1, 2.0*u*xi0);
456  // r_2[1] = timesMinusI(r_2[1]);
457  {
458  WordREG<REAL> subexp = cosw + xi0 - jit_constant(3.0)*u_sq*xi1;
459  r_2_re[1] = jit_constant(2.0)*ucosu*xi0 - sinu*( subexp ) ;
460  r_2_im[1] = jit_constant(-2.0)*usinu*xi0 - cosu*( subexp ) ;
461  }
462 
463  //r_2[2]=expmiu*cmplx(xi0, -3.0*u*xi1);
464  {
465  WordREG<REAL> subexp = jit_constant(3.0)*xi1;
466 
467  r_2_re[2] = cosu*xi0 - usinu*subexp ;
468  r_2_im[2] = -( sinu*xi0 + ucosu*subexp ) ;
469  }
470 
471  WordREG<REAL> b_denum=jit_constant(2.0)*denum*denum;
472 
473 
474  for(int j=0; j < 3; j++) {
475 
476  {
477  WordREG<REAL> subexp1 = jit_constant(2.0)*u;
478  WordREG<REAL> subexp2 = jit_constant(3.0)*u_sq - w_sq;
479  WordREG<REAL> subexp3 = jit_constant(2.0)*(jit_constant(15.0)*u_sq + w_sq);
480 
481  b1_site_re_phi0[j]=( subexp1*r_1_re[j] + subexp2*r_2_re[j] - subexp3*f_site_re[j] )/b_denum;
482  b1_site_im_phi0[j]=( subexp1*r_1_im[j] + subexp2*r_2_im[j] - subexp3*f_site_im[j] )/b_denum;
483  }
484 
485  {
486  WordREG<REAL> subexp1 = jit_constant(3.0)*u;
487  WordREG<REAL> subexp2 = jit_constant(24.0)*u;
488 
489  b2_site_re_phi0[j]=( r_1_re[j]- subexp1*r_2_re[j] - subexp2 * f_site_re[j] )/b_denum;
490  b2_site_im_phi0[j]=( r_1_im[j] -subexp1*r_2_im[j] - subexp2 * f_site_im[j] )/b_denum;
491  }
492  }
493 
494 
495  // Now flip the coefficients of the b-s
496 
497  llvm::BasicBlock * block_c0neg = llvm_new_basic_block();
498  llvm::BasicBlock * block_c0neg_exit = llvm_new_basic_block();
499  llvm_cond_branch( c0_negativeP , block_c0neg , block_c0neg_exit );
500  llvm_set_insert_point( block_c0neg );
501 
502  multi1d<WordREG<REAL> > b1_site_re_phi1(3);
503  multi1d<WordREG<REAL> > b1_site_im_phi1(3);
504 
505  multi1d<WordREG<REAL> > b2_site_re_phi1(3);
506  multi1d<WordREG<REAL> > b2_site_im_phi1(3);
507 
508  //if( c0_negativeP )
509  {
510  //b1_site[0] = conj(b1_site[0]);
511  b1_site_im_phi1[0] = b1_site_im_phi0[0] * jit_constant(-1.0);
512 
513  //b1_site[1] = -conj(b1_site[1]);
514  b1_site_re_phi1[1] = b1_site_re_phi0[1] * jit_constant(-1.0);
515 
516  //b1_site[2] = conj(b1_site[2]);
517  b1_site_im_phi1[2] = b1_site_im_phi0[2] * jit_constant(-1.0);
518 
519  //b2_site[0] = -conj(b2_site[0]);
520  b2_site_re_phi1[0] = b2_site_re_phi0[0] * jit_constant(-1.0);
521 
522  //b2_site[1] = conj(b2_site[1]);
523  b2_site_im_phi1[1] = b2_site_im_phi0[1] * jit_constant(-1.0);
524 
525  //b2_site[2] = -conj(b2_site[2]);
526  b2_site_re_phi1[2] = b2_site_re_phi0[2] * jit_constant(-1.0);
527  }
528  llvm_branch( block_c0neg_exit );
529 
530  llvm_set_insert_point( block_c0neg_exit );
531 
532  //
533  // Now, PHI' them together
534  //
535  multi1d<WordREG<REAL> > f_site_re(3);
536  multi1d<WordREG<REAL> > f_site_im(3);
537 
538  multi1d<WordREG<REAL> > b1_site_re(3);
539  multi1d<WordREG<REAL> > b1_site_im(3);
540 
541  multi1d<WordREG<REAL> > b2_site_re(3);
542  multi1d<WordREG<REAL> > b2_site_im(3);
543 
544  b1_site_im[1] = b1_site_im_phi0[1];
545  b1_site_re[0] = b1_site_re_phi0[0];
546  b1_site_re[2] = b1_site_re_phi0[2];
547 
548  b2_site_re[1] = b2_site_re_phi0[1];
549  b2_site_im[0] = b2_site_im_phi0[0];
550  b2_site_im[2] = b2_site_im_phi0[2];
551 
552 
553  qdpPHI( b1_site_im[0] , b1_site_im_phi1[0] , block_c0neg ,
554  b1_site_im_phi0[0] , block_dobs2 );
555  qdpPHI( b1_site_re[1] , b1_site_re_phi1[1] , block_c0neg ,
556  b1_site_re_phi0[1] , block_dobs2 );
557  qdpPHI( b1_site_im[2] , b1_site_im_phi1[2] , block_c0neg ,
558  b1_site_im_phi0[2] , block_dobs2 );
559  qdpPHI( b2_site_re[0] , b2_site_re_phi1[0] , block_c0neg ,
560  b2_site_re_phi0[0] , block_dobs2 );
561  qdpPHI( b2_site_im[1] , b2_site_im_phi1[1] , block_c0neg ,
562  b2_site_im_phi0[1] , block_dobs2 );
563  qdpPHI( b2_site_re[2] , b2_site_re_phi1[2] , block_c0neg ,
564  b2_site_re_phi0[2] , block_dobs2 );
565 
566 
567  // Load back into the lattice sized object
568  //for(int j=0; j < 3; j++) {
569 
570  b10_j.elem().elem().real() = b1_site_re[0];
571  b10_j.elem().elem().imag() = b1_site_im[0];
572 
573  b20_j.elem().elem().real() = b2_site_re[0]; //ok
574  b20_j.elem().elem().imag() = b2_site_im[0]; //ok
575 
576  b11_j.elem().elem().real() = b1_site_re[1]; //ok
577  b11_j.elem().elem().imag() = b1_site_im[1]; //ok
578 
579  b21_j.elem().elem().real() = b2_site_re[1];
580  b21_j.elem().elem().imag() = b2_site_im[1];
581 
582  b12_j.elem().elem().real() = b1_site_re[2];
583  b12_j.elem().elem().imag() = b1_site_im[2];
584 
585  b22_j.elem().elem().real() = b2_site_re[2];
586  b22_j.elem().elem().imag() = b2_site_im[2];
587 
588  llvm_branch( block_dobs2_exit );
589  //}
590  } // end of if (dobs==true)
591  //llvm_label( cont_6 );
592 
593  llvm_set_insert_point( block_dobs2_exit );
594 
595 
596  // Now when everything is done flip signs of the b-s (can't do this before
597  // as the unflipped f-s are needed to find the b-s
598 
599  // Load back into the lattice sized object
600  // for(int j=0; j < 3; j++) {
601  f0_j.elem().elem().real() = f_site_re[0];
602  f0_j.elem().elem().imag() = f_site_im[0];
603 
604  f1_j.elem().elem().real() = f_site_re[1];
605  f1_j.elem().elem().imag() = f_site_im[1];
606 
607  f2_j.elem().elem().real() = f_site_re[2];
608  f2_j.elem().elem().imag() = f_site_im[2];
609  //}
610 
611 
612  llvm::BasicBlock * block_c0neg1 = llvm_new_basic_block();
613  llvm::BasicBlock * block_c0neg1_exit = llvm_new_basic_block();
614  llvm_cond_branch( c0_negativeP , block_c0neg1 , block_c0neg1_exit );
615  llvm_set_insert_point( block_c0neg1 );
616 
617  // if( c0_negativeP ) {
618  // f_site[0] = conj(f_site[0]);
619  //f_site_im[0] *= jit_constant(-1.0);
620  f0_j.elem().elem().imag() *= jit_constant(-1.0);
621 
622  //f_site[1] = -conj(f_site[1]);
623  //f_site_re[1] *= jit_constant(-1.0);
624  f1_j.elem().elem().real() *= jit_constant(-1.0);
625 
626  //f_site[2] = conj(f_site[2]);
627  //f_site_im[2] *= jit_constant(-1.0);
628  f2_j.elem().elem().imag() *= jit_constant(-1.0);
629 
630  llvm_branch( block_c0neg1_exit );
631 
632  llvm_set_insert_point( block_c0neg1_exit );
633 
634  //}
635 
636  } // End of if( corner_caseP ) else {}
637 
638  llvm_branch( block_exit );
639  llvm_set_insert_point( block_exit );
640 
641  return jit_function_epilogue_get_cuf("jit_get_fs_bs.ptx" , __PRETTY_FUNCTION__ );
642 }
643 
644 
645 #endif
646 #endif
unsigned j
Definition: ldumul_w.cc:35
unsigned i
Definition: ldumul_w.cc:34
static multi1d< LatticeColorMatrix > u
multi1d< LatticeColorMatrix > Q
Definition: t_clover.cc:12
func
Definition: t_preccfz.cc:17