CHROMA
stout_utils_llvm.cc
Go to the documentation of this file.
1 #include "qdp.h"
2 
3 using namespace QDP;
4 
5 
6 #ifdef QDP_IS_QDPJIT
7 #ifndef QDPJIT_IS_QDPJITPTX
8 #ifndef QDPJIT_IS_QDPJITNVVM
9 
10 #warning "Using QDP-JIT/LLVM stouting routines"
11 
12 void function_get_fs_bs_exec(const JitFunction& function,
13  const LatticeColorMatrix& Q,
14  const LatticeColorMatrix& QQ,
15  multi1d<LatticeComplex>& f,
16  multi1d<LatticeComplex>& b1,
17  multi1d<LatticeComplex>& b2,
18  bool dobs)
19 {
20  AddressLeaf addr_leaf(all);
21 
22  addr_leaf.setLit( dobs );
23 
24  int junk_0 = forEach(Q, addr_leaf, NullCombine());
25  int junk_1 = forEach(QQ, addr_leaf, NullCombine());
26  int junk_2 = forEach(f[0], addr_leaf, NullCombine());
27  int junk_3 = forEach(f[1], addr_leaf, NullCombine());
28  int junk_4 = forEach(f[2], addr_leaf, NullCombine());
29  int junk_5 = forEach(b1[0], addr_leaf, NullCombine());
30  int junk_6 = forEach(b1[1], addr_leaf, NullCombine());
31  int junk_7 = forEach(b1[2], addr_leaf, NullCombine());
32  int junk_8 = forEach(b2[0], addr_leaf, NullCombine());
33  int junk_9 = forEach(b2[1], addr_leaf, NullCombine());
34  int junk_10= forEach(b2[2], addr_leaf, NullCombine());
35 
36  //QDPIO::cerr << "calling getFsBs\n";
37 
38  jit_dispatch(function.func().at(0),Layout::sitesOnNode(),getDataLayoutInnerSize(),true,0,addr_leaf);
39 }
40 
41 
42 
43 WordREG<REAL> jit_constant( double f )
44 {
45  return WordREG<REAL>(f);
46 }
47 
48 
49 void function_get_fs_bs_build( JitFunction& func,
50  const LatticeColorMatrix& Q,
51  const LatticeColorMatrix& QQ,
52  multi1d<LatticeComplex>& f,
53  multi1d<LatticeComplex>& b1,
54  multi1d<LatticeComplex>& b2)
55 {
56  //std::cout << __PRETTY_FUNCTION__ << ": entering\n";
57 
58  JitMainLoop loop;
59 
60  ParamRef p_dobs = llvm_add_param<bool>();
61 
62  ParamLeaf param_leaf;
63 
64  typedef typename LeafFunctor<LatticeColorMatrix, ParamLeaf>::Type_t LCMJIT;
65  typedef typename LeafFunctor<LatticeComplex , ParamLeaf>::Type_t LCJIT;
66 
67  LCMJIT Q_jit(forEach(Q, param_leaf, TreeCombine()));
68  LCMJIT QQ_jit(forEach(QQ, param_leaf, TreeCombine()));
69  LCJIT f0_jit(forEach(f[0], param_leaf, TreeCombine()));
70  LCJIT f1_jit(forEach(f[1], param_leaf, TreeCombine()));
71  LCJIT f2_jit(forEach(f[2], param_leaf, TreeCombine()));
72  LCJIT b10_jit(forEach(b1[0], param_leaf, TreeCombine()));
73  LCJIT b11_jit(forEach(b1[1], param_leaf, TreeCombine()));
74  LCJIT b12_jit(forEach(b1[2], param_leaf, TreeCombine()));
75  LCJIT b20_jit(forEach(b2[0], param_leaf, TreeCombine()));
76  LCJIT b21_jit(forEach(b2[1], param_leaf, TreeCombine()));
77  LCJIT b22_jit(forEach(b2[2], param_leaf, TreeCombine()));
78 
79  llvm::Value* r_dobs = llvm_derefParam( p_dobs );
80 
81  IndexDomainVector idx = loop.getIdx();
82 
83  typename LCMJIT::Subtype_t& Q_j = Q_jit.elem(JitDeviceLayout::LayoutCoalesced,idx);
84  typename LCMJIT::Subtype_t& QQ_j = QQ_jit.elem(JitDeviceLayout::LayoutCoalesced,idx);
85 
86  typename LCJIT::Subtype_t& f0_j = f0_jit.elem(JitDeviceLayout::LayoutCoalesced,idx);
87  typename LCJIT::Subtype_t& f1_j = f1_jit.elem(JitDeviceLayout::LayoutCoalesced,idx);
88  typename LCJIT::Subtype_t& f2_j = f2_jit.elem(JitDeviceLayout::LayoutCoalesced,idx);
89 
90  typename LCJIT::Subtype_t& b10_j = b10_jit.elem(JitDeviceLayout::LayoutCoalesced,idx);
91  typename LCJIT::Subtype_t& b11_j = b11_jit.elem(JitDeviceLayout::LayoutCoalesced,idx);
92  typename LCJIT::Subtype_t& b12_j = b12_jit.elem(JitDeviceLayout::LayoutCoalesced,idx);
93  typename LCJIT::Subtype_t& b20_j = b20_jit.elem(JitDeviceLayout::LayoutCoalesced,idx);
94  typename LCJIT::Subtype_t& b21_j = b21_jit.elem(JitDeviceLayout::LayoutCoalesced,idx);
95  typename LCJIT::Subtype_t& b22_j = b22_jit.elem(JitDeviceLayout::LayoutCoalesced,idx);
96 
97  llvm::BasicBlock * block_exit = llvm_new_basic_block();
98  {
99  // Get the traces
100  PColorMatrixREG< RComplexREG< WordREG<REAL> >, Nc> Q_site = Q_j.elem();
101  PColorMatrixREG< RComplexREG< WordREG<REAL> >, Nc> QQ_site = QQ_j.elem();
102  PColorMatrixREG< RComplexREG< WordREG<REAL> >, Nc> QQQ = QQ_site*Q_site;
103 
104  // Real trQQQ;
105  // trQQQ.elem() = realTrace(QQQ);
106  // Real trQQ;
107  // trQQ.elem() = realTrace(QQ_site);
108 
109  // auto trQQQ = realTrace(QQQ);
110  // auto trQQ = realTrace(QQ_site);
111 
112  PScalarREG< RScalarREG< WordREG<REAL> > > trQQQ = realTrace(QQQ);
113  PScalarREG< RScalarREG< WordREG<REAL> > > trQQ = realTrace(QQ_site);
114 
115  WordREG<REAL> c0 = jit_constant((REAL)1/(REAL)3) * trQQQ.elem().elem(); // eq 13
116  WordREG<REAL> c1 = jit_constant((REAL)1/(REAL)2) * trQQ.elem().elem(); // eq 15
117 
118  llvm::BasicBlock * block_c1_lt = llvm_new_basic_block();
119  llvm::BasicBlock * block_c1_nlt = llvm_new_basic_block();
120  llvm_cond_branch( llvm_lt( c1.get_val() , llvm_create_value( 4.0e-3 ) ) , block_c1_lt , block_c1_nlt );
121 
122  llvm_set_insert_point( block_c1_lt );
123  { // if( c1 < 4.0e-3 )
124  f0_j.elem().elem().real() = jit_constant(1.0) - c0 * c0 / jit_constant(720.0);
125  f0_j.elem().elem().imag() = -( c0 / jit_constant(6.0) )*( jit_constant(1.0) -(c1/jit_constant(20.0))*(jit_constant(1.0)-(c1/jit_constant(42.0)))) ;
126 
127  f1_j.elem().elem().real() = c0/jit_constant(24.0)*(jit_constant(1.0)-c1/jit_constant(15.0)*(jit_constant(1.0)-jit_constant(3.0)*c1/jit_constant(112.0))) ;
128  f1_j.elem().elem().imag() = jit_constant(1.0)-c1/jit_constant(6.0)*(jit_constant(1.0)-c1/jit_constant(20.0)*(jit_constant(1.0)-c1/jit_constant(42.0)))-c0*c0/jit_constant(5040.0);
129 
130  f2_j.elem().elem().real() = jit_constant(0.5)*(jit_constant(-1.0)+c1/jit_constant(12.0)*(jit_constant(1.0)-c1/jit_constant(30.0)*(jit_constant(1.0)-c1/jit_constant(56.0)))+c0*c0/jit_constant(20160.0));
131  f2_j.elem().elem().imag() = jit_constant(0.5)*(c0/jit_constant(60.0)*(jit_constant(1.0)-c1/jit_constant(21.0)*(jit_constant(1.0)-c1/jit_constant(48.0))));
132 
133 
134  llvm::BasicBlock * block_dobs_0 = llvm_new_basic_block();
135  llvm_cond_branch( r_dobs , block_dobs_0 , block_exit );
136 
137  { // dobs
138  llvm_set_insert_point( block_dobs_0 );
139 
140  b20_j.elem().elem().real() = -c0/jit_constant(360.0);
141  b20_j.elem().elem().imag() = -jit_constant(1.0/6.0)*(jit_constant(1.0)-(c1/jit_constant(20.0))*(jit_constant(1.0)-c1/jit_constant(42.0)));
142 
143  // partial f0 / partial c1
144  //
145  b10_j.elem().elem().real() = jit_constant(0);
146  b10_j.elem().elem().imag() = (c0/jit_constant(120.0))*(jit_constant(1.0)-c1/jit_constant(21.0));
147 
148  // partial f1 / partial c0
149  //
150  b21_j.elem().elem().real() = jit_constant(1.0/24.0)*(jit_constant(1.0)-c1/jit_constant(15.0)*(jit_constant(1.0)-jit_constant(3.0)*c1/jit_constant(112.0)));
151  b21_j.elem().elem().imag() = -c0/jit_constant(2520.0);
152 
153 
154  // partial f1 / partial c1
155  b11_j.elem().elem().real() = -c0/jit_constant(360.0)*(jit_constant(1.0) - jit_constant(3.0)*c1/jit_constant(56.0) );
156  b11_j.elem().elem().imag() = -jit_constant(1.0/6.0)*(jit_constant(1.0)-c1/jit_constant(10.0)*(jit_constant(1.0)-c1/jit_constant(28.0)));
157 
158  // partial f2/ partial c0
159  b22_j.elem().elem().real() = jit_constant(0.5)*c0/jit_constant(10080.0);
160  b22_j.elem().elem().imag() = jit_constant(0.5)*( jit_constant(1.0/60.0)*(jit_constant(1.0)-c1/jit_constant(21.0)*(jit_constant(1.0)-c1/jit_constant(48.0))) );
161 
162  // partial f2/ partial c1
163  b12_j.elem().elem().real() = jit_constant(0.5)*( jit_constant(1.0/12.0)*(jit_constant(1.0)-(jit_constant(2.0)*c1/jit_constant(30.0))*(jit_constant(1.0)-jit_constant(3.0)*c1/jit_constant(112.0))) );
164  b12_j.elem().elem().imag() = jit_constant(0.5)*( -c0/jit_constant(1260.0)*(jit_constant(1.0)-c1/jit_constant(24.0)) );
165 
166  llvm_branch( block_exit );
167 
168  } // Dobs==true
169 
170  } // if (c1 < 4.0e-3 )
171 
172  llvm_set_insert_point( block_c1_nlt );
173 
174 
175  llvm::Value* c0_negativeP = llvm_lt( c0.get_val() , llvm_create_value(0.0) );
176  WordREG<REAL> c0abs = fabs(c0);
177  WordREG<REAL> c0max = jit_constant(2.0) * pow( c1 / jit_constant(3.0) , jit_constant(1.5) );
178  WordREG<REAL> theta;
179  WordREG<REAL> eps = (c0max - c0abs)/c0max;
180 
181 
182  // llvm::BasicBlock * block_dobs_0 = llvm_new_basic_block();
183  // llvm_cond_branch( r_dobs , block_dobs_0 , block_exit );
184 
185  llvm::Value* theta_phi0;
186  llvm::Value* theta_phi1;
187  llvm::Value* theta_phi2;
188 
189  llvm::BasicBlock * block_eps_0 = llvm_new_basic_block();
190  llvm::BasicBlock * block_eps_1 = llvm_new_basic_block();
191  llvm::BasicBlock * block_eps_2 = llvm_new_basic_block();
192  llvm::BasicBlock * block_eps_3 = llvm_new_basic_block();
193  llvm::BasicBlock * block_eps_exit = llvm_new_basic_block();
194 
195  llvm_cond_branch( llvm_lt( eps.get_val() , llvm_create_value(0.0) ) , block_eps_0 , block_eps_1 );
196 
197  llvm_set_insert_point( block_eps_0 ); // epsilon < 0
198  theta_phi0 = llvm_create_value(0.0);
199  llvm_branch(block_eps_exit);
200 
201  llvm_set_insert_point( block_eps_1 );
202  llvm_cond_branch( llvm_lt( eps.get_val() , llvm_create_value(1.0e-3) ) , block_eps_2 , block_eps_3 );
203 
204  llvm_set_insert_point( block_eps_2 ); // epsilon < 1.e-3
205  WordREG<REAL> sqtwo = sqrt( jit_constant(2.0) );
206  WordREG<REAL> theta_tmp;
207  theta_tmp =
208  sqtwo *
209  sqrt(eps) *
210  ( jit_constant(1.0) +
211  ( jit_constant(1/(REAL)12) +
212  ( jit_constant(3/(REAL)160) +
213  ( jit_constant(5/(REAL)896) +
214  ( jit_constant(35/(REAL)18432) +
215  jit_constant(63/(REAL)90112) * eps ) *
216  eps) *
217  eps) *
218  eps) *
219  eps);
220  theta_phi1 = theta_tmp.get_val();
221  llvm_branch(block_eps_exit);
222 
223  llvm_set_insert_point( block_eps_3 ); // else
224  WordREG<REAL> theta_tmp2;
225  theta_tmp2 = acos( c0abs/c0max );
226  theta_phi2 = theta_tmp2.get_val();
227  llvm_branch(block_eps_exit);
228 
229  llvm_set_insert_point( block_eps_exit ); // exit
230 
231  llvm::PHINode* theta_phi = llvm_phi( llvm_type<REAL>::value , 3 );
232  theta_phi->addIncoming( theta_phi0 , block_eps_0 );
233  theta_phi->addIncoming( theta_phi1 , block_eps_2 );
234  theta_phi->addIncoming( theta_phi2 , block_eps_3 );
235 
236  theta.setup( theta_phi );
237 
238 
239 
240  multi1d<WordREG<REAL> > f_site_re(3);
241  multi1d<WordREG<REAL> > f_site_im(3);
242 
243  multi1d<WordREG<REAL> > b1_site_re_phi0(3);
244  multi1d<WordREG<REAL> > b1_site_im_phi0(3);
245 
246  multi1d<WordREG<REAL> > b2_site_re_phi0(3);
247  multi1d<WordREG<REAL> > b2_site_im_phi0(3);
248 
249 
250 
251  WordREG<REAL> u = sqrt(c1/jit_constant(3.0))*cos(theta/jit_constant(3.0));
252  WordREG<REAL> w = sqrt(c1)*sin(theta/jit_constant(3.0));
253 
254  WordREG<REAL> u_sq = u*u;
255  WordREG<REAL> w_sq = w*w;
256 
257  WordREG<REAL> xi0,xi1;
258 
259  llvm::Value *w_smallP = llvm_lt( (fabs( w )).get_val() , llvm_create_value( 0.05 ) );
260  llvm::BasicBlock * block_xi0_exit = llvm_new_basic_block();
261  { // xi0
262  llvm::BasicBlock * block_xi0_small = llvm_new_basic_block();
263  llvm::BasicBlock * block_xi0_not_small = llvm_new_basic_block();
264  llvm::Value* xi0_phi0;
265  llvm::Value* xi0_phi1;
266 
267  llvm_cond_branch( w_smallP , block_xi0_small , block_xi0_not_small );
268 
269  llvm_set_insert_point( block_xi0_small );
270  WordREG<REAL> xi0_tmp0 =
271  jit_constant(1.0) -
272  (jit_constant(1.0/6.0)*w_sq*( jit_constant(1.0) -
273  (jit_constant(1.0/20.)*w_sq*( jit_constant(1.0) -
274  (jit_constant(1.0/42.0)*w_sq ) ))));
275  xi0_phi0 = xi0_tmp0.get_val();
276 
277  llvm_branch( block_xi0_exit );
278 
279  llvm_set_insert_point( block_xi0_not_small );
280  WordREG<REAL> xi0_tmp1 = sin(w)/w;
281  xi0_phi1 = xi0_tmp1.get_val();
282  llvm_branch( block_xi0_exit );
283 
284  llvm_set_insert_point( block_xi0_exit );
285 
286  llvm::PHINode* xi0_phi = llvm_phi( llvm_type<REAL>::value , 2 );
287  xi0_phi->addIncoming( xi0_phi0 , block_xi0_small );
288  xi0_phi->addIncoming( xi0_phi1 , block_xi0_not_small );
289  xi0.setup( xi0_phi );
290  } // xi0
291 
292  llvm::BasicBlock * block_dobs1 = llvm_new_basic_block();
293  llvm::BasicBlock * block_dobs1_exit = llvm_new_basic_block();
294  llvm_cond_branch( r_dobs , block_dobs1 , block_dobs1_exit );
295  llvm_set_insert_point( block_dobs1 );
296  //llvm::BasicBlock * block_xi1_exit = llvm_new_basic_block();
297  llvm::BasicBlock * block_xi1_small = llvm_new_basic_block();
298  llvm::BasicBlock * block_xi1_not_small = llvm_new_basic_block();
299  llvm::Value* xi1_phi0;
300  llvm::Value* xi1_phi1;
301  { // xi1
302 
303  llvm_cond_branch( w_smallP , block_xi1_small , block_xi1_not_small );
304 
305  llvm_set_insert_point( block_xi1_small );
306  WordREG<REAL> xi1_tmp0 =
307  jit_constant(-1.0)*
308  ( jit_constant((REAL)1/(REAL)3) -
309  jit_constant((REAL)1/(REAL)30)*w_sq*( jit_constant((REAL)1) -
310  jit_constant((REAL)1/(REAL)28)*w_sq*( jit_constant((REAL)1) -
311  jit_constant((REAL)1/(REAL)54)*w_sq ) ) );
312  xi1_phi0 = xi1_tmp0.get_val();
313  //llvm_branch( block_xi1_exit );
314  llvm_branch( block_dobs1_exit );
315 
316  llvm_set_insert_point( block_xi1_not_small );
317  WordREG<REAL> xi1_tmp1 = cos(w)/w_sq - sin(w)/(w_sq*w);
318  xi1_phi1 = xi1_tmp1.get_val();
319  //llvm_branch( block_xi1_exit );
320  llvm_branch( block_dobs1_exit );
321 
322  //llvm_set_insert_point( block_xi1_exit );
323 
324  //llvm_branch( block_dobs1_exit );
325  } // xi1
326  llvm_set_insert_point( block_dobs1_exit );
327 
328  llvm::PHINode* xi1_phi = llvm_phi( llvm_type<REAL>::value , 3 );
329  xi1_phi->addIncoming( xi1_phi0 , block_xi1_small );
330  xi1_phi->addIncoming( xi1_phi1 , block_xi1_not_small );
331  xi1_phi->addIncoming( llvm_cast( llvm_type<REAL>::value , llvm_create_value(0.0) ) , block_xi0_exit );
332  xi1.setup( xi1_phi );
333 
334  WordREG<REAL> cosu = cos(u);
335  WordREG<REAL> sinu = sin(u);
336  WordREG<REAL> cosw = cos(w);
337  WordREG<REAL> sinw = sin(w);
338  WordREG<REAL> sin2u = sin(jit_constant(2.0)*u);
339  WordREG<REAL> cos2u = cos(jit_constant(2.0)*u);
340  WordREG<REAL> ucosu = u*cosu;
341  WordREG<REAL> usinu = u*sinu;
342  WordREG<REAL> ucos2u = u*cos2u;
343  WordREG<REAL> usin2u = u*sin2u;
344 
345  WordREG<REAL> denum = jit_constant(9.0) * u_sq - w_sq;
346 
347 
348  {
349  WordREG<REAL> subexp1 = u_sq - w_sq;
350  WordREG<REAL> subexp2 = jit_constant(8.0)*u_sq*cosw;
351  WordREG<REAL> subexp3 = (jit_constant(3.0)*u_sq + w_sq)*xi0;
352 
353  f_site_re[0] = ( (subexp1)*cos2u + cosu*subexp2 + jit_constant(2.0)*usinu*subexp3 ) / denum ;
354  f_site_im[0] = ( (subexp1)*sin2u - sinu*subexp2 + jit_constant(2.0)*ucosu*subexp3 ) / denum ;
355  }
356 
357  {
358  WordREG<REAL> subexp = (jit_constant(3.0)*u_sq -w_sq)*xi0;
359 
360  f_site_re[1] = (jit_constant(2.0)*(ucos2u - ucosu*cosw)+subexp*sinu)/denum;
361  f_site_im[1] = (jit_constant(2.0)*(usin2u + usinu*cosw)+subexp*cosu)/denum;
362  }
363 
364 
365  {
366  WordREG<REAL> subexp=jit_constant(3.0)*xi0;
367 
368  f_site_re[2] = (cos2u - cosu*cosw -usinu*subexp) /denum ;
369  f_site_im[2] = (sin2u + sinu*cosw -ucosu*subexp) /denum ;
370  }
371 
372  llvm::BasicBlock * block_dobs2 = llvm_new_basic_block();
373  llvm::BasicBlock * block_dobs2_exit = llvm_new_basic_block();
374  llvm_cond_branch( r_dobs , block_dobs2 , block_dobs2_exit );
375  llvm_set_insert_point( block_dobs2 );
376  //if( dobs == true )
377  {
378  multi1d<WordREG<REAL> > r_1_re(3);
379  multi1d<WordREG<REAL> > r_1_im(3);
380  multi1d<WordREG<REAL> > r_2_re(3);
381  multi1d<WordREG<REAL> > r_2_im(3);
382 
383  // r_1[0]=Double(2)*cmplx(u, u_sq-w_sq)*exp2iu
384  // + 2.0*expmiu*( cmplx(8.0*u*cosw, -4.0*u_sq*cosw)
385  // + cmplx(u*(3.0*u_sq+w_sq),9.0*u_sq+w_sq)*xi0 );
386  {
387  WordREG<REAL> subexp1 = u_sq - w_sq;
388  WordREG<REAL> subexp2 = jit_constant(8.0)*cosw + (jit_constant(3.0)*u_sq + w_sq)*xi0 ;
389  WordREG<REAL> subexp3 = jit_constant(4.0)*u_sq*cosw - (jit_constant(9.0)*u_sq + w_sq)*xi0 ;
390 
391  r_1_re[0] = jit_constant(2.0)*(ucos2u - sin2u *(subexp1)+ucosu*( subexp2 )- sinu*( subexp3 ) );
392  r_1_im[0] = jit_constant(2.0)*(usin2u + cos2u *(subexp1)-usinu*( subexp2 )- cosu*( subexp3 ) );
393  }
394 
395  // r_1[1]=cmplx(2.0, 4.0*u)*exp2iu + expmiu*cmplx(-2.0*cosw-(w_sq-3.0*u_sq)*xi0,2.0*u*cosw+6.0*u*xi0);
396  {
397  WordREG<REAL> subexp1 = cosw + jit_constant(3.0) * xi0;
398  WordREG<REAL> subexp2 = jit_constant(2.0)*cosw + xi0*(w_sq - jit_constant(3.0)*u_sq);
399 
400  r_1_re[1] = jit_constant(2.0)*((cos2u - jit_constant(2.0)*usin2u) + usinu*( subexp1 )) - cosu*( subexp2 );
401  r_1_im[1] = jit_constant(2.0)*((sin2u + jit_constant(2.0)*ucos2u) + ucosu*( subexp1 )) + sinu*( subexp2 );
402  }
403 
404 
405  // r_1[2]=2.0*timesI(exp2iu) +expmiu*cmplx(-3.0*u*xi0, cosw-3*xi0);
406  {
407  WordREG<REAL> subexp = cosw - jit_constant(3.0)*xi0;
408  r_1_re[2] = -jit_constant(2.0)*sin2u -jit_constant(3.0)*ucosu*xi0 + sinu*( subexp );
409  r_1_im[2] = jit_constant(2.0)*cos2u +jit_constant(3.0)*usinu*xi0 + cosu*( subexp );
410  }
411 
412 
413  //r_2[0]=-2.0*exp2iu + 2*cmplx(0,u)*expmiu*cmplx(cosw+xi0+3*u_sq*xi1,
414  // 4*u*xi0);
415  {
416  WordREG<REAL> subexp = cosw + xi0 + jit_constant(3.0)*u_sq*xi1;
417  r_2_re[0] = -jit_constant(2.0)*(cos2u + u*( jit_constant(4.0)*ucosu*xi0 - sinu*(subexp )) );
418  r_2_im[0] = -jit_constant(2.0)*(sin2u - u*( jit_constant(4.0)*usinu*xi0 + cosu*(subexp )) );
419  }
420 
421 
422  // r_2[1]= expmiu*cmplx(cosw+xi0-3.0*u_sq*xi1, 2.0*u*xi0);
423  // r_2[1] = timesMinusI(r_2[1]);
424  {
425  WordREG<REAL> subexp = cosw + xi0 - jit_constant(3.0)*u_sq*xi1;
426  r_2_re[1] = jit_constant(2.0)*ucosu*xi0 - sinu*( subexp ) ;
427  r_2_im[1] = jit_constant(-2.0)*usinu*xi0 - cosu*( subexp ) ;
428  }
429 
430  //r_2[2]=expmiu*cmplx(xi0, -3.0*u*xi1);
431  {
432  WordREG<REAL> subexp = jit_constant(3.0)*xi1;
433 
434  r_2_re[2] = cosu*xi0 - usinu*subexp ;
435  r_2_im[2] = -( sinu*xi0 + ucosu*subexp ) ;
436  }
437 
438  WordREG<REAL> b_denum=jit_constant(2.0)*denum*denum;
439 
440 
441  for(int j=0; j < 3; j++) {
442 
443  {
444  WordREG<REAL> subexp1 = jit_constant(2.0)*u;
445  WordREG<REAL> subexp2 = jit_constant(3.0)*u_sq - w_sq;
446  WordREG<REAL> subexp3 = jit_constant(2.0)*(jit_constant(15.0)*u_sq + w_sq);
447 
448  b1_site_re_phi0[j]=( subexp1*r_1_re[j] + subexp2*r_2_re[j] - subexp3*f_site_re[j] )/b_denum;
449  b1_site_im_phi0[j]=( subexp1*r_1_im[j] + subexp2*r_2_im[j] - subexp3*f_site_im[j] )/b_denum;
450  }
451 
452  {
453  WordREG<REAL> subexp1 = jit_constant(3.0)*u;
454  WordREG<REAL> subexp2 = jit_constant(24.0)*u;
455 
456  b2_site_re_phi0[j]=( r_1_re[j]- subexp1*r_2_re[j] - subexp2 * f_site_re[j] )/b_denum;
457  b2_site_im_phi0[j]=( r_1_im[j] -subexp1*r_2_im[j] - subexp2 * f_site_im[j] )/b_denum;
458  }
459  }
460 
461 
462  // Now flip the coefficients of the b-s
463 
464  llvm::BasicBlock * block_c0neg = llvm_new_basic_block();
465  llvm::BasicBlock * block_c0neg_exit = llvm_new_basic_block();
466  llvm_cond_branch( c0_negativeP , block_c0neg , block_c0neg_exit );
467  llvm_set_insert_point( block_c0neg );
468 
469  multi1d<WordREG<REAL> > b1_site_re_phi1(3);
470  multi1d<WordREG<REAL> > b1_site_im_phi1(3);
471 
472  multi1d<WordREG<REAL> > b2_site_re_phi1(3);
473  multi1d<WordREG<REAL> > b2_site_im_phi1(3);
474 
475  //if( c0_negativeP )
476  {
477  //b1_site[0] = conj(b1_site[0]);
478  b1_site_im_phi1[0] = b1_site_im_phi0[0] * jit_constant(-1.0);
479 
480  //b1_site[1] = -conj(b1_site[1]);
481  b1_site_re_phi1[1] = b1_site_re_phi0[1] * jit_constant(-1.0);
482 
483  //b1_site[2] = conj(b1_site[2]);
484  b1_site_im_phi1[2] = b1_site_im_phi0[2] * jit_constant(-1.0);
485 
486  //b2_site[0] = -conj(b2_site[0]);
487  b2_site_re_phi1[0] = b2_site_re_phi0[0] * jit_constant(-1.0);
488 
489  //b2_site[1] = conj(b2_site[1]);
490  b2_site_im_phi1[1] = b2_site_im_phi0[1] * jit_constant(-1.0);
491 
492  //b2_site[2] = -conj(b2_site[2]);
493  b2_site_re_phi1[2] = b2_site_re_phi0[2] * jit_constant(-1.0);
494  }
495  llvm_branch( block_c0neg_exit );
496 
497  llvm_set_insert_point( block_c0neg_exit );
498 
499  //
500  // Now, PHI' them together
501  //
502  multi1d<WordREG<REAL> > f_site_re(3);
503  multi1d<WordREG<REAL> > f_site_im(3);
504 
505  multi1d<WordREG<REAL> > b1_site_re(3);
506  multi1d<WordREG<REAL> > b1_site_im(3);
507 
508  multi1d<WordREG<REAL> > b2_site_re(3);
509  multi1d<WordREG<REAL> > b2_site_im(3);
510 
511  b1_site_im[1] = b1_site_im_phi0[1];
512  b1_site_re[0] = b1_site_re_phi0[0];
513  b1_site_re[2] = b1_site_re_phi0[2];
514 
515  b2_site_re[1] = b2_site_re_phi0[1];
516  b2_site_im[0] = b2_site_im_phi0[0];
517  b2_site_im[2] = b2_site_im_phi0[2];
518 
519 
520  qdpPHI( b1_site_im[0] , b1_site_im_phi1[0] , block_c0neg ,
521  b1_site_im_phi0[0] , block_dobs2 );
522  qdpPHI( b1_site_re[1] , b1_site_re_phi1[1] , block_c0neg ,
523  b1_site_re_phi0[1] , block_dobs2 );
524  qdpPHI( b1_site_im[2] , b1_site_im_phi1[2] , block_c0neg ,
525  b1_site_im_phi0[2] , block_dobs2 );
526  qdpPHI( b2_site_re[0] , b2_site_re_phi1[0] , block_c0neg ,
527  b2_site_re_phi0[0] , block_dobs2 );
528  qdpPHI( b2_site_im[1] , b2_site_im_phi1[1] , block_c0neg ,
529  b2_site_im_phi0[1] , block_dobs2 );
530  qdpPHI( b2_site_re[2] , b2_site_re_phi1[2] , block_c0neg ,
531  b2_site_re_phi0[2] , block_dobs2 );
532 
533 
534  // Load back into the lattice sized object
535  //for(int j=0; j < 3; j++) {
536 
537  b10_j.elem().elem().real() = b1_site_re[0];
538  b10_j.elem().elem().imag() = b1_site_im[0];
539 
540  b20_j.elem().elem().real() = b2_site_re[0]; //ok
541  b20_j.elem().elem().imag() = b2_site_im[0]; //ok
542 
543  b11_j.elem().elem().real() = b1_site_re[1]; //ok
544  b11_j.elem().elem().imag() = b1_site_im[1]; //ok
545 
546  b21_j.elem().elem().real() = b2_site_re[1];
547  b21_j.elem().elem().imag() = b2_site_im[1];
548 
549  b12_j.elem().elem().real() = b1_site_re[2];
550  b12_j.elem().elem().imag() = b1_site_im[2];
551 
552  b22_j.elem().elem().real() = b2_site_re[2];
553  b22_j.elem().elem().imag() = b2_site_im[2];
554 
555  llvm_branch( block_dobs2_exit );
556  //}
557  } // end of if (dobs==true)
558  //llvm_label( cont_6 );
559 
560  llvm_set_insert_point( block_dobs2_exit );
561 
562 
563  // Now when everything is done flip signs of the b-s (can't do this before
564  // as the unflipped f-s are needed to find the b-s
565 
566  // Load back into the lattice sized object
567  // for(int j=0; j < 3; j++) {
568  f0_j.elem().elem().real() = f_site_re[0];
569  f0_j.elem().elem().imag() = f_site_im[0];
570 
571  f1_j.elem().elem().real() = f_site_re[1];
572  f1_j.elem().elem().imag() = f_site_im[1];
573 
574  f2_j.elem().elem().real() = f_site_re[2];
575  f2_j.elem().elem().imag() = f_site_im[2];
576  //}
577 
578 
579  llvm::BasicBlock * block_c0neg1 = llvm_new_basic_block();
580  llvm::BasicBlock * block_c0neg1_exit = llvm_new_basic_block();
581  llvm_cond_branch( c0_negativeP , block_c0neg1 , block_c0neg1_exit );
582  llvm_set_insert_point( block_c0neg1 );
583 
584  // if( c0_negativeP ) {
585  // f_site[0] = conj(f_site[0]);
586  //f_site_im[0] *= jit_constant(-1.0);
587  f0_j.elem().elem().imag() *= jit_constant(-1.0);
588 
589  //f_site[1] = -conj(f_site[1]);
590  //f_site_re[1] *= jit_constant(-1.0);
591  f1_j.elem().elem().real() *= jit_constant(-1.0);
592 
593  //f_site[2] = conj(f_site[2]);
594  //f_site_im[2] *= jit_constant(-1.0);
595  f2_j.elem().elem().imag() *= jit_constant(-1.0);
596 
597  llvm_branch( block_c0neg1_exit );
598 
599  llvm_set_insert_point( block_c0neg1_exit );
600 
601  //}
602 
603  } // End of if( corner_caseP ) else {}
604 
605  llvm_branch( block_exit );
606  llvm_set_insert_point( block_exit );
607 
608 
609  loop.done();
610 
611  func.func().push_back( jit_function_epilogue_get("jit_get_fs_bs.ptx") );
612 }
613 
614 #endif
615 #endif
616 #endif
unsigned j
Definition: ldumul_w.cc:35
static multi1d< LatticeColorMatrix > u
multi1d< LatticeColorMatrix > Q
Definition: t_clover.cc:12
func
Definition: t_preccfz.cc:17