CHROMA
lwldslash_llvm_w.h
1 // -*- C++ -*-
2 /*! \file
3  * \brief Wilson Dslash linear operator
4  */
5 
6 #ifndef __lwldslash_llvm_h__
7 #define __lwldslash_llvm_h__
8 
9 #include "state.h"
10 #include "io/aniso_io.h"
12 
13 extern "C" {
14 #include "dslash_sig_0.h"
15 #include "dslash_sig_1.h"
16 }
17 
18 namespace Chroma
19 {
20  //! General Wilson-Dirac dslash
21  /*!
22  * \ingroup linop
23  *
24  * DSLASH
25  *
26  * This routine is specific to Wilson fermions!
27  *
28  * Description:
29  *
30  * This routine applies the operator D' to Psi, putting the result in Chi.
31  *
32  *        Nd-1
33  *        ---
34  *        \
35  *   chi(x) :=   >   U_mu(x) (1 - isign gamma_mu) psi(x+mu)
36  *        /
37  *        ---
38  *        mu=0
39  *
40  *        Nd-1
41  *        ---
42  *        \
43  *      +      >   U_mu^dagger(x-mu) (1 + isign gamma_mu) psi(x-mu)
44  *        /
45  *        ---
46  *        mu=0
47  *
48  */
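Restating the ASCII formula above in standard notation (same content as the comment, no new definitions), the operator applied by this class is

\[
\chi(x) \;=\; \sum_{\mu=0}^{N_d-1} \Big[\, U_\mu(x)\,\bigl(1 - \mathrm{isign}\,\gamma_\mu\bigr)\,\psi(x+\hat\mu) \;+\; U_\mu^\dagger(x-\hat\mu)\,\bigl(1 + \mathrm{isign}\,\gamma_\mu\bigr)\,\psi(x-\hat\mu) \Big].
\]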
49 
50  template<typename T, typename P, typename Q>
51  class LLVMWilsonDslashT : public WilsonDslashBase<T, P, Q>
52  {
53  public:
54 
55  //! Empty constructor. Must use create later
56  LLVMWilsonDslashT();
57
58  //! Full constructor
59  LLVMWilsonDslashT(Handle< FermState<T,P,Q> > state);
60
61  //! Full constructor with anisotropy
62  LLVMWilsonDslashT(Handle< FermState<T,P,Q> > state,
63  const AnisoParam_t& aniso_);
64
65  //! Full constructor with general coefficients
66  LLVMWilsonDslashT(Handle< FermState<T,P,Q> > state,
67  const multi1d<Real>& coeffs_);
68
69  //! Creation routine
70  void create(Handle< FermState<T,P,Q> > state);
71
72  //! Creation routine with anisotropy
73  void create(Handle< FermState<T,P,Q> > state,
74  const AnisoParam_t& aniso_);
75
76  //! Creation routine with general coefficients
77  void create(Handle< FermState<T,P,Q> > state,
78  const multi1d<Real>& coeffs_);
79
80  //! No real need for cleanup here
81  ~LLVMWilsonDslashT() {}
82
83  /**
84  * Apply a dslash
85  *
86  * \param chi result (Write)
87  * \param psi source (Read)
88  * \param isign D'^dag or D' ( MINUS | PLUS ) resp. (Read)
89  * \param cb Checkerboard of OUTPUT vector (Read)
90  *
91  * \return The output of applying dslash on psi
92  */
93  void apply (T& chi, const T& psi, enum PlusMinus isign, int cb) const;
94 
95  //! Return the fermion BC object for this linear operator
96  const FermBC<T,P,Q>& getFermBC() const {return *fbc;}
97 
98  protected:
99  //! Get the anisotropy parameters
100  const multi1d<Real>& getCoeffs() const {return coeffs;}
101 
102  private:
103  void comms_setup();
104  void setup();
105  void comms_free();
106  void comms_wait() const;
107  void comms_send_receive(int i) const;
108 
109  multi1d<Real> coeffs; /*!< Nd array of coefficients of terms in the action */
110  Handle< FermBC<T,P,Q> > fbc;
111  Q u;
112 
113  int innerCount[2];
114  int faceCount[2];
115  const int *innerSites[2];
116  const int *faceSites[2];
117 
118  struct comms_t {
119  bool do_comms;
120  int snd_nd;
121  int snd_sz;
122  int rcv_nd;
123  int rcv_sz;
124  };
125  struct comms_t comms[8];
126 
127  double * send_buf[8];
128  double * recv_buf[8];
129  QMP_msgmem_t msg[8][2];
130  QMP_msghandle_t mh_a[8][2], mh[8];
131  QMP_mem_t *send_buf_mem[8];
132  QMP_mem_t *recv_buf_mem[8];
133 
134  int comm_thread;
135  };
136 
137  //! General Wilson-Dirac dslash
138  /*! \ingroup linop
139  * DSLASH
140  *
141  * This routine is specific to Wilson fermions!
142  *
143  * Description:
144  *
145  * This routine applies the operator D' to Psi, putting the result in Chi.
146  *
147  *        Nd-1
148  *        ---
149  *        \
150  *   chi(x) :=   >   U_mu(x) (1 - isign gamma_mu) psi(x+mu)
151  *        /
152  *        ---
153  *        mu=0
154  *
155  *        Nd-1
156  *        ---
157  *        \
158  *      +      >   U_mu^dagger(x-mu) (1 + isign gamma_mu) psi(x-mu)
159  *        /
160  *        ---
161  *        mu=0
162  *
163  */
164 
165  //! Empty constructor
166  template<typename T, typename P, typename Q>
167  LLVMWilsonDslashT<T,P,Q>::LLVMWilsonDslashT() {
168  for (int i=0;i<8;i++)
169  comms[i].do_comms = false;
170  }
171 
172  //! Full constructor
173  template<typename T, typename P, typename Q>
174  LLVMWilsonDslashT<T,P,Q>::LLVMWilsonDslashT(Handle< FermState<T,P,Q> > state)
175  {
176  create(state);
177  }
178 
179  //! Full constructor with anisotropy
180  template<typename T, typename P, typename Q>
181  LLVMWilsonDslashT<T,P,Q>::LLVMWilsonDslashT(Handle< FermState<T,P,Q> > state,
182  const AnisoParam_t& aniso_)
183  {
184  create(state, aniso_);
185  }
186 
187  //! Full constructor with general coefficients
188  template<typename T, typename P, typename Q>
189  LLVMWilsonDslashT<T,P,Q>::LLVMWilsonDslashT(Handle< FermState<T,P,Q> > state,
190  const multi1d<Real>& coeffs_)
191  {
192  create(state, coeffs_);
193  }
194 
195  //! Creation routine
196  template<typename T, typename P, typename Q>
197  void LLVMWilsonDslashT<T,P,Q>::create(Handle< FermState<T,P,Q> > state)
198  {
199  multi1d<Real> cf(Nd);
200  cf = 1.0;
201  create(state, cf);
202  }
203 
204  //! Creation routine with anisotropy
205  template<typename T, typename P, typename Q>
206  void LLVMWilsonDslashT<T,P,Q>::create(Handle< FermState<T,P,Q> > state,
207  const AnisoParam_t& anisoParam)
208  {
209  START_CODE();
210 
211  create(state, makeFermCoeffs(anisoParam));
212 
213  END_CODE();
214  }
215 
216 
217 
218  template<typename T, typename P, typename Q>
219  void LLVMWilsonDslashT<T,P,Q>::comms_setup()
220  {
221  for (int i = 0 ; i < 8 ; i++ )
222  if (comms[i].do_comms)
223  {
224  int dstnum = comms[i].snd_sz;
225  int srcnum = comms[i].rcv_sz;
226  int dstnode = comms[i].snd_nd;
227  int srcnode = comms[i].rcv_nd;
228 
229  send_buf_mem[i] = QMP_allocate_aligned_memory(dstnum,QDP_ALIGNMENT_SIZE, (QMP_MEM_COMMS|QMP_MEM_FAST) ); // packed data to send
230  if( send_buf_mem[i] == 0x0 ) {
231  send_buf_mem[i] = QMP_allocate_aligned_memory(dstnum, QDP_ALIGNMENT_SIZE, QMP_MEM_COMMS);
232  if( send_buf_mem[i] == 0x0 ) {
233  QDP_error_exit("Unable to allocate send_buf_mem\n");
234  }
235  }
236  recv_buf_mem[i] = QMP_allocate_aligned_memory(srcnum,QDP_ALIGNMENT_SIZE, (QMP_MEM_COMMS|QMP_MEM_FAST)); // packed receive data
237  if( recv_buf_mem[i] == 0x0 ) {
238  recv_buf_mem[i] = QMP_allocate_aligned_memory(srcnum, QDP_ALIGNMENT_SIZE, QMP_MEM_COMMS);
239  if( recv_buf_mem[i] == 0x0 ) {
240  QDP_error_exit("Unable to allocate recv_buf_mem\n");
241  }
242  }
243  send_buf[i]=(double*)QMP_get_memory_pointer(send_buf_mem[i]);
244  recv_buf[i]=(double*)QMP_get_memory_pointer(recv_buf_mem[i]);
245 
246 
247  msg[i][0] = QMP_declare_msgmem( recv_buf[i] , srcnum );
248 
249  if( msg[i][0] == (QMP_msgmem_t)NULL ) {
250  QDP_error_exit("QMP_declare_msgmem for msg[0] failed in Map::operator()\n");
251  }
252 
253  msg[i][1] = QMP_declare_msgmem( send_buf[i] , dstnum );
254 
255  if( msg[i][1] == (QMP_msgmem_t)NULL ) {
256  QDP_error_exit("QMP_declare_msgmem for msg[1] failed in Map::operator()\n");
257  }
258 
259  mh_a[i][0] = QMP_declare_receive_from(msg[i][0], srcnode, 0);
260  if( mh_a[i][0] == (QMP_msghandle_t)NULL ) {
261  QDP_error_exit("QMP_declare_receive_from for mh_a[0] failed in Map::operator()\n");
262  }
263 
264  mh_a[i][1] = QMP_declare_send_to(msg[i][1], dstnode , 0);
265  if( mh_a[i][1] == (QMP_msghandle_t)NULL ) {
266  QDP_error_exit("QMP_declare_send_to for mh_a[1] failed in Map::operator()\n");
267  }
268 
269  mh[i] = QMP_declare_multiple(mh_a[i], 2);
270  if( mh[i] == (QMP_msghandle_t)NULL ) {
271  QDP_error_exit("QMP_declare_multiple for mh failed in Map::operator()\n");
272  }
273  }
274  }
275 
276 
277  template<typename T, typename P, typename Q>
278  void LLVMWilsonDslashT<T,P,Q>::comms_send_receive(int i) const {
279  QMP_status_t err;
280  if ((err = QMP_start(mh[i])) != QMP_SUCCESS)
281  QDP_error_exit(QMP_error_string(err));
282  }
283 
284 
285  template<typename T, typename P, typename Q>
286  void LLVMWilsonDslashT<T,P,Q>::comms_wait() const {
287  QMP_status_t err;
288  for (int i = 0 ; i < 8 ; i++ )
289  if (comms[i].do_comms)
290  if ((err = QMP_wait(mh[i])) != QMP_SUCCESS)
291  QDP_error_exit(QMP_error_string(err));
292 
293 #if QDP_DEBUG >= 3
294  QDP_info("Map: calling free msgs");
295 #endif
296  }
297 
298 
299  template<typename T, typename P, typename Q>
300  void LLVMWilsonDslashT<T,P,Q>::comms_free() {
301  for (int i = 0 ; i < 8 ; i++ )
302  if (comms[i].do_comms)
303  {
304  QMP_free_msghandle(mh[i]);
305  // QMP_free_msghandle(mh_a[1]);
306  // QMP_free_msghandle(mh_a[0]);
307  QMP_free_msgmem(msg[i][1]);
308  QMP_free_msgmem(msg[i][0]);
309 
310  QMP_free_memory(recv_buf_mem[i]);
311  QMP_free_memory(send_buf_mem[i]);
312  }
313  }
314 
315 
316  template<typename T, typename P, typename Q>
317  void LLVMWilsonDslashT<T,P,Q>::setup()
318  {
319  //QDPIO::cout << "Setting up LLVM Wilson Dslash\n";
320 
321  // Get communication footprint
322  int offnode_maps=0;
323  int comm_no=-1;
324  for( int dir = 3 ; dir >= 0 ; --dir ) {
325  for( int isign = -1 ; isign <= +1 ; isign += 2 ) {
326  comm_no++;
327 
328  const Map& map = shift.getMap(isign,dir);
329 
330  //QDPIO::cout << "dir = " << dir << " isign = " << isign << " id = " << id << "\n";
331 
332  if (map.hasOffnode())
333  {
334  offnode_maps |= map.getId();
335 
336  int dstnum = shift.getMap(isign,dir).get_destnodes_num()[rb[0].getId()][0]*sizeof(double)*12; // we are sending half-spinors
337  int srcnum = shift.getMap(isign,dir).get_srcenodes_num()[rb[0].getId()][0]*sizeof(double)*12;
338 
339  // QDPIO::cout << "receive buffer size = " << dstnum << "\n";
340  // QDPIO::cout << "send buffer size = " << srcnum << "\n";
341 
342  int dstnode = shift.getMap(isign,dir).get_destnodes()[0];
343  int srcnode = shift.getMap(isign,dir).get_srcenodes()[0];
344 
345  // QDPIO::cout << "send to = " << dstnode << "\n";
346  // QDPIO::cout << "receive from = " << srcnode << "\n";
347 
348  comms[comm_no].do_comms=true;
349  comms[comm_no].snd_nd=dstnode;
350  comms[comm_no].snd_sz=dstnum;
351  comms[comm_no].rcv_nd=srcnode;
352  comms[comm_no].rcv_sz=srcnum;
353  }
354  else
355  {
356  comms[comm_no].do_comms=false;
357  }
358  }
359  }
360 
361  comms_setup();
362 
363  // QDPIO::cout << "comms footprint = " << offnode_maps << "\n";
364 
365  for(int cb=0 ; cb<2 ; ++cb) {
366  if (offnode_maps > 0)
367  {
368  innerCount[cb] = MasterMap::Instance().getCountInner(rb[cb],offnode_maps);
369  faceCount[cb] = MasterMap::Instance().getCountFace(rb[cb],offnode_maps);
370  innerSites[cb] = MasterMap::Instance().getInnerSites(rb[cb],offnode_maps).slice();
371  faceSites[cb] = MasterMap::Instance().getFaceSites(rb[cb],offnode_maps).slice();
372  }
373  else
374  {
375  innerCount[cb] = rb[cb].numSiteTable();
376  faceCount[cb] = 0;
377  innerSites[cb] = rb[cb].siteTable().slice();
378  faceSites[cb] = NULL;
379  }
380  // QDPIO::cout << "inner rb[" << cb << "] = " << innerCount[cb] << " "
381  // << "face rb[" << cb << "] = " << faceCount[cb] << "\n";
382  }
383 
384  comm_thread = omp_get_max_threads() >=4 ? 3 : 0;
385  //QDPIO::cout << "using comm thread " << comm_thread << "\n";
386  } // setup
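The nested loop above visits dir = 3..0 and isign = -1,+1 in a fixed order, so communication slot 0 is (dir=3, isign=-1), slot 1 is (dir=3, isign=+1), and so on up to slot 7 for (dir=0, isign=+1); apply() relies on the same ordering when it indexes send_buf[] and recv_buf[]. A minimal sketch of the equivalent index arithmetic (the helper name is illustrative, not part of this header):

// Illustrative helper, not part of the class: maps (dir, isign) to the
// slot index used for comms[], send_buf[] and recv_buf[] above.
inline int comm_slot(int dir, int isign)
{
    return 2 * (3 - dir) + (isign > 0 ? 1 : 0);   // 0..7
}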
387 
388 
389 
390 
391  //! Creation routine with general coefficients
392  template<typename T, typename P, typename Q>
393  void LLVMWilsonDslashT<T,P,Q>::create(Handle< FermState<T,P,Q> > state,
394  const multi1d<Real>& coeffs_)
395  {
396  // Save a copy of the coefficients (any anisotropy is already folded into coeffs_)
397  coeffs = coeffs_;
398 
399  // Save a copy of the fermbc
400  fbc = state->getFermBC();
401 
402  // Sanity check
403  if (fbc.operator->() == 0)
404  {
405  QDPIO::cerr << "LLVMWilsonDslash: error: fbc is null" << std::endl;
406  QDP_abort(1);
407  }
408 
409  u.resize(Nd);
410 
411  // Copy the gauge links from the state
412  for(int mu=0; mu < u.size(); ++mu) {
413  u[mu] = (state->getLinks())[mu];
414  }
415 
416  // Rescale the u fields by the anisotropy
417  for(int mu=0; mu < u.size(); ++mu)
418  {
419  u[mu] *= coeffs[mu];
420  }
421 
422  setup();
423  }
424 
425 
426 
427 
428  //! General Wilson-Dirac dslash
429  /*! \ingroup linop
430  * Wilson dslash
431  *
432  * Arguments:
433  *
434  * \param chi Result (Write)
435  * \param psi Pseudofermion field (Read)
436  * \param isign D'^dag or D' ( MINUS | PLUS ) resp. (Read)
437  * \param cb Checkerboard of OUTPUT vector (Read)
438  */
439  template<typename T, typename P, typename Q>
440  void
441  LLVMWilsonDslashT<T,P,Q>::apply (T& chi, const T& psi,
442  enum PlusMinus isign, int cb) const
443  {
444  double* psi_ptr = (double*)psi.getFjit();
445  double* chi_ptr = (double*)chi.getFjit();
446  double* u0_ptr = (double*)u[0].getFjit();
447  double* u1_ptr = (double*)u[1].getFjit();
448  double* u2_ptr = (double*)u[2].getFjit();
449  double* u3_ptr = (double*)u[3].getFjit();
450 
451  switch (isign)
452  {
453  case PLUS:
454 #pragma omp parallel default(shared)
455  {
456  int threads_num = omp_get_num_threads();
457  int myId = omp_get_thread_num();
458 
459  if (comms[0].do_comms)
460  {
461  {
462  const Map& map = shift.getMap(-1,3);
463  const int* soffset_slice = map.soffset(rb[cb]).slice();
464  int soffset_num = map.soffset(rb[cb]).size();
465 
466  int low = soffset_num*myId/threads_num;
467  int high = soffset_num*(myId+1)/threads_num;
468 
469  func_gather_M_3_0(low,high,0,true,0,soffset_slice,send_buf[0],psi_ptr,u3_ptr);
470  }
471 
472 #pragma omp barrier
473  if (myId == comm_thread)
474  comms_send_receive(0);
475 
476  {
477  const Map& map = shift.getMap(+1,3);
478  const int* soffset_slice = map.soffset(rb[cb]).slice();
479  int soffset_num = map.soffset(rb[cb]).size();
480 
481  int low = soffset_num*myId/threads_num;
482  int high = soffset_num*(myId+1)/threads_num;
483 
484  func_gather_P_3_0(low,high,0,true,0,soffset_slice,send_buf[1],psi_ptr);
485  }
486 
487 #pragma omp barrier
488  if (myId == comm_thread)
489  comms_send_receive(1);
490  }
491 
492  if (comms[2].do_comms)
493  {
494  {
495  const Map& map = shift.getMap(-1,2);
496  const int* soffset_slice = map.soffset(rb[cb]).slice();
497  int soffset_num = map.soffset(rb[cb]).size();
498 
499  int low = soffset_num*myId/threads_num;
500  int high = soffset_num*(myId+1)/threads_num;
501 
502  func_gather_M_2_0(low,high,0,true,0,soffset_slice,send_buf[2],psi_ptr,u2_ptr);
503  }
504 
505 #pragma omp barrier
506  if (myId == comm_thread)
507  comms_send_receive(2);
508 
509  {
510  const Map& map = shift.getMap(+1,2);
511  const int* soffset_slice = map.soffset(rb[cb]).slice();
512  int soffset_num = map.soffset(rb[cb]).size();
513 
514  int low = soffset_num*myId/threads_num;
515  int high = soffset_num*(myId+1)/threads_num;
516 
517  func_gather_P_2_0(low,high,0,true,0,soffset_slice,send_buf[3],psi_ptr);
518  }
519 
520 #pragma omp barrier
521  if (myId == comm_thread)
522  comms_send_receive(3);
523  }
524 
525  if (comms[4].do_comms)
526  {
527  {
528  const Map& map = shift.getMap(-1,1);
529  const int* soffset_slice = map.soffset(rb[cb]).slice();
530  int soffset_num = map.soffset(rb[cb]).size();
531 
532  int low = soffset_num*myId/threads_num;
533  int high = soffset_num*(myId+1)/threads_num;
534 
535  func_gather_M_1_0(low,high,0,true,0,soffset_slice,send_buf[4],psi_ptr,u1_ptr);
536  }
537 
538 #pragma omp barrier
539  if (myId == comm_thread)
540  comms_send_receive(4);
541 
542  {
543  const Map& map = shift.getMap(+1,1);
544  const int* soffset_slice = map.soffset(rb[cb]).slice();
545  int soffset_num = map.soffset(rb[cb]).size();
546 
547  int low = soffset_num*myId/threads_num;
548  int high = soffset_num*(myId+1)/threads_num;
549 
550  func_gather_P_1_0(low,high,0,true,0,soffset_slice,send_buf[5],psi_ptr);
551  }
552 
553 #pragma omp barrier
554  if (myId == comm_thread)
555  comms_send_receive(5);
556  }
557 
558  if (comms[6].do_comms)
559  {
560  {
561  const Map& map = shift.getMap(-1,0);
562  const int* soffset_slice = map.soffset(rb[cb]).slice();
563  int soffset_num = map.soffset(rb[cb]).size();
564 
565  int low = soffset_num*myId/threads_num;
566  int high = soffset_num*(myId+1)/threads_num;
567 
568  func_gather_M_0_0(low,high,0,true,0,soffset_slice,send_buf[6],psi_ptr,u0_ptr);
569  }
570 
571 #pragma omp barrier
572  if (myId == comm_thread)
573  comms_send_receive(6);
574 
575  {
576  const Map& map = shift.getMap(+1,0);
577  const int* soffset_slice = map.soffset(rb[cb]).slice();
578  int soffset_num = map.soffset(rb[cb]).size();
579 
580  int low = soffset_num*myId/threads_num;
581  int high = soffset_num*(myId+1)/threads_num;
582 
583  func_gather_P_0_0(low,high,0,true,0,soffset_slice,send_buf[7],psi_ptr);
584  }
585 
586 #pragma omp barrier
587  if (myId == comm_thread)
588  comms_send_receive(7);
589  }
590 
591  {
592  int low = innerCount[cb]*myId/threads_num;
593  int high = innerCount[cb]*(myId+1)/threads_num;
594 
595  func_dslash_____0( low , high , 0 , false , 0 , innerSites[cb] , chi_ptr ,
596  shift.getMap(-1,3).goffset(rb[cb]).slice(), NULL, psi_ptr, u3_ptr ,
597  shift.getMap(+1,3).goffset(rb[cb]).slice(), NULL, psi_ptr, u3_ptr ,
598  shift.getMap(-1,2).goffset(rb[cb]).slice(), NULL, psi_ptr, u2_ptr ,
599  shift.getMap(+1,2).goffset(rb[cb]).slice(), NULL, psi_ptr, u2_ptr ,
600  shift.getMap(-1,1).goffset(rb[cb]).slice(), NULL, psi_ptr, u1_ptr ,
601  shift.getMap(+1,1).goffset(rb[cb]).slice(), NULL, psi_ptr, u1_ptr ,
602  shift.getMap(-1,0).goffset(rb[cb]).slice(), NULL, psi_ptr, u0_ptr ,
603  shift.getMap(+1,0).goffset(rb[cb]).slice(), NULL, psi_ptr, u0_ptr );
604  }
605 
606  if (myId == comm_thread)
607  comms_wait();
608 #pragma omp barrier
609 
610  {
611  int low = faceCount[cb]*myId/threads_num;
612  int high = faceCount[cb]*(myId+1)/threads_num;
613 
614  func_dslash_____0( low , high , 0 , false , 0 , faceSites[cb] , chi_ptr ,
615  shift.getMap(-1,3).goffset(rb[cb]).slice(),recv_buf[0],psi_ptr,u3_ptr,
616  shift.getMap(+1,3).goffset(rb[cb]).slice(),recv_buf[1],psi_ptr,u3_ptr,
617  shift.getMap(-1,2).goffset(rb[cb]).slice(),recv_buf[2],psi_ptr,u2_ptr,
618  shift.getMap(+1,2).goffset(rb[cb]).slice(),recv_buf[3],psi_ptr,u2_ptr,
619  shift.getMap(-1,1).goffset(rb[cb]).slice(),recv_buf[4],psi_ptr,u1_ptr,
620  shift.getMap(+1,1).goffset(rb[cb]).slice(),recv_buf[5],psi_ptr,u1_ptr,
621  shift.getMap(-1,0).goffset(rb[cb]).slice(),recv_buf[6],psi_ptr,u0_ptr,
622  shift.getMap(+1,0).goffset(rb[cb]).slice(),recv_buf[7],psi_ptr,u0_ptr);
623 
624  }
625 
626  }
627 
628  break;
629 
630  case MINUS:
631 
632 #pragma omp parallel default(shared)
633  {
634  int threads_num = omp_get_num_threads();
635  int myId = omp_get_thread_num();
636 
637  if (comms[0].do_comms)
638  {
639  {
640  const Map& map = shift.getMap(-1,3);
641  const int* soffset_slice = map.soffset(rb[cb]).slice();
642  int soffset_num = map.soffset(rb[cb]).size();
643 
644  int low = soffset_num*myId/threads_num;
645  int high = soffset_num*(myId+1)/threads_num;
646 
647  func_gather_M_3_1(low,high,0,true,0,soffset_slice,send_buf[0],psi_ptr,u3_ptr);
648  }
649 
650 #pragma omp barrier
651  if (myId == comm_thread)
652  comms_send_receive(0);
653 
654  {
655  const Map& map = shift.getMap(+1,3);
656  const int* soffset_slice = map.soffset(rb[cb]).slice();
657  int soffset_num = map.soffset(rb[cb]).size();
658 
659  int low = soffset_num*myId/threads_num;
660  int high = soffset_num*(myId+1)/threads_num;
661 
662  func_gather_P_3_1(low,high,0,true,0,soffset_slice,send_buf[1],psi_ptr);
663  }
664 
665 #pragma omp barrier
666  if (myId == comm_thread)
667  comms_send_receive(1);
668  }
669 
670  if (comms[2].do_comms)
671  {
672  {
673  const Map& map = shift.getMap(-1,2);
674  const int* soffset_slice = map.soffset(rb[cb]).slice();
675  int soffset_num = map.soffset(rb[cb]).size();
676 
677  int low = soffset_num*myId/threads_num;
678  int high = soffset_num*(myId+1)/threads_num;
679 
680  func_gather_M_2_1(low,high,0,true,0,soffset_slice,send_buf[2],psi_ptr,u2_ptr);
681  }
682 
683 #pragma omp barrier
684  if (myId == comm_thread)
685  comms_send_receive(2);
686 
687  {
688  const Map& map = shift.getMap(+1,2);
689  const int* soffset_slice = map.soffset(rb[cb]).slice();
690  int soffset_num = map.soffset(rb[cb]).size();
691 
692  int low = soffset_num*myId/threads_num;
693  int high = soffset_num*(myId+1)/threads_num;
694 
695  func_gather_P_2_1(low,high,0,true,0,soffset_slice,send_buf[3],psi_ptr);
696  }
697 
698 #pragma omp barrier
699  if (myId == comm_thread)
700  comms_send_receive(3);
701  }
702 
703  if (comms[4].do_comms)
704  {
705  {
706  const Map& map = shift.getMap(-1,1);
707  const int* soffset_slice = map.soffset(rb[cb]).slice();
708  int soffset_num = map.soffset(rb[cb]).size();
709 
710  int low = soffset_num*myId/threads_num;
711  int high = soffset_num*(myId+1)/threads_num;
712 
713  func_gather_M_1_1(low,high,0,true,0,soffset_slice,send_buf[4],psi_ptr,u1_ptr);
714  }
715 
716 #pragma omp barrier
717  if (myId == comm_thread)
718  comms_send_receive(4);
719 
720  {
721  const Map& map = shift.getMap(+1,1);
722  const int* soffset_slice = map.soffset(rb[cb]).slice();
723  int soffset_num = map.soffset(rb[cb]).size();
724 
725  int low = soffset_num*myId/threads_num;
726  int high = soffset_num*(myId+1)/threads_num;
727 
728  func_gather_P_1_1(low,high,0,true,0,soffset_slice,send_buf[5],psi_ptr);
729  }
730 
731 #pragma omp barrier
732  if (myId == comm_thread)
733  comms_send_receive(5);
734  }
735 
736  if (comms[6].do_comms)
737  {
738  {
739  const Map& map = shift.getMap(-1,0);
740  const int* soffset_slice = map.soffset(rb[cb]).slice();
741  int soffset_num = map.soffset(rb[cb]).size();
742 
743  int low = soffset_num*myId/threads_num;
744  int high = soffset_num*(myId+1)/threads_num;
745 
746  func_gather_M_0_1(low,high,0,true,0,soffset_slice,send_buf[6],psi_ptr,u0_ptr);
747  }
748 
749 #pragma omp barrier
750  if (myId == comm_thread)
751  comms_send_receive(6);
752 
753  {
754  const Map& map = shift.getMap(+1,0);
755  const int* soffset_slice = map.soffset(rb[cb]).slice();
756  int soffset_num = map.soffset(rb[cb]).size();
757 
758  int low = soffset_num*myId/threads_num;
759  int high = soffset_num*(myId+1)/threads_num;
760 
761  func_gather_P_0_1(low,high,0,true,0,soffset_slice,send_buf[7],psi_ptr);
762  }
763 
764 #pragma omp barrier
765  if (myId == comm_thread)
766  comms_send_receive(7);
767  }
768 
769  {
770  int low = innerCount[cb]*myId/threads_num;
771  int high = innerCount[cb]*(myId+1)/threads_num;
772 
773  func_dslash_____1( low , high , 0 , false , 0 , innerSites[cb] , chi_ptr ,
774  shift.getMap(-1,3).goffset(rb[cb]).slice(), NULL, psi_ptr, u3_ptr ,
775  shift.getMap(+1,3).goffset(rb[cb]).slice(), NULL, psi_ptr, u3_ptr ,
776  shift.getMap(-1,2).goffset(rb[cb]).slice(), NULL, psi_ptr, u2_ptr ,
777  shift.getMap(+1,2).goffset(rb[cb]).slice(), NULL, psi_ptr, u2_ptr ,
778  shift.getMap(-1,1).goffset(rb[cb]).slice(), NULL, psi_ptr, u1_ptr ,
779  shift.getMap(+1,1).goffset(rb[cb]).slice(), NULL, psi_ptr, u1_ptr ,
780  shift.getMap(-1,0).goffset(rb[cb]).slice(), NULL, psi_ptr, u0_ptr ,
781  shift.getMap(+1,0).goffset(rb[cb]).slice(), NULL, psi_ptr, u0_ptr );
782  }
783 
784  if (myId == comm_thread)
785  comms_wait();
786 #pragma omp barrier
787 
788  {
789  int low = faceCount[cb]*myId/threads_num;
790  int high = faceCount[cb]*(myId+1)/threads_num;
791 
792  func_dslash_____1( low , high , 0 , false , 0 , faceSites[cb] , chi_ptr ,
793  shift.getMap(-1,3).goffset(rb[cb]).slice(),recv_buf[0],psi_ptr,u3_ptr,
794  shift.getMap(+1,3).goffset(rb[cb]).slice(),recv_buf[1],psi_ptr,u3_ptr,
795  shift.getMap(-1,2).goffset(rb[cb]).slice(),recv_buf[2],psi_ptr,u2_ptr,
796  shift.getMap(+1,2).goffset(rb[cb]).slice(),recv_buf[3],psi_ptr,u2_ptr,
797  shift.getMap(-1,1).goffset(rb[cb]).slice(),recv_buf[4],psi_ptr,u1_ptr,
798  shift.getMap(+1,1).goffset(rb[cb]).slice(),recv_buf[5],psi_ptr,u1_ptr,
799  shift.getMap(-1,0).goffset(rb[cb]).slice(),recv_buf[6],psi_ptr,u0_ptr,
800  shift.getMap(+1,0).goffset(rb[cb]).slice(),recv_buf[7],psi_ptr,u0_ptr);
801 
802  }
803 
804  }
805 
806  break;
807  }
808 
809  LLVMWilsonDslashT<T,P,Q>::getFermBC().modifyF(chi, QDP::rb[cb]);
810  }
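Every gather and dslash call above divides its site range over the OpenMP team with the same integer block partition; a stand-alone sketch of that scheme (the function name is illustrative):

// Thread myId of threads_num handles sites [low, high): contiguous blocks
// that cover 0..N-1 exactly once, with block sizes differing by at most one.
static void thread_block(int N, int threads_num, int myId, int& low, int& high)
{
    low  = (N * myId) / threads_num;
    high = (N * (myId + 1)) / threads_num;
}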
811 
812 
813 
814 
815 
816  // typedef LLVMWilsonDslashT<LatticeFermion,
817  // multi1d<LatticeColorMatrix>,
818  // multi1d<LatticeColorMatrix> > LLVMWilsonDslash;
819 
820 
821  // typedef LLVMWilsonDslashT<LatticeFermionF,
822  // multi1d<LatticeColorMatrixF>,
823  // multi1d<LatticeColorMatrixF> > LLVMWilsonDslashF;
824 
825  typedef LLVMWilsonDslashT<LatticeFermionD,
826  multi1d<LatticeColorMatrixD>,
827  multi1d<LatticeColorMatrixD> > LLVMWilsonDslashD;
828 
829 } // End Namespace Chroma
830 
831 
832 #endif
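For reference, a minimal usage sketch of the LLVMWilsonDslashD typedef declared above. The fermion-state handle `state` is assumed to come from the surrounding fermion-action code and is not constructed here:

// Sketch only, not part of the header. Assumes `state` is a valid
// Handle< FermState<LatticeFermionD, multi1d<LatticeColorMatrixD>,
//                   multi1d<LatticeColorMatrixD> > >.
using namespace Chroma;

LLVMWilsonDslashD D;
D.create(state);                 // unit coefficients; other create() overloads take aniso params or coefficients

LatticeFermionD chi, psi;
gaussian(psi);                   // QDP++ Gaussian random source
D.apply(chi, psi, PLUS, 0);      // chi := D' psi on checkerboard 0 of the output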