SpatialOps
NeboLhs.h
1 /* This file was generated by fulmar version 0.9.2. */
2 
3 /*
4  * Copyright (c) 2014-2017 The University of Utah
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22  * IN THE SOFTWARE.
23  */
24 
25 #ifndef NEBO_LHS_H
26  #define NEBO_LHS_H
27 
28  namespace SpatialOps {
29  #ifdef __CUDACC__
30  #define DIV_UP(a,b) ( ( (a) + (b) - 1 ) / (b) )
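/* Editorial note: DIV_UP(a,b) is integer ceiling division for positive operands and is
 * used below to size the CUDA grid; e.g. DIV_UP(300, 256) == 2 while DIV_UP(256, 256) == 1. */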
31 
32 
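/* Global CUDA entry point: each launch of this kernel simply forwards to the left-hand
 * side's gpuwalk_assign(), which maps threads to (x, y) columns and loops over z within
 * the given limits; idx is per-thread scratch for the current evaluation point. */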
33  template<typename LhsType, typename RhsType, typename OptionalArgT>
34  __global__ void gpu_assign_kernel(LhsType lhs,
35  RhsType rhs,
36  int const xLow,
37  int const xHigh,
38  int const yLow,
39  int const yHigh,
40  int const zLow,
41  int const zHigh,
42  int * idx) {
43  lhs.template gpuwalk_assign<RhsType, OptionalArgT>(rhs,
44  xLow,
45  xHigh,
46  yLow,
47  yHigh,
48  zLow,
49  zHigh,
50  idx);
51  }
52  #endif
53  /* __CUDACC__ */;
54 
55  template<typename CurrentMode, typename FieldType>
56  struct NeboField;
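/* NeboField is specialized on an execution-mode tag: Initial (the user-facing assignment
 * entry point), Resize (per-thread partition handle, ENABLE_THREADS builds only),
 * SeqWalk (sequential CPU traversal), and GPUWalk (CUDA traversal). */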
57  template<typename FieldType>
58  struct NeboField<Initial, FieldType> {
59  public:
60  FieldType typedef field_type;
61 
62  NeboField<Initial, FieldType> typedef MyType;
63 
64  NeboField<SeqWalk, FieldType> typedef SeqWalkType;
65 
66  #ifdef ENABLE_THREADS
67  NeboField<Resize, FieldType> typedef ResizeType;
68  #endif
69  /* ENABLE_THREADS */
70 
71  #ifdef __CUDACC__
72  NeboField<GPUWalk, FieldType> typedef GPUWalkType;
73  #endif
74  /* __CUDACC__ */
75 
76  NeboField(FieldType f)
77  : field_(f)
78  {}
79 
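/* assign() computes the actual ghost region to update, the interior extents, and the
 * iteration limits [-ghost_minus, extent + ghost_plus) in each direction, then dispatches
 * through EvalTimeFlagsAndAssign so the presence of physical boundaries is resolved into
 * compile-time flags. For example, with an interior extent of 10 cells and one ghost
 * layer included on each side, the x loop runs over [-1, 11). */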
80  template<typename RhsType>
81  inline void assign(bool const useGhost, RhsType rhs) {
82  GhostData const ghosts = calculate_actual_ghost(useGhost,
83  field_.get_ghost_data(),
84  field_.boundary_info(),
85  rhs.ghosts_with_bc());
86 
87  IntVec const extents = field_.window_with_ghost().extent() -
88  field_.get_valid_ghost_data().get_minus() - field_.get_valid_ghost_data().get_plus();
89 
90  IntVec const hasBC = field_.boundary_info().has_bc(PLUS_SIDE);
91 
92  const GhostData limits = GhostData(- ghosts.get_minus(0),
93  extents[0] + ghosts.get_plus(0),
94  - ghosts.get_minus(1),
95  extents[1] + ghosts.get_plus(1),
96  - ghosts.get_minus(2),
97  extents[2] + ghosts.get_plus(2));
98 
99  EvalTimeFlagsAndAssign(field_.boundary_info().has_bc(MINUS_SIDE),
100  hasBC)(*this,
101  rhs,
102  extents,
103  ghosts,
104  hasBC,
105  limits);
106  }
107 
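/* Backend dispatch: once the boundary flags are fixed, the assignment runs on the GPU when
 * both sides are resident on the same GPU, on the CPU when both sides are CPU-accessible,
 * and throws a descriptive std::runtime_error for any mixed or unknown placement.
 * Degenerate (empty) iteration ranges are skipped entirely. */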
108  template<typename RhsType, typename OptionalArgT>
109  inline void assign_after_eval_flags(RhsType rhs,
110  IntVec const & extents,
111  GhostData const & ghosts,
112  IntVec const & hasBC,
113  GhostData const limits) {
114  if(limits.get_plus(0) - limits.get_minus(0) > 0 &&
115  limits.get_plus(1) - limits.get_minus(1) > 0 &&
116  limits.get_plus(2) - limits.get_minus(2) > 0) {
117  /* field_.reset_valid_ghosts(ghosts) */;
118 
119  #ifdef __CUDACC__
120  #ifdef NEBO_GPU_TEST
121  gpu_test_assign<RhsType, OptionalArgT>(rhs,
122  extents,
123  ghosts,
124  hasBC,
125  limits)
126  #else
127  if(gpu_ready()) {
128  if(rhs.gpu_ready(gpu_device_index())) {
129  gpu_assign<RhsType, OptionalArgT>(rhs,
130  extents,
131  ghosts,
132  hasBC,
133  limits);
134  }
135  else {
136  std::ostringstream msg;
137  msg << "Nebo error in " << "Nebo Assignment" <<
138  ":\n";
139  msg << "Left-hand side of assignment allocated ";
140  msg << "on ";
141  msg << "GPU but right-hand side is not ";
142  msg << "(completely) accessible on the same GPU";
143  msg << "\n";
144  msg << "\t - " << __FILE__ << " : " << __LINE__;
145  throw(std::runtime_error(msg.str()));
146  };
147  }
148  else {
149  if(cpu_ready()) {
150  if(rhs.cpu_ready()) {
151  cpu_assign<RhsType, OptionalArgT>(rhs,
152  extents,
153  ghosts,
154  hasBC,
155  limits);
156  }
157  else {
158  std::ostringstream msg;
159  msg << "Nebo error in " << "Nebo Assignment" <<
160  ":\n";
161  msg << "Left-hand side of assignment allocated ";
162  msg << "on ";
163  msg << "CPU but right-hand side is not ";
164  msg << "(completely) accessible on the same CPU"
165  ;
166  msg << "\n";
167  msg << "\t - " << __FILE__ << " : " << __LINE__;
168  throw(std::runtime_error(msg.str()));
169  };
170  }
171  else {
172  std::ostringstream msg;
173  msg << "Nebo error in " << "Nebo Assignment" <<
174  ":\n";
175  msg << "Left-hand side of assignment allocated ";
176  msg << "on ";
177  msg << "unknown device - not on CPU or GPU";
178  msg << "\n";
179  msg << "\t - " << __FILE__ << " : " << __LINE__;
180  throw(std::runtime_error(msg.str()));
181  };
182  }
183  #endif
184  /* NEBO_GPU_TEST */
185  #else
186  cpu_assign<RhsType, OptionalArgT>(rhs,
187  extents,
188  ghosts,
189  hasBC,
190  limits)
191  #endif
192  /* __CUDACC__ */;
193  };
194  }
195 
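/* masked_assign() evaluates the right-hand side only at the points listed in the mask;
 * extents, ghosts, and offsets are passed as zeros here because the masked evaluation
 * below walks the mask's explicit (x, y, z) point list rather than a rectangular range. */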
196  template<typename RhsType>
197  inline void masked_assign(SpatialMask<FieldType> const & mask,
198  RhsType rhs) {
199  EvalTimeFlagsAndMaskedAssign(field_.boundary_info().has_bc(MINUS_SIDE),
200  field_.boundary_info().has_bc(PLUS_SIDE))(mask,
201  *
202  this,
203  rhs,
204  IntVec(0,
205  0,
206  0),
207  GhostData(0),
208  IntVec(0,
209  0,
210  0),
211  GhostData(0));
212  }
213 
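/* masked_assign_after() is CPU-only in this backend: if the field lives on a GPU it
 * throws, otherwise it builds a sequential walker and assigns each masked point in turn. */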
214  template<typename RhsType, typename OptionalArgT>
215  inline void masked_assign_after(SpatialMask<FieldType> const & mask,
216  RhsType rhs,
217  IntVec const & extents,
218  GhostData const & ghosts,
219  IntVec const & hasBC,
220  GhostData const limits) {
221  #ifdef NEBO_REPORT_BACKEND
222  std::cout << "Starting Nebo masked assignment" << std::endl
223  #endif
224  /* NEBO_REPORT_BACKEND */;
225 
226  #ifdef __CUDACC__
227  if(gpu_ready()) {
228  std::ostringstream msg;
229  msg << "Nebo error in " << "Nebo Masked Assignment" << ":\n"
230  ;
231  msg << "Left-hand side of masked assignment allocated on ";
232  msg << "GPU and this backend does not support GPU execution"
233  ;
234  msg << "\n";
235  msg << "\t - " << __FILE__ << " : " << __LINE__;
236  throw(std::runtime_error(msg.str()));
237  }
238  else {
239  if(cpu_ready()) {
240  if(rhs.cpu_ready()) {
241  SeqWalkType lhs = init();
242 
243  PoolAutoPtr<int> curEval (Pool<int>::get (CPU_INDEX, 3), CPU_INDEX);
244  double mappedValue;
245 
246  NeboOptionalArg outerIndex(curEval, &mappedValue);
247 
248  typename RhsType::SeqWalkType expr = rhs.init(IntVec(0,
249  0,
250  0),
251  GhostData(0),
252  IntVec(0,
253  0,
254  0),
255  outerIndex);
256 
257  std::vector<IntVec>::const_iterator ip = mask.points().begin();
258 
259  std::vector<IntVec>::const_iterator const ep = mask.points().end();
260 
261  for(; ip != ep; ip++) {
262  int const x = (*ip)[0];
263 
264  int const y = (*ip)[1];
265 
266  int const z = (*ip)[2];
267 
268  outerIndex.setOuterIndex(x,y,z);
269 
270  lhs.ref(x, y, z) = expr.template eval<OptionalArgT>(x,
271  y,
272  z);
273  };
274  }
275  else {
276  std::ostringstream msg;
277  msg << "Nebo error in " << "Nebo Assignment" << ":\n";
278  msg << "Left-hand side of assignment allocated ";
279  msg << "on ";
280  msg << "CPU but right-hand side is not ";
281  msg << "(completely) accessible on the same CPU";
282  msg << "\n";
283  msg << "\t - " << __FILE__ << " : " << __LINE__;
284  throw(std::runtime_error(msg.str()));
285  };
286  }
287  else {
288  std::ostringstream msg;
289  msg << "Nebo error in " << "Nebo Assignment" << ":\n";
290  msg << "Left-hand side of assignment allocated ";
291  msg << "on ";
292  msg << "unknown device - not on CPU or GPU";
293  msg << "\n";
294  msg << "\t - " << __FILE__ << " : " << __LINE__;
295  throw(std::runtime_error(msg.str()));
296  };
297  }
298  #else
299  {
300  SeqWalkType lhs = init();
301 
302  PoolAutoPtr<int> curEval (Pool<int>::get (CPU_INDEX, 3), CPU_INDEX);
303  double mappedValue;
304 
305  NeboOptionalArg outerIndex(curEval, &mappedValue);
306 
307  typename RhsType::SeqWalkType expr = rhs.init(IntVec(0, 0, 0),
308  GhostData(0),
309  IntVec(0, 0, 0),
310  outerIndex);
311 
312  std::vector<IntVec>::const_iterator ip = mask.points().begin();
313 
314  std::vector<IntVec>::const_iterator const ep = mask.points().end();
315 
316  for(; ip != ep; ip++) {
317  int const x = (*ip)[0];
318 
319  int const y = (*ip)[1];
320 
321  int const z = (*ip)[2];
322 
323  outerIndex.setOuterIndex(x,y,z);
324 
325  lhs.ref(x, y, z) = expr.template eval<OptionalArgT>(x,
326  y,
327  z);
328  };
329  }
330  #endif
331  /* __CUDACC__ */;
332 
333  #ifdef NEBO_REPORT_BACKEND
334  std::cout << "Finished Nebo masked assignment" << std::endl
335  #endif
336  /* NEBO_REPORT_BACKEND */;
337  }
338 
339  inline SeqWalkType init(void) { return SeqWalkType(field_); }
340 
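/* EvalTimeFlagsAndAssign converts the run-time boundary flags held in negBC_ and plusBC_
 * into compile-time tags (HasBCOnX/Y/Z) by cascading through determine_bc_flag_x/y/z,
 * so assign_after_eval_flags() is instantiated once per flag combination. */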
341  struct EvalTimeFlagsAndAssign {
342  EvalTimeFlagsAndAssign(IntVec const negBC, IntVec const plusBC)
343  : negBC_(negBC), plusBC_(plusBC)
344  {}
345 
346  template<typename RhsType, typename ... OptionalArgT>
347  inline void after_flags_determined(MyType & lhs,
348  RhsType rhs,
349  IntVec const & extents,
350  GhostData const & ghosts,
351  IntVec const & hasBC,
352  GhostData const limits) {
353  lhs.template assign_after_eval_flags<RhsType,
354  CompileTimeOptionalArgs<OptionalArgT
355  ...,
356  void>
357  >(rhs, extents, ghosts, hasBC, limits);
358  }
359 
360  template<typename RhsType>
361  inline void operator()(MyType & lhs,
362  RhsType rhs,
363  IntVec const & extents,
364  GhostData const & ghosts,
365  IntVec const & hasBC,
366  GhostData const limits) {
367  determine_bc_flag_x<RhsType>(lhs,
368  rhs,
369  extents,
370  ghosts,
371  hasBC,
372  limits);
373  }
374 
375  private:
376  template<typename RhsType, typename ... OptionalArgT>
377  inline void determine_bc_flag_z(MyType & lhs,
378  RhsType rhs,
379  IntVec const & extents,
380  GhostData const & ghosts,
381  IntVec const & hasBC,
382  GhostData const limits) {
383  using namespace CompileTimeOptionalArgsNamespace;
384 
385  if(negBC_[2] || plusBC_[2]) {
386  after_flags_determined<RhsType, HasBCOnZ, OptionalArgT ...>(lhs,
387  rhs,
388  extents,
389  ghosts,
390  hasBC,
391  limits);
392  }
393  else {
394  after_flags_determined<RhsType, OptionalArgT ...>(lhs,
395  rhs,
396  extents,
397  ghosts,
398  hasBC,
399  limits);
400  };
401  }
402 
403  template<typename RhsType, typename ... OptionalArgT>
404  inline void determine_bc_flag_y(MyType & lhs,
405  RhsType rhs,
406  IntVec const & extents,
407  GhostData const & ghosts,
408  IntVec const & hasBC,
409  GhostData const limits) {
410  using namespace CompileTimeOptionalArgsNamespace;
411 
412  if(negBC_[1] || plusBC_[1]) {
413  determine_bc_flag_z<RhsType, HasBCOnY, OptionalArgT ...>(lhs,
414  rhs,
415  extents,
416  ghosts,
417  hasBC,
418  limits);
419  }
420  else {
421  determine_bc_flag_z<RhsType, OptionalArgT ...>(lhs,
422  rhs,
423  extents,
424  ghosts,
425  hasBC,
426  limits);
427  };
428  }
429 
430  template<typename RhsType, typename ... OptionalArgT>
431  inline void determine_bc_flag_x(MyType & lhs,
432  RhsType rhs,
433  IntVec const & extents,
434  GhostData const & ghosts,
435  IntVec const & hasBC,
436  GhostData const limits) {
437  using namespace CompileTimeOptionalArgsNamespace;
438 
439  if(negBC_[0] || plusBC_[0]) {
440  determine_bc_flag_y<RhsType, HasBCOnX, OptionalArgT ...>(lhs,
441  rhs,
442  extents,
443  ghosts,
444  hasBC,
445  limits);
446  }
447  else {
448  determine_bc_flag_y<RhsType, OptionalArgT ...>(lhs,
449  rhs,
450  extents,
451  ghosts,
452  hasBC,
453  limits);
454  };
455  }
456 
457  private:
458  IntVec const negBC_;
459 
460  IntVec const plusBC_;
461  };
462 
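/* EvalTimeFlagsAndMaskedAssign mirrors EvalTimeFlagsAndAssign but threads the mask
 * through to masked_assign_after(). */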
463  struct EvalTimeFlagsAndMaskedAssign {
464  EvalTimeFlagsAndMaskedAssign(IntVec const negBC,
465  IntVec const plusBC)
466  : negBC_(negBC), plusBC_(plusBC)
467  {}
468 
469  template<typename RhsType, typename ... OptionalArgT>
470  inline void after_flags_determined(SpatialMask<FieldType> const &
471  mask,
472  MyType & lhs,
473  RhsType rhs,
474  IntVec const & extents,
475  GhostData const & ghosts,
476  IntVec const & hasBC,
477  GhostData const limits) {
478  lhs.template masked_assign_after<RhsType,
479  CompileTimeOptionalArgs<OptionalArgT
480  ...,
481  void> >(mask,
482  rhs,
483  extents,
484  ghosts,
485  hasBC,
486  limits);
487  }
488 
489  template<typename RhsType>
490  inline void operator()(SpatialMask<FieldType> const & mask,
491  MyType & lhs,
492  RhsType rhs,
493  IntVec const & extents,
494  GhostData const & ghosts,
495  IntVec const & hasBC,
496  GhostData const limits) {
497  determine_bc_flag_x<RhsType>(mask,
498  lhs,
499  rhs,
500  extents,
501  ghosts,
502  hasBC,
503  limits);
504  }
505 
506  private:
507  template<typename RhsType, typename ... OptionalArgT>
508  inline void determine_bc_flag_z(SpatialMask<FieldType> const &
509  mask,
510  MyType & lhs,
511  RhsType rhs,
512  IntVec const & extents,
513  GhostData const & ghosts,
514  IntVec const & hasBC,
515  GhostData const limits) {
516  using namespace CompileTimeOptionalArgsNamespace;
517 
518  if(negBC_[2] || plusBC_[2]) {
519  after_flags_determined<RhsType, HasBCOnZ, OptionalArgT ...>(mask,
520  lhs,
521  rhs,
522  extents,
523  ghosts,
524  hasBC,
525  limits);
526  }
527  else {
528  after_flags_determined<RhsType, OptionalArgT ...>(mask,
529  lhs,
530  rhs,
531  extents,
532  ghosts,
533  hasBC,
534  limits);
535  };
536  }
537 
538  template<typename RhsType, typename ... OptionalArgT>
539  inline void determine_bc_flag_y(SpatialMask<FieldType> const &
540  mask,
541  MyType & lhs,
542  RhsType rhs,
543  IntVec const & extents,
544  GhostData const & ghosts,
545  IntVec const & hasBC,
546  GhostData const limits) {
547  using namespace CompileTimeOptionalArgsNamespace;
548 
549  if(negBC_[1] || plusBC_[1]) {
550  determine_bc_flag_z<RhsType, HasBCOnY, OptionalArgT ...>(mask,
551  lhs,
552  rhs,
553  extents,
554  ghosts,
555  hasBC,
556  limits);
557  }
558  else {
559  determine_bc_flag_z<RhsType, OptionalArgT ...>(mask,
560  lhs,
561  rhs,
562  extents,
563  ghosts,
564  hasBC,
565  limits);
566  };
567  }
568 
569  template<typename RhsType, typename ... OptionalArgT>
570  inline void determine_bc_flag_x(SpatialMask<FieldType> const &
571  mask,
572  MyType & lhs,
573  RhsType rhs,
574  IntVec const & extents,
575  GhostData const & ghosts,
576  IntVec const & hasBC,
577  GhostData const limits) {
578  using namespace CompileTimeOptionalArgsNamespace;
579 
580  if(negBC_[0] || plusBC_[0]) {
581  determine_bc_flag_y<RhsType, HasBCOnX, OptionalArgT ...>(mask,
582  lhs,
583  rhs,
584  extents,
585  ghosts,
586  hasBC,
587  limits);
588  }
589  else {
590  determine_bc_flag_y<RhsType, OptionalArgT ...>(mask,
591  lhs,
592  rhs,
593  extents,
594  ghosts,
595  hasBC,
596  limits);
597  };
598  }
599 
600  private:
601  IntVec const negBC_;
602 
603  IntVec const plusBC_;
604  };
605 
606  private:
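/* cpu_assign() chooses between the multithreaded and sequential backends at run time in
 * ENABLE_THREADS builds; otherwise it always falls through to sequential_assign(). */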
607  template<typename RhsType, typename OptionalArgT>
608  inline void cpu_assign(RhsType rhs,
609  IntVec const & extents,
610  GhostData const & ghosts,
611  IntVec const & hasBC,
612  GhostData const limits) {
613  #ifdef ENABLE_THREADS
614  if(is_thread_parallel()) {
615  thread_parallel_assign<RhsType, OptionalArgT>(rhs,
616  extents,
617  ghosts,
618  hasBC,
619  limits);
620  }
621  else {
622  sequential_assign<RhsType, OptionalArgT>(rhs,
623  extents,
624  ghosts,
625  hasBC,
626  limits);
627  }
628  #else
629  sequential_assign<RhsType, OptionalArgT>(rhs,
630  extents,
631  ghosts,
632  hasBC,
633  limits)
634  #endif
635  /* ENABLE_THREADS */;
636  }
637 
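/* sequential_assign(): single-threaded traversal. The rhs is initialized into its SeqWalk
 * form and the SeqWalk lhs walks the limits in x-fastest order; outerIndex carries the
 * current evaluation point (and a scratch value) to rhs operands that need it. */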
638  template<typename RhsType, typename OptionalArgT>
639  inline void sequential_assign(RhsType rhs,
640  IntVec const & extents,
641  GhostData const & ghosts,
642  IntVec const & hasBC,
643  GhostData const limits) {
644  #ifdef NEBO_REPORT_BACKEND
645  std::cout << "Starting Nebo sequential" << std::endl
646  #endif
647  /* NEBO_REPORT_BACKEND */;
648 
649  PoolAutoPtr<int> curEval (Pool<int>::get (CPU_INDEX, 3), CPU_INDEX);
650  double mappedValue;
651 
652  NeboOptionalArg outerIndex(curEval, &mappedValue);
653 
654  init().template seqwalk_assign<typename RhsType::SeqWalkType,
655  OptionalArgT>(rhs.init(extents,
656  ghosts,
657  hasBC,
658  outerIndex),
659  limits,
660  outerIndex);
661 
662  #ifdef NEBO_REPORT_BACKEND
663  std::cout << "Finished Nebo sequential" << std::endl
664  #endif
665  /* NEBO_REPORT_BACKEND */;
666  }
667 
668  #ifdef ENABLE_THREADS
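/* thread_parallel_assign(): partitions the iteration limits with nebo_find_partition(),
 * schedules one resize_assign() task per partition on the FIFO thread pool, and blocks on
 * a semaphore until every task has posted. */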
669  template<typename RhsType, typename OptionalArgT>
670  inline void thread_parallel_assign(RhsType rhs,
671  IntVec const & extents,
672  GhostData const & ghosts,
673  IntVec const & hasBC,
674  GhostData const limits) {
675  #ifdef NEBO_REPORT_BACKEND
676  std::cout << "Starting Nebo thread parallel" << std::endl
677  #endif
678  /* NEBO_REPORT_BACKEND */;
679 
680  Semaphore semaphore(0);
681 
682  const int thread_count = field_.get_partition_count();
683 
684  typename RhsType::ResizeType typedef RhsResizeType;
685 
686  ResizeType new_lhs = resize();
687 
688  RhsResizeType new_rhs = rhs.resize();
689 
690  GhostData localLimits;
691 
692  const IntVec split = nebo_find_partition(IntVec(limits.get_plus(0)
693  - limits.get_minus(0),
694  limits.get_plus(1)
695  - limits.get_minus(1),
696  limits.get_plus(2)
697  - limits.get_minus(2)),
698  thread_count);
699 
700  const int max = nebo_partition_count(split);
701 
702  IntVec location = IntVec(0, 0, 0);
703 
704  for(int count = 0; count < max; count++) {
705  nebo_set_up_extents(location, split, localLimits, limits);
706 
707  ThreadPoolFIFO::self().schedule(boost::bind(&ResizeType::
708  template
709  resize_assign<RhsResizeType,
710  OptionalArgT>,
711  new_lhs,
712  new_rhs,
713  extents,
714  ghosts,
715  hasBC,
716  localLimits,
717  &semaphore));
718 
719  location = nebo_next_partition(location, split);
720  };
721 
722  for(int ii = 0; ii < max; ii++) { semaphore.wait(); };
723 
724  #ifdef NEBO_REPORT_BACKEND
725  std::cout << "Finished Nebo thread parallel" << std::endl
726  #endif
727  /* NEBO_REPORT_BACKEND */;
728  }
729 
730  inline ResizeType resize(void) { return ResizeType(field_); }
731  #endif
732  /* ENABLE_THREADS */
733 
734  #ifdef __CUDACC__
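/* gpu_assign(): derives a launch configuration from the CUDA occupancy calculator and
 * DIV_UP, allocates per-thread index/value scratch from the GPU memory pool, launches
 * gpu_assign_kernel on the field's stream, and records an event so the rhs can wait on
 * completion. The extra stream synchronizations are debug-only (compiled out when NDEBUG
 * is defined). */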
735  template<typename RhsType, typename OptionalArgT>
736  inline void gpu_assign(RhsType rhs,
737  IntVec const & extents,
738  GhostData const & ghosts,
739  IntVec const & hasBC,
740  GhostData const limits) {
741  #ifdef NEBO_REPORT_BACKEND
742  std::cout << "Starting Nebo CUDA" << std::endl
743  #endif
744  /* NEBO_REPORT_BACKEND */;
745 
746  typename RhsType::GPUWalkType typedef RhsGPUWalkType;
747 
748  int xExtent = limits.get_plus(0) - limits.get_minus(0);
749 
750  int yExtent = limits.get_plus(1) - limits.get_minus(1);
751 
752  int min_block_count = 0;
753  int thread_per_block = 0;
754 
755  // Use the CUDA runtime occupancy API to choose a thread-block size that maximizes occupancy.
756  cudaOccupancyMaxPotentialBlockSize(&min_block_count, &thread_per_block, gpu_assign_kernel<GPUWalkType, RhsGPUWalkType, OptionalArgT>,
757  0, xExtent * yExtent);
758 
759  const int block_count = DIV_UP(xExtent * yExtent, thread_per_block);
760 
761  const int threadsx = std::min(xExtent, thread_per_block);
762  const int threadsy = std::min(yExtent, std::max(thread_per_block / threadsx, 1));
763 
764  const int blocksx = std::min(block_count, DIV_UP(xExtent, threadsx));
765  const int blocksy = std::min(DIV_UP(block_count, blocksx), DIV_UP(yExtent, threadsy));
766 
767  const dim3 dimBlock(threadsx, threadsy);
768 
769  const dim3 dimGrid(blocksx, blocksy);
770 
771  #ifndef NDEBUG
772  cudaError err;
773 
774  if(cudaSuccess != (err = cudaStreamSynchronize(field_.get_stream())))
775  {
776  std::ostringstream msg;
777  msg << "Nebo error in " << "CUDA Kernel - before call" <<
778  ":\n";
779  msg << " - " << cudaGetErrorString(err);
780  msg << "\n";
781  msg << "\t - " << __FILE__ << " : " << __LINE__;
782  throw(std::runtime_error(msg.str()));;
783  }
784  #endif
785  /* NDEBUG */;
786 
787  PoolAutoPtr<int> curEval (Pool<int>::get (GPU_INDEX, 3 * blocksx * blocksy * threadsx * threadsy), GPU_INDEX);
788 
789  PoolAutoPtr<double> mappedValue(Pool<double>::get(GPU_INDEX, blocksx * blocksy * threadsx * threadsy), GPU_INDEX);
790 
791  NeboOptionalArg outerIndex(curEval, mappedValue);
792 
793  gpu_assign_kernel<GPUWalkType, RhsGPUWalkType, OptionalArgT><<<dimGrid,
794  dimBlock,
795  0,
796  field_.get_stream()>>>(gpu_init(),
797  rhs.gpu_init(extents,
798  ghosts,
799  hasBC,
800  gpu_device_index(),
801  field_.get_stream(),
802  outerIndex),
803  limits.get_minus(0),
804  limits.get_plus(0),
805  limits.get_minus(1),
806  limits.get_plus(1),
807  limits.get_minus(2),
808  limits.get_plus(2),
809  curEval);
810 
811  cudaEventRecord(field_.get_last_event(), field_.get_stream());
812 
813  rhs.stream_wait_event(field_.get_last_event());
814 
815  #ifndef NDEBUG
816  if(cudaSuccess != (err = cudaStreamSynchronize(field_.get_stream())))
817  {
818  std::ostringstream msg;
819  msg << "Nebo error in " << "CUDA Kernel - after call" <<
820  ":\n";
821  msg << " - " << cudaGetErrorString(err);
822  msg << "\n";
823  msg << "\t - " << __FILE__ << " : " << __LINE__;
824  throw(std::runtime_error(msg.str()));;
825  }
826  #endif
827  /* NDEBUG */;
828 
829  #ifdef NEBO_REPORT_BACKEND
830  std::cout << "Finished Nebo CUDA" << std::endl
831  #endif
832  /* NEBO_REPORT_BACKEND */;
833  }
834 
835  inline bool cpu_ready(void) const {
836  return IS_CPU_INDEX(field_.active_device_index());
837  }
838 
839  inline bool gpu_ready(void) const {
840  return IS_GPU_INDEX(field_.active_device_index());
841  }
842 
843  inline int gpu_device_index(void) const {
844  return field_.active_device_index();
845  }
846 
847  inline GPUWalkType gpu_init(void) { return GPUWalkType(field_); }
848 
849  #ifdef NEBO_GPU_TEST
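/* gpu_test_assign() (NEBO_GPU_TEST builds only): if the lhs currently lives on the CPU,
 * allocate a temporary GPU copy, run gpu_assign() on that copy, and copy the result back;
 * otherwise call gpu_assign() directly. */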
850  template<typename RhsType, typename OptionalArgT>
851  inline void gpu_test_assign(RhsType rhs,
852  IntVec const & extents,
853  GhostData const & ghosts,
854  IntVec const & hasBC,
855  GhostData const limits) {
856  #ifdef NEBO_REPORT_BACKEND
857  std::cout << "Starting Nebo CUDA with Nebo copying" <<
858  std::endl
859  #endif
860  /* NEBO_REPORT_BACKEND */;
861 
862  rhs.gpu_prep(0);
863 
864  if(CPU_INDEX == field_.active_device_index()) {
865  FieldType gpu_field(field_.window_with_ghost(),
866  field_.boundary_info(),
867  field_.get_valid_ghost_data(),
868  NULL,
869  InternalStorage,
870  GPU_INDEX);
871 
872  NeboField<Initial, FieldType> gpu_lhs(gpu_field);
873 
874  ema::cuda::CUDADeviceInterface & CDI = ema::cuda::
875  CUDADeviceInterface::self();
876 
877  FieldType const & ftmp_ = field_;
878 
879  CDI.memcpy_to(gpu_field.field_values(GPU_INDEX),
880  ftmp_.field_values(),
881  ftmp_.allocated_bytes(),
882  0,
883  ftmp_.get_stream());
884 
885  gpu_lhs.template gpu_assign<RhsType, OptionalArgT>(rhs,
886  extents,
887  ghosts,
888  hasBC,
889  limits);
890 
891  CDI.memcpy_from(field_.field_values(),
892  gpu_field.field_values(GPU_INDEX),
893  field_.allocated_bytes(),
894  0,
895  field_.get_stream());
896  }
897  else {
898  gpu_assign<RhsType, OptionalArgT>(rhs,
899  extents,
900  ghosts,
901  hasBC,
902  limits);
903  };
904 
905  #ifdef NEBO_REPORT_BACKEND
906  std::cout << "Finished Nebo CUDA with Nebo copying" <<
907  std::endl
908  #endif
909  /* NEBO_REPORT_BACKEND */;
910  }
911  #endif
912  /* NEBO_GPU_TEST */
913  #endif
914  /* __CUDACC__ */
915 
916  FieldType field_;
917  };
918  #ifdef ENABLE_THREADS
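/* NeboField<Resize, ...> is the per-thread handle used by thread_parallel_assign():
 * resize_assign() performs a sequential walk over its own sub-limits and posts the shared
 * semaphore when finished. */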
919  template<typename FieldType>
920  struct NeboField<Resize, FieldType> {
921  public:
922  FieldType typedef field_type;
923 
924  NeboField<SeqWalk, FieldType> typedef SeqWalkType;
925 
926  NeboField(FieldType f)
927  : field_(f)
928  {}
929 
930  #ifdef ENABLE_THREADS
931  template<typename RhsType, typename OptionalArgT>
932  inline void resize_assign(RhsType const & rhs,
933  IntVec const & extents,
934  GhostData const & ghosts,
935  IntVec const & hasBC,
936  GhostData const limits,
937  Semaphore * semaphore) {
938  PoolAutoPtr<int> curEval (Pool<int>::get (CPU_INDEX, 3), CPU_INDEX);
939  double mappedValue;
940 
941  NeboOptionalArg outerIndex(curEval, &mappedValue);
942 
943  init().template seqwalk_assign<typename RhsType::SeqWalkType,
944  OptionalArgT>(rhs.init(extents,
945  ghosts,
946  hasBC,
947  outerIndex),
948  limits,
949  outerIndex);
950 
951  semaphore->post();
952  }
953  #endif
954  /* ENABLE_THREADS */
955 
956  private:
957  inline SeqWalkType init(void) { return SeqWalkType(field_); }
958 
959  FieldType field_;
960  }
961  #endif
962  /* ENABLE_THREADS */;
963  template<typename FieldType>
964  struct NeboField<SeqWalk, FieldType> {
965  public:
966  FieldType typedef field_type;
967 
968  typename field_type::value_type typedef value_type;
969 
970  NeboField(FieldType f)
971  : xGlob_(f.window_with_ghost().glob_dim(0)),
972  yGlob_(f.window_with_ghost().glob_dim(1)),
973  base_(f.field_values(CPU_INDEX) + (f.window_with_ghost().offset(0) +
974  f.get_valid_ghost_data().get_minus(0))
975  + (f.window_with_ghost().glob_dim(0) * ((f.window_with_ghost().offset(1)
976  + f.get_valid_ghost_data().get_minus(1))
977  + (f.window_with_ghost().glob_dim(1)
978  * (f.window_with_ghost().offset(2)
979  + f.get_valid_ghost_data().get_minus(2))))))
980  {}
981 
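/* base_ is offset to the first interior (non-ghost) cell, so ref(x, y, z) can use plain
 * row-major indexing, base_[x + xGlob_ * (y + yGlob_ * z)]; negative indices reach into
 * the ghost layers. */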
982  template<typename RhsType, typename OptionalArgT>
983  inline void seqwalk_assign(RhsType rhs, GhostData const limits, NeboOptionalArg & curEval) {
984  for(int z = limits.get_minus(2); z < limits.get_plus(2); z++) {
985  for(int y = limits.get_minus(1); y < limits.get_plus(1); y++) {
986  for(int x = limits.get_minus(0); x < limits.get_plus(0); x++)
987  {
988  curEval.setOuterIndex(x,y,z);
989 
990  ref(x, y, z) = rhs.template eval<OptionalArgT>(x, y, z);
991 
992  };
993  };
994  };
995  }
996 
997  inline value_type & ref(int const x, int const y, int const z) {
998  return base_[x + xGlob_ * (y + (yGlob_ * z))];
999  }
1000 
1001  private:
1002  int const xGlob_;
1003 
1004  int const yGlob_;
1005 
1006  value_type * base_;
1007  };
1008  #ifdef __CUDACC__
1009  template<typename FieldType>
1010  struct NeboField<GPUWalk, FieldType> {
1011  public:
1012  FieldType typedef field_type;
1013 
1014  typename field_type::value_type typedef value_type;
1015 
1016  NeboField(FieldType f)
1017  : base_(f.field_values(f.active_device_index()) + (f.window_with_ghost().offset(0)
1018  + f.get_valid_ghost_data().get_minus(0))
1019  + (f.window_with_ghost().glob_dim(0) * ((f.window_with_ghost().offset(1)
1020  + f.get_valid_ghost_data().get_minus(1))
1021  + (f.window_with_ghost().glob_dim(1)
1022  * (f.window_with_ghost().offset(2)
1023  + f.get_valid_ghost_data().get_minus(2)))))),
1024  valid_(false),
1025  xGlob_(f.window_with_ghost().glob_dim(0)),
1026  yGlob_(f.window_with_ghost().glob_dim(1))
1027  {}
1028 
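/* Each CUDA thread owns one (x, y) column: x and y come from the block and thread indices
 * plus the lower limits, valid_ guards threads that fall past the upper limits, and the
 * thread then loops over z, recording its current (x, y, z) in idx before each evaluation. */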
1029  template<typename RhsType, typename OptionalArgT>
1030  __device__ inline void gpuwalk_assign(RhsType rhs,
1031  int const xLow,
1032  int const xHigh,
1033  int const yLow,
1034  int const yHigh,
1035  int const zLow,
1036  int const zHigh,
1037  int * idx) {
1038  const int ii = blockIdx.x * blockDim.x + threadIdx.x;
1039 
1040  const int jj = blockIdx.y * blockDim.y + threadIdx.y;
1041 
1042  const int x = ii + xLow;
1043 
1044  const int y = jj + yLow;
1045 
1046  start(x, y, xHigh, yHigh);
1047 
1048  if (valid()) {
1049  idx[3*ii] = x;
1050  idx[3*ii+1] = y;
1051  }
1052 
1053  for(int z = zLow; z < zHigh; z++) {
1054  if(valid()) {
1055  idx[3*ii+2] = z;
1056  ref(x, y, z) = rhs.template eval<OptionalArgT>(x, y, z);
1057  };
1058  };
1059  }
1060 
1061  private:
1062  __device__ inline bool valid(void) { return valid_; }
1063 
1064  __device__ inline void start(int x,
1065  int y,
1066  int const xHigh,
1067  int const yHigh) {
1068  valid_ = (x < xHigh && y < yHigh);
1069  }
1070 
1071  __device__ inline value_type & ref(int const x,
1072  int const y,
1073  int const z) {
1074  return base_[x + xGlob_ * (y + (yGlob_ * z))];
1075  }
1076 
1077  value_type * base_;
1078 
1079  int valid_;
1080 
1081  int const xGlob_;
1082 
1083  int const yGlob_;
1084  }
1085  #endif
1086  /* __CUDACC__ */;
1087  } /* SpatialOps */
1088 
1089 #endif
1090 /* NEBO_LHS_H */
1091 