MFEM v4.6.0
Finite element discretization library
forall.hpp
1 // Copyright (c) 2010-2023, Lawrence Livermore National Security, LLC. Produced
2 // at the Lawrence Livermore National Laboratory. All Rights reserved. See files
3 // LICENSE and NOTICE for details. LLNL-CODE-806117.
4 //
5 // This file is part of the MFEM library. For more information and source code
6 // availability visit https://mfem.org.
7 //
8 // MFEM is free software; you can redistribute it and/or modify it under the
9 // terms of the BSD-3 license. We welcome feedback and contributions, see file
10 // CONTRIBUTING.md for details.
11 
12 #ifndef MFEM_FORALL_HPP
13 #define MFEM_FORALL_HPP
14 
15 #include "../config/config.hpp"
16 #include "annotation.hpp"
17 #include "error.hpp"
18 #include "backends.hpp"
19 #include "device.hpp"
20 #include "mem_manager.hpp"
21 #include "../linalg/dtensor.hpp"
22 
23 namespace mfem
24 {
25 
26 // The following DofQuadLimits_ structs define the maximum values of D1D and Q1D
27 // often used in the "fallback kernels" for partial assembly. Different limits
28 // take effect for different architectures. The limits should be queried using
29 // the public interface in DeviceDofQuadLimits or DofQuadLimits; the structs
30 // defined below should generally not be accessed directly.
31 //
32 // In host code, the limits associated with the currently configured Device can
33 // be accessed using DeviceDofQuadLimits::Get().
34 //
35 // In mfem::forall kernels or MFEM_HOST_DEVICE functions, the limits
36 // corresponding to the architecture the function is being compiled for can be
37 // accessed as static constexpr variables using the type alias DofQuadLimits.
38 
39 namespace internal
40 {
41 
42 struct DofQuadLimits_CUDA
43 {
44  static constexpr int MAX_D1D = 14;
45  static constexpr int MAX_Q1D = 14;
46  static constexpr int HCURL_MAX_D1D = 5;
47  static constexpr int HCURL_MAX_Q1D = 6;
48  static constexpr int HDIV_MAX_D1D = 5;
49  static constexpr int HDIV_MAX_Q1D = 6;
50  static constexpr int MAX_INTERP_1D = 8;
51  static constexpr int MAX_DET_1D = 6;
52 };
53 
54 struct DofQuadLimits_HIP
55 {
56  static constexpr int MAX_D1D = 10;
57  static constexpr int MAX_Q1D = 10;
58  static constexpr int HCURL_MAX_D1D = 5;
59  static constexpr int HCURL_MAX_Q1D = 5;
60  static constexpr int HDIV_MAX_D1D = 5;
61  static constexpr int HDIV_MAX_Q1D = 6;
62  static constexpr int MAX_INTERP_1D = 8;
63  static constexpr int MAX_DET_1D = 6;
64 };
65 
66 struct DofQuadLimits_CPU
67 {
68 #ifndef _WIN32
69  static constexpr int MAX_D1D = 24;
70  static constexpr int MAX_Q1D = 24;
71 #else
72  static constexpr int MAX_D1D = 14;
73  static constexpr int MAX_Q1D = 14;
74 #endif
75  static constexpr int HCURL_MAX_D1D = 10;
76  static constexpr int HCURL_MAX_Q1D = 10;
77  static constexpr int HDIV_MAX_D1D = 10;
78  static constexpr int HDIV_MAX_Q1D = 10;
79  static constexpr int MAX_INTERP_1D = MAX_D1D;
80  static constexpr int MAX_DET_1D = MAX_D1D;
81 };
82 
83 } // namespace internal
84 
85 /// @brief Maximum number of 1D DOFs or quadrature points for the architecture
86 /// currently being compiled for (used in fallback kernels).
87 ///
88 /// DofQuadLimits provides access to the limits as static constexpr member
89 /// variables for use in mfem::forall kernels or MFEM_HOST_DEVICE functions.
90 ///
91 /// @sa For accessing the limits according to the runtime configuration of the
92 /// Device, see DeviceDofQuadLimits.
93 #if defined(__CUDA_ARCH__)
94 using DofQuadLimits = internal::DofQuadLimits_CUDA;
95 #elif defined(__HIP_DEVICE_COMPILE__)
96 using DofQuadLimits = internal::DofQuadLimits_HIP;
97 #else
98 using DofQuadLimits = internal::DofQuadLimits_CPU;
99 #endif
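// Usage sketch (illustrative, not part of the MFEM sources): because the
// members are static constexpr, they can size stack arrays inside device
// code. The helper name and its arguments below are hypothetical.
//
//    MFEM_HOST_DEVICE inline void CopyQuadValues(const double *src,
//                                                double *dst, int Q1D)
//    {
//       double buf[DofQuadLimits::MAX_Q1D]; // large enough for any fallback Q1D
//       for (int q = 0; q < Q1D; ++q) { buf[q] = src[q]; }
//       for (int q = 0; q < Q1D; ++q) { dst[q] = buf[q]; }
//    }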
100 
101 /// @brief Maximum number of 1D DOFs or quadrature points for the current
102 /// runtime configuration of the Device (used in fallback kernels).
103 ///
104 /// DeviceDofQuadLimits can be used in host code to query the limits for the
105 /// configured device (e.g. when the user has selected GPU execution at
106 /// runtime).
107 ///
108 /// @sa For accessing the limits according to the current compiler pass, see
109 /// DofQuadLimits.
110 struct DeviceDofQuadLimits
111 {
112  int MAX_D1D; ///< Maximum number of 1D nodal points.
113  int MAX_Q1D; ///< Maximum number of 1D quadrature points.
114  int HCURL_MAX_D1D; ///< Maximum number of 1D nodal points for H(curl).
115  int HCURL_MAX_Q1D; ///< Maximum number of 1D quadrature points for H(curl).
116  int HDIV_MAX_D1D; ///< Maximum number of 1D nodal points for H(div).
117  int HDIV_MAX_Q1D; ///< Maximum number of 1D quadrature points for H(div).
118  int MAX_INTERP_1D; ///< Maximum number of points for use in QuadratureInterpolator.
119  int MAX_DET_1D; ///< Maximum number of points for determinant computation in QuadratureInterpolator.
120 
121  /// Return a const reference to the DeviceDofQuadLimits singleton.
122  static const DeviceDofQuadLimits &Get()
123  {
124  static const DeviceDofQuadLimits dof_quad_limits;
125  return dof_quad_limits;
126  }
127 
128 private:
129  /// Initialize the limits depending on the configuration of the Device.
130  DeviceDofQuadLimits()
131  {
132  if (Device::Allows(Backend::CUDA_MASK)) { Populate<internal::DofQuadLimits_CUDA>(); }
133  else if (Device::Allows(Backend::HIP_MASK)) { Populate<internal::DofQuadLimits_HIP>(); }
134  else { Populate<internal::DofQuadLimits_CPU>(); }
135  }
136 
137  /// @brief Set the limits using the static members of the type @a T.
138  ///
139  /// @a T should be one of DofQuadLimits_CUDA, DofQuadLimits_HIP, or
140  /// DofQuadLimits_CPU.
141  template <typename T> void Populate()
142  {
143  MAX_D1D = T::MAX_D1D;
144  MAX_Q1D = T::MAX_Q1D;
145  HCURL_MAX_D1D = T::HCURL_MAX_D1D;
146  HCURL_MAX_Q1D = T::HCURL_MAX_Q1D;
147  HDIV_MAX_D1D = T::HDIV_MAX_D1D;
148  HDIV_MAX_Q1D = T::HDIV_MAX_Q1D;
149  MAX_INTERP_1D = T::MAX_INTERP_1D;
150  MAX_DET_1D = T::MAX_DET_1D;
151  }
152 };
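// Usage sketch (illustrative, not part of the MFEM sources): host code can
// compare kernel parameters against the limits of the device selected at run
// time. The function name below is hypothetical.
//
//    inline bool FitsFallbackLimits(int d1d, int q1d)
//    {
//       const DeviceDofQuadLimits &lim = DeviceDofQuadLimits::Get();
//       return d1d <= lim.MAX_D1D && q1d <= lim.MAX_Q1D;
//    }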
153 
154 // MFEM pragma macros that can be used inside MFEM_FORALL macros.
155 #define MFEM_PRAGMA(X) _Pragma(#X)
156 
157 // MFEM_UNROLL pragma macro that can be used inside MFEM_FORALL macros.
158 #if defined(MFEM_USE_CUDA) && defined(__CUDA_ARCH__)
159 #define MFEM_UNROLL(N) MFEM_PRAGMA(unroll(N))
160 #else
161 #define MFEM_UNROLL(N)
162 #endif
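// Usage sketch (illustrative, not part of the MFEM sources): MFEM_UNROLL is
// placed directly before a loop with a compile-time trip count inside a
// kernel body; it expands to _Pragma("unroll(N)") only when compiling CUDA
// device code and to nothing otherwise. The pointers d_x and d_y are assumed
// device-accessible arrays of size 3*n.
//
//    mfem::forall(n, [=] MFEM_HOST_DEVICE (int i)
//    {
//       MFEM_UNROLL(3)
//       for (int c = 0; c < 3; ++c) { d_y[3*i + c] += d_x[3*i + c]; }
//    });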
163 
164 // MFEM_GPU_FORALL: "parallel for" executed with CUDA or HIP based on the MFEM
165 // build-time configuration (MFEM_USE_CUDA or MFEM_USE_HIP). If neither CUDA nor
166 // HIP is enabled, this macro is a no-op.
167 #if defined(MFEM_USE_CUDA)
168 #define MFEM_GPU_FORALL(i, N,...) CuWrap1D(N, [=] MFEM_DEVICE \
169  (int i) {__VA_ARGS__})
170 #elif defined(MFEM_USE_HIP)
171 #define MFEM_GPU_FORALL(i, N,...) HipWrap1D(N, [=] MFEM_DEVICE \
172  (int i) {__VA_ARGS__})
173 #else
174 #define MFEM_GPU_FORALL(i, N,...) do { } while (false)
175 #endif
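// Usage sketch (illustrative, not part of the MFEM sources): MFEM_GPU_FORALL
// is intended for code paths that only matter in CUDA/HIP builds, e.g.
// touching device-only buffers. The pointer d_ptr is an assumption.
//
//    MFEM_GPU_FORALL(i, n, d_ptr[i] = 0.0;);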
176 
177 // Implementation of MFEM's "parallel for" (forall) device/host kernel
178 // interfaces supporting RAJA, CUDA, HIP, OpenMP, and sequential backends.
179 
180 // The MFEM_FORALL wrapper
181 #define MFEM_FORALL(i,N,...) \
182  ForallWrap<1>(true,N,[=] MFEM_HOST_DEVICE (int i) {__VA_ARGS__})
183 
184 // MFEM_FORALL with a 2D CUDA block
185 #define MFEM_FORALL_2D(i,N,X,Y,BZ,...) \
186  ForallWrap<2>(true,N,[=] MFEM_HOST_DEVICE (int i) {__VA_ARGS__},X,Y,BZ)
187 
188 // MFEM_FORALL with a 3D CUDA block
189 #define MFEM_FORALL_3D(i,N,X,Y,Z,...) \
190  ForallWrap<3>(true,N,[=] MFEM_HOST_DEVICE (int i) {__VA_ARGS__},X,Y,Z)
191 
192 // MFEM_FORALL with a 3D CUDA block and grid
193 // With G=0, this is the same as MFEM_FORALL_3D(i,N,X,Y,Z,...)
194 #define MFEM_FORALL_3D_GRID(i,N,X,Y,Z,G,...) \
195  ForallWrap<3>(true,N,[=] MFEM_HOST_DEVICE (int i) {__VA_ARGS__},X,Y,Z,G)
196 
197 // MFEM_FORALL that uses the basic CPU backend when use_dev is false. See for
198 // example the functions in vector.cpp, where we don't want to use the mfem
199 // device for operations on small vectors.
200 #define MFEM_FORALL_SWITCH(use_dev,i,N,...) \
201  ForallWrap<1>(use_dev,N,[=] MFEM_HOST_DEVICE (int i) {__VA_ARGS__})
202 
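// Usage sketch (illustrative, not part of the MFEM sources): the macros
// capture the enclosing scope by value and introduce the loop index named by
// the first argument. The names a, d_x, d_y, use_dev, ne and q1d below are
// assumptions.
//
//    MFEM_FORALL(i, n, d_y[i] += a * d_x[i];);         // device when enabled
//    MFEM_FORALL_SWITCH(use_dev, i, n, d_y[i] = 0.0;); // host loop if !use_dev
//    MFEM_FORALL_2D(e, ne, q1d, q1d, 1,
//    {
//       // one block of q1d x q1d threads cooperates on element e
//    });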
203 
204 /// OpenMP backend
205 template <typename HBODY>
206 void OmpWrap(const int N, HBODY &&h_body)
207 {
208 #ifdef MFEM_USE_OPENMP
209  #pragma omp parallel for
210  for (int k = 0; k < N; k++)
211  {
212  h_body(k);
213  }
214 #else
215  MFEM_CONTRACT_VAR(N);
216  MFEM_CONTRACT_VAR(h_body);
217  MFEM_ABORT("OpenMP requested for MFEM but OpenMP is not enabled!");
218 #endif
219 }
220 
221 
222 /// RAJA Cuda and Hip backends
223 #if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_CUDA)
224 using cuda_launch_policy =
225  RAJA::LaunchPolicy<RAJA::cuda_launch_t<true>>;
226 using cuda_teams_x =
227  RAJA::LoopPolicy<RAJA::cuda_block_x_direct>;
228 using cuda_threads_z =
229  RAJA::LoopPolicy<RAJA::cuda_thread_z_direct>;
230 #endif
231 
232 #if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_HIP)
233 using hip_launch_policy =
234  RAJA::LaunchPolicy<RAJA::hip_launch_t<true>>;
235 using hip_teams_x =
236  RAJA::LoopPolicy<RAJA::hip_block_x_direct>;
237 using hip_threads_z =
238  RAJA::LoopPolicy<RAJA::hip_thread_z_direct>;
239 #endif
240 
241 #if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_CUDA)
242 template <const int BLOCKS = MFEM_CUDA_BLOCKS, typename DBODY>
243 void RajaCuWrap1D(const int N, DBODY &&d_body)
244 {
245  // true denotes asynchronous kernel
246  RAJA::forall<RAJA::cuda_exec<BLOCKS,true>>(RAJA::RangeSegment(0,N),d_body);
247 }
248 
249 template <typename DBODY>
250 void RajaCuWrap2D(const int N, DBODY &&d_body,
251  const int X, const int Y, const int BZ)
252 {
253  MFEM_VERIFY(N>0, "");
254  MFEM_VERIFY(BZ>0, "");
255  const int G = (N+BZ-1)/BZ;
256 
257  using namespace RAJA;
258  using RAJA::RangeSegment;
259 
260  launch<cuda_launch_policy>
261  (LaunchParams(Teams(G), Threads(X, Y, BZ)),
262  [=] RAJA_DEVICE (LaunchContext ctx)
263  {
264 
265  loop<cuda_teams_x>(ctx, RangeSegment(0, G), [&] (const int n)
266  {
267 
268  loop<cuda_threads_z>(ctx, RangeSegment(0, BZ), [&] (const int tz)
269  {
270 
271  const int k = n*BZ + tz;
272  if (k >= N) { return; }
273  d_body(k);
274 
275  });
276 
277  });
278 
279  });
280 
281  MFEM_GPU_CHECK(cudaGetLastError());
282 }
283 
284 template <typename DBODY>
285 void RajaCuWrap3D(const int N, DBODY &&d_body,
286  const int X, const int Y, const int Z, const int G)
287 {
288  MFEM_VERIFY(N>0, "");
289  const int GRID = G == 0 ? N : G;
290  using namespace RAJA;
291  using RAJA::RangeSegment;
292 
293  launch<cuda_launch_policy>
294  (LaunchParams(Teams(GRID), Threads(X, Y, Z)),
295  [=] RAJA_DEVICE (LaunchContext ctx)
296  {
297 
298  loop<cuda_teams_x>(ctx, RangeSegment(0, N), d_body);
299 
300  });
301 
302  MFEM_GPU_CHECK(cudaGetLastError());
303 }
304 
305 template <int Dim>
306 struct RajaCuWrap;
307 
308 template <>
309 struct RajaCuWrap<1>
310 {
311  template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
312  static void run(const int N, DBODY &&d_body,
313  const int X, const int Y, const int Z, const int G)
314  {
315  RajaCuWrap1D<BLCK>(N, d_body);
316  }
317 };
318 
319 template <>
320 struct RajaCuWrap<2>
321 {
322  template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
323  static void run(const int N, DBODY &&d_body,
324  const int X, const int Y, const int Z, const int G)
325  {
326  RajaCuWrap2D(N, d_body, X, Y, Z);
327  }
328 };
329 
330 template <>
331 struct RajaCuWrap<3>
332 {
333  template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
334  static void run(const int N, DBODY &&d_body,
335  const int X, const int Y, const int Z, const int G)
336  {
337  RajaCuWrap3D(N, d_body, X, Y, Z, G);
338  }
339 };
340 
341 #endif
342 
343 #if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_HIP)
344 template <const int BLOCKS = MFEM_HIP_BLOCKS, typename DBODY>
345 void RajaHipWrap1D(const int N, DBODY &&d_body)
346 {
347  // true denotes asynchronous kernel
348  RAJA::forall<RAJA::hip_exec<BLOCKS,true>>(RAJA::RangeSegment(0,N),d_body);
349 }
350 
351 template <typename DBODY>
352 void RajaHipWrap2D(const int N, DBODY &&d_body,
353  const int X, const int Y, const int BZ)
354 {
355  MFEM_VERIFY(N>0, "");
356  MFEM_VERIFY(BZ>0, "");
357  const int G = (N+BZ-1)/BZ;
358 
359  using namespace RAJA;
360  using RAJA::RangeSegment;
361 
362  launch<hip_launch_policy>
363  (LaunchParams(Teams(G), Threads(X, Y, BZ)),
364  [=] RAJA_DEVICE (LaunchContext ctx)
365  {
366 
367  loop<hip_teams_x>(ctx, RangeSegment(0, G), [&] (const int n)
368  {
369 
370  loop<hip_threads_z>(ctx, RangeSegment(0, BZ), [&] (const int tz)
371  {
372 
373  const int k = n*BZ + tz;
374  if (k >= N) { return; }
375  d_body(k);
376 
377  });
378 
379  });
380 
381  });
382 
383  MFEM_GPU_CHECK(hipGetLastError());
384 }
385 
386 template <typename DBODY>
387 void RajaHipWrap3D(const int N, DBODY &&d_body,
388  const int X, const int Y, const int Z, const int G)
389 {
390  MFEM_VERIFY(N>0, "");
391  const int GRID = G == 0 ? N : G;
392  using namespace RAJA;
393  using RAJA::RangeSegment;
394 
395  launch<hip_launch_policy>
396  (LaunchParams(Teams(GRID), Threads(X, Y, Z)),
397  [=] RAJA_DEVICE (LaunchContext ctx)
398  {
399 
400  loop<hip_teams_x>(ctx, RangeSegment(0, N), d_body);
401 
402  });
403 
404  MFEM_GPU_CHECK(hipGetLastError());
405 }
406 
407 template <int Dim>
408 struct RajaHipWrap;
409 
410 template <>
411 struct RajaHipWrap<1>
412 {
413  template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
414  static void run(const int N, DBODY &&d_body,
415  const int X, const int Y, const int Z, const int G)
416  {
417  RajaHipWrap1D<BLCK>(N, d_body);
418  }
419 };
420 
421 template <>
422 struct RajaHipWrap<2>
423 {
424  template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
425  static void run(const int N, DBODY &&d_body,
426  const int X, const int Y, const int Z, const int G)
427  {
428  RajaHipWrap2D(N, d_body, X, Y, Z);
429  }
430 };
431 
432 template <>
433 struct RajaHipWrap<3>
434 {
435  template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
436  static void run(const int N, DBODY &&d_body,
437  const int X, const int Y, const int Z, const int G)
438  {
439  RajaHipWrap3D(N, d_body, X, Y, Z, G);
440  }
441 };
442 
443 #endif
444 
445 /// RAJA OpenMP backend
446 #if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_OPENMP)
447 
448 template <typename HBODY>
449 void RajaOmpWrap(const int N, HBODY &&h_body)
450 {
451  RAJA::forall<RAJA::omp_parallel_for_exec>(RAJA::RangeSegment(0,N), h_body);
452 }
453 
454 #endif
455 
456 
457 /// RAJA sequential loop backend
458 template <typename HBODY>
459 void RajaSeqWrap(const int N, HBODY &&h_body)
460 {
461 #ifdef MFEM_USE_RAJA
462  RAJA::forall<RAJA::loop_exec>(RAJA::RangeSegment(0,N), h_body);
463 #else
464  MFEM_CONTRACT_VAR(N);
465  MFEM_CONTRACT_VAR(h_body);
466  MFEM_ABORT("RAJA requested but RAJA is not enabled!");
467 #endif
468 }
469 
470 
471 /// CUDA backend
472 #ifdef MFEM_USE_CUDA
473 
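// Thread mappings used by the kernels below (descriptive note): CuKernel1D
// assigns one thread per entry; CuKernel2D packs blockDim.z = BZ entries per
// block, with each entry handled by that block's (x,y) threads at a fixed
// threadIdx.z; CuKernel3D runs a grid-stride loop in which each block
// processes entries blockIdx.x, blockIdx.x + gridDim.x, and so on.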
474 template <typename BODY> __global__ static
475 void CuKernel1D(const int N, BODY body)
476 {
477  const int k = blockDim.x*blockIdx.x + threadIdx.x;
478  if (k >= N) { return; }
479  body(k);
480 }
481 
482 template <typename BODY> __global__ static
483 void CuKernel2D(const int N, BODY body)
484 {
485  const int k = blockIdx.x*blockDim.z + threadIdx.z;
486  if (k >= N) { return; }
487  body(k);
488 }
489 
490 template <typename BODY> __global__ static
491 void CuKernel3D(const int N, BODY body)
492 {
493  for (int k = blockIdx.x; k < N; k += gridDim.x) { body(k); }
494 }
495 
496 template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
497 void CuWrap1D(const int N, DBODY &&d_body)
498 {
499  if (N==0) { return; }
500  const int GRID = (N+BLCK-1)/BLCK;
501  CuKernel1D<<<GRID,BLCK>>>(N, d_body);
502  MFEM_GPU_CHECK(cudaGetLastError());
503 }
504 
505 template <typename DBODY>
506 void CuWrap2D(const int N, DBODY &&d_body,
507  const int X, const int Y, const int BZ)
508 {
509  if (N==0) { return; }
510  MFEM_VERIFY(BZ>0, "");
511  const int GRID = (N+BZ-1)/BZ;
512  const dim3 BLCK(X,Y,BZ);
513  CuKernel2D<<<GRID,BLCK>>>(N,d_body);
514  MFEM_GPU_CHECK(cudaGetLastError());
515 }
516 
517 template <typename DBODY>
518 void CuWrap3D(const int N, DBODY &&d_body,
519  const int X, const int Y, const int Z, const int G)
520 {
521  if (N==0) { return; }
522  const int GRID = G == 0 ? N : G;
523  const dim3 BLCK(X,Y,Z);
524  CuKernel3D<<<GRID,BLCK>>>(N,d_body);
525  MFEM_GPU_CHECK(cudaGetLastError());
526 }
527 
528 template <int Dim>
529 struct CuWrap;
530 
531 template <>
532 struct CuWrap<1>
533 {
534  template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
535  static void run(const int N, DBODY &&d_body,
536  const int X, const int Y, const int Z, const int G)
537  {
538  CuWrap1D<BLCK>(N, d_body);
539  }
540 };
541 
542 template <>
543 struct CuWrap<2>
544 {
545  template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
546  static void run(const int N, DBODY &&d_body,
547  const int X, const int Y, const int Z, const int G)
548  {
549  CuWrap2D(N, d_body, X, Y, Z);
550  }
551 };
552 
553 template <>
554 struct CuWrap<3>
555 {
556  template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
557  static void run(const int N, DBODY &&d_body,
558  const int X, const int Y, const int Z, const int G)
559  {
560  CuWrap3D(N, d_body, X, Y, Z, G);
561  }
562 };
563 
564 #endif // MFEM_USE_CUDA
565 
566 
567 /// HIP backend
568 #ifdef MFEM_USE_HIP
569 
570 template <typename BODY> __global__ static
571 void HipKernel1D(const int N, BODY body)
572 {
573  const int k = hipBlockDim_x*hipBlockIdx_x + hipThreadIdx_x;
574  if (k >= N) { return; }
575  body(k);
576 }
577 
578 template <typename BODY> __global__ static
579 void HipKernel2D(const int N, BODY body)
580 {
581  const int k = hipBlockIdx_x*hipBlockDim_z + hipThreadIdx_z;
582  if (k >= N) { return; }
583  body(k);
584 }
585 
586 template <typename BODY> __global__ static
587 void HipKernel3D(const int N, BODY body)
588 {
589  for (int k = hipBlockIdx_x; k < N; k += hipGridDim_x) { body(k); }
590 }
591 
592 template <const int BLCK = MFEM_HIP_BLOCKS, typename DBODY>
593 void HipWrap1D(const int N, DBODY &&d_body)
594 {
595  if (N==0) { return; }
596  const int GRID = (N+BLCK-1)/BLCK;
597  hipLaunchKernelGGL(HipKernel1D,GRID,BLCK,0,0,N,d_body);
598  MFEM_GPU_CHECK(hipGetLastError());
599 }
600 
601 template <typename DBODY>
602 void HipWrap2D(const int N, DBODY &&d_body,
603  const int X, const int Y, const int BZ)
604 {
605  if (N==0) { return; }
606  const int GRID = (N+BZ-1)/BZ;
607  const dim3 BLCK(X,Y,BZ);
608  hipLaunchKernelGGL(HipKernel2D,GRID,BLCK,0,0,N,d_body);
609  MFEM_GPU_CHECK(hipGetLastError());
610 }
611 
612 template <typename DBODY>
613 void HipWrap3D(const int N, DBODY &&d_body,
614  const int X, const int Y, const int Z, const int G)
615 {
616  if (N==0) { return; }
617  const int GRID = G == 0 ? N : G;
618  const dim3 BLCK(X,Y,Z);
619  hipLaunchKernelGGL(HipKernel3D,GRID,BLCK,0,0,N,d_body);
620  MFEM_GPU_CHECK(hipGetLastError());
621 }
622 
623 template <int Dim>
624 struct HipWrap;
625 
626 template <>
627 struct HipWrap<1>
628 {
629  template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
630  static void run(const int N, DBODY &&d_body,
631  const int X, const int Y, const int Z, const int G)
632  {
633  HipWrap1D<BLCK>(N, d_body);
634  }
635 };
636 
637 template <>
638 struct HipWrap<2>
639 {
640  template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
641  static void run(const int N, DBODY &&d_body,
642  const int X, const int Y, const int Z, const int G)
643  {
644  HipWrap2D(N, d_body, X, Y, Z);
645  }
646 };
647 
648 template <>
649 struct HipWrap<3>
650 {
651  template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
652  static void run(const int N, DBODY &&d_body,
653  const int X, const int Y, const int Z, const int G)
654  {
655  HipWrap3D(N, d_body, X, Y, Z, G);
656  }
657 };
658 
659 #endif // MFEM_USE_HIP
660 
661 
662 /// The forall kernel body wrapper
663 template <const int DIM, typename d_lambda, typename h_lambda>
664 inline void ForallWrap(const bool use_dev, const int N,
665  d_lambda &&d_body, h_lambda &&h_body,
666  const int X=0, const int Y=0, const int Z=0,
667  const int G=0)
668 {
669  MFEM_CONTRACT_VAR(X);
670  MFEM_CONTRACT_VAR(Y);
671  MFEM_CONTRACT_VAR(Z);
672  MFEM_CONTRACT_VAR(G);
673  MFEM_CONTRACT_VAR(d_body);
674  if (!use_dev) { goto backend_cpu; }
675 
676 #if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_CUDA)
677  // If Backend::RAJA_CUDA is allowed, use it
678  if (Device::Allows(Backend::RAJA_CUDA))
679  {
680  return RajaCuWrap<DIM>::run(N, d_body, X, Y, Z, G);
681  }
682 #endif
683 
684 #if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_HIP)
685  // If Backend::RAJA_HIP is allowed, use it
686  if (Device::Allows(Backend::RAJA_HIP))
687  {
688  return RajaHipWrap<DIM>::run(N, d_body, X, Y, Z, G);
689  }
690 #endif
691 
692 #ifdef MFEM_USE_CUDA
693  // If Backend::CUDA is allowed, use it
694  if (Device::Allows(Backend::CUDA))
695  {
696  return CuWrap<DIM>::run(N, d_body, X, Y, Z, G);
697  }
698 #endif
699 
700 #ifdef MFEM_USE_HIP
701  // If Backend::HIP is allowed, use it
702  if (Device::Allows(Backend::HIP))
703  {
704  return HipWrap<DIM>::run(N, d_body, X, Y, Z, G);
705  }
706 #endif
707 
708  // If Backend::DEBUG_DEVICE is allowed, use it
709  if (Device::Allows(Backend::DEBUG_DEVICE)) { goto backend_cpu; }
710 
711 #if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_OPENMP)
712  // If Backend::RAJA_OMP is allowed, use it
713  if (Device::Allows(Backend::RAJA_OMP)) { return RajaOmpWrap(N, h_body); }
714 #endif
715 
716 #ifdef MFEM_USE_OPENMP
717  // If Backend::OMP is allowed, use it
718  if (Device::Allows(Backend::OMP)) { return OmpWrap(N, h_body); }
719 #endif
720 
721 #ifdef MFEM_USE_RAJA
722  // If Backend::RAJA_CPU is allowed, use it
723  if (Device::Allows(Backend::RAJA_CPU)) { return RajaSeqWrap(N, h_body); }
724 #endif
725 
726 backend_cpu:
727  // Handle Backend::CPU. This is also a fallback for any allowed backends not
728  // handled above, e.g. OCCA_CPU with configuration 'occa-cpu,cpu', or
729  // OCCA_OMP with configuration 'occa-omp,cpu'.
730  for (int k = 0; k < N; k++) { h_body(k); }
731 }
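// Illustration (hedged, not from the MFEM sources): the branch taken above
// follows from the run-time Device configuration. For example, with a CUDA
// build:
//
//    mfem::Device device("cuda");   // enables Backend::CUDA
//    mfem::forall(n, [=] MFEM_HOST_DEVICE (int i) { d_y[i] = 0.0; });
//    // -> dispatches through CuWrap<1>::run (or RajaCuWrap<1>::run when the
//    //    "raja-cuda" backend is selected instead)
//
// The device string and the pointer d_y are assumptions for the example.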
732 
733 template <const int DIM, typename lambda>
734 inline void ForallWrap(const bool use_dev, const int N, lambda &&body,
735  const int X=0, const int Y=0, const int Z=0,
736  const int G=0)
737 {
738  ForallWrap<DIM>(use_dev, N, body, body, X, Y, Z, G);
739 }
740 
741 template<typename lambda>
742 inline void forall(int N, lambda &&body) { ForallWrap<1>(true, N, body); }
743 
744 template<typename lambda>
745 inline void forall_switch(bool use_dev, int N, lambda &&body)
746 {
747  ForallWrap<1>(use_dev, N, body);
748 }
749 
750 template<typename lambda>
751 inline void forall_2D(int N, int X, int Y, lambda &&body)
752 {
753  ForallWrap<2>(true, N, body, X, Y, 1);
754 }
755 
756 template<typename lambda>
757 inline void forall_2D_batch(int N, int X, int Y, int BZ, lambda &&body)
758 {
759  ForallWrap<2>(true, N, body, X, Y, BZ);
760 }
761 
762 template<typename lambda>
763 inline void forall_3D(int N, int X, int Y, int Z, lambda &&body)
764 {
765  ForallWrap<3>(true, N, body, X, Y, Z, 0);
766 }
767 
768 template<typename lambda>
769 inline void forall_3D_grid(int N, int X, int Y, int Z, int G, lambda &&body)
770 {
771  ForallWrap<3>(true, N, body, X, Y, Z, G);
772 }
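// Usage sketch (illustrative, not from the MFEM sources): the function-style
// wrappers above mirror the MFEM_FORALL macros with explicit lambdas. The
// names d_x, d_y, ne and q1d are assumptions.
//
//    mfem::forall(n, [=] MFEM_HOST_DEVICE (int i) { d_y[i] = 2.0*d_x[i]; });
//
//    mfem::forall_2D(ne, q1d, q1d, [=] MFEM_HOST_DEVICE (int e)
//    {
//       // a block of q1d x q1d threads cooperates on element e
//    });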
773 
774 } // namespace mfem
775 
776 #endif // MFEM_FORALL_HPP