Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
67 commits
Select commit Hold shift + click to select a range
cd5065e
initial commit for launch loop optimization
artv3 Nov 25, 2025
484ff1a
add structs to store gpu thread/block info in launch ctx
artv3 Nov 25, 2025
18f332b
add cuda variant and add build guards for cpu
artv3 Dec 2, 2025
21f6184
Merge branch 'develop' into artv3/launch-loop-opt
artv3 Dec 2, 2025
73f224a
rework to support dim3 copy in ctx
artv3 Dec 11, 2025
8a02fee
Merge branch 'artv3/launch-loop-opt' of https://github.com/LLNL/RAJA …
artv3 Dec 11, 2025
1fbe50b
minor clean up pass
artv3 Dec 11, 2025
672889e
make format
artv3 Dec 11, 2025
5908a20
Update include/RAJA/pattern/launch/launch_core.hpp
artv3 Dec 11, 2025
316e019
Merge branch 'develop' into artv3/launch-loop-opt
rhornung67 Dec 15, 2025
4d9f800
clean up pass
artv3 Dec 18, 2025
d9ce271
update with develop and fix merge conflicts
artv3 Dec 18, 2025
85aef5a
fix build error
artv3 Dec 18, 2025
0469302
take develop submodule
artv3 Dec 18, 2025
4a695f2
cuda backend
artv3 Dec 18, 2025
f91a498
make style
artv3 Dec 18, 2025
d21c41f
omp backend
artv3 Dec 18, 2025
40a5c1b
seq backend + make style
artv3 Dec 18, 2025
e0f4825
clean up pass
artv3 Dec 18, 2025
96e99d5
Update include/RAJA/pattern/launch/launch_context_policy.hpp
artv3 Dec 18, 2025
a9f0cca
minor clean up
artv3 Dec 18, 2025
7d4595b
minor clean up
artv3 Dec 18, 2025
c23f76f
Merge branch 'artv3/launch-loop-opt' of github.com:LLNL/RAJA into art…
artv3 Dec 18, 2025
c990a4f
revert changes to example
artv3 Dec 18, 2025
f7939fd
remove specialization from launch policy
artv3 Dec 18, 2025
c24331c
make work for function pointers
artv3 Dec 18, 2025
0518138
store dim3 based on launch context type - hip
artv3 Dec 19, 2025
d5da29a
rework omp backend
artv3 Dec 19, 2025
af88dbb
update sequential backend
artv3 Dec 19, 2025
21ad0a8
get things building for cuda -- need a good clean up pass
artv3 Dec 19, 2025
646a95b
cuda clean up pass
artv3 Dec 19, 2025
597641b
clean up ordering in hip launch
artv3 Dec 19, 2025
5403737
clean up ordering
artv3 Dec 19, 2025
e41e970
make style
artv3 Dec 19, 2025
7c95430
use constexpr for getting dim values
artv3 Dec 19, 2025
d7cbbb5
Add classes that can cache Idx/Dim
MrBurmark Dec 19, 2025
bfe72de
merge develop, fix conflict
artv3 Jan 19, 2026
e494dac
Merge branch 'feature/burmark1/cache_idx_dim' into artv3/launch-loop-opt
artv3 Feb 27, 2026
5c88a4d
use cache idx in launch
artv3 Feb 27, 2026
960f0b7
remove dead code
artv3 Feb 27, 2026
aa3186c
clean up pass
artv3 Mar 2, 2026
7e79393
clean up code
artv3 Mar 2, 2026
e8e5e6d
have it also work for cuda
artv3 Mar 2, 2026
97c5edd
simplify helper functions
artv3 Mar 2, 2026
4ffefda
clean up pass
artv3 Mar 2, 2026
c2135ed
minor clean up
artv3 Mar 2, 2026
f5218ef
Update include/RAJA/policy/cuda/launch.hpp
artv3 Mar 3, 2026
93b3456
update the way we get index data
artv3 Mar 3, 2026
f36a2ce
clean up pass
artv3 Mar 3, 2026
0e18deb
default needs the indices and dims struct
artv3 Mar 3, 2026
4a5c0a6
clean up pass
artv3 Mar 3, 2026
f078f0c
make style
artv3 Mar 3, 2026
26a00a3
Merge branch 'develop' into artv3/launch-loop-opt
artv3 Mar 3, 2026
75d9fc8
clean up pass
artv3 Mar 3, 2026
0954fdb
clean up pass
artv3 Mar 3, 2026
4936ad3
clean up pass
artv3 Mar 3, 2026
78f5ba3
clean up pass
artv3 Mar 3, 2026
67d52d7
make style
artv3 Mar 3, 2026
b6e45b3
clean up pass
artv3 Mar 3, 2026
140e88d
minor move to base
artv3 Mar 3, 2026
eb98047
clean up pass
artv3 Mar 4, 2026
a05e982
make style
artv3 Mar 4, 2026
f7237c1
make style
artv3 Mar 4, 2026
f482fff
PR comments
artv3 Mar 4, 2026
882157b
make style
artv3 Mar 4, 2026
cf8a88f
Update include/RAJA/pattern/launch/launch_core.hpp
artv3 Mar 4, 2026
a40aa65
Update include/RAJA/pattern/launch/launch_core.hpp
artv3 Mar 4, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions examples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,10 @@ raja_add_executable(
NAME raja-launch
SOURCES raja-launch.cpp)

raja_add_executable(
NAME gpu-launch-context-indices
SOURCES gpu-launch-context-indices.cpp)

raja_add_executable(
NAME launch_matrix-multiply
SOURCES launch_matrix-multiply.cpp)
Expand Down
167 changes: 167 additions & 0 deletions examples/gpu-launch-context-indices.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
// Copyright (c) Lawrence Livermore National Security, LLC and other
// RAJA Project Developers. See top-level LICENSE and COPYRIGHT
// files for dates and other details. No copyright assignment is required
// to contribute to RAJA.
//
// SPDX-License-Identifier: (BSD-3-Clause)
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//

#include <iostream>

#include "RAJA/RAJA.hpp"

/*
* RAJA Launch Example: LaunchContext index/dimension caching (CUDA/HIP)
*
* RAJA launch kernels receive a "launch context" object (ctx) that provides
* access to execution details needed by hierarchical kernels, such as:
* - team (block) indices and dimensions
* - thread indices and dimensions
*
* Many RAJA launch patterns (multiple nested RAJA::loop regions, multiple uses
* of indices/dims, etc.) can lead to repeated queries of the underlying device
* intrinsics (e.g., blockIdx.x, threadIdx.x, blockDim.x). RAJA provides
* LaunchContext policies that control whether those values are cached within
* the context object on first access and then reused.
*
* This example selects the "all cached indices and dims" policy for CUDA/HIP
* and runs a simple teams/threads kernel that writes `d_array[i] = i`.
*/

// Per-backend compile-time configuration for this example. Each enabled GPU
// backend provides a specialization mapping a tag type to the resource,
// launch, context-caching, and loop policies it needs.
template<typename Backend>
struct BackendTraits;

#if defined(RAJA_ENABLE_HIP)
// Tag type selecting the HIP specialization below.
struct HipBackend;

template<>
struct BackendTraits<HipBackend>
{
  static constexpr const char* name = "HIP";
  using device_res_t = RAJA::resources::Hip;
  using launch_t = RAJA::hip_launch_t<true>;
  // Cache all indices/dimensions accessed through the launch context:
  // threadIdx, blockDim, blockIdx, gridDim (in that flag order) are cached.
  using cache_policy_t = RAJA::HipIndicesAndDims<true, true, true, true>;
  using ctx_policy_t = RAJA::HipLaunchContextIndicesAndDimsPolicy<cache_policy_t>;
  using block_x_direct_t = RAJA::hip_block_x_direct;
  // NOTE(review): alias is named "direct" but intentionally uses the *_loop
  // thread policy -- the kernel iterates more logical indices than physical
  // threads to exercise the loop mapping (see run_example).
  using thread_x_direct_t = RAJA::hip_thread_x_loop;
};
#endif

#if defined(RAJA_ENABLE_CUDA)
// Tag type selecting the CUDA specialization below.
struct CudaBackend;

template<>
struct BackendTraits<CudaBackend>
{
  static constexpr const char* name = "CUDA";
  using device_res_t = RAJA::resources::Cuda;
  using launch_t = RAJA::cuda_launch_t<true>;
  // Cache all indices/dimensions accessed through the launch context:
  // threadIdx, blockDim, blockIdx, gridDim (in that flag order) are cached.
  using cache_policy_t = RAJA::CudaIndicesAndDims<true, true, true, true>;
  using ctx_policy_t = RAJA::CudaLaunchContextIndicesAndDimsPolicy<cache_policy_t>;
  using block_x_direct_t = RAJA::cuda_block_x_direct;
  // NOTE(review): alias is named "direct" but intentionally uses the *_loop
  // thread policy -- the kernel iterates more logical indices than physical
  // threads to exercise the loop mapping (see run_example).
  using thread_x_direct_t = RAJA::cuda_thread_x_loop;
};
#endif

template<typename Backend>
int run_example()
{
using T = BackendTraits<Backend>;

std::cout << "\n Running RAJA " << T::name
<< " launch-context indices/dims caching example...\n";

constexpr int N = 64;
constexpr int BLOCK_DIM = 32;
constexpr int GRID_DIM = 1;

typename T::device_res_t device_res;
RAJA::resources::Host host_res;

int* d_array = device_res.template allocate<int>(N);
int* h_array = host_res.allocate<int>(N);

for (int i = 0; i < N; ++i)
{
h_array[i] = -1;
}
device_res.memcpy(d_array, h_array, sizeof(int) * N);

using launch_policy = RAJA::LaunchPolicy<typename T::launch_t>;
// LaunchContextT binds a LaunchContext policy to the context type.
using Ctx = RAJA::LaunchContextT<typename T::ctx_policy_t>;
using teams_x = RAJA::LoopPolicy<typename T::block_x_direct_t>;
using threads_x = RAJA::LoopPolicy<typename T::thread_x_direct_t>;

RAJA::launch<launch_policy>(
device_res,
RAJA::LaunchParams(RAJA::Teams(GRID_DIM), RAJA::Threads(BLOCK_DIM)),
[=] RAJA_HOST_DEVICE(Ctx ctx) {
// The nested loops below will access team/thread indices/dimensions via
// the launch context. With the "all cached" policy, those values are
// cached in `ctx` the first time they are needed.

RAJA::loop<teams_x>(ctx, RAJA::RangeSegment(0, GRID_DIM), [&](int bx) {

// Iterate over more logical thread-iterations than the physical
// thread dimension to exercise the *_thread_x_loop mapping.
RAJA::loop<threads_x>(ctx, RAJA::RangeSegment(0, 2 * BLOCK_DIM),
[&](int tx) {

if (tx < N)
{
d_array[tx] = tx;
}

});
});
});

device_res.memcpy(h_array, d_array, sizeof(int) * N);

int err_count = 0;
for (int i = 0; i < N; ++i)
{
if (h_array[i] != i)
{
++err_count;
}
}

std::cout << " Result -- " << (err_count ? "FAIL" : "PASS") << "\n";
if (err_count)
{
std::cout << " error count = " << err_count << "\n";
}

device_res.deallocate(d_array);
host_res.deallocate(h_array);

return (err_count ? 1 : 0);
}

// Entry point: runs the example for each GPU backend enabled in the build.
// Returns nonzero if any backend's validation failed.
int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[]))
{
#if defined(RAJA_ENABLE_HIP) || defined(RAJA_ENABLE_CUDA)
  // Accumulate failures across all enabled backends.
  int err_count = 0;

#if defined(RAJA_ENABLE_HIP)
  err_count += run_example<HipBackend>();
#endif

#if defined(RAJA_ENABLE_CUDA)
  err_count += run_example<CudaBackend>();
#endif

  std::cout << "\n DONE!...\n";
  return (err_count ? 1 : 0);
#else
  // Neither GPU backend is enabled; this example has nothing to demonstrate.
  std::cout << "Please build with HIP or CUDA to run this example ...\n";
  return 0;
#endif
}
97 changes: 97 additions & 0 deletions include/RAJA/pattern/launch/launch_context_policy.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
/*!
******************************************************************************
*
* \file
*
* \brief RAJA header file containing a helper to
* determine the launch context type
*
******************************************************************************
*/

//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
// Copyright (c) Lawrence Livermore National Security, LLC and other
// RAJA Project Developers. See top-level LICENSE and COPYRIGHT
// files for dates and other details. No copyright assignment is required
// to contribute to RAJA.
//
// SPDX-License-Identifier: (BSD-3-Clause)
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//

#ifndef RAJA_pattern_context_policy_HPP
#define RAJA_pattern_context_policy_HPP

#include <type_traits>

namespace RAJA
{

template<typename LaunchContextPolicy>
class LaunchContextT;
Comment thread
MrBurmark marked this conversation as resolved.

class LaunchContextHostPolicy;

namespace detail
{

// Extracts the type of a callable's first parameter. Left undefined for
// shapes with no recognizable signature, so `typename
// first_argument<T>::type` is SFINAE-friendly (see launch_context_type).
template<typename T>
struct first_argument;

// Plain function types: R(Arg0, Args...).
template<typename R, typename Arg0, typename... Args>
struct first_argument<R(Arg0, Args...)>
{
  using type = Arg0;
};

// noexcept function types (noexcept is part of the type since C++17).
template<typename R, typename Arg0, typename... Args>
struct first_argument<R(Arg0, Args...) noexcept>
    : first_argument<R(Arg0, Args...)>
{};

// Function pointers. A free function passed to launch decays to a pointer
// (camp::decay), so without these specializations function-pointer kernels
// would silently fall back to the default (host) launch context.
template<typename R, typename Arg0, typename... Args>
struct first_argument<R (*)(Arg0, Args...)> : first_argument<R(Arg0, Args...)>
{};

template<typename R, typename Arg0, typename... Args>
struct first_argument<R (*)(Arg0, Args...) noexcept>
    : first_argument<R(Arg0, Args...)>
{};

// Pointer-to-member functions (the operator() of functors and lambdas),
// covering all const/noexcept combinations.
template<typename C, typename R, typename Arg0, typename... Args>
struct first_argument<R (C::*)(Arg0, Args...)>
    : first_argument<R(Arg0, Args...)>
{};

template<typename C, typename R, typename Arg0, typename... Args>
struct first_argument<R (C::*)(Arg0, Args...) const>
    : first_argument<R(Arg0, Args...)>
{};

template<typename C, typename R, typename Arg0, typename... Args>
struct first_argument<R (C::*)(Arg0, Args...) noexcept>
    : first_argument<R(Arg0, Args...)>
{};

template<typename C, typename R, typename Arg0, typename... Args>
struct first_argument<R (C::*)(Arg0, Args...) const noexcept>
    : first_argument<R(Arg0, Args...)>
{};

// Maps a callable type T to something first_argument can inspect.
//
// Primary template: used when T has no addressable operator() (e.g. T is a
// function type or function pointer); the decayed type is inspected directly.
template<typename T, typename = void>
struct callable_signature
{
  using type = camp::decay<T>;
};

// Specialization for functors/lambdas with a single, non-template
// operator(): yields its pointer-to-member type.
// NOTE(review): generic lambdas (templated operator()) cannot match
// `&operator()` here and fall through to the primary template -- confirm
// that is the intended behavior for generic kernel bodies.
template<typename T>
struct callable_signature<T, std::void_t<decltype(&camp::decay<T>::operator())>>
{
  using type = decltype(&camp::decay<T>::operator());
};

// Deduces the launch-context type a kernel body expects as its first
// parameter.
//
// Primary template: when first_argument cannot resolve a first parameter
// type (the void_t probe below fails), fall back to the host context.
template<typename T, typename = void>
struct launch_context_type
{
  using type = LaunchContextT<LaunchContextHostPolicy>;
};

// Specialization: when the callable's signature is visible, the context type
// is the decayed type of its first parameter.
template<typename T>
struct launch_context_type<T,
                           std::void_t<typename first_argument<camp::decay<
                               typename callable_signature<T>::type>>::type>>
{
  using type = camp::decay<
      typename first_argument<typename callable_signature<T>::type>::type>;
};


} // namespace detail

} // namespace RAJA
#endif
50 changes: 33 additions & 17 deletions include/RAJA/pattern/launch/launch_core.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,22 @@

#include "RAJA/config.hpp"
#include "RAJA/internal/get_platform.hpp"
#include "RAJA/pattern/launch/launch_context_policy.hpp"
#include "RAJA/util/StaticLayout.hpp"
#include "RAJA/util/macros.hpp"
#include "RAJA/util/plugins.hpp"
#include "RAJA/util/types.hpp"

// Needed to provide a default indices/dims implementation for LaunchContext
// when compiling for GPU backends. The default launch context is used by
// existing examples and user code (e.g. RAJA::LaunchContext), but device-side
// index mappers require an indices/dims object.
#if defined(RAJA_HIP_ACTIVE)
#include "RAJA/policy/hip/policy.hpp"
#elif defined(RAJA_CUDA_ACTIVE)
#include "RAJA/policy/cuda/policy.hpp"
#endif

#include "camp/camp.hpp"
#include "camp/concepts.hpp"
#include "camp/tuple.hpp"
Expand Down Expand Up @@ -176,21 +188,21 @@ struct LaunchParams
Threads apply(Threads const& a) { return (threads = a); }
};

class LaunchContext
class LaunchContextBase
{
public:
// Bump style allocator used to
// get memory from the pool
size_t shared_mem_offset;

void* shared_mem_ptr;

// In the future move this into a derived class.
#if defined(RAJA_SYCL_ACTIVE)
// SGS ODR issue
mutable ::sycl::nd_item<3>* itm;
#endif
Comment thread
MrBurmark marked this conversation as resolved.

RAJA_HOST_DEVICE LaunchContext()
RAJA_HOST_DEVICE LaunchContextBase()
: shared_mem_offset(0),
shared_mem_ptr(nullptr)
{}
Expand All @@ -209,20 +221,6 @@ class LaunchContext
return static_cast<T*>(mem_ptr);
}

/*
//Odd dependecy with atomics is breaking CI builds
template<typename T, size_t DIM, typename IDX_T=RAJA::Index_type, ptrdiff_t
z_stride=DIM-1, typename arg, typename... args> RAJA_HOST_DEVICE auto
getSharedMemoryView(size_t bytes, arg idx, args... idxs)
{
T * mem_ptr = &((T*) shared_mem_ptr)[shared_mem_offset];

shared_mem_offset += bytes*sizeof(T);
return RAJA::View<T, RAJA::Layout<DIM, IDX_T, z_stride>>(mem_ptr, idx,
idxs...);
}
*/

RAJA_HOST_DEVICE void releaseSharedMemory()
{
// On the cpu/gpu we want to restart the count
Expand All @@ -243,6 +241,24 @@ class LaunchContext
}
};

// Host (CPU) launch context: adds no state of its own; it inherits the
// shared-memory bump-allocator interface from LaunchContextBase unchanged.
template<>
class LaunchContextT<LaunchContextHostPolicy> : public LaunchContextBase
{
public:
  using LaunchContextBase::LaunchContextBase;
};

// Preserve backwards compatibility: RAJA::LaunchContext remains a valid
// context type for existing user code. When a GPU backend is active it is
// bound to the non-cached indices/dims policy (presumably each query goes to
// the device intrinsics directly -- see the policy definitions to confirm);
// otherwise it is the host context.
#if defined(RAJA_HIP_ACTIVE)
using LaunchContext =
    LaunchContextT<HipLaunchContextNonCachedIndicesAndDimsPolicy>;
#elif defined(RAJA_CUDA_ACTIVE)
using LaunchContext =
    LaunchContextT<CudaLaunchContextNonCachedIndicesAndDimsPolicy>;
#else
using LaunchContext = LaunchContextT<LaunchContextHostPolicy>;
#endif

template<typename LAUNCH_POLICY>
struct LaunchExecute;

Expand Down
Loading