Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
67 commits
Select commit Hold shift + click to select a range
cd5065e
initial commit for launch loop optimization
artv3 Nov 25, 2025
484ff1a
add structs to store gpu thread/block info in launch ctx
artv3 Nov 25, 2025
18f332b
add cuda variant and add build guards for cpu
artv3 Dec 2, 2025
21f6184
Merge branch 'develop' into artv3/launch-loop-opt
artv3 Dec 2, 2025
73f224a
rework to support dim3 copy in ctx
artv3 Dec 11, 2025
8a02fee
Merge branch 'artv3/launch-loop-opt' of https://github.com/LLNL/RAJA …
artv3 Dec 11, 2025
1fbe50b
minor clean up pass
artv3 Dec 11, 2025
672889e
make format
artv3 Dec 11, 2025
5908a20
Update include/RAJA/pattern/launch/launch_core.hpp
artv3 Dec 11, 2025
316e019
Merge branch 'develop' into artv3/launch-loop-opt
rhornung67 Dec 15, 2025
4d9f800
clean up pass
artv3 Dec 18, 2025
d9ce271
update with develop and fix merge conflicts
artv3 Dec 18, 2025
85aef5a
fix build error
artv3 Dec 18, 2025
0469302
take develop submodule
artv3 Dec 18, 2025
4a695f2
cuda backend
artv3 Dec 18, 2025
f91a498
make style
artv3 Dec 18, 2025
d21c41f
omp backend
artv3 Dec 18, 2025
40a5c1b
seq backend + make style
artv3 Dec 18, 2025
e0f4825
clean up pass
artv3 Dec 18, 2025
96e99d5
Update include/RAJA/pattern/launch/launch_context_policy.hpp
artv3 Dec 18, 2025
a9f0cca
minor clean up
artv3 Dec 18, 2025
7d4595b
minor clean up
artv3 Dec 18, 2025
c23f76f
Merge branch 'artv3/launch-loop-opt' of github.com:LLNL/RAJA into art…
artv3 Dec 18, 2025
c990a4f
revert changes to example
artv3 Dec 18, 2025
f7939fd
remove specialization from launch policy
artv3 Dec 18, 2025
c24331c
make work for function pointers
artv3 Dec 18, 2025
0518138
store dim3 based on launch context type - hip
artv3 Dec 19, 2025
d5da29a
rework omp backend
artv3 Dec 19, 2025
af88dbb
update sequential backend
artv3 Dec 19, 2025
21ad0a8
get things building for cuda -- need a good clean up pass
artv3 Dec 19, 2025
646a95b
cuda clean up pass
artv3 Dec 19, 2025
597641b
clean up ordering in hip launch
artv3 Dec 19, 2025
5403737
clean up ordering
artv3 Dec 19, 2025
e41e970
make style
artv3 Dec 19, 2025
7c95430
use constexpr for getting dim values
artv3 Dec 19, 2025
d7cbbb5
Add classes that can cache Idx/Dim
MrBurmark Dec 19, 2025
bfe72de
merge develop, fix conflict
artv3 Jan 19, 2026
e494dac
Merge branch 'feature/burmark1/cache_idx_dim' into artv3/launch-loop-opt
artv3 Feb 27, 2026
5c88a4d
use cache idx in launch
artv3 Feb 27, 2026
960f0b7
remove dead code
artv3 Feb 27, 2026
aa3186c
clean up pass
artv3 Mar 2, 2026
7e79393
clean up code
artv3 Mar 2, 2026
e8e5e6d
have it also work for cuda
artv3 Mar 2, 2026
97c5edd
simplify helper functions
artv3 Mar 2, 2026
4ffefda
clean up pass
artv3 Mar 2, 2026
c2135ed
minor clean up
artv3 Mar 2, 2026
f5218ef
Update include/RAJA/policy/cuda/launch.hpp
artv3 Mar 3, 2026
93b3456
update the way we get index data
artv3 Mar 3, 2026
f36a2ce
clean up pass
artv3 Mar 3, 2026
0e18deb
default needs the indices and dims struct
artv3 Mar 3, 2026
4a5c0a6
clean up pass
artv3 Mar 3, 2026
f078f0c
make style
artv3 Mar 3, 2026
26a00a3
Merge branch 'develop' into artv3/launch-loop-opt
artv3 Mar 3, 2026
75d9fc8
clean up pass
artv3 Mar 3, 2026
0954fdb
clean up pass
artv3 Mar 3, 2026
4936ad3
clean up pass
artv3 Mar 3, 2026
78f5ba3
clean up pass
artv3 Mar 3, 2026
67d52d7
make style
artv3 Mar 3, 2026
b6e45b3
clean up pass
artv3 Mar 3, 2026
140e88d
minor move to base
artv3 Mar 3, 2026
eb98047
clean up pass
artv3 Mar 4, 2026
a05e982
make style
artv3 Mar 4, 2026
f7237c1
make style
artv3 Mar 4, 2026
f482fff
PR comments
artv3 Mar 4, 2026
882157b
make style
artv3 Mar 4, 2026
cf8a88f
Update include/RAJA/pattern/launch/launch_core.hpp
artv3 Mar 4, 2026
a40aa65
Update include/RAJA/pattern/launch/launch_core.hpp
artv3 Mar 4, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions examples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,10 @@ raja_add_executable(
NAME raja-launch
SOURCES raja-launch.cpp)

raja_add_executable(
NAME gpu-launch-context-indices
SOURCES gpu-launch-context-indices.cpp)

raja_add_executable(
NAME launch_matrix-multiply
SOURCES launch_matrix-multiply.cpp)
Expand Down
167 changes: 167 additions & 0 deletions examples/gpu-launch-context-indices.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
// Copyright (c) Lawrence Livermore National Security, LLC and other
// RAJA Project Developers. See top-level LICENSE and COPYRIGHT
// files for dates and other details. No copyright assignment is required
// to contribute to RAJA.
//
// SPDX-License-Identifier: (BSD-3-Clause)
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//

#include <iostream>

#include "RAJA/RAJA.hpp"

/*
* RAJA Launch Example: LaunchContext index/dimension caching (CUDA/HIP)
*
* RAJA launch kernels receive a "launch context" object (ctx) that provides
* access to execution details needed by hierarchical kernels, such as:
* - team (block) indices and dimensions
* - thread indices and dimensions
*
* Many RAJA launch patterns (multiple nested RAJA::loop regions, multiple uses
* of indices/dims, etc.) can lead to repeated queries of the underlying device
* intrinsics (e.g., blockIdx.x, threadIdx.x, blockDim.x). RAJA provides
* LaunchContext policies that control whether those values are cached within
* the context object on first access and then reused.
*
* This example selects the "all cached indices and dims" policy for CUDA/HIP
* and runs a simple teams/threads kernel that writes `d_array[i] = i`.
*/

// Per-backend compile-time configuration for this example. Each enabled GPU
// backend provides a specialization mapping a tag type to the resource,
// launch, context-caching, and loop policies it needs.
template<typename Backend>
struct BackendTraits;

#if defined(RAJA_ENABLE_HIP)
// Tag type selecting the HIP specialization below.
struct HipBackend;

template<>
struct BackendTraits<HipBackend>
{
  static constexpr const char* name = "HIP";
  using device_res_t = RAJA::resources::Hip;
  using launch_t = RAJA::hip_launch_t<true>;
  // Cache all indices/dimensions accessed through the launch context:
  // threadIdx, blockDim, blockIdx, gridDim (in that flag order) are cached.
  using cache_policy_t = RAJA::HipIndicesAndDims<true, true, true, true>;
  using ctx_policy_t = RAJA::HipLaunchContextIndicesAndDimsPolicy<cache_policy_t>;
  using block_x_direct_t = RAJA::hip_block_x_direct;
  // NOTE(review): alias is named "direct" but intentionally uses the *_loop
  // thread policy -- the kernel iterates more logical indices than physical
  // threads to exercise the loop mapping (see run_example).
  using thread_x_direct_t = RAJA::hip_thread_x_loop;
};
#endif

#if defined(RAJA_ENABLE_CUDA)
// Tag type selecting the CUDA specialization below.
struct CudaBackend;

template<>
struct BackendTraits<CudaBackend>
{
  static constexpr const char* name = "CUDA";
  using device_res_t = RAJA::resources::Cuda;
  using launch_t = RAJA::cuda_launch_t<true>;
  // Cache all indices/dimensions accessed through the launch context:
  // threadIdx, blockDim, blockIdx, gridDim (in that flag order) are cached.
  using cache_policy_t = RAJA::CudaIndicesAndDims<true, true, true, true>;
  using ctx_policy_t = RAJA::CudaLaunchContextIndicesAndDimsPolicy<cache_policy_t>;
  using block_x_direct_t = RAJA::cuda_block_x_direct;
  // NOTE(review): alias is named "direct" but intentionally uses the *_loop
  // thread policy -- the kernel iterates more logical indices than physical
  // threads to exercise the loop mapping (see run_example).
  using thread_x_direct_t = RAJA::cuda_thread_x_loop;
};
#endif

template<typename Backend>
int run_example()
{
using T = BackendTraits<Backend>;

std::cout << "\n Running RAJA " << T::name
<< " launch-context indices/dims caching example...\n";

constexpr int N = 64;
constexpr int BLOCK_DIM = 32;
constexpr int GRID_DIM = 1;

typename T::device_res_t device_res;
RAJA::resources::Host host_res;

int* d_array = device_res.template allocate<int>(N);
int* h_array = host_res.allocate<int>(N);

for (int i = 0; i < N; ++i)
{
h_array[i] = -1;
}
device_res.memcpy(d_array, h_array, sizeof(int) * N);

using launch_policy = RAJA::LaunchPolicy<typename T::launch_t>;
// LaunchContextT binds a LaunchContext policy to the context type.
using Ctx = RAJA::LaunchContextT<typename T::ctx_policy_t>;
using teams_x = RAJA::LoopPolicy<typename T::block_x_direct_t>;
using threads_x = RAJA::LoopPolicy<typename T::thread_x_direct_t>;

RAJA::launch<launch_policy>(
device_res,
RAJA::LaunchParams(RAJA::Teams(GRID_DIM), RAJA::Threads(BLOCK_DIM)),
[=] RAJA_HOST_DEVICE(Ctx ctx) {
// The nested loops below will access team/thread indices/dimensions via
// the launch context. With the "all cached" policy, those values are
// cached in `ctx` the first time they are needed.

RAJA::loop<teams_x>(ctx, RAJA::RangeSegment(0, GRID_DIM), [&](int bx) {

// Iterate over more logical thread-iterations than the physical
// thread dimension to exercise the *_thread_x_loop mapping.
RAJA::loop<threads_x>(ctx, RAJA::RangeSegment(0, 2 * BLOCK_DIM),
[&](int tx) {

if (tx < N)
{
d_array[tx] = tx;
}

});
});
});

device_res.memcpy(h_array, d_array, sizeof(int) * N);

int err_count = 0;
for (int i = 0; i < N; ++i)
{
if (h_array[i] != i)
{
++err_count;
}
}

std::cout << " Result -- " << (err_count ? "FAIL" : "PASS") << "\n";
if (err_count)
{
std::cout << " error count = " << err_count << "\n";
}

device_res.deallocate(d_array);
host_res.deallocate(h_array);

return (err_count ? 1 : 0);
}

// Entry point: runs the example for each GPU backend enabled in the build.
// Returns nonzero if any backend's validation failed.
int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[]))
{
#if defined(RAJA_ENABLE_HIP) || defined(RAJA_ENABLE_CUDA)
  // Accumulate failures across all enabled backends.
  int err_count = 0;

#if defined(RAJA_ENABLE_HIP)
  err_count += run_example<HipBackend>();
#endif

#if defined(RAJA_ENABLE_CUDA)
  err_count += run_example<CudaBackend>();
#endif

  std::cout << "\n DONE!...\n";
  return (err_count ? 1 : 0);
#else
  // Neither GPU backend is enabled; this example has nothing to demonstrate.
  std::cout << "Please build with HIP or CUDA to run this example ...\n";
  return 0;
#endif
}
97 changes: 97 additions & 0 deletions include/RAJA/pattern/launch/launch_context_policy.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
/*!
******************************************************************************
*
* \file
*
* \brief RAJA header file containing a helper to
* determine the launch context type
*
******************************************************************************
*/

//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
// Copyright (c) Lawrence Livermore National Security, LLC and other
// RAJA Project Developers. See top-level LICENSE and COPYRIGHT
// files for dates and other details. No copyright assignment is required
// to contribute to RAJA.
//
// SPDX-License-Identifier: (BSD-3-Clause)
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//

#ifndef RAJA_pattern_context_policy_HPP
#define RAJA_pattern_context_policy_HPP

#include <type_traits>

namespace RAJA
{

template<typename LaunchContextPolicy>
class LaunchContextT;
Comment thread
MrBurmark marked this conversation as resolved.

class LaunchContextHostPolicy;

namespace detail
{

// Extracts the type of a callable's first parameter. Left undefined for
// shapes with no recognizable signature, so `typename
// first_argument<T>::type` is SFINAE-friendly (see launch_context_type).
template<typename T>
struct first_argument;

// Plain function types: R(Arg0, Args...).
template<typename R, typename Arg0, typename... Args>
struct first_argument<R(Arg0, Args...)>
{
  using type = Arg0;
};

// noexcept function types (noexcept is part of the type since C++17).
template<typename R, typename Arg0, typename... Args>
struct first_argument<R(Arg0, Args...) noexcept>
    : first_argument<R(Arg0, Args...)>
{};

// Function pointers. A free function passed to launch decays to a pointer
// (camp::decay), so without these specializations function-pointer kernels
// would silently fall back to the default (host) launch context.
template<typename R, typename Arg0, typename... Args>
struct first_argument<R (*)(Arg0, Args...)> : first_argument<R(Arg0, Args...)>
{};

template<typename R, typename Arg0, typename... Args>
struct first_argument<R (*)(Arg0, Args...) noexcept>
    : first_argument<R(Arg0, Args...)>
{};

// Pointer-to-member functions (the operator() of functors and lambdas),
// covering all const/noexcept combinations.
template<typename C, typename R, typename Arg0, typename... Args>
struct first_argument<R (C::*)(Arg0, Args...)>
    : first_argument<R(Arg0, Args...)>
{};

template<typename C, typename R, typename Arg0, typename... Args>
struct first_argument<R (C::*)(Arg0, Args...) const>
    : first_argument<R(Arg0, Args...)>
{};

template<typename C, typename R, typename Arg0, typename... Args>
struct first_argument<R (C::*)(Arg0, Args...) noexcept>
    : first_argument<R(Arg0, Args...)>
{};

template<typename C, typename R, typename Arg0, typename... Args>
struct first_argument<R (C::*)(Arg0, Args...) const noexcept>
    : first_argument<R(Arg0, Args...)>
{};

// Maps a callable type T to something first_argument can inspect.
//
// Primary template: used when T has no addressable operator() (e.g. T is a
// function type or function pointer); the decayed type is inspected directly.
template<typename T, typename = void>
struct callable_signature
{
  using type = camp::decay<T>;
};

// Specialization for functors/lambdas with a single, non-template
// operator(): yields its pointer-to-member type.
// NOTE(review): generic lambdas (templated operator()) cannot match
// `&operator()` here and fall through to the primary template -- confirm
// that is the intended behavior for generic kernel bodies.
template<typename T>
struct callable_signature<T, std::void_t<decltype(&camp::decay<T>::operator())>>
{
  using type = decltype(&camp::decay<T>::operator());
};

// Deduces the launch-context type a kernel body expects as its first
// parameter.
//
// Primary template: when first_argument cannot resolve a first parameter
// type (the void_t probe below fails), fall back to the host context.
template<typename T, typename = void>
struct launch_context_type
{
  using type = LaunchContextT<LaunchContextHostPolicy>;
};

// Specialization: when the callable's signature is visible, the context type
// is the decayed type of its first parameter.
template<typename T>
struct launch_context_type<T,
                           std::void_t<typename first_argument<camp::decay<
                               typename callable_signature<T>::type>>::type>>
{
  using type = camp::decay<
      typename first_argument<typename callable_signature<T>::type>::type>;
};


} // namespace detail

} // namespace RAJA
#endif
50 changes: 33 additions & 17 deletions include/RAJA/pattern/launch/launch_core.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,22 @@

#include "RAJA/config.hpp"
#include "RAJA/internal/get_platform.hpp"
#include "RAJA/pattern/launch/launch_context_policy.hpp"
#include "RAJA/util/StaticLayout.hpp"
#include "RAJA/util/macros.hpp"
#include "RAJA/util/plugins.hpp"
#include "RAJA/util/types.hpp"

// Needed to provide a default indices/dims implementation for LaunchContext
// when compiling for GPU backends. The default launch context is used by
// existing examples and user code (e.g. RAJA::LaunchContext), but device-side
// index mappers require an indices/dims object.
#if defined(RAJA_HIP_ACTIVE)
#include "RAJA/policy/hip/policy.hpp"
#elif defined(RAJA_CUDA_ACTIVE)
#include "RAJA/policy/cuda/policy.hpp"
#endif

#include "camp/camp.hpp"
#include "camp/concepts.hpp"
#include "camp/tuple.hpp"
Expand Down Expand Up @@ -176,21 +188,21 @@ struct LaunchParams
Threads apply(Threads const& a) { return (threads = a); }
};

class LaunchContext
class LaunchContextBase
{
public:
// Bump style allocator used to
// get memory from the pool
size_t shared_mem_offset;

void* shared_mem_ptr;

// In the future move this into a derived class.
#if defined(RAJA_SYCL_ACTIVE)
// SGS ODR issue
mutable ::sycl::nd_item<3>* itm;
#endif
Comment thread
MrBurmark marked this conversation as resolved.

RAJA_HOST_DEVICE LaunchContext()
RAJA_HOST_DEVICE LaunchContextBase()
: shared_mem_offset(0),
shared_mem_ptr(nullptr)
{}
Expand All @@ -209,20 +221,6 @@ class LaunchContext
return static_cast<T*>(mem_ptr);
}

/*
//Odd dependecy with atomics is breaking CI builds
template<typename T, size_t DIM, typename IDX_T=RAJA::Index_type, ptrdiff_t
z_stride=DIM-1, typename arg, typename... args> RAJA_HOST_DEVICE auto
getSharedMemoryView(size_t bytes, arg idx, args... idxs)
{
T * mem_ptr = &((T*) shared_mem_ptr)[shared_mem_offset];

shared_mem_offset += bytes*sizeof(T);
return RAJA::View<T, RAJA::Layout<DIM, IDX_T, z_stride>>(mem_ptr, idx,
idxs...);
}
*/

RAJA_HOST_DEVICE void releaseSharedMemory()
{
// On the cpu/gpu we want to restart the count
Expand All @@ -243,6 +241,24 @@ class LaunchContext
}
};

// Host (CPU) launch context: adds no state of its own; it inherits the
// shared-memory bump-allocator interface from LaunchContextBase unchanged.
template<>
class LaunchContextT<LaunchContextHostPolicy> : public LaunchContextBase
{
public:
  using LaunchContextBase::LaunchContextBase;
};

// Preserve backwards compatibility: RAJA::LaunchContext remains a valid
// context type for existing user code. When a GPU backend is active it is
// bound to the non-cached indices/dims policy (presumably each query goes to
// the device intrinsics directly -- see the policy definitions to confirm);
// otherwise it is the host context.
#if defined(RAJA_HIP_ACTIVE)
using LaunchContext =
    LaunchContextT<HipLaunchContextNonCachedIndicesAndDimsPolicy>;
#elif defined(RAJA_CUDA_ACTIVE)
using LaunchContext =
    LaunchContextT<CudaLaunchContextNonCachedIndicesAndDimsPolicy>;
#else
using LaunchContext = LaunchContextT<LaunchContextHostPolicy>;
#endif

template<typename LAUNCH_POLICY>
struct LaunchExecute;

Expand Down
Loading