Skip to content

Commit 3857bd7

Browse files
committed
Merge branch 'develop'
2 parents b85b321 + b33e35b commit 3857bd7

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

54 files changed

+3451
-665
lines changed

.clang-format

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ IndentWidth: 4
5555
IndentWrappedFunctionNames: false
5656
KeepEmptyLinesAtTheStartOfBlocks: false
5757
MaxEmptyLinesToKeep: 1
58-
NamespaceIndentation: Inner
58+
NamespaceIndentation: None
5959
PointerAlignment: Left
6060
ReflowComments: false
6161
SortIncludes: true

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ if(NOT CMAKE_BUILD_TYPE)
88
endif()
99

1010

11-
file(GLOB sources "${PROJECT_SOURCE_DIR}/src/*.cpp")
11+
file(GLOB sources "${PROJECT_SOURCE_DIR}/src/*.cpp" "${PROJECT_SOURCE_DIR}/src/*/*.cpp")
1212
add_library(${PROJECT_NAME} STATIC ${sources})
1313
set(KERNEL_LAUNCHER_CLANG_TIDY clang-tidy -checks=-*,readability-*,bugprone-*,-readability-magic-numbers,-readability-use-anyofallof,-readability-else-after-return)
1414

Makefile

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
BUILD_DIR=build
22

3-
fmt:
4-
clang-format -i include/kernel_launcher/*.h src/*.cpp tests/*.cpp examples/*/*.cu
3+
pretty:
4+
clang-format -i include/*.h include/*/*.h include/*/*/*.h src/*.cpp src/*/*.cpp tests/*.cpp examples/*/*.cu
5+
6+
fmt: pretty
57

68
test: ${BUILD_DIR}
79
cd ${BUILD_DIR} && make kernel_launcher_tests
@@ -11,7 +13,7 @@ ${BUILD_DIR}:
1113
mkdir ${BUILD_DIR}
1214
cd ${BUILD_DIR} && cmake -DKERNEL_LAUNCHER_BUILD_TEST=1 -DCMAKE_BUILD_TYPE=debug ..
1315

14-
all: fmt test
16+
all: pretty test
1517
clean:
1618

17-
.PHONY: fmt test all clean
19+
.PHONY: pretty fmt test all clean

README.md

Lines changed: 82 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,10 @@
1212

1313

1414

15-
_Kernel Launcher_ is a C++ library that makes it easy to dynamically compile _CUDA_ kernels at run time (using [NVRTC](https://docs.nvidia.com/cuda/nvrtc/index.html)) and call them in an easy type-safe way using C++ magic.
16-
Additionally, _Kernel Launcher_ supports exporting kernel specifications, to enable tuning by [Kernel Tuner](https://github.com/KernelTuner/kernel_tuner), and importing the tuning results, known as _wisdom_ files, back into the application.
15+
_Kernel Launcher_ is a C++ library that enables dynamic compilation _CUDA_ kernels at run time (using [NVRTC](https://docs.nvidia.com/cuda/nvrtc/index.html)) and launching them in an easy type-safe way using C++ magic.
16+
On top of that, Kernel Launcher supports _capturing_ kernel launches, to enable tuning by [Kernel Tuner](https://github.com/KernelTuner/kernel_tuner), and importing the tuning results, known as _wisdom_ files, back into the application.
17+
The result: highly efficient GPU applications with maximum portability.
18+
1719

1820

1921

@@ -23,32 +25,89 @@ Recommended installation is using CMake. See the [installation guide](https://ke
2325

2426
## Example
2527

26-
See the documentation for [examples](https://kerneltuner.github.io/kernel_launcher/example.html) or check out the [examples](https://github.com/KernelTuner/kernel_launcher/tree/master/examples) directory.
28+
There are many ways of using Kernel Launcher. See the documentation for [examples](https://kerneltuner.github.io/kernel_launcher/example.html) or check out the [examples](https://github.com/KernelTuner/kernel_launcher/tree/master/examples) directory.
29+
30+
31+
### Pragma-based API
32+
Below shows an example of using the pragma-based API, which allows existing CUDA kernels to be annotated with Kernel-Launcher-specific directives.
33+
34+
**kernel.cu**
35+
```cpp
36+
#pragma kernel tune(threads_per_block=32, 64, 128, 256, 512, 1024)
37+
#pragma kernel block_size(threads_per_block)
38+
#pragma kernel problem_size(n)
39+
#pragma kernel buffers(A[n], B[n], C[n])
40+
template <typename T>
41+
__global__ void vector_add(int n, T *C, const T *A, const T *B) {
42+
int i = blockIdx.x * blockDim.x + threadIdx.x;
43+
if (i < n) {
44+
C[i] = A[i] + B[i];
45+
}
46+
}
47+
```
2748
49+
**main.cpp**
2850
```cpp
2951
#include "kernel_launcher.h"
3052
3153
int main() {
54+
// Initialize CUDA memory. This is outside the scope of kernel_launcher.
55+
unsigned int n = 1000000;
56+
float *dev_A, *dev_B, *dev_C;
57+
/* cudaMalloc, cudaMemcpy, ... */
58+
3259
// Namespace alias.
3360
namespace kl = kernel_launcher;
3461
35-
// Create a kernel builder
36-
kl::KernelBuilder builder("vector_add", "vector_add_kernel.cu");
62+
// Launch the kernel! Again, the grid size and block size do not need to
63+
// be specified, they are calculated from the kernel specifications and
64+
// run-time arguments.
65+
kl::launch(
66+
kl::PragmaKernel("vector_add", "kernel.cu", {"float"}),
67+
n, dev_C, dev_A, dev_B
68+
);
69+
}
70+
71+
```
72+
73+
74+
### Builder-based API
75+
Below shows an example of the `KernelBuilder`-based API.
76+
This offers more flexiblity than the pragma-based API, but is also more verbose:
77+
78+
**kernel.cu**
79+
```cpp
80+
template <typename T>
81+
__global__ void vector_add(int n, T *C, const T *A, const T *B) {
82+
int i = blockIdx.x * blockDim.x + threadIdx.x;
83+
if (i < n) {
84+
C[i] = A[i] + B[i];
85+
}
86+
}
87+
```
88+
89+
**main.cpp**
90+
```cpp
91+
#include "kernel_launcher.h"
92+
93+
int main() {
94+
// Namespace alias.
95+
namespace kl = kernel_launcher;
3796
3897
// Define the variables that can be tuned for this kernel.
39-
auto threads_per_block = builder.tune("block_size", {32, 64, 128, 256, 512, 1024});
40-
auto elements_per_thread = builder.tune("elements_per_thread", {1, 2, 4, 8});
98+
auto space = kl::ConfigSpace();
99+
auto threads_per_block = space.tune("block_size", {32, 64, 128, 256, 512, 1024});
41100
42-
// Set kernel properties such as block size, grid divisor, template arguments, etc.
101+
// Create a kernel builder and set kernel properties such as block size,
102+
// grid divisor, template arguments, etc.
103+
auto builder = kl::KernelBuilder("vector_add", "kernel.cu", space);
43104
builder
44-
.problem_size(kl::arg0)
45-
.block_size(threads_per_block)
46-
.grid_divisors(threads_per_block * elements_per_thread)
47105
.template_args(kl::type_of<float>())
48-
.define("ELEMENTS_PER_THREAD", elements_per_thread);
106+
.problem_size(kl::arg0)
107+
.block_size(threads_per_block);
49108
50109
// Define the kernel
51-
kl::WisdomKernel vector_add_kernel(builder);
110+
auto vector_add_kernel = kl::WisdomKernel(builder);
52111
53112
// Initialize CUDA memory. This is outside the scope of kernel_launcher.
54113
unsigned int n = 1000000;
@@ -60,16 +119,24 @@ int main() {
60119
// derived from the kernel specifications and run-time arguments.
61120
vector_add_kernel(n, dev_C, dev_A, dev_B);
62121
}
63-
64122
```
65123

124+
125+
66126
## License
67127

68128
Licensed under Apache 2.0. See [LICENSE](https://github.com/KernelTuner/kernel_launcher/blob/master/LICENSE).
69129

130+
70131
## Citation
71132

72-
```
133+
If you use Kernel Launcher in your work, please cite the following publication:
134+
135+
> S. Heldens, B. van Werkhoven (2023), "Kernel Launcher: C++ Library for Optimal-Performance Portable CUDA Applications", The Eighteenth International Workshop on Automatic Performance Tuning (iWAPT2023) co-located with IPDPS 2023
136+
137+
As BibTeX:
138+
139+
```Latex
73140
@article{heldens2023kernellauncher,
74141
title={Kernel Launcher: C++ Library for Optimal-Performance Portable CUDA Applications},
75142
author={Heldens, Stijn and van Werkhoven, Ben},

docs/build_api.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ def build_index_page(groups):
7777
"KernelSource",
7878
"Kernel",
7979
],
80-
"Wisdom": [
80+
"Wisdom Kernels": [
8181
"WisdomKernel",
8282
"WisdomSettings",
8383
"WisdomRecord",
@@ -92,6 +92,10 @@ def build_index_page(groups):
9292
"export_capture_file",
9393
"capture_file_exists",
9494
],
95+
"Pragma Kernels": [
96+
"PragmaKernel",
97+
"build_pragma_kernel"
98+
],
9599
"Registry": [
96100
"KernelRegistry",
97101
"IKernelDescriptor",

docs/env_vars.rst

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ Environment Variables
1212
* - **KERNEL_LAUNCHER_CAPTURE**
1313
- ``_``
1414

15-
- Kernels for which a tuning specification will be exported on the first call to the kernel.
15+
- Kernels for which a tuning specification will be captured.
1616
The value should a comma-seperated list of kernel names.
1717
Additionally, an ``*`` can be used as a wild card.
1818

@@ -30,6 +30,14 @@ Environment Variables
3030
(i.e., a wisdom file was found), the ``KERNEL_LAUNCHER_CAPTURE_FORCE`` will force to always
3131
capture kernels regardless of whether wisdom files are available.
3232

33+
* - **KERNEL_LAUNCHER_CAPTURE _SKIP**
34+
- ``0``
35+
- Set the number of kernel launches to skip before capturing a particular kernel.
36+
For example, if you set the value to ``3``, only the fourth launch will be captured since the
37+
first three launches will be skipped.
38+
39+
Note that this option is applied on a `per-kernel basis`, which means that each individual kernel keeps its own skip counter.
40+
3341
* - **KERNEL_LAUNCHER_LOG**
3442
- ``info``
3543
- Controls how much logging information is printed to stderr. There are three possible options:

docs/example.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,4 +9,5 @@ Guides
99
examples/basic
1010
examples/wisdom
1111
examples/registry
12+
examples/pragma
1213

docs/examples/pragma.rst

Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
Pragma Kernels
2+
===========================
3+
4+
In the previous examples, we demonstrated how a tunable kernel can be specified by defining a ``KernelBuilder`` instance in the host-side code.
5+
While this API offers flexiblity, it can be cumbersome and requires keeping the kernel code in CUDA in sync with the host-side code in C++.
6+
7+
Kernel Launcher also provides a way to define kernel specifications directly in the CUDA code by using pragma directives to annotate the kernel code.
8+
Although this method is less flexible than the ``KernelBuilder`` API, it is much more convenient and suitable for most CUDA kernels.
9+
10+
11+
Source Code
12+
-----------
13+
14+
The following code example shows valid CUDA kernel code containing pragma directives.
15+
The ``#pragma`` annotations will be ignored by the ``nvcc`` compiler (but they may produce compiler warnings).
16+
17+
18+
.. literalinclude:: vector_add_annotated.cu
19+
:lines: 1-20
20+
:lineno-start: 1
21+
22+
23+
Code Explanation
24+
----------------
25+
26+
The kernel contains the following ``pragma`` directives:
27+
28+
.. literalinclude:: vector_add_annotated.cu
29+
:lines: 1-2
30+
:lineno-start: 1
31+
32+
The tune directives specify the tunable parameters: ``threads_per_block`` and ``items_per_thread``.
33+
Since ``items_per_thread`` is also the name of the template parameter, so it is passed to the kernel as a compile-time constant via this parameter.
34+
The value of ``threads_per_block`` is not passed to the kernel but is used by subsequent pragmas.
35+
36+
.. literalinclude:: vector_add_annotated.cu
37+
:lines: 3-3
38+
:lineno-start: 3
39+
40+
The ``set`` directives defines a constant.
41+
In this case, the constant ``items_per_block`` is defined as the product of ``threads_per_block`` and ``items_per_thread``.
42+
43+
.. literalinclude:: vector_add_annotated.cu
44+
:lines: 4-6
45+
:lineno-start: 4
46+
47+
The ``problem_size`` directive defines the problem size (as discussed in as discussed in :doc:`basic`), ``block_size`` specifies the thread block size, and ``grid_divisor`` specifies how the problem size should be divided to obtain the thread grid size.
48+
Alternatively, ``grid_size`` can be used to specify the grid size directly.
49+
50+
51+
.. literalinclude:: vector_add_annotated.cu
52+
:lines: 7-7
53+
:lineno-start: 7
54+
55+
The ``buffers`` directive specifies the size of each buffer (``A``, ``B``, and ``C``) as ``n`` elements to be known by Kernel Launcher.
56+
This is necessary since raw pointers can be used for buffer arguments, for which size information may not be available.
57+
If the ``buffers`` pragma is not specified, Kernel Launcher can still be used but it is not possible to capture kernel launches.
58+
59+
.. literalinclude:: vector_add_annotated.cu
60+
:lines: 8-8
61+
:lineno-start: 8
62+
63+
The ``tuning_key`` directive specifies the tuning key, which can be a concatenation of strings or variables.
64+
In this example, the tuning key is ``"vector_add_" + T``, where ``T`` is the name of the type.
65+
66+
67+
Host Code
68+
---------
69+
70+
The below code shows how to call the kernel from the host in C++::
71+
72+
#include "kernel_launcher/pragma.h"
73+
using namespace kl = kernel_launcher;
74+
75+
void launch_vector_add(float* C, const float* A, const float* B) {
76+
kl::launch(
77+
kl::PragmaKernel("vector_add_annotated.cu", "vector_add", {"float"}),
78+
n, C, A, B
79+
);
80+
);
81+
82+
83+
The ``PragmaKernel`` class implements the ``IKernelDescriptor`` interface, as described in :doc:`registry`.
84+
This class reads the specified file, extracts the Kernel Launcher pragmas from the source code, and compiles the kernel.
85+
86+
The ``launch`` function launches the kernel and, as discussed in :doc:`registry`, it uses the default registry to cache kernel compilations.
87+
This means that the kernel is only compiled once, even if the same kernel is called from different locations in the program.
88+
89+
90+
List of pragmas
91+
---------------
92+
93+
The table below lists the valid directives.
94+
95+
.. list-table::
96+
97+
* - Directive
98+
- Description
99+
100+
* - ``tune``
101+
- Add a new tunable variable.
102+
103+
* - ``set``
104+
- Add a new variable.
105+
106+
* - ``buffers``
107+
- Specify the size of buffer arguments. This directive may occur multiple times.
108+
109+
* - ``tuning_key``
110+
- Specify the tuning key used to search for the corresponding wisdom file.
111+
112+
* - ``problem_size``
113+
- An N-dimensional vector that indicates workload size.
114+
115+
* - ``grid_size``
116+
- An N-dimensional vector that indicates the CUDA grid size.
117+
118+
* - ``block_size``
119+
- An N-dimensional vector that indicates the CUDA thread block size.
120+
121+
* - ``grid_divisor``
122+
- Alternative way of specifying the grid size. The problem size is divided by the grid divisors to obtain the grid dimensions.
123+
124+
* - ``restriction``
125+
- Boolean expression that must evaluate to ``true`` for a kernel configuration to be valid.
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
#pragma kernel tune(threads_per_block=32, 64, 128, 256, 512, 1024)
2+
#pragma kernel tune(items_per_thread=1, 2, 4, 8)
3+
#pragma kernel set(items_per_block=threads_per_block * items_per_thread)
4+
#pragma kernel problem_size(n)
5+
#pragma kernel block_size(threads_per_block)
6+
#pragma kernel grid_divisor(items_per_block)
7+
#pragma kernel buffers(C[n], A[n], B[n])
8+
#pragma kernel tuning_key("vector_add_" + T)
9+
template <typename T, int items_per_thread=1>
10+
__global__
11+
void vector_add(int n, T* C, const T* A, const T* B) {
12+
for (int k = 0; k < items_per_thread; k++) {
13+
int i = blockIdx.x * items_per_thread * blockDim.x + k * blockDim.x + threadIdx.x;
14+
15+
if (i < n) {
16+
C[i] = A[i] + B[i];
17+
}
18+
}
19+
}
20+

examples/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
add_subdirectory(vector_add)
2+
add_subdirectory(vector_add_annotated)
23
add_subdirectory(matmul)
34

0 commit comments

Comments
 (0)