KernelTuner
diff --git a/‎.clang-format‎
Lines changed: 1 addition & 1 deletion b/‎.clang-format‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎CMakeLists.txt‎
Lines changed: 1 addition & 1 deletion b/‎CMakeLists.txt‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎Makefile‎
Lines changed: 6 additions & 4 deletions b/‎Makefile‎
Lines changed: 6 additions & 4 deletions
diff --git a/‎README.md‎
Lines changed: 82 additions & 15 deletions b/‎README.md‎
Lines changed: 82 additions & 15 deletions
diff --git a/‎docs/build_api.py‎
Lines changed: 5 additions & 1 deletion b/‎docs/build_api.py‎
Lines changed: 5 additions & 1 deletion
diff --git a/‎docs/env_vars.rst‎
Lines changed: 9 additions & 1 deletion b/‎docs/env_vars.rst‎
Lines changed: 9 additions & 1 deletion
diff --git a/‎docs/example.rst‎
Lines changed: 1 addition & 0 deletions b/‎docs/example.rst‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎docs/examples/pragma.rst‎
Lines changed: 125 additions & 0 deletions b/‎docs/examples/pragma.rst‎
Lines changed: 125 additions & 0 deletions
diff --git a/‎docs/examples/vector_add_annotated.cu‎
Lines changed: 20 additions & 0 deletions b/‎docs/examples/vector_add_annotated.cu‎
Lines changed: 20 additions & 0 deletions
diff --git a/‎examples/CMakeLists.txt‎
Lines changed: 1 addition & 0 deletions b/‎examples/CMakeLists.txt‎
Lines changed: 1 addition & 0 deletions
@@ -55,7 +55,7 @@ IndentWidth: 4
 IndentWrappedFunctionNames: false
 KeepEmptyLinesAtTheStartOfBlocks: false
 MaxEmptyLinesToKeep: 1
-NamespaceIndentation: Inner
+NamespaceIndentation: None
 PointerAlignment: Left
 ReflowComments: false
 SortIncludes: true
 
@@ -8,7 +8,7 @@ if(NOT CMAKE_BUILD_TYPE)
 endif()
 
 
-file(GLOB sources "${PROJECT_SOURCE_DIR}/src/*.cpp")
+file(GLOB sources "${PROJECT_SOURCE_DIR}/src/*.cpp" "${PROJECT_SOURCE_DIR}/src/*/*.cpp")
 add_library(${PROJECT_NAME} STATIC ${sources})
 set(KERNEL_LAUNCHER_CLANG_TIDY clang-tidy -checks=-*,readability-*,bugprone-*,-readability-magic-numbers,-readability-use-anyofallof,-readability-else-after-return)
 
 
@@ -1,7 +1,9 @@
 BUILD_DIR=build
 
-fmt:
-	clang-format -i include/kernel_launcher/*.h src/*.cpp tests/*.cpp examples/*/*.cu
+pretty:
+	clang-format -i include/*.h include/*/*.h include/*/*/*.h src/*.cpp src/*/*.cpp tests/*.cpp examples/*/*.cu
+
+fmt: pretty
 
 test: ${BUILD_DIR}
 	cd ${BUILD_DIR} && make kernel_launcher_tests
@@ -11,7 +13,7 @@ ${BUILD_DIR}:
 	mkdir ${BUILD_DIR}
 	cd ${BUILD_DIR} && cmake -DKERNEL_LAUNCHER_BUILD_TEST=1 -DCMAKE_BUILD_TYPE=debug ..
 
-all: fmt test
+all: pretty test
 clean:
 
-.PHONY: fmt test all clean
+.PHONY: pretty fmt test all clean
@@ -12,8 +12,10 @@
 
 
 
-_Kernel Launcher_ is a C++ library that makes it easy to dynamically compile _CUDA_ kernels at run time (using [NVRTC](https://docs.nvidia.com/cuda/nvrtc/index.html)) and call them in an easy type-safe way using C++ magic.
-Additionally, _Kernel Launcher_ supports exporting kernel specifications, to enable tuning by [Kernel Tuner](https://github.com/KernelTuner/kernel_tuner), and importing the tuning results, known as _wisdom_ files, back into the application.
+_Kernel Launcher_ is a C++ library that enables dynamic compilation _CUDA_ kernels at run time (using [NVRTC](https://docs.nvidia.com/cuda/nvrtc/index.html)) and launching them in an easy type-safe way using C++ magic.
+On top of that, Kernel Launcher supports _capturing_ kernel launches, to enable tuning by [Kernel Tuner](https://github.com/KernelTuner/kernel_tuner), and importing the tuning results, known as _wisdom_ files, back into the application.
+The result: highly efficient GPU applications with maximum portability.
+
 
 
 
@@ -23,32 +25,89 @@ Recommended installation is using CMake. See the [installation guide](https://ke
 
 ## Example
 
-See the documentation for [examples](https://kerneltuner.github.io/kernel_launcher/example.html) or check out the [examples](https://github.com/KernelTuner/kernel_launcher/tree/master/examples) directory.
+There are many ways of using Kernel Launcher. See the documentation for [examples](https://kerneltuner.github.io/kernel_launcher/example.html) or check out the [examples](https://github.com/KernelTuner/kernel_launcher/tree/master/examples) directory.
+
+
+### Pragma-based API
+Below shows an example of using the pragma-based API, which allows existing CUDA kernels to be annotated with Kernel-Launcher-specific directives.
+
+**kernel.cu**
+```cpp
+#pragma kernel tune(threads_per_block=32, 64, 128, 256, 512, 1024)
+#pragma kernel block_size(threads_per_block)
+#pragma kernel problem_size(n)
+#pragma kernel buffers(A[n], B[n], C[n])
+template <typename T>
+__global__ void vector_add(int n, T *C, const T *A, const T *B) {
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < n) {
+        C[i] = A[i] + B[i];
+    }
+}
+```
 
+**main.cpp**
 ```cpp
 #include "kernel_launcher.h"
 
 int main() {
+    // Initialize CUDA memory. This is outside the scope of kernel_launcher.
+    unsigned int n = 1000000;
+    float *dev_A, *dev_B, *dev_C;
+    /* cudaMalloc, cudaMemcpy, ... */
+
     // Namespace alias.
     namespace kl = kernel_launcher;
 
-    // Create a kernel builder
-    kl::KernelBuilder builder("vector_add", "vector_add_kernel.cu");
+    // Launch the kernel! Again, the grid size and block size do not need to
+    // be specified, they are calculated from the kernel specifications and
+    // run-time arguments.
+    kl::launch(
+        kl::PragmaKernel("vector_add", "kernel.cu", {"float"}),
+        n, dev_C, dev_A, dev_B
+    );
+}
+
+```
+
+
+### Builder-based API
+Below shows an example of the `KernelBuilder`-based API.
+This offers more flexiblity than the pragma-based API, but is also more verbose:
+
+**kernel.cu**
+```cpp
+template <typename T>
+__global__ void vector_add(int n, T *C, const T *A, const T *B) {
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < n) {
+        C[i] = A[i] + B[i];
+    }
+}
+```
+
+**main.cpp**
+```cpp
+#include "kernel_launcher.h"
+
+int main() {
+    // Namespace alias.
+    namespace kl = kernel_launcher;
 
     // Define the variables that can be tuned for this kernel.
-    auto threads_per_block = builder.tune("block_size", {32, 64, 128, 256, 512, 1024});
-    auto elements_per_thread = builder.tune("elements_per_thread", {1, 2, 4, 8});
+    auto space = kl::ConfigSpace();
+    auto threads_per_block = space.tune("block_size", {32, 64, 128, 256, 512, 1024});
 
-    // Set kernel properties such as block size, grid divisor, template arguments, etc.
+    // Create a kernel builder and set kernel properties such as block size,
+    // grid divisor, template arguments, etc.
+    auto builder = kl::KernelBuilder("vector_add", "kernel.cu", space);
     builder
-        .problem_size(kl::arg0)
-        .block_size(threads_per_block)
-        .grid_divisors(threads_per_block * elements_per_thread)
         .template_args(kl::type_of<float>())
-        .define("ELEMENTS_PER_THREAD", elements_per_thread);
+        .problem_size(kl::arg0)
+        .block_size(threads_per_block);
 
     // Define the kernel
-    kl::WisdomKernel vector_add_kernel(builder);
+    auto vector_add_kernel = kl::WisdomKernel(builder);
 
     // Initialize CUDA memory. This is outside the scope of kernel_launcher.
     unsigned int n = 1000000;
@@ -60,16 +119,24 @@ int main() {
     // derived from the kernel specifications and run-time arguments.
     vector_add_kernel(n, dev_C, dev_A, dev_B);
 }
-
 ```
 
+
+
 ## License
 
 Licensed under Apache 2.0. See [LICENSE](https://github.com/KernelTuner/kernel_launcher/blob/master/LICENSE).
 
+
 ## Citation
 
-```
+If you use Kernel Launcher in your work, please cite the following publication:
+
+> S. Heldens, B. van Werkhoven (2023), "Kernel Launcher: C++ Library for Optimal-Performance Portable CUDA Applications", The Eighteenth International Workshop on Automatic Performance Tuning (iWAPT2023) co-located with IPDPS 2023
+
+As BibTeX:
+
+```Latex
 @article{heldens2023kernellauncher,
   title={Kernel Launcher: C++ Library for Optimal-Performance Portable CUDA Applications},
   author={Heldens, Stijn and van Werkhoven, Ben},
 
@@ -77,7 +77,7 @@ def build_index_page(groups):
         "KernelSource",
         "Kernel",
     ],
-    "Wisdom": [
+    "Wisdom Kernels": [
         "WisdomKernel",
         "WisdomSettings",
         "WisdomRecord",
@@ -92,6 +92,10 @@ def build_index_page(groups):
         "export_capture_file",
         "capture_file_exists",
     ],
+    "Pragma Kernels": [
+        "PragmaKernel",
+        "build_pragma_kernel"
+    ],
     "Registry": [
         "KernelRegistry",
         "IKernelDescriptor",
 
@@ -12,7 +12,7 @@ Environment Variables
    * - **KERNEL_LAUNCHER_CAPTURE**
      - ``_``
 
-     - Kernels for which a tuning specification will be exported on the first call to the kernel.
+     - Kernels for which a tuning specification will be captured.
        The value should a comma-seperated list of kernel names.
        Additionally, an ``*`` can be used as a wild card.
 
@@ -30,6 +30,14 @@ Environment Variables
        (i.e., a wisdom file was found), the ``KERNEL_LAUNCHER_CAPTURE_FORCE`` will force to always
        capture kernels regardless of whether wisdom files are available.
 
+   * - **KERNEL_LAUNCHER_CAPTURE _SKIP**
+     - ``0``
+     - Set the number of kernel launches to skip before capturing a particular kernel.
+       For example, if you set the value to ``3``, only the fourth launch will be captured since the
+       first three launches will be skipped.
+
+       Note that this option is applied on a `per-kernel basis`, which means that each individual kernel keeps its own skip counter.
+
    * - **KERNEL_LAUNCHER_LOG**
      - ``info``
      - Controls how much logging information is printed to stderr. There are three possible options:
 
@@ -9,4 +9,5 @@ Guides
    examples/basic
    examples/wisdom
    examples/registry
+   examples/pragma
 
@@ -0,0 +1,125 @@
+Pragma Kernels
+===========================
+
+In the previous examples, we demonstrated how a tunable kernel can be specified by defining a ``KernelBuilder`` instance in the host-side code.
+While this API offers flexiblity, it can be cumbersome and requires keeping the kernel code in CUDA in sync with the host-side code in C++.
+
+Kernel Launcher also provides a way to define kernel specifications directly in the CUDA code by using pragma directives to annotate the kernel code.
+Although this method is less flexible than the ``KernelBuilder`` API, it is much more convenient and suitable for most CUDA kernels.
+
+
+Source Code
+-----------
+
+The following code example shows valid CUDA kernel code containing pragma directives.
+The ``#pragma`` annotations will be ignored by the ``nvcc`` compiler (but they may produce compiler warnings).
+
+
+.. literalinclude:: vector_add_annotated.cu
+   :lines: 1-20
+   :lineno-start: 1
+
+
+Code Explanation
+----------------
+
+The kernel contains the following ``pragma`` directives:
+
+.. literalinclude:: vector_add_annotated.cu
+   :lines: 1-2
+   :lineno-start: 1
+
+The tune directives specify the tunable parameters: ``threads_per_block`` and ``items_per_thread``.
+Since ``items_per_thread`` is also the name of the template parameter, so it is passed to the kernel as a compile-time constant via this parameter.
+The value of ``threads_per_block`` is not passed to the kernel but is used by subsequent pragmas.
+
+.. literalinclude:: vector_add_annotated.cu
+   :lines: 3-3
+   :lineno-start: 3
+
+The ``set`` directives defines a constant.
+In this case, the constant ``items_per_block`` is defined as the product of ``threads_per_block`` and ``items_per_thread``.
+
+.. literalinclude:: vector_add_annotated.cu
+   :lines: 4-6
+   :lineno-start: 4
+
+The ``problem_size`` directive defines the problem size (as discussed in as discussed in :doc:`basic`), ``block_size`` specifies the thread block size, and ``grid_divisor`` specifies how the problem size should be divided to obtain the thread grid size.
+Alternatively, ``grid_size`` can be used to specify the grid size directly.
+
+
+.. literalinclude:: vector_add_annotated.cu
+   :lines: 7-7
+   :lineno-start: 7
+
+The ``buffers`` directive specifies the size of each buffer (``A``, ``B``, and ``C``) as ``n`` elements to be known by Kernel Launcher.
+This is necessary since raw pointers can be used for buffer arguments, for which size information may not be available.
+If the ``buffers`` pragma is not specified, Kernel Launcher can still be used but it is not possible to capture kernel launches.
+
+.. literalinclude:: vector_add_annotated.cu
+   :lines: 8-8
+   :lineno-start: 8
+
+The ``tuning_key`` directive specifies the tuning key, which can be a concatenation of strings or variables.
+In this example, the tuning key is ``"vector_add_" + T``, where ``T`` is the name of the type.
+
+
+Host Code
+---------
+
+The below code shows how to call the kernel from the host in C++::
+
+    #include "kernel_launcher/pragma.h"
+    using namespace kl = kernel_launcher;
+
+    void launch_vector_add(float* C, const float* A, const float* B) {
+        kl::launch(
+            kl::PragmaKernel("vector_add_annotated.cu", "vector_add", {"float"}),
+            n, C, A, B
+        );
+    );
+
+
+The ``PragmaKernel`` class implements the ``IKernelDescriptor`` interface, as described in :doc:`registry`.
+This class reads the specified file, extracts the Kernel Launcher pragmas from the source code, and compiles the kernel.
+
+The ``launch`` function launches the kernel and, as discussed in :doc:`registry`, it uses the default registry to cache kernel compilations.
+This means that the kernel is only compiled once, even if the same kernel is called from different locations in the program.
+
+
+List of pragmas
+---------------
+
+The table below lists the valid directives.
+
+.. list-table::
+
+   * - Directive
+     - Description
+
+   * - ``tune``
+     - Add a new tunable variable.
+
+   * - ``set``
+     - Add a new variable.
+
+   * - ``buffers``
+     - Specify the size of buffer arguments. This directive may occur multiple times.
+
+   * - ``tuning_key``
+     - Specify the tuning key used to search for the corresponding wisdom file.
+
+   * - ``problem_size``
+     - An N-dimensional vector that indicates workload size.
+
+   * - ``grid_size``
+     - An N-dimensional vector that indicates the CUDA grid size.
+
+   * - ``block_size``
+     - An N-dimensional vector that indicates the CUDA thread block size.
+
+   * - ``grid_divisor``
+     - Alternative way of specifying the grid size. The problem size is divided by the grid divisors to obtain the grid dimensions.
+
+   * - ``restriction``
+     - Boolean expression that must evaluate to ``true`` for a kernel configuration to be valid.
@@ -0,0 +1,20 @@
+#pragma kernel tune(threads_per_block=32, 64, 128, 256, 512, 1024)
+#pragma kernel tune(items_per_thread=1, 2, 4, 8)
+#pragma kernel set(items_per_block=threads_per_block * items_per_thread)
+#pragma kernel problem_size(n)
+#pragma kernel block_size(threads_per_block)
+#pragma kernel grid_divisor(items_per_block)
+#pragma kernel buffers(C[n], A[n], B[n])
+#pragma kernel tuning_key("vector_add_" + T)
+template <typename T, int items_per_thread=1>
+__global__
+void vector_add(int n, T* C, const T* A, const T* B) {
+    for (int k = 0; k < items_per_thread; k++) {
+        int i = blockIdx.x * items_per_thread * blockDim.x + k * blockDim.x + threadIdx.x;
+
+        if (i < n) {
+            C[i] = A[i] + B[i];
+        }
+    }
+}
+
@@ -1,3 +1,4 @@
 add_subdirectory(vector_add)
+add_subdirectory(vector_add_annotated)
 add_subdirectory(matmul)
Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,4 @@`
`1`	`1`	`add_subdirectory(vector_add)`
	`2`	`+add_subdirectory(vector_add_annotated)`
`2`	`3`	`add_subdirectory(matmul)`
`3`	`4`