Skip to content

Commit cd37551

Browse files
authored
Merge pull request #77 from MichealReed/improve-async
Refactor Synchronicity, Provide Async API
2 parents 1193fb1 + 39c816c commit cd37551

File tree

11 files changed

+1351
-444
lines changed

11 files changed

+1351
-444
lines changed

CMakeLists.txt

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
# and cmake/gpu.cmake for more details
33
cmake_minimum_required(VERSION 3.28)
44
project(gpu)
5-
5+
set(PROJECT_ROOT "${CMAKE_CURRENT_SOURCE_DIR}")
66
set(CMAKE_EXPORT_COMPILE_COMMANDS ON) # export compile_commands.json to use with
77
# LSP
88
set(CMAKE_CXX_STANDARD 20)
@@ -23,6 +23,24 @@ endif()
2323
include("${CMAKE_CURRENT_SOURCE_DIR}/cmake/dawn.cmake")
2424
include("${CMAKE_CURRENT_SOURCE_DIR}/cmake/gpu.cmake")
2525

26+
target_link_libraries(gpu PRIVATE webgpu_dawn)
27+
28+
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/test)
29+
30+
add_executable(test_gpu ${CMAKE_CURRENT_SOURCE_DIR}/test/test_gpu.cpp)
31+
target_link_libraries(test_gpu PRIVATE gpu)
32+
33+
# Platform-specific post-build actions (e.g. copying DLLs for MSVC)
34+
if(MSVC)
35+
add_custom_command(
36+
TARGET test_gpu POST_BUILD
37+
COMMAND ${CMAKE_COMMAND} -E copy
38+
${DAWN_BUILD_DIR}/$<CONFIG>/webgpu_dawn.dll
39+
$<TARGET_FILE_DIR:test_gpu>
40+
COMMENT "Copying webgpu_dawn.dll to the build directory"
41+
)
42+
endif()
43+
2644
add_library(gpud SHARED gpu.hpp)
2745
set_target_properties(gpud PROPERTIES LINKER_LANGUAGE CXX)
2846
target_link_libraries(gpud PRIVATE gpu)

cmake/dawn.cmake

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@ if(EMSCRIPTEN)
77
set(EM_SDK_DIR $ENV{EMSDK} CACHE INTERNAL "")
88
set(DAWN_BUILD_DIR "${DAWN_DIR}/build_web" CACHE INTERNAL "")
99
set(DAWN_EMSCRIPTEN_TOOLCHAIN ${EM_SDK_DIR}/upstream/emscripten CACHE INTERNAL "" FORCE)
10+
else()
11+
add_compile_definitions(USE_DAWN_API)
1012
endif()
1113

1214
# Enable find for no dawn rebuilds with flutter run

cmake/example.cmake

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,13 +45,14 @@ if(EMSCRIPTEN)
4545
# Set Emscripten-specific link flags that enable WASM output and expose certain symbols.
4646
# Needed to use updated version, emdawnwebgpu
4747
set_target_properties(${PROJECT_NAME} PROPERTIES LINK_FLAGS "\
48+
-O3 \
4849
-sUSE_WEBGPU=0 \
4950
-sWASM=1 \
5051
-DDAWN_EMSCRIPTEN_TOOLCHAIN=${EMSCRIPTEN_DIR} \
5152
-sEXPORTED_FUNCTIONS=_main,_malloc,_free,_memcpy \
5253
-sEXPORTED_RUNTIME_METHODS=ccall \
5354
-sUSE_GLFW=3 \
54-
-sALLOW_MEMORY_GROWTH=1 -sSTACK_SIZE=5MB \
55+
-sALLOW_MEMORY_GROWTH=1 -sSTACK_SIZE=15MB \
5556
-sASYNCIFY \
5657
--js-library=${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/library_webgpu_enum_tables.js \
5758
--js-library=${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/library_webgpu_generated_struct_info.js \

cmake/gpu.cmake

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,9 +32,10 @@ add_library(gpu STATIC ${GPU_SOURCES} ${GPU_HEADERS})
3232
set_target_properties(gpu PROPERTIES LINKER_LANGUAGE CXX)
3333
target_include_directories(gpu PUBLIC "${PROJECT_ROOT}")
3434
if(NOT EMSCRIPTEN)
35+
target_include_directories(gpu PUBLIC "${DAWN_BUILD_DIR}/gen/include/")
3536
target_include_directories(gpu PUBLIC "${DAWN_BUILD_DIR}/gen/include/dawn/")
37+
target_include_directories(gpu PUBLIC "${DAWN_DIR}/include/")
3638
else()
3739
target_include_directories(gpu PUBLIC "${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/include/")
3840
target_include_directories(gpu PUBLIC "${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/include/webgpu/")
3941
endif()
40-

cmake/templates/index.html.in

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
if (typeof Module !== 'undefined') {
1313
Module.onRuntimeInitialized = function() {
1414
// Optionally, pass arguments to main in an array.
15-
Module._main([]);
15+
Module.ccall('main', 'number', [], [], { async: true });
1616
};
1717
} else {
1818
console.error('Module is undefined. Check that your generated JS file is loaded properly.');

docs/gpuflow.md

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
# GPU.cpp Lifecycle
2+
3+
```mermaid
4+
flowchart TD
5+
%% Data Preparation & Upload
6+
subgraph "Data Preparation & Upload"
7+
A["CPU Data"]
8+
B["Define Data Properties<br>(shape, type, size)"]
9+
C["Create GPU Buffer<br>(allocate raw buffer)"]
10+
D["Create Tensor<br>(allocates Array with one<br> or more buffers<br>and associates Shape)"]
11+
12+
E["Upload Data via toGPU <br>(raw buffer)<br>toGPU<br>(ctx, data, buffer, size)"]
13+
F["Upload Data via toGPU<br>(Tensor overload)<br>toGPU(ctx, data, tensor)"]
14+
G["Optional: <br> Kernel Parameters<br>toGPU(ctx, params, Kernel)"]
15+
end
16+
17+
%% Buffer Setup & Bindings
18+
subgraph "Buffer & Binding Setup"
19+
H["Define Bindings<br>(Bindings, TensorView)"]
20+
I["Map GPU buffers<br> to shader bindings<br>(Collection from Tensor<br> or single buffers)"]
21+
end
22+
23+
%% Kernel Setup & Execution
24+
subgraph "Kernel Setup & Execution"
25+
J["Define KernelCode<br>(WGSL template, workgroup size, precision)"]
26+
K["Create Kernel"]
27+
L["Dispatch Kernel"]
28+
end
29+
30+
%% GPU Execution & Result Readback
31+
subgraph "GPU Execution & Result Readback"
32+
M["Kernel Execution<br>(GPU shader runs)"]
33+
N["Readback Data<br>(toCPU variants)"]
34+
end
35+
36+
%% Context & Resources
37+
O["Context<br>(Device, Queue,<br>TensorPool, KernelPool)"]
38+
39+
%% Flow Connections
40+
A --> B
41+
B --> C
42+
B --> D
43+
C --> E
44+
D --> F
45+
F --> H
46+
E --> H
47+
H --> I
48+
I --> K
49+
J --> K
50+
G --- K
51+
K --> L
52+
L --> M
53+
M --> N
54+
55+
%% Context shared by all stages
56+
O --- D
57+
O --- E
58+
O --- F
59+
O --- K
60+
O --- L
61+
O --- N
62+
```
63+
64+
• The `gpu::Array` (which wraps a GPU buffer with usage and size) and the `gpu::Shape` (which defines dimensions and rank) are combined—via the creation process—to produce a `gpu::Tensor`.
65+
• A `gpu::TensorView` provides a non‑owning view into a slice of a `gpu::Tensor`. Ex. `TensorView view = {tensor, 0, 256};`
66+
• `gpu::Bindings` collect multiple Tensors (or TensorViews) along with view offset/size information for use in a kernel.
67+
• The `gpu::TensorPool` (managed by the Context) is responsible for the lifetime of tensors and GPU resource cleanup.
68+
• `gpu::KernelCode` contains the WGSL shader template plus metadata (workgroup size, precision, label, and entry point) that drive the kernel configuration.
69+
• The `gpu::createKernelAsync/gpu::createKernel` functions (within the Execution Flow) use the `gpu::Context`, `gpu::Bindings`, and `gpu::KernelCode` to configure and construct a `gpu::Kernel` that manages all the underlying GPU resources (buffers, bind groups, compute pipeline, etc.).
70+
• `gpu::KernelCode`’s workgroup size (a `gpu::Shape`) defines the dispatch configuration, and the `gpu::Kernel` eventually uses the underlying `gpu::Array` (contains `WGPUBuffer, WGPUBufferUsage, size_t`) and `gpu::Shape` data from the created Tensor.
71+
72+
`gpu::Tensor` Ranks:
73+
Rank 0: Scalar
74+
Rank 1: Vector
75+
Rank 2: Matrix
76+
Rank 3: 3D Tensor (or Cube)
77+
Rank 4: 4D Tensor
78+
Rank (max 8): Higher Dimensional Tensors

examples/hello_world/run.cpp

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,21 +28,24 @@ int main(int argc, char **argv) {
2828
printf("--------------\n\n");
2929

3030
// std::unique_ptr<Context> ctx = createContext();
31+
#ifdef USE_DAWN_API
32+
Context ctx = createContextByGpuIdx(0);
33+
auto adaptersList = listAdapters(ctx);
34+
LOG(kDefLog, kInfo, "Available GPU adapters:\n%s", adaptersList.c_str());
35+
#else
3136
Context ctx = createContext();
37+
#endif
3238
static constexpr size_t N = 10000;
3339
std::array<float, N> inputArr, outputArr;
3440
for (int i = 0; i < N; ++i) {
3541
inputArr[i] = static_cast<float>(i) / 10.0; // dummy input data
3642
}
3743
Tensor input = createTensor(ctx, Shape{N}, kf32, inputArr.data());
3844
Tensor output = createTensor(ctx, Shape{N}, kf32);
39-
std::promise<void> promise;
40-
std::future<void> future = promise.get_future();
4145
Kernel op = createKernel(ctx, {kGelu, 256, kf32},
4246
Bindings{input, output},
4347
{cdiv(N, 256), 1, 1});
44-
dispatchKernel(ctx, op, promise);
45-
wait(ctx, future);
48+
dispatchKernel(ctx, op);
4649
toCPU(ctx, output, outputArr.data(), sizeof(outputArr));
4750
for (int i = 0; i < 12; ++i) {
4851
printf(" gelu(%.2f) = %.2f\n", inputArr[i], outputArr[i]);

examples/render/run.cpp

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -124,10 +124,8 @@ int main(int argc, char **argv) {
124124
cdiv({NCOLS, NROWS, 1}, wgSize), params);
125125
printf("\033[2J\033[H");
126126
while (true) {
127-
std::promise<void> promise;
128-
std::future<void> future = promise.get_future();
129-
dispatchKernel(ctx, renderKernel, promise);
130-
wait(ctx, future);
127+
128+
dispatchKernel(ctx, renderKernel);
131129
toCPU(ctx, devScreen, screen.data(), sizeof(screen));
132130
params.time = getCurrentTimeInMilliseconds() - zeroTime;
133131

0 commit comments

Comments
 (0)