Skip to content

Commit cd37551

Browse files
authored
Merge pull request #77 from MichealReed/improve-async
Refactor Synchronicity, Provide Async API
2 parents 1193fb1 + 39c816c commit cd37551

File tree

11 files changed

+1351
-444
lines changed

11 files changed

+1351
-444
lines changed

CMakeLists.txt

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
# and cmake/gpu.cmake for more details
33
cmake_minimum_required(VERSION 3.28)
44
project(gpu)
5-
5+
set(PROJECT_ROOT "${CMAKE_CURRENT_SOURCE_DIR}")
66
set(CMAKE_EXPORT_COMPILE_COMMANDS ON) # export compile_commands.json to use with
77
# LSP
88
set(CMAKE_CXX_STANDARD 20)
@@ -23,6 +23,24 @@ endif()
2323
include("${CMAKE_CURRENT_SOURCE_DIR}/cmake/dawn.cmake")
2424
include("${CMAKE_CURRENT_SOURCE_DIR}/cmake/gpu.cmake")
2525

26+
target_link_libraries(gpu PRIVATE webgpu_dawn)
27+
28+
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/test)
29+
30+
add_executable(test_gpu ${CMAKE_CURRENT_SOURCE_DIR}/test/test_gpu.cpp)
31+
target_link_libraries(test_gpu PRIVATE gpu)
32+
33+
# Platform-specific post-build actions (e.g. copying DLLs for MSVC)
34+
if(MSVC)
35+
add_custom_command(
36+
TARGET test_gpu POST_BUILD
37+
COMMAND ${CMAKE_COMMAND} -E copy
38+
${DAWN_BUILD_DIR}/$<CONFIG>/webgpu_dawn.dll
39+
$<TARGET_FILE_DIR:test_gpu>
40+
COMMENT "Copying webgpu_dawn.dll to the build directory"
41+
)
42+
endif()
43+
2644
add_library(gpud SHARED gpu.hpp)
2745
set_target_properties(gpud PROPERTIES LINKER_LANGUAGE CXX)
2846
target_link_libraries(gpud PRIVATE gpu)

cmake/dawn.cmake

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@ if(EMSCRIPTEN)
77
set(EM_SDK_DIR $ENV{EMSDK} CACHE INTERNAL "")
88
set(DAWN_BUILD_DIR "${DAWN_DIR}/build_web" CACHE INTERNAL "")
99
set(DAWN_EMSCRIPTEN_TOOLCHAIN ${EM_SDK_DIR}/upstream/emscripten CACHE INTERNAL "" FORCE)
10+
else()
11+
add_compile_definitions(USE_DAWN_API)
1012
endif()
1113

1214
# Enable find for no dawn rebuilds with flutter run

cmake/example.cmake

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,13 +45,14 @@ if(EMSCRIPTEN)
4545
# Set Emscripten-specific link flags that enable WASM output and expose certain symbols.
4646
# Needed to use updated version, emdawnwebgpu
4747
set_target_properties(${PROJECT_NAME} PROPERTIES LINK_FLAGS "\
48+
-O3 \
4849
-sUSE_WEBGPU=0 \
4950
-sWASM=1 \
5051
-DDAWN_EMSCRIPTEN_TOOLCHAIN=${EMSCRIPTEN_DIR} \
5152
-sEXPORTED_FUNCTIONS=_main,_malloc,_free,_memcpy \
5253
-sEXPORTED_RUNTIME_METHODS=ccall \
5354
-sUSE_GLFW=3 \
54-
-sALLOW_MEMORY_GROWTH=1 -sSTACK_SIZE=5MB \
55+
-sALLOW_MEMORY_GROWTH=1 -sSTACK_SIZE=15MB \
5556
-sASYNCIFY \
5657
--js-library=${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/library_webgpu_enum_tables.js \
5758
--js-library=${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/library_webgpu_generated_struct_info.js \

cmake/gpu.cmake

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,9 +32,10 @@ add_library(gpu STATIC ${GPU_SOURCES} ${GPU_HEADERS})
3232
set_target_properties(gpu PROPERTIES LINKER_LANGUAGE CXX)
3333
target_include_directories(gpu PUBLIC "${PROJECT_ROOT}")
3434
if(NOT EMSCRIPTEN)
35+
target_include_directories(gpu PUBLIC "${DAWN_BUILD_DIR}/gen/include/")
3536
target_include_directories(gpu PUBLIC "${DAWN_BUILD_DIR}/gen/include/dawn/")
37+
target_include_directories(gpu PUBLIC "${DAWN_DIR}/include/")
3638
else()
3739
target_include_directories(gpu PUBLIC "${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/include/")
3840
target_include_directories(gpu PUBLIC "${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/include/webgpu/")
3941
endif()
40-

cmake/templates/index.html.in

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
if (typeof Module !== 'undefined') {
1313
Module.onRuntimeInitialized = function() {
1414
// Optionally, pass arguments to main in an array.
15-
Module._main([]);
15+
Module.ccall('main', 'number', [], [], { async: true });
1616
};
1717
} else {
1818
console.error('Module is undefined. Check that your generated JS file is loaded properly.');

docs/gpuflow.md

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
# GPU.cpp Lifecycle
2+
3+
```mermaid
4+
flowchart TD
5+
%% Data Preparation & Upload
6+
subgraph "Data Preparation & Upload"
7+
A["CPU Data"]
8+
B["Define Data Properties<br>(shape, type, size)"]
9+
C["Create GPU Buffer<br>(allocate raw buffer)"]
10+
D["Create Tensor<br>(allocates Array with one<br> or more buffers<br>and associates Shape)"]
11+
12+
E["Upload Data via toGPU <br>(raw buffer)<br>toGPU<br>(ctx, data, buffer, size)"]
13+
F["Upload Data via toGPU<br>(Tensor overload)<br>toGPU(ctx, data, tensor)"]
14+
G["Optional: <br> Kernel Parameters<br>toGPU(ctx, params, Kernel)"]
15+
end
16+
17+
%% Buffer Setup & Bindings
18+
subgraph "Buffer & Binding Setup"
19+
H["Define Bindings<br>(Bindings, TensorView)"]
20+
I["Map GPU buffers<br> to shader bindings<br>(Collection from Tensor<br> or single buffers)"]
21+
end
22+
23+
%% Kernel Setup & Execution
24+
subgraph "Kernel Setup & Execution"
25+
J["Define KernelCode<br>(WGSL template, workgroup size, precision)"]
26+
K["Create Kernel"]
27+
L["Dispatch Kernel"]
28+
end
29+
30+
%% GPU Execution & Result Readback
31+
subgraph "GPU Execution & Result Readback"
32+
M["Kernel Execution<br>(GPU shader runs)"]
33+
N["Readback Data<br>(toCPU variants)"]
34+
end
35+
36+
%% Context & Resources
37+
O["Context<br>(Device, Queue,<br>TensorPool, KernelPool)"]
38+
39+
%% Flow Connections
40+
A --> B
41+
B --> C
42+
B --> D
43+
C --> E
44+
D --> F
45+
F --> H
46+
E --> H
47+
H --> I
48+
I --> K
49+
J --> K
50+
G --- K
51+
K --> L
52+
L --> M
53+
M --> N
54+
55+
%% Context shared by all stages
56+
O --- D
57+
O --- E
58+
O --- F
59+
O --- K
60+
O --- L
61+
O --- N
62+
```
63+
64+
• The `gpu::Array` (which wraps a GPU buffer with usage and size) and the `gpu::Shape` (which defines dimensions and rank) are combined—via the creation process—to produce a `gpu::Tensor`.
65+
• A `gpu::TensorView` provides a non‑owning view into a slice of a `gpu::Tensor`. Ex. `TensorView view = {tensor, 0, 256};`
66+
• `gpu::Bindings` collect multiple Tensors (or TensorViews) along with view offset/size information for use in a kernel.
67+
• The `gpu::TensorPool` (managed by the Context) is responsible for the lifetime of tensors and GPU resource cleanup.
68+
• `gpu::KernelCode` contains the WGSL shader template plus metadata (workgroup size, precision, label, and entry point) that drive the kernel configuration.
69+
• The `gpu::createKernelAsync/gpu::createKernel` functions (within the Execution Flow) use the `gpu::Context`, `gpu::Bindings`, and `gpu::KernelCode` to configure and construct a `gpu::Kernel` that manages all the underlying GPU resources (buffers, bind groups, compute pipeline, etc.).
70+
• `gpu::KernelCode`’s workgroup size (a `gpu::Shape`) defines the dispatch configuration, and the `gpu::Kernel` eventually uses the underlying `gpu::Array` (contains `WGPUBuffer, WGPUBufferUsage, size_t`) and `gpu::Shape` data from the created Tensor.
71+
72+
`gpu::Tensor` Ranks:
73+
Rank 0: Scalar
74+
Rank 1: Vector
75+
Rank 2: Matrix
76+
Rank 3: 3D Tensor (or Cube)
77+
Rank 4: 4D Tensor
78+
Rank (max 8): Higher Dimensional Tensors

examples/hello_world/run.cpp

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,21 +28,24 @@ int main(int argc, char **argv) {
2828
printf("--------------\n\n");
2929

3030
// std::unique_ptr<Context> ctx = createContext();
31+
#ifdef USE_DAWN_API
32+
Context ctx = createContextByGpuIdx(0);
33+
auto adaptersList = listAdapters(ctx);
34+
LOG(kDefLog, kInfo, "Available GPU adapters:\n%s", adaptersList.c_str());
35+
#else
3136
Context ctx = createContext();
37+
#endif
3238
static constexpr size_t N = 10000;
3339
std::array<float, N> inputArr, outputArr;
3440
for (int i = 0; i < N; ++i) {
3541
inputArr[i] = static_cast<float>(i) / 10.0; // dummy input data
3642
}
3743
Tensor input = createTensor(ctx, Shape{N}, kf32, inputArr.data());
3844
Tensor output = createTensor(ctx, Shape{N}, kf32);
39-
std::promise<void> promise;
40-
std::future<void> future = promise.get_future();
4145
Kernel op = createKernel(ctx, {kGelu, 256, kf32},
4246
Bindings{input, output},
4347
{cdiv(N, 256), 1, 1});
44-
dispatchKernel(ctx, op, promise);
45-
wait(ctx, future);
48+
dispatchKernel(ctx, op);
4649
toCPU(ctx, output, outputArr.data(), sizeof(outputArr));
4750
for (int i = 0; i < 12; ++i) {
4851
printf(" gelu(%.2f) = %.2f\n", inputArr[i], outputArr[i]);

examples/render/run.cpp

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -124,10 +124,8 @@ int main(int argc, char **argv) {
124124
cdiv({NCOLS, NROWS, 1}, wgSize), params);
125125
printf("\033[2J\033[H");
126126
while (true) {
127-
std::promise<void> promise;
128-
std::future<void> future = promise.get_future();
129-
dispatchKernel(ctx, renderKernel, promise);
130-
wait(ctx, future);
127+
128+
dispatchKernel(ctx, renderKernel);
131129
toCPU(ctx, devScreen, screen.data(), sizeof(screen));
132130
params.time = getCurrentTimeInMilliseconds() - zeroTime;
133131

0 commit comments

Comments
 (0)