From feee33862fe3cafb909da33ed8b8e5c5fc6735ce Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Tue, 4 Feb 2025 15:50:59 +0100 Subject: [PATCH 01/13] define basic intrinsics --- src/KernelAbstractions.jl | 30 +++++++++++++++++----- src/intrinsics.jl | 52 +++++++++++++++++++++++++++++++++++++++ src/pocl/backend.jl | 25 ++++++------------- 3 files changed, 83 insertions(+), 24 deletions(-) create mode 100644 src/intrinsics.jl diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl index 9af2d9d4c..8c2844fd7 100644 --- a/src/KernelAbstractions.jl +++ b/src/KernelAbstractions.jl @@ -194,6 +194,10 @@ function unsafe_free! end unsafe_free!(::AbstractArray) = return +include("intrinsics.jl") +import .KernelIntrinsics +export KernelIntrinsics + ### # Kernel language # - @localmem @@ -460,13 +464,27 @@ end # Internal kernel functions ### -function __index_Local_Linear end -function __index_Group_Linear end -function __index_Global_Linear end +function __index_Local_Linear(ctx) + return KernelIntrinsics.get_local_id().x +end -function __index_Local_Cartesian end -function __index_Group_Cartesian end -function __index_Global_Cartesian end +function __index_Group_Linear(ctx) + return KernelIntrinsics.get_group_id().x +end + +function __index_Global_Linear(ctx) + return KernelIntrinsics.get_global_id().x +end + +function __index_Local_Cartesian(ctx) + return @inbounds workitems(__iterspace(ctx))[KernelIntrinsics.get_local_id().x] +end +function __index_Group_Cartesian(ctx) + return @inbounds blocks(__iterspace(ctx))[KernelIntrinsics.get_group_id().x] +end +function __index_Global_Cartesian(ctx) + return @inbounds expand(__iterspace(ctx), KernelIntrinsics.get_group_id().x, KernelIntrinsics.get_local_id().x) +end @inline __index_Local_NTuple(ctx, I...) = Tuple(__index_Local_Cartesian(ctx, I...)) @inline __index_Group_NTuple(ctx, I...) = Tuple(__index_Group_Cartesian(ctx, I...)) diff --git a/src/intrinsics.jl b/src/intrinsics.jl new file mode 100644 index 000000000..33ed56fed --- /dev/null +++ b/src/intrinsics.jl @@ -0,0 +1,52 @@ +module KernelIntrinsics + +""" + get_global_size()::@NamedTuple{x::Int32, y::Int32, z::Int32} + +Return the number of global work-items specified. + +!!! note + 1-based. +""" +function get_global_size end + +""" + get_global_id()::@NamedTuple{x::Int32, y::Int32, z::Int32} + +Returns the unique global work-item ID. +""" +function get_global_id end + +""" + get_local_size()::@NamedTuple{x::Int32, y::Int32, z::Int32} + +Return the number of local work-items specified. +""" +function get_local_size end + +""" + get_local_id()::@NamedTuple{x::Int32, y::Int32, z::Int32} + +Returns the unique local work-item ID. +""" +function get_local_id end + +""" + get_num_groups()::@NamedTuple{x::Int32, y::Int32, z::Int32} + +Returns the number of groups. +""" +function get_num_groups end + +""" + get_group_id()::@NamedTuple{x::Int32, y::Int32, z::Int32} + +Returns the unique group ID. +""" +function get_group_id end + +function localmemory end +function barrier end +function print end + +end diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl index 87fccdb93..32c2951a6 100644 --- a/src/pocl/backend.jl +++ b/src/pocl/backend.jl @@ -140,29 +140,18 @@ end ## Indexing Functions +const KI = KA.KernelIntrinsics -@device_override @inline function KA.__index_Local_Linear(ctx) - return get_local_id(1) +@device_override @inline function KI.get_local_id() + return (; x = get_local_id(1), y = get_local_id(2), z = get_local_id(3)) end -@device_override @inline function KA.__index_Group_Linear(ctx) - return get_group_id(1) +@device_override @inline function KI.get_group_id() + return (; x = get_group_id(1), y = get_group_id(2), z = get_group_id(3)) end -@device_override @inline function KA.__index_Global_Linear(ctx) - return get_global_id(1) -end - -@device_override @inline function KA.__index_Local_Cartesian(ctx) - @inbounds KA.workitems(KA.__iterspace(ctx))[get_local_id(1)] -end - -@device_override @inline function KA.__index_Group_Cartesian(ctx) - @inbounds KA.blocks(KA.__iterspace(ctx))[get_group_id(1)] -end - -@device_override @inline function KA.__index_Global_Cartesian(ctx) - return @inbounds KA.expand(KA.__iterspace(ctx), get_group_id(1), get_local_id(1)) +@device_override @inline function KI.get_global_id() + return (; x = get_global_id(1), y = get_global_id(2), z = get_global_id(3)) end @device_override @inline function KA.__validindex(ctx) From ca815e26482d90da9577597be0a29b961354bbff Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Tue, 16 Sep 2025 12:37:32 -0300 Subject: [PATCH 02/13] Fix docstrings --- src/intrinsics.jl | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/intrinsics.jl b/src/intrinsics.jl index 33ed56fed..15202fab7 100644 --- a/src/intrinsics.jl +++ b/src/intrinsics.jl @@ -4,9 +4,6 @@ module KernelIntrinsics get_global_size()::@NamedTuple{x::Int32, y::Int32, z::Int32} Return the number of global work-items specified. - -!!! note - 1-based. """ function get_global_size end @@ -14,6 +11,9 @@ function get_global_size end get_global_id()::@NamedTuple{x::Int32, y::Int32, z::Int32} Returns the unique global work-item ID. + +!!! note + 1-based. """ function get_global_id end @@ -28,6 +28,9 @@ function get_local_size end get_local_id()::@NamedTuple{x::Int32, y::Int32, z::Int32} Returns the unique local work-item ID. + +!!! note + 1-based. """ function get_local_id end @@ -42,6 +45,9 @@ function get_num_groups end get_group_id()::@NamedTuple{x::Int32, y::Int32, z::Int32} Returns the unique group ID. + +!!! note + 1-based. """ function get_group_id end From df8cbbadaced7da4a48050986d13ec4be929fddb Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Tue, 16 Sep 2025 13:51:00 -0300 Subject: [PATCH 03/13] localmemory --- src/KernelAbstractions.jl | 4 +++- src/pocl/backend.jl | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl index 8c2844fd7..f5f76306f 100644 --- a/src/KernelAbstractions.jl +++ b/src/KernelAbstractions.jl @@ -814,7 +814,9 @@ include("macros.jl") ### function Scratchpad end -function SharedMemory end +function SharedMemory(t::Type{T}, dims::Val{Dims}, id::Val{Id}) where {T, Dims, Id} + KernelIntrinsics.localmemory(t, dims, id) +end function __synchronize() error("@synchronize used outside kernel or not captured") diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl index 32c2951a6..c06d014ce 100644 --- a/src/pocl/backend.jl +++ b/src/pocl/backend.jl @@ -166,7 +166,7 @@ end ## Shared and Scratch Memory -@device_override @inline function KA.SharedMemory(::Type{T}, ::Val{Dims}, ::Val{Id}) where {T, Dims, Id} +@device_override @inline function KI.localmemory(::Type{T}, ::Val{Dims}, ::Val{Id}) where {T, Dims, Id} ptr = POCL.emit_localmemory(T, Val(prod(Dims))) CLDeviceArray(Dims, ptr) end From 11b6a8ee9b48a3b2a70f09e3da36e92aabd2da79 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Thu, 18 Sep 2025 15:30:33 -0300 Subject: [PATCH 04/13] Move default barrier implementation to KernelIntrinsics --- src/KernelAbstractions.jl | 2 +- src/intrinsics.jl | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl index f5f76306f..cd3d1fc99 100644 --- a/src/KernelAbstractions.jl +++ b/src/KernelAbstractions.jl @@ -819,7 +819,7 @@ function SharedMemory(t::Type{T}, dims::Val{Dims}, id::Val{Id}) where {T, Dims, end function __synchronize() - error("@synchronize used outside kernel or not captured") + KernelIntrinsics.barrier() end @generated function __print(items...) diff --git a/src/intrinsics.jl b/src/intrinsics.jl index 15202fab7..32cc125d1 100644 --- a/src/intrinsics.jl +++ b/src/intrinsics.jl @@ -52,7 +52,9 @@ Returns the unique group ID. function get_group_id end function localmemory end -function barrier end +function barrier() + error("Group barrier used outside kernel or not captured") +end function print end end From f1ad4125d457a1314e636e9d648e8c8f61515f20 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Thu, 18 Sep 2025 15:36:22 -0300 Subject: [PATCH 05/13] Implement remaining intrinsics for POCL --- src/pocl/backend.jl | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl index c06d014ce..873458976 100644 --- a/src/pocl/backend.jl +++ b/src/pocl/backend.jl @@ -154,6 +154,18 @@ end return (; x = get_global_id(1), y = get_global_id(2), z = get_global_id(3)) end +@device_override @inline function KI.get_local_size() + return (; x = get_local_size(1), y = get_local_size(2), z = get_local_size(3)) +end + +@device_override @inline function KI.get_num_groups() + return (; x = get_num_groups(1), y = get_num_groups(2), z = get_num_groups(3)) +end + +@device_override @inline function KI.get_global_size() + return (; x = get_global_size(1), y = get_global_size(2), z = get_global_size(3)) +end + @device_override @inline function KA.__validindex(ctx) if KA.__dynamic_checkbounds(ctx) I = @inbounds KA.expand(KA.__iterspace(ctx), get_group_id(1), get_local_id(1)) @@ -178,7 +190,7 @@ end ## Synchronization and Printing -@device_override @inline function KA.__synchronize() +@device_override @inline function KI.barrier() work_group_barrier(POCL.LOCAL_MEM_FENCE | POCL.GLOBAL_MEM_FENCE) end From d60f6f22a76dd98e9a28a3a93837a4464867376b Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Thu, 18 Sep 2025 17:23:11 -0300 Subject: [PATCH 06/13] Add basic tests Modified from initial Claude code --- test/intrinsics.jl | 62 ++++++++++++++++++++++++++++++++++++++++++++++ test/testsuite.jl | 5 ++++ 2 files changed, 67 insertions(+) create mode 100644 test/intrinsics.jl diff --git a/test/intrinsics.jl b/test/intrinsics.jl new file mode 100644 index 000000000..693b2bd9e --- /dev/null +++ b/test/intrinsics.jl @@ -0,0 +1,62 @@ + +@kernel cpu = false inbounds = true unsafe_indices = true function test_intrinsics_kernel(results) + # Test all intrinsics return NamedTuples with x, y, z fields + global_size = KernelIntrinsics.get_global_size() + global_id = KernelIntrinsics.get_global_id() + local_size = KernelIntrinsics.get_local_size() + local_id = KernelIntrinsics.get_local_id() + num_groups = KernelIntrinsics.get_num_groups() + group_id = KernelIntrinsics.get_group_id() + + if UInt32(global_id.x) <= UInt32(global_size.x) + results[1, global_id.x] = global_id.x + results[2, global_id.x] = local_id.x + results[3, global_id.x] = group_id.x + results[4, global_id.x] = global_size.x + results[5, global_id.x] = local_size.x + results[6, global_id.x] = num_groups.x + end +end + + +function intrinsics_testsuite(backend, AT) + @testset "KernelIntrinsics Tests" begin + @testset "Basic intrinsics functionality" begin + + # Test with small kernel + N = 16 + results = AT(zeros(UInt32, 6, N)) + + kernel = test_intrinsics_kernel(backend(), 4, (N,)) + kernel(results, ndrange = N) + KernelAbstractions.synchronize(backend()) + + host_results = Array(results) + + # Verify results make sense + for i in 1:N + global_id_x, local_id_x, group_id_x, global_size_x, local_size_x, num_groups_x = host_results[:, i] + + # Global IDs should be 1-based and sequential + @test global_id_x == i + + # Global size should match our ndrange + @test global_size_x == N + + # Local size should be 4 (our workgroupsize) + @test local_size_x == 4 + + # Number of groups should be ceil(N/4) = 4 + @test num_groups_x == 4 + + # Group ID should be 1-based + expected_group = div(i - 1, 4) + 1 + @test group_id_x == expected_group + + # Local ID should be 1-based within group + expected_local = ((i - 1) % 4) + 1 + @test local_id_x == expected_local + end + end + end +end diff --git a/test/testsuite.jl b/test/testsuite.jl index 2418db998..3426e5423 100644 --- a/test/testsuite.jl +++ b/test/testsuite.jl @@ -26,6 +26,7 @@ end include("test.jl") +include("intrinsics.jl") include("localmem.jl") include("private.jl") include("unroll.jl") @@ -47,6 +48,10 @@ function testsuite(backend, backend_str, backend_mod, AT, DAT; skip_tests = Set{ specialfunctions_testsuite(backend) end + @conditional_testset "Intrinsics" skip_tests begin + intrinsics_testsuite(backend, AT) + end + @conditional_testset "Localmem" skip_tests begin localmem_testsuite(backend, AT) end From 3fda18fee8097e07cd1b68f748782b6741f2238d Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 1 Oct 2025 13:24:43 -0300 Subject: [PATCH 07/13] Int32 -> Int https://github.com/JuliaGPU/KernelAbstractions.jl/pull/562/files#r1958255435 --- src/intrinsics.jl | 12 ++++++------ src/pocl/backend.jl | 12 ++++++------ test/intrinsics.jl | 2 +- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/intrinsics.jl b/src/intrinsics.jl index 32cc125d1..060116200 100644 --- a/src/intrinsics.jl +++ b/src/intrinsics.jl @@ -1,14 +1,14 @@ module KernelIntrinsics """ - get_global_size()::@NamedTuple{x::Int32, y::Int32, z::Int32} + get_global_size()::@NamedTuple{x::Int, y::Int, z::Int} Return the number of global work-items specified. """ function get_global_size end """ - get_global_id()::@NamedTuple{x::Int32, y::Int32, z::Int32} + get_global_id()::@NamedTuple{x::Int, y::Int, z::Int} Returns the unique global work-item ID. @@ -18,14 +18,14 @@ Returns the unique global work-item ID. function get_global_id end """ - get_local_size()::@NamedTuple{x::Int32, y::Int32, z::Int32} + get_local_size()::@NamedTuple{x::Int, y::Int, z::Int} Return the number of local work-items specified. """ function get_local_size end """ - get_local_id()::@NamedTuple{x::Int32, y::Int32, z::Int32} + get_local_id()::@NamedTuple{x::Int, y::Int, z::Int} Returns the unique local work-item ID. @@ -35,14 +35,14 @@ Returns the unique local work-item ID. function get_local_id end """ - get_num_groups()::@NamedTuple{x::Int32, y::Int32, z::Int32} + get_num_groups()::@NamedTuple{x::Int, y::Int, z::Int} Returns the number of groups. """ function get_num_groups end """ - get_group_id()::@NamedTuple{x::Int32, y::Int32, z::Int32} + get_group_id()::@NamedTuple{x::Int, y::Int, z::Int} Returns the unique group ID. diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl index 873458976..ef0aba6bd 100644 --- a/src/pocl/backend.jl +++ b/src/pocl/backend.jl @@ -143,27 +143,27 @@ end const KI = KA.KernelIntrinsics @device_override @inline function KI.get_local_id() - return (; x = get_local_id(1), y = get_local_id(2), z = get_local_id(3)) + return (; x = Int(get_local_id(1)), y = Int(get_local_id(2)), z = Int(get_local_id(3))) end @device_override @inline function KI.get_group_id() - return (; x = get_group_id(1), y = get_group_id(2), z = get_group_id(3)) + return (; x = Int(get_group_id(1)), y = Int(get_group_id(2)), z = Int(get_group_id(3))) end @device_override @inline function KI.get_global_id() - return (; x = get_global_id(1), y = get_global_id(2), z = get_global_id(3)) + return (; x = Int(get_global_id(1)), y = Int(get_global_id(2)), z = Int(get_global_id(3))) end @device_override @inline function KI.get_local_size() - return (; x = get_local_size(1), y = get_local_size(2), z = get_local_size(3)) + return (; x = Int(get_local_size(1)), y = Int(get_local_size(2)), z = Int(get_local_size(3))) end @device_override @inline function KI.get_num_groups() - return (; x = get_num_groups(1), y = get_num_groups(2), z = get_num_groups(3)) + return (; x = Int(get_num_groups(1)), y = Int(get_num_groups(2)), z = Int(get_num_groups(3))) end @device_override @inline function KI.get_global_size() - return (; x = get_global_size(1), y = get_global_size(2), z = get_global_size(3)) + return (; x = Int(get_global_size(1)), y = Int(get_global_size(2)), z = Int(get_global_size(3))) end @device_override @inline function KA.__validindex(ctx) diff --git a/test/intrinsics.jl b/test/intrinsics.jl index 693b2bd9e..e58b58a63 100644 --- a/test/intrinsics.jl +++ b/test/intrinsics.jl @@ -25,7 +25,7 @@ function intrinsics_testsuite(backend, AT) # Test with small kernel N = 16 - results = AT(zeros(UInt32, 6, N)) + results = AT(zeros(Int, 6, N)) kernel = test_intrinsics_kernel(backend(), 4, (N,)) kernel(results, ndrange = N) From 84d0c68f61a33fdf621bf67419fd3ffb70ca4841 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Thu, 2 Oct 2025 16:58:13 -0300 Subject: [PATCH 08/13] Format --- src/KernelAbstractions.jl | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl index cd3d1fc99..138c63c52 100644 --- a/src/KernelAbstractions.jl +++ b/src/KernelAbstractions.jl @@ -814,13 +814,9 @@ include("macros.jl") ### function Scratchpad end -function SharedMemory(t::Type{T}, dims::Val{Dims}, id::Val{Id}) where {T, Dims, Id} - KernelIntrinsics.localmemory(t, dims, id) -end +SharedMemory(t::Type{T}, dims::Val{Dims}, id::Val{Id}) where {T, Dims, Id} = KernelIntrinsics.localmemory(t, dims, id) -function __synchronize() - KernelIntrinsics.barrier() -end +__synchronize() = KernelIntrinsics.barrier() @generated function __print(items...) str = "" From 639414806c591e581b6e0d4420530dfbf7081f63 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 22 Oct 2025 00:45:13 -0300 Subject: [PATCH 09/13] Launch interface --- src/KernelAbstractions.jl | 7 ++++--- src/intrinsics.jl | 39 ++++++++++++++++++++++++++++++++++++++- src/pocl/backend.jl | 2 +- 3 files changed, 43 insertions(+), 5 deletions(-) diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl index 138c63c52..7b6266189 100644 --- a/src/KernelAbstractions.jl +++ b/src/KernelAbstractions.jl @@ -244,10 +244,10 @@ Declare storage that is local to a workgroup. """ macro localmem(T, dims) # Stay in sync with CUDAnative - id = gensym("static_shmem") + # id = gensym("static_shmem") return quote - $SharedMemory($(esc(T)), Val($(esc(dims))), Val($(QuoteNode(id)))) + $SharedMemory($(esc(T)), Val($(esc(dims))))#, Val($(QuoteNode(id)))) end end @@ -814,7 +814,8 @@ include("macros.jl") ### function Scratchpad end -SharedMemory(t::Type{T}, dims::Val{Dims}, id::Val{Id}) where {T, Dims, Id} = KernelIntrinsics.localmemory(t, dims, id) +# SharedMemory(t::Type{T}, dims::Val{Dims}, id::Val{Id}) where {T, Dims, Id} = KernelIntrinsics.localmemory(t, dims, id) +SharedMemory(t::Type{T}, dims::Val{Dims}) where {T, Dims} = KernelIntrinsics.localmemory(t, dims) __synchronize() = KernelIntrinsics.barrier() diff --git a/src/intrinsics.jl b/src/intrinsics.jl index 060116200..77bbf2a3e 100644 --- a/src/intrinsics.jl +++ b/src/intrinsics.jl @@ -51,10 +51,47 @@ Returns the unique group ID. """ function get_group_id end -function localmemory end +""" + localmemory(T, dims) + +Declare memory that is local to a workgroup. + +!!! note + Backend implementations **must** implement: + ``` + localmemory(T::DataType, ::Val{Dims}) where {T, Dims} + ``` + As well as the on-device functionality. +""" +localmemory(::Type{T}, dims) where T = localmemory(T, Val(dims)) +# @inline localmemory(::Type{T}, dims::Val{Dims}) where {T, Dims} = localmemory(T, dims, Val(gensym("static_shmem"))) + function barrier() error("Group barrier used outside kernel or not captured") end function print end + +""" + KIKernel{Backend, BKern} + +KIKernel closure struct that is used to represent the backend +kernel on the host. + +!!! note + Backend implementations **must** implement: + ``` + KI.KIKernel(::NewBackend, f, args...; kwargs...) + (kernel::KIKernel{<:NewBackend})(args...; numworkgroups=nothing, workgroupsize=nothing) + ``` + As well as the on-device functionality. +""" +struct KIKernel{Backend, BKern} + backend::Backend + kern::BKern +end + +function kernel_max_work_group_size end +function max_work_group_size end +function multiprocessor_count end end diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl index ef0aba6bd..4a50cc7e0 100644 --- a/src/pocl/backend.jl +++ b/src/pocl/backend.jl @@ -178,7 +178,7 @@ end ## Shared and Scratch Memory -@device_override @inline function KI.localmemory(::Type{T}, ::Val{Dims}, ::Val{Id}) where {T, Dims, Id} +@device_override @inline function KI.localmemory(::Type{T}, ::Val{Dims}) where {T, Dims} ptr = POCL.emit_localmemory(T, Val(prod(Dims))) CLDeviceArray(Dims, ptr) end From 8e533ffebc469787cab72b862a12153710f05c8c Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 22 Oct 2025 10:24:19 -0300 Subject: [PATCH 10/13] Docs --- src/intrinsics.jl | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/intrinsics.jl b/src/intrinsics.jl index 77bbf2a3e..9a271fa2d 100644 --- a/src/intrinsics.jl +++ b/src/intrinsics.jl @@ -66,9 +66,28 @@ Declare memory that is local to a workgroup. localmemory(::Type{T}, dims) where T = localmemory(T, Val(dims)) # @inline localmemory(::Type{T}, dims::Val{Dims}) where {T, Dims} = localmemory(T, dims, Val(gensym("static_shmem"))) +""" + barrier() + +After a `barrier()` call, all read and writes to global and local memory +from each thread in the workgroup are visible in from all other threads in the +workgroup. + +!!! note + `barrier()` must be encountered by all workitems of a work-group executing the kernel or by none at all. + +!!! note + Backend implementations **must** implement: + ``` + @device_override barrier() + ``` + As well as the on-device functionality. +""" function barrier() error("Group barrier used outside kernel or not captured") end + +# TODO function print end From 31abcb3ead7619565a4279fe23e4c0d12f5c1cee Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 22 Oct 2025 10:27:40 -0300 Subject: [PATCH 11/13] New stuff docs --- src/intrinsics.jl | 46 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/src/intrinsics.jl b/src/intrinsics.jl index 9a271fa2d..efbe93f0c 100644 --- a/src/intrinsics.jl +++ b/src/intrinsics.jl @@ -110,7 +110,51 @@ struct KIKernel{Backend, BKern} kern::BKern end +""" + kernel_max_work_group_size(backend, kern; [max_work_items::Int])::Int + +The maximum workgroup size limit for a kernel as reported by the backend. +This function should always be used to determine the workgroup size before +launching a kernel. + +!!! note + Backend implementations **must** implement: + ``` + kernel_max_work_group_size(backend::NewBackend, kern::KIKernel{<:NewBackend}; max_work_items::Int=typemax(Int))::Int + ``` + As well as the on-device functionality. +""" function kernel_max_work_group_size end + +""" + max_work_group_size(backend, kern; [max_work_items::Int])::Int + +The maximum workgroup size limit for a kernel as reported by the backend. +This function represents a theoretical maximum; `kernel_max_work_group_size` +should be used before launching a kernel as some backends may error if +kernel launch with too big a workgroup is attempted. + +!!! note + Backend implementations **must** implement: + ``` + max_work_group_size(backend::NewBackend)::Int + ``` + As well as the on-device functionality. +""" function max_work_group_size end -function multiprocessor_count end + +""" + multiprocessor_count(backend::NewBackend)::Int + +The multiprocessor count for the current device used by `backend`. +Used for certain algorithm optimizations. + +!!! note + Backend implementations **may** implement: + ``` + multiprocessor_count(backend::NewBackend)::Int + ``` + As well as the on-device functionality. +""" +multiprocessor_count(::Backend) = 0 end From f2db0a2247202f166e49444ecd9e9ad70e823a6a Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 22 Oct 2025 10:36:26 -0300 Subject: [PATCH 12/13] Fix --- src/intrinsics.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/intrinsics.jl b/src/intrinsics.jl index efbe93f0c..5492d8f60 100644 --- a/src/intrinsics.jl +++ b/src/intrinsics.jl @@ -105,9 +105,9 @@ kernel on the host. ``` As well as the on-device functionality. """ -struct KIKernel{Backend, BKern} - backend::Backend - kern::BKern +struct KIKernel{B, Kern} + backend::B + kern::Kern end """ @@ -156,5 +156,5 @@ Used for certain algorithm optimizations. ``` As well as the on-device functionality. """ -multiprocessor_count(::Backend) = 0 +multiprocessor_count(_) = 0 end From 2d0a5151b252904923307c2addb1fbfb0951a6da Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 22 Oct 2025 11:50:03 -0300 Subject: [PATCH 13/13] Temp adaptation to test in-progress interface with CUDA.jl --- src/intrinsics.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/intrinsics.jl b/src/intrinsics.jl index 5492d8f60..37cbba88a 100644 --- a/src/intrinsics.jl +++ b/src/intrinsics.jl @@ -101,7 +101,7 @@ kernel on the host. Backend implementations **must** implement: ``` KI.KIKernel(::NewBackend, f, args...; kwargs...) - (kernel::KIKernel{<:NewBackend})(args...; numworkgroups=nothing, workgroupsize=nothing) + (kernel::KIKernel{<:NewBackend})(args...; numworkgroups=nothing, workgroupsize=nothing, kwargs...) ``` As well as the on-device functionality. """