41 changes: 29 additions & 12 deletions src/KernelAbstractions.jl
@@ -194,6 +194,10 @@ function unsafe_free! end

unsafe_free!(::AbstractArray) = return

include("intrinsics.jl")
import .KernelIntrinsics
export KernelIntrinsics

###
# Kernel language
# - @localmem
@@ -240,10 +244,10 @@ Declare storage that is local to a workgroup.
"""
macro localmem(T, dims)
# Stay in sync with CUDAnative
id = gensym("static_shmem")
# id = gensym("static_shmem")

return quote
$SharedMemory($(esc(T)), Val($(esc(dims))), Val($(QuoteNode(id))))
$SharedMemory($(esc(T)), Val($(esc(dims))))#, Val($(QuoteNode(id))))
end
end

@@ -460,13 +464,27 @@ end
# Internal kernel functions
###

function __index_Local_Linear end
function __index_Group_Linear end
function __index_Global_Linear end
function __index_Local_Linear(ctx)
return KernelIntrinsics.get_local_id().x
end

function __index_Group_Linear(ctx)
return KernelIntrinsics.get_group_id().x
end

function __index_Global_Linear(ctx)
return KernelIntrinsics.get_global_id().x
end

function __index_Local_Cartesian end
function __index_Group_Cartesian end
function __index_Global_Cartesian end
function __index_Local_Cartesian(ctx)
return @inbounds workitems(__iterspace(ctx))[KernelIntrinsics.get_local_id().x]
end
function __index_Group_Cartesian(ctx)
return @inbounds blocks(__iterspace(ctx))[KernelIntrinsics.get_group_id().x]
end
function __index_Global_Cartesian(ctx)
return @inbounds expand(__iterspace(ctx), KernelIntrinsics.get_group_id().x, KernelIntrinsics.get_local_id().x)
end

@inline __index_Local_NTuple(ctx, I...) = Tuple(__index_Local_Cartesian(ctx, I...))
@inline __index_Group_NTuple(ctx, I...) = Tuple(__index_Group_Cartesian(ctx, I...))
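
With these definitions in place, the user-facing `@index` expressions lower to the new intrinsics. Roughly, for a 1-D ndrange (a sketch of the correspondence, not an exhaustive mapping):

```
# @index(Global, Linear) -> KernelIntrinsics.get_global_id().x
# @index(Local,  Linear) -> KernelIntrinsics.get_local_id().x
# @index(Group,  Linear) -> KernelIntrinsics.get_group_id().x
# The Cartesian variants additionally map through the kernel's iteration space,
# as in __index_Global_Cartesian above.
```
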
@@ -796,11 +814,10 @@ include("macros.jl")
###

function Scratchpad end
function SharedMemory end
# SharedMemory(t::Type{T}, dims::Val{Dims}, id::Val{Id}) where {T, Dims, Id} = KernelIntrinsics.localmemory(t, dims, id)
SharedMemory(t::Type{T}, dims::Val{Dims}) where {T, Dims} = KernelIntrinsics.localmemory(t, dims)

function __synchronize()
error("@synchronize used outside kernel or not captured")
end
__synchronize() = KernelIntrinsics.barrier()
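
In user-visible terms, the memory and synchronization macros now route through the same intrinsics (a rough summary):

```
# @localmem T dims   allocates via KernelIntrinsics.localmemory(T, Val(dims))
# @synchronize       lowers to KernelIntrinsics.barrier()
```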

@generated function __print(items...)
str = ""
160 changes: 160 additions & 0 deletions src/intrinsics.jl
@@ -0,0 +1,160 @@
module KernelIntrinsics

"""
get_global_size()::@NamedTuple{x::Int, y::Int, z::Int}

Return the number of global work-items in each dimension.
"""
function get_global_size end

"""
get_global_id()::@NamedTuple{x::Int, y::Int, z::Int}

Return the unique global work-item ID.

!!! note
1-based.
"""
function get_global_id end

"""
get_local_size()::@NamedTuple{x::Int, y::Int, z::Int}

Return the number of local work-items in each dimension.
"""
function get_local_size end

"""
get_local_id()::@NamedTuple{x::Int, y::Int, z::Int}

Return the unique local work-item ID.

!!! note
1-based.
"""
function get_local_id end

"""
get_num_groups()::@NamedTuple{x::Int, y::Int, z::Int}

Return the number of workgroups.
"""
function get_num_groups end

"""
get_group_id()::@NamedTuple{x::Int, y::Int, z::Int}

Return the unique workgroup ID.

!!! note
1-based.
"""
function get_group_id end
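
A hedged usage sketch of these indexing intrinsics inside a kernel body; the kernel name, array argument, and launch configuration are illustrative and mirror the pattern used in the test suite later in this PR:

```
using KernelAbstractions

@kernel cpu = false inbounds = true unsafe_indices = true function fill_ids!(out)
    gid = KernelIntrinsics.get_global_id()     # 1-based (x, y, z) NamedTuple
    gsz = KernelIntrinsics.get_global_size()
    if gid.x <= gsz.x && gid.x <= length(out)
        out[gid.x] = gid.x
    end
end

# k = fill_ids!(backend, 64)          # `backend` is some KA backend instance (hypothetical)
# k(out, ndrange = length(out))
# KernelAbstractions.synchronize(backend)
```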

"""
localmemory(T, dims)

Declare memory that is local to a workgroup.

!!! note
Backend implementations **must** implement:
```
localmemory(::Type{T}, ::Val{Dims}) where {T, Dims}
```
As well as the on-device functionality.
"""
localmemory(::Type{T}, dims) where T = localmemory(T, Val(dims))
# @inline localmemory(::Type{T}, dims::Val{Dims}) where {T, Dims} = localmemory(T, dims, Val(gensym("static_shmem")))

"""
barrier()

After a `barrier()` call, all reads and writes to global and local memory
from each work-item in the workgroup are visible to all other work-items in the
workgroup.

!!! note
`barrier()` must be encountered either by all work-items of a workgroup executing the kernel or by none at all.

!!! note
Backend implementations **must** implement:
```
@device_override barrier()
```
As well as the on-device functionality.
"""
function barrier()
error("Group barrier used outside kernel or not captured")
end
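
A minimal sketch of the staging pattern this contract enables. It assumes user kernels may call `localmemory` directly (rather than via `@localmem`), that the workgroup size is exactly 64, and that the array length is a multiple of 64; these are assumptions of the sketch, not requirements of the API:

```
using KernelAbstractions

@kernel cpu = false inbounds = true unsafe_indices = true function reverse_tiles!(A)
    lid = KernelIntrinsics.get_local_id().x
    lsz = KernelIntrinsics.get_local_size().x
    gid = KernelIntrinsics.get_global_id().x

    tile = KernelIntrinsics.localmemory(eltype(A), Val(64))  # assumes workgroupsize == 64
    tile[lid] = A[gid]
    KernelIntrinsics.barrier()     # local writes are now visible to the whole workgroup
    A[gid] = tile[lsz - lid + 1]   # assumes length(A) is a multiple of 64
end
```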

# TODO
function print end


"""
KIKernel{Backend, BKern}

Closure struct that represents the backend kernel on the host.

!!! note
Backend implementations **must** implement:
```
KI.KIKernel(::NewBackend, f, args...; kwargs...)
(kernel::KIKernel{<:NewBackend})(args...; numworkgroups=nothing, workgroupsize=nothing, kwargs...)
```
As well as the on-device functionality.
"""
struct KIKernel{B, Kern}
backend::B
kern::Kern
end
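
A hedged host-side sketch of the two entry points named above; `NewBackend`, `gpu_fun`, and the launch parameters are placeholders:

```
# given a device-callable function `gpu_fun` and arguments `a`, `b`:
backend = NewBackend()
k = KernelIntrinsics.KIKernel(backend, gpu_fun, a, b)   # construct the backend kernel
k(a, b; numworkgroups = 16, workgroupsize = 64)         # launch with an explicit configuration
```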

"""
kernel_max_work_group_size(backend, kern; [max_work_items::Int])::Int

The maximum workgroup size limit for a kernel as reported by the backend.
This function should always be used to determine the workgroup size before
launching a kernel.

!!! note
Backend implementations **must** implement:
```
kernel_max_work_group_size(backend::NewBackend, kern::KIKernel{<:NewBackend}; max_work_items::Int=typemax(Int))::Int
```
As well as the on-device functionality.
"""
function kernel_max_work_group_size end
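
A sketch of the intended flow: query the per-kernel limit, then derive a launch configuration. The 1-D problem size `n`, the kernel `k`, and its arguments are hypothetical:

```
n = length(a)                      # 1-D problem size
wgsize = KernelIntrinsics.kernel_max_work_group_size(backend, k; max_work_items = n)
k(a, b; workgroupsize = wgsize, numworkgroups = cld(n, wgsize))
```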

"""
max_work_group_size(backend)::Int

The maximum workgroup size supported by the device used by `backend`.
This is a theoretical upper bound; `kernel_max_work_group_size` should be
queried before launching a kernel, since some backends may error if a launch
with too large a workgroup is attempted.

!!! note
Backend implementations **must** implement:
```
max_work_group_size(backend::NewBackend)::Int
```
As well as the on-device functionality.
"""
function max_work_group_size end

"""
multiprocessor_count(backend)::Int

The multiprocessor count of the device used by `backend`.
Used for certain algorithm optimizations; the generic fallback returns `0`
when the backend does not report this information.

!!! note
Backend implementations **may** implement:
```
multiprocessor_count(backend::NewBackend)::Int
```
As well as the on-device functionality.
"""
multiprocessor_count(_) = 0
end
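
As an illustration of the kind of optimization this enables, a grid-sizing heuristic might look like the following; the factor of 4, and `n` and `wgsize`, are assumptions of this sketch, not part of the API:

```
mpc = KernelIntrinsics.multiprocessor_count(backend)
# Use a small multiple of the multiprocessor count for a persistent-style launch,
# or fall back to covering the problem directly when the count is unreported (0).
numworkgroups = mpc > 0 ? 4 * mpc : cld(n, wgsize)
```
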
29 changes: 15 additions & 14 deletions src/pocl/backend.jl
@@ -140,29 +140,30 @@ end


## Indexing Functions
const KI = KA.KernelIntrinsics

@device_override @inline function KA.__index_Local_Linear(ctx)
return get_local_id(1)
@device_override @inline function KI.get_local_id()
return (; x = Int(get_local_id(1)), y = Int(get_local_id(2)), z = Int(get_local_id(3)))
end

@device_override @inline function KA.__index_Group_Linear(ctx)
return get_group_id(1)
@device_override @inline function KI.get_group_id()
return (; x = Int(get_group_id(1)), y = Int(get_group_id(2)), z = Int(get_group_id(3)))
end

@device_override @inline function KA.__index_Global_Linear(ctx)
return get_global_id(1)
@device_override @inline function KI.get_global_id()
return (; x = Int(get_global_id(1)), y = Int(get_global_id(2)), z = Int(get_global_id(3)))
end

@device_override @inline function KA.__index_Local_Cartesian(ctx)
@inbounds KA.workitems(KA.__iterspace(ctx))[get_local_id(1)]
@device_override @inline function KI.get_local_size()
return (; x = Int(get_local_size(1)), y = Int(get_local_size(2)), z = Int(get_local_size(3)))
end

@device_override @inline function KA.__index_Group_Cartesian(ctx)
@inbounds KA.blocks(KA.__iterspace(ctx))[get_group_id(1)]
@device_override @inline function KI.get_num_groups()
return (; x = Int(get_num_groups(1)), y = Int(get_num_groups(2)), z = Int(get_num_groups(3)))
end

@device_override @inline function KA.__index_Global_Cartesian(ctx)
return @inbounds KA.expand(KA.__iterspace(ctx), get_group_id(1), get_local_id(1))
@device_override @inline function KI.get_global_size()
return (; x = Int(get_global_size(1)), y = Int(get_global_size(2)), z = Int(get_global_size(3)))
end

@device_override @inline function KA.__validindex(ctx)
@@ -177,7 +178,7 @@

## Shared and Scratch Memory

@device_override @inline function KA.SharedMemory(::Type{T}, ::Val{Dims}, ::Val{Id}) where {T, Dims, Id}
@device_override @inline function KI.localmemory(::Type{T}, ::Val{Dims}) where {T, Dims}
ptr = POCL.emit_localmemory(T, Val(prod(Dims)))
CLDeviceArray(Dims, ptr)
end
@@ -189,7 +190,7 @@

## Synchronization and Printing

@device_override @inline function KA.__synchronize()
@device_override @inline function KI.barrier()
work_group_barrier(POCL.LOCAL_MEM_FENCE | POCL.GLOBAL_MEM_FENCE)
end

62 changes: 62 additions & 0 deletions test/intrinsics.jl
@@ -0,0 +1,62 @@

@kernel cpu = false inbounds = true unsafe_indices = true function test_intrinsics_kernel(results)
# Test all intrinsics return NamedTuples with x, y, z fields
global_size = KernelIntrinsics.get_global_size()
global_id = KernelIntrinsics.get_global_id()
local_size = KernelIntrinsics.get_local_size()
local_id = KernelIntrinsics.get_local_id()
num_groups = KernelIntrinsics.get_num_groups()
group_id = KernelIntrinsics.get_group_id()

if UInt32(global_id.x) <= UInt32(global_size.x)
results[1, global_id.x] = global_id.x
results[2, global_id.x] = local_id.x
results[3, global_id.x] = group_id.x
results[4, global_id.x] = global_size.x
results[5, global_id.x] = local_size.x
results[6, global_id.x] = num_groups.x
end
end


function intrinsics_testsuite(backend, AT)
@testset "KernelIntrinsics Tests" begin
@testset "Basic intrinsics functionality" begin

# Test with small kernel
N = 16
results = AT(zeros(Int, 6, N))

kernel = test_intrinsics_kernel(backend(), 4, (N,))
kernel(results, ndrange = N)
KernelAbstractions.synchronize(backend())

host_results = Array(results)

# Verify results make sense
for i in 1:N
global_id_x, local_id_x, group_id_x, global_size_x, local_size_x, num_groups_x = host_results[:, i]

# Global IDs should be 1-based and sequential
@test global_id_x == i

# Global size should match our ndrange
@test global_size_x == N

# Local size should be 4 (our workgroupsize)
@test local_size_x == 4

# Number of groups should be ceil(N/4) = 4
@test num_groups_x == 4

# Group ID should be 1-based
expected_group = div(i - 1, 4) + 1
@test group_id_x == expected_group

# Local ID should be 1-based within group
expected_local = ((i - 1) % 4) + 1
@test local_id_x == expected_local
end
end
end
end
5 changes: 5 additions & 0 deletions test/testsuite.jl
@@ -26,6 +26,7 @@ end


include("test.jl")
include("intrinsics.jl")
include("localmem.jl")
include("private.jl")
include("unroll.jl")
@@ -47,6 +48,10 @@ function testsuite(backend, backend_str, backend_mod, AT, DAT; skip_tests = Set{
specialfunctions_testsuite(backend)
end

@conditional_testset "Intrinsics" skip_tests begin
intrinsics_testsuite(backend, AT)
end

@conditional_testset "Localmem" skip_tests begin
localmem_testsuite(backend, AT)
end