41 changes: 29 additions & 12 deletions src/KernelAbstractions.jl
@@ -194,6 +194,10 @@ function unsafe_free! end

unsafe_free!(::AbstractArray) = return

include("intrinsics.jl")
import .KernelIntrinsics
export KernelIntrinsics

###
# Kernel language
# - @localmem
@@ -240,10 +244,10 @@ Declare storage that is local to a workgroup.
"""
macro localmem(T, dims)
# Stay in sync with CUDAnative
id = gensym("static_shmem")
# id = gensym("static_shmem")

return quote
$SharedMemory($(esc(T)), Val($(esc(dims))), Val($(QuoteNode(id))))
$SharedMemory($(esc(T)), Val($(esc(dims))))#, Val($(QuoteNode(id))))
end
end

@@ -460,13 +464,27 @@ end
# Internal kernel functions
###

function __index_Local_Linear end
function __index_Group_Linear end
function __index_Global_Linear end
function __index_Local_Linear(ctx)
return KernelIntrinsics.get_local_id().x
end

function __index_Group_Linear(ctx)
return KernelIntrinsics.get_group_id().x
end

function __index_Global_Linear(ctx)
return KernelIntrinsics.get_global_id().x
end

function __index_Local_Cartesian end
function __index_Group_Cartesian end
function __index_Global_Cartesian end
function __index_Local_Cartesian(ctx)
return @inbounds workitems(__iterspace(ctx))[KernelIntrinsics.get_local_id().x]
end
function __index_Group_Cartesian(ctx)
return @inbounds blocks(__iterspace(ctx))[KernelIntrinsics.get_group_id().x]
end
function __index_Global_Cartesian(ctx)
return @inbounds expand(__iterspace(ctx), KernelIntrinsics.get_group_id().x, KernelIntrinsics.get_local_id().x)
end

@inline __index_Local_NTuple(ctx, I...) = Tuple(__index_Local_Cartesian(ctx, I...))
@inline __index_Group_NTuple(ctx, I...) = Tuple(__index_Group_Cartesian(ctx, I...))
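
With these definitions in place, the user-facing `@index` expressions lower to the new intrinsics. Roughly, for a 1-D ndrange (a sketch of the correspondence, not an exhaustive mapping):

```
# @index(Global, Linear) -> KernelIntrinsics.get_global_id().x
# @index(Local,  Linear) -> KernelIntrinsics.get_local_id().x
# @index(Group,  Linear) -> KernelIntrinsics.get_group_id().x
# The Cartesian variants additionally map through the kernel's iteration space,
# as in __index_Global_Cartesian above.
```
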
@@ -796,11 +814,10 @@ include("macros.jl")
###

function Scratchpad end
function SharedMemory end
# SharedMemory(t::Type{T}, dims::Val{Dims}, id::Val{Id}) where {T, Dims, Id} = KernelIntrinsics.localmemory(t, dims, id)
SharedMemory(t::Type{T}, dims::Val{Dims}) where {T, Dims} = KernelIntrinsics.localmemory(t, dims)

function __synchronize()
error("@synchronize used outside kernel or not captured")
end
__synchronize() = KernelIntrinsics.barrier()
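
In user-visible terms, the memory and synchronization macros now route through the same intrinsics (a rough summary):

```
# @localmem T dims   allocates via KernelIntrinsics.localmemory(T, Val(dims))
# @synchronize       lowers to KernelIntrinsics.barrier()
```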

@generated function __print(items...)
str = ""
160 changes: 160 additions & 0 deletions src/intrinsics.jl
@@ -0,0 +1,160 @@
module KernelIntrinsics

"""
get_global_size()::@NamedTuple{x::Int, y::Int, z::Int}

Return the number of global work-items in each dimension.
"""
function get_global_size end

"""
get_global_id()::@NamedTuple{x::Int, y::Int, z::Int}

Return the unique global work-item ID.

!!! note
1-based.
"""
function get_global_id end

"""
get_local_size()::@NamedTuple{x::Int, y::Int, z::Int}

Return the number of local work-items in each dimension.
"""
function get_local_size end

"""
get_local_id()::@NamedTuple{x::Int, y::Int, z::Int}

Return the unique local work-item ID.

!!! note
1-based.
"""
function get_local_id end

"""
get_num_groups()::@NamedTuple{x::Int, y::Int, z::Int}

Return the number of workgroups.
"""
function get_num_groups end

"""
get_group_id()::@NamedTuple{x::Int, y::Int, z::Int}

Return the unique workgroup ID.

!!! note
1-based.
"""
function get_group_id end
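
A hedged usage sketch of these indexing intrinsics inside a kernel body; the kernel name, array argument, and launch configuration are illustrative and mirror the pattern used in the test suite later in this PR:

```
using KernelAbstractions

@kernel cpu = false inbounds = true unsafe_indices = true function fill_ids!(out)
    gid = KernelIntrinsics.get_global_id()     # 1-based (x, y, z) NamedTuple
    gsz = KernelIntrinsics.get_global_size()
    if gid.x <= gsz.x && gid.x <= length(out)
        out[gid.x] = gid.x
    end
end

# k = fill_ids!(backend, 64)          # `backend` is some KA backend instance (hypothetical)
# k(out, ndrange = length(out))
# KernelAbstractions.synchronize(backend)
```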

"""
localmemory(T, dims)

Declare memory that is local to a workgroup.

!!! note
Backend implementations **must** implement:
```
localmemory(::Type{T}, ::Val{Dims}) where {T, Dims}
```
As well as the on-device functionality.
"""
localmemory(::Type{T}, dims) where T = localmemory(T, Val(dims))
# @inline localmemory(::Type{T}, dims::Val{Dims}) where {T, Dims} = localmemory(T, dims, Val(gensym("static_shmem")))

"""
barrier()

After a `barrier()` call, all reads and writes to global and local memory
from each work-item in the workgroup are visible to all other work-items in the
workgroup.

!!! note
`barrier()` must be encountered either by all work-items of a workgroup executing the kernel or by none at all.

!!! note
Backend implementations **must** implement:
```
@device_override barrier()
```
As well as the on-device functionality.
"""
function barrier()
error("Group barrier used outside kernel or not captured")
end
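
A minimal sketch of the staging pattern this contract enables. It assumes user kernels may call `localmemory` directly (rather than via `@localmem`), that the workgroup size is exactly 64, and that the array length is a multiple of 64; these are assumptions of the sketch, not requirements of the API:

```
using KernelAbstractions

@kernel cpu = false inbounds = true unsafe_indices = true function reverse_tiles!(A)
    lid = KernelIntrinsics.get_local_id().x
    lsz = KernelIntrinsics.get_local_size().x
    gid = KernelIntrinsics.get_global_id().x

    tile = KernelIntrinsics.localmemory(eltype(A), Val(64))  # assumes workgroupsize == 64
    tile[lid] = A[gid]
    KernelIntrinsics.barrier()     # local writes are now visible to the whole workgroup
    A[gid] = tile[lsz - lid + 1]   # assumes length(A) is a multiple of 64
end
```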

# TODO
function print end


"""
KIKernel{Backend, BKern}

Closure struct that represents the backend kernel on the host.

!!! note
Backend implementations **must** implement:
```
KI.KIKernel(::NewBackend, f, args...; kwargs...)
(kernel::KIKernel{<:NewBackend})(args...; numworkgroups=nothing, workgroupsize=nothing, kwargs...)
```
As well as the on-device functionality.
"""
struct KIKernel{B, Kern}
backend::B
kern::Kern
end
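
A hedged host-side sketch of the two entry points named above; `NewBackend`, `gpu_fun`, and the launch parameters are placeholders:

```
# given a device-callable function `gpu_fun` and arguments `a`, `b`:
backend = NewBackend()
k = KernelIntrinsics.KIKernel(backend, gpu_fun, a, b)   # construct the backend kernel
k(a, b; numworkgroups = 16, workgroupsize = 64)         # launch with an explicit configuration
```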

"""
kernel_max_work_group_size(backend, kern; [max_work_items::Int])::Int

The maximum workgroup size limit for a kernel as reported by the backend.
This function should always be used to determine the workgroup size before
launching a kernel.

!!! note
Backend implementations **must** implement:
```
kernel_max_work_group_size(backend::NewBackend, kern::KIKernel{<:NewBackend}; max_work_items::Int=typemax(Int))::Int
```
As well as the on-device functionality.
"""
function kernel_max_work_group_size end
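
A sketch of the intended flow: query the per-kernel limit, then derive a launch configuration. The 1-D problem size `n`, the kernel `k`, and its arguments are hypothetical:

```
n = length(a)                      # 1-D problem size
wgsize = KernelIntrinsics.kernel_max_work_group_size(backend, k; max_work_items = n)
k(a, b; workgroupsize = wgsize, numworkgroups = cld(n, wgsize))
```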

"""
max_work_group_size(backend)::Int

The maximum workgroup size supported by the device used by `backend`.
This is a theoretical upper bound; `kernel_max_work_group_size` should be
queried before launching a kernel, since some backends may error if a launch
with too large a workgroup is attempted.

!!! note
Backend implementations **must** implement:
```
max_work_group_size(backend::NewBackend)::Int
```
As well as the on-device functionality.
"""
function max_work_group_size end

"""
multiprocessor_count(backend)::Int

The multiprocessor count of the device used by `backend`.
Used for certain algorithm optimizations; the generic fallback returns `0`
when the backend does not report this information.

!!! note
Backend implementations **may** implement:
```
multiprocessor_count(backend::NewBackend)::Int
```
As well as the on-device functionality.
"""
multiprocessor_count(_) = 0
end
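
As an illustration of the kind of optimization this enables, a grid-sizing heuristic might look like the following; the factor of 4, and `n` and `wgsize`, are assumptions of this sketch, not part of the API:

```
mpc = KernelIntrinsics.multiprocessor_count(backend)
# Use a small multiple of the multiprocessor count for a persistent-style launch,
# or fall back to covering the problem directly when the count is unreported (0).
numworkgroups = mpc > 0 ? 4 * mpc : cld(n, wgsize)
```
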
29 changes: 15 additions & 14 deletions src/pocl/backend.jl
@@ -140,29 +140,30 @@ end


## Indexing Functions
const KI = KA.KernelIntrinsics

@device_override @inline function KA.__index_Local_Linear(ctx)
return get_local_id(1)
@device_override @inline function KI.get_local_id()
return (; x = Int(get_local_id(1)), y = Int(get_local_id(2)), z = Int(get_local_id(3)))
end

@device_override @inline function KA.__index_Group_Linear(ctx)
return get_group_id(1)
@device_override @inline function KI.get_group_id()
return (; x = Int(get_group_id(1)), y = Int(get_group_id(2)), z = Int(get_group_id(3)))
end

@device_override @inline function KA.__index_Global_Linear(ctx)
return get_global_id(1)
@device_override @inline function KI.get_global_id()
return (; x = Int(get_global_id(1)), y = Int(get_global_id(2)), z = Int(get_global_id(3)))
end

@device_override @inline function KA.__index_Local_Cartesian(ctx)
@inbounds KA.workitems(KA.__iterspace(ctx))[get_local_id(1)]
@device_override @inline function KI.get_local_size()
return (; x = Int(get_local_size(1)), y = Int(get_local_size(2)), z = Int(get_local_size(3)))
end

@device_override @inline function KA.__index_Group_Cartesian(ctx)
@inbounds KA.blocks(KA.__iterspace(ctx))[get_group_id(1)]
@device_override @inline function KI.get_num_groups()
return (; x = Int(get_num_groups(1)), y = Int(get_num_groups(2)), z = Int(get_num_groups(3)))
end

@device_override @inline function KA.__index_Global_Cartesian(ctx)
return @inbounds KA.expand(KA.__iterspace(ctx), get_group_id(1), get_local_id(1))
@device_override @inline function KI.get_global_size()
return (; x = Int(get_global_size(1)), y = Int(get_global_size(2)), z = Int(get_global_size(3)))
end

@device_override @inline function KA.__validindex(ctx)
@@ -177,7 +178,7 @@

## Shared and Scratch Memory

@device_override @inline function KA.SharedMemory(::Type{T}, ::Val{Dims}, ::Val{Id}) where {T, Dims, Id}
@device_override @inline function KI.localmemory(::Type{T}, ::Val{Dims}) where {T, Dims}
ptr = POCL.emit_localmemory(T, Val(prod(Dims)))
CLDeviceArray(Dims, ptr)
end
@@ -189,7 +190,7 @@

## Synchronization and Printing

@device_override @inline function KA.__synchronize()
@device_override @inline function KI.barrier()
work_group_barrier(POCL.LOCAL_MEM_FENCE | POCL.GLOBAL_MEM_FENCE)
end

62 changes: 62 additions & 0 deletions test/intrinsics.jl
@@ -0,0 +1,62 @@

@kernel cpu = false inbounds = true unsafe_indices = true function test_intrinsics_kernel(results)
# Test all intrinsics return NamedTuples with x, y, z fields
global_size = KernelIntrinsics.get_global_size()
global_id = KernelIntrinsics.get_global_id()
local_size = KernelIntrinsics.get_local_size()
local_id = KernelIntrinsics.get_local_id()
num_groups = KernelIntrinsics.get_num_groups()
group_id = KernelIntrinsics.get_group_id()

if UInt32(global_id.x) <= UInt32(global_size.x)
results[1, global_id.x] = global_id.x
results[2, global_id.x] = local_id.x
results[3, global_id.x] = group_id.x
results[4, global_id.x] = global_size.x
results[5, global_id.x] = local_size.x
results[6, global_id.x] = num_groups.x
end
end


function intrinsics_testsuite(backend, AT)
@testset "KernelIntrinsics Tests" begin
@testset "Basic intrinsics functionality" begin

# Test with small kernel
N = 16
results = AT(zeros(Int, 6, N))

kernel = test_intrinsics_kernel(backend(), 4, (N,))
kernel(results, ndrange = N)
KernelAbstractions.synchronize(backend())

host_results = Array(results)

# Verify results make sense
for i in 1:N
global_id_x, local_id_x, group_id_x, global_size_x, local_size_x, num_groups_x = host_results[:, i]

# Global IDs should be 1-based and sequential
@test global_id_x == i

# Global size should match our ndrange
@test global_size_x == N

# Local size should be 4 (our workgroupsize)
@test local_size_x == 4

# Number of groups should be ceil(N/4) = 4
@test num_groups_x == 4

# Group ID should be 1-based
expected_group = div(i - 1, 4) + 1
@test group_id_x == expected_group

# Local ID should be 1-based within group
expected_local = ((i - 1) % 4) + 1
@test local_id_x == expected_local
end
end
end
end
5 changes: 5 additions & 0 deletions test/testsuite.jl
@@ -26,6 +26,7 @@ end


include("test.jl")
include("intrinsics.jl")
include("localmem.jl")
include("private.jl")
include("unroll.jl")
@@ -47,6 +48,10 @@ function testsuite(backend, backend_str, backend_mod, AT, DAT; skip_tests = Set{
specialfunctions_testsuite(backend)
end

@conditional_testset "Intrinsics" skip_tests begin
intrinsics_testsuite(backend, AT)
end

@conditional_testset "Localmem" skip_tests begin
localmem_testsuite(backend, AT)
end