From 236c812094fe915dbb5a5414857b3848ab4245bf Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Sun, 18 May 2025 12:24:29 -0300
Subject: [PATCH 1/7] Port `reverse` from CUDA

---
 Project.toml           |   2 +
 src/GPUArrays.jl       |   2 +
 src/host/reverse.jl    | 148 +++++++++++++++++++++++++++++++++++++++++
 test/testsuite/base.jl |  45 +++++++++++++
 4 files changed, 197 insertions(+)
 create mode 100644 src/host/reverse.jl

diff --git a/Project.toml b/Project.toml
index c399348c..f5929187 100644
--- a/Project.toml
+++ b/Project.toml
@@ -5,6 +5,7 @@ version = "11.2.6"
 [deps]
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527"
+GPUToolbox = "096a3bc2-3ced-46d0-87f4-dd12716f4bfc"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 LLVM = "929cbde3-209d-540e-8aea-75f648917ca0"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
@@ -24,6 +25,7 @@ JLD2Ext = "JLD2"
 [compat]
 Adapt = "4.0"
 GPUArraysCore = "= 0.2.0"
+GPUToolbox = "0.2, 0.3, 1"
 JLD2 = "0.4, 0.5, 0.6"
 KernelAbstractions = "0.9.28, 0.10"
 LLVM = "3.9, 4, 5, 6, 7, 8, 9"

diff --git a/src/GPUArrays.jl b/src/GPUArrays.jl
index 8c1fc14e..206f564b 100644
--- a/src/GPUArrays.jl
+++ b/src/GPUArrays.jl
@@ -1,5 +1,6 @@
 module GPUArrays
 
+using GPUToolbox
 using KernelAbstractions
 using Serialization
 using Random
@@ -26,6 +27,7 @@ include("host/construction.jl")
 ## integrations and specialized methods
 include("host/base.jl")
 include("host/indexing.jl")
+include("host/reverse.jl")
 include("host/broadcast.jl")
 include("host/mapreduce.jl")
 include("host/linalg.jl")

diff --git a/src/host/reverse.jl b/src/host/reverse.jl
new file mode 100644
index 00000000..0f2f5c3c
--- /dev/null
+++ b/src/host/reverse.jl
@@ -0,0 +1,148 @@
+# reversing
+
+# the kernel works by treating the array as 1d. after reversing by dimension x an element at
+# pos [i1, i2, i3, ... , i{x}, ..., i{n}] will be at
+# pos [i1, i2, i3, ... , d{x} - i{x} + 1, ..., i{n}] where d{x} is the size of dimension x
+
+# out-of-place version, copying a single value per thread from input to output
+function _reverse(input::AnyGPUArray{T, N}, output::AnyGPUArray{T, N};
+                  dims=1:ndims(input)) where {T, N}
+    @assert size(input) == size(output)
+    rev_dims = ntuple((d)-> d in dims && size(input, d) > 1, N)
+    ref = size(input) .+ 1
+    # converts an ND-index in the data array to the linear index
+    lin_idx = LinearIndices(input)
+    # converts a linear index in a reduced array to an ND-index, but using the reduced size
+    nd_idx = CartesianIndices(input)
+
+    ## COV_EXCL_START
+    @kernel unsafe_indices=true function kernel(input, output)
+        offset_in = Int32(@groupsize()[1]) * (@index(Group, Linear) - 1i32)
+        index_in = offset_in + @index(Local, Linear)
+
+        @inbounds if index_in <= length(input)
+            idx = Tuple(nd_idx[index_in])
+            idx = ifelse.(rev_dims, ref .- idx, idx)
+            index_out = lin_idx[idx...]
+            output[index_out] = input[index_in]
+        end
+    end
+    ## COV_EXCL_STOP
+
+    nthreads = 256
+
+    kernel(get_backend(input), nthreads)(input, output; ndrange=length(input))
+end
+
+# in-place version, swapping elements on half the number of threads
+function _reverse!(data::AnyGPUArray{T, N}; dims=1:ndims(data)) where {T, N}
+    rev_dims = ntuple((d)-> d in dims && size(data, d) > 1, N)
+    half_dim = findlast(rev_dims)
+    if isnothing(half_dim)
+        # no reverse operation needed at all in this case.
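+        # (findlast found no dimension that actually needs reversing: every
+        # requested dimension has size ≤ 1, so the data can be left untouched)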
+        return
+    end
+    ref = size(data) .+ 1
+    # converts an ND-index in the data array to the linear index
+    lin_idx = LinearIndices(data)
+    reduced_size = ntuple((d)->ifelse(d==half_dim, cld(size(data,d),2), size(data,d)), N)
+    reduced_length = prod(reduced_size)
+    # converts a linear index in a reduced array to an ND-index, but using the reduced size
+    nd_idx = CartesianIndices(reduced_size)
+
+    ## COV_EXCL_START
+    @kernel unsafe_indices=true function kernel(data)
+        offset_in = Int32(@groupsize()[1]) * (@index(Group, Linear) - 1i32)
+        index_in = offset_in + @index(Local, Linear)
+
+        @inbounds if index_in <= reduced_length
+            idx = Tuple(nd_idx[index_in])
+            index_in = lin_idx[idx...]
+            idx = ifelse.(rev_dims, ref .- idx, idx)
+            index_out = lin_idx[idx...]
+
+            if index_in < index_out
+                temp = data[index_out]
+                data[index_out] = data[index_in]
+                data[index_in] = temp
+            end
+        end
+    end
+    ## COV_EXCL_STOP
+
+    # NOTE: we launch slightly more than half the number of elements in the array as threads.
+    # The last non-singleton dimension along which to reverse is used to define how the array is split.
+    # Only the middle row in case of an odd array dimension could cause trouble, but this is prevented by
+    # ignoring the threads that cross the mid-point
+
+    nthreads = 256
+
+    kernel(get_backend(data), nthreads)(data; ndrange=length(data))
+end
+
+
+# n-dimensional API
+
+function Base.reverse!(data::AnyGPUArray{T, N}; dims=:) where {T, N}
+    if isa(dims, Colon)
+        dims = 1:ndims(data)
+    end
+    if !applicable(iterate, dims)
+        throw(ArgumentError("dimension $dims is not an iterable"))
+    end
+    if !all(1 .≤ dims .≤ ndims(data))
+        throw(ArgumentError("dimension $dims is not 1 ≤ $dims ≤ $(ndims(data))"))
+    end
+
+    _reverse!(data; dims=dims)
+
+    return data
+end
+
+# out-of-place
+function Base.reverse(input::AnyGPUArray{T, N}; dims=:) where {T, N}
+    if isa(dims, Colon)
+        dims = 1:ndims(input)
+    end
+    if !applicable(iterate, dims)
+        throw(ArgumentError("dimension $dims is not an iterable"))
+    end
+    if !all(1 .≤ dims .≤ ndims(input))
+        throw(ArgumentError("dimension $dims is not 1 ≤ $dims ≤ $(ndims(input))"))
+    end
+
+    if all(size(input)[[dims...]].==1)
+        # no reverse operation needed at all in this case.
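+        # (still return a fresh array so the out-of-place contract holds)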
+        return copy(input)
+    else
+        output = similar(input)
+        _reverse(input, output; dims=dims)
+        return output
+    end
+end
+
+
+# 1-dimensional API
+
+# in-place
+Base.@propagate_inbounds function Base.reverse!(data::AnyGPUVector{T}, start::Integer,
+                                                stop::Integer=length(data)) where {T}
+    _reverse!(view(data, start:stop))
+    return data
+end
+
+Base.reverse!(data::AnyGPUVector{T}) where {T} = @inbounds reverse!(data, 1, length(data))
+
+# out-of-place
+Base.@propagate_inbounds function Base.reverse(input::AnyGPUVector{T}, start::Integer,
+                                               stop::Integer=length(input)) where {T}
+    output = similar(input)
+
+    start > 1 && copyto!(output, 1, input, 1, start-1)
+    _reverse(view(input, start:stop), view(output, start:stop))
+    stop < length(input) && copyto!(output, stop+1, input, stop+1)
+
+    return output
+end
+
+Base.reverse(data::AnyGPUVector{T}) where {T} = @inbounds reverse(data, 1, length(data))
diff --git a/test/testsuite/base.jl b/test/testsuite/base.jl
index 6bcfd5b4..55436126 100644
--- a/test/testsuite/base.jl
+++ b/test/testsuite/base.jl
@@ -381,6 +381,51 @@ end
         gA = reshape(AT(A),4)
     end
 
+    @testset "reverse" begin
+        # 1-d out-of-place
+        @test compare(x->reverse(x), AT, rand(Float32, 1000))
+        @test compare(x->reverse(x, 10), AT, rand(Float32, 1000))
+        @test compare(x->reverse(x, 10, 90), AT, rand(Float32, 1000))
+
+        # 1-d in-place
+        @test compare(x->reverse!(x), AT, rand(Float32, 1000))
+        @test compare(x->reverse!(x, 10), AT, rand(Float32, 1000))
+        @test compare(x->reverse!(x, 10, 90), AT, rand(Float32, 1000))
+
+        # n-d out-of-place
+        for shape in ([1, 2, 4, 3], [4, 2], [5], [2^5, 2^5, 2^5]),
+            dim in 1:length(shape)
+            @test compare(x->reverse(x; dims=dim), AT, rand(Float32, shape...))
+
+            cpu = rand(Float32, shape...)
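+            # also exercise the in-place path against Base's CPU reverse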
+            gpu = AT(cpu)
+            reverse!(gpu; dims=dim)
+            @test Array(gpu) == reverse(cpu; dims=dim)
+        end
+
+        # supports multidimensional reverse
+        for shape in ([1, 2, 4, 3], [2^5, 2^5, 2^5]),
+            dim in ((1,2),(2,3),(1,3),:)
+            @test compare(x->reverse(x; dims=dim), AT, rand(Float32, shape...))
+
+            cpu = rand(Float32, shape...)
+            gpu = AT(cpu)
+            reverse!(gpu; dims=dim)
+            @test Array(gpu) == reverse(cpu; dims=dim)
+        end
+
+        # wrapped array
+        @test compare(x->reverse(x), AT, reshape(rand(Float32, 2,2), 4))
+
+        # error throwing
+        cpu = rand(Float32, 1,2,3,4)
+        gpu = AT(cpu)
+        @test_throws ArgumentError reverse!(gpu, dims=5)
+        @test_throws ArgumentError reverse!(gpu, dims=0)
+        @test_throws ArgumentError reverse(gpu, dims=5)
+        @test_throws ArgumentError reverse(gpu, dims=0)
+    end
+
     @testset "reinterpret" begin
         A = Int32[-1,-2,-3]
         dA = AT(A)

From 9cbfae70171a6e270c45b5ce0dac91a63c6cf6fc Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Sun, 2 Nov 2025 00:26:06 -0300
Subject: [PATCH 2/7] Use KernelIntrinsics

---
 Project.toml        |  2 +-
 src/GPUArrays.jl    |  1 +
 src/host/reverse.jl | 12 ++++++------
 3 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/Project.toml b/Project.toml
index f5929187..8ab8647b 100644
--- a/Project.toml
+++ b/Project.toml
@@ -27,7 +27,7 @@ Adapt = "4.0"
 GPUArraysCore = "= 0.2.0"
 GPUToolbox = "0.2, 0.3, 1"
 JLD2 = "0.4, 0.5, 0.6"
-KernelAbstractions = "0.9.28, 0.10"
+KernelAbstractions = "0.10"
 LLVM = "3.9, 4, 5, 6, 7, 8, 9"
 LinearAlgebra = "1"
 Printf = "1"

diff --git a/src/GPUArrays.jl b/src/GPUArrays.jl
index 206f564b..edb35931 100644
--- a/src/GPUArrays.jl
+++ b/src/GPUArrays.jl
@@ -2,6 +2,7 @@ module GPUArrays
 
 using GPUToolbox
 using KernelAbstractions
+import KernelAbstractions.KernelIntrinsics as KI
 using Serialization
 using Random
 using LinearAlgebra

diff --git a/src/host/reverse.jl b/src/host/reverse.jl
index 0f2f5c3c..1c845897 100644
--- a/src/host/reverse.jl
+++ b/src/host/reverse.jl
@@ -17,8 +17,8 @@ function _reverse(input::AnyGPUArray{T, N}, output::AnyGPUArray{T, N};
 
     ## COV_EXCL_START
     @kernel unsafe_indices=true function kernel(input, output)
-        offset_in = Int32(@groupsize()[1]) * (@index(Group, Linear) - 1i32)
-        index_in = offset_in + @index(Local, Linear)
+        offset_in = Int32(KI.get_local_size().x) * (KI.get_group_id().x - 1i32)
+        index_in = offset_in + KI.get_local_id().x
 
         @inbounds if index_in <= length(input)
             idx = Tuple(nd_idx[index_in])
@@ -31,7 +31,7 @@ function _reverse(input::AnyGPUArray{T, N}, output::AnyGPUArray{T, N};
 
     nthreads = 256
 
-    kernel(get_backend(input), nthreads)(input, output; ndrange=length(input))
+    kernel(get_backend(input))(input, output; ndrange=length(input), nthreads)
 end
 
 # in-place version, swapping elements on half the number of threads
@@ -52,8 +52,8 @@ function _reverse!(data::AnyGPUArray{T, N}; dims=1:ndims(data)) where {T, N}
 
     ## COV_EXCL_START
    @kernel unsafe_indices=true function kernel(data)
-        offset_in = Int32(@groupsize()[1]) * (@index(Group, Linear) - 1i32)
-        index_in = offset_in + @index(Local, Linear)
+        offset_in = Int32(KI.get_local_size().x) * (KI.get_group_id().x - 1i32)
+        index_in = offset_in + KI.get_local_id().x
 
         @inbounds if index_in <= reduced_length
             idx = Tuple(nd_idx[index_in])
@@ -77,7 +77,7 @@ function _reverse!(data::AnyGPUArray{T, N}; dims=1:ndims(data)) where {T, N}
 
     nthreads = 256
 
-    kernel(get_backend(data), nthreads)(data; ndrange=length(data))
+    kernel(get_backend(data))(data; ndrange=length(data), nthreads)
 end
 

From b395474a818f74611c2f72ea4b97a6152d6e271c Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Sun, 2 Nov 2025 00:38:58 -0300
Subject: [PATCH 3/7] fix

---
 src/host/reverse.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/host/reverse.jl b/src/host/reverse.jl
index 1c845897..8be1c404 100644
--- a/src/host/reverse.jl
+++ b/src/host/reverse.jl
@@ -31,7 +31,7 @@ function _reverse(input::AnyGPUArray{T, N}, output::AnyGPUArray{T, N};
 
     nthreads = 256
 
-    kernel(get_backend(input))(input, output; ndrange=length(input), nthreads)
+    kernel(get_backend(input))(input, output; ndrange=length(input), workgroupsize=nthreads)
 end
 
 # in-place version, swapping elements on half the number of threads
@@ -77,7 +77,7 @@ function _reverse!(data::AnyGPUArray{T, N}; dims=1:ndims(data)) where {T, N}
 
     nthreads = 256
 
-    kernel(get_backend(data))(data; ndrange=length(data), nthreads)
+    kernel(get_backend(data))(data; ndrange=length(data), workgroupsize=nthreads)
 end
 

From b5e2c3b26397a45dcdc075f3e5c9e5b43036b0f7 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Sun, 2 Nov 2025 14:25:22 -0400
Subject: [PATCH 4/7] Add some edge cases

---
 test/testsuite/base.jl | 32 ++++++++++++++++++--------------
 1 file changed, 18 insertions(+), 14 deletions(-)

diff --git a/test/testsuite/base.jl b/test/testsuite/base.jl
index 55436126..8a2f71bb 100644
--- a/test/testsuite/base.jl
+++ b/test/testsuite/base.jl
@@ -393,25 +393,29 @@ end
         @test compare(x->reverse!(x, 10, 90), AT, rand(Float32, 1000))
 
         # n-d out-of-place
-        for shape in ([1, 2, 4, 3], [4, 2], [5], [2^5, 2^5, 2^5]),
+        for shape in ([1, 2, 4, 3], [4, 2], [5], [0], [1], [2^5, 2^5, 2^5]),
             dim in 1:length(shape)
-            @test compare(x->reverse(x; dims=dim), AT, rand(Float32, shape...))
-
-            cpu = rand(Float32, shape...)
-            gpu = AT(cpu)
-            reverse!(gpu; dims=dim)
-            @test Array(gpu) == reverse(cpu; dims=dim)
+            @testset "Shape: $shape, Dim: $dim"
+                @test compare(x->reverse(x; dims=dim), AT, rand(Float32, shape...))
+
+                cpu = rand(Float32, shape...)
+                gpu = AT(cpu)
+                reverse!(gpu; dims=dim)
+                @test Array(gpu) == reverse(cpu; dims=dim)
+            end
         end
 
         # supports multidimensional reverse
-        for shape in ([1, 2, 4, 3], [2^5, 2^5, 2^5]),
+        for shape in ([1,1,1,1], [1, 2, 4, 3], [2^5, 2^5, 2^5]),
             dim in ((1,2),(2,3),(1,3),:)
-            @test compare(x->reverse(x; dims=dim), AT, rand(Float32, shape...))
-
-            cpu = rand(Float32, shape...)
-            gpu = AT(cpu)
-            reverse!(gpu; dims=dim)
-            @test Array(gpu) == reverse(cpu; dims=dim)
+            @testset "Shape: $shape, Dim: $dim"
+                @test compare(x->reverse(x; dims=dim), AT, rand(Float32, shape...))
+
+                cpu = rand(Float32, shape...)
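+                # same in-place check, now reversing along several dimensions at once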
+                gpu = AT(cpu)
+                reverse!(gpu; dims=dim)
+                @test Array(gpu) == reverse(cpu; dims=dim)
+            end
         end

From 1ab8b2190c3c9b95af42fb2d4005de2f7bc764d9 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Sun, 2 Nov 2025 14:26:09 -0400
Subject: [PATCH 5/7] Use KernelIntrinsics

---
 src/host/reverse.jl | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/src/host/reverse.jl b/src/host/reverse.jl
index 8be1c404..74453074 100644
--- a/src/host/reverse.jl
+++ b/src/host/reverse.jl
@@ -16,7 +16,7 @@ function _reverse(input::AnyGPUArray{T, N}, output::AnyGPUArray{T, N};
     nd_idx = CartesianIndices(input)
 
     ## COV_EXCL_START
-    @kernel unsafe_indices=true function kernel(input, output)
+    function rev_kernel(input, output)
         offset_in = Int32(KI.get_local_size().x) * (KI.get_group_id().x - 1i32)
         index_in = offset_in + KI.get_local_id().x
 
@@ -28,10 +28,11 @@ function _reverse(input::AnyGPUArray{T, N}, output::AnyGPUArray{T, N};
         end
     end
     ## COV_EXCL_STOP
+    kernel = KI.KIKernel(get_backend(input), rev_kernel, input, output)
+    nthreads = KI.kernel_max_work_group_size(backend, kernel; max_work_items=length(input))
+    ngroups = cld(length(input), nthreads)
 
-    nthreads = 256
-
-    kernel(get_backend(input))(input, output; ndrange=length(input), workgroupsize=nthreads)
+    kernel(input, output; numworkgroups=ngroups, workgroupsize=nthreads)
 end
 
 # in-place version, swapping elements on half the number of threads
@@ -52,7 +52,7 @@ function _reverse!(data::AnyGPUArray{T, N}; dims=1:ndims(data)) where {T, N}
     nd_idx = CartesianIndices(reduced_size)
 
     ## COV_EXCL_START
-    @kernel unsafe_indices=true function kernel(data)
+    function rev_kernel!(data)
         offset_in = Int32(KI.get_local_size().x) * (KI.get_group_id().x - 1i32)
         index_in = offset_in + KI.get_local_id().x
 
@@ -75,9 +76,12 @@ function _reverse!(data::AnyGPUArray{T, N}; dims=1:ndims(data)) where {T, N}
     # Only the middle row in case of an odd array dimension could cause trouble, but this is prevented by
     # ignoring the threads that cross the mid-point
 
-    nthreads = 256
 
-    kernel(get_backend(data))(data; ndrange=length(data), workgroupsize=nthreads)
+    kernel = KI.KIKernel(get_backend(data), rev_kernel!, data)
+    nthreads = KI.kernel_max_work_group_size(backend, kernel; max_work_items=reduced_length)
+    ngroups = cld(reduced_length, nthreads)
+
+    kernel(input, output; numworkgroups=ngroups, workgroupsize=nthreads)
 end
 

From cc4b829af5c35006cdc72d8a81123eaa7b5f1a58 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Sun, 2 Nov 2025 15:27:39 -0400
Subject: [PATCH 6/7] Test and fix some edge cases

---
 src/host/reverse.jl    | 25 ++++++++++++++++++-------
 test/testsuite/base.jl | 10 ++++++++--
 2 files changed, 26 insertions(+), 9 deletions(-)

diff --git a/src/host/reverse.jl b/src/host/reverse.jl
index 74453074..57ec8650 100644
--- a/src/host/reverse.jl
+++ b/src/host/reverse.jl
@@ -26,9 +26,12 @@ function _reverse(input::AnyGPUArray{T, N}, output::AnyGPUArray{T, N};
             index_out = lin_idx[idx...]
             output[index_out] = input[index_in]
         end
+        return
     end
     ## COV_EXCL_STOP
-    kernel = KI.KIKernel(get_backend(input), rev_kernel, input, output)
+
+    backend = get_backend(input)
+    kernel = KI.KIKernel(backend, rev_kernel, input, output)
     nthreads = KI.kernel_max_work_group_size(backend, kernel; max_work_items=length(input))
     ngroups = cld(length(input), nthreads)
 
@@ -68,6 +71,7 @@ function _reverse!(data::AnyGPUArray{T, N}; dims=1:ndims(data)) where {T, N}
                 data[index_in] = temp
             end
         end
+        return
     end
     ## COV_EXCL_STOP
 
@@ -76,12 +80,12 @@ function _reverse!(data::AnyGPUArray{T, N}; dims=1:ndims(data)) where {T, N}
     # Only the middle row in case of an odd array dimension could cause trouble, but this is prevented by
     # ignoring the threads that cross the mid-point
 
-
-    kernel = KI.KIKernel(get_backend(data), rev_kernel!, data)
+    backend = get_backend(data)
+    kernel = KI.KIKernel(backend, rev_kernel!, data)
     nthreads = KI.kernel_max_work_group_size(backend, kernel; max_work_items=reduced_length)
     ngroups = cld(reduced_length, nthreads)
 
-    kernel(input, output; numworkgroups=ngroups, workgroupsize=nthreads)
+    kernel(data; numworkgroups=ngroups, workgroupsize=nthreads)
 end
 
@@ -98,7 +102,10 @@ function Base.reverse!(data::AnyGPUArray{T, N}; dims=:) where {T, N}
         throw(ArgumentError("dimension $dims is not 1 ≤ $dims ≤ $(ndims(data))"))
     end
 
-    _reverse!(data; dims=dims)
+    # no reverse operation needed at all otherwise.
+    if !(all(size(data)[[dims...]].==1) || isempty(data))
+        _reverse!(data; dims=dims)
+    end
 
     return data
 end
@@ -115,7 +122,7 @@ function Base.reverse(input::AnyGPUArray{T, N}; dims=:) where {T, N}
         throw(ArgumentError("dimension $dims is not 1 ≤ $dims ≤ $(ndims(input))"))
     end
 
-    if all(size(input)[[dims...]].==1)
+    if all(size(input)[[dims...]].==1) || isempty(input)
         # no reverse operation needed at all in this case.
         return copy(input)
     else
@@ -131,7 +138,8 @@ end
 # in-place
 Base.@propagate_inbounds function Base.reverse!(data::AnyGPUVector{T}, start::Integer,
                                                 stop::Integer=length(data)) where {T}
-    _reverse!(view(data, start:stop))
+    # only reverse if there are more than 2 elements to reverse
+    (stop-start > 1) && _reverse!(view(data, start:stop))
     return data
 end
@@ -140,6 +148,9 @@ Base.reverse!(data::AnyGPUVector{T}) where {T} = @inbounds reverse!(data, 1, len
 # out-of-place
 Base.@propagate_inbounds function Base.reverse(input::AnyGPUVector{T}, start::Integer,
                                                stop::Integer=length(input)) where {T}
+    # Copy array in one kernel if no work is to be done
+    (stop-start > 1) || return copy(input)
+
     output = similar(input)
 
     start > 1 && copyto!(output, 1, input, 1, start-1)

diff --git a/test/testsuite/base.jl b/test/testsuite/base.jl
index 8a2f71bb..6f45ad12 100644
--- a/test/testsuite/base.jl
+++ b/test/testsuite/base.jl
@@ -384,18 +384,24 @@ end
     @testset "reverse" begin
         # 1-d out-of-place
         @test compare(x->reverse(x), AT, rand(Float32, 1000))
+        @test compare(x->reverse(x), AT, rand(Float32, 0))
+        @test compare(x->reverse(x), AT, rand(Float32, 1))
         @test compare(x->reverse(x, 10), AT, rand(Float32, 1000))
         @test compare(x->reverse(x, 10, 90), AT, rand(Float32, 1000))
+        @test compare(x->reverse(x, 3, 3), AT, Float32[1,2,3,4,5])
 
         # 1-d in-place
         @test compare(x->reverse!(x), AT, rand(Float32, 1000))
+        @test compare(x->reverse!(x), AT, rand(Float32, 0))
+        @test compare(x->reverse!(x), AT, rand(Float32, 1))
         @test compare(x->reverse!(x, 10), AT, rand(Float32, 1000))
         @test compare(x->reverse!(x, 10, 90), AT, rand(Float32, 1000))
+        @test compare(x->reverse!(x, 3, 3), AT, Float32[1,2,3,4,5])
 
         # n-d out-of-place
         for shape in ([1, 2, 4, 3], [4, 2], [5], [0], [1], [2^5, 2^5, 2^5]),
             dim in 1:length(shape)
-            @testset "Shape: $shape, Dim: $dim"
+            @testset "Shape: $shape, Dim: $dim" begin
                 @test compare(x->reverse(x; dims=dim), AT, rand(Float32, shape...))
 
                 cpu = rand(Float32, shape...)
                 gpu = AT(cpu)
                 reverse!(gpu; dims=dim)
                 @test Array(gpu) == reverse(cpu; dims=dim)
             end
         end
 
         # supports multidimensional reverse
         for shape in ([1,1,1,1], [1, 2, 4, 3], [2^5, 2^5, 2^5]),
             dim in ((1,2),(2,3),(1,3),:)
-            @testset "Shape: $shape, Dim: $dim"
+            @testset "Shape: $shape, Dim: $dim" begin
                 @test compare(x->reverse(x; dims=dim), AT, rand(Float32, shape...))
 
                 cpu = rand(Float32, shape...)
From acd7c632c2eed4c6d8ac0a849494146adff1f651 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Tue, 4 Nov 2025 13:02:30 -0400 Subject: [PATCH 7/7] New kernel interface --- src/host/reverse.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/host/reverse.jl b/src/host/reverse.jl index 57ec8650..0de8075e 100644 --- a/src/host/reverse.jl +++ b/src/host/reverse.jl @@ -31,7 +31,7 @@ function _reverse(input::AnyGPUArray{T, N}, output::AnyGPUArray{T, N}; ## COV_EXCL_STOP backend = get_backend(input) - kernel = KI.KIKernel(backend, rev_kernel, input, output) + kernel = KI.@kikernel backend launch=false rev_kernel(input, output) nthreads = KI.kernel_max_work_group_size(backend, kernel; max_work_items=length(input)) ngroups = cld(length(input), nthreads) @@ -81,7 +81,7 @@ function _reverse!(data::AnyGPUArray{T, N}; dims=1:ndims(data)) where {T, N} # ignoring the threads that cross the mid-point backend = get_backend(data) - kernel = KI.KIKernel(backend, rev_kernel!, data) + kernel = KI.@kikernel backend launch=false rev_kernel!(data) nthreads = KI.kernel_max_work_group_size(backend, kernel; max_work_items=reduced_length) ngroups = cld(reduced_length, nthreads)