From 236c812094fe915dbb5a5414857b3848ab4245bf Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Sun, 18 May 2025 12:24:29 -0300
Subject: [PATCH 1/7] Port `reverse` from CUDA

---
 Project.toml           |   2 +
 src/GPUArrays.jl       |   2 +
 src/host/reverse.jl    | 148 +++++++++++++++++++++++++++++++++++++++++
 test/testsuite/base.jl |  45 +++++++++++++
 4 files changed, 197 insertions(+)
 create mode 100644 src/host/reverse.jl

diff --git a/Project.toml b/Project.toml
index c399348c..f5929187 100644
--- a/Project.toml
+++ b/Project.toml
@@ -5,6 +5,7 @@ version = "11.2.6"
 [deps]
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527"
+GPUToolbox = "096a3bc2-3ced-46d0-87f4-dd12716f4bfc"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 LLVM = "929cbde3-209d-540e-8aea-75f648917ca0"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
@@ -24,6 +25,7 @@ JLD2Ext = "JLD2"
 [compat]
 Adapt = "4.0"
 GPUArraysCore = "= 0.2.0"
+GPUToolbox = "0.2, 0.3, 1"
 JLD2 = "0.4, 0.5, 0.6"
 KernelAbstractions = "0.9.28, 0.10"
 LLVM = "3.9, 4, 5, 6, 7, 8, 9"

diff --git a/src/GPUArrays.jl b/src/GPUArrays.jl
index 8c1fc14e..206f564b 100644
--- a/src/GPUArrays.jl
+++ b/src/GPUArrays.jl
@@ -1,5 +1,6 @@
 module GPUArrays
 
+using GPUToolbox
 using KernelAbstractions
 using Serialization
 using Random
@@ -26,6 +27,7 @@ include("host/construction.jl")
 ## integrations and specialized methods
 include("host/base.jl")
 include("host/indexing.jl")
+include("host/reverse.jl")
 include("host/broadcast.jl")
 include("host/mapreduce.jl")
 include("host/linalg.jl")

diff --git a/src/host/reverse.jl b/src/host/reverse.jl
new file mode 100644
index 00000000..0f2f5c3c
--- /dev/null
+++ b/src/host/reverse.jl
@@ -0,0 +1,148 @@
+# reversing
+
+# the kernel works by treating the array as 1d. after reversing by dimension x an element at
+# pos [i1, i2, i3, ... , i{x}, ..., i{n}] will be at
+# pos [i1, i2, i3, ... , d{x} - i{x} + 1, ..., i{n}] where d{x} is the size of dimension x
+
+# out-of-place version, copying a single value per thread from input to output
+function _reverse(input::AnyGPUArray{T, N}, output::AnyGPUArray{T, N};
+                  dims=1:ndims(input)) where {T, N}
+    @assert size(input) == size(output)
+    rev_dims = ntuple((d)-> d in dims && size(input, d) > 1, N)
+    ref = size(input) .+ 1
+    # converts an ND-index in the data array to the linear index
+    lin_idx = LinearIndices(input)
+    # converts a linear index in a reduced array to an ND-index, but using the reduced size
+    nd_idx = CartesianIndices(input)
+
+    ## COV_EXCL_START
+    @kernel unsafe_indices=true function kernel(input, output)
+        offset_in = Int32(@groupsize()[1]) * (@index(Group, Linear) - 1i32)
+        index_in = offset_in + @index(Local, Linear)
+
+        @inbounds if index_in <= length(input)
+            idx = Tuple(nd_idx[index_in])
+            idx = ifelse.(rev_dims, ref .- idx, idx)
+            index_out = lin_idx[idx...]
+            output[index_out] = input[index_in]
+        end
+    end
+    ## COV_EXCL_STOP
+
+    nthreads = 256
+
+    kernel(get_backend(input), nthreads)(input, output; ndrange=length(input))
+end
+
+# in-place version, swapping elements on half the number of threads
+function _reverse!(data::AnyGPUArray{T, N}; dims=1:ndims(data)) where {T, N}
+    rev_dims = ntuple((d)-> d in dims && size(data, d) > 1, N)
+    half_dim = findlast(rev_dims)
+    if isnothing(half_dim)
+        # no reverse operation needed at all in this case.
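+        # (findlast found no dimension that actually needs reversing: every
+        # requested dimension has size ≤ 1, so the data can be left untouched)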
+        return
+    end
+    ref = size(data) .+ 1
+    # converts an ND-index in the data array to the linear index
+    lin_idx = LinearIndices(data)
+    reduced_size = ntuple((d)->ifelse(d==half_dim, cld(size(data,d),2), size(data,d)), N)
+    reduced_length = prod(reduced_size)
+    # converts a linear index in a reduced array to an ND-index, but using the reduced size
+    nd_idx = CartesianIndices(reduced_size)
+
+    ## COV_EXCL_START
+    @kernel unsafe_indices=true function kernel(data)
+        offset_in = Int32(@groupsize()[1]) * (@index(Group, Linear) - 1i32)
+        index_in = offset_in + @index(Local, Linear)
+
+        @inbounds if index_in <= reduced_length
+            idx = Tuple(nd_idx[index_in])
+            index_in = lin_idx[idx...]
+            idx = ifelse.(rev_dims, ref .- idx, idx)
+            index_out = lin_idx[idx...]
+
+            if index_in < index_out
+                temp = data[index_out]
+                data[index_out] = data[index_in]
+                data[index_in] = temp
+            end
+        end
+    end
+    ## COV_EXCL_STOP
+
+    # NOTE: we launch slightly more than half the number of elements in the array as threads.
+    # The last non-singleton dimension along which to reverse is used to define how the array is split.
+    # Only the middle row in case of an odd array dimension could cause trouble, but this is prevented by
+    # ignoring the threads that cross the mid-point
+
+    nthreads = 256
+
+    kernel(get_backend(data), nthreads)(data; ndrange=length(data))
+end
+
+
+# n-dimensional API
+
+function Base.reverse!(data::AnyGPUArray{T, N}; dims=:) where {T, N}
+    if isa(dims, Colon)
+        dims = 1:ndims(data)
+    end
+    if !applicable(iterate, dims)
+        throw(ArgumentError("dimension $dims is not an iterable"))
+    end
+    if !all(1 .≤ dims .≤ ndims(data))
+        throw(ArgumentError("dimension $dims is not 1 ≤ $dims ≤ $(ndims(data))"))
+    end
+
+    _reverse!(data; dims=dims)
+
+    return data
+end
+
+# out-of-place
+function Base.reverse(input::AnyGPUArray{T, N}; dims=:) where {T, N}
+    if isa(dims, Colon)
+        dims = 1:ndims(input)
+    end
+    if !applicable(iterate, dims)
+        throw(ArgumentError("dimension $dims is not an iterable"))
+    end
+    if !all(1 .≤ dims .≤ ndims(input))
+        throw(ArgumentError("dimension $dims is not 1 ≤ $dims ≤ $(ndims(input))"))
+    end
+
+    if all(size(input)[[dims...]].==1)
+        # no reverse operation needed at all in this case.
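+        # (still return a fresh array so the out-of-place contract holds)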
+        return copy(input)
+    else
+        output = similar(input)
+        _reverse(input, output; dims=dims)
+        return output
+    end
+end
+
+
+# 1-dimensional API
+
+# in-place
+Base.@propagate_inbounds function Base.reverse!(data::AnyGPUVector{T}, start::Integer,
+                                                stop::Integer=length(data)) where {T}
+    _reverse!(view(data, start:stop))
+    return data
+end
+
+Base.reverse!(data::AnyGPUVector{T}) where {T} = @inbounds reverse!(data, 1, length(data))
+
+# out-of-place
+Base.@propagate_inbounds function Base.reverse(input::AnyGPUVector{T}, start::Integer,
+                                               stop::Integer=length(input)) where {T}
+    output = similar(input)
+
+    start > 1 && copyto!(output, 1, input, 1, start-1)
+    _reverse(view(input, start:stop), view(output, start:stop))
+    stop < length(input) && copyto!(output, stop+1, input, stop+1)
+
+    return output
+end
+
+Base.reverse(data::AnyGPUVector{T}) where {T} = @inbounds reverse(data, 1, length(data))
diff --git a/test/testsuite/base.jl b/test/testsuite/base.jl
index 6bcfd5b4..55436126 100644
--- a/test/testsuite/base.jl
+++ b/test/testsuite/base.jl
@@ -381,6 +381,51 @@ end
         gA = reshape(AT(A),4)
     end
 
+    @testset "reverse" begin
+        # 1-d out-of-place
+        @test compare(x->reverse(x), AT, rand(Float32, 1000))
+        @test compare(x->reverse(x, 10), AT, rand(Float32, 1000))
+        @test compare(x->reverse(x, 10, 90), AT, rand(Float32, 1000))
+
+        # 1-d in-place
+        @test compare(x->reverse!(x), AT, rand(Float32, 1000))
+        @test compare(x->reverse!(x, 10), AT, rand(Float32, 1000))
+        @test compare(x->reverse!(x, 10, 90), AT, rand(Float32, 1000))
+
+        # n-d out-of-place
+        for shape in ([1, 2, 4, 3], [4, 2], [5], [2^5, 2^5, 2^5]),
+            dim in 1:length(shape)
+            @test compare(x->reverse(x; dims=dim), AT, rand(Float32, shape...))
+
+            cpu = rand(Float32, shape...)
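+            # also exercise the in-place path against Base's CPU reverse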
+            gpu = AT(cpu)
+            reverse!(gpu; dims=dim)
+            @test Array(gpu) == reverse(cpu; dims=dim)
+        end
+
+        # supports multidimensional reverse
+        for shape in ([1, 2, 4, 3], [2^5, 2^5, 2^5]),
+            dim in ((1,2),(2,3),(1,3),:)
+            @test compare(x->reverse(x; dims=dim), AT, rand(Float32, shape...))
+
+            cpu = rand(Float32, shape...)
+            gpu = AT(cpu)
+            reverse!(gpu; dims=dim)
+            @test Array(gpu) == reverse(cpu; dims=dim)
+        end
+
+        # wrapped array
+        @test compare(x->reverse(x), AT, reshape(rand(Float32, 2,2), 4))
+
+        # error throwing
+        cpu = rand(Float32, 1,2,3,4)
+        gpu = AT(cpu)
+        @test_throws ArgumentError reverse!(gpu, dims=5)
+        @test_throws ArgumentError reverse!(gpu, dims=0)
+        @test_throws ArgumentError reverse(gpu, dims=5)
+        @test_throws ArgumentError reverse(gpu, dims=0)
+    end
+
     @testset "reinterpret" begin
         A = Int32[-1,-2,-3]
         dA = AT(A)

From 9cbfae70171a6e270c45b5ce0dac91a63c6cf6fc Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Sun, 2 Nov 2025 00:26:06 -0300
Subject: [PATCH 2/7] Use KernelIntrinsics

---
 Project.toml        |  2 +-
 src/GPUArrays.jl    |  1 +
 src/host/reverse.jl | 12 ++++++------
 3 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/Project.toml b/Project.toml
index f5929187..8ab8647b 100644
--- a/Project.toml
+++ b/Project.toml
@@ -27,7 +27,7 @@ Adapt = "4.0"
 GPUArraysCore = "= 0.2.0"
 GPUToolbox = "0.2, 0.3, 1"
 JLD2 = "0.4, 0.5, 0.6"
-KernelAbstractions = "0.9.28, 0.10"
+KernelAbstractions = "0.10"
 LLVM = "3.9, 4, 5, 6, 7, 8, 9"
 LinearAlgebra = "1"
 Printf = "1"

diff --git a/src/GPUArrays.jl b/src/GPUArrays.jl
index 206f564b..edb35931 100644
--- a/src/GPUArrays.jl
+++ b/src/GPUArrays.jl
@@ -2,6 +2,7 @@ module GPUArrays
 
 using GPUToolbox
 using KernelAbstractions
+import KernelAbstractions.KernelIntrinsics as KI
 using Serialization
 using Random
 using LinearAlgebra

diff --git a/src/host/reverse.jl b/src/host/reverse.jl
index 0f2f5c3c..1c845897 100644
--- a/src/host/reverse.jl
+++ b/src/host/reverse.jl
@@ -17,8 +17,8 @@ function _reverse(input::AnyGPUArray{T, N}, output::AnyGPUArray{T, N};
 
     ## COV_EXCL_START
     @kernel unsafe_indices=true function kernel(input, output)
-        offset_in = Int32(@groupsize()[1]) * (@index(Group, Linear) - 1i32)
-        index_in = offset_in + @index(Local, Linear)
+        offset_in = Int32(KI.get_local_size().x) * (KI.get_group_id().x - 1i32)
+        index_in = offset_in + KI.get_local_id().x
 
         @inbounds if index_in <= length(input)
             idx = Tuple(nd_idx[index_in])
@@ -31,7 +31,7 @@ function _reverse(input::AnyGPUArray{T, N}, output::AnyGPUArray{T, N};
 
     nthreads = 256
 
-    kernel(get_backend(input), nthreads)(input, output; ndrange=length(input))
+    kernel(get_backend(input))(input, output; ndrange=length(input), nthreads)
 end
 
 # in-place version, swapping elements on half the number of threads
@@ -52,8 +52,8 @@ function _reverse!(data::AnyGPUArray{T, N}; dims=1:ndims(data)) where {T, N}
 
     ## COV_EXCL_START
    @kernel unsafe_indices=true function kernel(data)
-        offset_in = Int32(@groupsize()[1]) * (@index(Group, Linear) - 1i32)
-        index_in = offset_in + @index(Local, Linear)
+        offset_in = Int32(KI.get_local_size().x) * (KI.get_group_id().x - 1i32)
+        index_in = offset_in + KI.get_local_id().x
 
         @inbounds if index_in <= reduced_length
             idx = Tuple(nd_idx[index_in])
@@ -77,7 +77,7 @@ function _reverse!(data::AnyGPUArray{T, N}; dims=1:ndims(data)) where {T, N}
 
     nthreads = 256
 
-    kernel(get_backend(data), nthreads)(data; ndrange=length(data))
+    kernel(get_backend(data))(data; ndrange=length(data), nthreads)
 end
 

From b395474a818f74611c2f72ea4b97a6152d6e271c Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Sun, 2 Nov 2025 00:38:58 -0300
Subject: [PATCH 3/7] fix

---
 src/host/reverse.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/host/reverse.jl b/src/host/reverse.jl
index 1c845897..8be1c404 100644
--- a/src/host/reverse.jl
+++ b/src/host/reverse.jl
@@ -31,7 +31,7 @@ function _reverse(input::AnyGPUArray{T, N}, output::AnyGPUArray{T, N};
 
     nthreads = 256
 
-    kernel(get_backend(input))(input, output; ndrange=length(input), nthreads)
+    kernel(get_backend(input))(input, output; ndrange=length(input), workgroupsize=nthreads)
 end
 
 # in-place version, swapping elements on half the number of threads
@@ -77,7 +77,7 @@ function _reverse!(data::AnyGPUArray{T, N}; dims=1:ndims(data)) where {T, N}
 
     nthreads = 256
 
-    kernel(get_backend(data))(data; ndrange=length(data), nthreads)
+    kernel(get_backend(data))(data; ndrange=length(data), workgroupsize=nthreads)
 end
 

From b5e2c3b26397a45dcdc075f3e5c9e5b43036b0f7 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Sun, 2 Nov 2025 14:25:22 -0400
Subject: [PATCH 4/7] Add some edge cases

---
 test/testsuite/base.jl | 32 ++++++++++++++++++--------------
 1 file changed, 18 insertions(+), 14 deletions(-)

diff --git a/test/testsuite/base.jl b/test/testsuite/base.jl
index 55436126..8a2f71bb 100644
--- a/test/testsuite/base.jl
+++ b/test/testsuite/base.jl
@@ -393,25 +393,29 @@ end
         @test compare(x->reverse!(x, 10, 90), AT, rand(Float32, 1000))
 
         # n-d out-of-place
-        for shape in ([1, 2, 4, 3], [4, 2], [5], [2^5, 2^5, 2^5]),
+        for shape in ([1, 2, 4, 3], [4, 2], [5], [0], [1], [2^5, 2^5, 2^5]),
             dim in 1:length(shape)
-            @test compare(x->reverse(x; dims=dim), AT, rand(Float32, shape...))
-
-            cpu = rand(Float32, shape...)
-            gpu = AT(cpu)
-            reverse!(gpu; dims=dim)
-            @test Array(gpu) == reverse(cpu; dims=dim)
+            @testset "Shape: $shape, Dim: $dim"
+                @test compare(x->reverse(x; dims=dim), AT, rand(Float32, shape...))
+
+                cpu = rand(Float32, shape...)
+                gpu = AT(cpu)
+                reverse!(gpu; dims=dim)
+                @test Array(gpu) == reverse(cpu; dims=dim)
+            end
         end
 
         # supports multidimensional reverse
-        for shape in ([1, 2, 4, 3], [2^5, 2^5, 2^5]),
+        for shape in ([1,1,1,1], [1, 2, 4, 3], [2^5, 2^5, 2^5]),
             dim in ((1,2),(2,3),(1,3),:)
-            @test compare(x->reverse(x; dims=dim), AT, rand(Float32, shape...))
-
-            cpu = rand(Float32, shape...)
-            gpu = AT(cpu)
-            reverse!(gpu; dims=dim)
-            @test Array(gpu) == reverse(cpu; dims=dim)
+            @testset "Shape: $shape, Dim: $dim"
+                @test compare(x->reverse(x; dims=dim), AT, rand(Float32, shape...))
+
+                cpu = rand(Float32, shape...)
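+                # same in-place check, now reversing along several dimensions at once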
+                gpu = AT(cpu)
+                reverse!(gpu; dims=dim)
+                @test Array(gpu) == reverse(cpu; dims=dim)
+            end
         end

From 1ab8b2190c3c9b95af42fb2d4005de2f7bc764d9 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Sun, 2 Nov 2025 14:26:09 -0400
Subject: [PATCH 5/7] Use KernelIntrinsics

---
 src/host/reverse.jl | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/src/host/reverse.jl b/src/host/reverse.jl
index 8be1c404..74453074 100644
--- a/src/host/reverse.jl
+++ b/src/host/reverse.jl
@@ -16,7 +16,7 @@ function _reverse(input::AnyGPUArray{T, N}, output::AnyGPUArray{T, N};
     nd_idx = CartesianIndices(input)
 
     ## COV_EXCL_START
-    @kernel unsafe_indices=true function kernel(input, output)
+    function rev_kernel(input, output)
         offset_in = Int32(KI.get_local_size().x) * (KI.get_group_id().x - 1i32)
         index_in = offset_in + KI.get_local_id().x
 
@@ -28,10 +28,11 @@ function _reverse(input::AnyGPUArray{T, N}, output::AnyGPUArray{T, N};
         end
     end
     ## COV_EXCL_STOP
+    kernel = KI.KIKernel(get_backend(input), rev_kernel, input, output)
+    nthreads = KI.kernel_max_work_group_size(backend, kernel; max_work_items=length(input))
+    ngroups = cld(length(input), nthreads)
 
-    nthreads = 256
-
-    kernel(get_backend(input))(input, output; ndrange=length(input), workgroupsize=nthreads)
+    kernel(input, output; numworkgroups=ngroups, workgroupsize=nthreads)
 end
 
 # in-place version, swapping elements on half the number of threads
@@ -52,7 +52,7 @@ function _reverse!(data::AnyGPUArray{T, N}; dims=1:ndims(data)) where {T, N}
     nd_idx = CartesianIndices(reduced_size)
 
     ## COV_EXCL_START
-    @kernel unsafe_indices=true function kernel(data)
+    function rev_kernel!(data)
         offset_in = Int32(KI.get_local_size().x) * (KI.get_group_id().x - 1i32)
         index_in = offset_in + KI.get_local_id().x
 
@@ -75,9 +76,12 @@ function _reverse!(data::AnyGPUArray{T, N}; dims=1:ndims(data)) where {T, N}
     # Only the middle row in case of an odd array dimension could cause trouble, but this is prevented by
     # ignoring the threads that cross the mid-point
 
-    nthreads = 256
 
-    kernel(get_backend(data))(data; ndrange=length(data), workgroupsize=nthreads)
+    kernel = KI.KIKernel(get_backend(data), rev_kernel!, data)
+    nthreads = KI.kernel_max_work_group_size(backend, kernel; max_work_items=reduced_length)
+    ngroups = cld(reduced_length, nthreads)
+
+    kernel(input, output; numworkgroups=ngroups, workgroupsize=nthreads)
 end
 

From cc4b829af5c35006cdc72d8a81123eaa7b5f1a58 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Sun, 2 Nov 2025 15:27:39 -0400
Subject: [PATCH 6/7] Test and fix some edge cases

---
 src/host/reverse.jl    | 25 ++++++++++++++++++-------
 test/testsuite/base.jl | 10 ++++++++--
 2 files changed, 26 insertions(+), 9 deletions(-)

diff --git a/src/host/reverse.jl b/src/host/reverse.jl
index 74453074..57ec8650 100644
--- a/src/host/reverse.jl
+++ b/src/host/reverse.jl
@@ -26,9 +26,12 @@ function _reverse(input::AnyGPUArray{T, N}, output::AnyGPUArray{T, N};
             index_out = lin_idx[idx...]
             output[index_out] = input[index_in]
         end
+        return
     end
     ## COV_EXCL_STOP
-    kernel = KI.KIKernel(get_backend(input), rev_kernel, input, output)
+
+    backend = get_backend(input)
+    kernel = KI.KIKernel(backend, rev_kernel, input, output)
     nthreads = KI.kernel_max_work_group_size(backend, kernel; max_work_items=length(input))
     ngroups = cld(length(input), nthreads)
 
@@ -68,6 +71,7 @@ function _reverse!(data::AnyGPUArray{T, N}; dims=1:ndims(data)) where {T, N}
                 data[index_in] = temp
             end
         end
+        return
     end
     ## COV_EXCL_STOP
 
@@ -76,12 +80,12 @@ function _reverse!(data::AnyGPUArray{T, N}; dims=1:ndims(data)) where {T, N}
     # Only the middle row in case of an odd array dimension could cause trouble, but this is prevented by
     # ignoring the threads that cross the mid-point
 
-
-    kernel = KI.KIKernel(get_backend(data), rev_kernel!, data)
+    backend = get_backend(data)
+    kernel = KI.KIKernel(backend, rev_kernel!, data)
     nthreads = KI.kernel_max_work_group_size(backend, kernel; max_work_items=reduced_length)
     ngroups = cld(reduced_length, nthreads)
 
-    kernel(input, output; numworkgroups=ngroups, workgroupsize=nthreads)
+    kernel(data; numworkgroups=ngroups, workgroupsize=nthreads)
 end
 
@@ -98,7 +102,10 @@ function Base.reverse!(data::AnyGPUArray{T, N}; dims=:) where {T, N}
         throw(ArgumentError("dimension $dims is not 1 ≤ $dims ≤ $(ndims(data))"))
     end
 
-    _reverse!(data; dims=dims)
+    # no reverse operation needed at all otherwise.
+    if !(all(size(data)[[dims...]].==1) || isempty(data))
+        _reverse!(data; dims=dims)
+    end
 
     return data
 end
@@ -115,7 +122,7 @@ function Base.reverse(input::AnyGPUArray{T, N}; dims=:) where {T, N}
         throw(ArgumentError("dimension $dims is not 1 ≤ $dims ≤ $(ndims(input))"))
     end
 
-    if all(size(input)[[dims...]].==1)
+    if all(size(input)[[dims...]].==1) || isempty(input)
         # no reverse operation needed at all in this case.
         return copy(input)
     else
@@ -131,7 +138,8 @@ end
 # in-place
 Base.@propagate_inbounds function Base.reverse!(data::AnyGPUVector{T}, start::Integer,
                                                 stop::Integer=length(data)) where {T}
-    _reverse!(view(data, start:stop))
+    # only reverse if there are more than 2 elements to reverse
+    (stop-start > 1) && _reverse!(view(data, start:stop))
     return data
 end
@@ -140,6 +148,9 @@ Base.reverse!(data::AnyGPUVector{T}) where {T} = @inbounds reverse!(data, 1, len
 # out-of-place
 Base.@propagate_inbounds function Base.reverse(input::AnyGPUVector{T}, start::Integer,
                                                stop::Integer=length(input)) where {T}
+    # Copy array in one kernel if no work is to be done
+    (stop-start > 1) || return copy(input)
+
     output = similar(input)
 
     start > 1 && copyto!(output, 1, input, 1, start-1)

diff --git a/test/testsuite/base.jl b/test/testsuite/base.jl
index 8a2f71bb..6f45ad12 100644
--- a/test/testsuite/base.jl
+++ b/test/testsuite/base.jl
@@ -384,18 +384,24 @@ end
     @testset "reverse" begin
         # 1-d out-of-place
         @test compare(x->reverse(x), AT, rand(Float32, 1000))
+        @test compare(x->reverse(x), AT, rand(Float32, 0))
+        @test compare(x->reverse(x), AT, rand(Float32, 1))
         @test compare(x->reverse(x, 10), AT, rand(Float32, 1000))
         @test compare(x->reverse(x, 10, 90), AT, rand(Float32, 1000))
+        @test compare(x->reverse(x, 3, 3), AT, Float32[1,2,3,4,5])
 
         # 1-d in-place
         @test compare(x->reverse!(x), AT, rand(Float32, 1000))
+        @test compare(x->reverse!(x), AT, rand(Float32, 0))
+        @test compare(x->reverse!(x), AT, rand(Float32, 1))
         @test compare(x->reverse!(x, 10), AT, rand(Float32, 1000))
         @test compare(x->reverse!(x, 10, 90), AT, rand(Float32, 1000))
+        @test compare(x->reverse!(x, 3, 3), AT, Float32[1,2,3,4,5])
 
         # n-d out-of-place
         for shape in ([1, 2, 4, 3], [4, 2], [5], [0], [1], [2^5, 2^5, 2^5]),
             dim in 1:length(shape)
-            @testset "Shape: $shape, Dim: $dim"
+            @testset "Shape: $shape, Dim: $dim" begin
                 @test compare(x->reverse(x; dims=dim), AT, rand(Float32, shape...))
 
                 cpu = rand(Float32, shape...)
                 gpu = AT(cpu)
                 reverse!(gpu; dims=dim)
                 @test Array(gpu) == reverse(cpu; dims=dim)
             end
         end
 
         # supports multidimensional reverse
         for shape in ([1,1,1,1], [1, 2, 4, 3], [2^5, 2^5, 2^5]),
             dim in ((1,2),(2,3),(1,3),:)
-            @testset "Shape: $shape, Dim: $dim"
+            @testset "Shape: $shape, Dim: $dim" begin
                 @test compare(x->reverse(x; dims=dim), AT, rand(Float32, shape...))
 
                 cpu = rand(Float32, shape...)
From acd7c632c2eed4c6d8ac0a849494146adff1f651 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Tue, 4 Nov 2025 13:02:30 -0400 Subject: [PATCH 7/7] New kernel interface --- src/host/reverse.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/host/reverse.jl b/src/host/reverse.jl index 57ec8650..0de8075e 100644 --- a/src/host/reverse.jl +++ b/src/host/reverse.jl @@ -31,7 +31,7 @@ function _reverse(input::AnyGPUArray{T, N}, output::AnyGPUArray{T, N}; ## COV_EXCL_STOP backend = get_backend(input) - kernel = KI.KIKernel(backend, rev_kernel, input, output) + kernel = KI.@kikernel backend launch=false rev_kernel(input, output) nthreads = KI.kernel_max_work_group_size(backend, kernel; max_work_items=length(input)) ngroups = cld(length(input), nthreads) @@ -81,7 +81,7 @@ function _reverse!(data::AnyGPUArray{T, N}; dims=1:ndims(data)) where {T, N} # ignoring the threads that cross the mid-point backend = get_backend(data) - kernel = KI.KIKernel(backend, rev_kernel!, data) + kernel = KI.@kikernel backend launch=false rev_kernel!(data) nthreads = KI.kernel_max_work_group_size(backend, kernel; max_work_items=reduced_length) ngroups = cld(reduced_length, nthreads)