diff --git a/Project.toml b/Project.toml
index c399348c..8ab8647b 100644
--- a/Project.toml
+++ b/Project.toml
@@ -5,6 +5,7 @@ version = "11.2.6"
 [deps]
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527"
+GPUToolbox = "096a3bc2-3ced-46d0-87f4-dd12716f4bfc"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 LLVM = "929cbde3-209d-540e-8aea-75f648917ca0"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
@@ -24,8 +25,9 @@ JLD2Ext = "JLD2"
 [compat]
 Adapt = "4.0"
 GPUArraysCore = "= 0.2.0"
+GPUToolbox = "0.2, 0.3, 1"
 JLD2 = "0.4, 0.5, 0.6"
-KernelAbstractions = "0.9.28, 0.10"
+KernelAbstractions = "0.10"
 LLVM = "3.9, 4, 5, 6, 7, 8, 9"
 LinearAlgebra = "1"
 Printf = "1"
diff --git a/src/GPUArrays.jl b/src/GPUArrays.jl
index 8c1fc14e..edb35931 100644
--- a/src/GPUArrays.jl
+++ b/src/GPUArrays.jl
@@ -1,6 +1,8 @@
 module GPUArrays
 
+using GPUToolbox
 using KernelAbstractions
+import KernelAbstractions.KernelIntrinsics as KI
 using Serialization
 using Random
 using LinearAlgebra
@@ -26,6 +28,7 @@ include("host/construction.jl")
 ## integrations and specialized methods
 include("host/base.jl")
 include("host/indexing.jl")
+include("host/reverse.jl")
 include("host/broadcast.jl")
 include("host/mapreduce.jl")
 include("host/linalg.jl")
diff --git a/src/host/reverse.jl b/src/host/reverse.jl
new file mode 100644
index 00000000..0de8075e
--- /dev/null
+++ b/src/host/reverse.jl
@@ -0,0 +1,165 @@
+# reversing
+
+# The kernel treats the array as 1D. After reversing along dimension x, the element at
+# position [i1, i2, i3, ..., i{x}, ..., i{n}] ends up at
+# position [i1, i2, i3, ..., d{x} - i{x} + 1, ..., i{n}], where d{x} is the size of dimension x.
+
+# out-of-place version, copying a single value per thread from input to output
+function _reverse(input::AnyGPUArray{T, N}, output::AnyGPUArray{T, N};
+                  dims=1:ndims(input)) where {T, N}
+    @assert size(input) == size(output)
+    rev_dims = ntuple(d -> d in dims && size(input, d) > 1, N)
+    ref = size(input) .+ 1
+    # converts an ND-index in the data array to the linear index
+    lin_idx = LinearIndices(input)
+    # converts a linear index in the data array back to an ND-index
+    nd_idx = CartesianIndices(input)
+
+    ## COV_EXCL_START
+    function rev_kernel(input, output)
+        offset_in = Int32(KI.get_local_size().x) * (KI.get_group_id().x - 1i32)
+        index_in = offset_in + KI.get_local_id().x
+
+        @inbounds if index_in <= length(input)
+            idx = Tuple(nd_idx[index_in])
+            idx = ifelse.(rev_dims, ref .- idx, idx)
+            index_out = lin_idx[idx...]
+            output[index_out] = input[index_in]
+        end
+        return
+    end
+    ## COV_EXCL_STOP
+
+    backend = get_backend(input)
+    kernel = KI.@kikernel backend launch=false rev_kernel(input, output)
+    nthreads = KI.kernel_max_work_group_size(backend, kernel; max_work_items=length(input))
+    ngroups = cld(length(input), nthreads)
+
+    kernel(input, output; numworkgroups=ngroups, workgroupsize=nthreads)
+end
+
+# in-place version, swapping elements on half the number of threads
+function _reverse!(data::AnyGPUArray{T, N}; dims=1:ndims(data)) where {T, N}
+    rev_dims = ntuple(d -> d in dims && size(data, d) > 1, N)
+    half_dim = findlast(rev_dims)
+    if isnothing(half_dim)
+        # no reverse operation needed at all in this case.
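+        # (every dimension to reverse is singleton, so each element stays in place)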
+        return
+    end
+    ref = size(data) .+ 1
+    # converts an ND-index in the data array to the linear index
+    lin_idx = LinearIndices(data)
+    reduced_size = ntuple(d -> ifelse(d == half_dim, cld(size(data, d), 2), size(data, d)), N)
+    reduced_length = prod(reduced_size)
+    # converts a linear index in the reduced array to an ND-index, using the reduced size
+    nd_idx = CartesianIndices(reduced_size)
+
+    ## COV_EXCL_START
+    function rev_kernel!(data)
+        offset_in = Int32(KI.get_local_size().x) * (KI.get_group_id().x - 1i32)
+        index_in = offset_in + KI.get_local_id().x
+
+        @inbounds if index_in <= reduced_length
+            idx = Tuple(nd_idx[index_in])
+            index_in = lin_idx[idx...]
+            idx = ifelse.(rev_dims, ref .- idx, idx)
+            index_out = lin_idx[idx...]
+
+            if index_in < index_out
+                temp = data[index_out]
+                data[index_out] = data[index_in]
+                data[index_in] = temp
+            end
+        end
+        return
+    end
+    ## COV_EXCL_STOP
+
+    # NOTE: we launch threads for slightly more than half the elements in the array.
+    # The last non-singleton dimension to reverse determines how the array is split in
+    # half. Only the middle row of an odd-sized dimension could cause trouble, but that
+    # is prevented by skipping threads that cross the mid-point (index_in < index_out).
+
+    backend = get_backend(data)
+    kernel = KI.@kikernel backend launch=false rev_kernel!(data)
+    nthreads = KI.kernel_max_work_group_size(backend, kernel; max_work_items=reduced_length)
+    ngroups = cld(reduced_length, nthreads)
+
+    kernel(data; numworkgroups=ngroups, workgroupsize=nthreads)
+end
+
+
+# n-dimensional API
+
+function Base.reverse!(data::AnyGPUArray{T, N}; dims=:) where {T, N}
+    if isa(dims, Colon)
+        dims = 1:ndims(data)
+    end
+    if !applicable(iterate, dims)
+        throw(ArgumentError("dimension $dims is not iterable"))
+    end
+    if !all(1 .≤ dims .≤ ndims(data))
+        throw(ArgumentError("dimension $dims is not 1 ≤ $dims ≤ $(ndims(data))"))
+    end
+
+    # only launch a kernel if there is actual work to do
+    if !(all(size(data)[[dims...]] .== 1) || isempty(data))
+        _reverse!(data; dims=dims)
+    end
+
+    return data
+end
+
+# out-of-place
+function Base.reverse(input::AnyGPUArray{T, N}; dims=:) where {T, N}
+    if isa(dims, Colon)
+        dims = 1:ndims(input)
+    end
+    if !applicable(iterate, dims)
+        throw(ArgumentError("dimension $dims is not iterable"))
+    end
+    if !all(1 .≤ dims .≤ ndims(input))
+        throw(ArgumentError("dimension $dims is not 1 ≤ $dims ≤ $(ndims(input))"))
+    end
+
+    if all(size(input)[[dims...]] .== 1) || isempty(input)
+        # no reverse operation needed at all in this case.
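+        # (the input is empty, or every dimension to reverse is singleton)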
+        return copy(input)
+    else
+        output = similar(input)
+        _reverse(input, output; dims=dims)
+        return output
+    end
+end
+
+
+# 1-dimensional API
+
+# in-place
+Base.@propagate_inbounds function Base.reverse!(data::AnyGPUVector{T}, start::Integer,
+                                                stop::Integer=length(data)) where {T}
+    # only launch a kernel if there are at least two elements to reverse
+    (stop > start) && _reverse!(view(data, start:stop))
+    return data
+end
+
+Base.reverse!(data::AnyGPUVector{T}) where {T} = @inbounds reverse!(data, 1, length(data))
+
+# out-of-place
+Base.@propagate_inbounds function Base.reverse(input::AnyGPUVector{T}, start::Integer,
+                                               stop::Integer=length(input)) where {T}
+    # just copy the array if there is no work to be done
+    (stop > start) || return copy(input)
+
+    output = similar(input)
+
+    start > 1 && copyto!(output, 1, input, 1, start-1)
+    _reverse(view(input, start:stop), view(output, start:stop))
+    stop < length(input) && copyto!(output, stop+1, input, stop+1)
+
+    return output
+end
+
+Base.reverse(data::AnyGPUVector{T}) where {T} = @inbounds reverse(data, 1, length(data))
diff --git a/test/testsuite/base.jl b/test/testsuite/base.jl
index 6bcfd5b4..6f45ad12 100644
--- a/test/testsuite/base.jl
+++ b/test/testsuite/base.jl
@@ -381,6 +381,61 @@ end
         gA = reshape(AT(A),4)
     end
 
+    @testset "reverse" begin
+        # 1-d out-of-place
+        @test compare(x->reverse(x), AT, rand(Float32, 1000))
+        @test compare(x->reverse(x), AT, rand(Float32, 0))
+        @test compare(x->reverse(x), AT, rand(Float32, 1))
+        @test compare(x->reverse(x, 10), AT, rand(Float32, 1000))
+        @test compare(x->reverse(x, 10, 90), AT, rand(Float32, 1000))
+        @test compare(x->reverse(x, 3, 3), AT, Float32[1,2,3,4,5])
+
+        # 1-d in-place
+        @test compare(x->reverse!(x), AT, rand(Float32, 1000))
+        @test compare(x->reverse!(x), AT, rand(Float32, 0))
+        @test compare(x->reverse!(x), AT, rand(Float32, 1))
+        @test compare(x->reverse!(x, 10), AT, rand(Float32, 1000))
+        @test compare(x->reverse!(x, 10, 90), AT, rand(Float32, 1000))
+        @test compare(x->reverse!(x, 3, 3), AT, Float32[1,2,3,4,5])
+
+        # n-d out-of-place
+        for shape in ([1, 2, 4, 3], [4, 2], [5], [0], [1], [2^5, 2^5, 2^5]),
+            dim in 1:length(shape)
+            @testset "Shape: $shape, Dim: $dim" begin
+                @test compare(x->reverse(x; dims=dim), AT, rand(Float32, shape...))
+
+                cpu = rand(Float32, shape...)
+                gpu = AT(cpu)
+                reverse!(gpu; dims=dim)
+                @test Array(gpu) == reverse(cpu; dims=dim)
+            end
+        end
+
+        # multidimensional reverse
+        for shape in ([1,1,1,1], [1, 2, 4, 3], [2^5, 2^5, 2^5]),
+            dim in ((1,2), (2,3), (1,3), :)
+            @testset "Shape: $shape, Dim: $dim" begin
+                @test compare(x->reverse(x; dims=dim), AT, rand(Float32, shape...))
+
+                cpu = rand(Float32, shape...)
+                gpu = AT(cpu)
+                reverse!(gpu; dims=dim)
+                @test Array(gpu) == reverse(cpu; dims=dim)
+            end
+        end
+
+        # wrapped array
+        @test compare(x->reverse(x), AT, reshape(rand(Float32, 2,2), 4))
+
+        # error throwing
+        cpu = rand(Float32, 1,2,3,4)
+        gpu = AT(cpu)
+        @test_throws ArgumentError reverse!(gpu, dims=5)
+        @test_throws ArgumentError reverse!(gpu, dims=0)
+        @test_throws ArgumentError reverse(gpu, dims=5)
+        @test_throws ArgumentError reverse(gpu, dims=0)
+    end
+
     @testset "reinterpret" begin
         A = Int32[-1,-2,-3]
         dA = AT(A)
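
As a quick sanity check of the index mapping described at the top of src/host/reverse.jl,
the same arithmetic can be reproduced on the CPU in plain Julia. This is a minimal sketch
with arbitrary illustrative sizes and dims; it needs no GPU and is not part of the patch:

    # Mirror of the kernel's index arithmetic: reverse dims 1 and 3 of a 2×3×2 array
    # by sending [i1, ..., i{x}, ..., i{n}] to [i1, ..., d{x} - i{x} + 1, ..., i{n}].
    A = reshape(collect(1:12), 2, 3, 2)
    dims = (1, 3)
    rev_dims = ntuple(d -> d in dims && size(A, d) > 1, ndims(A))
    ref = size(A) .+ 1
    lin_idx = LinearIndices(A)
    nd_idx = CartesianIndices(A)
    B = similar(A)
    for index_in in eachindex(A)
        idx = Tuple(nd_idx[index_in])             # linear -> ND index
        idx = ifelse.(rev_dims, ref .- idx, idx)  # mirror the reversed dimensions
        B[lin_idx[idx...]] = A[index_in]          # ND -> linear index into the output
    end
    @assert B == reverse(A; dims=dims)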