From 7a80e71f469343d96780e87386b9ed5217fdcd6a Mon Sep 17 00:00:00 2001 From: Erik Schnetter Date: Sun, 8 Jun 2025 12:50:10 -0400 Subject: [PATCH 01/14] LibBlosc2: New chunk codec --- LibBlosc2/CHANGELOG.md | 11 + LibBlosc2/LICENSE | 21 ++ LibBlosc2/Project.toml | 18 ++ LibBlosc2/README.md | 26 +++ LibBlosc2/src/ChunkCodecLibBlosc2.jl | 61 ++++++ LibBlosc2/src/decode.jl | 132 +++++++++++ LibBlosc2/src/encode.jl | 145 +++++++++++++ LibBlosc2/src/libblosc2.jl | 313 +++++++++++++++++++++++++++ LibBlosc2/test/Project.toml | 7 + LibBlosc2/test/runtests.jl | 100 +++++++++ Project.toml | 1 + 11 files changed, 835 insertions(+) create mode 100644 LibBlosc2/CHANGELOG.md create mode 100644 LibBlosc2/LICENSE create mode 100644 LibBlosc2/Project.toml create mode 100644 LibBlosc2/README.md create mode 100644 LibBlosc2/src/ChunkCodecLibBlosc2.jl create mode 100644 LibBlosc2/src/decode.jl create mode 100644 LibBlosc2/src/encode.jl create mode 100644 LibBlosc2/src/libblosc2.jl create mode 100644 LibBlosc2/test/Project.toml create mode 100644 LibBlosc2/test/runtests.jl diff --git a/LibBlosc2/CHANGELOG.md b/LibBlosc2/CHANGELOG.md new file mode 100644 index 0000000..65eee55 --- /dev/null +++ b/LibBlosc2/CHANGELOG.md @@ -0,0 +1,11 @@ +# Release Notes + +All notable changes to this package will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). 
+ +## Unreleased + +### Added + +- Initial release diff --git a/LibBlosc2/LICENSE b/LibBlosc2/LICENSE new file mode 100644 index 0000000..568769b --- /dev/null +++ b/LibBlosc2/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2025 Erik Schnetter + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/LibBlosc2/Project.toml b/LibBlosc2/Project.toml new file mode 100644 index 0000000..9a85531 --- /dev/null +++ b/LibBlosc2/Project.toml @@ -0,0 +1,18 @@ +name = "ChunkCodecLibBlosc2" +uuid = "59b5581c-e2bc-42b3-a6f1-80e88eec7b70" +authors = ["Erik Schnetter "] +version = "0.1.0" + +[deps] +Accessors = "7d9f7c33-5ae7-4f3b-8dc6-eff91059b697" +Blosc2_jll = "d43303dc-dd0e-56c6-b0a8-331f4c8c9bfb" +ChunkCodecCore = "0b6fb165-00bc-4d37-ab8b-79f91016dbe1" + +[compat] +Accessors = "0.1.42" +Blosc2_jll = "201.1700.100" +ChunkCodecCore = "0.5.0" +julia = "1.10" + +[workspace] +projects = ["test"] diff --git a/LibBlosc2/README.md b/LibBlosc2/README.md new file mode 100644 index 0000000..ecb7e24 --- /dev/null +++ b/LibBlosc2/README.md @@ -0,0 +1,26 @@ +# ChunkCodecLibBlosc2 + +## Warning: ChunkCodecLibBlosc2 is currently a WIP and its API may drastically change at any time. + +This package implements the ChunkCodec interface for the following encoders and decoders +using the c-blosc2 library + +1. `Blosc2Codec`, `Blosc2EncodeOptions`, `Blosc2DecodeOptions` + +## Example + +```julia-repl +julia> using ChunkCodecLibBlosc2 + +julia> data = [0x00, 0x01, 0x02, 0x03]; + +julia> compressed_data = encode(Blosc2EncodeOptions(), data); + +julia> decompressed_data = decode(Blosc2Codec(), compressed_data; max_size=length(data), size_hint=length(data)); + +julia> data == decompressed_data +true +``` + +The low level interface is defined in the `ChunkCodecCore` package. 
+ diff --git a/LibBlosc2/src/ChunkCodecLibBlosc2.jl b/LibBlosc2/src/ChunkCodecLibBlosc2.jl new file mode 100644 index 0000000..38768a7 --- /dev/null +++ b/LibBlosc2/src/ChunkCodecLibBlosc2.jl @@ -0,0 +1,61 @@ +module ChunkCodecLibBlosc2 + +using Base.Libc: free + +using Accessors + +using Blosc2_jll: libblosc2 + +using ChunkCodecCore: + Codec, + EncodeOptions, + DecodeOptions, + check_in_range, + check_contiguous, + DecodingError +import ChunkCodecCore: + decode_options, + try_decode!, + try_encode!, + encode_bound, + try_find_decoded_size, + decoded_size_range + +export Blosc2Codec, + Blosc2EncodeOptions, + Blosc2DecodeOptions, + Blosc2DecodingError + +if VERSION >= v"1.11.0-DEV.469" + eval(Meta.parse("public is_compressor_valid, compcode, compname")) +end + +# reexport ChunkCodecCore +using ChunkCodecCore: ChunkCodecCore, encode, decode +export ChunkCodecCore, encode, decode + +include("libblosc2.jl") + +""" + struct Blosc2Codec <: Codec + Blosc2Codec() + +Blosc2 compression using c-blosc2 library: https://github.com/Blosc/c-blosc2 + +Decoding does not accept any extra data appended to the compressed block. +Decoding also does not accept truncated data, or multiple compressed blocks concatenated together. + +[`Blosc2EncodeOptions`](@ref) and [`Blosc2DecodeOptions`](@ref) +can be used to set decoding and encoding options. +""" +struct Blosc2Codec <: Codec end +decode_options(::Blosc2Codec) = Blosc2DecodeOptions() + +include("encode.jl") +include("decode.jl") + +# Initialize the Blosc2 library. This function is idempotent, i.e. it +# can be called multiple times without harm. +__init__() = @ccall libblosc2.blosc2_init()::Cvoid + +end # module ChunkCodecLibBlosc2 diff --git a/LibBlosc2/src/decode.jl b/LibBlosc2/src/decode.jl new file mode 100644 index 0000000..cf72432 --- /dev/null +++ b/LibBlosc2/src/decode.jl @@ -0,0 +1,132 @@ +""" + Blosc2DecodingError() + +Error for data that cannot be decoded. 
+""" +struct Blosc2DecodingError <: DecodingError +end + +function Base.showerror(io::IO, err::Blosc2DecodingError) + print(io, "Blosc2DecodingError: blosc2 compressed buffer cannot be decoded") + return nothing +end + +""" + struct Blosc2DecodeOptions <: DecodeOptions + Blosc2DecodeOptions(; kwargs...) + +Blosc2 decompression using c-blosc2 library: https://github.com/Blosc/c-blosc2 + +# Keyword Arguments + +- `codec::Blosc2Codec=Blosc2Codec()` +""" +struct Blosc2DecodeOptions <: DecodeOptions + codec::Blosc2Codec +end +Blosc2DecodeOptions(; codec::Blosc2Codec=Blosc2Codec(), kwargs...) = Blosc2DecodeOptions(codec) + +function try_find_decoded_size(::Blosc2DecodeOptions, src::AbstractVector{UInt8})::Int64 + check_contiguous(src) + + copy_cframe = false + schunk = @ccall libblosc2.blosc2_schunk_from_buffer(src::Ptr{UInt8}, length(src)::Int64, copy_cframe::UInt8)::Ptr{Blosc2SChunk} + if schunk == Ptr{Blosc2Storage}() + # These are not a valid blosc2-encoded data + throw(Blosc2DecodingError()) + end + @ccall libblosc2.blosc2_schunk_avoid_cframe_free(schunk::Ptr{Blosc2SChunk}, true::UInt8)::Cvoid + + total_nbytes = Int64(0) + + nchunks = unsafe_load(schunk).nchunks + for nchunk in 0:(nchunks - 1) + cbuffer = Ref{Ptr{UInt8}}() + needs_free = Ref{UInt8}() + chunksize = @ccall libblosc2.blosc2_schunk_get_chunk(schunk::Ptr{Blosc2SChunk}, nchunk::Int64, cbuffer::Ref{Ptr{UInt8}}, + needs_free::Ref{UInt8})::Cint + @assert chunksize > 0 + cbuffer = cbuffer[] + needs_free = Bool(needs_free[]) + + nbytes = Ref{Int32}() + success = @ccall libblosc2.blosc1_cbuffer_validate(cbuffer::Ptr{Cvoid}, chunksize::Cint, nbytes::Ref{Cint})::Cint + @assert success == 0 + nbytes = nbytes[] + + total_nbytes += nbytes + + if needs_free + # We could provide buffer into which to decode instead, reusing that buffer + Libc.free(cbuffer) + end + end + + # TODO: Use this instead of the loop above + @assert unsafe_load(schunk).nbytes == total_nbytes + + success = @ccall 
libblosc2.blosc2_schunk_free(schunk::Ptr{Cvoid})::Cint + @assert success == 0 + + return total_nbytes::Int64 +end + +#TODO: implement `try_resize_decode!` + +function try_decode!(d::Blosc2DecodeOptions, dst::AbstractVector{UInt8}, src::AbstractVector{UInt8}; + kwargs...)::Union{Nothing,Int64} + check_contiguous(dst) + check_contiguous(src) + + schunk = @ccall libblosc2.blosc2_schunk_from_buffer(src::Ptr{UInt8}, length(src)::Int64, false::UInt8)::Ptr{Blosc2SChunk} + @assert schunk != Ptr{Blosc2Storage}() + @ccall libblosc2.blosc2_schunk_avoid_cframe_free(schunk::Ptr{Blosc2SChunk}, true::UInt8)::Cvoid + + there_was_an_error = false + total_nbytes = Int64(0) + + nchunks = unsafe_load(schunk).nchunks + for nchunk in 0:(nchunks - 1) + cbuffer = Ref{Ptr{UInt8}}() + needs_free = Ref{UInt8}() + chunksize = @ccall libblosc2.blosc2_schunk_get_chunk(schunk::Ptr{Blosc2SChunk}, nchunk::Int64, cbuffer::Ref{Ptr{UInt8}}, + needs_free::Ref{UInt8})::Cint + @assert chunksize > 0 + cbuffer = cbuffer[] + needs_free = Bool(needs_free[]) + + nbytes = Ref{Int32}() + success = @ccall libblosc2.blosc1_cbuffer_validate(cbuffer::Ptr{Cvoid}, chunksize::Cint, nbytes::Ref{Cint})::Cint + @assert success == 0 + nbytes = nbytes[] + + if needs_free + Libc.free(cbuffer) + end + + # TODO: Use this instead of checking each chunk + @assert unsafe_load(schunk).nbytes == nbytes + + if total_nbytes + nbytes > length(dst) + there_was_an_error = true + break + end + + @assert total_nbytes + nbytes <= length(dst) + nbytes′ = @ccall libblosc2.blosc2_schunk_decompress_chunk(schunk::Ptr{Blosc2SChunk}, nchunk::Int64, + pointer(dst, total_nbytes+1)::Ptr{Cvoid}, nbytes::Int32)::Cint + @assert nbytes′ >= 0 + @assert nbytes′ == nbytes + + total_nbytes += nbytes + end + + success = @ccall libblosc2.blosc2_schunk_free(schunk::Ptr{Cvoid})::Cint + @assert success == 0 + + if there_was_an_error + return nothing + end + + return total_nbytes::Int64 +end diff --git a/LibBlosc2/src/encode.jl b/LibBlosc2/src/encode.jl new 
file mode 100644 index 0000000..7bb9937 --- /dev/null +++ b/LibBlosc2/src/encode.jl @@ -0,0 +1,145 @@ +""" + struct Blosc2EncodeOptions <: EncodeOptions + Blosc2EncodeOptions(; kwargs...) + +Blosc2 compression using c-blosc2 library: https://github.com/Blosc/c-blosc2 + +# Keyword Arguments + +- `codec::Blosc2Codec=Blosc2Codec()` +- `clevel::Integer=5`: The compression level, between 0 (no compression) and 9 (maximum compression) +- `doshuffle::Integer=1`: Whether to use the shuffle filter. + + 0 means not applying it, 1 means applying it at a byte level, + and 2 means at a bit level (slower but may achieve better entropy alignment). +- `typesize::Integer=1`: The element size to use when shuffling. + + For implementation reasons, only `typesize` in `1:$(BLOSC_MAX_TYPESIZE)` will allow the + shuffle filter to work. When `typesize` is not in this range, shuffle + will be silently disabled. +- `compressor::AbstractString="lz4"`: The string representing the type of compressor to use. + + For example, "blosclz", "lz4", "lz4hc", "zlib", or "zstd". + Use `is_compressor_valid` to check if a compressor is supported. +""" +struct Blosc2EncodeOptions <: EncodeOptions + codec::Blosc2Codec + clevel::Int32 + doshuffle::Int32 + typesize::Int64 + compressor::String +end +function Blosc2EncodeOptions(; + codec::Blosc2Codec=Blosc2Codec(), + clevel::Integer=5, + doshuffle::Integer=1, + typesize::Integer=1, + compressor::AbstractString="lz4", + kwargs...) + _clevel = Int32(clamp(clevel, 0, 9)) + check_in_range(0:2; doshuffle) + _typesize = if typesize ∈ 2:BLOSC_MAX_TYPESIZE + Int64(typesize) + else + Int64(1) + end + is_compressor_valid(compressor) || + throw(ArgumentError("is_compressor_valid(compressor) must hold. 
Got\ncompressor => $(repr(compressor))")) + return Blosc2EncodeOptions(codec, _clevel, doshuffle, _typesize, compressor) +end + +# The maximum chunk size we're using: 1 GByte (must be less than 2 GByte including overhead) +const MAX_CHUNK_SIZE = Int64(1024)^3 + +# The maximum overhead for the schunk +const MAX_SCHUNK_OVERHEAD = 172 # apparently undocumented -- just a guess + +# We just punt with the upper bound. typemax(Int64) is a huge number anyway. +decoded_size_range(e::Blosc2EncodeOptions) = Int64(0):Int64(e.typesize):(typemax(Int64) ÷ 2) + +function encode_bound(::Blosc2EncodeOptions, src_size::Int64)::Int64 + return clamp(widen(src_size) + cld(src_size, MAX_CHUNK_SIZE) * BLOSC2_MAX_OVERHEAD + MAX_SCHUNK_OVERHEAD, Int64) +end + +function try_encode!(e::Blosc2EncodeOptions, dst::AbstractVector{UInt8}, src::AbstractVector{UInt8}; + kwargs...)::Union{Nothing,Int64} + check_contiguous(dst) + check_contiguous(src) + src_size::Int64 = length(src) + dst_size::Int64 = length(dst) + check_in_range(decoded_size_range(e); src_size) + + ccode = compcode(e.compressor) + @assert ccode >= 0 + numinternalthreads = 1 + + # Create a super-chunk container + cparams = Blosc2CParams() + @reset cparams.typesize = e.typesize + @reset cparams.compcode = ccode + @reset cparams.clevel = e.clevel + @reset cparams.nthreads = numinternalthreads + @reset cparams.filters[BLOSC2_MAX_FILTERS] = e.doshuffle + cparams_obj = [cparams] + + dparams = Blosc2DParams() + @reset dparams.nthreads = numinternalthreads + dparams_obj = [dparams] + + io = Blosc2IO() + io_obj = [io] + + storage = Blosc2Storage() + @reset storage.cparams = pointer(cparams_obj) + @reset storage.dparams = pointer(dparams_obj) + @reset storage.io = pointer(io_obj) + storage_obj = [storage] + + there_was_an_error = false + + GC.@preserve cparams_obj dparams_obj io_obj storage_obj begin + schunk = @ccall libblosc2.blosc2_schunk_new(storage_obj::Ptr{Blosc2Storage})::Ptr{Blosc2SChunk} + @assert schunk != Ptr{Blosc2Storage}() + + # 
Break input into chunks + for pos in 1:MAX_CHUNK_SIZE:src_size + endpos = min(src_size, pos + MAX_CHUNK_SIZE - 1) + srcview = @view src[pos:endpos] + nbytes = length(srcview) + nchunks = @ccall libblosc2.blosc2_schunk_append_buffer(schunk::Ptr{Blosc2SChunk}, srcview::Ptr{Cvoid}, + nbytes::Int32)::Int64 + @assert nchunks >= 0 + @assert nchunks == (pos-1) ÷ MAX_CHUNK_SIZE + 1 + end + + cframe = Ref{Ptr{UInt8}}() + needs_free = Ref{UInt8}() # bool + compressed_size = @ccall libblosc2.blosc2_schunk_to_buffer(schunk::Ptr{Blosc2SChunk}, cframe::Ref{Ptr{UInt8}}, + needs_free::Ref{UInt8})::Int64 + @assert compressed_size >= 0 + cframe = cframe[] + needs_free = Bool(needs_free[]) + + if compressed_size <= length(dst) + # TODO: Encode directly into `dst` + unsafe_copyto!(pointer(dst), cframe, compressed_size) + else + # Insufficient space to store compressed data. + # We should detect this earlier, already in the loop above. + there_was_an_error = true + end + + success = @ccall libblosc2.blosc2_schunk_free(schunk::Ptr{Blosc2SChunk})::Cint + @assert success == 0 + + if needs_free + Libc.free(cframe) + end + end + + if there_was_an_error + return nothing + end + + return compressed_size::Int64 +end diff --git a/LibBlosc2/src/libblosc2.jl b/LibBlosc2/src/libblosc2.jl new file mode 100644 index 0000000..3783b3d --- /dev/null +++ b/LibBlosc2/src/libblosc2.jl @@ -0,0 +1,313 @@ +# Constants and C wrapper functions ported to Julia from blosc2.h https://github.com/Blosc/c-blosc2/blob/5fcd6fbf9ffcf613fabdb1eb3a90eeb12f7c04fe/include/blosc2.h + +################################################################################ +# Constants + +# [175] +# Extended header length (Blosc2, see README_HEADER) +const BLOSC_EXTENDED_HEADER_LENGTH = 32 +const BLOSC2_MAX_OVERHEAD = BLOSC_EXTENDED_HEADER_LENGTH +const BLOSC_MAX_TYPESIZE = Int(typemax(UInt8)) + +# [222] +const BLOSC2_MAX_FILTERS = 6 + +# [242] Codes for filters. +# No shuffle (for compatibility with Blosc1). 
+const BLOSC_NOSHUFFLE = 0 +# No filter. +const BLOSC_NOFILTER = 0 +const BLOSC_SHUFFLE = 1 +# Byte-wise shuffle. `filters_meta` does not have any effect here. +const BLOSC_BITSHUFFLE = 2 +# Bit-wise shuffle. `filters_meta` does not have any effect here. +const BLOSC_DELTA = 3 +# Delta filter. `filters_meta` does not have any effect here. +const BLOSC_TRUNC_PREC = 4 +# Truncate mantissa precision. +# Positive values in `filters_meta` will keep bits; negative values will zero bits. +const BLOSC_LAST_FILTER = 5 + +# [314] Codes for the different compressors shipped with Blosc +const BLOSC_BLOSCLZ = 0 +const BLOSC_LZ4 = 1 +const BLOSC_LZ4HC = 2 +const BLOSC_ZLIB = 4 +const BLOSC_ZSTD = 5 +const BLOSC_LAST_CODEC = 6 + +# [396] Split mode for blocks. +const BLOSC_ALWAYS_SPLIT = 1 +const BLOSC_NEVER_SPLIT = 2 +const BLOSC_AUTO_SPLIT = 3 +const BLOSC_FORWARD_COMPAT_SPLIT = 4 + +# [1641] +const BLOSC2_MAX_METALAYERS = 16 +const BLOSC2_MAX_VLMETALAYERS = 8 * 1024 + +################################################################################ +# Types + +""" + struct Blosc2CParams + +The parameters for creating a context for compression purposes. +""" +struct Blosc2CParams + # The compressor codec. + compcode::UInt8 + # The metadata for the compressor codec. + compcode_meta::UInt8 + # The compression level (5). + clevel::UInt8 + # Use dicts or not when compressing (only for ZSTD). + use_dict::Cint + # The type size (8). + typesize::Int32 + # The number of threads to use internally (1). + nthreads::Int16 + # The requested size of the compressed blocks (0 means automatic). + blocksize::Int32 + # Whether the blocks should be split or not. + splitmode::Int32 + # The associated schunk, if any (NULL). + schunk::Ptr{Cvoid} + # The (sequence of) filters. + filters::NTuple{BLOSC2_MAX_FILTERS,UInt8} + # The metadata for filters. + filters_meta::NTuple{BLOSC2_MAX_FILTERS,UInt8} + # The prefilter function. + prefilter::Ptr{Cvoid} # blosc2_prefilter_fn + # The prefilter parameters. 
+ preparams::Ptr{Cvoid} # blosc2_prefilter_params* + # Tune configuration. + tuner_params::Ptr{Cvoid} + # The tuner id. + tuner_id::Cint + # Whether the codec is instrumented or not + instr_codec::UInt8 # bool + # User defined parameters for the codec + codec_params::Ptr{Cvoid} + # User defined parameters for the filters + filter_params::NTuple{BLOSC2_MAX_FILTERS,Ptr{Cvoid}} +end +Blosc2CParams() = @ccall libblosc2.blosc2_get_blosc2_cparams_defaults()::Blosc2CParams + +""" + struct Blosc2DParams + +The parameters for creating a context for decompression purposes. +""" +struct Blosc2DParams + # The number of threads to use internally (1). + nthreads::Int16 + # The associated schunk, if any (NULL). + schunk::Ptr{Cvoid} + # The postfilter function. + postfilter::Ptr{Cvoid} # blosc2_postfilter_fn + # The postfilter parameters. + postparams::Ptr{Cvoid} # blosc2_postfilter_params* +end +Blosc2DParams() = @ccall libblosc2.blosc2_get_blosc2_dparams_defaults()::Blosc2DParams + +""" + struct Blosc2IO + +Input/Output parameters. +""" +struct Blosc2IO + id::UInt8 + # The IO identifier. + name::Cstring + # The IO parameters. + params::Ptr{Cvoid} +end +Blosc2IO() = @ccall libblosc2.blosc2_get_blosc2_io_defaults()::Blosc2IO + +""" + struct Blosc2Storage + +This struct is meant for holding storage parameters +for a blosc2 container, allowing to specify, for example, how to interpret +the contents included in the schunk. +""" +struct Blosc2Storage + # Whether the chunks are contiguous or sparse. + contiguous::UInt8 # bool + # The path for persistent storage. If NULL, that means in-memory. + urlpath::Cstring + # The compression params when creating a schunk. + # If NULL, sensible defaults are used depending on the context. + cparams::Ptr{Blosc2CParams} + # The decompression params when creating a schunk. + # If NULL, sensible defaults are used depending on the context. + dparams::Ptr{Blosc2DParams} + # Input/output backend. 
+ io::Ptr{Blosc2IO} +end +Blosc2Storage() = @ccall libblosc2.blosc2_get_blosc2_storage_defaults()::Blosc2Storage + +struct Blosc2Metalayer + # The metalayer identifier for Blosc client (e.g. Blosc2 NDim). + name::Cstring + # The serialized (msgpack preferably) content of the metalayer. + content::Ptr{UInt8} + # The length in bytes of the content. + content_len::Int32 +end + +""" + struct Blosc2SChunk + +This struct is the standard container for Blosc 2 compressed data. +""" +struct Blosc2SChunk + version::UInt8 + # The default compressor. Each chunk can override this. + compcode::UInt8 + # The default compressor metadata. Each chunk can override this. + compcode_meta::UInt8 + # The compression level and other compress params. + clevel::UInt8 + # The split mode. + splitmode::UInt8 + # The type size. + typesize::Int32 + # The requested size of the compressed blocks (0; meaning automatic). + blocksize::Int32 + # Size of each chunk. 0 if not a fixed chunksize. + chunksize::Int32 + # The (sequence of) filters. 8-bit per filter. + filters::NTuple{BLOSC2_MAX_FILTERS,UInt8} + # Metadata for filters. 8-bit per meta-slot. + filters_meta::NTuple{BLOSC2_MAX_FILTERS,UInt8} + # Number of chunks in super-chunk. + nchunks::Int64 + # The current chunk that is being accessed + current_nchunk::Int64 + # The data size (uncompressed). + nbytes::Int64 + # The data size + chunks header size (compressed). + cbytes::Int64 + # Pointer to chunk data pointers buffer. + data::Ptr{Ptr{UInt8}} + # Length of the chunk data pointers buffer. + data_len::Csize_t + # Pointer to storage info. + storage::Ptr{Blosc2Storage} + # Pointer to frame used as store for chunks. + frame::Ptr{Cvoid} # blosc2_frame* + # Context for the thread holder. NULL if not acquired. + # ctx::Ptr{UInt8} + # Context for compression + cctx::Ptr{Cvoid} # blosc2_context* + # Context for decompression. + dctx::Ptr{Cvoid} # blosc2_context* + # The array of metalayers. 
+ metalayers::NTuple{BLOSC2_MAX_METALAYERS,Ptr{Blosc2Metalayer}} + # The number of metalayers in the super-chunk + nmetalayers::UInt16 + # The array of variable-length metalayers. + vlmetalayers::NTuple{BLOSC2_MAX_VLMETALAYERS,Ptr{Blosc2Metalayer}} + # The number of variable-length metalayers. + nvlmetalayers::Int16 + # Tune configuration. + tuner_params::Ptr{Cvoid} + # Id for tuner + tuner_id::Cint + # The ndim (mainly for ZFP usage) + ndim::Int8 + # The blockshape (mainly for ZFP usage) + blockshape::Ptr{Int64} +end + +################################################################################ +# Functions + +""" + is_compressor_valid(s::AbstractString)::Bool + +Check if a compressor name is valid. +""" +function is_compressor_valid(s::AbstractString) + '\0' ∈ s && return false + code = @ccall libblosc2.blosc2_compname_to_compcode(s::Cstring)::Cint + return code >= 0 +end + +""" + compcode(s::AbstractString)::Int + +Return a nonnegative integer code used internally by Blosc to identify the compressor. +Throws an `ArgumentError` if `s` is not the name of a supported algorithm. +""" +function compcode(s::AbstractString) + code = @ccall libblosc2.blosc2_compname_to_compcode(s::Cstring)::Cint + code == -1 && throw(ArgumentError("unrecognized compressor $(repr(s))")) + return Int(code) +end + +""" + compname(compcode::Integer)::String + +Return the compressor name corresponding to the internal integer code used by Blosc. +Throws an `ArgumentError` if `compcode` is not a valid code. 
+""" +function compname(compcode::Integer) + name = Ref{Ptr{UInt8}}() + code = @ccall libblosc2.blosc2_compcode_to_compname(compcode::Cint, name::Ref{Ptr{UInt8}})::Cint + code == -1 && throw(ArgumentError("unrecognized compcode $compcode")) + name = name[] + return unsafe_string(name) +end + +################################################################################ + +# The following is the original license info from blosc2.h and LICENSE.txt + +#= +/********************************************************************* + Blosc - Blocked Shuffling and Compression Library + + Copyright (c) 2021 Blosc Development Team + https://blosc.org + License: BSD 3-Clause (see LICENSE.txt) + + See LICENSE.txt for details about copyright and rights to use. +**********************************************************************/ +=# + +#= contents of LICENSE.txt +BSD License + +For Blosc - A blocking, shuffling and lossless compression library + +Copyright (c) 2009-2018 Francesc Alted +Copyright (c) 2019-present Blosc Development Team + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + * Neither the name Francesc Alted nor the names of its contributors may be used + to endorse or promote products derived from this software without specific + prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +=# diff --git a/LibBlosc2/test/Project.toml b/LibBlosc2/test/Project.toml new file mode 100644 index 0000000..d98123d --- /dev/null +++ b/LibBlosc2/test/Project.toml @@ -0,0 +1,7 @@ +[deps] +Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" +ChunkCodecCore = "0b6fb165-00bc-4d37-ab8b-79f91016dbe1" +ChunkCodecLibBlosc2 = "59b5581c-e2bc-42b3-a6f1-80e88eec7b70" +ChunkCodecTests = "06b1ce50-b741-4199-b118-ba5fe1a70fa7" +Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/LibBlosc2/test/runtests.jl b/LibBlosc2/test/runtests.jl new file mode 100644 index 0000000..a038f7d --- /dev/null +++ b/LibBlosc2/test/runtests.jl @@ -0,0 +1,100 @@ +using Random: Random +using ChunkCodecLibBlosc2: + ChunkCodecLibBlosc2, + Blosc2Codec, + Blosc2EncodeOptions, + Blosc2DecodeOptions, + Blosc2DecodingError +using ChunkCodecCore: decode, encode +using ChunkCodecTests: test_codec +using Test: @testset, @test_throws, @test +using Aqua: Aqua + +Aqua.test_all(ChunkCodecLibBlosc2; persistent_tasks=false) + +Random.seed!(1234) + +#TODO @testset "default" begin +#TODO test_codec(Blosc2Codec(), Blosc2EncodeOptions(), Blosc2DecodeOptions(); trials=100) +#TODO end +#TODO @testset "typesize" begin +#TODO for i in 1:50 +#TODO test_codec(Blosc2Codec(), Blosc2EncodeOptions(; typesize=i), Blosc2DecodeOptions(); trials=10) +#TODO end +#TODO end +#TODO @testset "compressors" begin +#TODO for clevel in 0:9 +#TODO for compressor in ["blosclz", 
"lz4", "lz4hc", "zlib", "zstd"] +#TODO test_codec(Blosc2Codec(), Blosc2EncodeOptions(; compressor, clevel), Blosc2DecodeOptions(); trials=10) +#TODO end +#TODO end +#TODO end +#TODO @testset "invalid options" begin +#TODO @test Blosc2EncodeOptions(; clevel=-1).clevel == 0 +#TODO @test Blosc2EncodeOptions(; clevel=100).clevel == 9 +#TODO # typesize can be anything, but out of the range it gets set to 1 +#TODO e = Blosc2EncodeOptions(; typesize=typemax(UInt128)) +#TODO @test e.typesize == 1 +#TODO e = Blosc2EncodeOptions(; typesize=0) +#TODO @test e.typesize == 1 +#TODO e = Blosc2EncodeOptions(; typesize=-1) +#TODO @test e.typesize == 1 +#TODO e = Blosc2EncodeOptions(; typesize=ChunkCodecLibBlosc2.BLOSC_MAX_TYPESIZE) +#TODO @test e.typesize == ChunkCodecLibBlosc2.BLOSC_MAX_TYPESIZE +#TODO e = Blosc2EncodeOptions(; typesize=(ChunkCodecLibBlosc2.BLOSC_MAX_TYPESIZE+1)) +#TODO @test e.typesize == 1 +#TODO @test_throws ArgumentError Blosc2EncodeOptions(; compressor="") +#TODO @test_throws ArgumentError Blosc2EncodeOptions(; compressor="asfdgfsdgrwwea") +#TODO @test_throws ArgumentError Blosc2EncodeOptions(; compressor="blosclz,") +#TODO @test_throws ArgumentError Blosc2EncodeOptions(; compressor="blosclz\0") +#TODO end +#TODO @testset "compcode and compname" begin +#TODO @test ChunkCodecLibBlosc2.compcode("blosclz") == 0 +#TODO @test ChunkCodecLibBlosc2.is_compressor_valid("blosclz") +#TODO @test ChunkCodecLibBlosc2.compname(0) == "blosclz" +#TODO +#TODO @test_throws ArgumentError ChunkCodecLibBlosc2.compcode("sdaffads") +#TODO @test !ChunkCodecLibBlosc2.is_compressor_valid("sdaffads") +#TODO @test_throws ArgumentError ChunkCodecLibBlosc2.compcode("sdaffads") +#TODO @test_throws ArgumentError ChunkCodecLibBlosc2.compname(100) +#TODO +#TODO @test !ChunkCodecLibBlosc2.is_compressor_valid("\0") +#TODO end +@testset "errors" begin + # check Blosc2DecodingError prints the correct error message + @test sprint(Base.showerror, Blosc2DecodingError()) == "Blosc2DecodingError: 
blosc2 compressed buffer cannot be decoded" + # check that a truncated buffer throws a Blosc2DecodingError + u = UInt8[0x00] + c = encode(Blosc2EncodeOptions(), u) + @test_throws Blosc2DecodingError decode(Blosc2DecodeOptions(), c[1:(end - 1)]) + @test_throws Blosc2DecodingError decode(Blosc2DecodeOptions(), UInt8[0x00]) + # check that a buffer with extra data throws a Blosc2DecodingError + @test_throws Blosc2DecodingError decode(Blosc2DecodeOptions(), [c; 0x00;]) + # check corrupting LZ4 encoding throws a Blosc2DecodingError + u = zeros(UInt8, 1000) + c = encode(Blosc2EncodeOptions(), u) + + c[end-5] = 0x40 + # Blosc2 does not detect this corruption. (Apparently it stores + # unused and unchecked data in the trailer near the end of the + # compressed data.) We check whether at least the decompressed + # data are correct. + # BROKEN @test_throws Blosc2DecodingError decode(Blosc2DecodeOptions(), c) + @test decode(Blosc2DecodeOptions(), c) == u + + # There's more unused/unchecked data + c[end-50] = 0x40 + # BROKEN @test_throws Blosc2DecodingError decode(Blosc2DecodeOptions(), c) + @test decode(Blosc2DecodeOptions(), c) == u + + # Finally, this corruption has an effect + c[end-100] = 0x40 + @test_throws Blosc2DecodingError decode(Blosc2DecodeOptions(), c) +end +@testset "public" begin + if VERSION >= v"1.11.0-DEV.469" + for sym in (:is_compressor_valid, :compcode, :compname) + @test Base.ispublic(ChunkCodecLibBlosc2, sym) + end + end +end diff --git a/Project.toml b/Project.toml index cd45adb..c00a838 100644 --- a/Project.toml +++ b/Project.toml @@ -3,6 +3,7 @@ projects = [ "ChunkCodecCore", "ChunkCodecTests", "LibBlosc", + "LibBlosc2", "LibBrotli", "LibBzip2", "LibLz4", From 50b017339959f33037a4051111cca9e9ba237f12 Mon Sep 17 00:00:00 2001 From: Erik Schnetter Date: Sun, 8 Jun 2025 13:09:29 -0400 Subject: [PATCH 02/14] CI: Enable LibBlosc2 --- .github/workflows/CI.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/CI.yml 
b/.github/workflows/CI.yml index 1850535..a40b1ec 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -37,6 +37,11 @@ jobs: - ChunkCodecCore/** - ChunkCodecTests/** - LibBlosc/** + LibBlosc2: + - .github/** + - ChunkCodecCore/** + - ChunkCodecTests/** + - LibBlosc2/** LibBrotli: - .github/** - ChunkCodecCore/** From 11d64b1d75fa2c2c9a4ba5b13cd0e71434068196 Mon Sep 17 00:00:00 2001 From: Erik Schnetter Date: Sun, 8 Jun 2025 13:43:39 -0400 Subject: [PATCH 03/14] LibBlosc2: Allow dynamic chunk sizes --- LibBlosc2/src/decode.jl | 3 +- LibBlosc2/src/encode.jl | 18 +++---- LibBlosc2/test/runtests.jl | 101 ++++++++++++++++++++----------------- 3 files changed, 66 insertions(+), 56 deletions(-) diff --git a/LibBlosc2/src/decode.jl b/LibBlosc2/src/decode.jl index cf72432..c943bd4 100644 --- a/LibBlosc2/src/decode.jl +++ b/LibBlosc2/src/decode.jl @@ -105,7 +105,8 @@ function try_decode!(d::Blosc2DecodeOptions, dst::AbstractVector{UInt8}, src::Ab end # TODO: Use this instead of checking each chunk - @assert unsafe_load(schunk).nbytes == nbytes + # overall uncompressed size: unsafe_load(schunk).nbytes + # this chunk uncompressed size: nbytes if total_nbytes + nbytes > length(dst) there_was_an_error = true diff --git a/LibBlosc2/src/encode.jl b/LibBlosc2/src/encode.jl index 7bb9937..221b96e 100644 --- a/LibBlosc2/src/encode.jl +++ b/LibBlosc2/src/encode.jl @@ -27,6 +27,7 @@ struct Blosc2EncodeOptions <: EncodeOptions clevel::Int32 doshuffle::Int32 typesize::Int64 + chunksize::Int64 compressor::String end function Blosc2EncodeOptions(; @@ -34,6 +35,7 @@ function Blosc2EncodeOptions(; clevel::Integer=5, doshuffle::Integer=1, typesize::Integer=1, + chunksize::Integer=Int64(1024)^3, # 1 GByte compressor::AbstractString="lz4", kwargs...) 
_clevel = Int32(clamp(clevel, 0, 9)) @@ -43,22 +45,20 @@ function Blosc2EncodeOptions(; else Int64(1) end + _chunksize = Int64(clamp(chunksize, 1024, Int64(1024)^3)) # 1 GByte is_compressor_valid(compressor) || throw(ArgumentError("is_compressor_valid(compressor) must hold. Got\ncompressor => $(repr(compressor))")) - return Blosc2EncodeOptions(codec, _clevel, doshuffle, _typesize, compressor) + return Blosc2EncodeOptions(codec, _clevel, doshuffle, _typesize, _chunksize, compressor) end -# The maximum chunk size we're using: 1 GByte (must be less than 2 GByte including overhead) -const MAX_CHUNK_SIZE = Int64(1024)^3 - # The maximum overhead for the schunk const MAX_SCHUNK_OVERHEAD = 172 # apparently undocumented -- just a guess # We just punt with the upper bound. typemax(Int64) is a huge number anyway. decoded_size_range(e::Blosc2EncodeOptions) = Int64(0):Int64(e.typesize):(typemax(Int64) ÷ 2) -function encode_bound(::Blosc2EncodeOptions, src_size::Int64)::Int64 - return clamp(widen(src_size) + cld(src_size, MAX_CHUNK_SIZE) * BLOSC2_MAX_OVERHEAD + MAX_SCHUNK_OVERHEAD, Int64) +function encode_bound(e::Blosc2EncodeOptions, src_size::Int64)::Int64 + return clamp(widen(src_size) + cld(src_size, e.chunksize) * BLOSC2_MAX_OVERHEAD + MAX_SCHUNK_OVERHEAD, Int64) end function try_encode!(e::Blosc2EncodeOptions, dst::AbstractVector{UInt8}, src::AbstractVector{UInt8}; @@ -102,14 +102,14 @@ function try_encode!(e::Blosc2EncodeOptions, dst::AbstractVector{UInt8}, src::Ab @assert schunk != Ptr{Blosc2Storage}() # Break input into chunks - for pos in 1:MAX_CHUNK_SIZE:src_size - endpos = min(src_size, pos + MAX_CHUNK_SIZE - 1) + for pos in 1:e.chunksize:src_size + endpos = min(src_size, pos + e.chunksize - 1) srcview = @view src[pos:endpos] nbytes = length(srcview) nchunks = @ccall libblosc2.blosc2_schunk_append_buffer(schunk::Ptr{Blosc2SChunk}, srcview::Ptr{Cvoid}, nbytes::Int32)::Int64 @assert nchunks >= 0 - @assert nchunks == (pos-1) ÷ MAX_CHUNK_SIZE + 1 + @assert nchunks == 
(pos-1) ÷ e.chunksize + 1 end cframe = Ref{Ptr{UInt8}}() diff --git a/LibBlosc2/test/runtests.jl b/LibBlosc2/test/runtests.jl index a038f7d..37b7eb9 100644 --- a/LibBlosc2/test/runtests.jl +++ b/LibBlosc2/test/runtests.jl @@ -14,52 +14,61 @@ Aqua.test_all(ChunkCodecLibBlosc2; persistent_tasks=false) Random.seed!(1234) -#TODO @testset "default" begin -#TODO test_codec(Blosc2Codec(), Blosc2EncodeOptions(), Blosc2DecodeOptions(); trials=100) -#TODO end -#TODO @testset "typesize" begin -#TODO for i in 1:50 -#TODO test_codec(Blosc2Codec(), Blosc2EncodeOptions(; typesize=i), Blosc2DecodeOptions(); trials=10) -#TODO end -#TODO end -#TODO @testset "compressors" begin -#TODO for clevel in 0:9 -#TODO for compressor in ["blosclz", "lz4", "lz4hc", "zlib", "zstd"] -#TODO test_codec(Blosc2Codec(), Blosc2EncodeOptions(; compressor, clevel), Blosc2DecodeOptions(); trials=10) -#TODO end -#TODO end -#TODO end -#TODO @testset "invalid options" begin -#TODO @test Blosc2EncodeOptions(; clevel=-1).clevel == 0 -#TODO @test Blosc2EncodeOptions(; clevel=100).clevel == 9 -#TODO # typesize can be anything, but out of the range it gets set to 1 -#TODO e = Blosc2EncodeOptions(; typesize=typemax(UInt128)) -#TODO @test e.typesize == 1 -#TODO e = Blosc2EncodeOptions(; typesize=0) -#TODO @test e.typesize == 1 -#TODO e = Blosc2EncodeOptions(; typesize=-1) -#TODO @test e.typesize == 1 -#TODO e = Blosc2EncodeOptions(; typesize=ChunkCodecLibBlosc2.BLOSC_MAX_TYPESIZE) -#TODO @test e.typesize == ChunkCodecLibBlosc2.BLOSC_MAX_TYPESIZE -#TODO e = Blosc2EncodeOptions(; typesize=(ChunkCodecLibBlosc2.BLOSC_MAX_TYPESIZE+1)) -#TODO @test e.typesize == 1 -#TODO @test_throws ArgumentError Blosc2EncodeOptions(; compressor="") -#TODO @test_throws ArgumentError Blosc2EncodeOptions(; compressor="asfdgfsdgrwwea") -#TODO @test_throws ArgumentError Blosc2EncodeOptions(; compressor="blosclz,") -#TODO @test_throws ArgumentError Blosc2EncodeOptions(; compressor="blosclz\0") -#TODO end -#TODO @testset "compcode and 
compname" begin -#TODO @test ChunkCodecLibBlosc2.compcode("blosclz") == 0 -#TODO @test ChunkCodecLibBlosc2.is_compressor_valid("blosclz") -#TODO @test ChunkCodecLibBlosc2.compname(0) == "blosclz" -#TODO -#TODO @test_throws ArgumentError ChunkCodecLibBlosc2.compcode("sdaffads") -#TODO @test !ChunkCodecLibBlosc2.is_compressor_valid("sdaffads") -#TODO @test_throws ArgumentError ChunkCodecLibBlosc2.compcode("sdaffads") -#TODO @test_throws ArgumentError ChunkCodecLibBlosc2.compname(100) -#TODO -#TODO @test !ChunkCodecLibBlosc2.is_compressor_valid("\0") -#TODO end +@testset "default" begin + test_codec(Blosc2Codec(), Blosc2EncodeOptions(), Blosc2DecodeOptions(); trials=100) +end +@testset "typesize" begin + for i in 1:50 + test_codec(Blosc2Codec(), Blosc2EncodeOptions(; typesize=i), Blosc2DecodeOptions(); trials=10) + end +end +@testset "compressors" begin + for clevel in 0:9 + for compressor in ["blosclz", "lz4", "lz4hc", "zlib", "zstd"] + test_codec(Blosc2Codec(), Blosc2EncodeOptions(; compressor, clevel), Blosc2DecodeOptions(); trials=10) + end + end +end +@testset "large inputs" begin + # We cannot really test large inputs (multi-Gigabyte) in a regular test. + # We therefore simulate this with smaller inputs and a ridiculously small chunk size. 
+ u = reinterpret(UInt8, collect(float(1:10^6))) + e = Blosc2EncodeOptions(; clevel=9, doshuffle=2, typesize=sizeof(float(1)), chunksize=10^4, compressor="zstd") + c = encode(e, u) + u′ = decode(Blosc2DecodeOptions(), c) + @test u′ == u +end +@testset "invalid options" begin + @test Blosc2EncodeOptions(; clevel=-1).clevel == 0 + @test Blosc2EncodeOptions(; clevel=100).clevel == 9 + # typesize can be anything, but out of the range it gets set to 1 + e = Blosc2EncodeOptions(; typesize=typemax(UInt128)) + @test e.typesize == 1 + e = Blosc2EncodeOptions(; typesize=0) + @test e.typesize == 1 + e = Blosc2EncodeOptions(; typesize=-1) + @test e.typesize == 1 + e = Blosc2EncodeOptions(; typesize=ChunkCodecLibBlosc2.BLOSC_MAX_TYPESIZE) + @test e.typesize == ChunkCodecLibBlosc2.BLOSC_MAX_TYPESIZE + e = Blosc2EncodeOptions(; typesize=(ChunkCodecLibBlosc2.BLOSC_MAX_TYPESIZE+1)) + @test e.typesize == 1 + @test_throws ArgumentError Blosc2EncodeOptions(; compressor="") + @test_throws ArgumentError Blosc2EncodeOptions(; compressor="asfdgfsdgrwwea") + @test_throws ArgumentError Blosc2EncodeOptions(; compressor="blosclz,") + @test_throws ArgumentError Blosc2EncodeOptions(; compressor="blosclz\0") +end +@testset "compcode and compname" begin + @test ChunkCodecLibBlosc2.compcode("blosclz") == 0 + @test ChunkCodecLibBlosc2.is_compressor_valid("blosclz") + @test ChunkCodecLibBlosc2.compname(0) == "blosclz" + + @test_throws ArgumentError ChunkCodecLibBlosc2.compcode("sdaffads") + @test !ChunkCodecLibBlosc2.is_compressor_valid("sdaffads") + @test_throws ArgumentError ChunkCodecLibBlosc2.compcode("sdaffads") + @test_throws ArgumentError ChunkCodecLibBlosc2.compname(100) + + @test !ChunkCodecLibBlosc2.is_compressor_valid("\0") +end @testset "errors" begin # check Blosc2DecodingError prints the correct error message @test sprint(Base.showerror, Blosc2DecodingError()) == "Blosc2DecodingError: blosc2 compressed buffer cannot be decoded" From 22b219d56c31276ba8802b9a64dc413e8882c9fd Mon Sep 17 
00:00:00 2001 From: Erik Schnetter Date: Sun, 8 Jun 2025 14:03:17 -0400 Subject: [PATCH 04/14] README: List Blosc2 --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 043d285..6977457 100644 --- a/README.md +++ b/README.md @@ -25,6 +25,7 @@ A consistent Julia interface for lossless encoding and decoding of bytes in memo | BZ2 | .bz2 bzip2 | ChunkCodecLibBzip2 | ✅ | ✅ | | Brotli | .br RFC7932 | ChunkCodecLibBrotli | ✅ | ✅ | | Blosc | | ChunkCodecLibBlosc | ✅ | ✅ | +| Blosc2 | | ChunkCodecLibBlosc2 | ✅ | ✅ | ## Simple encoding and decoding From 173f6291c9a38b8b86fa725ef8b774d1362e4c91 Mon Sep 17 00:00:00 2001 From: Erik Schnetter Date: Sun, 8 Jun 2025 14:36:47 -0400 Subject: [PATCH 05/14] Improve code --- LibBlosc2/src/decode.jl | 88 ++++++++++---------------------------- LibBlosc2/src/encode.jl | 3 +- LibBlosc2/test/runtests.jl | 2 +- 3 files changed, 25 insertions(+), 68 deletions(-) diff --git a/LibBlosc2/src/decode.jl b/LibBlosc2/src/decode.jl index c943bd4..1532d57 100644 --- a/LibBlosc2/src/decode.jl +++ b/LibBlosc2/src/decode.jl @@ -37,33 +37,7 @@ function try_find_decoded_size(::Blosc2DecodeOptions, src::AbstractVector{UInt8} end @ccall libblosc2.blosc2_schunk_avoid_cframe_free(schunk::Ptr{Blosc2SChunk}, true::UInt8)::Cvoid - total_nbytes = Int64(0) - - nchunks = unsafe_load(schunk).nchunks - for nchunk in 0:(nchunks - 1) - cbuffer = Ref{Ptr{UInt8}}() - needs_free = Ref{UInt8}() - chunksize = @ccall libblosc2.blosc2_schunk_get_chunk(schunk::Ptr{Blosc2SChunk}, nchunk::Int64, cbuffer::Ref{Ptr{UInt8}}, - needs_free::Ref{UInt8})::Cint - @assert chunksize > 0 - cbuffer = cbuffer[] - needs_free = Bool(needs_free[]) - - nbytes = Ref{Int32}() - success = @ccall libblosc2.blosc1_cbuffer_validate(cbuffer::Ptr{Cvoid}, chunksize::Cint, nbytes::Ref{Cint})::Cint - @assert success == 0 - nbytes = nbytes[] - - total_nbytes += nbytes - - if needs_free - # We could provide buffer into which to decode instead, reusing that buffer - 
Libc.free(cbuffer) - end - end - - # TODO: Use this instead of the loop above - @assert unsafe_load(schunk).nbytes == total_nbytes + total_nbytes = unsafe_load(schunk).nbytes success = @ccall libblosc2.blosc2_schunk_free(schunk::Ptr{Cvoid})::Cint @assert success == 0 @@ -78,56 +52,38 @@ function try_decode!(d::Blosc2DecodeOptions, dst::AbstractVector{UInt8}, src::Ab check_contiguous(dst) check_contiguous(src) - schunk = @ccall libblosc2.blosc2_schunk_from_buffer(src::Ptr{UInt8}, length(src)::Int64, false::UInt8)::Ptr{Blosc2SChunk} - @assert schunk != Ptr{Blosc2Storage}() + copy_cframe = false + schunk = @ccall libblosc2.blosc2_schunk_from_buffer(src::Ptr{UInt8}, length(src)::Int64, copy_cframe::UInt8)::Ptr{Blosc2SChunk} + if schunk == Ptr{Blosc2Storage}() + # These are not a valid blosc2-encoded data + throw(Blosc2DecodingError()) + end @ccall libblosc2.blosc2_schunk_avoid_cframe_free(schunk::Ptr{Blosc2SChunk}, true::UInt8)::Cvoid - there_was_an_error = false - total_nbytes = Int64(0) - - nchunks = unsafe_load(schunk).nchunks - for nchunk in 0:(nchunks - 1) - cbuffer = Ref{Ptr{UInt8}}() - needs_free = Ref{UInt8}() - chunksize = @ccall libblosc2.blosc2_schunk_get_chunk(schunk::Ptr{Blosc2SChunk}, nchunk::Int64, cbuffer::Ref{Ptr{UInt8}}, - needs_free::Ref{UInt8})::Cint - @assert chunksize > 0 - cbuffer = cbuffer[] - needs_free = Bool(needs_free[]) - - nbytes = Ref{Int32}() - success = @ccall libblosc2.blosc1_cbuffer_validate(cbuffer::Ptr{Cvoid}, chunksize::Cint, nbytes::Ref{Cint})::Cint + total_nbytes = unsafe_load(schunk).nbytes + if total_nbytes > length(dst) + # There is not enough space to decode the data + success = @ccall libblosc2.blosc2_schunk_free(schunk::Ptr{Cvoid})::Cint @assert success == 0 - nbytes = nbytes[] - - if needs_free - Libc.free(cbuffer) - end - # TODO: Use this instead of checking each chunk - # overall uncompressed size: unsafe_load(schunk).nbytes - # this chunk uncompressed size: nbytes + return nothing + end - if total_nbytes + nbytes > 
length(dst) - there_was_an_error = true - break - end + dst_position = Int64(0) - @assert total_nbytes + nbytes <= length(dst) - nbytes′ = @ccall libblosc2.blosc2_schunk_decompress_chunk(schunk::Ptr{Blosc2SChunk}, nchunk::Int64, - pointer(dst, total_nbytes+1)::Ptr{Cvoid}, nbytes::Int32)::Cint - @assert nbytes′ >= 0 - @assert nbytes′ == nbytes + nchunks = unsafe_load(schunk).nchunks + for nchunk in 0:(nchunks - 1) + nbytes_left = clamp(total_nbytes - dst_position, Int32) + nbytes = @ccall libblosc2.blosc2_schunk_decompress_chunk(schunk::Ptr{Blosc2SChunk}, nchunk::Int64, + pointer(dst, dst_position+1)::Ptr{Cvoid}, nbytes_left::Int32)::Cint + @assert nbytes > 0 - total_nbytes += nbytes + dst_position += nbytes end + @assert dst_position == total_nbytes success = @ccall libblosc2.blosc2_schunk_free(schunk::Ptr{Cvoid})::Cint @assert success == 0 - if there_was_an_error - return nothing - end - return total_nbytes::Int64 end diff --git a/LibBlosc2/src/encode.jl b/LibBlosc2/src/encode.jl index 221b96e..02226e4 100644 --- a/LibBlosc2/src/encode.jl +++ b/LibBlosc2/src/encode.jl @@ -121,7 +121,8 @@ function try_encode!(e::Blosc2EncodeOptions, dst::AbstractVector{UInt8}, src::Ab needs_free = Bool(needs_free[]) if compressed_size <= length(dst) - # TODO: Encode directly into `dst` + # We should try to encode directly into `dst`. (This may + # not be possible with the Blosc2 API.) unsafe_copyto!(pointer(dst), cframe, compressed_size) else # Insufficient space to stored compressed data. diff --git a/LibBlosc2/test/runtests.jl b/LibBlosc2/test/runtests.jl index 37b7eb9..3d9b9a2 100644 --- a/LibBlosc2/test/runtests.jl +++ b/LibBlosc2/test/runtests.jl @@ -32,7 +32,7 @@ end @testset "large inputs" begin # We cannot really test large inputs (multi-Gigabyte) in a regular test. # We therefore simulate this with smaller inputs and a ridiculously small chunk size. 
- u = reinterpret(UInt8, collect(float(1:10^6))) + u = reinterpret(UInt8, collect(float(1:(10 ^ 6)))) e = Blosc2EncodeOptions(; clevel=9, doshuffle=2, typesize=sizeof(float(1)), chunksize=10^4, compressor="zstd") c = encode(e, u) u′ = decode(Blosc2DecodeOptions(), c) From bc406576397fb665ee5dc8c722864bae558a17f2 Mon Sep 17 00:00:00 2001 From: Erik Schnetter Date: Mon, 9 Jun 2025 12:44:04 -0400 Subject: [PATCH 06/14] No global initialization --- LibBlosc2/src/ChunkCodecLibBlosc2.jl | 5 +- LibBlosc2/src/decode.jl | 27 ++++- LibBlosc2/src/encode.jl | 152 +++++++++++++++++++++------ LibBlosc2/src/libblosc2.jl | 15 +++ LibBlosc2/test/runtests.jl | 12 +-- 5 files changed, 166 insertions(+), 45 deletions(-) diff --git a/LibBlosc2/src/ChunkCodecLibBlosc2.jl b/LibBlosc2/src/ChunkCodecLibBlosc2.jl index 38768a7..d0fe448 100644 --- a/LibBlosc2/src/ChunkCodecLibBlosc2.jl +++ b/LibBlosc2/src/ChunkCodecLibBlosc2.jl @@ -1,6 +1,7 @@ module ChunkCodecLibBlosc2 using Base.Libc: free +using Base.Threads using Accessors @@ -54,8 +55,4 @@ decode_options(::Blosc2Codec) = Blosc2DecodeOptions() include("encode.jl") include("decode.jl") -# Initialize the Blosc2 library. This function is idempotent, i.e. it -# can be called called multiple times without harm. -__init__() = @ccall libblosc2.blosc2_init()::Cvoid - end # module ChunkCodecLibBlosc2 diff --git a/LibBlosc2/src/decode.jl b/LibBlosc2/src/decode.jl index 1532d57..204a36f 100644 --- a/LibBlosc2/src/decode.jl +++ b/LibBlosc2/src/decode.jl @@ -20,15 +20,34 @@ Blosc2 decompression using c-blosc2 library: https://github.com/Blosc/c-blosc2 # Keyword Arguments - `codec::Blosc2Codec=Blosc2Codec()` + +# Keyword Arguments + +- `codec::Blosc2Codec=Blosc2Codec()` +- `nthreads::Integer=1`: The number of threads to use """ struct Blosc2DecodeOptions <: DecodeOptions codec::Blosc2Codec + + nthreads::Int end -Blosc2DecodeOptions(; codec::Blosc2Codec=Blosc2Codec(), kwargs...) 
= Blosc2DecodeOptions(codec) +function Blosc2DecodeOptions(; codec::Blosc2Codec=Blosc2Codec(), + nthreads::Integer=1, + kwargs...) + _nthreads = nthreads + check_in_range(1:typemax(Int32); nthreads=_nthreads) + + return Blosc2DecodeOptions(codec, _nthreads) +end + +# This decoder is thread safe: We don't use any of Blosc2's global variables. +is_thread_safe(::Blosc2DecodeOptions) = true function try_find_decoded_size(::Blosc2DecodeOptions, src::AbstractVector{UInt8})::Int64 check_contiguous(src) + blosc2_init() + copy_cframe = false schunk = @ccall libblosc2.blosc2_schunk_from_buffer(src::Ptr{UInt8}, length(src)::Int64, copy_cframe::UInt8)::Ptr{Blosc2SChunk} if schunk == Ptr{Blosc2Storage}() @@ -52,6 +71,12 @@ function try_decode!(d::Blosc2DecodeOptions, dst::AbstractVector{UInt8}, src::Ab check_contiguous(dst) check_contiguous(src) + blosc2_init() + + # I don't think there is a way to specify a decompression context. + # That means that our `Blosc2DecodeOptions` will be unused. + # We could try writing to the `dctx` field in the `schunk`. + copy_cframe = false schunk = @ccall libblosc2.blosc2_schunk_from_buffer(src::Ptr{UInt8}, length(src)::Int64, copy_cframe::UInt8)::Ptr{Blosc2SChunk} if schunk == Ptr{Blosc2Storage}() diff --git a/LibBlosc2/src/encode.jl b/LibBlosc2/src/encode.jl index 02226e4..e95826c 100644 --- a/LibBlosc2/src/encode.jl +++ b/LibBlosc2/src/encode.jl @@ -7,50 +7,133 @@ Blosc2 compression using c-blosc2 library: https://github.com/Blosc2/c-blosc2 # Keyword Arguments - `codec::Blosc2Codec=Blosc2Codec()` -- `clevel::Integer=5`: The compression level, between 0 (no compression) and 9 (maximum compression) -- `doshuffle::Integer=1`: Whether to use the shuffle filter. +- `doshuffle::Union{Integer,Symbol,AbstractString}=1`: Whether to use the shuffle filter. 
+ + Possible values are + - `:noshuffle`, `"noshuffle"`, 0: do not shuffle + - `:shuffle`, `"shuffle"`, 1: shuffle bytes + - `:bitshuffle`, `"bitshuffle"`, 2: shuffle bits (slower but compresses better) +- `dodelta::Union{Integer,Symbol,AbstractString}=1`: Whether to use the delta filter. - 0 means not applying it, 1 means applying it at a byte level, - and 2 means at a bit level (slower but may achieve better entropy alignment). -- `typesize::Integer=1`: The element size to use when shuffling. + Possible values are + - `:nofilter`, `"nofilter"`, 0: no filter + - `:delta`, `"delta"`, 1: use delta filter +- `typesize::Integer=8`: The element size to use when shuffling. - For implementation reasons, only `typesize` in `1:$(BLOSC_MAX_TYPESIZE)` will allow the - shuffle filter to work. When `typesize` is not in this range, shuffle - will be silently disabled. -- `compressor::AbstractString="lz4"`: The string representing the type of compressor to use. + `typesize` must be in the range `1:$(BLOSC_MAX_TYPESIZE)`. +- `clevel::Integer=5`: The compression level, between 0 (no compression) and 9 (maximum compression) +- `compressor::AbstractString="blosclz"`: The string representing the type of compressor to use. - For example, "blosclz", "lz4", "lz4hc", "zlib", or "zstd". + For example, `"blosclz"`, `"lz4"`, `"lz4hc"`, `"zlib"`, or `"zstd"`. Use `is_compressor_valid` to check if a compressor is supported. 
+- `blocksize::Integer=0`: Length of block in bytes (0 for automatic choice) +- `nthreads::Integer=1`: The number of threads to use +- `splitmode::Union{Integer,Symbol,AbstractString}=4: Whether blocks should be split or not + + Possible values are + - `:always`, `"always"`, 1 + - `:never`, `"never"`, 2 + - `:auto`, `"auto"`, 3 + - `:forward_compat`, `"forward_compat"`, 4: default setting +- `chunksize::Integer=1024^3`: Chunk size for very large inputs """ struct Blosc2EncodeOptions <: EncodeOptions codec::Blosc2Codec - clevel::Int32 - doshuffle::Int32 - typesize::Int64 - chunksize::Int64 + + doshuffle::Int # :noshuffle, :shuffle, :bitshuffle + dodelta::Int # :nofilter, :delta + typesize::Int + clevel::Int compressor::String + blocksize::Int + nthreads::Int + splitmode::Int # :always, :never, :auto, :forward_compat + + chunksize::Int64 end function Blosc2EncodeOptions(; codec::Blosc2Codec=Blosc2Codec(), + doshuffle::Union{Integer,Symbol,AbstractString}=1, + dodelta::Union{Integer,Symbol,AbstractString}=0, + typesize::Integer=8, clevel::Integer=5, - doshuffle::Integer=1, - typesize::Integer=1, + compressor::Union{Symbol,AbstractString}=:blosclz, + blocksize::Integer=0, + nthreads::Integer=1, + splitmode::Union{Integer,Symbol,AbstractString}=4, chunksize::Integer=Int64(1024)^3, # 1 GByte - compressor::AbstractString="lz4", kwargs...) 
- _clevel = Int32(clamp(clevel, 0, 9)) - check_in_range(0:2; doshuffle) - _typesize = if typesize ∈ 2:BLOSC_MAX_TYPESIZE - Int64(typesize) - else - Int64(1) + _doshuffle = doshuffle + if _doshuffle isa AbstractString + _doshuffle = Symbol(lowercase(_doshuffle)) end - _chunksize = Int64(clamp(chunksize, 1024, Int64(1024)^3)) # 1 GByte - is_compressor_valid(compressor) || + if _doshuffle isa Symbol + _doshuffle = get(Dict(:noshuffle => 0, + :shuffle => 1, + :bitshuffle => 2), _doshuffle, -1) + _doshuffle >= 0 || + throw(ArgumentError("Unknown `doshuffle` value `$(repr(doshuffle))`")) + end + _doshuffle::Integer + check_in_range(0:2; doshuffle=_doshuffle) + + _dodelta = dodelta + if _dodelta isa AbstractString + _dodelta = Symbol(lowercase(_dodelta)) + end + if _dodelta isa Symbol + _dodelta = get(Dict(:nofilter => 0, + :delta => 1), _dodelta, -1) + _dodelta >= 0 || + throw(ArgumentError("Unknown `dodelta` value `$(repr(dodelta))`")) + end + _dodelta::Integer + check_in_range(0:1; dodelta=_dodelta) + + _typesize = typesize + if _typesize ∉ 1:BLOSC_MAX_TYPESIZE + _typesize = 8 # use default + end + + _clevel = clamp(clevel, 0:9) + + _compressor = compressor + if _compressor isa Symbol + _compressor = string(_compressor) + end + is_compressor_valid(_compressor) || throw(ArgumentError("is_compressor_valid(compressor) must hold. 
Got\ncompressor => $(repr(compressor))")) - return Blosc2EncodeOptions(codec, _clevel, doshuffle, _typesize, _chunksize, compressor) + + _blocksize = blocksize + check_in_range(0:typemax(Int32); blocksize=_blocksize) + + _nthreads=nthreads + check_in_range(1:typemax(Int32); nthreads=_nthreads) + + _splitmode = splitmode + if _splitmode isa AbstractString + _splitmode = Symbol(lowercase(_splitmode)) + end + if _splitmode isa Symbol + _splitmode = get(Dict(:always => 1, + :never => 2, + :auto => 3, + :forward_compat => 4), _splitmode, -1) + _splitmode >= 0 || + throw(ArgumentError("Unknown `splitmode` value `$(repr(splitmode))`")) + end + _splitmode::Integer + check_in_range(1:4; splitmode=_splitmode) + + _chunksize = clamp(chunksize, 1024, Int64(1024)^3) # at least 1 kByte, at most 1 GByte + + return Blosc2EncodeOptions(codec, + _doshuffle, _dodelta, _typesize, _clevel, _compressor, _blocksize, _nthreads, _splitmode, _chunksize) end +# This encoder is thread safe: We don't use any of Blosc2's global variables. 
+is_thread_safe(::Blosc2EncodeOptions) = true + # The maximum overhead for the schunk const MAX_SCHUNK_OVERHEAD = 172 # apparently undocumented -- just a guess @@ -69,35 +152,36 @@ function try_encode!(e::Blosc2EncodeOptions, dst::AbstractVector{UInt8}, src::Ab dst_size::Int64 = length(dst) check_in_range(decoded_size_range(e); src_size) + blosc2_init() + ccode = compcode(e.compressor) @assert ccode >= 0 - numinternalthreads = 1 # Create a super-chunk container cparams = Blosc2CParams() @reset cparams.typesize = e.typesize @reset cparams.compcode = ccode @reset cparams.clevel = e.clevel - @reset cparams.nthreads = numinternalthreads + @reset cparams.nthreads = e.nthreads + @reset cparams.blocksize = e.blocksize + @reset cparams.splitmode = e.splitmode @reset cparams.filters[BLOSC2_MAX_FILTERS] = e.doshuffle + if e.dodelta > 0 + @reset cparams.filters[BLOSC2_MAX_FILTERS-1] = e.dodelta + end cparams_obj = [cparams] - dparams = Blosc2DParams() - @reset dparams.nthreads = numinternalthreads - dparams_obj = [dparams] - io = Blosc2IO() io_obj = [io] storage = Blosc2Storage() @reset storage.cparams = pointer(cparams_obj) - @reset storage.dparams = pointer(dparams_obj) @reset storage.io = pointer(io_obj) storage_obj = [storage] there_was_an_error = false - GC.@preserve cparams_obj dparams_obj io_obj storage_obj begin + GC.@preserve cparams_obj io_obj storage_obj begin schunk = @ccall libblosc2.blosc2_schunk_new(storage_obj::Ptr{Blosc2Storage})::Ptr{Blosc2SChunk} @assert schunk != Ptr{Blosc2Storage}() diff --git a/LibBlosc2/src/libblosc2.jl b/LibBlosc2/src/libblosc2.jl index 3783b3d..dc081c9 100644 --- a/LibBlosc2/src/libblosc2.jl +++ b/LibBlosc2/src/libblosc2.jl @@ -225,6 +225,21 @@ end ################################################################################ # Functions +const blosc2_initialized = Atomic{Bool}(false) +const blosc2_initialized_lock = ReentrantLock() +# Initialize the Blosc2 library. This function is reentrant and +# idempotent, i.e. 
it can be called called multiple times without +# harm. +function blosc2_init() + blosc2_initialized[] && return + @lock blosc2_initialized_lock begin + blosc2_initialized[] && return + @ccall libblosc2.blosc2_init()::Cvoid + blosc2_initialized[] = true + end + return +end + """ is_compressor_valid(s::AbstractString)::Bool diff --git a/LibBlosc2/test/runtests.jl b/LibBlosc2/test/runtests.jl index 3d9b9a2..7ffbe69 100644 --- a/LibBlosc2/test/runtests.jl +++ b/LibBlosc2/test/runtests.jl @@ -41,17 +41,17 @@ end @testset "invalid options" begin @test Blosc2EncodeOptions(; clevel=-1).clevel == 0 @test Blosc2EncodeOptions(; clevel=100).clevel == 9 - # typesize can be anything, but out of the range it gets set to 1 + # typesize can be anything, but out of the range it gets set to 8 (the default) e = Blosc2EncodeOptions(; typesize=typemax(UInt128)) - @test e.typesize == 1 + @test e.typesize == 8 e = Blosc2EncodeOptions(; typesize=0) - @test e.typesize == 1 + @test e.typesize == 8 e = Blosc2EncodeOptions(; typesize=-1) - @test e.typesize == 1 + @test e.typesize == 8 e = Blosc2EncodeOptions(; typesize=ChunkCodecLibBlosc2.BLOSC_MAX_TYPESIZE) @test e.typesize == ChunkCodecLibBlosc2.BLOSC_MAX_TYPESIZE e = Blosc2EncodeOptions(; typesize=(ChunkCodecLibBlosc2.BLOSC_MAX_TYPESIZE+1)) - @test e.typesize == 1 + @test e.typesize == 8 @test_throws ArgumentError Blosc2EncodeOptions(; compressor="") @test_throws ArgumentError Blosc2EncodeOptions(; compressor="asfdgfsdgrwwea") @test_throws ArgumentError Blosc2EncodeOptions(; compressor="blosclz,") @@ -73,7 +73,7 @@ end # check Blosc2DecodingError prints the correct error message @test sprint(Base.showerror, Blosc2DecodingError()) == "Blosc2DecodingError: blosc2 compressed buffer cannot be decoded" # check that a truncated buffer throws a Blosc2DecodingError - u = UInt8[0x00] + u = zeros(UInt8, 8) c = encode(Blosc2EncodeOptions(), u) @test_throws Blosc2DecodingError decode(Blosc2DecodeOptions(), c[1:(end - 1)]) @test_throws 
Blosc2DecodingError decode(Blosc2DecodeOptions(), UInt8[0x00]) From d757e5ce2e84508fb6b67f4589e8fa31d5e5b92c Mon Sep 17 00:00:00 2001 From: Erik Schnetter Date: Tue, 10 Jun 2025 12:50:36 -0400 Subject: [PATCH 07/14] Blosc2 is not thread-safe --- LibBlosc2/src/decode.jl | 3 --- LibBlosc2/src/encode.jl | 3 --- 2 files changed, 6 deletions(-) diff --git a/LibBlosc2/src/decode.jl b/LibBlosc2/src/decode.jl index 204a36f..0a5fb07 100644 --- a/LibBlosc2/src/decode.jl +++ b/LibBlosc2/src/decode.jl @@ -40,9 +40,6 @@ function Blosc2DecodeOptions(; codec::Blosc2Codec=Blosc2Codec(), return Blosc2DecodeOptions(codec, _nthreads) end -# This decoder is thread safe: We don't use any of Blosc2's global variables. -is_thread_safe(::Blosc2DecodeOptions) = true - function try_find_decoded_size(::Blosc2DecodeOptions, src::AbstractVector{UInt8})::Int64 check_contiguous(src) diff --git a/LibBlosc2/src/encode.jl b/LibBlosc2/src/encode.jl index e95826c..d20c733 100644 --- a/LibBlosc2/src/encode.jl +++ b/LibBlosc2/src/encode.jl @@ -131,9 +131,6 @@ function Blosc2EncodeOptions(; _doshuffle, _dodelta, _typesize, _clevel, _compressor, _blocksize, _nthreads, _splitmode, _chunksize) end -# This encoder is thread safe: We don't use any of Blosc2's global variables. 
-is_thread_safe(::Blosc2EncodeOptions) = true - # The maximum overhead for the schunk const MAX_SCHUNK_OVERHEAD = 172 # apparently undocumented -- just a guess From 9a80e1c77f22feea38cbd621f2b691f6c0012e95 Mon Sep 17 00:00:00 2001 From: Erik Schnetter Date: Tue, 10 Jun 2025 12:52:01 -0400 Subject: [PATCH 08/14] CI: Add debug output, build only Windows --- .github/workflows/.#CI.yml | 1 + .github/workflows/CI.yml | 78 +++++++++++++++++++------------------- LibBlosc2/test/runtests.jl | 10 +++++ 3 files changed, 50 insertions(+), 39 deletions(-) create mode 120000 .github/workflows/.#CI.yml diff --git a/.github/workflows/.#CI.yml b/.github/workflows/.#CI.yml new file mode 120000 index 0000000..f0ad493 --- /dev/null +++ b/.github/workflows/.#CI.yml @@ -0,0 +1 @@ +eschnett@Redshift-763.local.2401:1747147887 \ No newline at end of file diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index a40b1ec..6fcda56 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -28,50 +28,50 @@ jobs: id: filter with: filters: | - ChunkCodecCore: - - .github/** - - ChunkCodecCore/** - - ChunkCodecTests/** - LibBlosc: - - .github/** - - ChunkCodecCore/** - - ChunkCodecTests/** - - LibBlosc/** + #TODO ChunkCodecCore: + #TODO - .github/** + #TODO - ChunkCodecCore/** + #TODO - ChunkCodecTests/** + #TODO LibBlosc: + #TODO - .github/** + #TODO - ChunkCodecCore/** + #TODO - ChunkCodecTests/** + #TODO - LibBlosc/** LibBlosc2: - .github/** - ChunkCodecCore/** - ChunkCodecTests/** - LibBlosc2/** - LibBrotli: - - .github/** - - ChunkCodecCore/** - - ChunkCodecTests/** - - LibBrotli/** - LibBzip2: - - .github/** - - ChunkCodecCore/** - - ChunkCodecTests/** - - LibBzip2/** - LibLz4: - - .github/** - - ChunkCodecCore/** - - ChunkCodecTests/** - - LibLz4/** - LibSnappy: - - .github/** - - ChunkCodecCore/** - - ChunkCodecTests/** - - LibSnappy/** - LibZlib: - - .github/** - - ChunkCodecCore/** - - ChunkCodecTests/** - - LibZlib/** - LibZstd: - - .github/** - - 
ChunkCodecCore/** - - ChunkCodecTests/** - - LibZstd/** + #TODO LibBrotli: + #TODO - .github/** + #TODO - ChunkCodecCore/** + #TODO - ChunkCodecTests/** + #TODO - LibBrotli/** + #TODO LibBzip2: + #TODO - .github/** + #TODO - ChunkCodecCore/** + #TODO - ChunkCodecTests/** + #TODO - LibBzip2/** + #TODO LibLz4: + #TODO - .github/** + #TODO - ChunkCodecCore/** + #TODO - ChunkCodecTests/** + #TODO - LibLz4/** + #TODO LibSnappy: + #TODO - .github/** + #TODO - ChunkCodecCore/** + #TODO - ChunkCodecTests/** + #TODO - LibSnappy/** + #TODO LibZlib: + #TODO - .github/** + #TODO - ChunkCodecCore/** + #TODO - ChunkCodecTests/** + #TODO - LibZlib/** + #TODO LibZstd: + #TODO - .github/** + #TODO - ChunkCodecCore/** + #TODO - ChunkCodecTests/** + #TODO - LibZstd/** test: name: Julia ${{ matrix.version }} - ${{matrix.package}} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} runs-on: ${{ matrix.os }} diff --git a/LibBlosc2/test/runtests.jl b/LibBlosc2/test/runtests.jl index 7ffbe69..1f44b8c 100644 --- a/LibBlosc2/test/runtests.jl +++ b/LibBlosc2/test/runtests.jl @@ -71,16 +71,22 @@ end end @testset "errors" begin # check Blosc2DecodingError prints the correct error message + @show :sprint @test sprint(Base.showerror, Blosc2DecodingError()) == "Blosc2DecodingError: blosc2 compressed buffer cannot be decoded" # check that a truncated buffer throws a Blosc2DecodingError u = zeros(UInt8, 8) + @show :encode1 c = encode(Blosc2EncodeOptions(), u) + @show :decode1 @test_throws Blosc2DecodingError decode(Blosc2DecodeOptions(), c[1:(end - 1)]) + @show :decode2 @test_throws Blosc2DecodingError decode(Blosc2DecodeOptions(), UInt8[0x00]) # check that a buffer with extra data throws a Blosc2DecodingError + @show :decode3 @test_throws Blosc2DecodingError decode(Blosc2DecodeOptions(), [c; 0x00;]) # check corrupting LZ4 encoding throws a Blosc2DecodingError u = zeros(UInt8, 1000) + @show :encode2 c = encode(Blosc2EncodeOptions(), u) c[end-5] = 0x40 @@ -89,16 +95,20 @@ end # 
compressed data.) We check whether at least the decompressed # data are correct. # BROKEN @test_throws Blosc2DecodingError decode(Blosc2DecodeOptions(), c) + @show :decode4 @test decode(Blosc2DecodeOptions(), c) == u # There's more unused/unchecked data c[end-50] = 0x40 # BROKEN @test_throws Blosc2DecodingError decode(Blosc2DecodeOptions(), c) + @show :decode5 @test decode(Blosc2DecodeOptions(), c) == u # Finally, this corruption has an effect c[end-100] = 0x40 + @show :decode6 @test_throws Blosc2DecodingError decode(Blosc2DecodeOptions(), c) + @show :errors end @testset "public" begin if VERSION >= v"1.11.0-DEV.469" From f947489dfc5db35d38ec6bf901d5b103a679f982 Mon Sep 17 00:00:00 2001 From: Erik Schnetter Date: Tue, 10 Jun 2025 12:52:44 -0400 Subject: [PATCH 09/14] CI: Add debug output, build only Windows --- .github/workflows/.#CI.yml | 1 - .github/workflows/CI.yml | 8 ++++---- 2 files changed, 4 insertions(+), 5 deletions(-) delete mode 120000 .github/workflows/.#CI.yml diff --git a/.github/workflows/.#CI.yml b/.github/workflows/.#CI.yml deleted file mode 120000 index f0ad493..0000000 --- a/.github/workflows/.#CI.yml +++ /dev/null @@ -1 +0,0 @@ -eschnett@Redshift-763.local.2401:1747147887 \ No newline at end of file diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 6fcda56..c9d8af8 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -83,13 +83,13 @@ jobs: # e.g. 
['package1', 'package2'] if both package folders contains changes package: ${{ fromJSON(needs.changes.outputs.packages) }} version: - - '1.10' + #TODO - '1.10' - '1' - - 'pre' + #TODO - 'pre' os: - - ubuntu-latest + #TODO - ubuntu-latest - windows-latest - - macos-latest + #TODO - macos-latest arch: - 'default' - 'x86' From 1b68df1c85b64625e4e6ef06edf40e583cec31e5 Mon Sep 17 00:00:00 2001 From: Erik Schnetter Date: Tue, 10 Jun 2025 13:15:01 -0400 Subject: [PATCH 10/14] CI: Avoid segfault --- LibBlosc2/src/decode.jl | 2 +- LibBlosc2/test/runtests.jl | 17 ++++++----------- 2 files changed, 7 insertions(+), 12 deletions(-) diff --git a/LibBlosc2/src/decode.jl b/LibBlosc2/src/decode.jl index 0a5fb07..8e12d8c 100644 --- a/LibBlosc2/src/decode.jl +++ b/LibBlosc2/src/decode.jl @@ -61,7 +61,7 @@ function try_find_decoded_size(::Blosc2DecodeOptions, src::AbstractVector{UInt8} return total_nbytes::Int64 end -#TODO: implement `try_resize_decode!` +# Note: We should implement `try_resize_decode!` function try_decode!(d::Blosc2DecodeOptions, dst::AbstractVector{UInt8}, src::AbstractVector{UInt8}; kwargs...)::Union{Nothing,Int64} diff --git a/LibBlosc2/test/runtests.jl b/LibBlosc2/test/runtests.jl index 1f44b8c..cbdf6a0 100644 --- a/LibBlosc2/test/runtests.jl +++ b/LibBlosc2/test/runtests.jl @@ -71,22 +71,16 @@ end end @testset "errors" begin # check Blosc2DecodingError prints the correct error message - @show :sprint @test sprint(Base.showerror, Blosc2DecodingError()) == "Blosc2DecodingError: blosc2 compressed buffer cannot be decoded" # check that a truncated buffer throws a Blosc2DecodingError u = zeros(UInt8, 8) - @show :encode1 c = encode(Blosc2EncodeOptions(), u) - @show :decode1 @test_throws Blosc2DecodingError decode(Blosc2DecodeOptions(), c[1:(end - 1)]) - @show :decode2 @test_throws Blosc2DecodingError decode(Blosc2DecodeOptions(), UInt8[0x00]) # check that a buffer with extra data throws a Blosc2DecodingError - @show :decode3 @test_throws Blosc2DecodingError 
decode(Blosc2DecodeOptions(), [c; 0x00;]) # check corrupting LZ4 encoding throws a Blosc2DecodingError u = zeros(UInt8, 1000) - @show :encode2 c = encode(Blosc2EncodeOptions(), u) c[end-5] = 0x40 @@ -95,20 +89,21 @@ end # compressed data.) We check whether at least the decompressed # data are correct. # BROKEN @test_throws Blosc2DecodingError decode(Blosc2DecodeOptions(), c) - @show :decode4 @test decode(Blosc2DecodeOptions(), c) == u # There's more unused/unchecked data c[end-50] = 0x40 # BROKEN @test_throws Blosc2DecodingError decode(Blosc2DecodeOptions(), c) - @show :decode5 @test decode(Blosc2DecodeOptions(), c) == u # Finally, this corruption has an effect c[end-100] = 0x40 - @show :decode6 - @test_throws Blosc2DecodingError decode(Blosc2DecodeOptions(), c) - @show :errors + # Windows segfaults in this call with exit code 3221226356, + # indicating a heap corruption. That's clearly a bug in c-blosc2. + # It seems c-blosc2 does not checksum its compressed data. + if !Sys.iswindows() + @test_throws Blosc2DecodingError decode(Blosc2DecodeOptions(), c) + end end @testset "public" begin if VERSION >= v"1.11.0-DEV.469" From 08e8e6d5a0f673d7cb6588b9d73deef1e63e6698 Mon Sep 17 00:00:00 2001 From: Erik Schnetter Date: Tue, 10 Jun 2025 13:15:50 -0400 Subject: [PATCH 11/14] CI: Avoid segfault --- .github/workflows/CI.yml | 86 ++++++++++++++++++++-------------------- 1 file changed, 43 insertions(+), 43 deletions(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index c9d8af8..a40b1ec 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -28,50 +28,50 @@ jobs: id: filter with: filters: | - #TODO ChunkCodecCore: - #TODO - .github/** - #TODO - ChunkCodecCore/** - #TODO - ChunkCodecTests/** - #TODO LibBlosc: - #TODO - .github/** - #TODO - ChunkCodecCore/** - #TODO - ChunkCodecTests/** - #TODO - LibBlosc/** + ChunkCodecCore: + - .github/** + - ChunkCodecCore/** + - ChunkCodecTests/** + LibBlosc: + - .github/** + - ChunkCodecCore/** + - 
ChunkCodecTests/** + - LibBlosc/** LibBlosc2: - .github/** - ChunkCodecCore/** - ChunkCodecTests/** - LibBlosc2/** - #TODO LibBrotli: - #TODO - .github/** - #TODO - ChunkCodecCore/** - #TODO - ChunkCodecTests/** - #TODO - LibBrotli/** - #TODO LibBzip2: - #TODO - .github/** - #TODO - ChunkCodecCore/** - #TODO - ChunkCodecTests/** - #TODO - LibBzip2/** - #TODO LibLz4: - #TODO - .github/** - #TODO - ChunkCodecCore/** - #TODO - ChunkCodecTests/** - #TODO - LibLz4/** - #TODO LibSnappy: - #TODO - .github/** - #TODO - ChunkCodecCore/** - #TODO - ChunkCodecTests/** - #TODO - LibSnappy/** - #TODO LibZlib: - #TODO - .github/** - #TODO - ChunkCodecCore/** - #TODO - ChunkCodecTests/** - #TODO - LibZlib/** - #TODO LibZstd: - #TODO - .github/** - #TODO - ChunkCodecCore/** - #TODO - ChunkCodecTests/** - #TODO - LibZstd/** + LibBrotli: + - .github/** + - ChunkCodecCore/** + - ChunkCodecTests/** + - LibBrotli/** + LibBzip2: + - .github/** + - ChunkCodecCore/** + - ChunkCodecTests/** + - LibBzip2/** + LibLz4: + - .github/** + - ChunkCodecCore/** + - ChunkCodecTests/** + - LibLz4/** + LibSnappy: + - .github/** + - ChunkCodecCore/** + - ChunkCodecTests/** + - LibSnappy/** + LibZlib: + - .github/** + - ChunkCodecCore/** + - ChunkCodecTests/** + - LibZlib/** + LibZstd: + - .github/** + - ChunkCodecCore/** + - ChunkCodecTests/** + - LibZstd/** test: name: Julia ${{ matrix.version }} - ${{matrix.package}} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} runs-on: ${{ matrix.os }} @@ -83,13 +83,13 @@ jobs: # e.g. 
['package1', 'package2'] if both package folders contains changes package: ${{ fromJSON(needs.changes.outputs.packages) }} version: - #TODO - '1.10' + - '1.10' - '1' - #TODO - 'pre' + - 'pre' os: - #TODO - ubuntu-latest + - ubuntu-latest - windows-latest - #TODO - macos-latest + - macos-latest arch: - 'default' - 'x86' From 10744cb8c6989b5963857bc676828f7845d92b5a Mon Sep 17 00:00:00 2001 From: Nathan Zimmerberg <39104088+nhz2@users.noreply.github.com> Date: Thu, 12 Jun 2025 00:41:52 -0400 Subject: [PATCH 12/14] Update CI.yml --- .github/workflows/CI.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 3be74dc..7dd8105 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -36,7 +36,6 @@ jobs: - ChunkCodecTests/** - LibBlosc/** LibBlosc2: - - .github/** - ChunkCodecCore/** - ChunkCodecTests/** - LibBlosc2/** From a5e48728874b1ed4b7f4249ee74efe852ec32b13 Mon Sep 17 00:00:00 2001 From: Erik Schnetter Date: Sat, 5 Jul 2025 13:01:48 -0400 Subject: [PATCH 13/14] LibBlosc2: Improve error handling --- LibBlosc2/README.md | 12 ++++-- LibBlosc2/src/ChunkCodecLibBlosc2.jl | 13 +++--- LibBlosc2/src/decode.jl | 44 ++++++++++++------- LibBlosc2/src/encode.jl | 64 +++++++++++++++++----------- LibBlosc2/src/libblosc2.jl | 2 +- LibBlosc2/test/runtests.jl | 10 ++--- 6 files changed, 88 insertions(+), 57 deletions(-) diff --git a/LibBlosc2/README.md b/LibBlosc2/README.md index ecb7e24..40a8a72 100644 --- a/LibBlosc2/README.md +++ b/LibBlosc2/README.md @@ -5,18 +5,24 @@ This package implements the ChunkCodec interface for the following encoders and decoders using the c-blosc2 library -1. `Blosc2Codec`, `Blosc2EncodeOptions`, `Blosc2DecodeOptions` +1. `Blosc2CFrame`, `Blosc2EncodeOptions`, `Blosc2DecodeOptions` + +Note: It appears that the [Blosc2 Contiguous Frame +Format](https://www.blosc.org/c-blosc2/format/cframe_format.html) is +not fully protected by checksums. 
The [`c-blosc2` +library](https://www.blosc.org/c-blosc2) may crash (segfault) for +invalid inputs. ## Example ```julia-repl julia> using ChunkCodecLibBlosc2 -julia> data = [0x00, 0x01, 0x02, 0x03]; +julia> data = collect(0x00:0x07); julia> compressed_data = encode(Blosc2EncodeOptions(), data); -julia> decompressed_data = decode(Blosc2Codec(), compressed_data; max_size=length(data), size_hint=length(data)); +julia> decompressed_data = decode(Blosc2CFrame(), compressed_data; max_size=length(data), size_hint=length(data)); julia> data == decompressed_data true diff --git a/LibBlosc2/src/ChunkCodecLibBlosc2.jl b/LibBlosc2/src/ChunkCodecLibBlosc2.jl index d0fe448..ef90504 100644 --- a/LibBlosc2/src/ChunkCodecLibBlosc2.jl +++ b/LibBlosc2/src/ChunkCodecLibBlosc2.jl @@ -1,9 +1,8 @@ module ChunkCodecLibBlosc2 -using Base.Libc: free using Base.Threads -using Accessors +using Accessors: @reset using Blosc2_jll: libblosc2 @@ -22,7 +21,7 @@ import ChunkCodecCore: try_find_decoded_size, decoded_size_range -export Blosc2Codec, +export Blosc2CFrame, Blosc2EncodeOptions, Blosc2DecodeOptions, Blosc2DecodingError @@ -38,8 +37,8 @@ export ChunkCodecCore, encode, decode include("libblosc2.jl") """ - struct Blosc2Codec <: Codec - Blosc2Codec() + struct Blosc2CFrame <: Codec + Blosc2CFrame() Blosc2 compression using c-blosc2 library: https://github.com/Blosc2/c-blosc2 @@ -49,8 +48,8 @@ Decoding also does not accept truncated data, or multiple compressed blocks conc [`Blosc2EncodeOptions`](@ref) and [`Blosc2DecodeOptions`](@ref) can be used to set decoding and encoding options. 
""" -struct Blosc2Codec <: Codec end -decode_options(::Blosc2Codec) = Blosc2DecodeOptions() +struct Blosc2CFrame <: Codec end +decode_options(::Blosc2CFrame) = Blosc2DecodeOptions() include("encode.jl") include("decode.jl") diff --git a/LibBlosc2/src/decode.jl b/LibBlosc2/src/decode.jl index 8e12d8c..c8fc42d 100644 --- a/LibBlosc2/src/decode.jl +++ b/LibBlosc2/src/decode.jl @@ -4,10 +4,11 @@ Error for data that cannot be decoded. """ struct Blosc2DecodingError <: DecodingError + code::Cint end function Base.showerror(io::IO, err::Blosc2DecodingError) - print(io, "Blosc2DecodingError: blosc2 compressed buffer cannot be decoded") + print(io, "Blosc2DecodingError: blosc2 compressed buffer cannot be decoded, error code: $(err.code)") return nothing end @@ -19,19 +20,15 @@ Blosc2 decompression using c-blosc2 library: https://github.com/Blosc/c-blosc2 # Keyword Arguments -- `codec::Blosc2Codec=Blosc2Codec()` - -# Keyword Arguments - -- `codec::Blosc2Codec=Blosc2Codec()` -- `nthreads::Integer=1`: The number of threads to use +- `codec::Blosc2CFrame = Blosc2CFrame()` +- `nthreads::Integer = 1`: The number of threads to use """ struct Blosc2DecodeOptions <: DecodeOptions - codec::Blosc2Codec + codec::Blosc2CFrame nthreads::Int end -function Blosc2DecodeOptions(; codec::Blosc2Codec=Blosc2Codec(), +function Blosc2DecodeOptions(; codec::Blosc2CFrame=Blosc2CFrame(), nthreads::Integer=1, kwargs...) 
_nthreads = nthreads @@ -49,14 +46,17 @@ function try_find_decoded_size(::Blosc2DecodeOptions, src::AbstractVector{UInt8} schunk = @ccall libblosc2.blosc2_schunk_from_buffer(src::Ptr{UInt8}, length(src)::Int64, copy_cframe::UInt8)::Ptr{Blosc2SChunk} if schunk == Ptr{Blosc2Storage}() # These are not a valid blosc2-encoded data - throw(Blosc2DecodingError()) + throw(Blosc2DecodingError(0)) end @ccall libblosc2.blosc2_schunk_avoid_cframe_free(schunk::Ptr{Blosc2SChunk}, true::UInt8)::Cvoid total_nbytes = unsafe_load(schunk).nbytes success = @ccall libblosc2.blosc2_schunk_free(schunk::Ptr{Cvoid})::Cint - @assert success == 0 + if success != 0 + # Something went wrong + throw(Blosc2DecodingError(0)) + end return total_nbytes::Int64 end @@ -78,7 +78,7 @@ function try_decode!(d::Blosc2DecodeOptions, dst::AbstractVector{UInt8}, src::Ab schunk = @ccall libblosc2.blosc2_schunk_from_buffer(src::Ptr{UInt8}, length(src)::Int64, copy_cframe::UInt8)::Ptr{Blosc2SChunk} if schunk == Ptr{Blosc2Storage}() # These are not a valid blosc2-encoded data - throw(Blosc2DecodingError()) + throw(Blosc2DecodingError(0)) end @ccall libblosc2.blosc2_schunk_avoid_cframe_free(schunk::Ptr{Blosc2SChunk}, true::UInt8)::Cvoid @@ -86,7 +86,10 @@ function try_decode!(d::Blosc2DecodeOptions, dst::AbstractVector{UInt8}, src::Ab if total_nbytes > length(dst) # There is not enough space to decode the data success = @ccall libblosc2.blosc2_schunk_free(schunk::Ptr{Cvoid})::Cint - @assert success == 0 + if success != 0 + # Something went wrong + throw(Blosc2DecodingError(0)) + end return nothing end @@ -98,14 +101,23 @@ function try_decode!(d::Blosc2DecodeOptions, dst::AbstractVector{UInt8}, src::Ab nbytes_left = clamp(total_nbytes - dst_position, Int32) nbytes = @ccall libblosc2.blosc2_schunk_decompress_chunk(schunk::Ptr{Blosc2SChunk}, nchunk::Int64, pointer(dst, dst_position+1)::Ptr{Cvoid}, nbytes_left::Int32)::Cint - @assert nbytes > 0 + if nbytes <= 0 + # There was an error decompressing the data + 
throw(Blosc2DecodingError(nbytes)) + end dst_position += nbytes end - @assert dst_position == total_nbytes + if dst_position != total_nbytes + # The decompressed size is inconsistent + throw(Blosc2DecodingError(0)) + end success = @ccall libblosc2.blosc2_schunk_free(schunk::Ptr{Cvoid})::Cint - @assert success == 0 + if success != 0 + # Something went wrong + throw(Blosc2DecodingError(0)) + end return total_nbytes::Int64 end diff --git a/LibBlosc2/src/encode.jl b/LibBlosc2/src/encode.jl index d20c733..2b58b69 100644 --- a/LibBlosc2/src/encode.jl +++ b/LibBlosc2/src/encode.jl @@ -6,39 +6,39 @@ Blosc2 compression using c-blosc2 library: https://github.com/Blosc2/c-blosc2 # Keyword Arguments -- `codec::Blosc2Codec=Blosc2Codec()` -- `doshuffle::Union{Integer,Symbol,AbstractString}=1`: Whether to use the shuffle filter. +- `codec::Blosc2CFrame = Blosc2CFrame()` +- `doshuffle::Union{Integer,Symbol,AbstractString} = 1`: Whether to use the shuffle filter. Possible values are - `:noshuffle`, `"noshuffle"`, 0: do not shuffle - `:shuffle`, `"shuffle"`, 1: shuffle bytes - `:bitshuffle`, `"bitshuffle"`, 2: shuffle bits (slower but compresses better) -- `dodelta::Union{Integer,Symbol,AbstractString}=1`: Whether to use the delta filter. +- `dodelta::Union{Integer,Symbol,AbstractString} = 0`: Whether to use the delta filter. Possible values are - `:nofilter`, `"nofilter"`, 0: no filter - `:delta`, `"delta"`, 1: use delta filter -- `typesize::Integer=8`: The element size to use when shuffling. `typesize` must be in the range `1:$(BLOSC_MAX_TYPESIZE)`. -- `clevel::Integer=5`: The compression level, between 0 (no compression) and 9 (maximum compression) -- `compressor::AbstractString="blosclz"`: The string representing the type of compressor to use.
+- `clevel::Integer = 5`: The compression level, between 0 (no compression) and 9 (maximum compression) +- `compressor::AbstractString = "blosclz"`: The string representing the type of compressor to use. For example, `"blosclz"`, `"lz4"`, `"lz4hc"`, `"zlib"`, or `"zstd"`. Use `is_compressor_valid` to check if a compressor is supported. -- `blocksize::Integer=0`: Length of block in bytes (0 for automatic choice) -- `nthreads::Integer=1`: The number of threads to use -- `splitmode::Union{Integer,Symbol,AbstractString}=4: Whether blocks should be split or not +- `blocksize::Integer = 0`: Length of block in bytes (0 for automatic choice) +- `nthreads::Integer = 1`: The number of threads to use +- `splitmode::Union{Integer,Symbol,AbstractString} = 4`: Whether blocks should be split or not Possible values are - `:always`, `"always"`, 1 - `:never`, `"never"`, 2 - `:auto`, `"auto"`, 3 - `:forward_compat`, `"forward_compat"`, 4: default setting -- `chunksize::Integer=1024^3`: Chunk size for very large inputs +- `chunksize::Integer = 1024^3`: Chunk size for very large inputs """ struct Blosc2EncodeOptions <: EncodeOptions - codec::Blosc2Codec + codec::Blosc2CFrame doshuffle::Int # :noshuffle, :shuffle, :bitshuffle dodelta::Int # :nofilter, :delta @@ -52,7 +52,7 @@ struct Blosc2EncodeOptions <: EncodeOptions chunksize::Int64 end function Blosc2EncodeOptions(; - codec::Blosc2Codec=Blosc2Codec(), + codec::Blosc2CFrame=Blosc2CFrame(), doshuffle::Union{Integer,Symbol,AbstractString}=1, dodelta::Union{Integer,Symbol,AbstractString}=0, typesize::Integer=8, @@ -176,11 +176,12 @@ function try_encode!(e::Blosc2EncodeOptions, dst::AbstractVector{UInt8}, src::Ab @reset storage.io = pointer(io_obj) storage_obj = [storage] - there_was_an_error = false - GC.@preserve cparams_obj io_obj storage_obj begin schunk = @ccall libblosc2.blosc2_schunk_new(storage_obj::Ptr{Blosc2Storage})::Ptr{Blosc2SChunk} - @assert schunk != Ptr{Blosc2Storage}() + if schunk == Ptr{Blosc2Storage}() + # Allocation
failure + return nothing + end # Break input into chunks for pos in 1:e.chunksize:src_size @@ -189,15 +190,27 @@ function try_encode!(e::Blosc2EncodeOptions, dst::AbstractVector{UInt8}, src::Ab nbytes = length(srcview) nchunks = @ccall libblosc2.blosc2_schunk_append_buffer(schunk::Ptr{Blosc2SChunk}, srcview::Ptr{Cvoid}, nbytes::Int32)::Int64 - @assert nchunks >= 0 - @assert nchunks == (pos-1) ÷ e.chunksize + 1 + if nchunks < 0 + # Internal error in libblosc2, possibly due to invalid input + @ccall libblosc2.blosc2_schunk_free(schunk::Ptr{Blosc2SChunk})::Cint + return nothing + end + if nchunks != (pos-1) ÷ e.chunksize + 1 + # Our accounting went wrong, probably an internal error in libblosc2, possibly due to invalid input + @ccall libblosc2.blosc2_schunk_free(schunk::Ptr{Blosc2SChunk})::Cint + return nothing + end end cframe = Ref{Ptr{UInt8}}() needs_free = Ref{UInt8}() # bool compressed_size = @ccall libblosc2.blosc2_schunk_to_buffer(schunk::Ptr{Blosc2SChunk}, cframe::Ref{Ptr{UInt8}}, needs_free::Ref{UInt8})::Int64 - @assert compressed_size >= 0 + if compressed_size < 0 + # Internal error in libblosc2, possibly due to invalid input + @ccall libblosc2.blosc2_schunk_free(schunk::Ptr{Blosc2SChunk})::Cint + return nothing + end cframe = cframe[] needs_free = Bool(needs_free[]) @@ -208,19 +221,20 @@ function try_encode!(e::Blosc2EncodeOptions, dst::AbstractVector{UInt8}, src::Ab else # Insufficient space to stored compressed data. # We should detect this earlier, already in the loop above. 
- there_was_an_error = true + needs_free && Libc.free(cframe) + @ccall libblosc2.blosc2_schunk_free(schunk::Ptr{Blosc2SChunk})::Cint + return nothing end - success = @ccall libblosc2.blosc2_schunk_free(schunk::Ptr{Blosc2SChunk})::Cint - @assert success == 0 - if needs_free Libc.free(cframe) end - end - if there_was_an_error - return nothing + success = @ccall libblosc2.blosc2_schunk_free(schunk::Ptr{Blosc2SChunk})::Cint + if success != 0 + # Internal error in libblosc2, possibly due to invalid input + return nothing + end end return compressed_size::Int64 diff --git a/LibBlosc2/src/libblosc2.jl b/LibBlosc2/src/libblosc2.jl index dc081c9..44ab872 100644 --- a/LibBlosc2/src/libblosc2.jl +++ b/LibBlosc2/src/libblosc2.jl @@ -259,7 +259,7 @@ Throws an `ArgumentError` if `s` is not the name of a supported algorithm. """ function compcode(s::AbstractString) code = @ccall libblosc2.blosc2_compname_to_compcode(s::Cstring)::Cint - code == -1 && throw(ArgumentError("unrecognized compressor $(repr(s))")) + code < 0 && throw(ArgumentError("unrecognized compressor $(repr(s))")) return Int(code) end diff --git a/LibBlosc2/test/runtests.jl b/LibBlosc2/test/runtests.jl index cbdf6a0..51d01fa 100644 --- a/LibBlosc2/test/runtests.jl +++ b/LibBlosc2/test/runtests.jl @@ -1,7 +1,7 @@ using Random: Random using ChunkCodecLibBlosc2: ChunkCodecLibBlosc2, - Blosc2Codec, + Blosc2CFrame, Blosc2EncodeOptions, Blosc2DecodeOptions, Blosc2DecodingError @@ -15,17 +15,17 @@ Aqua.test_all(ChunkCodecLibBlosc2; persistent_tasks=false) Random.seed!(1234) @testset "default" begin - test_codec(Blosc2Codec(), Blosc2EncodeOptions(), Blosc2DecodeOptions(); trials=100) + test_codec(Blosc2CFrame(), Blosc2EncodeOptions(), Blosc2DecodeOptions(); trials=100) end @testset "typesize" begin for i in 1:50 - test_codec(Blosc2Codec(), Blosc2EncodeOptions(; typesize=i), Blosc2DecodeOptions(); trials=10) + test_codec(Blosc2CFrame(), Blosc2EncodeOptions(; typesize=i), Blosc2DecodeOptions(); trials=10) end end @testset 
"compressors" begin for clevel in 0:9 for compressor in ["blosclz", "lz4", "lz4hc", "zlib", "zstd"] - test_codec(Blosc2Codec(), Blosc2EncodeOptions(; compressor, clevel), Blosc2DecodeOptions(); trials=10) + test_codec(Blosc2CFrame(), Blosc2EncodeOptions(; compressor, clevel), Blosc2DecodeOptions(); trials=10) end end end @@ -71,7 +71,7 @@ end end @testset "errors" begin # check Blosc2DecodingError prints the correct error message - @test sprint(Base.showerror, Blosc2DecodingError()) == "Blosc2DecodingError: blosc2 compressed buffer cannot be decoded" + @test sprint(Base.showerror, Blosc2DecodingError(0)) == "Blosc2DecodingError: blosc2 compressed buffer cannot be decoded, error code: 0" # check that a truncated buffer throws a Blosc2DecodingError u = zeros(UInt8, 8) c = encode(Blosc2EncodeOptions(), u) From 1a4b310993ab9b5c1e29f2f7aa29c7724590d17a Mon Sep 17 00:00:00 2001 From: Erik Schnetter Date: Sat, 5 Jul 2025 13:09:30 -0400 Subject: [PATCH 14/14] Remove Blosc2 from list of registered codecs --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index 91f9ece..df26053 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,6 @@ A consistent Julia interface for lossless encoding and decoding of bytes in memo | BZ2 | .bz2 bzip2 | ChunkCodecLibBzip2 | ✅ | ✅ | | Brotli | .br RFC7932 | ChunkCodecLibBrotli | ✅ | ✅ | | Blosc | | ChunkCodecLibBlosc | ✅ | ✅ | -| Blosc2 | | ChunkCodecLibBlosc2 | ✅ | ✅ | ## Simple encoding and decoding