diff --git a/CMakeLists.txt b/CMakeLists.txt
index 86b80e3..49e6719 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -102,6 +102,7 @@ set(SPARROW_IPC_HEADERS
     ${SPARROW_IPC_INCLUDE_DIR}/sparrow_ipc/arrow_interface/arrow_schema/private_data.hpp
     ${SPARROW_IPC_INCLUDE_DIR}/sparrow_ipc/config/config.hpp
     ${SPARROW_IPC_INCLUDE_DIR}/sparrow_ipc/config/sparrow_ipc_version.hpp
+    ${SPARROW_IPC_INCLUDE_DIR}/sparrow_ipc/compression.hpp
     ${SPARROW_IPC_INCLUDE_DIR}/sparrow_ipc/deserialize_variable_size_binary_array.hpp
     ${SPARROW_IPC_INCLUDE_DIR}/sparrow_ipc/deserialize_fixedsizebinary_array.hpp
     ${SPARROW_IPC_INCLUDE_DIR}/sparrow_ipc/deserialize_primitive_array.hpp
@@ -122,6 +123,7 @@ set(SPARROW_IPC_SRC
     ${SPARROW_IPC_SOURCE_DIR}/arrow_interface/arrow_array/private_data.cpp
     ${SPARROW_IPC_SOURCE_DIR}/arrow_interface/arrow_schema.cpp
     ${SPARROW_IPC_SOURCE_DIR}/arrow_interface/arrow_schema/private_data.cpp
+    ${SPARROW_IPC_SOURCE_DIR}/compression.cpp
     ${SPARROW_IPC_SOURCE_DIR}/deserialize_fixedsizebinary_array.cpp
     ${SPARROW_IPC_SOURCE_DIR}/deserialize_utils.cpp
     ${SPARROW_IPC_SOURCE_DIR}/deserialize.cpp
@@ -239,6 +241,8 @@ target_link_libraries(sparrow-ipc
     PUBLIC
         sparrow::sparrow
         flatbuffers::flatbuffers
+    PRIVATE
+        lz4::lz4
 )
 
 # Ensure generated headers are available when building sparrow-ipc
@@ -297,6 +301,25 @@ if (TARGET flatbuffers)
     endif()
 endif()
 
+if (TARGET lz4)
+    get_target_property(is_imported lz4 IMPORTED)
+    if(NOT is_imported)
+        # This means `lz4` was fetched using FetchContent
+        # We need to export `lz4` target explicitly
+        list(APPEND SPARROW_IPC_EXPORTED_TARGETS lz4)
+    endif()
+endif()
+
+if (TARGET lz4_static)
+    get_target_property(is_imported lz4_static IMPORTED)
+    if(NOT is_imported)
+        # `lz4_static` is needed as this is the actual library
+        # and `lz4` is an interface pointing to it.
+        # If `lz4_shared` is used instead for some reason, modify this accordingly
+        list(APPEND SPARROW_IPC_EXPORTED_TARGETS lz4_static)
+    endif()
+endif()
+
 install(TARGETS ${SPARROW_IPC_EXPORTED_TARGETS}
     EXPORT ${PROJECT_NAME}-targets)
 
diff --git a/cmake/Findlz4.cmake b/cmake/Findlz4.cmake
new file mode 100644
index 0000000..2b9e9c0
--- /dev/null
+++ b/cmake/Findlz4.cmake
@@ -0,0 +1,42 @@
+# Find LZ4 library and headers
+
+# This module defines:
+# LZ4_FOUND - True if lz4 is found
+# LZ4_INCLUDE_DIRS - LZ4 include directories
+# LZ4_LIBRARIES - Libraries needed to use LZ4
+# LZ4_VERSION - LZ4 version number
+#
+
+find_package(PkgConfig)
+if(PKG_CONFIG_FOUND)
+    pkg_check_modules(LZ4 QUIET liblz4)
+    if(NOT LZ4_FOUND)
+        message(STATUS "Did not find 'liblz4.pc', trying 'lz4.pc'")
+        pkg_check_modules(LZ4 QUIET lz4)
+    endif()
+endif()
+
+find_path(LZ4_INCLUDE_DIR lz4.h)
+#    HINTS ${LZ4_INCLUDEDIR} ${LZ4_INCLUDE_DIRS})
+find_library(LZ4_LIBRARY NAMES lz4 liblz4)
+#    HINTS ${LZ4_LIBDIR} ${LZ4_LIBRARY_DIRS})
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(lz4 DEFAULT_MSG
+    LZ4_LIBRARY LZ4_INCLUDE_DIR)
+mark_as_advanced(LZ4_INCLUDE_DIR LZ4_LIBRARY)
+
+set(LZ4_LIBRARIES ${LZ4_LIBRARY})
+set(LZ4_INCLUDE_DIRS ${LZ4_INCLUDE_DIR})
+
+if(LZ4_FOUND AND NOT TARGET lz4::lz4)
+    add_library(lz4::lz4 UNKNOWN IMPORTED)
+    set_target_properties(lz4::lz4 PROPERTIES
+        IMPORTED_LOCATION "${LZ4_LIBRARIES}"
+        INTERFACE_INCLUDE_DIRECTORIES "${LZ4_INCLUDE_DIRS}")
+    if (NOT TARGET LZ4::LZ4 AND TARGET lz4::lz4)
+        add_library(LZ4::LZ4 ALIAS lz4::lz4)
+    endif ()
+endif()
+
+#TODO add version?
diff --git a/cmake/external_dependencies.cmake b/cmake/external_dependencies.cmake index 0276425..2bc20fd 100644 --- a/cmake/external_dependencies.cmake +++ b/cmake/external_dependencies.cmake @@ -11,8 +11,8 @@ endif() function(find_package_or_fetch) set(options) - set(oneValueArgs CONAN_PKG_NAME PACKAGE_NAME GIT_REPOSITORY TAG) - set(multiValueArgs) + set(oneValueArgs CONAN_PKG_NAME PACKAGE_NAME GIT_REPOSITORY TAG SOURCE_SUBDIR) + set(multiValueArgs CMAKE_ARGS) cmake_parse_arguments(PARSE_ARGV 0 arg "${options}" "${oneValueArgs}" "${multiValueArgs}" ) @@ -29,7 +29,14 @@ function(find_package_or_fetch) if(FETCH_DEPENDENCIES_WITH_CMAKE STREQUAL "ON" OR FETCH_DEPENDENCIES_WITH_CMAKE STREQUAL "MISSING") if(NOT ${actual_pkg_name}_FOUND) message(STATUS "📦 Fetching ${arg_PACKAGE_NAME}") - FetchContent_Declare( + # Apply CMAKE_ARGS before fetching + foreach(cmake_arg ${arg_CMAKE_ARGS}) + string(REGEX MATCH "^([^=]+)=(.*)$" _ ${cmake_arg}) + if(CMAKE_MATCH_1) + set(${CMAKE_MATCH_1} ${CMAKE_MATCH_2} CACHE BOOL "" FORCE) + endif() + endforeach() + set(fetch_args ${arg_PACKAGE_NAME} GIT_SHALLOW TRUE GIT_REPOSITORY ${arg_GIT_REPOSITORY} @@ -37,6 +44,10 @@ function(find_package_or_fetch) GIT_PROGRESS TRUE SYSTEM EXCLUDE_FROM_ALL) + if(arg_SOURCE_SUBDIR) + list(APPEND fetch_args SOURCE_SUBDIR ${arg_SOURCE_SUBDIR}) + endif() + FetchContent_Declare(${fetch_args}) FetchContent_MakeAvailable(${arg_PACKAGE_NAME}) message(STATUS "\t✅ Fetched ${arg_PACKAGE_NAME}") else() @@ -79,6 +90,25 @@ if(NOT TARGET flatbuffers::flatbuffers) endif() unset(FLATBUFFERS_BUILD_TESTS CACHE) +# Fetching lz4 +# Disable bundled mode to allow shared libraries if needed +# lz4 is built as static by default if bundled +# set(LZ4_BUNDLED_MODE OFF CACHE BOOL "" FORCE) +# set(BUILD_SHARED_LIBS ON CACHE BOOL "" FORCE) +find_package_or_fetch( + PACKAGE_NAME lz4 + GIT_REPOSITORY https://github.com/lz4/lz4.git + TAG v1.10.0 + SOURCE_SUBDIR build/cmake + CMAKE_ARGS + "LZ4_BUILD_CLI=OFF" + "LZ4_BUILD_LEGACY_LZ4C=OFF" +) + +if(NOT TARGET lz4::lz4) + add_library(lz4::lz4 ALIAS lz4) +endif() + if(SPARROW_IPC_BUILD_TESTS) find_package_or_fetch( PACKAGE_NAME doctest @@ -109,10 +139,18 @@ if(SPARROW_IPC_BUILD_TESTS) ) message(STATUS "\t✅ Fetched arrow-testing") - # Iterate over all the files in the arrow-testing-data source directiory. When it's a gz, extract in place. - file(GLOB_RECURSE arrow_testing_data_targz_files CONFIGURE_DEPENDS + # Fetch all the files in the 1.0.0-littleendian directory + file(GLOB_RECURSE arrow_testing_data_targz_files_littleendian CONFIGURE_DEPENDS "${arrow-testing_SOURCE_DIR}/data/arrow-ipc-stream/integration/1.0.0-littleendian/*.json.gz" ) + # Fetch all the files in the 2.0.0-compression directory + file(GLOB_RECURSE arrow_testing_data_targz_files_compression CONFIGURE_DEPENDS + "${arrow-testing_SOURCE_DIR}/data/arrow-ipc-stream/integration/2.0.0-compression/*.json.gz" + ) + + # Combine lists of files + list(APPEND arrow_testing_data_targz_files ${arrow_testing_data_targz_files_littleendian} ${arrow_testing_data_targz_files_compression}) + # Iterate over all the files in the arrow-testing-data source directory. When it's a gz, extract in place. 
foreach(file_path IN LISTS arrow_testing_data_targz_files) cmake_path(GET file_path PARENT_PATH parent_dir) cmake_path(GET file_path STEM filename) @@ -128,5 +166,4 @@ if(SPARROW_IPC_BUILD_TESTS) endif() endif() endforeach() - endif() diff --git a/conanfile.py b/conanfile.py index 59916f8..e2f251a 100644 --- a/conanfile.py +++ b/conanfile.py @@ -45,6 +45,8 @@ def configure(self): def requirements(self): self.requires("sparrow/1.0.0") self.requires(f"flatbuffers/{self._flatbuffers_version}") + self.requires("lz4/1.9.4") + #self.requires("zstd/1.5.5") if self.options.get_safe("build_tests"): self.test_requires("doctest/2.4.12") diff --git a/environment-dev.yml b/environment-dev.yml index 7a3f086..b05e4fd 100644 --- a/environment-dev.yml +++ b/environment-dev.yml @@ -8,8 +8,10 @@ dependencies: - cxx-compiler # Libraries dependencies - flatbuffers + - lz4 - nlohmann_json - sparrow-devel >=1.1.2 + # Testing dependencies - doctest # Documentation dependencies - doxygen diff --git a/include/sparrow_ipc/arrow_interface/arrow_array.hpp b/include/sparrow_ipc/arrow_interface/arrow_array.hpp index 2f1f72d..4faecf4 100644 --- a/include/sparrow_ipc/arrow_interface/arrow_array.hpp +++ b/include/sparrow_ipc/arrow_interface/arrow_array.hpp @@ -1,34 +1,86 @@ - #pragma once -#include +#include #include +#include #include "sparrow_ipc/config/config.hpp" +#include "sparrow_ipc/arrow_interface/arrow_array/private_data.hpp" namespace sparrow_ipc { - [[nodiscard]] SPARROW_IPC_API ArrowArray make_non_owning_arrow_array( + SPARROW_IPC_API void release_arrow_array_children_and_dictionary(ArrowArray* array); + + template + void arrow_array_release(ArrowArray* array) + { + SPARROW_ASSERT_TRUE(array != nullptr) + SPARROW_ASSERT_TRUE(array->release == std::addressof(arrow_array_release)) + + SPARROW_ASSERT_TRUE(array->private_data != nullptr); + + delete static_cast(array->private_data); + array->private_data = nullptr; + array->buffers = nullptr; // The buffers were deleted with the private data + + release_arrow_array_children_and_dictionary(array); + array->release = nullptr; + } + + template + void fill_arrow_array( + ArrowArray& array, int64_t length, int64_t null_count, int64_t offset, - std::vector&& buffers, size_t children_count, ArrowArray** children, - ArrowArray* dictionary - ); + ArrowArray* dictionary, + Arg&& private_data_arg + ) + { + SPARROW_ASSERT_TRUE(length >= 0); + SPARROW_ASSERT_TRUE(null_count >= -1); + SPARROW_ASSERT_TRUE(offset >= 0); - SPARROW_IPC_API void release_non_owning_arrow_array(ArrowArray* array); + array.length = length; + array.null_count = null_count; + array.offset = offset; + array.n_children = static_cast(children_count); + array.children = children; + array.dictionary = dictionary; - SPARROW_IPC_API void fill_non_owning_arrow_array( - ArrowArray& array, + auto private_data = new T(std::forward(private_data_arg)); + array.private_data = private_data; + array.n_buffers = private_data->n_buffers(); + array.buffers = private_data->buffers_ptrs(); + + array.release = &arrow_array_release; + } + + template + [[nodiscard]] ArrowArray make_arrow_array( int64_t length, int64_t null_count, int64_t offset, - std::vector&& buffers, size_t children_count, ArrowArray** children, - ArrowArray* dictionary - ); -} \ No newline at end of file + ArrowArray* dictionary, + Arg&& private_data_arg + ) + { + ArrowArray array{}; + fill_arrow_array( + array, + length, + null_count, + offset, + children_count, + children, + dictionary, + std::forward(private_data_arg) + ); + return array; + } +} diff 
--git a/include/sparrow_ipc/arrow_interface/arrow_array/private_data.hpp b/include/sparrow_ipc/arrow_interface/arrow_array/private_data.hpp index 90e633f..5ad6c90 100644 --- a/include/sparrow_ipc/arrow_interface/arrow_array/private_data.hpp +++ b/include/sparrow_ipc/arrow_interface/arrow_array/private_data.hpp @@ -1,5 +1,5 @@ #pragma once - +#include #include #include @@ -7,19 +7,40 @@ namespace sparrow_ipc { + template + concept ArrowPrivateData = requires(T& t) + { + { t.buffers_ptrs() } -> std::same_as; + { t.n_buffers() } -> std::convertible_to; + }; + + class owning_arrow_array_private_data + { + public: + + explicit owning_arrow_array_private_data(std::vector>&& buffers); + + [[nodiscard]] SPARROW_IPC_API const void** buffers_ptrs() noexcept; + [[nodiscard]] SPARROW_IPC_API std::size_t n_buffers() const noexcept; + + private: + std::vector> m_buffers; + std::vector m_buffer_pointers; + }; + class non_owning_arrow_array_private_data { public: explicit constexpr non_owning_arrow_array_private_data(std::vector&& buffers_pointers) - : m_buffers_pointers(std::move(buffers_pointers)) + : m_buffer_pointers(std::move(buffers_pointers)) { } [[nodiscard]] SPARROW_IPC_API const void** buffers_ptrs() noexcept; + [[nodiscard]] SPARROW_IPC_API std::size_t n_buffers() const noexcept; private: - - std::vector m_buffers_pointers; + std::vector m_buffer_pointers; }; } diff --git a/include/sparrow_ipc/compression.hpp b/include/sparrow_ipc/compression.hpp new file mode 100644 index 0000000..47d0cc3 --- /dev/null +++ b/include/sparrow_ipc/compression.hpp @@ -0,0 +1,23 @@ +#pragma once + +#include +#include +#include + +#include "Message_generated.h" + +namespace sparrow_ipc +{ +// TODO use these later if needed for wrapping purposes (flatbuffers/lz4) +// enum class CompressionType +// { +// NONE, +// LZ4, +// ZSTD +// }; + +// CompressionType to_compression_type(org::apache::arrow::flatbuf::CompressionType compression_type); + + std::vector compress(const org::apache::arrow::flatbuf::CompressionType compression_type, std::span data); + std::vector decompress(const org::apache::arrow::flatbuf::CompressionType compression_type, std::span data); +} diff --git a/include/sparrow_ipc/deserialize_primitive_array.hpp b/include/sparrow_ipc/deserialize_primitive_array.hpp index a1c5dad..76f7212 100644 --- a/include/sparrow_ipc/deserialize_primitive_array.hpp +++ b/include/sparrow_ipc/deserialize_primitive_array.hpp @@ -34,28 +34,45 @@ namespace sparrow_ipc nullptr, nullptr ); - const auto [bitmap_ptr, null_count] = utils::get_bitmap_pointer_and_null_count( - record_batch, - body, - buffer_index++ - ); - const auto primitive_buffer_metadata = record_batch.buffers()->Get(buffer_index++); - if (body.size() < (primitive_buffer_metadata->offset() + primitive_buffer_metadata->length())) + + const auto compression = record_batch.compression(); + std::vector> decompressed_buffers; + + auto validity_buffer_span = utils::get_and_decompress_buffer(record_batch, body, buffer_index, compression, decompressed_buffers); + + const auto [bitmap_ptr, null_count] = utils::get_bitmap_pointer_and_null_count(validity_buffer_span, record_batch.length()); + + auto data_buffer_span = utils::get_and_decompress_buffer(record_batch, body, buffer_index, compression, decompressed_buffers); + + ArrowArray array; + if (compression) { - throw std::runtime_error("Primitive buffer exceeds body size"); + array = make_arrow_array( + record_batch.length(), + null_count, + 0, + 0, + nullptr, + nullptr, + std::move(decompressed_buffers) + ); } - auto 
primitives_ptr = const_cast(body.data() + primitive_buffer_metadata->offset()); - std::vector buffers = {bitmap_ptr, primitives_ptr}; - ArrowArray array = make_non_owning_arrow_array( - record_batch.length(), - null_count, - 0, - std::move(buffers), - 0, - nullptr, - nullptr - ); + else + { + auto primitives_ptr = const_cast(data_buffer_span.data()); + std::vector buffers = {bitmap_ptr, primitives_ptr}; + array = make_arrow_array( + record_batch.length(), + null_count, + 0, + 0, + nullptr, + nullptr, + std::move(buffers) + ); + } + sparrow::arrow_proxy ap{std::move(array), std::move(schema)}; return sparrow::primitive_array{std::move(ap)}; } -} \ No newline at end of file +} diff --git a/include/sparrow_ipc/deserialize_utils.hpp b/include/sparrow_ipc/deserialize_utils.hpp index fc1ca05..36f93ad 100644 --- a/include/sparrow_ipc/deserialize_utils.hpp +++ b/include/sparrow_ipc/deserialize_utils.hpp @@ -2,15 +2,35 @@ #include #include +#include #include #include #include "Message_generated.h" -#include "Schema_generated.h" namespace sparrow_ipc::utils { + /** + * @brief Extracts bitmap pointer and null count from a validity buffer span. + * + * This function calculates the number of null values represented by the bitmap. + * + * @param validity_buffer_span The validity buffer as a byte span. + * @param length The Arrow RecordBatch length (number of values in the array). + * + * @return A pair containing: + * - First: Pointer to the bitmap data (nullptr if buffer is empty) + * - Second: Count of null values in the bitmap (0 if buffer is empty) + * + * @note If the bitmap buffer is empty, returns {nullptr, 0} + * @note The returned pointer is a non-const cast of the original const data + */ + [[nodiscard]] std::pair get_bitmap_pointer_and_null_count( + std::span validity_buffer_span, + const int64_t length + ); + /** * @brief Extracts bitmap pointer and null count from a RecordBatch buffer. * @@ -28,9 +48,35 @@ namespace sparrow_ipc::utils * @note If the bitmap buffer has zero length, returns {nullptr, 0} * @note The returned pointer is a non-const cast of the original const data */ + // TODO to be removed when not used anymore (after adding compression to deserialize_fixedsizebinary_array) [[nodiscard]] std::pair get_bitmap_pointer_and_null_count( const org::apache::arrow::flatbuf::RecordBatch& record_batch, std::span body, size_t index ); -} \ No newline at end of file + + /** + * @brief Extracts a buffer from a RecordBatch and decompresses it if necessary. + * + * This function retrieves a buffer span from the specified index, increments the index, + * and applies decompression if specified. If the buffer is decompressed, the new + * data is stored in `decompressed_storage` and the returned span will point to this new data. + * + * @param record_batch The Arrow RecordBatch containing buffer metadata. + * @param body The raw buffer data as a byte span. + * @param buffer_index The index of the buffer to retrieve. This value is incremented by the function. + * @param compression The compression algorithm to use. If nullptr, no decompression is performed. + * @param decompressed_storage A vector that will be used to store the data of any decompressed buffers. + * + * @return A span viewing the resulting buffer data. This will be a view of the original + * `body` if no decompression occurs, or a view of the newly added buffer in + * `decompressed_storage` if decompression occurs. 
+ */ + [[nodiscard]] std::span get_and_decompress_buffer( + const org::apache::arrow::flatbuf::RecordBatch& record_batch, + std::span body, + size_t& buffer_index, + const org::apache::arrow::flatbuf::BodyCompression* compression, + std::vector>& decompressed_storage + ); +} diff --git a/include/sparrow_ipc/deserialize_variable_size_binary_array.hpp b/include/sparrow_ipc/deserialize_variable_size_binary_array.hpp index f6a5729..623776d 100644 --- a/include/sparrow_ipc/deserialize_variable_size_binary_array.hpp +++ b/include/sparrow_ipc/deserialize_variable_size_binary_array.hpp @@ -31,35 +31,47 @@ namespace sparrow_ipc nullptr, nullptr ); - const auto [bitmap_ptr, null_count] = utils::get_bitmap_pointer_and_null_count( - record_batch, - body, - buffer_index++ - ); - const auto offset_metadata = record_batch.buffers()->Get(buffer_index++); - if ((offset_metadata->offset() + offset_metadata->length()) > body.size()) + const auto compression = record_batch.compression(); + std::vector> decompressed_buffers; + + auto validity_buffer_span = utils::get_and_decompress_buffer(record_batch, body, buffer_index, compression, decompressed_buffers); + + const auto [bitmap_ptr, null_count] = utils::get_bitmap_pointer_and_null_count(validity_buffer_span, record_batch.length()); + + auto offset_buffer_span = utils::get_and_decompress_buffer(record_batch, body, buffer_index, compression, decompressed_buffers); + auto data_buffer_span = utils::get_and_decompress_buffer(record_batch, body, buffer_index, compression, decompressed_buffers); + + ArrowArray array; + if (compression) { - throw std::runtime_error("Offset buffer exceeds body size"); + array = make_arrow_array( + record_batch.length(), + null_count, + 0, + 0, + nullptr, + nullptr, + std::move(decompressed_buffers) + ); } - auto offset_ptr = const_cast(body.data() + offset_metadata->offset()); - const auto buffer_metadata = record_batch.buffers()->Get(buffer_index++); - if ((buffer_metadata->offset() + buffer_metadata->length()) > body.size()) + else { - throw std::runtime_error("Data buffer exceeds body size"); + auto offset_ptr = const_cast(offset_buffer_span.data()); + auto buffer_ptr = const_cast(data_buffer_span.data()); + std::vector buffers = {bitmap_ptr, offset_ptr, buffer_ptr}; + array = make_arrow_array( + record_batch.length(), + null_count, + 0, + 0, + nullptr, + nullptr, + std::move(buffers) + ); } - auto buffer_ptr = const_cast(body.data() + buffer_metadata->offset()); - std::vector buffers = {bitmap_ptr, offset_ptr, buffer_ptr}; - ArrowArray array = make_non_owning_arrow_array( - record_batch.length(), - null_count, - 0, - std::move(buffers), - 0, - nullptr, - nullptr - ); + sparrow::arrow_proxy ap{std::move(array), std::move(schema)}; return T{std::move(ap)}; } -} \ No newline at end of file +} diff --git a/include/sparrow_ipc/serialize.hpp b/include/sparrow_ipc/serialize.hpp index 1ab8003..edc1752 100644 --- a/include/sparrow_ipc/serialize.hpp +++ b/include/sparrow_ipc/serialize.hpp @@ -26,6 +26,7 @@ namespace sparrow_ipc * @tparam R Container type that holds record batches (must support empty(), operator[], begin(), end()) * @param record_batches Collection of record batches to serialize. All batches must have identical * schemas. + * @param compression The compression type to use when serializing * * @return std::vector Binary serialized data containing schema, record batches, and * end-of-stream marker. Returns empty vector if input collection is empty. 
@@ -38,7 +39,7 @@ namespace sparrow_ipc */ template requires std::same_as, sparrow::record_batch> - std::vector serialize(const R& record_batches) + std::vector serialize(const R& record_batches, std::optional compression) { if (record_batches.empty()) { @@ -51,7 +52,7 @@ namespace sparrow_ipc ); } std::vector serialized_schema = serialize_schema_message(record_batches[0]); - std::vector serialized_record_batches = serialize_record_batches_without_schema_message(record_batches); + std::vector serialized_record_batches = serialize_record_batches_without_schema_message(record_batches, compression); serialized_schema.insert( serialized_schema.end(), std::make_move_iterator(serialized_record_batches.begin()), diff --git a/include/sparrow_ipc/serialize_utils.hpp b/include/sparrow_ipc/serialize_utils.hpp index 9ead8ea..b947832 100644 --- a/include/sparrow_ipc/serialize_utils.hpp +++ b/include/sparrow_ipc/serialize_utils.hpp @@ -8,6 +8,7 @@ #include "Message_generated.h" #include "sparrow_ipc/config/config.hpp" +#include "sparrow_ipc/compression.hpp" #include "sparrow_ipc/magic_values.hpp" #include "sparrow_ipc/utils.hpp" @@ -41,11 +42,12 @@ namespace sparrow_ipc * consists of a metadata section followed by a body section containing the actual data. * * @param record_batch The sparrow record batch to be serialized + * @param compression The compression type to use when serializing * @return std::vector A byte vector containing the complete serialized record batch * in Arrow IPC format, ready for transmission or storage */ [[nodiscard]] SPARROW_IPC_API std::vector - serialize_record_batch(const sparrow::record_batch& record_batch); + serialize_record_batch(const sparrow::record_batch& record_batch, std::optional compression); template requires std::same_as, sparrow::record_batch> @@ -59,17 +61,18 @@ namespace sparrow_ipc * * @tparam R The type of the record batch container/range (must be iterable) * @param record_batches A collection of record batches to be serialized + * @param compression The compression type to use when serializing * @return std::vector A byte vector containing the serialized data of all record batches * * @note The function uses move iterators to efficiently transfer the serialized data * from individual record batches to the output vector. */ - [[nodiscard]] std::vector serialize_record_batches_without_schema_message(const R& record_batches) + [[nodiscard]] std::vector serialize_record_batches_without_schema_message(const R& record_batches, std::optional compression) { std::vector output; for (const auto& record_batch : record_batches) { - const auto rb_serialized = serialize_record_batch(record_batch); + const auto rb_serialized = serialize_record_batch(record_batch, compression); output.insert( output.end(), std::make_move_iterator(rb_serialized.begin()), @@ -215,6 +218,24 @@ namespace sparrow_ipc std::vector& nodes ); + /** + * @brief Generates the compressed message body and buffer metadata for a record batch. + * + * This function traverses the record batch, compresses each buffer using the specified + * compression algorithm, and constructs the message body. For each compressed buffer, + * it is prefixed by its 8-byte uncompressed size. Padding is added after each + * compressed buffer to ensure 8-byte alignment. + * + * @param record_batch The record batch to serialize. + * @param compression_type The compression algorithm to use (e.g., LZ4_FRAME, ZSTD). 
+ * @return A std::pair containing: + * - first: A vector of bytes representing the complete compressed message body. + * - second: A vector of FlatBuffer Buffer objects describing the offset and + * size of each buffer within the compressed body. + */ + [[nodiscard]] SPARROW_IPC_API std::pair, std::vector> + generate_compressed_body_and_buffers(const sparrow::record_batch& record_batch, const org::apache::arrow::flatbuf::CompressionType compression_type); + /** * @brief Creates a vector of Apache Arrow FieldNode objects from a record batch. * @@ -333,6 +354,8 @@ namespace sparrow_ipc * @param nodes Vector of field nodes describing the structure and null counts of columns * @param buffers Vector of buffer descriptors containing offset and length information * for the data buffers + * @param body_size The body size + * @param compression The compression type to use when serializing * * @return A FlatBufferBuilder containing the complete serialized message ready for * transmission or storage. The builder is finished and ready to be accessed @@ -345,7 +368,9 @@ namespace sparrow_ipc [[nodiscard]] SPARROW_IPC_API flatbuffers::FlatBufferBuilder get_record_batch_message_builder( const sparrow::record_batch& record_batch, const std::vector& nodes, - const std::vector& buffers + const std::vector& buffers, + const int64_t body_size, + std::optional compression ); /** @@ -359,6 +384,7 @@ namespace sparrow_ipc * - The record batch body containing the actual data buffers * * @param record_batch The sparrow record batch to serialize + * @param compression The compression type to use when serializing * @return std::vector A byte vector containing the serialized record batch * in Arrow IPC format, ready for transmission or storage * @@ -366,7 +392,7 @@ namespace sparrow_ipc * includes both metadata and data portions of the record batch */ [[nodiscard]] SPARROW_IPC_API std::vector - serialize_record_batch(const sparrow::record_batch& record_batch); + serialize_record_batch(const sparrow::record_batch& record_batch, std::optional compression); /** * @brief Adds padding bytes to a buffer to ensure 8-byte alignment. 
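The serialization entry points above now take an optional compression argument. The following is a minimal usage sketch, not part of the patch: it assumes the byte element type of the stream is uint8_t, that sparrow_ipc/deserialize.hpp is the header declaring deserialize_stream(), and that the batches argument is provided by the caller; names marked as assumed may differ in the actual tree.

#include <optional>
#include <span>
#include <vector>

#include "Message_generated.h"           // org::apache::arrow::flatbuf::CompressionType
#include "sparrow_ipc/serialize.hpp"     // serialize(record_batches, compression)
#include "sparrow_ipc/deserialize.hpp"   // assumed header for deserialize_stream()

void roundtrip_example(const std::vector<sparrow::record_batch>& batches)
{
    // No compression: pass std::nullopt for the new parameter.
    const auto plain_stream = sparrow_ipc::serialize(batches, std::nullopt);

    // LZ4 frame compression: every non-empty body buffer is compressed and
    // prefixed with its 8-byte uncompressed size, as described for
    // generate_compressed_body_and_buffers above.
    const auto lz4_stream = sparrow_ipc::serialize(
        batches,
        org::apache::arrow::flatbuf::CompressionType::LZ4_FRAME);

    // Both streams deserialize back to equivalent record batches.
    const auto from_plain = sparrow_ipc::deserialize_stream(std::span(plain_stream));
    const auto from_lz4 = sparrow_ipc::deserialize_stream(std::span(lz4_stream));
}

Passing std::nullopt follows the pre-existing uncompressed path (generate_body, get_buffers, no BodyCompression table), so the wire layout should remain what the library produced before this patch.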
diff --git a/src/arrow_interface/arrow_array.cpp b/src/arrow_interface/arrow_array.cpp index ed0a0f2..a01006b 100644 --- a/src/arrow_interface/arrow_array.cpp +++ b/src/arrow_interface/arrow_array.cpp @@ -1,73 +1,40 @@ #include "sparrow_ipc/arrow_interface/arrow_array.hpp" -#include - #include -#include - -#include "sparrow_ipc/arrow_interface/arrow_array/private_data.hpp" -#include "sparrow_ipc/arrow_interface/arrow_array_schema_common_release.hpp" namespace sparrow_ipc { - void release_non_owning_arrow_array(ArrowArray* array) - { - SPARROW_ASSERT_FALSE(array == nullptr) - SPARROW_ASSERT_TRUE(array->release == std::addressof(release_non_owning_arrow_array)) - - release_common_non_owning_arrow(*array); - array->buffers = nullptr; // The buffers were deleted with the private data - } - - void fill_non_owning_arrow_array( - ArrowArray& array, - int64_t length, - int64_t null_count, - int64_t offset, - std::vector&& buffers, - size_t children_count, - ArrowArray** children, - ArrowArray* dictionary - ) - { - SPARROW_ASSERT_TRUE(length >= 0); - SPARROW_ASSERT_TRUE(null_count >= -1); - SPARROW_ASSERT_TRUE(offset >= 0); - - array.length = length; - array.null_count = null_count; - array.offset = offset; - array.n_buffers = static_cast(buffers.size()); - array.private_data = new non_owning_arrow_array_private_data(std::move(buffers)); - const auto private_data = static_cast(array.private_data); - array.buffers = private_data->buffers_ptrs(); - array.n_children = static_cast(children_count); - array.children = children; - array.dictionary = dictionary; - array.release = release_non_owning_arrow_array; - } - - ArrowArray make_non_owning_arrow_array( - int64_t length, - int64_t null_count, - int64_t offset, - std::vector&& buffers, - size_t children_count, - ArrowArray** children, - ArrowArray* dictionary - ) + void release_arrow_array_children_and_dictionary(ArrowArray* array) { - ArrowArray array{}; - fill_non_owning_arrow_array( - array, - length, - null_count, - offset, - std::move(buffers), - children_count, - children, - dictionary - ); - return array; + SPARROW_ASSERT_TRUE(array != nullptr) + + if (array->children) + { + for (int64_t i = 0; i < array->n_children; ++i) + { + ArrowArray* child = array->children[i]; + if (child) + { + if (child->release) + { + child->release(child); + } + delete child; + child = nullptr; + } + } + delete[] array->children; + array->children = nullptr; + } + + if (array->dictionary) + { + if (array->dictionary->release) + { + array->dictionary->release(array->dictionary); + } + delete array->dictionary; + array->dictionary = nullptr; + } } } diff --git a/src/arrow_interface/arrow_array/private_data.cpp b/src/arrow_interface/arrow_array/private_data.cpp index b133c8e..9c3738b 100644 --- a/src/arrow_interface/arrow_array/private_data.cpp +++ b/src/arrow_interface/arrow_array/private_data.cpp @@ -2,8 +2,33 @@ namespace sparrow_ipc { + owning_arrow_array_private_data::owning_arrow_array_private_data(std::vector>&& buffers) + : m_buffers(std::move(buffers)) + { + m_buffer_pointers.reserve(m_buffers.size()); + for (const auto& buffer : m_buffers) + { + m_buffer_pointers.push_back(buffer.data()); + } + } + + const void** owning_arrow_array_private_data::buffers_ptrs() noexcept + { + return m_buffer_pointers.data(); + } + + std::size_t owning_arrow_array_private_data::n_buffers() const noexcept + { + return m_buffers.size(); + } + const void** non_owning_arrow_array_private_data::buffers_ptrs() noexcept { - return const_cast(reinterpret_cast(m_buffers_pointers.data())); 
+ return const_cast(reinterpret_cast(m_buffer_pointers.data())); + } + + std::size_t non_owning_arrow_array_private_data::n_buffers() const noexcept + { + return m_buffer_pointers.size(); } -} \ No newline at end of file +} diff --git a/src/compression.cpp b/src/compression.cpp new file mode 100644 index 0000000..5170903 --- /dev/null +++ b/src/compression.cpp @@ -0,0 +1,88 @@ +#include + +#include + +#include "sparrow_ipc/compression.hpp" + +namespace sparrow_ipc +{ +// CompressionType to_compression_type(org::apache::arrow::flatbuf::CompressionType compression_type) +// { +// switch (compression_type) +// { +// case org::apache::arrow::flatbuf::CompressionType::LZ4_FRAME: +// return CompressionType::LZ4; +// // case org::apache::arrow::flatbuf::CompressionType::ZSTD: +// // // TODO: Add ZSTD support +// // break; +// default: +// return CompressionType::NONE; +// } +// } + + std::vector compress(const org::apache::arrow::flatbuf::CompressionType compression_type, std::span data) + { + if (data.empty()) + { + return {}; + } + switch (compression_type) + { + case org::apache::arrow::flatbuf::CompressionType::LZ4_FRAME: + { + const std::int64_t uncompressed_size = data.size(); + const size_t max_compressed_size = LZ4F_compressFrameBound(uncompressed_size, nullptr); + std::vector compressed_data(max_compressed_size); + const size_t compressed_size = LZ4F_compressFrame(compressed_data.data(), max_compressed_size, data.data(), uncompressed_size, nullptr); + if (LZ4F_isError(compressed_size)) + { + throw std::runtime_error("Failed to compress data with LZ4 frame format"); + } + compressed_data.resize(compressed_size); + return compressed_data; + } + default: + return {data.begin(), data.end()}; + } + } + + std::vector decompress(const org::apache::arrow::flatbuf::CompressionType compression_type, std::span data) + { + if (data.empty()) + { + return {}; + } + switch (compression_type) + { + case org::apache::arrow::flatbuf::CompressionType::LZ4_FRAME: + { + if (data.size() < 8) + { + throw std::runtime_error("Invalid compressed data: missing decompressed size"); + } + const std::int64_t decompressed_size = *reinterpret_cast(data.data()); + const auto compressed_data = data.subspan(8); + + if (decompressed_size == -1) + { + return {compressed_data.begin(), compressed_data.end()}; + } + + std::vector decompressed_data(decompressed_size); + LZ4F_dctx* dctx = nullptr; + LZ4F_createDecompressionContext(&dctx, LZ4F_VERSION); + size_t compressed_size_in_out = compressed_data.size(); + size_t decompressed_size_in_out = decompressed_size; + size_t result = LZ4F_decompress(dctx, decompressed_data.data(), &decompressed_size_in_out, compressed_data.data(), &compressed_size_in_out, nullptr); + if (LZ4F_isError(result)) + { + throw std::runtime_error("Failed to decompress data with LZ4 frame format"); + } + LZ4F_freeDecompressionContext(dctx); + return decompressed_data; + } + default: + return {data.begin(), data.end()}; + } + } +} diff --git a/src/deserialize.cpp b/src/deserialize.cpp index 0d13072..1776673 100644 --- a/src/deserialize.cpp +++ b/src/deserialize.cpp @@ -49,7 +49,6 @@ namespace sparrow_ipc const std::vector>>& field_metadata ) { - const size_t length = static_cast(record_batch.length()); size_t buffer_index = 0; std::vector arrays; @@ -270,4 +269,4 @@ namespace sparrow_ipc } while (true); return record_batches; } -} \ No newline at end of file +} diff --git a/src/deserialize_fixedsizebinary_array.cpp b/src/deserialize_fixedsizebinary_array.cpp index 63ea213..427f600 100644 --- 
a/src/deserialize_fixedsizebinary_array.cpp +++ b/src/deserialize_fixedsizebinary_array.cpp @@ -2,6 +2,7 @@ namespace sparrow_ipc { + // TODO add compression here and tests (not available for this type in apache arrow integration tests files) sparrow::fixed_width_binary_array deserialize_non_owning_fixedwidthbinary( const org::apache::arrow::flatbuf::RecordBatch& record_batch, std::span body, @@ -33,14 +34,14 @@ namespace sparrow_ipc } auto buffer_ptr = const_cast(body.data() + buffer_metadata->offset()); std::vector buffers = {bitmap_ptr, buffer_ptr}; - ArrowArray array = make_non_owning_arrow_array( + ArrowArray array = make_arrow_array( record_batch.length(), null_count, 0, - std::move(buffers), 0, nullptr, - nullptr + nullptr, + std::move(buffers) ); sparrow::arrow_proxy ap{std::move(array), std::move(schema)}; return sparrow::fixed_width_binary_array{std::move(ap)}; diff --git a/src/deserialize_utils.cpp b/src/deserialize_utils.cpp index d89be6c..3eba4cc 100644 --- a/src/deserialize_utils.cpp +++ b/src/deserialize_utils.cpp @@ -1,7 +1,26 @@ #include "sparrow_ipc/deserialize_utils.hpp" +#include "sparrow_ipc/compression.hpp" + namespace sparrow_ipc::utils { + std::pair get_bitmap_pointer_and_null_count( + std::span validity_buffer_span, + const int64_t length + ) + { + if (validity_buffer_span.empty()) + { + return {nullptr, 0}; + } + auto ptr = const_cast(validity_buffer_span.data()); + const sparrow::dynamic_bitset_view bitmap_view{ + ptr, + static_cast(length) + }; + return {ptr, bitmap_view.null_count()}; + } + std::pair get_bitmap_pointer_and_null_count( const org::apache::arrow::flatbuf::RecordBatch& record_batch, std::span body, @@ -24,4 +43,27 @@ namespace sparrow_ipc::utils }; return {ptr, bitmap_view.null_count()}; } -} \ No newline at end of file + + std::span get_and_decompress_buffer( + const org::apache::arrow::flatbuf::RecordBatch& record_batch, + std::span body, + size_t& buffer_index, + const org::apache::arrow::flatbuf::BodyCompression* compression, + std::vector>& decompressed_storage + ) + { + const auto buffer_metadata = record_batch.buffers()->Get(buffer_index++); + if (body.size() < (buffer_metadata->offset() + buffer_metadata->length())) + { + throw std::runtime_error("Buffer metadata exceeds body size"); + } + auto buffer_span = body.subspan(buffer_metadata->offset(), buffer_metadata->length()); + + if (compression) + { + decompressed_storage.emplace_back(decompress(compression->codec(), buffer_span)); + buffer_span = decompressed_storage.back(); + } + return buffer_span; + } +} diff --git a/src/serialize_utils.cpp b/src/serialize_utils.cpp index ac1e026..d590021 100644 --- a/src/serialize_utils.cpp +++ b/src/serialize_utils.cpp @@ -1,4 +1,5 @@ #include +#include #include "sparrow_ipc/magic_values.hpp" #include "sparrow_ipc/serialize.hpp" @@ -181,6 +182,49 @@ namespace sparrow_ipc return buffers; } + namespace + { + void fill_compressed_body_and_buffers_recursive(const sparrow::arrow_proxy& arrow_proxy, std::vector& body, std::vector& flatbuf_buffers, int64_t& offset, const org::apache::arrow::flatbuf::CompressionType compression_type) + { + for (const auto& buffer : arrow_proxy.buffers()) + { + if (buffer.size() > 0) + { + auto compressed_buffer = compress(compression_type, {buffer.data(), buffer.size()}); + int64_t uncompressed_size = buffer.size(); + body.insert(body.end(), reinterpret_cast(&uncompressed_size), reinterpret_cast(&uncompressed_size) + sizeof(uncompressed_size)); + body.insert(body.end(), compressed_buffer.begin(), compressed_buffer.end()); + 
add_padding(body); + + flatbuf_buffers.emplace_back(offset, sizeof(uncompressed_size) + compressed_buffer.size()); + offset = body.size(); + } + else + { + flatbuf_buffers.emplace_back(offset, 0); + } + } + for (const auto& child : arrow_proxy.children()) + { + fill_compressed_body_and_buffers_recursive(child, body, flatbuf_buffers, offset, compression_type); + } + } + } // namespace + + std::pair, std::vector> + generate_compressed_body_and_buffers(const sparrow::record_batch& record_batch, const org::apache::arrow::flatbuf::CompressionType compression_type) + { + std::vector body; + std::vector flatbuf_buffers; + int64_t offset = 0; + for (const auto& column : record_batch.columns()) + { + const auto& arrow_proxy = sparrow::detail::array_access::get_arrow_proxy(column); + fill_compressed_body_and_buffers_recursive(arrow_proxy, body, flatbuf_buffers, offset, compression_type); + } + return {std::move(body), std::move(flatbuf_buffers)}; + } + void fill_body(const sparrow::arrow_proxy& arrow_proxy, std::vector& body) { for (const auto& buffer : arrow_proxy.buffers()) @@ -236,23 +280,29 @@ namespace sparrow_ipc flatbuffers::FlatBufferBuilder get_record_batch_message_builder( const sparrow::record_batch& record_batch, const std::vector& nodes, - const std::vector& buffers + const std::vector& buffers, + const int64_t body_size, + std::optional compression ) { flatbuffers::FlatBufferBuilder record_batch_builder; auto nodes_offset = record_batch_builder.CreateVectorOfStructs(nodes); auto buffers_offset = record_batch_builder.CreateVectorOfStructs(buffers); + flatbuffers::Offset compression_offset = 0; + if (compression) + { + compression_offset = org::apache::arrow::flatbuf::CreateBodyCompression(record_batch_builder, compression.value(), org::apache::arrow::flatbuf::BodyCompressionMethod::BUFFER); + } const auto record_batch_offset = org::apache::arrow::flatbuf::CreateRecordBatch( record_batch_builder, static_cast(record_batch.nb_rows()), nodes_offset, buffers_offset, - 0, // TODO: Compression + compression_offset, 0 // TODO :variadic buffer Counts ); - const int64_t body_size = calculate_body_size(record_batch); const auto record_batch_message_offset = org::apache::arrow::flatbuf::CreateMessage( record_batch_builder, org::apache::arrow::flatbuf::MetadataVersion::V5, @@ -265,18 +315,41 @@ namespace sparrow_ipc return record_batch_builder; } - std::vector serialize_record_batch(const sparrow::record_batch& record_batch) + std::vector serialize_record_batch(const sparrow::record_batch& record_batch, std::optional compression) { std::vector nodes = create_fieldnodes(record_batch); - std::vector flatbuf_buffers = get_buffers(record_batch); + + std::vector body; + std::vector flatbuf_buffers; + int64_t body_size = 0; + + if (compression) + { + std::tie(body, flatbuf_buffers) = generate_compressed_body_and_buffers(record_batch, compression.value()); + body_size = body.size(); + } + else + { + body = generate_body(record_batch); + flatbuf_buffers = get_buffers(record_batch); + body_size = calculate_body_size(record_batch); + } + flatbuffers::FlatBufferBuilder record_batch_builder = get_record_batch_message_builder( record_batch, nodes, - flatbuf_buffers + flatbuf_buffers, + body_size, + compression ); + const flatbuffers::uoffset_t record_batch_len = record_batch_builder.GetSize(); + const size_t metadata_size = continuation.size() + sizeof(record_batch_len) + record_batch_len; + const size_t padded_metadata_size = utils::align_to_8(metadata_size); + std::vector output; + 
output.reserve(padded_metadata_size + body.size()); + output.insert(output.end(), continuation.begin(), continuation.end()); - const flatbuffers::uoffset_t record_batch_len = record_batch_builder.GetSize(); output.insert( output.end(), reinterpret_cast(&record_batch_len), @@ -288,7 +361,6 @@ namespace sparrow_ipc record_batch_builder.GetBufferPointer() + record_batch_len ); add_padding(output); - std::vector body = generate_body(record_batch); output.insert(output.end(), std::make_move_iterator(body.begin()), std::make_move_iterator(body.end())); return output; } @@ -302,4 +374,4 @@ namespace sparrow_ipc ); } -} \ No newline at end of file +} diff --git a/tests/test_de_serialization_with_files.cpp b/tests/test_de_serialization_with_files.cpp index 8fe825b..13a504f 100644 --- a/tests/test_de_serialization_with_files.cpp +++ b/tests/test_de_serialization_with_files.cpp @@ -21,6 +21,9 @@ const std::filesystem::path arrow_testing_data_dir = ARROW_TESTING_DATA_DIR; const std::filesystem::path tests_resources_files_path = arrow_testing_data_dir / "data" / "arrow-ipc-stream" / "integration" / "1.0.0-littleendian"; +const std::filesystem::path tests_resources_files_path_with_compression = arrow_testing_data_dir / "data" / "arrow-ipc-stream" + / "integration" / "2.0.0-compression"; + const std::vector files_paths_to_test = { tests_resources_files_path / "generated_primitive", tests_resources_files_path / "generated_primitive_large_offsets", @@ -28,6 +31,14 @@ const std::vector files_paths_to_test = { // tests_resources_files_path / "generated_primitive_no_batches" }; +const std::vector files_paths_to_test_with_compression = { + tests_resources_files_path_with_compression / "generated_lz4", + tests_resources_files_path_with_compression/ "generated_uncompressible_lz4" +// tests_resources_files_path_with_compression / "generated_zstd" +// tests_resources_files_path_with_compression/ "generated_uncompressible_zstd" +}; + + size_t get_number_of_batches(const std::filesystem::path& json_path) { std::ifstream json_file(json_path); @@ -162,7 +173,54 @@ TEST_SUITE("Integration tests") std::span(stream_data) ); - const auto serialized_data = sparrow_ipc::serialize(record_batches_from_json); + const auto serialized_data = sparrow_ipc::serialize(record_batches_from_json, std::nullopt); + const auto deserialized_serialized_data = sparrow_ipc::deserialize_stream( + std::span(serialized_data) + ); + compare_record_batches(record_batches_from_stream, deserialized_serialized_data); + } + } + } + + TEST_CASE("Compare record_batch serialization with stream file using LZ4 compression") + { + for (const auto& file_path : files_paths_to_test_with_compression) + { + std::filesystem::path json_path = file_path; + json_path.replace_extension(".json"); + const std::string test_name = "Testing LZ4 compression with " + file_path.filename().string(); + SUBCASE(test_name.c_str()) + { + // Load the JSON file + auto json_data = load_json_file(json_path); + CHECK(json_data != nullptr); + + const size_t num_batches = get_number_of_batches(json_path); + std::vector record_batches_from_json; + for (size_t batch_idx = 0; batch_idx < num_batches; ++batch_idx) + { + INFO("Processing batch " << batch_idx << " of " << num_batches); + record_batches_from_json.emplace_back( + sparrow::json_reader::build_record_batch_from_json(json_data, batch_idx) + ); + } + + // Load stream file + std::filesystem::path stream_file_path = file_path; + stream_file_path.replace_extension(".stream"); + std::ifstream stream_file(stream_file_path, std::ios::in | 
std::ios::binary); + REQUIRE(stream_file.is_open()); + const std::vector stream_data( + (std::istreambuf_iterator(stream_file)), + (std::istreambuf_iterator()) + ); + stream_file.close(); + + // Process the stream file + const auto record_batches_from_stream = sparrow_ipc::deserialize_stream( + std::span(stream_data) + ); + const auto serialized_data = sparrow_ipc::serialize(record_batches_from_json, org::apache::arrow::flatbuf::CompressionType::LZ4_FRAME); const auto deserialized_serialized_data = sparrow_ipc::deserialize_stream( std::span(serialized_data) ); diff --git a/tests/test_serialize_utils.cpp b/tests/test_serialize_utils.cpp index 2997843..237c161 100644 --- a/tests/test_serialize_utils.cpp +++ b/tests/test_serialize_utils.cpp @@ -292,7 +292,8 @@ namespace sparrow_ipc auto record_batch = create_test_record_batch(); auto nodes = create_fieldnodes(record_batch); auto buffers = get_buffers(record_batch); - auto builder = get_record_batch_message_builder(record_batch, nodes, buffers); + auto body_size = calculate_body_size(record_batch); + auto builder = get_record_batch_message_builder(record_batch, nodes, buffers, body_size, std::nullopt); CHECK_GT(builder.GetSize(), 0); CHECK_NE(builder.GetBufferPointer(), nullptr); } @@ -303,7 +304,7 @@ namespace sparrow_ipc SUBCASE("Valid record batch") { auto record_batch = create_test_record_batch(); - auto serialized = serialize_record_batch(record_batch); + auto serialized = serialize_record_batch(record_batch, std::nullopt); CHECK_GT(serialized.size(), 0); // Check that it starts with continuation bytes @@ -335,7 +336,7 @@ namespace sparrow_ipc SUBCASE("Empty record batch") { auto empty_batch = sp::record_batch({}); - auto serialized = serialize_record_batch(empty_batch); + auto serialized = serialize_record_batch(empty_batch, std::nullopt); CHECK_GT(serialized.size(), 0); CHECK_GE(serialized.size(), continuation.size()); } @@ -348,7 +349,7 @@ namespace sparrow_ipc auto record_batch = create_test_record_batch(); auto schema_serialized = serialize_schema_message(record_batch); - auto record_batch_serialized = serialize_record_batch(record_batch); + auto record_batch_serialized = serialize_record_batch(record_batch, std::nullopt); CHECK_GT(schema_serialized.size(), 0); CHECK_GT(record_batch_serialized.size(), 0); @@ -361,4 +362,4 @@ namespace sparrow_ipc CHECK_EQ(schema_serialized.size() % 8, 0); } } -} \ No newline at end of file +}
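One contract in compression.hpp/compression.cpp is worth spelling out: compress() returns only the bare LZ4 frame, while decompress() expects the Arrow IPC buffer layout, an 8-byte uncompressed-length prefix (host byte order, with -1 meaning the buffer was stored uncompressed) followed by the frame; that prefix is written by the serializer when it builds the compressed body. The sketch below makes the round trip explicit; it assumes uint8_t byte spans/vectors for the signatures, and roundtrip_lz4 is a hypothetical helper, not part of the library.

#include <cstdint>
#include <span>
#include <vector>

#include "sparrow_ipc/compression.hpp"

std::vector<uint8_t> roundtrip_lz4(std::span<const uint8_t> raw)
{
    using org::apache::arrow::flatbuf::CompressionType;

    // compress() produces the bare LZ4 frame (LZ4F_compressFrame), no size prefix.
    const std::vector<uint8_t> frame = sparrow_ipc::compress(CompressionType::LZ4_FRAME, raw);

    // Re-create the per-buffer on-wire layout the serializer writes:
    // int64 uncompressed size followed by the compressed frame.
    std::vector<uint8_t> framed;
    framed.reserve(sizeof(int64_t) + frame.size());
    const int64_t uncompressed_size = static_cast<int64_t>(raw.size());
    const auto* size_bytes = reinterpret_cast<const uint8_t*>(&uncompressed_size);
    framed.insert(framed.end(), size_bytes, size_bytes + sizeof(uncompressed_size));
    framed.insert(framed.end(), frame.begin(), frame.end());

    // decompress() strips the prefix and inflates the frame back to the original bytes.
    return sparrow_ipc::decompress(CompressionType::LZ4_FRAME, framed);
}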