From ac6eb6892844907fd96b1d798c06eb603167720c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Klaim=20=28Jo=C3=ABl=20Lamotte=29?= <142265+Klaim@users.noreply.github.com> Date: Wed, 14 Aug 2024 16:12:38 +0200 Subject: [PATCH 1/9] ci: Windows builds for both 64bit and 32bit, whatever the toolchain --- .github/workflows/windows.yml | 88 ++++++++++++++++++++++++++++------- .gitignore | 3 ++ vcpkg.json | 11 +++++ 3 files changed, 86 insertions(+), 16 deletions(-) create mode 100644 vcpkg.json diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index d0517e54f..04c83cd83 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -13,46 +13,88 @@ defaults: jobs: build: runs-on: ${{ matrix.runs-on }} - name: ${{ matrix.sys.compiler }} / ${{ matrix.build-system }} / ${{ matrix.config.name }} / date-polyfill ${{ matrix.sys.date-polyfill}} + name: ${{ matrix.toolchain.compiler }} / ${{ matrix.build-system.name }} / ${{ matrix.config.name }} / targets ${{ matrix.target-arch.name }} / date-polyfill ${{ matrix.toolchain.date-polyfill}} / Refactoring ${{ matrix.toolchain.build_refactoring }} strategy: fail-fast: false matrix: runs-on: [windows-latest] - sys: + toolchain: - { compiler: msvc, date-polyfill: 'ON', build_refactoring: 'OFF' } - { compiler: msvc, date-polyfill: 'OFF', build_refactoring: 'ON' } - { compiler: clang, date-polyfill: 'ON', version: 17, build_refactoring: 'OFF'} - { compiler: clang, date-polyfill: 'OFF', version: 17, build_refactoring: 'ON' } + target-arch: + - { + name: "Win64", + vs-flags: "-A x64", # only used by VS generator + cl-flags: "/arch (x64)", + gnu-flags: "-m64", + msvc-dev-cmd: "win64" + } + - { + name: "Win32", + vs-flags: "-A Win32", # only used by VS generator + cl-flags: "/arch (x86)", + gnu-flags: "-m32", + msvc-dev-cmd: "win32" + } config: - { name: Debug } - { name: Release } build-system: - - "Visual Studio 17 2022" - - "Ninja" + - { name: "Visual Studio 17 2022" } + - { name: "Ninja" } steps: - - name: Setup MSVC - if: matrix.sys.compiler == 'msvc' && matrix.build-system != 'Visual Studio 17 2022' + - name: Setup MSVC (only for non-VS buildsystems) + if: matrix.toolchain.compiler == 'msvc' && !startsWith(matrix.build-system.name, 'Visual Studio') uses: ilammy/msvc-dev-cmd@v1 + with: + # Ninja will take cues of which target architecture to use from the (powershell) + # msvc environment so we need to setup everything at this level. + arch: ${{matrix.target-arch.msvc-dev-cmd}} - name: Install LLVM and Clang - if: matrix.sys.compiler == 'clang' + if: matrix.toolchain.compiler == 'clang' uses: egor-tensin/setup-clang@v1 with: - version: ${{matrix.sys.version}} + version: ${{matrix.toolchain.version}} platform: x64 - name: Setup clang - if: matrix.sys.compiler == 'clang' + if: matrix.toolchain.compiler == 'clang' run: | echo "CC=clang" >> $GITHUB_ENV echo "CXX=clang++" >> $GITHUB_ENV + - name: Setup gnu compilers + if: matrix.toolchain.compiler != 'msvc' + run: | + echo "CMAKE_CXX_FLAGS=${{matrix.target-arch.gnu-flags}}" >> $GITHUB_ENV + echo "CMAKE_CXX_LINK_FLAGS=${{matrix.target-arch.gnu-flags}}" >> $GITHUB_ENV + + - name: Setup Ninja + msvc + if: matrix.toolchain.compiler == 'msvc' && matrix.build-system.name == 'Ninja' + run: | + echo "CMAKE_CXX_FLAGS=${{matrix.target-arch.cl-flags}}" >> $GITHUB_ENV + echo "CMAKE_CXX_LINK_FLAGS=${{matrix.target-arch.cl-flags}}" >> $GITHUB_ENV + + - name: Setup Visual Studio + if: startsWith(matrix.build-system.name, 'Visual Studio') + run: | + echo "GENERATOR_EXTRA_FLAGS=${{matrix.target-arch.vs-flags}}" >> $GITHUB_ENV + - name: Checkout code uses: actions/checkout@v4 + - name: Setup vcpkg environment + if: matrix.target-arch.name == 'Win32' + run: | + vcpkg install --triplet=x86-windows + - name: Set conda environment + if: matrix.target-arch.name == 'Win64' uses: mamba-org/setup-micromamba@main with: environment-name: myenv @@ -62,11 +104,25 @@ jobs: create-args: | ninja + - name: Set dependencies install prefix dir for 64bit + if: matrix.target-arch.name == 'Win64' + run: | + echo "SPARROW_DEPS_PREFIX=$CONDA_PREFIX" >> $GITHUB_ENV + echo "SPARROW_INSTALL_PREFIX=$CONDA_PREFIX" >> $GITHUB_ENV + + - name: Set dependencies install prefix dir for 32bit + if: matrix.target-arch.name == 'Win32' + run: | + echo "SPARROW_DEPS_PREFIX=$VCPKG_INSTALLED_DIR" >> $GITHUB_ENV + echo "SPARROW_INSTALL_PREFIX=$VCPKG_INSTALLED_DIR" >> $GITHUB_ENV + echo "VCPKG_TARGET_TRIPLET='x86-windows'" >> $GITHUB_ENV + echo "CMAKE_TOOLCHAIN_FILE=$VCPKG_INSTALLATION_ROOT/scripts/buildsystems/vcpkg.cmake" >> $GITHUB_ENV + - name: Configure using CMake - run: cmake -Bbuild -DCMAKE_BUILD_TYPE:STRING=${{matrix.config.name}} -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -DBUILD_TESTS=ON -DBUILD_EXAMPLES=ON -DUSE_DATE_POLYFILL=${{matrix.sys.date-polyfill}} -DBUILD_REFACTORING=${{matrix.sys.build_refactoring}} -G "${{matrix.build-system}}" + run: cmake -S ./ -B ./build -DCMAKE_BUILD_TYPE:STRING=${{matrix.config.name}} -DCMAKE_PREFIX_PATH=$SPARROW_DEPS_PREFIX -DCMAKE_INSTALL_PREFIX=$SPARROW_INSTALL_PREFIX -DBUILD_TESTS=ON -DBUILD_EXAMPLES=ON -DUSE_DATE_POLYFILL=${{matrix.toolchain.date-polyfill}} -DBUILD_REFACTORING=${{matrix.toolchain.build_refactoring}} -G "${{matrix.build-system.name}}" $GENERATOR_EXTRA_FLAGS - name: Build - if: matrix.sys.build_refactoring == 'ON' + if: matrix.toolchain.build_refactoring == 'ON' working-directory: build run: cmake --build . --config ${{matrix.config.name}} --target sparrow --parallel 8 @@ -86,24 +142,24 @@ jobs: uses: actions/upload-artifact@v4 if: success() || failure() with: - name: test_sparrow_lib_report_Windows_${{ matrix.sys.compiler }}_${{ matrix.build-system }}_${{ matrix.config.name }}_date-polyfill_${{ matrix.sys.date-polyfill}} + name: test_sparrow_lib_report_Windows_${{ matrix.toolchain.compiler }}_${{ matrix.build-system.name }}_${{ matrix.config.name }}_${{ matrix.target-arch.name }}_date-polyfill_${{ matrix.toolchain.date-polyfill}} path: '**/test_sparrow_lib_report.xml' - name: Build refactoring - if: matrix.sys.build_refactoring == 'ON' + if: matrix.toolchain.build_refactoring == 'ON' working-directory: build run: cmake --build . --config ${{matrix.config.name}} --target test_sparrow_lib_v01 --parallel 8 - name: Run refactoring tests - if: matrix.sys.build_refactoring == 'ON' + if: matrix.toolchain.build_refactoring == 'ON' working-directory: build run: cmake --build . --config ${{matrix.config.name}} --target run_tests_with_junit_report_v01 - name: Upload refactoring test results uses: actions/upload-artifact@v4 - if: matrix.sys.build_refactoring == 'ON' && (success() || failure()) + if: matrix.toolchain.build_refactoring == 'ON' && (success() || failure()) with: - name: test_sparrow_lib_report_Windows_v01_${{ matrix.sys.compiler }}_${{ matrix.build-system }}_${{ matrix.config.name }}_date-polyfill_${{ matrix.sys.date-polyfill}} + name: test_sparrow_lib_report_Windows_v01_${{ matrix.toolchain.compiler }}_${{ matrix.target-arch.name }}_${{ matrix.build-system }}_${{ matrix.config.name }}_date-polyfill_${{ matrix.toolchain.date-polyfill}}_refactoring_${{ matrix.toolchain.build_refactoring }} path: '**/test_sparrow_lib_report_v01.xml' - name: Run all examples diff --git a/.gitignore b/.gitignore index 0102ea9e9..c1538172e 100644 --- a/.gitignore +++ b/.gitignore @@ -44,3 +44,6 @@ # Clangd cache .cache CMakeUserPresets.json + +# Vcpkg packages +/vcpkg_installed/ diff --git a/vcpkg.json b/vcpkg.json new file mode 100644 index 000000000..775f6984f --- /dev/null +++ b/vcpkg.json @@ -0,0 +1,11 @@ +{ + "$schema": "https://raw.githubusercontent.com/microsoft/vcpkg-tool/main/docs/vcpkg.schema.json", + "dependencies": [ + "doctest", + "date", + { + "name": "vcpkg-tool-ninja", + "host": true + } + ] + } From 350d7993ff92abaef53d53c5044036d79be3e4a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Klaim=20=28Jo=C3=ABl=20Lamotte=29?= <142265+Klaim@users.noreply.github.com> Date: Mon, 16 Sep 2024 18:28:21 +0200 Subject: [PATCH 2/9] 32bit support - added special conversion functions `to_native_size`, `to_native_offset` and `to_arrow_length`; - used these functions where necessary; - setup options for users to decide - if they want runtime checks on sizes/lengths/offsets - if they want to limit possible sizes/lengths/offsets to 32bit values --- CMakeLists.txt | 13 ++ include/sparrow/array/array_data_factory.hpp | 24 +-- include/sparrow/array/data_type.hpp | 163 ++++++++++++++++-- include/sparrow/array/external_array_data.hpp | 10 +- include/sparrow/arrow_array_schema_proxy.hpp | 39 +++-- .../arrow_array_schema_info_utils.hpp | 4 +- .../arrow_array_schema_utils.hpp | 4 +- include/sparrow/buffer/dynamic_bitset.hpp | 10 +- include/sparrow/c_interface.hpp | 4 +- include/sparrow/config/config.hpp | 34 +++- include/sparrow/layout/fixed_size_layout.hpp | 20 +-- .../layout/list_layout/list_layout.hpp | 48 +++--- .../list_layout_value_iterator.hpp | 21 ++- include/sparrow/layout/null_layout.hpp | 2 +- .../layout/variable_size_binary_layout.hpp | 61 ++++--- include/sparrow/utils/bit.hpp | 2 +- include/sparrow/utils/nullable.hpp | 34 ++-- test/test_fixed_size_layout.cpp | 4 +- test/test_list_layout.cpp | 26 +-- 19 files changed, 361 insertions(+), 162 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 260dce8c8..3a185d916 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -50,6 +50,8 @@ OPTION(BUILD_DOCS "Build sparrow documentation" OFF) OPTION(BUILD_EXAMPLES "Build sparrow examples" OFF) OPTION(USE_DATE_POLYFILL "Use date polyfill implementation" ON) OPTION(BUILD_REFACTORING "Build the refactoring target" OFF) +OPTION(SPAROW_ENABLE_SIZE_CHECKS "Enables runtime checks that values of sizes/lengths/offsets from the various C++ types are always in range of the supported Arrow lengths" OFF) +OPTION(SPAROW_ENABLE_32BIT_SIZE_LIMIT "Once enabled, all the sizes are assumed to be in the range of 32bit representation. Can be checked at runtime with `SPAROW_ENABLE_SIZE_CHECKS=ON`" OFF) include(CheckCXXSymbolExists) @@ -67,6 +69,7 @@ if(LIBCPP) add_compile_definitions(_LIBCPP_DISABLE_AVAILABILITY) endif() + # Linter options # ============= @@ -208,6 +211,16 @@ if (BUILD_TESTS) endif () endif () +if (SPAROW_ENABLE_SIZE_CHECKS) + target_compile_definitions(sparrow INTERFACE SPAROW_ENABLE_SIZE_CHECKS 1) + message("-> sparrow config: size limit runtime checks enabled (SPAROW_ENABLE_SIZE_CHECKS=ON): sizes will be checked at runtime to be between zero and max(in64_t) (or max(int32_t if SPAROW_ENABLE_32BIT_SIZE_LIMIT=ON)") +endif() + +if (SPAROW_ENABLE_32BIT_SIZE_LIMIT) + target_compile_definitions(sparrow INTERFACE SPAROW_ENABLE_32BIT_SIZE_LIMIT) + message("-> sparrow config: 32bit size limit enabled (SPAROW_ENABLE_32BIT_SIZE_LIMIT=ON): valid size values are restricted to range zero to max(int32_t)") +endif() + # Docs # ==== diff --git a/include/sparrow/array/array_data_factory.hpp b/include/sparrow/array/array_data_factory.hpp index 30e35066b..67c11aa00 100644 --- a/include/sparrow/array/array_data_factory.hpp +++ b/include/sparrow/array/array_data_factory.hpp @@ -98,7 +98,7 @@ namespace sparrow { return { .type = data_descriptor(arrow_type_id()), - .length = static_cast(size), + .length = to_arrow_length(size), .offset = 0, .bitmap = {}, .buffers = {}, @@ -200,7 +200,7 @@ namespace sparrow return { .type = data_descriptor(arrow_type_id()), - .length = static_cast(size), + .length = to_arrow_length(size), .offset = offset, .bitmap = detail::make_array_data_bitmap(std::forward(bitmap)), .buffers = std::move(buffers), @@ -281,7 +281,7 @@ namespace sparrow std::vector buffers(2); buffers[0].resize(sizeof(std::int64_t) * (values_size + 1), 0); - size_t acc_size = 0; + size_t acc_size = 0; auto bitmap_iter = bitmap.begin(); auto value_iter = values.begin(); for(std::size_t i = 0;i < values_size; ++i, ++bitmap_iter, ++value_iter) @@ -296,8 +296,8 @@ namespace sparrow auto iter = buffers[1].begin(); const auto offsets = buffers[0].data(); offsets[0] = 0; - - + + value_iter = values.begin(); bitmap_iter = bitmap.begin(); for(std::size_t i = 0; i < values_size; ++i, ++bitmap_iter, ++value_iter) @@ -308,7 +308,7 @@ namespace sparrow SPARROW_ASSERT_TRUE( std::cmp_less(unwraped_value.size(), std::numeric_limits::max()) ); - offsets[i + 1] = offsets[i] + static_cast(unwraped_value.size()); + offsets[i + 1] = offsets[i] + to_arrow_length(unwraped_value.size()); std::ranges::copy(unwraped_value, iter); std::advance(iter, unwraped_value.size()); } @@ -316,13 +316,13 @@ namespace sparrow { offsets[i + 1] = offsets[i]; } - } + } using T = std::unwrap_ref_decay_t>>; using U = get_corresponding_arrow_type_t; return { .type = data_descriptor(arrow_type_id()), - .length = static_cast(values.size()), + .length = to_arrow_length(values.size()), .offset = offset, .bitmap = detail::make_array_data_bitmap(std::forward(bitmap)), .buffers = std::move(buffers), @@ -460,7 +460,7 @@ namespace sparrow }; return { .type = data_descriptor(arrow_type_id()), - .length = static_cast(indexes.size()), + .length = to_arrow_length(indexes.size()), .offset = offset, .bitmap = detail::make_array_data_bitmap(std::forward(bitmap)), .buffers = {create_buffer()}, @@ -619,16 +619,16 @@ namespace sparrow /** * \brief Creates a default array_data object with the specified layout and values. - * + * * @tparam Layout The layout of the array_data object. * @tparam T The type of the value to be repeated. - * + * * @param n The number of times the value should be repeated. * @param value The value to be repeated. */ template requires is_arrow_base_type_extended> - array_data make_default_array_data( + array_data make_default_array_data( typename Layout::size_type n , T && value) { diff --git a/include/sparrow/array/data_type.hpp b/include/sparrow/array/data_type.hpp index 0ab2abb33..60744c856 100644 --- a/include/sparrow/array/data_type.hpp +++ b/include/sparrow/array/data_type.hpp @@ -28,7 +28,9 @@ namespace date = std::chrono; #include #include #include +#include +#include "sparrow/config/config.hpp" #include "sparrow/layout/list_layout/list_value.hpp" #include "sparrow/utils/contracts.hpp" #include "sparrow/utils/mp_utils.hpp" @@ -113,11 +115,135 @@ namespace sparrow { }; - inline bool operator==(const null_type&, const null_type&) + inline bool operator==(const null_type&, const null_type&) noexcept { return true; } + + using arrow_length = std::int64_t; + + /// Maximum size allowed for arrow sizes. + /// This can be constrained to 32bit signed sizes using configuration options. + /// See: https://arrow.apache.org/docs/format/Columnar.html#array-lengths + static constexpr arrow_length max_arrow_length = config::enable_32bit_size_limit + ? std::numeric_limits::max() + : std::numeric_limits::max(); + + /// @returns True if the provided value is between zero and max_arrow_length. + template + inline constexpr + bool is_valid_arrow_length(T size_or_offset, bool allow_negatives = false) noexcept + { + return std::in_range(size_or_offset) + and (allow_negatives or size_or_offset >= T(0)) + and size_or_offset <= max_arrow_length + ; + } + + /// Throws `std::runtime_error` if the provided value is not in the + /// valid range of arrow length values or if the value is not representable + /// in the specified `TargetRepr` type. + /// The checks is only enabled if `config::enable_size_limit_runtime_check == true`, + /// otherwise this function is no-op. + template + inline constexpr + void throw_if_invalid_size(std::integral auto size_or_offset, bool allow_negatives = false) + { + if constexpr (config::enable_size_limit_runtime_check) + { + // checks that the value is in range of arrow length in general + if (not is_valid_arrow_length(size_or_offset, allow_negatives)) + { + throw std::runtime_error( // TODO: replace by sparrow-specific error + std::string("size/length/offset is outside the valid arrow length limits [0:") + + std::to_string(max_arrow_length) + "] : " + std::to_string(size_or_offset) + "(" + + typeid(size_or_offset).name() + ") " + /* + TODO: when possible, replace by + std::format( + "size/length/offset is outside the valid arrow length limits [0:{}] : {} ({})", + max_arrow_length, + size_or_offset, + typeid(size_or_offset).name() + )*/ + ); + } + + if constexpr (not std::same_as) // Already checked in `is_valid_size()` + // otherwise + { + // check that the value is representable by the specified type TargetRepr + if (not std::in_range(size_or_offset)) + { + throw std::runtime_error( // TODO: replace by sparrow-specific error + std::string("size/length/offset cannot be represented by ") + typeid(TargetRepr).name() + + ": " + std::to_string(size_or_offset) + " (" + typeid(size_or_offset).name() + ")" + /* + TODO: when possible, replace by + + std::format( + "size/length/offset cannot be represented by {} : {} ({})", + typeid(TargetRepr).name(), + size_or_offset, + typeid(size_or_offset).name() + )*/ + ); + } + } + } + } + + /// @returns The provided arrow length value as represented by the native standard size type `std::size_t`. + /// If `config::enable_size_limit_runtime_check == true` it will also check that the value is + /// a valid arrow length and representable by `std::size_t` or throws otherwise. + /// @throws @see `throw_if_invalid_size()` for details. + inline constexpr + auto to_native_size(arrow_length length) -> std::size_t + { + throw_if_invalid_size(length); + return static_cast(length); + } + + /// @returns The provided arrow length value as represented by the native standard offset type `std::ptrdiff_t`. + /// If `config::enable_size_limit_runtime_check == true` it will also check that the value is + /// a valid arrow length and representable by `std::ptrdiff_t` or throws otherwise. + /// @throws @see `throw_if_invalid_size()` for details. + inline constexpr + auto to_native_offset(arrow_length offset) -> std::ptrdiff_t + { + throw_if_invalid_size(offset, true); + return static_cast(offset); + } + + /// @returns The provided size or offset value as represented by an arrow-length type (int64_t). + /// If `config::enable_size_limit_runtime_check == true` it will also check that the value is + /// a valid arrow length and representable in `int64_t` or throws otherwise. + /// @throws @see `throw_if_invalid_size()` for details. + inline constexpr + auto to_arrow_length(std::integral auto size_or_offset, bool allow_negatives = false) -> arrow_length + { + throw_if_invalid_size(size_or_offset, allow_negatives); + return static_cast(size_or_offset); + } + + /// @returns The sum of the provided offsets with `R` representation, whatever the offset types as long as + /// they are integrals and can represent and arrow length. + /// If `config::enable_size_limit_runtime_check == true` it will also check that each value and the resulting sum are + /// valid arrow lengths and representable in the specified result `R` or throws otherwise. + /// @throws @see `throw_if_invalid_size()` for details. + template + inline constexpr + auto sum_arrow_offsets(std::integral auto... offsets) -> R + { + // We cast every offset as an arrow length, then sum them as arrow length representation, + // and finally cast back to the expected result type after verifying that + // the resulting value is still a valid arrow length. + auto result = (to_arrow_length(offsets, true) + ...); + throw_if_invalid_size(result, false); // dont allow negatives as the result must be a size + return static_cast(result); + } + /// Runtime identifier of arrow data types, usually associated with raw bytes with the associated value. // TODO: does not support all types specified by the Arrow specification // yet @@ -312,7 +438,7 @@ namespace sparrow } mpl::unreachable(); } - + /// @returns The default floating-point `data_type` that should be associated with the provided type. /// The deduction will be based on the size of the type. Calling this function with unsupported sizes /// will not compile. @@ -446,35 +572,36 @@ namespace sparrow template constexpr size_t primitive_bytes_count(data_type data_type, T length) { + // TODO: check that the obtained size is not bigger than the limit set by options SPARROW_ASSERT_TRUE(data_type_is_primitive(data_type)); + const auto size = to_native_size(to_arrow_length(length)); constexpr double bit_per_byte = 8.; switch (data_type) { - + case data_type::BOOL: - return static_cast(std::ceil(static_cast(length) / bit_per_byte)); + return static_cast(std::ceil(static_cast(size) / bit_per_byte)); case data_type::UINT8: - // TODO: Replace static_cast by the 32 bit fix check function case data_type::INT8: - return static_cast(length); + return size; case data_type::UINT16: - return (sizeof(std::uint16_t) / sizeof(std::uint8_t)) * static_cast(length); + return (sizeof(std::uint16_t) / sizeof(std::uint8_t)) * size; case data_type::INT16: - return (sizeof(std::int16_t) / sizeof(std::uint8_t)) * static_cast(length); + return (sizeof(std::int16_t) / sizeof(std::uint8_t)) * size; case data_type::UINT32: - return (sizeof(std::uint32_t) / sizeof(std::uint8_t)) * static_cast(length); + return (sizeof(std::uint32_t) / sizeof(std::uint8_t)) * size; case data_type::INT32: - return (sizeof(std::int32_t) / sizeof(std::uint8_t)) * static_cast(length); + return (sizeof(std::int32_t) / sizeof(std::uint8_t)) * size; case data_type::UINT64: - return (sizeof(std::uint64_t) / sizeof(std::uint8_t)) * static_cast(length); + return (sizeof(std::uint64_t) / sizeof(std::uint8_t)) * size; case data_type::INT64: - return (sizeof(std::int64_t) / sizeof(std::uint8_t)) * static_cast(length); + return (sizeof(std::int64_t) / sizeof(std::uint8_t)) * size; case data_type::HALF_FLOAT: - return (sizeof(float16_t) / sizeof(std::uint8_t)) * static_cast(length); + return (sizeof(float16_t) / sizeof(std::uint8_t)) * size; case data_type::FLOAT: - return (sizeof(float32_t) / sizeof(std::uint8_t)) * static_cast(length); + return (sizeof(float32_t) / sizeof(std::uint8_t)) * size; case data_type::DOUBLE: - return (sizeof(float64_t) / sizeof(std::uint8_t)) * static_cast(length); + return (sizeof(float64_t) / sizeof(std::uint8_t)) * size; default: throw std::runtime_error("Unsupported data type"); } @@ -570,6 +697,7 @@ namespace sparrow { }; } + /// Matches valid and complete `arrow_traits` specializations for type T. /// Every type that needs to be compatible with this library's interface must /// provide a specialization of `arrow_traits` @@ -618,7 +746,7 @@ namespace sparrow return arrow_type_id(); } - /// @returns Format string matching the arrow data-type mathcing the provided + /// @returns Format string matching the arrow data-type matching the provided /// arrow type. template constexpr std::string_view data_type_format_of() @@ -675,4 +803,7 @@ namespace sparrow template concept layout_offset = std::same_as || std::same_as; + + + } diff --git a/include/sparrow/array/external_array_data.hpp b/include/sparrow/array/external_array_data.hpp index 3fb7a471a..1a6fd4ff4 100644 --- a/include/sparrow/array/external_array_data.hpp +++ b/include/sparrow/array/external_array_data.hpp @@ -272,7 +272,7 @@ namespace sparrow inline void external_array_data::build_children() { - const auto size = static_cast(array().n_children); + const auto size = to_native_size(array().n_children); m_children.reserve(size); for (std::size_t i = 0; i < size; ++i) { @@ -327,7 +327,7 @@ namespace sparrow { using return_type = external_array_data::bitmap_type; return return_type(impl::buffer_at(data, 0u), - static_cast(length(data))); + to_native_size(length(data))); } inline std::size_t buffers_size(const external_array_data& data) @@ -338,7 +338,7 @@ namespace sparrow return std::size_t(0); } // The first buffer in external data is used for the bitmap - return static_cast(data.array().n_buffers - 1); + return sum_arrow_offsets(data.array().n_buffers, - 1); } inline external_array_data::buffer_type @@ -347,12 +347,12 @@ namespace sparrow using return_type = external_array_data::buffer_type; // The first buffer in external data is used for the bitmap return return_type(impl::buffer_at(data, i + 1u), - static_cast(length(data))); + to_native_size(length(data))); } inline std::size_t child_data_size(const external_array_data& data) { - return static_cast(data.array().n_children); + return to_native_size(data.array().n_children); } inline const external_array_data& child_data_at(const external_array_data& data, std::size_t i) diff --git a/include/sparrow/arrow_array_schema_proxy.hpp b/include/sparrow/arrow_array_schema_proxy.hpp index 7743221d1..61bde0189 100644 --- a/include/sparrow/arrow_array_schema_proxy.hpp +++ b/include/sparrow/arrow_array_schema_proxy.hpp @@ -267,7 +267,7 @@ namespace sparrow inline void arrow_proxy::update_buffers() { m_buffers.clear(); - const auto buffer_count = static_cast(array().n_buffers); + const auto buffer_count = to_native_size(array().n_buffers); m_buffers.reserve(buffer_count); const auto data_type = format_to_data_type(schema().format); const std::vector buffers_type = get_buffer_types_from_data_type(data_type); @@ -551,18 +551,18 @@ namespace sparrow [[nodiscard]] inline size_t arrow_proxy::length() const { SPARROW_ASSERT_TRUE(array().length >= 0); - SPARROW_ASSERT_TRUE(std::cmp_less(array().length, std::numeric_limits::max())); - return static_cast(array().length); + SPARROW_ASSERT_TRUE(std::cmp_less(array().length, max_arrow_length)); + return to_native_size(array().length); } inline void arrow_proxy::set_length(size_t length) { - SPARROW_ASSERT_TRUE(std::cmp_less(length, std::numeric_limits::max())); + SPARROW_ASSERT_TRUE(std::cmp_less(length, max_arrow_length)); if (!array_created_with_sparrow()) { throw arrow_proxy_exception("Cannot set length on non-sparrow created ArrowArray"); } - array().length = static_cast(length); + array().length = to_arrow_length(length); } [[nodiscard]] inline int64_t arrow_proxy::null_count() const @@ -581,41 +581,41 @@ namespace sparrow [[nodiscard]] inline size_t arrow_proxy::offset() const { - return static_cast(array().offset); + return to_native_size(array().offset); } inline void arrow_proxy::set_offset(size_t offset) { - SPARROW_ASSERT_TRUE(std::cmp_less(offset, std::numeric_limits::max())); + SPARROW_ASSERT_TRUE(std::cmp_less(offset, max_arrow_length)); if (!array_created_with_sparrow()) { throw arrow_proxy_exception("Cannot set offset on non-sparrow created ArrowArray"); } - array().offset = static_cast(offset); + array().offset = to_arrow_length(offset); } [[nodiscard]] inline size_t arrow_proxy::n_buffers() const { - return static_cast(array().n_buffers); + return to_native_size(array().n_buffers); } inline void arrow_proxy::set_n_buffers(size_t n_buffers) { - SPARROW_ASSERT_TRUE(std::cmp_less(n_buffers, std::numeric_limits::max())); + SPARROW_ASSERT_TRUE(std::cmp_less(n_buffers, max_arrow_length)); if (!array_created_with_sparrow()) { throw arrow_proxy_exception("Cannot set n_buffers on non-sparrow created ArrowArray"); } - array().n_buffers = static_cast(n_buffers); + array().n_buffers = to_arrow_length(n_buffers); arrow_array_private_data* private_data = get_array_private_data(); private_data->resize_buffers(n_buffers); array().buffers = private_data->buffers_ptrs(); - array().n_buffers = static_cast(n_buffers); + array().n_buffers = to_arrow_length(n_buffers); } [[nodiscard]] inline size_t arrow_proxy::n_children() const { - return static_cast(array().n_children); + return to_native_size(array().n_children); } template @@ -660,7 +660,7 @@ namespace sparrow inline void arrow_proxy::resize_children(size_t children_count) { - SPARROW_ASSERT_TRUE(std::cmp_less(children_count, std::numeric_limits::max())); + SPARROW_ASSERT_TRUE(std::cmp_less(children_count, max_arrow_length)); if (!is_created_with_sparrow()) { throw arrow_proxy_exception("Cannot set n_children on non-sparrow created ArrowArray or ArrowSchema"); @@ -676,7 +676,7 @@ namespace sparrow } // Release the remaining children if the new size is smaller than the current size - for (size_t i = children_count; i < static_cast(array().n_children); ++i) + for (size_t i = children_count; i < to_native_size(array().n_children); ++i) { schema().children[i]->release(schema().children[i]); array().children[i]->release(array().children[i]); @@ -684,7 +684,8 @@ namespace sparrow auto arrow_array_children = new ArrowArray*[children_count]; auto arrow_schemas_children = new ArrowSchema*[children_count]; - for (size_t i = 0; i < std::min(children_count, static_cast(array().n_children)); ++i) + const auto min_children_count = std::min(children_count, to_native_size(array().n_children)); + for (size_t i = 0; i < min_children_count; ++i) { arrow_array_children[i] = array().children[i]; arrow_schemas_children[i] = schema().children[i]; @@ -692,10 +693,10 @@ namespace sparrow delete[] array().children; array().children = arrow_array_children; - array().n_children = static_cast(children_count); + array().n_children = to_arrow_length(children_count); delete[] schema().children; schema().children = arrow_schemas_children; - schema().n_children = static_cast(children_count); + schema().n_children = to_arrow_length(children_count); } inline arrow_schema_private_data* arrow_proxy::get_schema_private_data() @@ -849,7 +850,7 @@ namespace sparrow auto& validity_buffer = buffers()[static_cast(validity_index)]; const dynamic_bitset_view bitmap(validity_buffer.data(), validity_buffer.size()); const auto null_count = bitmap.null_count(); - set_null_count(static_cast(null_count)); + set_null_count(to_arrow_length(null_count)); } inline bool arrow_proxy::is_arrow_array_valid() const diff --git a/include/sparrow/arrow_interface/arrow_array_schema_info_utils.hpp b/include/sparrow/arrow_interface/arrow_array_schema_info_utils.hpp index 2d97da851..9db1faace 100644 --- a/include/sparrow/arrow_interface/arrow_array_schema_info_utils.hpp +++ b/include/sparrow/arrow_interface/arrow_array_schema_info_utils.hpp @@ -24,7 +24,7 @@ namespace sparrow constexpr bool validate_buffers_count(data_type data_type, int64_t n_buffers) { const std::size_t expected_buffer_count = get_expected_buffer_count(data_type); - return static_cast(n_buffers) == expected_buffer_count; + return to_native_size(n_buffers) == expected_buffer_count; } /// @returns The the expected number of children for a given data type. @@ -71,7 +71,7 @@ namespace sparrow inline bool validate_format_with_arrow_array(data_type data_type, const ArrowArray& array) { const bool buffers_count_valid = validate_buffers_count(data_type, array.n_buffers); - const bool children_count_valid = static_cast(array.n_children) + const bool children_count_valid = to_native_size(array.n_children) == get_expected_children_count(data_type); return buffers_count_valid && children_count_valid; } diff --git a/include/sparrow/arrow_interface/arrow_array_schema_utils.hpp b/include/sparrow/arrow_interface/arrow_array_schema_utils.hpp index 9c7248854..70819e930 100644 --- a/include/sparrow/arrow_interface/arrow_array_schema_utils.hpp +++ b/include/sparrow/arrow_interface/arrow_array_schema_utils.hpp @@ -29,7 +29,7 @@ namespace sparrow { /** * Release the children and dictionnary of an `ArrowArray` or `ArrowSchema`. - * + * * @tparam T `ArrowArray` or `ArrowSchema` * @param t The `ArrowArray` or `ArrowSchema` to release. */ @@ -41,7 +41,7 @@ namespace sparrow { return; } - + if (t.dictionary) { if (t.dictionary->release) diff --git a/include/sparrow/buffer/dynamic_bitset.hpp b/include/sparrow/buffer/dynamic_bitset.hpp index a92b8538b..49f8b759d 100644 --- a/include/sparrow/buffer/dynamic_bitset.hpp +++ b/include/sparrow/buffer/dynamic_bitset.hpp @@ -20,6 +20,7 @@ #include #include +#include "sparrow/array/data_type.hpp" #include "sparrow/buffer/buffer.hpp" #include "sparrow/buffer/buffer_view.hpp" #include "sparrow/utils/contracts.hpp" @@ -519,12 +520,16 @@ namespace sparrow constexpr dynamic_bitset_base::dynamic_bitset_base(storage_type&& buf, size_type size) : m_buffer(std::move(buf)) , m_size(size) - , m_null_count(m_size - count_non_null()) { if constexpr (!std::is_const_v) { zero_unused_bits(); } + + const auto null_count = count_non_null(); + m_null_count = m_size - null_count; + SPARROW_ASSERT_TRUE(is_valid_arrow_length(size)); + SPARROW_ASSERT_TRUE(is_valid_arrow_length(m_null_count)); } template @@ -533,6 +538,8 @@ namespace sparrow , m_size(size) , m_null_count(null_count) { + SPARROW_ASSERT_TRUE(is_valid_arrow_length(size)); + SPARROW_ASSERT_TRUE(is_valid_arrow_length(m_null_count)); if constexpr (!std::is_const_v) { zero_unused_bits(); @@ -593,6 +600,7 @@ namespace sparrow { res += table[*p]; } + SPARROW_ASSERT_TRUE(is_valid_arrow_length(res)); return res; } diff --git a/include/sparrow/c_interface.hpp b/include/sparrow/c_interface.hpp index e1c7f9927..97cd6f211 100644 --- a/include/sparrow/c_interface.hpp +++ b/include/sparrow/c_interface.hpp @@ -92,10 +92,10 @@ namespace sparrow /// through `ArrowArray` and `ArrowSchema` struct arrow_data_ownership { - ///< Specifies if the ownership of the schema data + /// Specifies if the ownership of the schema data ownership schema = ownership::not_owning; - ///< Specifies if the ownership of the array data + /// Specifies if the ownership of the array data ownership array = ownership::not_owning; }; diff --git a/include/sparrow/config/config.hpp b/include/sparrow/config/config.hpp index 87698e34f..bb78a3249 100644 --- a/include/sparrow/config/config.hpp +++ b/include/sparrow/config/config.hpp @@ -38,7 +38,37 @@ # define SPARROW_API __attribute__((visibility("default"))) #endif -consteval bool is_apple_compiler() +namespace sparrow { - return static_cast(COMPILING_WITH_APPLE_CLANG); + consteval bool is_apple_compiler() + { + return static_cast(COMPILING_WITH_APPLE_CLANG); + } + + namespace config + { + inline constexpr bool enable_size_limit_runtime_check = +#if defined(SPAROW_ENABLE_SIZE_CHECKS) +# if SPAROW_ENABLE_SIZE_CHECKS == 1 + true +# else + false +# endif +#else +# ifndef NDEBUG // if not specified, by default in debug builds, we enable these checks + true +# else + false +# endif +#endif + ; + + inline constexpr bool enable_32bit_size_limit = +#if defined(SPAROW_ENABLE_32BIT_SIZE_LIMIT) + true +#else + false +#endif + ; + } } diff --git a/include/sparrow/layout/fixed_size_layout.hpp b/include/sparrow/layout/fixed_size_layout.hpp index 92561ddc9..311e5523f 100644 --- a/include/sparrow/layout/fixed_size_layout.hpp +++ b/include/sparrow/layout/fixed_size_layout.hpp @@ -141,7 +141,7 @@ namespace sparrow { // We only require the presence of the bitmap and the first buffer. SPARROW_ASSERT_TRUE(buffers_size(storage()) > 0); - SPARROW_ASSERT_TRUE(static_cast(length(storage())) == sparrow::bitmap(storage()).size()) + SPARROW_ASSERT_TRUE(to_native_size(length(storage())) == sparrow::bitmap(storage()).size()) } template @@ -154,21 +154,21 @@ namespace sparrow auto fixed_size_layout::size() const -> size_type { SPARROW_ASSERT_TRUE(offset(storage()) <= length(storage())); - return static_cast(length(storage()) - offset(storage())); + return sum_arrow_offsets(length(storage()), -offset(storage())); } template auto fixed_size_layout::value(size_type i) -> inner_reference { SPARROW_ASSERT_TRUE(i < size()); - return data()[i + static_cast(offset(storage()))]; + return data()[ sum_arrow_offsets(i, offset(storage())) ]; } template auto fixed_size_layout::value(size_type i) const -> inner_const_reference { SPARROW_ASSERT_TRUE(i < size()); - return data()[i + static_cast(offset(storage()))]; + return data()[ sum_arrow_offsets(i, offset(storage())) ]; } template @@ -238,20 +238,20 @@ namespace sparrow auto fixed_size_layout::has_value(size_type i) -> bitmap_reference { SPARROW_ASSERT_TRUE(i < size()); - return sparrow::bitmap(storage())[i + static_cast(offset(storage()))]; + return sparrow::bitmap(storage())[ sum_arrow_offsets(i, offset(storage())) ]; } template auto fixed_size_layout::has_value(size_type i) const -> bitmap_const_reference { SPARROW_ASSERT_TRUE(i < size()); - return sparrow::bitmap(storage())[i + static_cast(offset(storage()))]; + return sparrow::bitmap(storage())[ sum_arrow_offsets( i, offset(storage())) ]; } template auto fixed_size_layout::value_begin() -> value_iterator { - return value_iterator{data() + offset(storage())}; + return value_iterator{data() + to_native_offset(offset(storage()))}; } template @@ -263,7 +263,7 @@ namespace sparrow template auto fixed_size_layout::value_cbegin() const -> const_value_iterator { - return const_value_iterator{data() + offset(storage())}; + return const_value_iterator{data() + to_native_offset(offset(storage()))}; } template @@ -275,7 +275,7 @@ namespace sparrow template auto fixed_size_layout::bitmap_begin() -> bitmap_iterator { - return sparrow::next(sparrow::bitmap(storage()).begin(), offset(storage())); + return sparrow::next(sparrow::bitmap(storage()).begin(), to_native_offset(offset(storage()))); } template @@ -287,7 +287,7 @@ namespace sparrow template auto fixed_size_layout::bitmap_cbegin() const -> const_bitmap_iterator { - return sparrow::next(sparrow::bitmap(storage()).cbegin(), offset(storage())); + return sparrow::next(sparrow::bitmap(storage()).cbegin(), to_native_offset(offset(storage()))); } template diff --git a/include/sparrow/layout/list_layout/list_layout.hpp b/include/sparrow/layout/list_layout/list_layout.hpp index 9c2e4c097..602e2ea7b 100644 --- a/include/sparrow/layout/list_layout/list_layout.hpp +++ b/include/sparrow/layout/list_layout/list_layout.hpp @@ -79,7 +79,7 @@ namespace sparrow void rebind_data(data_storage_type& data) { - m_data = data; + m_data = data; m_child_layout = data.child_data[0]; } @@ -92,17 +92,17 @@ namespace sparrow { const auto _length = sparrow::length(storage()); const auto _offset = sparrow::offset(storage()); - return static_cast(_length - _offset); + return to_native_size(_length - _offset); } reference operator[](size_type i) - { + { auto bitmap_ref = has_value(i); if(bitmap_ref){ return reference( inner_reference( - sparrow::next(m_child_layout.begin(), element_offset(i)), - sparrow::next(m_child_layout.begin(), element_offset(i) + element_length(i)) + sparrow::next(m_child_layout.begin(), to_native_offset(element_offset(i))), + sparrow::next(m_child_layout.begin(), to_native_offset(element_offset(i) + element_length(i))) ), bitmap_ref ); @@ -117,8 +117,8 @@ namespace sparrow if(bitmap_ref){ return const_reference( inner_const_reference( - sparrow::next(m_child_layout.begin(), element_offset(i)), - sparrow::next(m_child_layout.begin(), element_offset(i) + element_length(i)) + sparrow::next(m_child_layout.begin(), to_native_offset(element_offset(i))), + sparrow::next(m_child_layout.begin(), to_native_offset(element_offset(i) + element_length(i))) ), bitmap_ref ); @@ -157,28 +157,28 @@ namespace sparrow } private: - + value_iterator value_begin() { - return value_iterator(this, static_cast(offset(storage()))); + return value_iterator(this, to_native_size(offset(storage()))); } value_iterator value_end() { - return value_iterator(this, static_cast(sparrow::length(storage()))); + return value_iterator(this, to_native_size(sparrow::length(storage()))); } const_value_iterator value_cbegin() const { - return const_value_iterator(this, static_cast(offset(storage()))); + return const_value_iterator(this, to_native_size(offset(storage()))); } const_value_iterator value_cend() const { - return const_value_iterator(this, static_cast(sparrow::length(storage()))); + return const_value_iterator(this, to_native_size(sparrow::length(storage()))); } bitmap_iterator bitmap_begin(){ - return sparrow::next(sparrow::bitmap(storage()).begin(), sparrow::offset(storage())); + return sparrow::next(sparrow::bitmap(storage()).begin(), to_native_offset(sparrow::offset(storage()))); } bitmap_iterator bitmap_end(){ @@ -186,40 +186,42 @@ namespace sparrow } const_bitmap_iterator bitmap_cbegin() const{ - return sparrow::next(sparrow::bitmap(storage()).cbegin(), sparrow::offset(storage())); + return sparrow::next(sparrow::bitmap(storage()).cbegin(), to_native_offset(sparrow::offset(storage()))); } const_bitmap_iterator bitmap_cend() const { return sparrow::bitmap(storage()).cend(); - } + } - bitmap_reference has_value(size_type i) + bitmap_reference has_value(size_type i) { - const size_type pos = i + static_cast(sparrow::offset(storage())); + const size_type pos = sum_arrow_offsets(i, sparrow::offset(storage())); return sparrow::bitmap(storage())[pos]; } bitmap_const_reference has_value(size_type i) const { - const size_type pos = i + static_cast(sparrow::offset(storage())); + const size_type pos = sum_arrow_offsets(i, sparrow::offset(storage())); return sparrow::bitmap(storage())[pos]; } - + offset_type element_offset(size_type i)const { - const size_type pos = i + static_cast(sparrow::offset(storage())); - return buffer_at(storage(), 0u).template data()[pos]; + const size_type pos = sum_arrow_offsets(i, sparrow::offset(storage())); + return buffer_at(storage(), 0u).template data()[pos]; } - + offset_type element_length(size_type i)const { - const size_type j = static_cast(sparrow::offset(storage())) + i; + const size_type j = sum_arrow_offsets(sparrow::offset(storage()), i); const auto offset_ptr = buffer_at(storage(), 0u).template data(); const auto size = offset_ptr[j + 1] - offset_ptr[j]; return size; } + data_storage_type& storage() { return m_data; } + const data_storage_type& storage() const { return m_data; diff --git a/include/sparrow/layout/list_layout/list_layout_value_iterator.hpp b/include/sparrow/layout/list_layout/list_layout_value_iterator.hpp index 9d11230bd..fdd103c4d 100644 --- a/include/sparrow/layout/list_layout/list_layout_value_iterator.hpp +++ b/include/sparrow/layout/list_layout/list_layout_value_iterator.hpp @@ -36,13 +36,13 @@ namespace sparrow template class list_layout_value_iterator: public iterator_base< // derived - list_layout_value_iterator, + list_layout_value_iterator, // element list_value_t, // category std::random_access_iterator_tag, // REFERENCE - list_value_t + list_value_t > { public: @@ -67,15 +67,16 @@ namespace sparrow self_type& operator=(const self_type& rhs) = default; list_value_type dereference() const - { + { child_layout_reference child_layout = p_layout->m_child_layout; const auto offset = p_layout->element_offset(m_index); const auto length = p_layout->element_length(m_index); - + return list_value_type( - sparrow::next(child_layout.begin(), offset), - sparrow::next(child_layout.begin(), offset + length)); + sparrow::next(child_layout.begin(), to_native_offset(offset)), + sparrow::next(child_layout.begin(), to_native_offset(offset + length)) + ); } bool equal(const self_type& rhs) const @@ -87,23 +88,27 @@ namespace sparrow { ++m_index; } + void decrement() { - --m_index; + --m_index; } + void advance(std::ptrdiff_t n) { m_index += static_cast(n); } + std::ptrdiff_t distance_to(const self_type& rhs) const { return static_cast(rhs.m_index) - static_cast(m_index); } + bool less_than(const self_type& rhs) const { return m_index < rhs.m_index; } - + private: using list_layout_type = LIST_LAYOUT_TYPE; diff --git a/include/sparrow/layout/null_layout.hpp b/include/sparrow/layout/null_layout.hpp index ccc9a4217..775d79d82 100644 --- a/include/sparrow/layout/null_layout.hpp +++ b/include/sparrow/layout/null_layout.hpp @@ -196,7 +196,7 @@ namespace sparrow template auto null_layout::size() const -> size_type { - return static_cast(length(storage())); + return to_native_size(length(storage())); } template diff --git a/include/sparrow/layout/variable_size_binary_layout.hpp b/include/sparrow/layout/variable_size_binary_layout.hpp index 207aee352..abcd270f0 100644 --- a/include/sparrow/layout/variable_size_binary_layout.hpp +++ b/include/sparrow/layout/variable_size_binary_layout.hpp @@ -296,18 +296,20 @@ namespace sparrow : p_layout(layout) , m_index(static_cast(index)) { + SPARROW_ASSERT_TRUE(is_valid_arrow_length(m_index)); } template auto vs_binary_value_iterator::dereference() const -> reference { + SPARROW_ASSERT_TRUE(is_valid_arrow_length(m_index)); if constexpr (is_const) { return p_layout->value(static_cast(m_index)); } else { - return reference(p_layout, static_cast(m_index)); + return reference(p_layout, m_index); } } @@ -315,18 +317,21 @@ namespace sparrow void vs_binary_value_iterator::increment() { ++m_index; + SPARROW_ASSERT_TRUE(is_valid_arrow_length(m_index)); } template void vs_binary_value_iterator::decrement() { --m_index; + SPARROW_ASSERT_TRUE(is_valid_arrow_length(m_index)); } template void vs_binary_value_iterator::advance(difference_type n) { m_index += n; + SPARROW_ASSERT_TRUE(is_valid_arrow_length(m_index)); } template @@ -378,7 +383,7 @@ namespace sparrow template auto vs_binary_reference::size() const -> size_type { - return static_cast(offset(m_index + 1) - offset(m_index)); + return to_native_size(offset(m_index + 1) - offset(m_index)); } template @@ -458,7 +463,7 @@ namespace sparrow template auto vs_binary_reference::uoffset(size_type index) const -> size_type { - return static_cast(offset(index)); + return to_native_size(offset(index)); } /********************************************** @@ -481,8 +486,10 @@ namespace sparrow template auto variable_size_binary_layout::size() const -> size_type { - SPARROW_ASSERT_TRUE(sparrow::offset(storage()) <= length(storage())); - return static_cast(length(storage()) - sparrow::offset(storage())); + const auto off = sparrow::offset(storage()); + const auto len = length(storage()); + SPARROW_ASSERT_TRUE(off <= len); + return to_native_size(len - off); } template @@ -537,9 +544,11 @@ namespace sparrow typename data_storage_type::buffer_type& data_buffer = buffer_at(storage(), 1u); const auto layout_data_length = size(); - const auto offset_beg = *offset(index); - const auto offset_end = *offset(index + 1); - const auto initial_value_length = static_cast(offset_end - offset_beg); + SPARROW_ASSERT_TRUE(is_valid_arrow_length(*offset(index))); + const auto offset_beg = to_native_offset(*offset(index)); + const auto offset_end = to_native_offset(*offset(index + 1)); + const std::size_t initial_value_length = static_cast(offset_end - offset_beg); + SPARROW_ASSERT_TRUE(is_valid_arrow_length(initial_value_length)); const auto new_value_length = std::ranges::size(rhs); if (new_value_length > initial_value_length) { @@ -564,7 +573,7 @@ namespace sparrow offset(layout_data_length + 1), [shift_val](auto& offset) { - offset += static_cast(shift_val); + offset += to_arrow_length(shift_val); } ); } @@ -587,7 +596,7 @@ namespace sparrow offset(layout_data_length + 1), [shift_val](auto& offset) { - offset -= static_cast(shift_val); + offset -= to_arrow_length(shift_val); } ); } @@ -603,7 +612,7 @@ namespace sparrow template auto variable_size_binary_layout::bitmap_begin() -> bitmap_iterator { - return sparrow::next(sparrow::bitmap(storage()).begin(), sparrow::offset(storage())); + return sparrow::next(sparrow::bitmap(storage()).begin(), to_native_offset(sparrow::offset(storage()))); } template @@ -615,7 +624,7 @@ namespace sparrow template auto variable_size_binary_layout::bitmap_cbegin() const -> const_bitmap_iterator { - return sparrow::next(sparrow::bitmap(storage()).cbegin(), sparrow::offset(storage())); + return sparrow::next(sparrow::bitmap(storage()).cbegin(), to_native_offset(sparrow::offset(storage()))); } template @@ -651,16 +660,16 @@ namespace sparrow template auto variable_size_binary_layout::has_value(size_type i) -> bitmap_reference { - SPARROW_ASSERT_TRUE(sparrow::offset(storage()) >= 0); - const size_type pos = i + static_cast(sparrow::offset(storage())); + SPARROW_ASSERT_TRUE(is_valid_arrow_length(sparrow::offset(storage()))); + const size_type pos = sum_arrow_offsets(i, sparrow::offset(storage())); return sparrow::bitmap(storage())[pos]; } template auto variable_size_binary_layout::has_value(size_type i) const -> bitmap_const_reference { - SPARROW_ASSERT_TRUE(sparrow::offset(storage()) >= 0); - const size_type pos = i + static_cast(sparrow::offset(storage())); + SPARROW_ASSERT_TRUE(is_valid_arrow_length(sparrow::offset(storage()))); + const size_type pos = sum_arrow_offsets(i, sparrow::offset(storage())); return sparrow::bitmap(storage())[pos]; } @@ -673,12 +682,12 @@ namespace sparrow template auto variable_size_binary_layout::value(size_type i) const -> inner_const_reference { - const long long offset_i = *offset(i); - SPARROW_ASSERT_TRUE(offset_i >= 0); - const long long offset_next = *offset(i + 1); - SPARROW_ASSERT_TRUE(offset_next >= 0); - const const_data_iterator pointer1 = data(static_cast(offset_i)); - const const_data_iterator pointer2 = data(static_cast(offset_next)); + const auto offset_i = *offset(i); + SPARROW_ASSERT_TRUE(is_valid_arrow_length(offset_i)); + const auto offset_next = *offset(i + 1); + SPARROW_ASSERT_TRUE(is_valid_arrow_length(offset_next)); + const const_data_iterator pointer1 = data(to_native_size(offset_i)); + const const_data_iterator pointer2 = data(to_native_size(offset_next)); return inner_const_reference(pointer1, pointer2); } @@ -686,14 +695,14 @@ namespace sparrow auto variable_size_binary_layout::offset(size_type i) -> offset_iterator { SPARROW_ASSERT_FALSE(buffers_size(storage()) == 0u); - return buffer_at(storage(), 0u).template data() + sparrow::offset(storage()) + i; + return buffer_at(storage(), 0u).template data() + sum_arrow_offsets(sparrow::offset(storage()), i); } template auto variable_size_binary_layout::offset_end() -> offset_iterator { SPARROW_ASSERT_FALSE(buffers_size(storage()) == 0u); - return buffers_at(storage(), 0u).template data() + length(storage()); + return buffers_at(storage(), 0u).template data() + to_native_size(length(storage())); } template @@ -707,14 +716,14 @@ namespace sparrow auto variable_size_binary_layout::offset(size_type i) const -> const_offset_iterator { SPARROW_ASSERT_FALSE(buffers_size(storage()) == 0u); - return buffer_at(storage(), 0u).template data() + sparrow::offset(storage()) + i; + return buffer_at(storage(), 0u).template data() + sum_arrow_offsets(sparrow::offset(storage()), i); } template auto variable_size_binary_layout::offset_end() const -> const_offset_iterator { SPARROW_ASSERT_FALSE(buffers_size(storage()) == 0u); - return buffer_at(storage(), 0u).template data() + length(storage()); + return buffer_at(storage(), 0u).template data() + to_native_size(length(storage())); } template diff --git a/include/sparrow/utils/bit.hpp b/include/sparrow/utils/bit.hpp index d494363a5..876bcd007 100644 --- a/include/sparrow/utils/bit.hpp +++ b/include/sparrow/utils/bit.hpp @@ -36,7 +36,7 @@ namespace sparrow std::ranges::reverse(value_representation); return std::bit_cast(value_representation); } - + template constexpr auto to_native_endian(std::integral auto value) noexcept { diff --git a/include/sparrow/utils/nullable.hpp b/include/sparrow/utils/nullable.hpp index 4995cf0fe..18fbac9a9 100644 --- a/include/sparrow/utils/nullable.hpp +++ b/include/sparrow/utils/nullable.hpp @@ -236,7 +236,7 @@ namespace sparrow * private: * std::vector m_values; * std::vector m_flags; - * + * * public: * * using reference = nullable; @@ -287,21 +287,21 @@ namespace sparrow using flag_const_reference = typename flag_traits::const_reference; using flag_rvalue_reference = typename flag_traits::rvalue_reference; using flag_const_rvalue_reference = typename flag_traits::const_rvalue_reference; - + template constexpr nullable() noexcept : m_value() , m_null_flag(false) { } - + template constexpr nullable(nullval_t) noexcept : m_value() , m_null_flag(false) { } - + template requires ( not std::same_as> and @@ -436,7 +436,7 @@ namespace sparrow constexpr const_reference get() const & noexcept; constexpr rvalue_reference get() && noexcept; constexpr const_rvalue_reference get() const && noexcept; - + constexpr reference value() &; constexpr const_reference value() const &; constexpr rvalue_reference value() &&; @@ -470,7 +470,7 @@ namespace sparrow template constexpr std::strong_ordering operator<=>(const nullable& lhs, nullval_t) noexcept; - + template constexpr bool operator==(const nullable& lhs, const U& rhs) noexcept; @@ -490,7 +490,7 @@ namespace sparrow // to their argument, making the deduction impossible. template constexpr nullable make_nullable(T&& value, B&& flag = true); - + /** * variant of nullable, exposing has_value for convenience * @@ -536,13 +536,13 @@ namespace sparrow { return m_null_flag; } - + template constexpr auto nullable::null_flag() const & noexcept -> flag_const_reference { return m_null_flag; } - + template constexpr auto nullable::null_flag() && noexcept -> flag_rvalue_reference { @@ -555,7 +555,7 @@ namespace sparrow return flag_rvalue_reference(m_null_flag); } } - + template constexpr auto nullable::null_flag() const && noexcept -> flag_const_rvalue_reference { @@ -580,7 +580,7 @@ namespace sparrow { return m_value; } - + template constexpr auto nullable::get() && noexcept -> rvalue_reference { @@ -620,7 +620,7 @@ namespace sparrow throw_if_null(); return get(); } - + template constexpr auto nullable::value() && -> rvalue_reference { @@ -639,14 +639,14 @@ namespace sparrow template constexpr auto nullable::value_or(U&& default_value) const & -> value_type { - return *this ? get() : value_type(std::forward(default_value)); + return *this ? get() : value_type(std::forward(default_value)); } template template constexpr auto nullable::value_or(U&& default_value) && -> value_type { - return *this ? get() : value_type(std::forward(default_value)); + return *this ? get() : value_type(std::forward(default_value)); } template @@ -662,7 +662,7 @@ namespace sparrow { m_null_flag = false; } - + template void nullable::throw_if_null() const { @@ -689,7 +689,7 @@ namespace sparrow { return lhs <=> false; } - + template constexpr bool operator==(const nullable& lhs, const U& rhs) noexcept { @@ -744,7 +744,7 @@ namespace sparrow base_type::operator=(std::move(rhs)); return *this; } - + template requires (is_nullable_v && ...) constexpr nullable_variant::operator bool() const diff --git a/test/test_fixed_size_layout.cpp b/test/test_fixed_size_layout.cpp index d12ab6787..e4965b849 100644 --- a/test/test_fixed_size_layout.cpp +++ b/test/test_fixed_size_layout.cpp @@ -51,7 +51,7 @@ namespace sparrow auto buffer_data = ad.buffers[0].data(); for (std::size_t i = 0; i < lt.size(); ++i) { - CHECK_EQ(lt[i].value(), buffer_data[i + static_cast(ad.offset)]); + CHECK_EQ(lt[i].value(), buffer_data[ sum_arrow_offsets(i, ad.offset) ]); } } @@ -65,7 +65,7 @@ namespace sparrow auto buffer_data = ad2.buffers[0].data(); for (std::size_t i = 0; i < lt.size(); ++i) { - CHECK_EQ(lt[i].value(), buffer_data[i + static_cast(ad2.offset)]); + CHECK_EQ(lt[i].value(), buffer_data[ sum_arrow_offsets(i, ad2.offset) ]); } } diff --git a/test/test_list_layout.cpp b/test/test_list_layout.cpp index db082d5be..daebf9fdc 100644 --- a/test/test_list_layout.cpp +++ b/test/test_list_layout.cpp @@ -29,21 +29,21 @@ namespace sparrow { - + TEST_SUITE("list_layout") { // this test will be invoked for each **scalar** type. - // Since this test uses a "fixed_size_layout" as the inner layout, + // Since this test uses a "fixed_size_layout" as the inner layout, // it will **not** work for non-scalar types (ie std::string) TEST_CASE_TEMPLATE_DEFINE("generic-scalar-test", T, generic_scalar_test) { SUBCASE("list") - { + { auto d = test::iota_vector(11); std::vector> values = { - {d[0],d[1], d[2], d[3]}, - {d[4], d[5]}, + {d[0],d[1], d[2], d[3]}, + {d[4], d[5]}, {d[6], d[7], d[8], d[9], d[10]} }; @@ -77,38 +77,38 @@ TEST_SUITE("list_layout") { test::layout_tester(list_layout); } - + } SUBCASE("list>") { auto d = test::iota_vector(28); std::vector>> values = { { - {d[0],d[1], d[2], d[3]}, + {d[0],d[1], d[2], d[3]}, {d[4], d[5], d[6]}, {d[7], d[8], d[9], d[10]} }, { - {d[11],d[12], d[13], d[14]}, - {d[15], d[16]}, + {d[11],d[12], d[13], d[14]}, + {d[15], d[16]}, {d[17], d[18], d[19], d[20], d[21]} }, { - {d[22],d[23], d[24], d[25]}, + {d[22],d[23], d[24], d[25]}, {d[26], d[27]} } }; auto outer_list_array_data = test::make_array_data_for_list_of_list_of_scalars(values); - + using data_storage = sparrow::array_data; using inner_layout_type = sparrow::fixed_size_layout; using inner_list_layout_type = sparrow::list_layout; using outer_list_layout_type = sparrow::list_layout; - + outer_list_layout_type outer_list_layout(outer_list_array_data); REQUIRE_EQ(outer_list_layout.size(), values.size()); - + SUBCASE("operator[]") { for(std::size_t i = 0; i < values.size(); i++) From e736790b43a275fd939ab18b4ac7e0c92d804412 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Klaim=20=28Jo=C3=ABl=20Lamotte=29?= <142265+Klaim@users.noreply.github.com> Date: Tue, 24 Sep 2024 11:37:22 +0200 Subject: [PATCH 3/9] Workaround warning cast into same type --- include/sparrow/array/data_type.hpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/include/sparrow/array/data_type.hpp b/include/sparrow/array/data_type.hpp index 60744c856..e6a366dae 100644 --- a/include/sparrow/array/data_type.hpp +++ b/include/sparrow/array/data_type.hpp @@ -194,6 +194,12 @@ namespace sparrow } } + +#if defined(__GNUC__) && !defined(__clang__) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wuseless-cast" // We want to be able to cast type aliases to their real types. +#endif + /// @returns The provided arrow length value as represented by the native standard size type `std::size_t`. /// If `config::enable_size_limit_runtime_check == true` it will also check that the value is /// a valid arrow length and representable by `std::size_t` or throws otherwise. @@ -243,6 +249,10 @@ namespace sparrow throw_if_invalid_size(result, false); // dont allow negatives as the result must be a size return static_cast(result); } +#if defined(__GNUC__) && !defined(__clang__) +# pragma GCC diagnostic pop +#endif + /// Runtime identifier of arrow data types, usually associated with raw bytes with the associated value. // TODO: does not support all types specified by the Arrow specification From 8f784043d66bffc230e18b2699dcf690a1328fa7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=ABl=20Lamotte=20=28Klaim=29?= Date: Tue, 24 Sep 2024 18:24:15 +0200 Subject: [PATCH 4/9] fixup artifact name --- .github/workflows/windows.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index 04c83cd83..85d53443b 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -159,7 +159,7 @@ jobs: uses: actions/upload-artifact@v4 if: matrix.toolchain.build_refactoring == 'ON' && (success() || failure()) with: - name: test_sparrow_lib_report_Windows_v01_${{ matrix.toolchain.compiler }}_${{ matrix.target-arch.name }}_${{ matrix.build-system }}_${{ matrix.config.name }}_date-polyfill_${{ matrix.toolchain.date-polyfill}}_refactoring_${{ matrix.toolchain.build_refactoring }} + name: test_sparrow_lib_report_Windows_v01_${{ matrix.toolchain.compiler }}_${{ matrix.target-arch.name }}_${{ matrix.build-system.name }}_${{ matrix.config.name }}_date-polyfill_${{ matrix.toolchain.date-polyfill}}_refactoring_${{ matrix.toolchain.build_refactoring }} path: '**/test_sparrow_lib_report_v01.xml' - name: Run all examples From a2a8e9e28ae8bce8a4cc44f54c30b39cf479e45c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Klaim=20=28Jo=C3=ABl=20Lamotte=29?= <142265+Klaim@users.noreply.github.com> Date: Wed, 25 Sep 2024 17:33:59 +0200 Subject: [PATCH 5/9] Improved clarify on why we need to have checks that also allow negative sizes (offsets) --- include/sparrow/array/data_type.hpp | 69 ++++++++++++++++------------- 1 file changed, 37 insertions(+), 32 deletions(-) diff --git a/include/sparrow/array/data_type.hpp b/include/sparrow/array/data_type.hpp index e6a366dae..c76aa8018 100644 --- a/include/sparrow/array/data_type.hpp +++ b/include/sparrow/array/data_type.hpp @@ -120,23 +120,41 @@ namespace sparrow return true; } - + /// Type representing an Arrow length (sizes, offsets) in the Arrow specification and storage. + /// This is used internally but is not always directly convertible to `std::size_t` and `std::ptrdiff_t` + /// which are the "native" types to represent sizes and offsets. + /// For conversion purposes: + /// - @see `to_native_size()` + /// - @see `to_native_offset()` + /// - @see `to_arrow_length()` + /// - @see `sum_arrow_offsets()` using arrow_length = std::int64_t; - /// Maximum size allowed for arrow sizes. - /// This can be constrained to 32bit signed sizes using configuration options. + /// Clarifies if a length value is supposed to be a size/length or an offset. + /// Offsets can be negative, sizes cannot. This is only important for runtime checks + /// and should only be used when calling functions that do runtime checks on sizes + /// and offsets values validity. + enum class arrow_length_kind + { + size, offset + }; + + /// Maximum size allowed for arrow lengths. + /// This can be constrained to 32bit signed values using configuration options. /// See: https://arrow.apache.org/docs/format/Columnar.html#array-lengths static constexpr arrow_length max_arrow_length = config::enable_32bit_size_limit ? std::numeric_limits::max() : std::numeric_limits::max(); - /// @returns True if the provided value is between zero and max_arrow_length. + /// @returns True if the provided value is in a valid range for an arrow size. + /// By default the range is zero to `max_arrow_length`, but if it is specified + /// that the value is an offset, we allow negatives values too. template inline constexpr - bool is_valid_arrow_length(T size_or_offset, bool allow_negatives = false) noexcept + bool is_valid_arrow_length(T size_or_offset, arrow_length_kind kind = arrow_length_kind::size) noexcept { return std::in_range(size_or_offset) - and (allow_negatives or size_or_offset >= T(0)) + and (kind == arrow_length_kind::offset or size_or_offset >= T(0)) and size_or_offset <= max_arrow_length ; } @@ -148,46 +166,31 @@ namespace sparrow /// otherwise this function is no-op. template inline constexpr - void throw_if_invalid_size(std::integral auto size_or_offset, bool allow_negatives = false) + void throw_if_invalid_size(std::integral auto size_or_offset, + arrow_length_kind kind = arrow_length_kind::size) { if constexpr (config::enable_size_limit_runtime_check) { // checks that the value is in range of arrow length in general - if (not is_valid_arrow_length(size_or_offset, allow_negatives)) + if (not is_valid_arrow_length(size_or_offset, kind)) { throw std::runtime_error( // TODO: replace by sparrow-specific error + // TODO: use std::format instead once available std::string("size/length/offset is outside the valid arrow length limits [0:") + std::to_string(max_arrow_length) + "] : " + std::to_string(size_or_offset) + "(" + typeid(size_or_offset).name() + ") " - /* - TODO: when possible, replace by - std::format( - "size/length/offset is outside the valid arrow length limits [0:{}] : {} ({})", - max_arrow_length, - size_or_offset, - typeid(size_or_offset).name() - )*/ ); } - if constexpr (not std::same_as) // Already checked in `is_valid_size()` - // otherwise + if constexpr (not std::same_as) // Already checked in `is_valid_size()` otherwise { // check that the value is representable by the specified type TargetRepr if (not std::in_range(size_or_offset)) { throw std::runtime_error( // TODO: replace by sparrow-specific error + // TODO: use std::format instead once available std::string("size/length/offset cannot be represented by ") + typeid(TargetRepr).name() + ": " + std::to_string(size_or_offset) + " (" + typeid(size_or_offset).name() + ")" - /* - TODO: when possible, replace by - - std::format( - "size/length/offset cannot be represented by {} : {} ({})", - typeid(TargetRepr).name(), - size_or_offset, - typeid(size_or_offset).name() - )*/ ); } } @@ -218,7 +221,7 @@ namespace sparrow inline constexpr auto to_native_offset(arrow_length offset) -> std::ptrdiff_t { - throw_if_invalid_size(offset, true); + throw_if_invalid_size(offset, arrow_length_kind::offset); return static_cast(offset); } @@ -227,9 +230,11 @@ namespace sparrow /// a valid arrow length and representable in `int64_t` or throws otherwise. /// @throws @see `throw_if_invalid_size()` for details. inline constexpr - auto to_arrow_length(std::integral auto size_or_offset, bool allow_negatives = false) -> arrow_length + auto + to_arrow_length(std::integral auto size_or_offset, arrow_length_kind kind = arrow_length_kind::size) + -> arrow_length { - throw_if_invalid_size(size_or_offset, allow_negatives); + throw_if_invalid_size(size_or_offset, kind); return static_cast(size_or_offset); } @@ -245,8 +250,8 @@ namespace sparrow // We cast every offset as an arrow length, then sum them as arrow length representation, // and finally cast back to the expected result type after verifying that // the resulting value is still a valid arrow length. - auto result = (to_arrow_length(offsets, true) + ...); - throw_if_invalid_size(result, false); // dont allow negatives as the result must be a size + auto result = (to_arrow_length(offsets, arrow_length_kind::offset) + ...); + throw_if_invalid_size(result, arrow_length_kind::size); // dont allow negatives as the result must be a size return static_cast(result); } #if defined(__GNUC__) && !defined(__clang__) From 562238d083ce0795c4b3f1856cc49bf0d3432714 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Klaim=20=28Jo=C3=ABl=20Lamotte=29?= <142265+Klaim@users.noreply.github.com> Date: Wed, 25 Sep 2024 17:52:12 +0200 Subject: [PATCH 6/9] complete usage of length conversion functions in more recent code --- include/sparrow/arrow_interface/arrow_array.hpp | 12 ++++++------ include/sparrow/arrow_interface/arrow_schema.hpp | 5 +++-- include/sparrow/buffer/buffer.hpp | 2 +- test/array_data_creation.cpp | 4 ++-- test/test_arrow_array.cpp | 6 ++++-- test/test_dictionary_encoded_layout.cpp | 4 ++-- 6 files changed, 18 insertions(+), 15 deletions(-) diff --git a/include/sparrow/arrow_interface/arrow_array.hpp b/include/sparrow/arrow_interface/arrow_array.hpp index 2a59114c6..f6fe7472b 100644 --- a/include/sparrow/arrow_interface/arrow_array.hpp +++ b/include/sparrow/arrow_interface/arrow_array.hpp @@ -168,7 +168,7 @@ namespace sparrow array.private_data = new arrow_array_private_data(std::move(buffers)); const auto private_data = static_cast(array.private_data); array.buffers = private_data->buffers_ptrs(); - array.n_children = static_cast(n_children); + array.n_children = to_arrow_length(n_children); array.children = children; array.dictionary = dictionary; array.release = release_arrow_array; @@ -276,7 +276,7 @@ namespace sparrow get_arrow_array_buffers(const ArrowArray& array, const ArrowSchema& schema) { std::vector> buffers; - const auto buffer_count = static_cast(array.n_buffers); + const auto buffer_count = to_native_size(array.n_buffers); buffers.reserve(buffer_count); const enum data_type data_type = format_to_data_type(schema.format); const std::vector buffers_type = get_buffer_types_from_data_type(data_type); @@ -286,8 +286,8 @@ namespace sparrow auto buffer = array.buffers[i]; const std::size_t buffer_size = compute_buffer_size( buffer_type, - static_cast(array.length), - static_cast(array.offset), + to_native_size(array.length), + to_native_size(array.offset), data_type ); auto* ptr = static_cast(const_cast(buffer)); @@ -310,7 +310,7 @@ namespace sparrow target.n_children = source_array.n_children; if (source_array.n_children > 0) { - target.children = new ArrowArray*[static_cast(source_array.n_children)]; + target.children = new ArrowArray*[ to_native_size(source_array.n_children) ]; for (int64_t i = 0; i < source_array.n_children; ++i) { SPARROW_ASSERT_TRUE(source_array.children[i] != nullptr); @@ -331,7 +331,7 @@ namespace sparrow target.n_buffers = source_array.n_buffers; std::vector> buffers_copy; - buffers_copy.reserve(static_cast(source_array.n_buffers)); + buffers_copy.reserve(to_native_size(source_array.n_buffers)); const auto buffers = get_arrow_array_buffers(source_array, source_schema); for (const auto& buffer : buffers) { diff --git a/include/sparrow/arrow_interface/arrow_schema.hpp b/include/sparrow/arrow_interface/arrow_schema.hpp index 38eaea95a..384dcc424 100644 --- a/include/sparrow/arrow_interface/arrow_schema.hpp +++ b/include/sparrow/arrow_interface/arrow_schema.hpp @@ -16,6 +16,7 @@ #include +#include "sparrow/array/data_type.hpp" #include "sparrow/arrow_interface/arrow_array_schema_utils.hpp" #include "sparrow/arrow_interface/arrow_schema/private_data.hpp" #include "sparrow/arrow_interface/arrow_schema/smart_pointers.hpp" @@ -151,7 +152,7 @@ namespace sparrow schema.dictionary = dictionary; schema.release = release_arrow_schema; } - + inline void release_arrow_schema(ArrowSchema* schema) { @@ -250,7 +251,7 @@ namespace sparrow target.n_children = source.n_children; if (source.n_children > 0) { - target.children = new ArrowSchema*[static_cast(source.n_children)]; + target.children = new ArrowSchema*[ to_native_size(source.n_children) ]; for (int64_t i = 0; i < source.n_children; ++i) { SPARROW_ASSERT_TRUE(source.children[i] != nullptr); diff --git a/include/sparrow/buffer/buffer.hpp b/include/sparrow/buffer/buffer.hpp index 4246fcdb5..4bc737f7e 100644 --- a/include/sparrow/buffer/buffer.hpp +++ b/include/sparrow/buffer/buffer.hpp @@ -335,7 +335,7 @@ namespace sparrow template buffer_base::~buffer_base() { - deallocate(m_data.p_begin, static_cast(m_data.p_storage_end - m_data.p_begin)); + deallocate(m_data.p_begin, static_cast(std::distance(m_data.p_begin, m_data.p_storage_end))); } template diff --git a/test/array_data_creation.cpp b/test/array_data_creation.cpp index 4ada19471..8fe4a3261 100644 --- a/test/array_data_creation.cpp +++ b/test/array_data_creation.cpp @@ -58,8 +58,8 @@ namespace sparrow::test } ad.buffers.push_back(b); - ad.length = static_cast(n); - ad.offset = static_cast(offset); + ad.length = to_arrow_length(n); + ad.offset = to_arrow_length(offset); ad.child_data.emplace_back(); return ad; } diff --git a/test/test_arrow_array.cpp b/test/test_arrow_array.cpp index 6a23fe1ff..2e6357a32 100644 --- a/test/test_arrow_array.cpp +++ b/test/test_arrow_array.cpp @@ -291,14 +291,16 @@ TEST_SUITE("C Data Interface") CHECK_EQ(array.n_children , array_copy.n_children); CHECK_NE(array.buffers , array_copy.buffers); CHECK_NE(array.private_data , array_copy.private_data); - for(size_t i = 0; i < static_cast(array.n_buffers); ++i) + const auto buffer_count = sparrow::to_native_size(array.n_buffers); + for (size_t i = 0; i < buffer_count; ++i) { CHECK_NE(array.buffers[i] , array_copy.buffers[i]); } auto array_buffers = reinterpret_cast(array.buffers); auto array_copy_buffers = reinterpret_cast(array_copy.buffers); - for(size_t i = 0; i < static_cast(array.length); ++i) + const auto length = sparrow::to_native_size(array.length); + for(size_t i = 0; i < length; ++i) { CHECK_EQ(array_buffers[1][i], array_copy_buffers[1][i]); } diff --git a/test/test_dictionary_encoded_layout.cpp b/test/test_dictionary_encoded_layout.cpp index 983e4b688..7749851a8 100644 --- a/test/test_dictionary_encoded_layout.cpp +++ b/test/test_dictionary_encoded_layout.cpp @@ -70,14 +70,14 @@ namespace sparrow for (size_t i = 0; i < lwords.size(); ++i) { - offset()[i + 1] = offset()[i] + static_cast(lwords[i].size()); + offset()[i + 1] = sum_arrow_offsets(offset()[i], lwords[i].size()); std::ranges::copy(lwords[i], iter); iter += static_cast(lwords[i].size()); dictionary.bitmap.set(i, true); } dictionary.bitmap.set(4, false); - dictionary.length = static_cast(lwords.size()); + dictionary.length = to_arrow_length(lwords.size()); dictionary.offset = 0; return dictionary; } From 949907c5a6ea4e61564be9ae8f5510480ff5785e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Klaim=20=28Jo=C3=ABl=20Lamotte=29?= <142265+Klaim@users.noreply.github.com> Date: Wed, 25 Sep 2024 17:58:17 +0200 Subject: [PATCH 7/9] Attempt to enable runtime size checks in a few configurations --- .github/workflows/windows.yml | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index 85d53443b..17cc474ac 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -118,8 +118,18 @@ jobs: echo "VCPKG_TARGET_TRIPLET='x86-windows'" >> $GITHUB_ENV echo "CMAKE_TOOLCHAIN_FILE=$VCPKG_INSTALLATION_ROOT/scripts/buildsystems/vcpkg.cmake" >> $GITHUB_ENV + - name: Enable runtime size/length/offset validity checks + if: matrix.config.name == 'Debug' + run: | + echo "SPARROW_CHECKS='$SPARROW_CHECKS -DSPAROW_ENABLE_SIZE_CHECKS=ON'" >> $GITHUB_ENV + + - name: Enable runtime 32bit size/length/offset limit + if: matrix.target-arch.name == 'Win32' # Note: this is not a mandatory limitation in any context, we add this just to have at least one configuration testing that feature + run: | + echo "SPARROW_CHECKS='$SPARROW_CHECKS -DSPAROW_ENABLE_32BIT_SIZE_LIMIT=ON'" >> $GITHUB_ENV + - name: Configure using CMake - run: cmake -S ./ -B ./build -DCMAKE_BUILD_TYPE:STRING=${{matrix.config.name}} -DCMAKE_PREFIX_PATH=$SPARROW_DEPS_PREFIX -DCMAKE_INSTALL_PREFIX=$SPARROW_INSTALL_PREFIX -DBUILD_TESTS=ON -DBUILD_EXAMPLES=ON -DUSE_DATE_POLYFILL=${{matrix.toolchain.date-polyfill}} -DBUILD_REFACTORING=${{matrix.toolchain.build_refactoring}} -G "${{matrix.build-system.name}}" $GENERATOR_EXTRA_FLAGS + run: cmake -S ./ -B ./build -DCMAKE_BUILD_TYPE:STRING=${{matrix.config.name}} -DCMAKE_PREFIX_PATH=$SPARROW_DEPS_PREFIX -DCMAKE_INSTALL_PREFIX=$SPARROW_INSTALL_PREFIX -DBUILD_TESTS=ON -DBUILD_EXAMPLES=ON -DUSE_DATE_POLYFILL=${{matrix.toolchain.date-polyfill}} -DBUILD_REFACTORING=${{matrix.toolchain.build_refactoring}} -G "${{matrix.build-system.name}}" $GENERATOR_EXTRA_FLAGS $SPARROW_CHECKS - name: Build if: matrix.toolchain.build_refactoring == 'ON' From 4f81943cc35b21615f30b6d7fb4774a87b7f0a40 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Klaim=20=28Jo=C3=ABl=20Lamotte=29?= <142265+Klaim@users.noreply.github.com> Date: Wed, 2 Oct 2024 15:00:10 +0200 Subject: [PATCH 8/9] tests: is_valid_arrow_length --- test/CMakeLists.txt | 3 +- test/test_arrow_length.cpp | 72 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 74 insertions(+), 1 deletion(-) create mode 100644 test/test_arrow_length.cpp diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 8825d16a6..67d5f1f16 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -47,6 +47,7 @@ set(SPARROW_TESTS_SOURCES test_arrow_array_schema_utils.cpp test_arrow_array.cpp test_arrow_schema.cpp + test_arrow_length.cpp test_bit.cpp test_buffer_adaptor.cpp test_buffer.cpp @@ -75,7 +76,7 @@ set(SPARROW_TESTS_SOURCES set(test_target "test_sparrow_lib") add_executable(${test_target} ${SPARROW_TESTS_SOURCES}) target_link_libraries(${test_target} - PRIVATE + PRIVATE sparrow doctest::doctest $<$:${SANITIZER_LINK_LIBRARIES}>) include(doctest) diff --git a/test/test_arrow_length.cpp b/test/test_arrow_length.cpp new file mode 100644 index 000000000..9419d54d5 --- /dev/null +++ b/test/test_arrow_length.cpp @@ -0,0 +1,72 @@ +// Copyright 2024 Man Group Operations Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include "doctest/doctest.h" + +#include "sparrow/array/data_type.hpp" + + + +TEST_SUITE("value_ptr") +{ + TEST_CASE("is_valid_arrow_length") + { + using namespace sparrow; + CHECK(is_valid_arrow_length(0)); + CHECK(is_valid_arrow_length(1)); + CHECK(is_valid_arrow_length(1024)); + CHECK(is_valid_arrow_length(std::numeric_limits::max())); + CHECK(is_valid_arrow_length(max_arrow_length)); + + CHECK_FALSE(is_valid_arrow_length(-1)); + CHECK_FALSE(is_valid_arrow_length(-1024)); + CHECK_FALSE(is_valid_arrow_length(std::numeric_limits::min())); + CHECK(is_valid_arrow_length(-1, arrow_length_kind::offset)); + CHECK(is_valid_arrow_length(-1024, arrow_length_kind::offset)); + CHECK(is_valid_arrow_length(std::numeric_limits::min(), arrow_length_kind::offset)); + + CHECK(is_valid_arrow_length(std::size_t{0})); + CHECK(is_valid_arrow_length(std::size_t{1})); + CHECK(is_valid_arrow_length(std::ptrdiff_t{0})); + CHECK(is_valid_arrow_length(std::ptrdiff_t{1})); + CHECK(is_valid_arrow_length(std::ptrdiff_t{-1}, arrow_length_kind::offset)); + + // When not contrained to 32bit length and native offsets can represent less or equal values of arrow lengh, + // we check that the native offset types are usable as expected as arrow length. + if constexpr( sizeof(std::ptrdiff_t) <= max_arrow_length and config::enable_32bit_size_limit == false ) + { + CHECK(is_valid_arrow_length(std::numeric_limits::max())); + CHECK(is_valid_arrow_length(std::numeric_limits::max(), arrow_length_kind::offset)); + CHECK(is_valid_arrow_length(std::numeric_limits::min(), arrow_length_kind::offset)); + } + + // We always support at least 32bit lengths + CHECK(is_valid_arrow_length(std::size_t(std::numeric_limits::max()))); + CHECK(is_valid_arrow_length(std::ptrdiff_t(std::numeric_limits::max()))); + CHECK(is_valid_arrow_length( + std::ptrdiff_t(std::numeric_limits::max()), + arrow_length_kind::offset + )); + CHECK(is_valid_arrow_length( + std::ptrdiff_t(std::numeric_limits::min()), + arrow_length_kind::offset + )); + } + + + +} From b7b933c52b275e480459641fedd5f33d380063ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Klaim=20=28Jo=C3=ABl=20Lamotte=29?= <142265+Klaim@users.noreply.github.com> Date: Thu, 3 Oct 2024 15:08:08 +0200 Subject: [PATCH 9/9] tests: throw_if_invalid_size --- test/test_arrow_length.cpp | 42 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/test/test_arrow_length.cpp b/test/test_arrow_length.cpp index 9419d54d5..55b0d9fdc 100644 --- a/test/test_arrow_length.cpp +++ b/test/test_arrow_length.cpp @@ -67,6 +67,48 @@ TEST_SUITE("value_ptr") )); } + TEST_CASE("throw_if_invalid_size") + { + using namespace sparrow; + throw_if_invalid_size(0); + throw_if_invalid_size(1); + throw_if_invalid_size(1024); + throw_if_invalid_size(std::numeric_limits::max()); + throw_if_invalid_size(max_arrow_length); + + CHECK_THROWS(throw_if_invalid_size(-1)); + CHECK_THROWS(throw_if_invalid_size(-1024)); + CHECK_THROWS(throw_if_invalid_size(std::numeric_limits::min())); + throw_if_invalid_size(-1, arrow_length_kind::offset); + throw_if_invalid_size(-1024, arrow_length_kind::offset); + throw_if_invalid_size(std::numeric_limits::min(), arrow_length_kind::offset); + + throw_if_invalid_size(std::size_t{0}); + throw_if_invalid_size(std::size_t{1}); + throw_if_invalid_size(std::ptrdiff_t{0}); + throw_if_invalid_size(std::ptrdiff_t{1}); + throw_if_invalid_size(std::ptrdiff_t{-1}, arrow_length_kind::offset); + + // When not contrained to 32bit length and native offsets can represent less or equal values of arrow + // lengh, we check that the native offset types are usable as expected as arrow length. + if constexpr (sizeof(std::ptrdiff_t) <= max_arrow_length and config::enable_32bit_size_limit == false) + { + throw_if_invalid_size(std::numeric_limits::max()); + throw_if_invalid_size(std::numeric_limits::max(), arrow_length_kind::offset); + throw_if_invalid_size(std::numeric_limits::min(), arrow_length_kind::offset); + } + // We always support at least 32bit lengths + throw_if_invalid_size(std::size_t(std::numeric_limits::max())); + throw_if_invalid_size(std::ptrdiff_t(std::numeric_limits::max())); + throw_if_invalid_size( + std::ptrdiff_t(std::numeric_limits::max()), + arrow_length_kind::offset + ); + throw_if_invalid_size( + std::ptrdiff_t(std::numeric_limits::min()), + arrow_length_kind::offset + ); + } }