Skip to content

Commit

Permalink
Merge branch 'main' into parquet/support-write-bloom-filter
Browse files Browse the repository at this point in the history
  • Loading branch information
mapleFU committed Apr 26, 2024
2 parents de27ce4 + 299eb26 commit 259f15b
Show file tree
Hide file tree
Showing 73 changed files with 2,229 additions and 1,452 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/dev.yml
Original file line number Diff line number Diff line change
Expand Up @@ -109,9 +109,9 @@ jobs:
with:
python-version: '3.12'
- name: Install Ruby
uses: ruby/setup-ruby@250fcd6a742febb1123a77a841497ccaa8b9e939 # v1.152.0
uses: ruby/setup-ruby@v1
with:
ruby-version: '2.7'
ruby-version: ruby
- name: Install .NET
uses: actions/setup-dotnet@4d6c8fcf3c8f7a60068d26b594648e99df24cee3 # v4.0.0
with:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/matlab.yml
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ jobs:
select-by-folder: matlab/test
macos:
name: AMD64 macOS 12 MATLAB
runs-on: macos-latest
runs-on: macos-12
if: ${{ !contains(github.event.pull_request.title, 'WIP') }}
steps:
- name: Check out repository
Expand Down
14 changes: 11 additions & 3 deletions .github/workflows/python.yml
Original file line number Diff line number Diff line change
Expand Up @@ -128,12 +128,20 @@ jobs:
run: archery docker push ${{ matrix.image }}

macos:
name: AMD64 macOS 12 Python 3
runs-on: macos-latest
name: ${{ matrix.architecture }} macOS ${{ matrix.macos-version }} Python 3
runs-on: macos-${{ matrix.macos-version }}
if: ${{ !contains(github.event.pull_request.title, 'WIP') }}
timeout-minutes: 60
strategy:
fail-fast: false
matrix:
include:
- architecture: AMD64
macos-version: "12"
- architecture: ARM64
macos-version: "14"
env:
ARROW_HOME: /usr/local
ARROW_HOME: /tmp/local
ARROW_AZURE: ON
ARROW_DATASET: ON
ARROW_FLIGHT: ON
Expand Down
5 changes: 2 additions & 3 deletions .github/workflows/ruby.yml
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ jobs:
run: archery docker push ubuntu-ruby

macos:
name: AMD64 macOS 12 GLib & Ruby
name: AMD64 macOS 14 GLib & Ruby
runs-on: macos-latest
if: ${{ !contains(github.event.pull_request.title, 'WIP') }}
timeout-minutes: 60
Expand All @@ -132,7 +132,7 @@ jobs:
ARROW_GCS: ON
ARROW_GLIB_GTK_DOC: true
ARROW_GLIB_WERROR: true
ARROW_HOME: /usr/local
ARROW_HOME: /tmp/local
ARROW_JEMALLOC: OFF
ARROW_ORC: OFF
ARROW_PARQUET: ON
Expand All @@ -141,7 +141,6 @@ jobs:
ARROW_WITH_SNAPPY: ON
ARROW_WITH_ZLIB: ON
ARROW_WITH_ZSTD: ON
XML_CATALOG_FILES: /usr/local/etc/xml/catalog
steps:
- name: Checkout Arrow
uses: actions/checkout@v4
Expand Down
6 changes: 4 additions & 2 deletions c_glib/arrow-glib/scalar.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1063,7 +1063,8 @@ garrow_base_binary_scalar_get_value(GArrowBaseBinaryScalar *scalar)
if (!priv->value) {
const auto arrow_scalar = std::static_pointer_cast<arrow::BaseBinaryScalar>(
garrow_scalar_get_raw(GARROW_SCALAR(scalar)));
priv->value = garrow_buffer_new_raw(&(arrow_scalar->value));
priv->value = garrow_buffer_new_raw(
const_cast<std::shared_ptr<arrow::Buffer> *>(&(arrow_scalar->value)));
}
return priv->value;
}
Expand Down Expand Up @@ -1983,7 +1984,8 @@ garrow_base_list_scalar_get_value(GArrowBaseListScalar *scalar)
if (!priv->value) {
const auto arrow_scalar = std::static_pointer_cast<arrow::BaseListScalar>(
garrow_scalar_get_raw(GARROW_SCALAR(scalar)));
priv->value = garrow_array_new_raw(&(arrow_scalar->value));
priv->value = garrow_array_new_raw(
const_cast<std::shared_ptr<arrow::Array> *>(&(arrow_scalar->value)));
}
return priv->value;
}
Expand Down
1 change: 1 addition & 0 deletions ci/scripts/c_glib_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ build_dir=${2}/c_glib

: ${ARROW_GLIB_VAPI:=true}

export DYLD_LIBRARY_PATH=${ARROW_HOME}/lib:${DYLD_LIBRARY_PATH}
export LD_LIBRARY_PATH=${ARROW_HOME}/lib:${LD_LIBRARY_PATH}
export PKG_CONFIG_PATH=${ARROW_HOME}/lib/pkgconfig
export GI_TYPELIB_PATH=${ARROW_HOME}/lib/girepository-1.0
Expand Down
2 changes: 2 additions & 0 deletions ci/scripts/python_build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,8 @@ export PYARROW_WITH_SUBSTRAIT=${ARROW_SUBSTRAIT:-OFF}

export PYARROW_PARALLEL=${n_jobs}

: ${CMAKE_PREFIX_PATH:=${ARROW_HOME}}
export CMAKE_PREFIX_PATH
export LD_LIBRARY_PATH=${ARROW_HOME}/lib:${LD_LIBRARY_PATH}

pushd ${source_dir}
Expand Down
1 change: 1 addition & 0 deletions ci/scripts/ruby_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ set -ex
source_dir=${1}/ruby
build_dir=${2}/ruby

export DYLD_LIBRARY_PATH=${ARROW_HOME}/lib:${DYLD_LIBRARY_PATH}
export LD_LIBRARY_PATH=${ARROW_HOME}/lib:${LD_LIBRARY_PATH}
export PKG_CONFIG_PATH=${ARROW_HOME}/lib/pkgconfig
export GI_TYPELIB_PATH=${ARROW_HOME}/lib/girepository-1.0
Expand Down
5 changes: 1 addition & 4 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -256,9 +256,6 @@ if(ARROW_USE_CCACHE
endif()

if(ARROW_OPTIONAL_INSTALL)
# Don't make the "install" target depend on the "all" target
set(CMAKE_SKIP_INSTALL_ALL_DEPENDENCY true)

set(INSTALL_IS_OPTIONAL OPTIONAL)
endif()

Expand Down Expand Up @@ -711,7 +708,7 @@ list(APPEND ARROW_TEST_LINK_LIBS ${ARROW_GTEST_GMOCK} ${ARROW_GTEST_GTEST_MAIN})
if(ARROW_BUILD_BENCHMARKS)
set(ARROW_BENCHMARK_LINK_LIBS benchmark::benchmark_main ${ARROW_TEST_LINK_LIBS})
if(WIN32)
list(APPEND ARROW_BENCHMARK_LINK_LIBS Shlwapi.dll)
list(APPEND ARROW_BENCHMARK_LINK_LIBS shlwapi)
endif()
endif()

Expand Down
4 changes: 2 additions & 2 deletions cpp/examples/arrow/filesystem_definition_example.cc
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ class ExampleFileSystem : public fs::FileSystem {
}
};

fs::FileSystemRegistrar kExampleFileSystemModule{
auto kExampleFileSystemModule = ARROW_REGISTER_FILESYSTEM(
"example",
[](const arrow::util::Uri& uri, const io::IOContext& io_context,
std::string* out_path) -> Result<std::shared_ptr<fs::FileSystem>> {
Expand All @@ -148,4 +148,4 @@ fs::FileSystemRegistrar kExampleFileSystemModule{
}
return fs;
},
};
{});
8 changes: 4 additions & 4 deletions cpp/src/arrow/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ if(CMAKE_THREAD_LIBS_INIT)
endif()

if(WIN32)
list(APPEND ARROW_SYSTEM_LINK_LIBS "ws2_32.dll")
list(APPEND ARROW_SYSTEM_LINK_LIBS "ws2_32")
endif()

if(NOT WIN32 AND NOT APPLE)
Expand Down Expand Up @@ -628,9 +628,9 @@ else()
list(APPEND ARROW_TESTING_STATIC_INSTALL_INTERFACE_LIBS ArrowTesting::gtest)
endif()
if(WIN32)
list(APPEND ARROW_TESTING_SHARED_LINK_LIBS "ws2_32.dll")
list(APPEND ARROW_TESTING_STATIC_LINK_LIBS "ws2_32.dll")
list(APPEND ARROW_TESTING_STATIC_INSTALL_INTERFACE_LIBS "ws2_32.dll")
list(APPEND ARROW_TESTING_SHARED_LINK_LIBS "ws2_32")
list(APPEND ARROW_TESTING_STATIC_LINK_LIBS "ws2_32")
list(APPEND ARROW_TESTING_STATIC_INSTALL_INTERFACE_LIBS "ws2_32")
endif()

set(ARROW_TESTING_SRCS
Expand Down
19 changes: 19 additions & 0 deletions cpp/src/arrow/acero/aggregate_benchmark.cc
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
#include "arrow/util/benchmark_util.h"
#include "arrow/util/bit_util.h"
#include "arrow/util/bitmap_reader.h"
#include "arrow/util/byte_size.h"
#include "arrow/util/string.h"

namespace arrow {
Expand All @@ -50,6 +51,7 @@ namespace acero {
#include <random>

using arrow::internal::ToChars;
using arrow::util::TotalBufferSize;

#ifdef ARROW_WITH_BENCHMARKS_REFERENCE

Expand Down Expand Up @@ -371,9 +373,11 @@ static void BenchmarkGroupBy(benchmark::State& state, std::vector<Aggregate> agg
for (std::size_t arg_idx = 0; arg_idx < arguments.size(); arg_idx++) {
aggregates[arg_idx].target = {FieldRef(static_cast<int>(arg_idx))};
}
int64_t total_bytes = TotalBufferSize(*batch);
for (auto _ : state) {
ABORT_NOT_OK(BatchGroupBy(batch, aggregates, key_refs));
}
state.SetBytesProcessed(total_bytes * state.iterations());
}

#define GROUP_BY_BENCHMARK(Name, Impl) \
Expand Down Expand Up @@ -578,6 +582,8 @@ static void SumKernel(benchmark::State& state) {
for (auto _ : state) {
ABORT_NOT_OK(Sum(array).status());
}

state.SetItemsProcessed(state.iterations() * array_size);
}

static void SumKernelArgs(benchmark::internal::Benchmark* bench) {
Expand Down Expand Up @@ -611,6 +617,8 @@ void ModeKernel(benchmark::State& state, int min, int max) {
for (auto _ : state) {
ABORT_NOT_OK(Mode(array).status());
}

state.SetItemsProcessed(state.iterations() * array_size);
}

template <typename ArrowType>
Expand All @@ -625,13 +633,18 @@ void ModeKernelNarrow<Int8Type>(benchmark::State& state) {

template <>
void ModeKernelNarrow<BooleanType>(benchmark::State& state) {
using CType = typename TypeTraits<BooleanType>::CType;

RegressionArgs args(state);
const int64_t array_size = args.size / sizeof(CType);
auto rand = random::RandomArrayGenerator(1924);
auto array = rand.Boolean(args.size * 8, 0.5, args.null_proportion);

for (auto _ : state) {
ABORT_NOT_OK(Mode(array).status());
}

state.SetItemsProcessed(state.iterations() * array_size);
}

template <typename ArrowType>
Expand Down Expand Up @@ -668,6 +681,8 @@ static void MinMaxKernelBench(benchmark::State& state) {
for (auto _ : state) {
ABORT_NOT_OK(MinMax(array).status());
}

state.SetItemsProcessed(state.iterations() * array_size);
}

static void MinMaxKernelBenchArgs(benchmark::internal::Benchmark* bench) {
Expand Down Expand Up @@ -698,6 +713,8 @@ static void CountKernelBenchInt64(benchmark::State& state) {
for (auto _ : state) {
ABORT_NOT_OK(Count(array->Slice(1, array_size)).status());
}

state.SetItemsProcessed(state.iterations() * array_size);
}
BENCHMARK(CountKernelBenchInt64)->Args({1 * 1024 * 1024, 2}); // 1M with 50% null.

Expand All @@ -718,6 +735,8 @@ void VarianceKernelBench(benchmark::State& state) {
for (auto _ : state) {
ABORT_NOT_OK(Variance(array, options).status());
}

state.SetItemsProcessed(state.iterations() * array_size);
}

static void VarianceKernelBenchArgs(benchmark::internal::Benchmark* bench) {
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/arrow/acero/asof_join_benchmark.cc
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ static void TableJoinOverhead(benchmark::State& state,
ASSERT_OK(DeclarationToStatus(std::move(join_node), /*use_threads=*/false));
}

state.counters["input_rows_per_second"] = benchmark::Counter(
state.counters["rows_per_second"] = benchmark::Counter(
static_cast<double>(state.iterations() * (left_table_stats.rows + right_hand_rows)),
benchmark::Counter::kIsRate);

Expand Down
43 changes: 40 additions & 3 deletions cpp/src/arrow/array/array_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include <cmath>
#include <cstdint>
#include <cstring>
#include <future>
#include <limits>
#include <memory>
#include <numeric>
Expand Down Expand Up @@ -603,11 +604,11 @@ void AssertAppendScalar(MemoryPool* pool, const std::shared_ptr<Scalar>& scalar)
ASSERT_EQ(out->length(), 9);

auto out_type_id = out->type()->id();
const bool has_validity = internal::HasValidityBitmap(out_type_id);
const bool can_check_nulls = internal::may_have_validity_bitmap(out_type_id);
// For a dictionary builder, the output dictionary won't necessarily be the same
const bool can_check_values = !is_dictionary(out_type_id);

if (has_validity) {
if (can_check_nulls) {
ASSERT_EQ(out->null_count(), 4);
} else {
ASSERT_EQ(out->null_count(), 0);
Expand Down Expand Up @@ -823,6 +824,41 @@ TEST_F(TestArray, TestFillFromScalar) {
}
}

// GH-40069: Data-race when concurrent calling ArraySpan::FillFromScalar of the same
// scalar instance.
TEST_F(TestArray, TestConcurrentFillFromScalar) {
for (auto type : TestArrayUtilitiesAgainstTheseTypes()) {
ARROW_SCOPED_TRACE("type = ", type->ToString());
for (auto seed : {0u, 0xdeadbeef, 42u}) {
ARROW_SCOPED_TRACE("seed = ", seed);

Field field("", type, /*nullable=*/true,
key_value_metadata({{"extension_allow_random_storage", "true"}}));
auto array = random::GenerateArray(field, 1, seed);

ASSERT_OK_AND_ASSIGN(auto scalar, array->GetScalar(0));

// Lambda to create fill an ArraySpan with the scalar and use the ArraySpan a bit.
auto array_span_from_scalar = [&]() {
ArraySpan span(*scalar);
auto roundtripped_array = span.ToArray();
ASSERT_OK(roundtripped_array->ValidateFull());

AssertArraysEqual(*array, *roundtripped_array);
ASSERT_OK_AND_ASSIGN(auto roundtripped_scalar, roundtripped_array->GetScalar(0));
AssertScalarsEqual(*scalar, *roundtripped_scalar);
};

// Two concurrent calls to the lambda are just enough for TSAN to detect a race
// condition.
auto fut1 = std::async(std::launch::async, array_span_from_scalar);
auto fut2 = std::async(std::launch::async, array_span_from_scalar);
fut1.get();
fut2.get();
}
}
}

TEST_F(TestArray, ExtensionSpanRoundTrip) {
// Other types are checked in MakeEmptyArray but MakeEmptyArray doesn't
// work for extension types so we check that here
Expand Down Expand Up @@ -855,7 +891,8 @@ TEST_F(TestArray, TestAppendArraySlice) {
span.SetMembers(*nulls->data());
ASSERT_OK(builder->AppendArraySlice(span, 0, 4));
ASSERT_EQ(12, builder->length());
const bool has_validity_bitmap = internal::HasValidityBitmap(scalar->type->id());
const bool has_validity_bitmap =
internal::may_have_validity_bitmap(scalar->type->id());
if (has_validity_bitmap) {
ASSERT_EQ(4, builder->null_count());
}
Expand Down
18 changes: 7 additions & 11 deletions cpp/src/arrow/array/builder_nested.h
Original file line number Diff line number Diff line change
Expand Up @@ -181,13 +181,11 @@ class ARROW_EXPORT VarLengthListLikeBuilder : public ArrayBuilder {
if constexpr (is_list_view(TYPE::type_id)) {
sizes = array.GetValues<offset_type>(2);
}
const bool all_valid = !array.MayHaveLogicalNulls();
const uint8_t* validity = array.HasValidityBitmap() ? array.buffers[0].data : NULLPTR;
static_assert(internal::may_have_validity_bitmap(TYPE::type_id));
const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0].data : NULLPTR;
ARROW_RETURN_NOT_OK(Reserve(length));
for (int64_t row = offset; row < offset + length; row++) {
const bool is_valid =
all_valid || (validity && bit_util::GetBit(validity, array.offset + row)) ||
array.IsValid(row);
const bool is_valid = !validity || bit_util::GetBit(validity, array.offset + row);
int64_t size = 0;
if (is_valid) {
if constexpr (is_list_view(TYPE::type_id)) {
Expand Down Expand Up @@ -569,13 +567,11 @@ class ARROW_EXPORT MapBuilder : public ArrayBuilder {

Status AppendArraySlice(const ArraySpan& array, int64_t offset,
int64_t length) override {
const int32_t* offsets = array.GetValues<int32_t>(1);
const bool all_valid = !array.MayHaveLogicalNulls();
const uint8_t* validity = array.HasValidityBitmap() ? array.buffers[0].data : NULLPTR;
const auto* offsets = array.GetValues<int32_t>(1);
static_assert(internal::may_have_validity_bitmap(MapType::type_id));
const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0].data : NULLPTR;
for (int64_t row = offset; row < offset + length; row++) {
const bool is_valid =
all_valid || (validity && bit_util::GetBit(validity, array.offset + row)) ||
array.IsValid(row);
const bool is_valid = !validity || bit_util::GetBit(validity, array.offset + row);
if (is_valid) {
ARROW_RETURN_NOT_OK(Append());
const int64_t slot_length = offsets[row + 1] - offsets[row];
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/arrow/array/concatenate.cc
Original file line number Diff line number Diff line change
Expand Up @@ -317,7 +317,7 @@ class ConcatenateImpl {
}

Status Concatenate(std::shared_ptr<ArrayData>* out) && {
if (out_->null_count != 0 && internal::HasValidityBitmap(out_->type->id())) {
if (out_->null_count != 0 && internal::may_have_validity_bitmap(out_->type->id())) {
RETURN_NOT_OK(ConcatenateBitmaps(Bitmaps(0), pool_, &out_->buffers[0]));
}
RETURN_NOT_OK(VisitTypeInline(*out_->type, this));
Expand Down
Loading

0 comments on commit 259f15b

Please sign in to comment.