Commit 63963fe
Merge remote-tracking branch 'upstream/main' into apachegh-43519-cleanup-requirements
jorisvandenbossche committed Oct 10, 2024
2 parents 6eea81c + 8be5f9c commit 63963fe
Showing 979 changed files with 1,389 additions and 764,571 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/pr_review_trigger.yml
@@ -29,7 +29,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: "Upload PR review Payload"
uses: actions/upload-artifact@v4.4.0
uses: actions/upload-artifact@604373da6381bf24206979c74d06a550515601b9 # v4.4.1
with:
path: "${{ github.event_path }}"
name: "pr_review_payload"
6 changes: 3 additions & 3 deletions .github/workflows/r.yml
@@ -170,7 +170,7 @@ jobs:
if: always()
- name: Save the test output
if: always()
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0
uses: actions/upload-artifact@604373da6381bf24206979c74d06a550515601b9 # v4.4.1
with:
name: test-output-${{ matrix.ubuntu }}-${{ matrix.r }}
path: r/check/arrow.Rcheck/tests/testthat.Rout*
@@ -230,7 +230,7 @@ jobs:
if: always()
- name: Save the test output
if: always()
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0
uses: actions/upload-artifact@604373da6381bf24206979c74d06a550515601b9 # v4.4.1
with:
name: test-output-bundled
path: r/check/arrow.Rcheck/tests/testthat.Rout*
@@ -292,7 +292,7 @@ jobs:
# So that they're unique when multiple are downloaded in the next step
shell: bash
run: mv libarrow.zip libarrow-rtools${{ matrix.config.rtools }}-${{ matrix.config.arch }}.zip
- uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # # v4.0.0
- uses: actions/upload-artifact@604373da6381bf24206979c74d06a550515601b9 # v4.4.1
with:
name: libarrow-rtools${{ matrix.config.rtools }}-${{ matrix.config.arch }}.zip
path: libarrow-rtools${{ matrix.config.rtools }}-${{ matrix.config.arch }}.zip
2 changes: 1 addition & 1 deletion README.md
@@ -43,7 +43,7 @@ Major components of the project include:
- [C# .NET libraries](https://github.com/apache/arrow/tree/main/csharp)
- [Gandiva](https://github.com/apache/arrow/tree/main/cpp/src/gandiva):
an [LLVM](https://llvm.org)-based Arrow expression compiler, part of the C++ codebase
- [Go libraries](https://github.com/apache/arrow/tree/main/go)
- [Go libraries](https://github.com/apache/arrow-go)
- [Java libraries](https://github.com/apache/arrow/tree/main/java)
- [JavaScript libraries](https://github.com/apache/arrow/tree/main/js)
- [Python libraries](https://github.com/apache/arrow/tree/main/python)
2 changes: 1 addition & 1 deletion c_glib/arrow-glib/array-builder.cpp
@@ -6320,7 +6320,7 @@ garrow_union_array_builder_class_init(GArrowUnionArrayBuilderClass *klass)
* garrow_union_array_builder_append_child:
* @builder: A #GArrowUnionArrayBuilder.
* @child: A #GArrowArrayBuilder for new child.
* @filed_name: (nullable): A field name for new child.
* @field_name: (nullable): A field name for new child.
*
* Returns: The type ID for the appended child.
*
2 changes: 1 addition & 1 deletion c_glib/arrow-glib/array-builder.h
@@ -1820,7 +1820,7 @@ GARROW_AVAILABLE_IN_12_0
gint8
garrow_union_array_builder_append_child(GArrowUnionArrayBuilder *builder,
GArrowArrayBuilder *child,
const gchar *filed_name);
const gchar *field_name);

GARROW_AVAILABLE_IN_12_0
gboolean
9 changes: 8 additions & 1 deletion c_glib/test/test-stream-decoder.rb
@@ -79,8 +79,15 @@ def test_consume_bytes
end

def test_consume_buffer
# We need to keep data that aren't processed yet.
data = []
@buffer.data.to_s.each_byte do |byte|
@decoder.consume_buffer(Arrow::Buffer.new(byte.chr))
data << byte.chr
can_clear = (@decoder.next_required_size == 1)
@decoder.consume_buffer(Arrow::Buffer.new(data.last))
# We can release a reference for kept data after they are
# processed.
data.clear if can_clear
end
assert_equal([
[:schema_decoded, @schema, @schema],
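The new test encodes a lifetime contract: buffers handed to `consume_buffer` may still be referenced by the decoder, so the caller keeps them and releases only once `next_required_size` signals a message boundary. A hedged C++ analogue of the same pattern against `arrow::ipc::StreamDecoder` (the function name `DecodeByteByByte` is illustrative, not part of the patch):

```cpp
#include <memory>
#include <vector>

#include <arrow/buffer.h>
#include <arrow/ipc/reader.h>
#include <arrow/status.h>

// StreamDecoder may hold references into buffers passed to Consume(), so
// sliced input must stay alive until a complete message has been decoded.
arrow::Status DecodeByteByByte(const std::shared_ptr<arrow::Buffer>& stream) {
  auto listener = std::make_shared<arrow::ipc::CollectListener>();
  arrow::ipc::StreamDecoder decoder(listener);
  std::vector<std::shared_ptr<arrow::Buffer>> kept;  // not yet fully processed
  for (int64_t i = 0; i < stream->size(); ++i) {
    // If exactly one more byte is required, this byte completes a message and
    // everything kept so far may be released after Consume() returns.
    const bool completes_message = (decoder.next_required_size() == 1);
    kept.push_back(arrow::SliceBuffer(stream, i, 1));
    ARROW_RETURN_NOT_OK(decoder.Consume(kept.back()));
    if (completes_message) kept.clear();
  }
  return arrow::Status::OK();
}
```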
@@ -18,6 +18,8 @@
ARG base
FROM ${base}

ENV DEBIAN_FRONTEND=noninteractive

RUN apt-get update -y -q && \
apt install -y -q --no-install-recommends software-properties-common gpg-agent && \
add-apt-repository -y ppa:deadsnakes/ppa && \
1 change: 1 addition & 0 deletions cpp/cmake_modules/SetupCxxFlags.cmake
@@ -331,6 +331,7 @@ if("${BUILD_WARNING_LEVEL}" STREQUAL "CHECKIN")
set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wno-conversion")
set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wno-sign-conversion")
set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wdate-time")
set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wimplicit-fallthrough")
string(APPEND CXX_ONLY_FLAGS " -Wredundant-move")
set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wunused-result")
elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Intel" OR CMAKE_CXX_COMPILER_ID STREQUAL
1 change: 1 addition & 0 deletions cpp/src/arrow/c/bridge.cc
@@ -2333,6 +2333,7 @@ class ArrayStreamReader {
break;
case ENOSYS:
code = StatusCode::NotImplemented;
break;
default:
code = StatusCode::IOError;
break;
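This one-line fix pairs with the new `-Wimplicit-fallthrough` flag added in SetupCxxFlags.cmake above. A hedged sketch (not Arrow's code) of the bug class: before the patch, the ENOSYS case dropped into `default`, so NotImplemented was misreported as IOError; the warning now rejects any silent drop-through, and an intended one must be spelled `[[fallthrough]];`.

```cpp
#include <cerrno>
#include <iostream>

const char* StatusCodeForErrno(int err) {
  const char* code;
  switch (err) {
    case ENOSYS:
      code = "NotImplemented";
      break;  // without this line, -Wimplicit-fallthrough warns
    default:
      code = "IOError";
      break;
  }
  return code;
}

int main() { std::cout << StatusCodeForErrno(ENOSYS) << "\n"; }
```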
31 changes: 19 additions & 12 deletions cpp/src/arrow/extension/json.cc
@@ -28,17 +28,13 @@
namespace arrow::extension {

bool JsonExtensionType::ExtensionEquals(const ExtensionType& other) const {
return other.extension_name() == this->extension_name();
return other.extension_name() == this->extension_name() &&
other.storage_type()->Equals(storage_type_);
}

Result<std::shared_ptr<DataType>> JsonExtensionType::Deserialize(
std::shared_ptr<DataType> storage_type, const std::string& serialized) const {
if (storage_type->id() != Type::STRING && storage_type->id() != Type::STRING_VIEW &&
storage_type->id() != Type::LARGE_STRING) {
return Status::Invalid("Invalid storage type for JsonExtensionType: ",
storage_type->ToString());
}
return std::make_shared<JsonExtensionType>(storage_type);
return JsonExtensionType::Make(std::move(storage_type));
}

std::string JsonExtensionType::Serialize() const { return ""; }
@@ -51,11 +47,22 @@ std::shared_ptr<Array> JsonExtensionType::MakeArray(
return std::make_shared<ExtensionArray>(data);
}

std::shared_ptr<DataType> json(const std::shared_ptr<DataType> storage_type) {
ARROW_CHECK(storage_type->id() != Type::STRING ||
storage_type->id() != Type::STRING_VIEW ||
storage_type->id() != Type::LARGE_STRING);
return std::make_shared<JsonExtensionType>(storage_type);
bool JsonExtensionType::IsSupportedStorageType(Type::type type_id) {
return type_id == Type::STRING || type_id == Type::STRING_VIEW ||
type_id == Type::LARGE_STRING;
}

Result<std::shared_ptr<DataType>> JsonExtensionType::Make(
std::shared_ptr<DataType> storage_type) {
if (!IsSupportedStorageType(storage_type->id())) {
return Status::Invalid("Invalid storage type for JsonExtensionType: ",
storage_type->ToString());
}
return std::make_shared<JsonExtensionType>(std::move(storage_type));
}

std::shared_ptr<DataType> json(std::shared_ptr<DataType> storage_type) {
return JsonExtensionType::Make(std::move(storage_type)).ValueOrDie();
}

} // namespace arrow::extension
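One subtlety the diff fixes in passing: the removed `json()` factory's `ARROW_CHECK` combined `!=` tests with `||`, a condition that is always true, so the check never fired. The new `Make()` centralizes real validation. A minimal usage sketch (not part of the patch):

```cpp
#include <iostream>

#include <arrow/extension/json.h>
#include <arrow/result.h>
#include <arrow/type.h>

// Make() validates the storage type and reports Status::Invalid, while
// json() is for callers that already know the type is valid.
int main() {
  // utf8, large_utf8, and utf8_view are the supported storage types.
  auto ok = arrow::extension::JsonExtensionType::Make(arrow::large_utf8());
  if (ok.ok()) std::cout << (*ok)->ToString() << std::endl;

  // An unsupported storage type now surfaces as an error Result instead of
  // slipping past the old, always-true ARROW_CHECK.
  auto bad = arrow::extension::JsonExtensionType::Make(arrow::int32());
  if (!bad.ok()) std::cerr << bad.status().ToString() << std::endl;
  return 0;
}
```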
4 changes: 4 additions & 0 deletions cpp/src/arrow/extension/json.h
@@ -45,6 +45,10 @@ class ARROW_EXPORT JsonExtensionType : public ExtensionType {

std::shared_ptr<Array> MakeArray(std::shared_ptr<ArrayData> data) const override;

static Result<std::shared_ptr<DataType>> Make(std::shared_ptr<DataType> storage_type);

static bool IsSupportedStorageType(Type::type type_id);

private:
std::shared_ptr<DataType> storage_type_;
};
14 changes: 14 additions & 0 deletions cpp/src/arrow/extension/json_test.cc
@@ -80,4 +80,18 @@ TEST_F(TestJsonExtensionType, InvalidUTF8) {
}
}

TEST_F(TestJsonExtensionType, StorageTypeValidation) {
ASSERT_TRUE(json(utf8())->Equals(json(utf8())));
ASSERT_FALSE(json(large_utf8())->Equals(json(utf8())));
ASSERT_FALSE(json(utf8_view())->Equals(json(utf8())));
ASSERT_FALSE(json(utf8_view())->Equals(json(large_utf8())));

for (const auto& storage_type : {int16(), binary(), float64(), null()}) {
ASSERT_RAISES_WITH_MESSAGE(Invalid,
"Invalid: Invalid storage type for JsonExtensionType: " +
storage_type->ToString(),
extension::JsonExtensionType::Make(storage_type));
}
}

} // namespace arrow
39 changes: 36 additions & 3 deletions cpp/src/arrow/filesystem/azurefs.cc
@@ -1793,6 +1793,8 @@ class AzureFileSystem::Impl {
// BlobPrefixes. A BlobPrefix always ends with kDelimiter ("/"), so we can
// distinguish between a directory and a file by checking if we received a
// prefix or a blob.
// This strategy allows us to implement GetFileInfo with just 1 blob storage
// operation in almost every case.
if (!list_response.BlobPrefixes.empty()) {
// Ensure the returned BlobPrefixes[0] string doesn't contain more characters than
// the requested Prefix. For instance, if we request with Prefix="dir/abra" and
@@ -1814,6 +1816,25 @@
info.set_mtime(
std::chrono::system_clock::time_point{blob.Details.LastModified});
return info;
} else if (blob.Name[options.Prefix.Value().length()] < internal::kSep) {
// First list result did not indicate a directory and there is definitely no
// exactly matching blob. However, there may still be a directory that we
// initially missed because the first list result came before
// `options.Prefix + internal::kSep` lexicographically.
// For example the flat namespace storage account has the following blobs:
// - container/dir.txt
// - container/dir/file.txt
// GetFileInfo(container/dir) should return FileType::Directory but in this
// edge case `blob = "dir.txt"`, so without further checks we would incorrectly
// return FileType::NotFound.
// Therefore we make an extra list operation with the trailing slash to confirm
// whether the path is a directory.
options.Prefix = internal::EnsureTrailingSlash(location.path);
auto list_with_trailing_slash_response = container_client.ListBlobs(options);
if (!list_with_trailing_slash_response.Blobs.empty()) {
info.set_type(FileType::Directory);
return info;
}
}
}
info.set_type(FileType::NotFound);
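The extra list operation exists because of ASCII ordering: '.' (0x2E) sorts immediately before '/' (0x2F), so a prefix listing for "dir" can return "dir.txt" before any "dir/..." blob. A standalone illustration (not Arrow code):

```cpp
#include <iostream>
#include <string>

int main() {
  std::cout << std::boolalpha;
  // '.' precedes '/', so "dir.txt" lists before "dir/a" under a "dir" prefix.
  std::cout << ('.' < '/') << "\n";                                      // true
  std::cout << (std::string("dir.txt") < std::string("dir/a")) << "\n";  // true
}
```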
@@ -1895,18 +1916,22 @@
/// \brief List the paths at the root of a filesystem or some dir in a filesystem.
///
/// \pre adlfs_client is the client for the filesystem named like the first
/// segment of select.base_dir.
/// segment of select.base_dir. The filesystem is known to exist.
Status GetFileInfoWithSelectorFromFileSystem(
const DataLake::DataLakeFileSystemClient& adlfs_client,
const Core::Context& context, Azure::Nullable<int32_t> page_size_hint,
const FileSelector& select, FileInfoVector* acc_results) {
ARROW_ASSIGN_OR_RAISE(auto base_location, AzureLocation::FromString(select.base_dir));

// The filesystem (a.k.a. the container) is known to exist, so if the path is
// empty we have already found the base_location; initialize found to true.
bool found = base_location.path.empty();

auto directory_client = adlfs_client.GetDirectoryClient(base_location.path);
bool found = false;
DataLake::ListPathsOptions options;
options.PageSizeHint = page_size_hint;

auto base_path_depth = internal::GetAbstractPathDepth(base_location.path);
try {
auto list_response = directory_client.ListPaths(select.recursive, options, context);
for (; list_response.HasPage(); list_response.MoveToNextPage(context)) {
@@ -1918,7 +1943,15 @@
if (path.Name == base_location.path && !path.IsDirectory) {
return NotADir(base_location);
}
acc_results->push_back(FileInfoFromPath(base_location.container, path));
// Subtract 1 because with `max_recursion=0` we want to list the base path,
// which will produce results with depth 1 greater than the base path's depth.
// NOTE: `select.max_recursion` + anything will cause integer overflows because
// `select.max_recursion` defaults to `INT32_MAX`. Therefore, options to
// rewrite this condition in a more readable way are limited.
if (internal::GetAbstractPathDepth(path.Name) - base_path_depth - 1 <=
select.max_recursion) {
acc_results->push_back(FileInfoFromPath(base_location.container, path));
}
}
}
} catch (const Storage::StorageException& exception) {
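To make the overflow note above concrete: `FileSelector::max_recursion` defaults to `INT32_MAX`, so any rearrangement that adds to it, such as `path_depth <= base_depth + 1 + max_recursion`, is signed overflow; keeping the subtraction on the left-hand side stays in range. A standalone sketch (not Arrow code):

```cpp
#include <cstdint>
#include <iostream>

int main() {
  const int32_t max_recursion = INT32_MAX;  // FileSelector's default
  const int32_t base_depth = 1;
  const int32_t path_depth = 3;
  // base_depth + 1 + max_recursion would overflow (undefined behavior);
  // the patch's form is safe for any valid depths:
  std::cout << std::boolalpha
            << ((path_depth - base_depth - 1) <= max_recursion) << std::endl;
}
```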
32 changes: 27 additions & 5 deletions cpp/src/arrow/filesystem/azurefs_test.cc
@@ -364,9 +364,9 @@ class TestGeneric : public ::testing::Test, public GenericFileSystemTest {
std::shared_ptr<FileSystem> GetEmptyFileSystem() override { return fs_; }

bool have_implicit_directories() const override { return true; }
bool allow_write_file_over_dir() const override { return true; }
bool allow_read_dir_as_file() const override { return true; }
bool allow_move_dir() const override { return false; }
bool allow_write_file_over_dir() const override { return false; }
bool allow_read_dir_as_file() const override { return false; }
bool allow_move_dir() const override { return true; }
bool allow_move_file() const override { return true; }
bool allow_append_to_file() const override { return true; }
bool have_directory_mtimes() const override { return true; }
@@ -404,7 +404,11 @@ class TestAzuriteGeneric : public TestGeneric {
}

protected:
// Azurite doesn't support moving files over containers.
// Azurite doesn't block writing files over directories.
bool allow_write_file_over_dir() const override { return true; }
// Azurite doesn't support moving directories.
bool allow_move_dir() const override { return false; }
// Azurite doesn't support moving files.
bool allow_move_file() const override { return false; }
// Azurite doesn't support directory mtime.
bool have_directory_mtimes() const override { return false; }
@@ -426,7 +430,11 @@ class TestAzureFlatNSGeneric : public TestGeneric {
}

protected:
// Flat namespace account doesn't support moving files over containers.
// Flat namespace account doesn't block writing files over directories.
bool allow_write_file_over_dir() const override { return true; }
// Flat namespace account doesn't support moving directories.
bool allow_move_dir() const override { return false; }
// Flat namespace account doesn't support moving files.
bool allow_move_file() const override { return false; }
// Flat namespace account doesn't support directory mtime.
bool have_directory_mtimes() const override { return false; }
@@ -2065,6 +2073,20 @@ void TestAzureFileSystem::TestGetFileInfoObjectWithNestedStructure() {
ASSERT_OK(output->Write(lorem_ipsum));
ASSERT_OK(output->Close());

// "." is immediately before "/" lexicographically; ensure that this doesn't
// cause unexpected issues. NOTE: It seems real Azure blob storage doesn't
// allow blob names to end in `.`
ASSERT_OK_AND_ASSIGN(output, fs()->OpenOutputStream(
data.ContainerPath("test-object-dir/some_other_dir.a"),
/*metadata=*/{}));
ASSERT_OK(output->Write(lorem_ipsum));
ASSERT_OK(output->Close());
ASSERT_OK_AND_ASSIGN(output,
fs()->OpenOutputStream(data.ContainerPath(kObjectName + ".a"),
/*metadata=*/{}));
ASSERT_OK(output->Write(lorem_ipsum));
ASSERT_OK(output->Close());

AssertFileInfo(fs(), data.ContainerPath(kObjectName), FileType::File);
AssertFileInfo(fs(), data.ContainerPath(kObjectName) + "/", FileType::NotFound);
AssertFileInfo(fs(), data.ContainerPath("test-object-dir"), FileType::Directory);