From 939195183657daa2060970b6fcd1938eab53d44b Mon Sep 17 00:00:00 2001 From: Alessandro Molina Date: Thu, 29 Apr 2021 10:14:29 +0200 Subject: [PATCH] ARROW-12506: [Python] Improve modularity of pyarrow codebase: _hdfsio module Second batch of changes related to making the pyarrow build more modular. `hdfs-io` is no longer included in `pyarrow.lib` but has been separated into its own module. This PR is based on https://github.com/apache/arrow/pull/10131 Closes #10159 from amol-/ARROW-12506-2 Authored-by: Alessandro Molina Signed-off-by: Antoine Pitrou --- python/CMakeLists.txt | 3 ++- python/pyarrow/__init__.py | 8 +++++--- python/pyarrow/{io-hdfs.pxi => _hdfsio.pyx} | 10 ++++++++++ python/pyarrow/hdfs.py | 4 ++-- python/pyarrow/io.pxi | 1 + python/pyarrow/lib.pyx | 1 - python/setup.py | 1 + 7 files changed, 21 insertions(+), 7 deletions(-) rename python/pyarrow/{io-hdfs.pxi => _hdfsio.pyx} (97%) diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 3058431f0f38b..3ed518d010985 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -387,10 +387,11 @@ endif() set(CYTHON_EXTENSIONS lib - _fs _compute _csv _feather + _fs + _hdfsio _json) set(LINK_LIBS arrow_shared arrow_python_shared) diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index adfd69c18b323..1488f5c42e852 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -164,15 +164,17 @@ def show_versions(): log_memory_allocations, jemalloc_set_decay_ms) # I/O -from pyarrow.lib import (HdfsFile, NativeFile, PythonFile, +from pyarrow.lib import (NativeFile, PythonFile, BufferedInputStream, BufferedOutputStream, CompressedInputStream, CompressedOutputStream, TransformInputStream, transcoding_input_stream, FixedSizeBufferWriter, BufferReader, BufferOutputStream, OSFile, MemoryMappedFile, memory_map, - create_memory_map, have_libhdfs, - MockOutputStream, input_stream, output_stream) + create_memory_map, MockOutputStream, + input_stream, output_stream) + +from 
pyarrow._hdfsio import HdfsFile, have_libhdfs from pyarrow.lib import (ChunkedArray, RecordBatch, Table, table, concat_arrays, concat_tables) diff --git a/python/pyarrow/io-hdfs.pxi b/python/pyarrow/_hdfsio.pyx similarity index 97% rename from python/pyarrow/io-hdfs.pxi rename to python/pyarrow/_hdfsio.pyx index 2cdb1b7bfda82..b864f8a686a99 100644 --- a/python/pyarrow/io-hdfs.pxi +++ b/python/pyarrow/_hdfsio.pyx @@ -18,6 +18,16 @@ # ---------------------------------------------------------------------- # HDFS IO implementation +# cython: language_level = 3 + +import re + +from pyarrow.lib cimport check_status, _Weakrefable, NativeFile +from pyarrow.includes.common cimport * +from pyarrow.includes.libarrow cimport * +from pyarrow.includes.libarrow_fs cimport * +from pyarrow.lib import frombytes, tobytes, ArrowIOError + from queue import Queue, Empty as QueueEmpty, Full as QueueFull diff --git a/python/pyarrow/hdfs.py b/python/pyarrow/hdfs.py index c4daac9fd1a53..56667bd5df57d 100644 --- a/python/pyarrow/hdfs.py +++ b/python/pyarrow/hdfs.py @@ -23,10 +23,10 @@ from pyarrow.util import implements, _DEPR_MSG from pyarrow.filesystem import FileSystem -import pyarrow.lib as lib +import pyarrow._hdfsio as _hdfsio -class HadoopFileSystem(lib.HadoopFileSystem, FileSystem): +class HadoopFileSystem(_hdfsio.HadoopFileSystem, FileSystem): """ DEPRECATED: FileSystem interface for HDFS cluster. 
diff --git a/python/pyarrow/io.pxi b/python/pyarrow/io.pxi index 3fc098478d61a..9c501adcc2bbe 100644 --- a/python/pyarrow/io.pxi +++ b/python/pyarrow/io.pxi @@ -27,6 +27,7 @@ import threading import time import warnings from io import BufferedIOBase, IOBase, TextIOBase, UnsupportedOperation +from queue import Queue, Empty as QueueEmpty from pyarrow.util import _is_path_like, _stringify_path diff --git a/python/pyarrow/lib.pyx b/python/pyarrow/lib.pyx index 1866d07112166..191250b3d5bc4 100644 --- a/python/pyarrow/lib.pyx +++ b/python/pyarrow/lib.pyx @@ -140,7 +140,6 @@ include "tensor.pxi" # File IO include "io.pxi" -include "io-hdfs.pxi" # IPC / Messaging include "ipc.pxi" diff --git a/python/setup.py b/python/setup.py index b4de5799d66db..24d54809a4212 100755 --- a/python/setup.py +++ b/python/setup.py @@ -203,6 +203,7 @@ def initialize_options(self): '_plasma', '_s3fs', '_hdfs', + '_hdfsio', 'gandiva'] def _run_cmake(self):