Skip to content

Commit

Permalink
GH-40153: [Python][C++] Fix large file handling on 32-bit Python build (
Browse files Browse the repository at this point in the history
#40176)

### Rationale for this change

Python large file tests fail on 32-bit platforms.

### What changes are included in this PR?

1. Fix passing `int64_t` position to the Python file methods when a Python file object is wrapped in an Arrow `RandomAccessFile`
2. Disallow creating a `MemoryMappedFile` spanning more than the `size_t` maximum, instead of silently truncating its length

### Are these changes tested?

Yes.

### Are there any user-facing changes?

No.

* GitHub Issue: #40153

Authored-by: Antoine Pitrou <antoine@python.org>
Signed-off-by: Antoine Pitrou <antoine@python.org>
  • Loading branch information
pitrou authored Feb 21, 2024
1 parent 6a22a1d commit c344446
Show file tree
Hide file tree
Showing 3 changed files with 42 additions and 21 deletions.
22 changes: 14 additions & 8 deletions cpp/src/arrow/io/file.cc
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
#include <cerrno>
#include <cstdint>
#include <cstring>
#include <limits>
#include <memory>
#include <mutex>
#include <sstream>
Expand Down Expand Up @@ -560,17 +561,22 @@ class MemoryMappedFile::MemoryMap
RETURN_NOT_OK(::arrow::internal::FileTruncate(file_->fd(), initial_size));
}

size_t mmap_length = static_cast<size_t>(initial_size);
if (length > initial_size) {
return Status::Invalid("mapping length is beyond file size");
}
if (length >= 0 && length < initial_size) {
int64_t mmap_length = initial_size;
if (length >= 0) {
// memory mapping a file region
mmap_length = static_cast<size_t>(length);
if (length > initial_size) {
return Status::Invalid("mapping length is beyond file size");
}
mmap_length = length;
}
if (static_cast<int64_t>(static_cast<size_t>(mmap_length)) != mmap_length) {
return Status::CapacityError("Requested memory map length ", mmap_length,
" does not fit in a C size_t "
"(are you using a 32-bit build of Arrow?");
}

void* result = mmap(nullptr, mmap_length, prot_flags_, map_mode_, file_->fd(),
static_cast<off_t>(offset));
void* result = mmap(nullptr, static_cast<size_t>(mmap_length), prot_flags_, map_mode_,
file_->fd(), static_cast<off_t>(offset));
if (result == MAP_FAILED) {
return Status::IOError("Memory mapping file failed: ",
::arrow::internal::ErrnoMessage(errno));
Expand Down
15 changes: 9 additions & 6 deletions python/pyarrow/src/arrow/python/io.cc
Original file line number Diff line number Diff line change
Expand Up @@ -92,9 +92,12 @@ class PythonFile {
Status Seek(int64_t position, int whence) {
RETURN_NOT_OK(CheckClosed());

// NOTE: `long long` is at least 64 bits in the C standard, the cast below is
// therefore safe.

// whence: 0 for relative to start of file, 2 for end of file
PyObject* result = cpp_PyObject_CallMethod(file_.obj(), "seek", "(ni)",
static_cast<Py_ssize_t>(position), whence);
PyObject* result = cpp_PyObject_CallMethod(file_.obj(), "seek", "(Li)",
static_cast<long long>(position), whence);
Py_XDECREF(result);
PY_RETURN_IF_ERROR(StatusCode::IOError);
return Status::OK();
Expand All @@ -103,16 +106,16 @@ class PythonFile {
Status Read(int64_t nbytes, PyObject** out) {
RETURN_NOT_OK(CheckClosed());

PyObject* result = cpp_PyObject_CallMethod(file_.obj(), "read", "(n)",
static_cast<Py_ssize_t>(nbytes));
PyObject* result = cpp_PyObject_CallMethod(file_.obj(), "read", "(L)",
static_cast<long long>(nbytes));
PY_RETURN_IF_ERROR(StatusCode::IOError);
*out = result;
return Status::OK();
}

Status ReadBuffer(int64_t nbytes, PyObject** out) {
PyObject* result = cpp_PyObject_CallMethod(file_.obj(), "read_buffer", "(n)",
static_cast<Py_ssize_t>(nbytes));
PyObject* result = cpp_PyObject_CallMethod(file_.obj(), "read_buffer", "(L)",
static_cast<long long>(nbytes));
PY_RETURN_IF_ERROR(StatusCode::IOError);
*out = result;
return Status::OK();
Expand Down
26 changes: 19 additions & 7 deletions python/pyarrow/tests/test_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
import pyarrow as pa


def check_large_seeks(file_factory):
def check_large_seeks(file_factory, expected_error=None):
if sys.platform in ('win32', 'darwin'):
pytest.skip("need sparse file support")
try:
Expand All @@ -45,11 +45,16 @@ def check_large_seeks(file_factory):
f.truncate(2 ** 32 + 10)
f.seek(2 ** 32 + 5)
f.write(b'mark\n')
with file_factory(filename) as f:
assert f.seek(2 ** 32 + 5) == 2 ** 32 + 5
assert f.tell() == 2 ** 32 + 5
assert f.read(5) == b'mark\n'
assert f.tell() == 2 ** 32 + 10
if expected_error:
with expected_error:
file_factory(filename)
else:
with file_factory(filename) as f:
assert f.size() == 2 ** 32 + 10
assert f.seek(2 ** 32 + 5) == 2 ** 32 + 5
assert f.tell() == 2 ** 32 + 5
assert f.read(5) == b'mark\n'
assert f.tell() == 2 ** 32 + 10
finally:
os.unlink(filename)

Expand Down Expand Up @@ -1137,7 +1142,14 @@ def test_memory_zero_length(tmpdir):


def test_memory_map_large_seeks():
check_large_seeks(pa.memory_map)
if sys.maxsize >= 2**32:
expected_error = None
else:
expected_error = pytest.raises(
pa.ArrowCapacityError,
match="Requested memory map length 4294967306 "
"does not fit in a C size_t")
check_large_seeks(pa.memory_map, expected_error=expected_error)


def test_memory_map_close_remove(tmpdir):
Expand Down

0 comments on commit c344446

Please sign in to comment.