diff --git a/cpp/src/arrow/filesystem/s3fs.cc b/cpp/src/arrow/filesystem/s3fs.cc index 96c771aeb61b8..77b111f61bf4c 100644 --- a/cpp/src/arrow/filesystem/s3fs.cc +++ b/cpp/src/arrow/filesystem/s3fs.cc @@ -3389,6 +3389,12 @@ struct AwsInstance { ARROW_LOG(WARNING) << " arrow::fs::FinalizeS3 was not called even though S3 was initialized. " "This could lead to a segmentation fault at exit"; + // Leak the S3ClientFinalizer to avoid crashes when destroying remaining + // S3Client instances (GH-44071). + auto* leaked_shared_ptr = + new std::shared_ptr(GetClientFinalizer()); + ARROW_UNUSED(leaked_shared_ptr); + return; } GetClientFinalizer()->Finalize(); #ifdef ARROW_S3_HAS_S3CLIENT_CONFIGURATION @@ -3480,7 +3486,14 @@ Status EnsureS3Initialized() { } Status FinalizeS3() { - GetAwsInstance()->Finalize(); + auto instance = GetAwsInstance(); + // The AWS instance might already be destroyed in case FinalizeS3 + // is called from an atexit handler (which is a bad idea anyway as the + // AWS SDK is not safe anymore to shutdown by this time). See GH-44071. + if (instance == nullptr) { + return Status::Invalid("FinalizeS3 called too late"); + } + instance->Finalize(); return Status::OK(); } diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py index f8ce74700dea8..1c639412cd1ab 100644 --- a/python/pyarrow/tests/test_fs.py +++ b/python/pyarrow/tests/test_fs.py @@ -19,8 +19,10 @@ import gzip import os import pathlib +from urllib.request import urlopen import subprocess import sys +import time import pytest import weakref @@ -34,6 +36,10 @@ LocalFileSystem, SubTreeFileSystem, _MockFileSystem, FileSystemHandler, PyFileSystem, FSSpecHandler, copy_files) +from pyarrow.util import find_free_port + + +here = os.path.dirname(os.path.abspath(__file__)) class DummyHandler(FileSystemHandler): @@ -2010,3 +2016,36 @@ def test_concurrent_s3fs_init(): finalize_s3() """ subprocess.check_call([sys.executable, "-c", code]) + + +@pytest.mark.s3 +def test_uwsgi_integration(): + # GH-44071: using S3FileSystem under uwsgi shouldn't lead to a crash at shutdown + try: + subprocess.check_call(["uwsgi", "--version"]) + except FileNotFoundError: + pytest.skip("uwsgi not installed on this Python") + + port = find_free_port() + args = ["uwsgi", "-i", "--http", f"127.0.0.1:{port}", + "--wsgi-file", os.path.join(here, "wsgi_examples.py")] + proc = subprocess.Popen(args, stdin=subprocess.DEVNULL) + # Try to fetch URL, it should return 200 Ok... + try: + url = f"http://127.0.0.1:{port}/s3/" + start_time = time.time() + error = None + while time.time() < start_time + 5: + try: + with urlopen(url) as resp: + assert resp.status == 200 + break + except OSError as e: + error = e + time.sleep(0.1) + else: + pytest.fail(f"Could not fetch {url!r}: {error}") + finally: + proc.terminate() + # ... and uwsgi should gracefully shutdown after it's been asked above + assert proc.wait() == 30 # UWSGI_END_CODE = 30 diff --git a/python/pyarrow/tests/wsgi_examples.py b/python/pyarrow/tests/wsgi_examples.py new file mode 100644 index 0000000000000..440b107abe511 --- /dev/null +++ b/python/pyarrow/tests/wsgi_examples.py @@ -0,0 +1,35 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import pyarrow.fs + + +def application(env, start_response): + path = env['PATH_INFO'] + members = path.split('/') + assert members[0] == '' + assert len(members) >= 2 + root = members[1] + if root == 's3': + # See test_fs::test_uwsgi_integration + start_response('200 OK', [('Content-Type', 'text/html')]) + # flake8: noqa + fs = pyarrow.fs.S3FileSystem() + return [b"Hello World\n"] + else: + start_response('404 Not Found', [('Content-Type', 'text/html')]) + return [f"Path {path!r} not found\n".encode()] diff --git a/python/requirements-test.txt b/python/requirements-test.txt index 975477c4223dd..48422f86cc720 100644 --- a/python/requirements-test.txt +++ b/python/requirements-test.txt @@ -3,3 +3,4 @@ hypothesis pandas pytest pytz +uwsgi; sys.platform != 'win32' and python_version < '3.13' diff --git a/python/requirements-wheel-test.txt b/python/requirements-wheel-test.txt index 98ec2bd4fd4e4..bad3e251d4464 100644 --- a/python/requirements-wheel-test.txt +++ b/python/requirements-wheel-test.txt @@ -10,6 +10,7 @@ hypothesis pytest pytz tzdata; sys_platform == 'win32' +uwsgi; sys.platform != 'win32' and python_version < '3.13' # We generally test with the oldest numpy version that supports a given Python # version. However, there is no need to make this strictly the oldest version,