diff --git a/.github/workflows/cpp.yml b/.github/workflows/cpp.yml index c5482f730823b..fd23e0cf217e6 100644 --- a/.github/workflows/cpp.yml +++ b/.github/workflows/cpp.yml @@ -465,15 +465,17 @@ jobs: chmod +x /usr/local/bin/minio.exe - name: Set up Python uses: actions/setup-python@v5.1.1 + id: python-install with: python-version: 3.9 - name: Install Google Cloud Storage Testbench - shell: bash + shell: msys2 {0} + env: + PIPX_BIN_DIR: /usr/local/bin + PIPX_PYTHON: ${{ steps.python-install.outputs.python-path }} run: | ci/scripts/install_gcs_testbench.sh default - echo "PYTHON_BIN_DIR=$(cygpath --windows $(dirname $(which python3.exe)))" >> $GITHUB_ENV - name: Test shell: msys2 {0} run: | - PATH="$(cygpath --unix ${PYTHON_BIN_DIR}):${PATH}" ci/scripts/cpp_test.sh "$(pwd)" "$(pwd)/build" diff --git a/appveyor.yml b/appveyor.yml index 5954251d34733..9e4582f1d8d7f 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -24,6 +24,7 @@ only_commits: - appveyor.yml - ci/appveyor* - ci/conda* + - ci/scripts/*.bat - cpp/ - format/ - python/ diff --git a/ci/appveyor-cpp-build.bat b/ci/appveyor-cpp-build.bat index f688fbb63a9ad..08a052e82f24d 100644 --- a/ci/appveyor-cpp-build.bat +++ b/ci/appveyor-cpp-build.bat @@ -46,7 +46,9 @@ set ARROW_CMAKE_ARGS=-DARROW_DEPENDENCY_SOURCE=CONDA -DARROW_WITH_BZ2=ON set ARROW_CXXFLAGS=/WX /MP @rem Install GCS testbench +set PIPX_BIN_DIR=C:\Windows\ call %CD%\ci\scripts\install_gcs_testbench.bat +storage-testbench -h || exit /B @rem @rem Build and test Arrow C++ libraries (including Parquet) diff --git a/ci/docker/conda-cpp.dockerfile b/ci/docker/conda-cpp.dockerfile index dff1f2224809a..eb035d887a158 100644 --- a/ci/docker/conda-cpp.dockerfile +++ b/ci/docker/conda-cpp.dockerfile @@ -42,17 +42,19 @@ RUN mamba install -q -y \ valgrind && \ mamba clean --all +# We want to install the GCS testbench using the Conda base environment's Python, +# because the test environment's Python may later change. +ENV PIPX_PYTHON=/opt/conda/bin/python3 +COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts +RUN /arrow/ci/scripts/install_gcs_testbench.sh default + # Ensure npm, node and azurite are on path. npm and node are required to install azurite, which will then need to -# be on the path for the tests to run. +# be on the path for the tests to run. ENV PATH=/opt/conda/envs/arrow/bin:$PATH COPY ci/scripts/install_azurite.sh /arrow/ci/scripts/ RUN /arrow/ci/scripts/install_azurite.sh -# We want to install the GCS testbench using the same Python binary that the Conda code will use. -COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts -RUN /arrow/ci/scripts/install_gcs_testbench.sh default - COPY ci/scripts/install_sccache.sh /arrow/ci/scripts/ RUN /arrow/ci/scripts/install_sccache.sh unknown-linux-musl /usr/local/bin diff --git a/ci/docker/conda-python.dockerfile b/ci/docker/conda-python.dockerfile index 027fd589cecca..7e8dbe76f6248 100644 --- a/ci/docker/conda-python.dockerfile +++ b/ci/docker/conda-python.dockerfile @@ -32,11 +32,6 @@ RUN mamba install -q -y \ nomkl && \ mamba clean --all -# XXX The GCS testbench was already installed in conda-cpp.dockerfile, -# but we changed the installed Python version above, so we need to reinstall it. -COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts -RUN /arrow/ci/scripts/install_gcs_testbench.sh default - ENV ARROW_ACERO=ON \ ARROW_BUILD_STATIC=OFF \ ARROW_BUILD_TESTS=OFF \ diff --git a/ci/docker/python-wheel-windows-test-vs2019.dockerfile b/ci/docker/python-wheel-windows-test-vs2019.dockerfile index 5f488a4c285ff..625ab25f848f2 100644 --- a/ci/docker/python-wheel-windows-test-vs2019.dockerfile +++ b/ci/docker/python-wheel-windows-test-vs2019.dockerfile @@ -35,16 +35,27 @@ RUN setx path "%path%;C:\Program Files\Git\usr\bin" RUN wmic product where "name like 'python%%'" call uninstall /nointeractive && \ rm -rf Python* +# Install the GCS testbench using a well-known Python version. +# NOTE: cannot use pipx's `--fetch-missing-python` because of +# https://github.com/pypa/pipx/issues/1521, therefore download Python ourselves. +RUN choco install -r -y --pre --no-progress python --version=3.11.9 +ENV PIPX_BIN_DIR=C:\\Windows\\ +ENV PIPX_PYTHON="C:\Python311\python.exe" +COPY ci/scripts/install_gcs_testbench.bat C:/arrow/ci/scripts/ +RUN call "C:\arrow\ci\scripts\install_gcs_testbench.bat" && \ + storage-testbench -h + # Define the full version number otherwise choco falls back to patch number 0 (3.8 => 3.8.0) ARG python=3.8 -RUN (if "%python%"=="3.8" setx PYTHON_VERSION "3.8.10" && setx PATH "%PATH%;C:\Python38;C:\Python38\Scripts") & \ - (if "%python%"=="3.9" setx PYTHON_VERSION "3.9.13" && setx PATH "%PATH%;C:\Python39;C:\Python39\Scripts") & \ - (if "%python%"=="3.10" setx PYTHON_VERSION "3.10.11" && setx PATH "%PATH%;C:\Python310;C:\Python310\Scripts") & \ - (if "%python%"=="3.11" setx PYTHON_VERSION "3.11.9" && setx PATH "%PATH%;C:\Python311;C:\Python311\Scripts") & \ - (if "%python%"=="3.12" setx PYTHON_VERSION "3.12.4" && setx PATH "%PATH%;C:\Python312;C:\Python312\Scripts") & \ - (if "%python%"=="3.13" setx PYTHON_VERSION "3.13.0-rc1" && setx PATH "%PATH%;C:\Python313;C:\Python313\Scripts") +RUN (if "%python%"=="3.8" setx PYTHON_VERSION "3.8.10") & \ + (if "%python%"=="3.9" setx PYTHON_VERSION "3.9.13") & \ + (if "%python%"=="3.10" setx PYTHON_VERSION "3.10.11") & \ + (if "%python%"=="3.11" setx PYTHON_VERSION "3.11.9") & \ + (if "%python%"=="3.12" setx PYTHON_VERSION "3.12.4") & \ + (if "%python%"=="3.13" setx PYTHON_VERSION "3.13.0-rc1") # Install archiver to extract xz archives -RUN choco install -r -y --pre --no-progress python --version=%PYTHON_VERSION% & \ - python -m pip install --no-cache-dir -U pip setuptools & \ +RUN choco install -r -y --pre --no-progress --force python --version=%PYTHON_VERSION% && \ choco install --no-progress -r -y archiver + +ENV PYTHON=$python diff --git a/ci/docker/ubuntu-20.04-cpp-minimal.dockerfile b/ci/docker/ubuntu-20.04-cpp-minimal.dockerfile index e17c0306f115d..4d867a448c994 100644 --- a/ci/docker/ubuntu-20.04-cpp-minimal.dockerfile +++ b/ci/docker/ubuntu-20.04-cpp-minimal.dockerfile @@ -33,6 +33,7 @@ RUN apt-get update -y -q && \ libssl-dev \ libcurl4-openssl-dev \ python3-pip \ + python3-venv \ tzdata \ wget && \ apt-get clean && \ diff --git a/ci/docker/ubuntu-22.04-cpp-minimal.dockerfile b/ci/docker/ubuntu-22.04-cpp-minimal.dockerfile index 341d8a87e8661..f26cad51f0983 100644 --- a/ci/docker/ubuntu-22.04-cpp-minimal.dockerfile +++ b/ci/docker/ubuntu-22.04-cpp-minimal.dockerfile @@ -33,6 +33,7 @@ RUN apt-get update -y -q && \ libssl-dev \ libcurl4-openssl-dev \ python3-pip \ + python3-venv \ tzdata \ wget && \ apt-get clean && \ diff --git a/ci/docker/ubuntu-24.04-cpp-minimal.dockerfile b/ci/docker/ubuntu-24.04-cpp-minimal.dockerfile index a995ab2a8bc2d..125bc7ba46a81 100644 --- a/ci/docker/ubuntu-24.04-cpp-minimal.dockerfile +++ b/ci/docker/ubuntu-24.04-cpp-minimal.dockerfile @@ -33,6 +33,7 @@ RUN apt-get update -y -q && \ libssl-dev \ libcurl4-openssl-dev \ python3-pip \ + python3-venv \ tzdata \ tzdata-legacy \ wget && \ diff --git a/ci/scripts/install_gcs_testbench.bat b/ci/scripts/install_gcs_testbench.bat index b03d0c2ad6608..f54f98db7cac8 100644 --- a/ci/scripts/install_gcs_testbench.bat +++ b/ci/scripts/install_gcs_testbench.bat @@ -17,9 +17,18 @@ @echo on -set GCS_TESTBENCH_VERSION="v0.36.0" +set GCS_TESTBENCH_VERSION="v0.40.0" + +set PIPX_FLAGS=--verbose +if NOT "%PIPX_PYTHON%"=="" ( + set PIPX_FLAGS=--python %PIPX_PYTHON% %PIPX_FLAGS% +) + +python -m pip install -U pipx || exit /B 1 @REM Install GCS testbench %GCS_TESTBENCH_VERSION% -python -m pip install ^ +pipx install %PIPX_FLAGS% ^ "https://github.com/googleapis/storage-testbench/archive/%GCS_TESTBENCH_VERSION%.tar.gz" ^ || exit /B 1 + +pipx list --verbose diff --git a/ci/scripts/install_gcs_testbench.sh b/ci/scripts/install_gcs_testbench.sh index 5471b3cc238ca..78826e94d3294 100755 --- a/ci/scripts/install_gcs_testbench.sh +++ b/ci/scripts/install_gcs_testbench.sh @@ -17,7 +17,7 @@ # specific language governing permissions and limitations # under the License. -set -e +set -ex if [ "$#" -ne 1 ]; then echo "Usage: $0 " @@ -34,19 +34,23 @@ case "$(uname -m)" in ;; esac -# On newer pythons install into the system will fail, so override that -export PIP_BREAK_SYSTEM_PACKAGES=1 - version=$1 if [[ "${version}" -eq "default" ]]; then version="v0.39.0" - # Latests versions of Testbench require newer setuptools - python3 -m pip install --upgrade setuptools fi +: ${PIPX_PYTHON:=$(which python3)} + +export PIP_BREAK_SYSTEM_PACKAGES=1 +${PIPX_PYTHON} -m pip install -U pipx + # This script is run with PYTHON undefined in some places, # but those only use older pythons. if [[ -z "${PYTHON_VERSION}" ]] || [[ "${PYTHON_VERSION}" != "3.13" ]]; then - python3 -m pip install \ - "https://github.com/googleapis/storage-testbench/archive/${version}.tar.gz" + pipx_flags=--verbose + if [[ $(id -un) == "root" ]]; then + # Install globally as /root/.local/bin is typically not in $PATH + pipx_flags="${pipx_flags} --global" + fi + ${PIPX_PYTHON} -m pipx install ${pipx_flags} "https://github.com/googleapis/storage-testbench/archive/${version}.tar.gz" fi diff --git a/ci/scripts/python_wheel_windows_test.bat b/ci/scripts/python_wheel_windows_test.bat index 87c0bb1252024..cac3f18434b6c 100755 --- a/ci/scripts/python_wheel_windows_test.bat +++ b/ci/scripts/python_wheel_windows_test.bat @@ -37,28 +37,32 @@ set PYARROW_TEST_TENSORFLOW=ON set ARROW_TEST_DATA=C:\arrow\testing\data set PARQUET_TEST_DATA=C:\arrow\cpp\submodules\parquet-testing\data -@REM Install testing dependencies -pip install -r C:\arrow\python\requirements-wheel-test.txt || exit /B 1 +@REM List installed Pythons +py -0p + +set PYTHON_CMD=py -%PYTHON% -@REM Install GCS testbench -call "C:\arrow\ci\scripts\install_gcs_testbench.bat" +%PYTHON_CMD% -m pip install -U pip setuptools || exit /B 1 + +@REM Install testing dependencies +%PYTHON_CMD% -m pip install -r C:\arrow\python\requirements-wheel-test.txt || exit /B 1 @REM Install the built wheels -python -m pip install --no-index --find-links=C:\arrow\python\dist\ pyarrow || exit /B 1 +%PYTHON_CMD% -m pip install --no-index --find-links=C:\arrow\python\dist\ pyarrow || exit /B 1 @REM Test that the modules are importable -python -c "import pyarrow" || exit /B 1 -python -c "import pyarrow._gcsfs" || exit /B 1 -python -c "import pyarrow._hdfs" || exit /B 1 -python -c "import pyarrow._s3fs" || exit /B 1 -python -c "import pyarrow.csv" || exit /B 1 -python -c "import pyarrow.dataset" || exit /B 1 -python -c "import pyarrow.flight" || exit /B 1 -python -c "import pyarrow.fs" || exit /B 1 -python -c "import pyarrow.json" || exit /B 1 -python -c "import pyarrow.orc" || exit /B 1 -python -c "import pyarrow.parquet" || exit /B 1 -python -c "import pyarrow.substrait" || exit /B 1 +%PYTHON_CMD% -c "import pyarrow" || exit /B 1 +%PYTHON_CMD% -c "import pyarrow._gcsfs" || exit /B 1 +%PYTHON_CMD% -c "import pyarrow._hdfs" || exit /B 1 +%PYTHON_CMD% -c "import pyarrow._s3fs" || exit /B 1 +%PYTHON_CMD% -c "import pyarrow.csv" || exit /B 1 +%PYTHON_CMD% -c "import pyarrow.dataset" || exit /B 1 +%PYTHON_CMD% -c "import pyarrow.flight" || exit /B 1 +%PYTHON_CMD% -c "import pyarrow.fs" || exit /B 1 +%PYTHON_CMD% -c "import pyarrow.json" || exit /B 1 +%PYTHON_CMD% -c "import pyarrow.orc" || exit /B 1 +%PYTHON_CMD% -c "import pyarrow.parquet" || exit /B 1 +%PYTHON_CMD% -c "import pyarrow.substrait" || exit /B 1 @rem Download IANA Timezone Database for ORC C++ curl https://cygwin.osuosl.org/noarch/release/tzdata/tzdata-2024a-1.tar.xz --output tzdata.tar.xz || exit /B @@ -67,4 +71,4 @@ arc unarchive tzdata.tar.xz %USERPROFILE%\Downloads\test\tzdata set TZDIR=%USERPROFILE%\Downloads\test\tzdata\usr\share\zoneinfo @REM Execute unittest -pytest -r s --pyargs pyarrow || exit /B 1 +%PYTHON_CMD% -m pytest -r s --pyargs pyarrow || exit /B 1 diff --git a/cpp/src/arrow/filesystem/gcsfs_test.cc b/cpp/src/arrow/filesystem/gcsfs_test.cc index a6022a8d21681..2098cf4d7f319 100644 --- a/cpp/src/arrow/filesystem/gcsfs_test.cc +++ b/cpp/src/arrow/filesystem/gcsfs_test.cc @@ -95,44 +95,41 @@ class GcsTestbench : public ::testing::Environment { if (const auto* env = std::getenv("PYTHON")) { names = {env}; } - auto error = std::string( - "Could not start GCS emulator." - " Used the following list of python interpreter names:"); - for (const auto& interpreter : names) { - auto exe_path = bp::search_path(interpreter); - error += " " + interpreter; - if (exe_path.empty()) { - error += " (exe not found)"; - continue; - } + auto error = std::string("Could not start GCS emulator 'storage-testbench'"); - bp::ipstream output; - server_process_ = bp::child(exe_path, "-m", "testbench", "--port", port_, group_, - bp::std_err > output); + auto testbench_is_running = [](bp::child& process, bp::ipstream& output) { // Wait for message: "* Restarting with" - auto testbench_is_running = [&output, this](bp::child& process) { - std::string line; - std::chrono::time_point end = - std::chrono::steady_clock::now() + std::chrono::seconds(10); - while (server_process_.valid() && server_process_.running() && - std::chrono::steady_clock::now() < end) { - if (output.peek() && std::getline(output, line)) { - std::cerr << line << std::endl; - if (line.find("* Restarting with") != std::string::npos) return true; - } else { - std::this_thread::sleep_for(std::chrono::milliseconds(20)); - } + std::string line; + std::chrono::time_point end = + std::chrono::steady_clock::now() + std::chrono::seconds(10); + while (process.valid() && process.running() && + std::chrono::steady_clock::now() < end) { + if (output.peek() && std::getline(output, line)) { + std::cerr << line << std::endl; + if (line.find("* Restarting with") != std::string::npos) return true; + } else { + std::this_thread::sleep_for(std::chrono::milliseconds(20)); } - return false; - }; + } + return false; + }; - if (testbench_is_running(server_process_)) break; - error += " (failed to start)"; - server_process_.terminate(); - server_process_.wait(); + auto exe_path = bp::search_path("storage-testbench"); + if (!exe_path.empty()) { + bp::ipstream output; + server_process_ = + bp::child(exe_path, "--port", port_, group_, bp::std_err > output); + if (!testbench_is_running(server_process_, output)) { + error += " (failed to start)"; + server_process_.terminate(); + server_process_.wait(); + } + } else { + error += " (exe not found)"; + } + if (!server_process_.valid()) { + error_ = std::move(error); } - if (server_process_.valid() && server_process_.valid()) return; - error_ = std::move(error); } bool running() { return server_process_.running(); } @@ -140,7 +137,10 @@ class GcsTestbench : public ::testing::Environment { ~GcsTestbench() override { // Brutal shutdown, kill the full process group because the GCS testbench may launch // additional children. - group_.terminate(); + try { + group_.terminate(); + } catch (bp::process_error&) { + } if (server_process_.valid()) { server_process_.wait(); } diff --git a/python/pyarrow/tests/conftest.py b/python/pyarrow/tests/conftest.py index e1919497b5116..7a222cec8a7c4 100644 --- a/python/pyarrow/tests/conftest.py +++ b/python/pyarrow/tests/conftest.py @@ -233,17 +233,16 @@ def minio_server_health_check(address): def gcs_server(): port = find_free_port() env = os.environ.copy() - args = [sys.executable, '-m', 'testbench', '--port', str(port)] + exe = 'storage-testbench' + args = [exe, '--port', str(port)] proc = None try: - # check first if testbench module is available - import testbench # noqa:F401 # start server proc = subprocess.Popen(args, env=env) # Make sure the server is alive. if proc.poll() is not None: pytest.skip(f"Command {args} did not start server successfully!") - except (ModuleNotFoundError, OSError) as e: + except OSError as e: pytest.skip(f"Command {args} failed to execute: {e}") else: yield { diff --git a/python/scripts/run_emscripten_tests.py b/python/scripts/run_emscripten_tests.py index 1a4b4a4e05614..53d3dd52bd8a6 100644 --- a/python/scripts/run_emscripten_tests.py +++ b/python/scripts/run_emscripten_tests.py @@ -335,7 +335,7 @@ def _load_pyarrow_in_runner(driver, wheel_name): """ import pyarrow,pathlib pyarrow_dir = pathlib.Path(pyarrow.__file__).parent -pytest.main([pyarrow_dir, '-v']) +pytest.main([pyarrow_dir, '-r', 's']) """, wait_for_terminate=False, ) diff --git a/r/tests/testthat/test-gcs.R b/r/tests/testthat/test-gcs.R index d671c12138c60..54159e82ca60f 100644 --- a/r/tests/testthat/test-gcs.R +++ b/r/tests/testthat/test-gcs.R @@ -116,12 +116,12 @@ test_that("GcsFileSystem$create() can read json_credentials", { }) skip_on_cran() -skip_if_not(system('python -c "import testbench"') == 0, message = "googleapis-storage-testbench is not installed.") +skip_if_not(system("storage-testbench -h") == 0, message = "googleapis-storage-testbench is not installed.") library(dplyr) testbench_port <- Sys.getenv("TESTBENCH_PORT", "9001") -pid_minio <- sys::exec_background("python", c("-m", "testbench", "--port", testbench_port), +pid_minio <- sys::exec_background("storage-testbench", c("--port", testbench_port), std_out = FALSE, std_err = FALSE # TODO: is there a good place to send output? )