From 38454008b8d24ae99987316bc80c85576938bee2 Mon Sep 17 00:00:00 2001
From: HyukjinKwon
Date: Tue, 14 Jul 2020 20:44:09 -0700
Subject: [PATCH] [SPARK-32316][TESTS][INFRA] Test PySpark with Python 3.8 in
 Github Actions

### What changes were proposed in this pull request?

This PR aims to test PySpark with Python 3.8 in Github Actions. On the script side, it is already ready:
https://github.com/apache/spark/blob/4ad9bfd53b84a6d2497668c73af6899bae14c187/python/run-tests.py#L161

This PR also includes the following small related fixes:
1. Install Python 3.8.
2. Only install one Python implementation, instead of many, for the SQL and Yarn test cases, because their tests just need a single Python executable newer than Python 2.
3. Do not install Python 2 where it is no longer needed, since we dropped Python 2 in SPARK-32138.
4. Remove a comment about installing PyPy3 on Jenkins (SPARK-32278); it is already installed.

### Why are the changes needed?

Currently, only PyPy3 and Python 3.6 are being tested with PySpark in Github Actions. We should test the latest version of Python as well because some optimizations can only be enabled with Python 3.8+. See also https://github.com/apache/spark/pull/29114

### Does this PR introduce _any_ user-facing change?

No, dev-only.

### How was this patch tested?

Was not tested yet. The Github Actions build in this PR will test it out.

Closes #29116 from HyukjinKwon/test-python3.8-togehter.

Authored-by: HyukjinKwon
Signed-off-by: Dongjoon Hyun
---
 .github/workflows/master.yml | 29 +++++++++++++++++------------
 python/run-tests.py          |  2 +-
 2 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/.github/workflows/master.yml b/.github/workflows/master.yml
index 52f8d08bdc326..f62074bfbeb13 100644
--- a/.github/workflows/master.yml
+++ b/.github/workflows/master.yml
@@ -119,37 +119,42 @@ jobs:
         java-version: ${{ matrix.java }}
     # PySpark
     - name: Install PyPy3
-      # SQL component also has Python related tests, for example, IntegratedUDFTestUtils.
       # Note that order of Python installations here matters because default python3 is
       # overridden by pypy3.
       uses: actions/setup-python@v2
-      if: contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
+      if: contains(matrix.modules, 'pyspark')
       with:
         python-version: pypy3
         architecture: x64
     - name: Install Python 2.7
       uses: actions/setup-python@v2
-      if: contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
+      if: contains(matrix.modules, 'pyspark')
       with:
         python-version: 2.7
         architecture: x64
-    - name: Install Python 3.6
+    - name: Install Python 3.8
       uses: actions/setup-python@v2
-      if: contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
+      # We should install one Python that is higher than 3 for SQL and Yarn because:
+      # - SQL component also has Python related tests, for example, IntegratedUDFTestUtils.
+      # - Yarn has a Python specific test too, for example, YarnClusterSuite.
+      if: contains(matrix.modules, 'yarn') || contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
       with:
-        python-version: 3.6
+        python-version: 3.8
         architecture: x64
-    - name: Install Python packages
-      if: contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
+    - name: Install Python packages (Python 2.7 and PyPy3)
+      if: contains(matrix.modules, 'pyspark')
       # PyArrow is not supported in PyPy yet, see ARROW-2651.
       # TODO(SPARK-32247): scipy installation with PyPy fails for an unknown reason.
       run: |
-        python3 -m pip install numpy pyarrow pandas scipy
-        python3 -m pip list
-        python2 -m pip install numpy pyarrow pandas scipy
-        python2 -m pip list
+        python2.7 -m pip install numpy pyarrow pandas scipy
+        python2.7 -m pip list
         pypy3 -m pip install numpy pandas
         pypy3 -m pip list
+    - name: Install Python packages (Python 3.8)
+      if: contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
+      run: |
+        python3.8 -m pip install numpy pyarrow pandas scipy
+        python3.8 -m pip list
     # SparkR
     - name: Install R 3.6
       uses: r-lib/actions/setup-r@v1
diff --git a/python/run-tests.py b/python/run-tests.py
index 42510c7642264..a404c5364e9bb 100755
--- a/python/run-tests.py
+++ b/python/run-tests.py
@@ -161,7 +161,7 @@ def run_individual_python_test(target_dir, test_name, pyspark_python):
 
 def get_default_python_executables():
-    python_execs = [x for x in ["python3.6", "python2.7", "pypy3", "pypy"] if which(x)]
+    python_execs = [x for x in ["python3.8", "python2.7", "pypy3", "pypy"] if which(x)]
     if "python3.6" not in python_execs:
         p = which("python3")
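
A note on the if: conditions in the workflow diff above. Github Actions' contains() does substring matching when its first argument is a string, so contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-') selects jobs that build the plain sql module while skipping sql- prefixed submodules. Below is a minimal Python sketch of that gating logic; the function name and the sample module strings are illustrative assumptions, not values taken from the workflow.

    def should_install_python38(modules: str) -> bool:
        # Mirrors the "Install Python 3.8" step's if: condition; Python's
        # "in" operator does the same substring matching as contains().
        return (
            "yarn" in modules
            or "pyspark" in modules
            or ("sql" in modules and "sql-" not in modules)
        )

    # Illustrative module strings (hypothetical, not from the workflow matrix):
    assert should_install_python38("pyspark-sql, pyspark-core")
    assert should_install_python38("sql, hive-thriftserver")
    assert not should_install_python38("sql-kafka-0-10, streaming-kafka-0-10")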
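
For context on the run-tests.py hunk, the edited list is only a preference order: each name survives only if that interpreter is actually found on PATH, and a generic python3 can be prepended as a fallback. Here is a self-contained sketch of that selection logic, under two stated assumptions: shutil.which stands in for the which() helper that run-tests.py imports, and the fallback body past the truncated context above is paraphrased rather than quoted.

    from shutil import which

    def get_default_python_executables():
        # Keep only the interpreters actually installed on this machine.
        python_execs = [x for x in ["python3.8", "python2.7", "pypy3", "pypy"] if which(x)]
        # Guard kept as in the patch; note it still checks "python3.6", which
        # can no longer appear in the list above, so the generic python3
        # fallback is now prepended whenever a python3 executable exists.
        if "python3.6" not in python_execs:
            p = which("python3")
            if p:
                python_execs.insert(0, p)
        return python_execs

    print(get_default_python_executables())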