diff --git a/.github/workflows/master.yml b/.github/workflows/master.yml
index 7bb5481a561b9..69aba955c81f7 100644
--- a/.github/workflows/master.yml
+++ b/.github/workflows/master.yml
@@ -75,9 +75,9 @@ jobs:
             excluded-tags: org.apache.spark.tags.ExtendedSQLTest
             comment: "- other tests"
     env:
-      TEST_ONLY_MODULES: ${{ matrix.modules }}
-      TEST_ONLY_EXCLUDED_TAGS: ${{ matrix.excluded-tags }}
-      TEST_ONLY_INCLUDED_TAGS: ${{ matrix.included-tags }}
+      MODULES_TO_TEST: ${{ matrix.modules }}
+      EXCLUDED_TAGS: ${{ matrix.excluded-tags }}
+      INCLUDED_TAGS: ${{ matrix.included-tags }}
       HADOOP_PROFILE: ${{ matrix.hadoop }}
       HIVE_PROFILE: ${{ matrix.hive }}
       # GitHub Actions' default miniconda to use in pip packaging test.
@@ -85,146 +85,20 @@ jobs:
     steps:
     - name: Checkout Spark repository
       uses: actions/checkout@v2
-    # Cache local repositories. Note that GitHub Actions cache has a 2G limit.
-    - name: Cache Scala, SBT, Maven and Zinc
-      uses: actions/cache@v1
+      # In order to fetch changed files
       with:
-        path: build
-        key: build-${{ hashFiles('**/pom.xml') }}
-        restore-keys: |
-          build-
-    - name: Cache Maven local repository
-      uses: actions/cache@v2
-      with:
-        path: ~/.m2/repository
-        key: ${{ matrix.java }}-${{ matrix.hadoop }}-maven-${{ hashFiles('**/pom.xml') }}
-        restore-keys: |
-          ${{ matrix.java }}-${{ matrix.hadoop }}-maven-
-    - name: Cache Ivy local repository
-      uses: actions/cache@v2
-      with:
-        path: ~/.ivy2/cache
-        key: ${{ matrix.java }}-${{ matrix.hadoop }}-ivy-${{ hashFiles('**/pom.xml') }}-${{ hashFiles('**/plugins.sbt') }}
-        restore-keys: |
-          ${{ matrix.java }}-${{ matrix.hadoop }}-ivy-
-    - name: Install JDK ${{ matrix.java }}
-      uses: actions/setup-java@v1
-      with:
-        java-version: ${{ matrix.java }}
+        fetch-depth: 0
     # PySpark
-    - name: Install PyPy3
-      # SQL component also has Python related tests, for example, IntegratedUDFTestUtils.
-      # Note that order of Python installations here matters because default python3 is
-      # overridden by pypy3.
-      uses: actions/setup-python@v2
-      if: contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
-      with:
-        python-version: pypy3
-        architecture: x64
-    - name: Install Python 2.7
-      uses: actions/setup-python@v2
-      if: contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
-      with:
-        python-version: 2.7
-        architecture: x64
     - name: Install Python 3.6
       uses: actions/setup-python@v2
       if: contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
       with:
         python-version: 3.6
         architecture: x64
-    - name: Install Python packages
-      if: contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
-      # PyArrow is not supported in PyPy yet, see ARROW-2651.
-      # TODO(SPARK-32247): scipy installation with PyPy fails for an unknown reason.
-      run: |
-        python3 -m pip install numpy pyarrow pandas scipy
-        python3 -m pip list
-        python2 -m pip install numpy pyarrow pandas scipy
-        python2 -m pip list
-        pypy3 -m pip install numpy pandas
-        pypy3 -m pip list
-    # SparkR
-    - name: Install R 3.6
-      uses: r-lib/actions/setup-r@v1
-      if: contains(matrix.modules, 'sparkr')
-      with:
-        r-version: 3.6
-    - name: Install R packages
-      if: contains(matrix.modules, 'sparkr')
-      run: |
-        sudo apt-get install -y libcurl4-openssl-dev
-        sudo Rscript -e "install.packages(c('knitr', 'rmarkdown', 'testthat', 'devtools', 'e1071', 'survival', 'arrow', 'roxygen2'), repos='https://cloud.r-project.org/')"
-        # Show installed packages in R.
-        sudo Rscript -e 'pkg_list <- as.data.frame(installed.packages()[, c(1,3:4)]); pkg_list[is.na(pkg_list$Priority), 1:2, drop = FALSE]'
-    # Run the tests.
     - name: "Run tests: ${{ matrix.modules }}"
       run: |
         # Hive tests become flaky when running in parallel as it's too intensive.
-        if [[ "$TEST_ONLY_MODULES" == "hive" ]]; then export SERIAL_SBT_TESTS=1; fi
+        if [[ "$MODULES_TO_TEST" == "hive" ]]; then export SERIAL_SBT_TESTS=1; fi
         mkdir -p ~/.m2
-        ./dev/run-tests --parallelism 2
+        ./dev/run-tests --parallelism 2 --modules "$MODULES_TO_TEST" --included-tags "$INCLUDED_TAGS" --excluded-tags "$EXCLUDED_TAGS"
         rm -rf ~/.m2/repository/org/apache/spark
-
-  # Static analysis, and documentation build
-  lint:
-    name: Linters, licenses, dependencies and documentation generation
-    runs-on: ubuntu-latest
-    steps:
-    - name: Checkout Spark repository
-      uses: actions/checkout@v2
-    - name: Cache Maven local repository
-      uses: actions/cache@v2
-      with:
-        path: ~/.m2/repository
-        key: docs-maven-repo-${{ hashFiles('**/pom.xml') }}
-        restore-keys: |
-          docs-maven-
-    - name: Install JDK 1.8
-      uses: actions/setup-java@v1
-      with:
-        java-version: 1.8
-    - name: Install Python 3.6
-      uses: actions/setup-python@v2
-      with:
-        python-version: 3.6
-        architecture: x64
-    - name: Install Python linter dependencies
-      run: |
-        pip3 install flake8 sphinx numpy
-    - name: Install R 3.6
-      uses: r-lib/actions/setup-r@v1
-      with:
-        r-version: 3.6
-    - name: Install R linter dependencies and SparkR
-      run: |
-        sudo apt-get install -y libcurl4-openssl-dev
-        sudo Rscript -e "install.packages(c('devtools'), repos='https://cloud.r-project.org/')"
-        sudo Rscript -e "devtools::install_github('jimhester/lintr@v2.0.0')"
-        ./R/install-dev.sh
-    - name: Install Ruby 2.7 for documentation generation
-      uses: actions/setup-ruby@v1
-      with:
-        ruby-version: 2.7
-    - name: Install dependencies for documentation generation
-      run: |
-        sudo apt-get install -y libcurl4-openssl-dev pandoc
-        pip install sphinx mkdocs numpy
-        gem install jekyll jekyll-redirect-from rouge
-        sudo Rscript -e "install.packages(c('devtools', 'testthat', 'knitr', 'rmarkdown', 'roxygen2'), repos='https://cloud.r-project.org/')"
-    - name: Scala linter
-      run: ./dev/lint-scala
-    - name: Java linter
-      run: ./dev/lint-java
-    - name: Python linter
-      run: ./dev/lint-python
-    - name: R linter
-      run: ./dev/lint-r
-    - name: License test
-      run: ./dev/check-license
-    - name: Dependencies test
-      run: ./dev/test-dependencies.sh
-    - name: Run documentation build
-      run: |
-        cd docs
-        jekyll build
diff --git a/R/README.md b/R/README.md
index 31174c73526f2..bd59b3daad1d4 100644
--- a/R/README.md
+++ b/R/README.md
@@ -1,4 +1,4 @@
-# R on Spark
+# R on Spark.
 
 SparkR is an R package that provides a light-weight frontend to use Spark from R.
 
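Note on the workflow change above: the build job no longer exports the TEST_ONLY_* variables for dev/run-tests to pick up implicitly; instead the "Run tests" step forwards MODULES_TO_TEST, INCLUDED_TAGS and EXCLUDED_TAGS as explicit --modules/--included-tags/--excluded-tags flags. Below is a minimal, self-contained sketch of how such comma-separated flag values can be split into lists; parse_tag_options is a hypothetical helper for illustration only, while the real flag definitions are the parser.add_argument calls added to dev/run-tests.py in the diff that follows.

    # Sketch only: mirrors the comma-splitting behaviour of the new flags,
    # not the actual dev/run-tests.py implementation.
    import argparse


    def parse_tag_options(argv):
        parser = argparse.ArgumentParser()
        parser.add_argument("--modules", default=None)
        parser.add_argument("--included-tags", default=None)
        parser.add_argument("--excluded-tags", default=None)
        opts = parser.parse_args(argv)

        def split(value):
            # Empty or missing values become empty lists, matching how the
            # workflow passes nothing when a matrix entry sets no tags.
            return [t.strip() for t in value.split(",")] if value else []

        return split(opts.modules), split(opts.included_tags), split(opts.excluded_tags)


    modules, included, excluded = parse_tag_options(
        ["--modules", "hive", "--excluded-tags", "org.apache.spark.tags.ExtendedHiveTest"])
    print(modules, included, excluded)
    # ['hive'] [] ['org.apache.spark.tags.ExtendedHiveTest']

Passing the selection on the command line rather than through environment variables also makes a CI job easy to reproduce locally, e.g. ./dev/run-tests --modules hive --excluded-tags org.apache.spark.tags.ExtendedHiveTest.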
diff --git a/dev/run-tests.py b/dev/run-tests.py
index 03cc3230a65fd..74a8152536ab4 100755
--- a/dev/run-tests.py
+++ b/dev/run-tests.py
@@ -79,17 +79,20 @@ def identify_changed_files_from_git_commits(patch_sha, target_branch=None, targe
          identify_changed_files_from_git_commits("50a0496a43", target_ref="6765ef9"))]
     True
     """
-    if target_branch is None and target_ref is None:
-        raise AttributeError("must specify either target_branch or target_ref")
-    elif target_branch is not None and target_ref is not None:
+    if target_branch is not None and target_ref is not None:
         raise AttributeError("must specify either target_branch or target_ref, not both")
     if target_branch is not None:
-        diff_target = target_branch
+        diff_target = [target_branch]
         run_cmd(['git', 'fetch', 'origin', str(target_branch+':'+target_branch)])
+    elif target_ref is not None:
+        diff_target = [target_ref]
     else:
-        diff_target = target_ref
-    raw_output = subprocess.check_output(['git', 'diff', '--name-only', patch_sha, diff_target],
-                                         universal_newlines=True)
+        # If both are not specified, just show the diff from the commit only.
+        diff_target = []
+    raw_output = subprocess.check_output(
+        ['git', 'diff', '--name-only', patch_sha] + diff_target,
+        universal_newlines=True)
+    print(raw_output)
     # Remove any empty strings
     return [f for f in raw_output.split('\n') if f]
 
@@ -539,6 +542,24 @@ def parse_opts():
         "-p", "--parallelism", type=int, default=8,
         help="The number of suites to test in parallel (default %(default)d)"
     )
+    parser.add_argument(
+        "-m", "--modules", type=str,
+        default=None,
+        help="A comma-separated list of modules to test "
+             "(default: %s)" % ",".join(sorted([m.name for m in modules.all_modules]))
+    )
+    parser.add_argument(
+        "-e", "--excluded-tags", type=str,
+        default=None,
+        help="A comma-separated list of tags to exclude in the tests, "
+             "e.g., org.apache.spark.tags.ExtendedHiveTest "
+    )
+    parser.add_argument(
+        "-i", "--included-tags", type=str,
+        default=None,
+        help="A comma-separated list of tags to include in the tests, "
+             "e.g., org.apache.spark.tags.ExtendedHiveTest "
+    )
 
     args, unknown = parser.parse_known_args()
     if unknown:
@@ -589,43 +610,64 @@ def main():
         # /home/jenkins/anaconda2/envs/py36/bin
         os.environ["PATH"] = "/home/anaconda/envs/py36/bin:" + os.environ.get("PATH")
     else:
-        # else we're running locally and can use local settings
+        # else we're running locally or Github Actions.
        build_tool = "sbt"
        hadoop_version = os.environ.get("HADOOP_PROFILE", "hadoop2.7")
        hive_version = os.environ.get("HIVE_PROFILE", "hive2.3")
-        test_env = "local"
+        if "GITHUB_ACTIONS" in os.environ:
+            test_env = "github_actions"
+        else:
+            test_env = "local"
 
     print("[info] Using build tool", build_tool, "with Hadoop profile", hadoop_version,
           "and Hive profile", hive_version, "under environment", test_env)
     extra_profiles = get_hadoop_profiles(hadoop_version) + get_hive_profiles(hive_version)
 
     changed_modules = None
+    test_modules = None
     changed_files = None
-    should_only_test_modules = "TEST_ONLY_MODULES" in os.environ
+    should_only_test_modules = opts.modules is not None
     included_tags = []
+    excluded_tags = []
     if should_only_test_modules:
-        str_test_modules = [m.strip() for m in os.environ.get("TEST_ONLY_MODULES").split(",")]
+        str_test_modules = [m.strip() for m in opts.modules.split(",")]
         test_modules = [m for m in modules.all_modules if m.name in str_test_modules]
-        # Directly uses test_modules as changed modules to apply tags and environments
-        # as if all specified test modules are changed.
+
+        # If we're running the tests in Github Actions, attempt to detect and test
+        # only the affected modules.
+        if test_env == "github_actions":
+            base_ref = os.environ["GITHUB_BASE_REF"]
+            changed_files = identify_changed_files_from_git_commits(
+                os.environ["GITHUB_SHA"], target_branch=None if base_ref == "" else base_ref)
+            print("changed_files : %s" % changed_files)
+            test_modules = list(set(determine_modules_to_test(
+                determine_modules_for_files(changed_files))).intersection(test_modules))
+            print("test_modules : %s" % test_modules)
+
         changed_modules = test_modules
-        str_excluded_tags = os.environ.get("TEST_ONLY_EXCLUDED_TAGS", None)
-        str_included_tags = os.environ.get("TEST_ONLY_INCLUDED_TAGS", None)
-        excluded_tags = []
-        if str_excluded_tags:
-            excluded_tags = [t.strip() for t in str_excluded_tags.split(",")]
-        included_tags = []
-        if str_included_tags:
-            included_tags = [t.strip() for t in str_included_tags.split(",")]
+
+    # If we're running the tests in AMPLab Jenkins, calculate the diff from the targeted branch, and
+    # detect modules to test.
     elif test_env == "amplab_jenkins" and os.environ.get("AMP_JENKINS_PRB"):
         target_branch = os.environ["ghprbTargetBranch"]
         changed_files = identify_changed_files_from_git_commits("HEAD", target_branch=target_branch)
         changed_modules = determine_modules_for_files(changed_files)
+        test_modules = determine_modules_to_test(changed_modules)
         excluded_tags = determine_tags_to_exclude(changed_modules)
 
+    # If there is no changed module found, tests all.
     if not changed_modules:
         changed_modules = [modules.root]
-        excluded_tags = []
+    if not test_modules:
+        test_modules = determine_modules_to_test(changed_modules)
+
+    str_excluded_tags = opts.excluded_tags
+    str_included_tags = opts.included_tags
+    if str_excluded_tags:
+        excluded_tags.extend([t.strip() for t in str_excluded_tags.split(",")])
+    if str_included_tags:
+        included_tags.extend([t.strip() for t in str_included_tags.split(",")])
+
     print("[info] Found the following changed modules:",
           ", ".join(x.name for x in changed_modules))
 
@@ -640,8 +682,6 @@ def main():
     should_run_java_style_checks = False
 
     if not should_only_test_modules:
-        test_modules = determine_modules_to_test(changed_modules)
-
         # license checks
         run_apache_rat_checks()
 
@@ -672,40 +712,43 @@ def main():
     # if "DOCS" in changed_modules and test_env == "amplab_jenkins":
     #     build_spark_documentation()
 
-    if any(m.should_run_build_tests for m in test_modules) and test_env != "amplab_jenkins":
-        run_build_tests()
-
-    # spark build
-    build_apache_spark(build_tool, extra_profiles)
-
-    # backwards compatibility checks
-    if build_tool == "sbt":
-        # Note: compatibility tests only supported in sbt for now
-        detect_binary_inop_with_mima(extra_profiles)
-        # Since we did not build assembly/package before running dev/mima, we need to
-        # do it here because the tests still rely on it; see SPARK-13294 for details.
-        build_spark_assembly_sbt(extra_profiles, should_run_java_style_checks)
-
-    # run the test suites
-    run_scala_tests(build_tool, extra_profiles, test_modules, excluded_tags, included_tags)
-
-    modules_with_python_tests = [m for m in test_modules if m.python_test_goals]
-    if modules_with_python_tests:
-        # We only run PySpark tests with coverage report in one specific job with
-        # Spark master with SBT in Jenkins.
-        is_sbt_master_job = "SPARK_MASTER_SBT_HADOOP_2_7" in os.environ
-        run_python_tests(
-            modules_with_python_tests, opts.parallelism, with_coverage=is_sbt_master_job)
-        run_python_packaging_tests()
-    if any(m.should_run_r_tests for m in test_modules):
-        run_sparkr_tests()
+    print(changed_modules)
+    print(test_modules)
+    print([m for m in test_modules if m.python_test_goals])
+    print([m.should_run_r_tests for m in test_modules])
+    print(excluded_tags)
+    print(included_tags)
+
+    # if any(m.should_run_build_tests for m in test_modules) and test_env != "amplab_jenkins":
+    #     run_build_tests()
+    #
+    # # spark build
+    # build_apache_spark(build_tool, extra_profiles)
+    #
+    # # backwards compatibility checks
+    # if build_tool == "sbt":
+    #     # Note: compatibility tests only supported in sbt for now
+    #     detect_binary_inop_with_mima(extra_profiles)
+    #     # Since we did not build assembly/package before running dev/mima, we need to
+    #     # do it here because the tests still rely on it; see SPARK-13294 for details.
+    #     build_spark_assembly_sbt(extra_profiles, should_run_java_style_checks)
+    #
+    # # run the test suites
+    # run_scala_tests(build_tool, extra_profiles, test_modules, excluded_tags, included_tags)
+    #
+    # modules_with_python_tests = [m for m in test_modules if m.python_test_goals]
+    # if modules_with_python_tests:
+    #     # We only run PySpark tests with coverage report in one specific job with
+    #     # Spark master with SBT in Jenkins.
+    #     is_sbt_master_job = "SPARK_MASTER_SBT_HADOOP_2_7" in os.environ
+    #     run_python_tests(
+    #         modules_with_python_tests, opts.parallelism, with_coverage=is_sbt_master_job)
+    #     run_python_packaging_tests()
+    # if any(m.should_run_r_tests for m in test_modules):
+    #     run_sparkr_tests()
 
 
 def _test():
-    if "TEST_ONLY_MODULES" in os.environ:
-        # TODO(SPARK-32252): Enable doctests back in Github Actions.
-        return
-
     import doctest
     failure_count = doctest.testmod()[0]
     if failure_count:
diff --git a/python/pyspark/worker.py b/python/pyspark/worker.py
index 5f4a8a2d2db1f..bb192b487d4c4 100644
--- a/python/pyspark/worker.py
+++ b/python/pyspark/worker.py
@@ -16,7 +16,7 @@
 #
 
 """
-Worker that receives input from Piped RDD.
+Worker that receives input from Piped RDD
 """
 from __future__ import print_function
 from __future__ import absolute_import
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala
index da542c67d9c51..215ba8b6846c7 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala
@@ -61,10 +61,10 @@ private[sql] object Column {
 
 /**
  * A [[Column]] where an [[Encoder]] has been given for the expected input and return type.
- * To create a [[TypedColumn]], use the `as` function on a [[Column]].
+ * To create a [[TypedColumn]], use the `as` function on a [[Column]]
  *
  * @tparam T The input type expected for this expression. Can be `Any` if the expression is type
- *           checked by the analyzer instead of the compiler (i.e. `expr("sum(...)")`).
+ *           checked by the analyzer instead of the compiler (i.e. `expr("sum(...)")`)
  * @tparam U The output type of this column.
  *
  * @since 1.6.0
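Taken together, the dev/run-tests.py changes above make the script do its selection in two steps when it runs under GitHub Actions: it diffs the pushed commit (GITHUB_SHA) against the base branch (GITHUB_BASE_REF, when set), maps the changed files to modules, and intersects that set with the modules requested via --modules. The following is a simplified sketch of that flow under the assumption that only git and the standard library are available; changed_files and select_test_modules are illustrative names, whereas the real code relies on identify_changed_files_from_git_commits, determine_modules_for_files and determine_modules_to_test from sparktestsupport.

    # Sketch of the GitHub Actions module-selection flow, not the actual run-tests code.
    import os
    import subprocess


    def changed_files(patch_sha, target_branch=None):
        # Like identify_changed_files_from_git_commits: diff against the target branch
        # when one is given, otherwise just list the files touched by the commit itself.
        # (dev/run-tests.py also fetches the target branch before diffing.)
        target = [target_branch] if target_branch else []
        raw = subprocess.check_output(
            ["git", "diff", "--name-only", patch_sha] + target,
            universal_newlines=True)
        return [f for f in raw.split("\n") if f]


    def select_test_modules(requested, affected):
        # Only keep the requested modules that are actually affected by the change.
        return sorted(set(requested).intersection(affected))


    if __name__ == "__main__":
        base_ref = os.environ.get("GITHUB_BASE_REF", "")
        sha = os.environ.get("GITHUB_SHA", "HEAD")
        files = changed_files(sha, target_branch=base_ref or None)
        # The real mapping from files to modules lives in sparktestsupport/modules.py;
        # here we pretend every top-level directory is a module.
        affected = {f.split("/")[0] for f in files}
        print(select_test_modules(["hive", "sql", "pyspark-sql"], affected))

With GITHUB_BASE_REF unset (a push build rather than a pull request), the sketch falls back to diffing the commit alone, mirroring the new no-target branch added to identify_changed_files_from_git_commits above.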