Detect changed files and run the relevant tests only in GitHub Actions (#8)

Test in master branch
HyukjinKwon authored Jul 13, 2020
1 parent b6229df commit 74cad54
Showing 5 changed files with 109 additions and 192 deletions.
140 changes: 7 additions & 133 deletions .github/workflows/master.yml
@@ -75,156 +75,30 @@ jobs:
excluded-tags: org.apache.spark.tags.ExtendedSQLTest
comment: "- other tests"
env:
TEST_ONLY_MODULES: ${{ matrix.modules }}
TEST_ONLY_EXCLUDED_TAGS: ${{ matrix.excluded-tags }}
TEST_ONLY_INCLUDED_TAGS: ${{ matrix.included-tags }}
MODULES_TO_TEST: ${{ matrix.modules }}
EXCLUDED_TAGS: ${{ matrix.excluded-tags }}
INCLUDED_TAGS: ${{ matrix.included-tags }}
HADOOP_PROFILE: ${{ matrix.hadoop }}
HIVE_PROFILE: ${{ matrix.hive }}
# GitHub Actions' default miniconda to use in pip packaging test.
CONDA_PREFIX: /usr/share/miniconda
steps:
- name: Checkout Spark repository
uses: actions/checkout@v2
# Cache local repositories. Note that GitHub Actions cache has a 2G limit.
- name: Cache Scala, SBT, Maven and Zinc
uses: actions/cache@v1
# In order to fetch changed files
with:
path: build
key: build-${{ hashFiles('**/pom.xml') }}
restore-keys: |
build-
- name: Cache Maven local repository
uses: actions/cache@v2
with:
path: ~/.m2/repository
key: ${{ matrix.java }}-${{ matrix.hadoop }}-maven-${{ hashFiles('**/pom.xml') }}
restore-keys: |
${{ matrix.java }}-${{ matrix.hadoop }}-maven-
- name: Cache Ivy local repository
uses: actions/cache@v2
with:
path: ~/.ivy2/cache
key: ${{ matrix.java }}-${{ matrix.hadoop }}-ivy-${{ hashFiles('**/pom.xml') }}-${{ hashFiles('**/plugins.sbt') }}
restore-keys: |
${{ matrix.java }}-${{ matrix.hadoop }}-ivy-
- name: Install JDK ${{ matrix.java }}
uses: actions/setup-java@v1
with:
java-version: ${{ matrix.java }}
fetch-depth: 0
# PySpark
- name: Install PyPy3
# SQL component also has Python related tests, for example, IntegratedUDFTestUtils.
# Note that order of Python installations here matters because default python3 is
# overridden by pypy3.
uses: actions/setup-python@v2
if: contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
with:
python-version: pypy3
architecture: x64
- name: Install Python 2.7
uses: actions/setup-python@v2
if: contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
with:
python-version: 2.7
architecture: x64
- name: Install Python 3.6
uses: actions/setup-python@v2
if: contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
with:
python-version: 3.6
architecture: x64
- name: Install Python packages
if: contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
# PyArrow is not supported in PyPy yet, see ARROW-2651.
# TODO(SPARK-32247): scipy installation with PyPy fails for an unknown reason.
run: |
python3 -m pip install numpy pyarrow pandas scipy
python3 -m pip list
python2 -m pip install numpy pyarrow pandas scipy
python2 -m pip list
pypy3 -m pip install numpy pandas
pypy3 -m pip list
# SparkR
- name: Install R 3.6
uses: r-lib/actions/setup-r@v1
if: contains(matrix.modules, 'sparkr')
with:
r-version: 3.6
- name: Install R packages
if: contains(matrix.modules, 'sparkr')
run: |
sudo apt-get install -y libcurl4-openssl-dev
sudo Rscript -e "install.packages(c('knitr', 'rmarkdown', 'testthat', 'devtools', 'e1071', 'survival', 'arrow', 'roxygen2'), repos='https://cloud.r-project.org/')"
# Show installed packages in R.
sudo Rscript -e 'pkg_list <- as.data.frame(installed.packages()[, c(1,3:4)]); pkg_list[is.na(pkg_list$Priority), 1:2, drop = FALSE]'
# Run the tests.
- name: "Run tests: ${{ matrix.modules }}"
run: |
# Hive tests become flaky when running in parallel as it's too intensive.
if [[ "$TEST_ONLY_MODULES" == "hive" ]]; then export SERIAL_SBT_TESTS=1; fi
if [[ "$MODULES_TO_TEST" == "hive" ]]; then export SERIAL_SBT_TESTS=1; fi
mkdir -p ~/.m2
./dev/run-tests --parallelism 2
./dev/run-tests --parallelism 2 --modules "$MODULES_TO_TEST" --included-tags "$INCLUDED_TAGS" --excluded-tags "$EXCLUDED_TAGS"
rm -rf ~/.m2/repository/org/apache/spark
# Static analysis, and documentation build
lint:
name: Linters, licenses, dependencies and documentation generation
runs-on: ubuntu-latest
steps:
- name: Checkout Spark repository
uses: actions/checkout@v2
- name: Cache Maven local repository
uses: actions/cache@v2
with:
path: ~/.m2/repository
key: docs-maven-repo-${{ hashFiles('**/pom.xml') }}
restore-keys: |
docs-maven-
- name: Install JDK 1.8
uses: actions/setup-java@v1
with:
java-version: 1.8
- name: Install Python 3.6
uses: actions/setup-python@v2
with:
python-version: 3.6
architecture: x64
- name: Install Python linter dependencies
run: |
pip3 install flake8 sphinx numpy
- name: Install R 3.6
uses: r-lib/actions/setup-r@v1
with:
r-version: 3.6
- name: Install R linter dependencies and SparkR
run: |
sudo apt-get install -y libcurl4-openssl-dev
sudo Rscript -e "install.packages(c('devtools'), repos='https://cloud.r-project.org/')"
sudo Rscript -e "devtools::install_github('jimhester/lintr@v2.0.0')"
./R/install-dev.sh
- name: Install Ruby 2.7 for documentation generation
uses: actions/setup-ruby@v1
with:
ruby-version: 2.7
- name: Install dependencies for documentation generation
run: |
sudo apt-get install -y libcurl4-openssl-dev pandoc
pip install sphinx mkdocs numpy
gem install jekyll jekyll-redirect-from rouge
sudo Rscript -e "install.packages(c('devtools', 'testthat', 'knitr', 'rmarkdown', 'roxygen2'), repos='https://cloud.r-project.org/')"
- name: Scala linter
run: ./dev/lint-scala
- name: Java linter
run: ./dev/lint-java
- name: Python linter
run: ./dev/lint-python
- name: R linter
run: ./dev/lint-r
- name: License test
run: ./dev/check-license
- name: Dependencies test
run: ./dev/test-dependencies.sh
- name: Run documentation build
run: |
cd docs
jekyll build
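
For orientation, the net effect of the workflow change above is that each matrix job now exports its module list and tag filters as environment variables and hands them straight to dev/run-tests. A minimal sketch of what one such job ends up running, with illustrative values standing in for the matrix fields (hadoop2.7 and hive2.3 are the script's defaults; the real module and tag values come from the build matrix higher up in this file):

    # Sketch of a single matrix job after this change; module and tag values are
    # illustrative stand-ins for ${{ matrix.modules }} and ${{ matrix.excluded-tags }}.
    export MODULES_TO_TEST="hive"
    export EXCLUDED_TAGS="org.apache.spark.tags.ExtendedSQLTest"
    export INCLUDED_TAGS=""
    export HADOOP_PROFILE="hadoop2.7"
    export HIVE_PROFILE="hive2.3"
    # Hive tests are flaky when run in parallel, so the workflow serializes them.
    if [[ "$MODULES_TO_TEST" == "hive" ]]; then export SERIAL_SBT_TESTS=1; fi
    ./dev/run-tests --parallelism 2 --modules "$MODULES_TO_TEST" \
      --included-tags "$INCLUDED_TAGS" --excluded-tags "$EXCLUDED_TAGS"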
2 changes: 1 addition & 1 deletion R/README.md
@@ -1,4 +1,4 @@
# R on Spark
# R on Spark.

SparkR is an R package that provides a light-weight frontend to use Spark from R.

153 changes: 98 additions & 55 deletions dev/run-tests.py
@@ -79,17 +79,20 @@ def identify_changed_files_from_git_commits(patch_sha, target_branch=None, targe
identify_changed_files_from_git_commits("50a0496a43", target_ref="6765ef9"))]
True
"""
if target_branch is None and target_ref is None:
raise AttributeError("must specify either target_branch or target_ref")
elif target_branch is not None and target_ref is not None:
if target_branch is not None and target_ref is not None:
raise AttributeError("must specify either target_branch or target_ref, not both")
if target_branch is not None:
diff_target = target_branch
diff_target = [target_branch]
run_cmd(['git', 'fetch', 'origin', str(target_branch+':'+target_branch)])
elif target_ref is not None:
diff_target = [target_ref]
else:
diff_target = target_ref
raw_output = subprocess.check_output(['git', 'diff', '--name-only', patch_sha, diff_target],
universal_newlines=True)
# If both are not specified, just show the diff from the commit only.
diff_target = []
raw_output = subprocess.check_output(
['git', 'diff', '--name-only', patch_sha] + diff_target,
universal_newlines=True)
print(raw_output)
# Remove any empty strings
return [f for f in raw_output.split('\n') if f]
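
In shell terms, the detection this function performs on a pull-request build reduces to fetching the target branch and running a name-only diff against it; a rough equivalent under the environment GitHub Actions provides (GITHUB_BASE_REF is empty on push builds):

    # Rough shell equivalent of identify_changed_files_from_git_commits on a PR build;
    # GITHUB_BASE_REF is the target branch and GITHUB_SHA the commit under test.
    git fetch origin "$GITHUB_BASE_REF:$GITHUB_BASE_REF"
    git diff --name-only "$GITHUB_SHA" "$GITHUB_BASE_REF"
    # On a push build no base ref is set, so the script diffs the commit alone:
    # git diff --name-only "$GITHUB_SHA"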

@@ -539,6 +542,24 @@ def parse_opts():
"-p", "--parallelism", type=int, default=8,
help="The number of suites to test in parallel (default %(default)d)"
)
parser.add_argument(
"-m", "--modules", type=str,
default=None,
help="A comma-separated list of modules to test "
"(default: %s)" % ",".join(sorted([m.name for m in modules.all_modules]))
)
parser.add_argument(
"-e", "--excluded-tags", type=str,
default=None,
help="A comma-separated list of tags to exclude in the tests, "
"e.g., org.apache.spark.tags.ExtendedHiveTest "
)
parser.add_argument(
"-i", "--included-tags", type=str,
default=None,
help="A comma-separated list of tags to include in the tests, "
"e.g., org.apache.spark.tags.ExtendedHiveTest "
)

args, unknown = parser.parse_known_args()
if unknown:
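
The same options work for a local run; a hypothetical invocation restricted to two modules and one tag, with names that must match dev/sparktestsupport/modules.py and the Scala test tags (the values here are illustrative):

    # Hypothetical local run limited to the sql and hive modules, skipping
    # suites tagged as extended Hive tests.
    ./dev/run-tests --parallelism 2 --modules "sql,hive" \
      --excluded-tags "org.apache.spark.tags.ExtendedHiveTest"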
@@ -589,43 +610,64 @@ def main():
# /home/jenkins/anaconda2/envs/py36/bin
os.environ["PATH"] = "/home/anaconda/envs/py36/bin:" + os.environ.get("PATH")
else:
# else we're running locally and can use local settings
# else we're running locally or Github Actions.
build_tool = "sbt"
hadoop_version = os.environ.get("HADOOP_PROFILE", "hadoop2.7")
hive_version = os.environ.get("HIVE_PROFILE", "hive2.3")
test_env = "local"
if "GITHUB_ACTIONS" in os.environ:
test_env = "github_actions"
else:
test_env = "local"

print("[info] Using build tool", build_tool, "with Hadoop profile", hadoop_version,
"and Hive profile", hive_version, "under environment", test_env)
extra_profiles = get_hadoop_profiles(hadoop_version) + get_hive_profiles(hive_version)

changed_modules = None
test_modules = None
changed_files = None
should_only_test_modules = "TEST_ONLY_MODULES" in os.environ
should_only_test_modules = opts.modules is not None
included_tags = []
excluded_tags = []
if should_only_test_modules:
str_test_modules = [m.strip() for m in os.environ.get("TEST_ONLY_MODULES").split(",")]
str_test_modules = [m.strip() for m in opts.modules.split(",")]
test_modules = [m for m in modules.all_modules if m.name in str_test_modules]
# Directly uses test_modules as changed modules to apply tags and environments
# as if all specified test modules are changed.

# If we're running the tests in Github Actions, attempt to detect and test
# only the affected modules.
if test_env == "github_actions":
base_ref = os.environ["GITHUB_BASE_REF"]
changed_files = identify_changed_files_from_git_commits(
os.environ["GITHUB_SHA"], target_branch=None if base_ref == "" else base_ref)
print("changed_files : %s" % changed_files)
test_modules = list(set(determine_modules_to_test(
determine_modules_for_files(changed_files))).intersection(test_modules))
print("test_modules : %s" % test_modules)

changed_modules = test_modules
str_excluded_tags = os.environ.get("TEST_ONLY_EXCLUDED_TAGS", None)
str_included_tags = os.environ.get("TEST_ONLY_INCLUDED_TAGS", None)
excluded_tags = []
if str_excluded_tags:
excluded_tags = [t.strip() for t in str_excluded_tags.split(",")]
included_tags = []
if str_included_tags:
included_tags = [t.strip() for t in str_included_tags.split(",")]

# If we're running the tests in AMPLab Jenkins, calculate the diff from the targeted branch, and
# detect modules to test.
elif test_env == "amplab_jenkins" and os.environ.get("AMP_JENKINS_PRB"):
target_branch = os.environ["ghprbTargetBranch"]
changed_files = identify_changed_files_from_git_commits("HEAD", target_branch=target_branch)
changed_modules = determine_modules_for_files(changed_files)
test_modules = determine_modules_to_test(changed_modules)
excluded_tags = determine_tags_to_exclude(changed_modules)

# If there is no changed module found, tests all.
if not changed_modules:
changed_modules = [modules.root]
excluded_tags = []
if not test_modules:
test_modules = determine_modules_to_test(changed_modules)

str_excluded_tags = opts.excluded_tags
str_included_tags = opts.included_tags
if str_excluded_tags:
excluded_tags.extend([t.strip() for t in str_excluded_tags.split(",")])
if str_included_tags:
included_tags.extend([t.strip() for t in str_included_tags.split(",")])

print("[info] Found the following changed modules:",
", ".join(x.name for x in changed_modules))

@@ -640,8 +682,6 @@ def main():

should_run_java_style_checks = False
if not should_only_test_modules:
test_modules = determine_modules_to_test(changed_modules)

# license checks
run_apache_rat_checks()

@@ -672,40 +712,43 @@ def main():
# if "DOCS" in changed_modules and test_env == "amplab_jenkins":
# build_spark_documentation()

if any(m.should_run_build_tests for m in test_modules) and test_env != "amplab_jenkins":
run_build_tests()

# spark build
build_apache_spark(build_tool, extra_profiles)

# backwards compatibility checks
if build_tool == "sbt":
# Note: compatibility tests only supported in sbt for now
detect_binary_inop_with_mima(extra_profiles)
# Since we did not build assembly/package before running dev/mima, we need to
# do it here because the tests still rely on it; see SPARK-13294 for details.
build_spark_assembly_sbt(extra_profiles, should_run_java_style_checks)

# run the test suites
run_scala_tests(build_tool, extra_profiles, test_modules, excluded_tags, included_tags)

modules_with_python_tests = [m for m in test_modules if m.python_test_goals]
if modules_with_python_tests:
# We only run PySpark tests with coverage report in one specific job with
# Spark master with SBT in Jenkins.
is_sbt_master_job = "SPARK_MASTER_SBT_HADOOP_2_7" in os.environ
run_python_tests(
modules_with_python_tests, opts.parallelism, with_coverage=is_sbt_master_job)
run_python_packaging_tests()
if any(m.should_run_r_tests for m in test_modules):
run_sparkr_tests()
print(changed_modules)
print(test_modules)
print([m for m in test_modules if m.python_test_goals])
print([m.should_run_r_tests for m in test_modules])
print(excluded_tags)
print(included_tags)

# if any(m.should_run_build_tests for m in test_modules) and test_env != "amplab_jenkins":
# run_build_tests()
#
# # spark build
# build_apache_spark(build_tool, extra_profiles)
#
# # backwards compatibility checks
# if build_tool == "sbt":
# # Note: compatibility tests only supported in sbt for now
# detect_binary_inop_with_mima(extra_profiles)
# # Since we did not build assembly/package before running dev/mima, we need to
# # do it here because the tests still rely on it; see SPARK-13294 for details.
# build_spark_assembly_sbt(extra_profiles, should_run_java_style_checks)
#
# # run the test suites
# run_scala_tests(build_tool, extra_profiles, test_modules, excluded_tags, included_tags)
#
# modules_with_python_tests = [m for m in test_modules if m.python_test_goals]
# if modules_with_python_tests:
# # We only run PySpark tests with coverage report in one specific job with
# # Spark master with SBT in Jenkins.
# is_sbt_master_job = "SPARK_MASTER_SBT_HADOOP_2_7" in os.environ
# run_python_tests(
# modules_with_python_tests, opts.parallelism, with_coverage=is_sbt_master_job)
# run_python_packaging_tests()
# if any(m.should_run_r_tests for m in test_modules):
# run_sparkr_tests()


def _test():
if "TEST_ONLY_MODULES" in os.environ:
# TODO(SPARK-32252): Enable doctests back in Github Actions.
return

import doctest
failure_count = doctest.testmod()[0]
if failure_count:
2 changes: 1 addition & 1 deletion python/pyspark/worker.py
@@ -16,7 +16,7 @@
#

"""
Worker that receives input from Piped RDD.
Worker that receives input from Piped RDD
"""
from __future__ import print_function
from __future__ import absolute_import
