Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SPARK-32292][SPARK-32252][INFRA] Run the relevant tests only in GitHub Actions #29086

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 10 additions & 6 deletions .github/workflows/master.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ jobs:
modules:
- |-
core, unsafe, kvstore, avro,
network_common, network_shuffle, repl, launcher
network-common, network-shuffle, repl, launcher,
Copy link
Member

@dongjoon-hyun dongjoon-hyun Jul 13, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oops. I missed this.

examples, sketch, graphx
- |-
catalyst, hive-thriftserver
Expand Down Expand Up @@ -75,16 +75,20 @@ jobs:
excluded-tags: org.apache.spark.tags.ExtendedSQLTest
comment: "- other tests"
env:
TEST_ONLY_MODULES: ${{ matrix.modules }}
TEST_ONLY_EXCLUDED_TAGS: ${{ matrix.excluded-tags }}
TEST_ONLY_INCLUDED_TAGS: ${{ matrix.included-tags }}
MODULES_TO_TEST: ${{ matrix.modules }}
EXCLUDED_TAGS: ${{ matrix.excluded-tags }}
INCLUDED_TAGS: ${{ matrix.included-tags }}
HADOOP_PROFILE: ${{ matrix.hadoop }}
HIVE_PROFILE: ${{ matrix.hive }}
# GitHub Actions' default miniconda to use in pip packaging test.
CONDA_PREFIX: /usr/share/miniconda
GITHUB_PREV_SHA: ${{ github.event.before }}
steps:
- name: Checkout Spark repository
uses: actions/checkout@v2
# In order to fetch changed files
with:
fetch-depth: 0
# Cache local repositories. Note that GitHub Actions cache has a 2G limit.
- name: Cache Scala, SBT, Maven and Zinc
uses: actions/cache@v1
Expand Down Expand Up @@ -161,9 +165,9 @@ jobs:
- name: "Run tests: ${{ matrix.modules }}"
run: |
# Hive tests become flaky when running in parallel as it's too intensive.
if [[ "$TEST_ONLY_MODULES" == "hive" ]]; then export SERIAL_SBT_TESTS=1; fi
if [[ "$MODULES_TO_TEST" == "hive" ]]; then export SERIAL_SBT_TESTS=1; fi
mkdir -p ~/.m2
./dev/run-tests --parallelism 2
./dev/run-tests --parallelism 2 --modules "$MODULES_TO_TEST" --included-tags "$INCLUDED_TAGS" --excluded-tags "$EXCLUDED_TAGS"
rm -rf ~/.m2/repository/org/apache/spark

# Static analysis, and documentation build
Expand Down
109 changes: 86 additions & 23 deletions dev/run-tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,12 +101,14 @@ def setup_test_environ(environ):
os.environ[k] = v


def determine_modules_to_test(changed_modules):
def determine_modules_to_test(changed_modules, deduplicated=True):
"""
Given a set of modules that have changed, compute the transitive closure of those modules'
dependent modules in order to determine the set of modules that should be tested.

Returns a topologically-sorted list of modules (ties are broken by sorting on module names).
If ``deduplicated`` is disabled, the modules are returned without tacking the deduplication
by dependencies into account.

>>> [x.name for x in determine_modules_to_test([modules.root])]
['root']
Expand All @@ -122,11 +124,29 @@ def determine_modules_to_test(changed_modules):
... # doctest: +NORMALIZE_WHITESPACE
['sql', 'avro', 'hive', 'mllib', 'sql-kafka-0-10', 'examples', 'hive-thriftserver',
'pyspark-sql', 'repl', 'sparkr', 'pyspark-mllib', 'pyspark-ml']
>>> sorted([x.name for x in determine_modules_to_test(
... [modules.sparkr, modules.sql], deduplicated=False)])
... # doctest: +NORMALIZE_WHITESPACE
['avro', 'examples', 'hive', 'hive-thriftserver', 'mllib', 'pyspark-ml',
'pyspark-mllib', 'pyspark-sql', 'repl', 'sparkr', 'sql', 'sql-kafka-0-10']
>>> sorted([x.name for x in determine_modules_to_test(
... [modules.sql, modules.core], deduplicated=False)])
... # doctest: +NORMALIZE_WHITESPACE
['avro', 'catalyst', 'core', 'examples', 'graphx', 'hive', 'hive-thriftserver',
'mllib', 'mllib-local', 'pyspark-core', 'pyspark-ml', 'pyspark-mllib',
'pyspark-resource', 'pyspark-sql', 'pyspark-streaming', 'repl', 'root',
'sparkr', 'sql', 'sql-kafka-0-10', 'streaming', 'streaming-kafka-0-10',
'streaming-kinesis-asl']
"""
modules_to_test = set()
for module in changed_modules:
modules_to_test = modules_to_test.union(determine_modules_to_test(module.dependent_modules))
modules_to_test = modules_to_test.union(
determine_modules_to_test(module.dependent_modules, deduplicated))
modules_to_test = modules_to_test.union(set(changed_modules))

if not deduplicated:
return modules_to_test

# If we need to run all of the tests, then we should short-circuit and return 'root'
if modules.root in modules_to_test:
return [modules.root]
Expand Down Expand Up @@ -539,6 +559,24 @@ def parse_opts():
"-p", "--parallelism", type=int, default=8,
help="The number of suites to test in parallel (default %(default)d)"
)
parser.add_argument(
"-m", "--modules", type=str,
default=None,
help="A comma-separated list of modules to test "
"(default: %s)" % ",".join(sorted([m.name for m in modules.all_modules]))
)
parser.add_argument(
"-e", "--excluded-tags", type=str,
default=None,
help="A comma-separated list of tags to exclude in the tests, "
"e.g., org.apache.spark.tags.ExtendedHiveTest "
)
parser.add_argument(
"-i", "--included-tags", type=str,
default=None,
help="A comma-separated list of tags to include in the tests, "
"e.g., org.apache.spark.tags.ExtendedHiveTest "
)

args, unknown = parser.parse_known_args()
if unknown:
Expand Down Expand Up @@ -589,43 +627,74 @@ def main():
# /home/jenkins/anaconda2/envs/py36/bin
os.environ["PATH"] = "/home/anaconda/envs/py36/bin:" + os.environ.get("PATH")
else:
# else we're running locally and can use local settings
# else we're running locally or Github Actions.
build_tool = "sbt"
hadoop_version = os.environ.get("HADOOP_PROFILE", "hadoop2.7")
hive_version = os.environ.get("HIVE_PROFILE", "hive2.3")
test_env = "local"
if "GITHUB_ACTIONS" in os.environ:
test_env = "github_actions"
else:
test_env = "local"

print("[info] Using build tool", build_tool, "with Hadoop profile", hadoop_version,
"and Hive profile", hive_version, "under environment", test_env)
extra_profiles = get_hadoop_profiles(hadoop_version) + get_hive_profiles(hive_version)

changed_modules = None
test_modules = None
changed_files = None
should_only_test_modules = "TEST_ONLY_MODULES" in os.environ
should_only_test_modules = opts.modules is not None
included_tags = []
excluded_tags = []
if should_only_test_modules:
str_test_modules = [m.strip() for m in os.environ.get("TEST_ONLY_MODULES").split(",")]
str_test_modules = [m.strip() for m in opts.modules.split(",")]
test_modules = [m for m in modules.all_modules if m.name in str_test_modules]
# Directly uses test_modules as changed modules to apply tags and environments
# as if all specified test modules are changed.

# If we're running the tests in Github Actions, attempt to detect and test
# only the affected modules.
if test_env == "github_actions":
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks. This looks safe.

if os.environ["GITHUB_BASE_REF"] != "":
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

GITHUB_BASE_REF is only set in the PR builder; otherwise, it's an empty string.

Copy link
Member

@dongjoon-hyun dongjoon-hyun Jul 13, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you mean Jenkins PR Builder?

# Pull requests
changed_files = identify_changed_files_from_git_commits(
os.environ["GITHUB_SHA"], target_branch=os.environ["GITHUB_BASE_REF"])
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

GITHUB_SHA is the commit in PR. GITHUB_BASE_REF is the target repo's branch.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For many commits in a PR, each time changed_files is different? If a previous commit changed two modules but later one changes only one, will this only run test for the later module?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

From my testing, it combines all commits and changed_files is based on all commits always.

Seems like GITHUB_SHA became a merge commit internally in Github Actions, and GITHUB_BASE_REF becomes the branch which PR opens against.

For example, if a PR has commits 1, 2 and 3, GitHub Actions looks creates a merge commit 4 for commits 1, 2 and 3. And, it calculates the diff from the branch, for example, master in the original repo, Apache Spark.

git diff 4 master

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is an example of the merge commit: HyukjinKwon@8f36ec4 at HyukjinKwon#7

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay. Thanks for clarifying. Looks good.

else:
# Build for each commit.
changed_files = identify_changed_files_from_git_commits(
os.environ["GITHUB_SHA"], target_ref=os.environ["GITHUB_PREV_SHA"])

modules_to_test = determine_modules_to_test(
determine_modules_for_files(changed_files), deduplicated=False)

if modules.root not in modules_to_test:
# If root module is not found, only test the intersected modules.
# If root module is found, just run the modules as specified initially.
test_modules = list(set(modules_to_test).intersection(test_modules))

changed_modules = test_modules
str_excluded_tags = os.environ.get("TEST_ONLY_EXCLUDED_TAGS", None)
str_included_tags = os.environ.get("TEST_ONLY_INCLUDED_TAGS", None)
excluded_tags = []
if str_excluded_tags:
excluded_tags = [t.strip() for t in str_excluded_tags.split(",")]
included_tags = []
if str_included_tags:
included_tags = [t.strip() for t in str_included_tags.split(",")]
if len(changed_modules) == 0:
print("[info] There are no modules to test, exiting without testing.")
return

# If we're running the tests in AMPLab Jenkins, calculate the diff from the targeted branch, and
# detect modules to test.
elif test_env == "amplab_jenkins" and os.environ.get("AMP_JENKINS_PRB"):
target_branch = os.environ["ghprbTargetBranch"]
changed_files = identify_changed_files_from_git_commits("HEAD", target_branch=target_branch)
changed_modules = determine_modules_for_files(changed_files)
test_modules = determine_modules_to_test(changed_modules)
excluded_tags = determine_tags_to_exclude(changed_modules)

# If there is no changed module found, tests all.
if not changed_modules:
changed_modules = [modules.root]
excluded_tags = []
if not test_modules:
test_modules = determine_modules_to_test(changed_modules)

if opts.excluded_tags:
excluded_tags.extend([t.strip() for t in opts.excluded_tags.split(",")])
if opts.included_tags:
included_tags.extend([t.strip() for t in opts.included_tags.split(",")])

print("[info] Found the following changed modules:",
", ".join(x.name for x in changed_modules))

Expand All @@ -640,8 +709,6 @@ def main():

should_run_java_style_checks = False
if not should_only_test_modules:
test_modules = determine_modules_to_test(changed_modules)

# license checks
run_apache_rat_checks()

Expand Down Expand Up @@ -702,10 +769,6 @@ def main():


def _test():
if "TEST_ONLY_MODULES" in os.environ:
# TODO(SPARK-32252): Enable doctests back in Github Actions.
return

import doctest
failure_count = doctest.testmod()[0]
if failure_count:
Expand Down