-
Notifications
You must be signed in to change notification settings - Fork 28.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SPARK-32292][SPARK-32252][INFRA] Run the relevant tests only in GitHub Actions #29086
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -101,12 +101,14 @@ def setup_test_environ(environ): | |
os.environ[k] = v | ||
|
||
|
||
def determine_modules_to_test(changed_modules): | ||
def determine_modules_to_test(changed_modules, deduplicated=True): | ||
""" | ||
Given a set of modules that have changed, compute the transitive closure of those modules' | ||
dependent modules in order to determine the set of modules that should be tested. | ||
|
||
Returns a topologically-sorted list of modules (ties are broken by sorting on module names). | ||
If ``deduplicated`` is disabled, the modules are returned without tacking the deduplication | ||
by dependencies into account. | ||
|
||
>>> [x.name for x in determine_modules_to_test([modules.root])] | ||
['root'] | ||
|
@@ -122,11 +124,29 @@ def determine_modules_to_test(changed_modules): | |
... # doctest: +NORMALIZE_WHITESPACE | ||
['sql', 'avro', 'hive', 'mllib', 'sql-kafka-0-10', 'examples', 'hive-thriftserver', | ||
'pyspark-sql', 'repl', 'sparkr', 'pyspark-mllib', 'pyspark-ml'] | ||
>>> sorted([x.name for x in determine_modules_to_test( | ||
... [modules.sparkr, modules.sql], deduplicated=False)]) | ||
... # doctest: +NORMALIZE_WHITESPACE | ||
['avro', 'examples', 'hive', 'hive-thriftserver', 'mllib', 'pyspark-ml', | ||
'pyspark-mllib', 'pyspark-sql', 'repl', 'sparkr', 'sql', 'sql-kafka-0-10'] | ||
>>> sorted([x.name for x in determine_modules_to_test( | ||
... [modules.sql, modules.core], deduplicated=False)]) | ||
... # doctest: +NORMALIZE_WHITESPACE | ||
['avro', 'catalyst', 'core', 'examples', 'graphx', 'hive', 'hive-thriftserver', | ||
'mllib', 'mllib-local', 'pyspark-core', 'pyspark-ml', 'pyspark-mllib', | ||
'pyspark-resource', 'pyspark-sql', 'pyspark-streaming', 'repl', 'root', | ||
'sparkr', 'sql', 'sql-kafka-0-10', 'streaming', 'streaming-kafka-0-10', | ||
'streaming-kinesis-asl'] | ||
""" | ||
modules_to_test = set() | ||
for module in changed_modules: | ||
modules_to_test = modules_to_test.union(determine_modules_to_test(module.dependent_modules)) | ||
modules_to_test = modules_to_test.union( | ||
determine_modules_to_test(module.dependent_modules, deduplicated)) | ||
modules_to_test = modules_to_test.union(set(changed_modules)) | ||
|
||
if not deduplicated: | ||
return modules_to_test | ||
|
||
# If we need to run all of the tests, then we should short-circuit and return 'root' | ||
if modules.root in modules_to_test: | ||
return [modules.root] | ||
|
@@ -539,6 +559,24 @@ def parse_opts(): | |
"-p", "--parallelism", type=int, default=8, | ||
help="The number of suites to test in parallel (default %(default)d)" | ||
) | ||
parser.add_argument( | ||
"-m", "--modules", type=str, | ||
default=None, | ||
help="A comma-separated list of modules to test " | ||
"(default: %s)" % ",".join(sorted([m.name for m in modules.all_modules])) | ||
) | ||
parser.add_argument( | ||
"-e", "--excluded-tags", type=str, | ||
default=None, | ||
help="A comma-separated list of tags to exclude in the tests, " | ||
"e.g., org.apache.spark.tags.ExtendedHiveTest " | ||
) | ||
parser.add_argument( | ||
"-i", "--included-tags", type=str, | ||
default=None, | ||
help="A comma-separated list of tags to include in the tests, " | ||
"e.g., org.apache.spark.tags.ExtendedHiveTest " | ||
) | ||
|
||
args, unknown = parser.parse_known_args() | ||
if unknown: | ||
|
@@ -589,43 +627,74 @@ def main(): | |
# /home/jenkins/anaconda2/envs/py36/bin | ||
os.environ["PATH"] = "/home/anaconda/envs/py36/bin:" + os.environ.get("PATH") | ||
else: | ||
# else we're running locally and can use local settings | ||
# else we're running locally or Github Actions. | ||
build_tool = "sbt" | ||
hadoop_version = os.environ.get("HADOOP_PROFILE", "hadoop2.7") | ||
hive_version = os.environ.get("HIVE_PROFILE", "hive2.3") | ||
test_env = "local" | ||
if "GITHUB_ACTIONS" in os.environ: | ||
test_env = "github_actions" | ||
else: | ||
test_env = "local" | ||
|
||
print("[info] Using build tool", build_tool, "with Hadoop profile", hadoop_version, | ||
"and Hive profile", hive_version, "under environment", test_env) | ||
extra_profiles = get_hadoop_profiles(hadoop_version) + get_hive_profiles(hive_version) | ||
|
||
changed_modules = None | ||
test_modules = None | ||
changed_files = None | ||
should_only_test_modules = "TEST_ONLY_MODULES" in os.environ | ||
should_only_test_modules = opts.modules is not None | ||
included_tags = [] | ||
excluded_tags = [] | ||
if should_only_test_modules: | ||
str_test_modules = [m.strip() for m in os.environ.get("TEST_ONLY_MODULES").split(",")] | ||
str_test_modules = [m.strip() for m in opts.modules.split(",")] | ||
test_modules = [m for m in modules.all_modules if m.name in str_test_modules] | ||
# Directly uses test_modules as changed modules to apply tags and environments | ||
# as if all specified test modules are changed. | ||
|
||
# If we're running the tests in Github Actions, attempt to detect and test | ||
# only the affected modules. | ||
if test_env == "github_actions": | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks. This looks safe. |
||
if os.environ["GITHUB_BASE_REF"] != "": | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
# Pull requests | ||
changed_files = identify_changed_files_from_git_commits( | ||
os.environ["GITHUB_SHA"], target_branch=os.environ["GITHUB_BASE_REF"]) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For many commits in a PR, each time There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. From my testing, it combines all commits and Seems like For example, if a PR has commits 1, 2 and 3, GitHub Actions looks creates a merge commit 4 for commits 1, 2 and 3. And, it calculates the diff from the branch, for example, git diff 4 master There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is an example of the merge commit: HyukjinKwon@8f36ec4 at HyukjinKwon#7 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Okay. Thanks for clarifying. Looks good. |
||
else: | ||
# Build for each commit. | ||
changed_files = identify_changed_files_from_git_commits( | ||
os.environ["GITHUB_SHA"], target_ref=os.environ["GITHUB_PREV_SHA"]) | ||
|
||
modules_to_test = determine_modules_to_test( | ||
determine_modules_for_files(changed_files), deduplicated=False) | ||
|
||
if modules.root not in modules_to_test: | ||
# If root module is not found, only test the intersected modules. | ||
# If root module is found, just run the modules as specified initially. | ||
test_modules = list(set(modules_to_test).intersection(test_modules)) | ||
|
||
changed_modules = test_modules | ||
str_excluded_tags = os.environ.get("TEST_ONLY_EXCLUDED_TAGS", None) | ||
str_included_tags = os.environ.get("TEST_ONLY_INCLUDED_TAGS", None) | ||
excluded_tags = [] | ||
if str_excluded_tags: | ||
excluded_tags = [t.strip() for t in str_excluded_tags.split(",")] | ||
included_tags = [] | ||
if str_included_tags: | ||
included_tags = [t.strip() for t in str_included_tags.split(",")] | ||
if len(changed_modules) == 0: | ||
print("[info] There are no modules to test, exiting without testing.") | ||
return | ||
|
||
# If we're running the tests in AMPLab Jenkins, calculate the diff from the targeted branch, and | ||
# detect modules to test. | ||
elif test_env == "amplab_jenkins" and os.environ.get("AMP_JENKINS_PRB"): | ||
target_branch = os.environ["ghprbTargetBranch"] | ||
changed_files = identify_changed_files_from_git_commits("HEAD", target_branch=target_branch) | ||
changed_modules = determine_modules_for_files(changed_files) | ||
test_modules = determine_modules_to_test(changed_modules) | ||
excluded_tags = determine_tags_to_exclude(changed_modules) | ||
|
||
# If there is no changed module found, tests all. | ||
if not changed_modules: | ||
changed_modules = [modules.root] | ||
excluded_tags = [] | ||
if not test_modules: | ||
test_modules = determine_modules_to_test(changed_modules) | ||
|
||
if opts.excluded_tags: | ||
excluded_tags.extend([t.strip() for t in opts.excluded_tags.split(",")]) | ||
if opts.included_tags: | ||
included_tags.extend([t.strip() for t in opts.included_tags.split(",")]) | ||
|
||
print("[info] Found the following changed modules:", | ||
", ".join(x.name for x in changed_modules)) | ||
|
||
|
@@ -640,8 +709,6 @@ def main(): | |
|
||
should_run_java_style_checks = False | ||
if not should_only_test_modules: | ||
test_modules = determine_modules_to_test(changed_modules) | ||
|
||
# license checks | ||
run_apache_rat_checks() | ||
|
||
|
@@ -702,10 +769,6 @@ def main(): | |
|
||
|
||
def _test(): | ||
if "TEST_ONLY_MODULES" in os.environ: | ||
# TODO(SPARK-32252): Enable doctests back in Github Actions. | ||
return | ||
|
||
import doctest | ||
failure_count = doctest.testmod()[0] | ||
if failure_count: | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Oops. I missed this.