apache · HyukjinKwon · Jul 13, 2020 · dongjoon-hyun · Jul 13, 2020 · dongjoon-hyun
diff --git a/.github/workflows/master.yml b/.github/workflows/master.yml
@@ -29,7 +29,7 @@ jobs:
         modules:
           - |-
             core, unsafe, kvstore, avro,
-            network_common, network_shuffle, repl, launcher
+            network-common, network-shuffle, repl, launcher,
             examples, sketch, graphx
           - |-
             catalyst, hive-thriftserver
@@ -75,16 +75,20 @@ jobs:
             excluded-tags: org.apache.spark.tags.ExtendedSQLTest
             comment: "- other tests"
     env:
-      TEST_ONLY_MODULES: ${{ matrix.modules }}
-      TEST_ONLY_EXCLUDED_TAGS: ${{ matrix.excluded-tags }}
-      TEST_ONLY_INCLUDED_TAGS: ${{ matrix.included-tags }}
+      MODULES_TO_TEST: ${{ matrix.modules }}
+      EXCLUDED_TAGS: ${{ matrix.excluded-tags }}
+      INCLUDED_TAGS: ${{ matrix.included-tags }}
       HADOOP_PROFILE: ${{ matrix.hadoop }}
       HIVE_PROFILE: ${{ matrix.hive }}
       # GitHub Actions' default miniconda to use in pip packaging test.
       CONDA_PREFIX: /usr/share/miniconda
+      GITHUB_PREV_SHA: ${{ github.event.before }}
     steps:
     - name: Checkout Spark repository
       uses: actions/checkout@v2
+      # In order to fetch changed files
+      with:
+        fetch-depth: 0
     # Cache local repositories. Note that GitHub Actions cache has a 2G limit.
     - name: Cache Scala, SBT, Maven and Zinc
       uses: actions/cache@v1
@@ -161,9 +165,9 @@ jobs:
     - name: "Run tests: ${{ matrix.modules }}"
       run: |
         # Hive tests become flaky when running in parallel as it's too intensive.
-        if [[ "$TEST_ONLY_MODULES" == "hive" ]]; then export SERIAL_SBT_TESTS=1; fi
+        if [[ "$MODULES_TO_TEST" == "hive" ]]; then export SERIAL_SBT_TESTS=1; fi
         mkdir -p ~/.m2
-        ./dev/run-tests --parallelism 2
+        ./dev/run-tests --parallelism 2 --modules "$MODULES_TO_TEST" --included-tags "$INCLUDED_TAGS" --excluded-tags "$EXCLUDED_TAGS"
         rm -rf ~/.m2/repository/org/apache/spark
 
   # Static analysis, and documentation build

diff --git a/dev/run-tests.py b/dev/run-tests.py
@@ -101,12 +101,14 @@ def setup_test_environ(environ):
         os.environ[k] = v
 
 
-def determine_modules_to_test(changed_modules):
+def determine_modules_to_test(changed_modules, deduplicated=True):
     """
     Given a set of modules that have changed, compute the transitive closure of those modules'
     dependent modules in order to determine the set of modules that should be tested.
 
     Returns a topologically-sorted list of modules (ties are broken by sorting on module names).
+    If ``deduplicated`` is disabled, the modules are returned without tacking the deduplication
+    by dependencies into account.
 
     >>> [x.name for x in determine_modules_to_test([modules.root])]
     ['root']
@@ -122,11 +124,29 @@ def determine_modules_to_test(changed_modules):
     ... # doctest: +NORMALIZE_WHITESPACE
     ['sql', 'avro', 'hive', 'mllib', 'sql-kafka-0-10', 'examples', 'hive-thriftserver',
      'pyspark-sql', 'repl', 'sparkr', 'pyspark-mllib', 'pyspark-ml']
+    >>> sorted([x.name for x in determine_modules_to_test(
+    ...     [modules.sparkr, modules.sql], deduplicated=False)])
+    ... # doctest: +NORMALIZE_WHITESPACE
+    ['avro', 'examples', 'hive', 'hive-thriftserver', 'mllib', 'pyspark-ml',
+     'pyspark-mllib', 'pyspark-sql', 'repl', 'sparkr', 'sql', 'sql-kafka-0-10']
+    >>> sorted([x.name for x in determine_modules_to_test(
+    ...     [modules.sql, modules.core], deduplicated=False)])
+    ... # doctest: +NORMALIZE_WHITESPACE
+    ['avro', 'catalyst', 'core', 'examples', 'graphx', 'hive', 'hive-thriftserver',
+     'mllib', 'mllib-local', 'pyspark-core', 'pyspark-ml', 'pyspark-mllib',
+     'pyspark-resource', 'pyspark-sql', 'pyspark-streaming', 'repl', 'root',
+     'sparkr', 'sql', 'sql-kafka-0-10', 'streaming', 'streaming-kafka-0-10',
+     'streaming-kinesis-asl']
     """
     modules_to_test = set()
     for module in changed_modules:
-        modules_to_test = modules_to_test.union(determine_modules_to_test(module.dependent_modules))
+        modules_to_test = modules_to_test.union(
+            determine_modules_to_test(module.dependent_modules, deduplicated))
     modules_to_test = modules_to_test.union(set(changed_modules))
+
+    if not deduplicated:
+        return modules_to_test
+
     # If we need to run all of the tests, then we should short-circuit and return 'root'
     if modules.root in modules_to_test:
         return [modules.root]
@@ -539,6 +559,24 @@ def parse_opts():
         "-p", "--parallelism", type=int, default=8,
         help="The number of suites to test in parallel (default %(default)d)"
     )
+    parser.add_argument(
+        "-m", "--modules", type=str,
+        default=None,
+        help="A comma-separated list of modules to test "
+             "(default: %s)" % ",".join(sorted([m.name for m in modules.all_modules]))
+    )
+    parser.add_argument(
+        "-e", "--excluded-tags", type=str,
+        default=None,
+        help="A comma-separated list of tags to exclude in the tests, "
+             "e.g., org.apache.spark.tags.ExtendedHiveTest "
+    )
+    parser.add_argument(
+        "-i", "--included-tags", type=str,
+        default=None,
+        help="A comma-separated list of tags to include in the tests, "
+             "e.g., org.apache.spark.tags.ExtendedHiveTest "
+    )
 
     args, unknown = parser.parse_known_args()
     if unknown:
@@ -589,43 +627,74 @@ def main():
         # /home/jenkins/anaconda2/envs/py36/bin
         os.environ["PATH"] = "/home/anaconda/envs/py36/bin:" + os.environ.get("PATH")
     else:
-        # else we're running locally and can use local settings
+        # else we're running locally or Github Actions.
         build_tool = "sbt"
         hadoop_version = os.environ.get("HADOOP_PROFILE", "hadoop2.7")
         hive_version = os.environ.get("HIVE_PROFILE", "hive2.3")
-        test_env = "local"
+        if "GITHUB_ACTIONS" in os.environ:
+            test_env = "github_actions"
+        else:
+            test_env = "local"
 
     print("[info] Using build tool", build_tool, "with Hadoop profile", hadoop_version,
           "and Hive profile", hive_version, "under environment", test_env)
     extra_profiles = get_hadoop_profiles(hadoop_version) + get_hive_profiles(hive_version)
 
     changed_modules = None
+    test_modules = None
     changed_files = None
-    should_only_test_modules = "TEST_ONLY_MODULES" in os.environ
+    should_only_test_modules = opts.modules is not None
     included_tags = []
+    excluded_tags = []
     if should_only_test_modules:
-        str_test_modules = [m.strip() for m in os.environ.get("TEST_ONLY_MODULES").split(",")]
+        str_test_modules = [m.strip() for m in opts.modules.split(",")]
         test_modules = [m for m in modules.all_modules if m.name in str_test_modules]
-        # Directly uses test_modules as changed modules to apply tags and environments
-        # as if all specified test modules are changed.
+
+        # If we're running the tests in Github Actions, attempt to detect and test
+        # only the affected modules.
+        if test_env == "github_actions":
+            if os.environ["GITHUB_BASE_REF"] != "":
+                # Pull requests
+                changed_files = identify_changed_files_from_git_commits(
+                    os.environ["GITHUB_SHA"], target_branch=os.environ["GITHUB_BASE_REF"])
+            else:
+                # Build for each commit.
+                changed_files = identify_changed_files_from_git_commits(
+                    os.environ["GITHUB_SHA"], target_ref=os.environ["GITHUB_PREV_SHA"])
+
+            modules_to_test = determine_modules_to_test(
+                determine_modules_for_files(changed_files), deduplicated=False)
+
+            if modules.root not in modules_to_test:
+                # If root module is not found, only test the intersected modules.
+                # If root module is found, just run the modules as specified initially.
+                test_modules = list(set(modules_to_test).intersection(test_modules))
+
         changed_modules = test_modules
-        str_excluded_tags = os.environ.get("TEST_ONLY_EXCLUDED_TAGS", None)
-        str_included_tags = os.environ.get("TEST_ONLY_INCLUDED_TAGS", None)
-        excluded_tags = []
-        if str_excluded_tags:
-            excluded_tags = [t.strip() for t in str_excluded_tags.split(",")]
-        included_tags = []
-        if str_included_tags:
-            included_tags = [t.strip() for t in str_included_tags.split(",")]
+        if len(changed_modules) == 0:
+            print("[info] There are no modules to test, exiting without testing.")
+            return
+
+    # If we're running the tests in AMPLab Jenkins, calculate the diff from the targeted branch, and
+    # detect modules to test.
     elif test_env == "amplab_jenkins" and os.environ.get("AMP_JENKINS_PRB"):
         target_branch = os.environ["ghprbTargetBranch"]
         changed_files = identify_changed_files_from_git_commits("HEAD", target_branch=target_branch)
         changed_modules = determine_modules_for_files(changed_files)
+        test_modules = determine_modules_to_test(changed_modules)
         excluded_tags = determine_tags_to_exclude(changed_modules)
 
+    # If there is no changed module found, tests all.
     if not changed_modules:
         changed_modules = [modules.root]
-        excluded_tags = []
+    if not test_modules:
+        test_modules = determine_modules_to_test(changed_modules)
+
+    if opts.excluded_tags:
+        excluded_tags.extend([t.strip() for t in opts.excluded_tags.split(",")])
+    if opts.included_tags:
+        included_tags.extend([t.strip() for t in opts.included_tags.split(",")])
+
     print("[info] Found the following changed modules:",
           ", ".join(x.name for x in changed_modules))
 
@@ -640,8 +709,6 @@ def main():
 
     should_run_java_style_checks = False
     if not should_only_test_modules:
-        test_modules = determine_modules_to_test(changed_modules)
-
         # license checks
         run_apache_rat_checks()
 
@@ -702,10 +769,6 @@ def main():
 
 
 def _test():
-    if "TEST_ONLY_MODULES" in os.environ:
-        # TODO(SPARK-32252): Enable doctests back in Github Actions.
-        return
-
     import doctest
     failure_count = doctest.testmod()[0]
     if failure_count: