Update notification service (#17921)

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
huggingface · Jul 25, 2022 · 726098f · 726098f
1 parent 28d474c
commit 726098f
Show file tree

Hide file tree

Showing 2 changed files with 96 additions and 34 deletions.
diff --git a/.github/workflows/self-past.yml b/.github/workflows/self-past.yml
@@ -164,6 +164,11 @@ jobs:
     steps:
       - uses: actions/checkout@v2
       - uses: actions/download-artifact@v2
+
+      # Create a directory to store test failure tables in the next step
+      - name: Create directory
+        run: mkdir test_failure_tables
+
       - name: Send message to Slack
         env:
           CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
@@ -176,4 +181,12 @@ jobs:
         # `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
         run: |
           pip install slack_sdk
-          python utils/notification_service.py "${{ needs.setup.outputs.matrix }}"
+          python utils/notification_service.py "${{ needs.setup.outputs.matrix }}"
+
+      # Upload complete failure tables, as they might be big and only truncated versions could be sent to Slack.
+      - name: Failure table artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v2
+        with:
+          name: test_failure_tables_${{ inputs.framework }}-${{ inputs.version }}
+          path: test_failure_tables
diff --git a/utils/notification_service.py b/utils/notification_service.py
@@ -233,15 +233,11 @@ def category_failures(self) -> Dict:
                     individual_reports.append(key)
 
         header = "Single |  Multi | Category\n"
-        category_failures_report = header + "\n".join(sorted(individual_reports))
+        category_failures_report = prepare_reports(
+            title="The following modeling categories had failures", header=header, reports=individual_reports
+        )
 
-        return {
-            "type": "section",
-            "text": {
-                "type": "mrkdwn",
-                "text": f"The following modeling categories had failures:\n\n```\n{category_failures_report}\n```",
-            },
-        }
+        return {"type": "section", "text": {"type": "mrkdwn", "text": category_failures_report}}
 
     @property
     def model_failures(self) -> Dict:
@@ -302,21 +298,44 @@ def per_model_sum(model_category_dict):
 
         model_header = "Single PT |  Multi PT | Single TF |  Multi TF |     Other | Category\n"
         sorted_model_reports = sorted(model_reports, key=lambda s: s.split("] ")[-1])
-        model_failures_report = model_header + "\n".join(sorted_model_reports)
+        model_failures_report = prepare_reports(
+            title="These following model modules had failures", header=model_header, reports=sorted_model_reports
+        )
 
         module_header = "Single |  Multi | Category\n"
         sorted_module_reports = sorted(other_module_reports, key=lambda s: s.split("] ")[-1])
-        module_failures_report = module_header + "\n".join(sorted_module_reports)
-
-        report = ""
+        module_failures_report = prepare_reports(
+            title="The following non-model modules had failures", header=module_header, reports=sorted_module_reports
+        )
 
-        if len(model_reports):
-            report += f"These following model modules had failures:\n```\n{model_failures_report}\n```\n\n"
+        model_failure_sections = [
+            {"type": "section", "text": {"type": "mrkdwn", "text": model_failures_report}},
+            {"type": "section", "text": {"type": "mrkdwn", "text": module_failures_report}},
+        ]
 
-        if len(other_module_reports):
-            report += f"The following non-model modules had failures:\n```\n{module_failures_report}\n```\n\n"
+        # Save complete tables (for past CI) - to be uploaded as artifacts
+        if ci_event.startswith("Past CI"):
+            model_failures_report = prepare_reports(
+                title="These following model modules had failures",
+                header=model_header,
+                reports=sorted_model_reports,
+                to_truncate=False,
+            )
+            file_path = os.path.join(os.getcwd(), "test_failure_tables/model_failures_report.txt")
+            with open(file_path, "w", encoding="UTF-8") as fp:
+                fp.write(model_failures_report)
+
+            module_failures_report = prepare_reports(
+                title="The following non-model modules had failures",
+                header=module_header,
+                reports=sorted_module_reports,
+                to_truncate=False,
+            )
+            file_path = os.path.join(os.getcwd(), "test_failure_tables/module_failures_report.txt")
+            with open(file_path, "w", encoding="UTF-8") as fp:
+                fp.write(module_failures_report)
 
-        return {"type": "section", "text": {"type": "mrkdwn", "text": report}}
+        return model_failure_sections
 
     @property
     def additional_failures(self) -> Dict:
@@ -337,15 +356,11 @@ def additional_failures(self) -> Dict:
                 individual_reports.append(report)
 
         header = "Single |  Multi | Category\n"
-        failures_report = header + "\n".join(sorted(individual_reports))
+        failures_report = prepare_reports(
+            title="The following non-modeling tests had failures", header=header, reports=individual_reports
+        )
 
-        return {
-            "type": "section",
-            "text": {
-                "type": "mrkdwn",
-                "text": f"The following non-modeling tests had failures:\n```\n{failures_report}\n```",
-            },
-        }
+        return {"type": "section", "text": {"type": "mrkdwn", "text": failures_report}}
 
     @property
     def payload(self) -> str:
@@ -358,7 +373,10 @@ def payload(self) -> str:
             blocks.append(self.failures)
 
         if self.n_model_failures > 0:
-            blocks.extend([self.category_failures, self.model_failures])
+            blocks.append(self.category_failures)
+            for block in self.model_failures:
+                if block["text"]["text"]:
+                    blocks.append(block)
 
         if self.n_additional_failures > 0:
             blocks.append(self.additional_failures)
@@ -582,6 +600,29 @@ def add_path(self, path: str, gpu: str = None):
     return _available_artifacts
 
 
+def prepare_reports(title, header, reports, to_truncate=True):
+    report = ""
+
+    MAX_ERROR_TEXT = 3000 - len("[Truncated]")
+    if not to_truncate:
+        MAX_ERROR_TEXT = float("inf")
+
+    if len(reports) > 0:
+        # `text` must be less than 3001 characters in Slack SDK
+        # keep some room for adding "[Truncated]" when necessary
+
+        for idx in range(len(reports)):
+            _report = header + "\n".join(reports[: idx + 1])
+            new_report = f"{title}:\n```\n{_report}\n```\n"
+            if len(new_report) > MAX_ERROR_TEXT:
+                # `report` here has length <= 3000
+                report = report + "[Truncated]"
+                break
+            report = new_report
+
+    return report
+
+
 if __name__ == "__main__":
 
     org = "huggingface"
@@ -686,16 +727,24 @@ def add_path(self, path: str, gpu: str = None):
 
     unclassified_model_failures = []
 
+    # This prefix is used to get job links below. For past CI, we use `workflow_call`, which changes the job names from
+    # `Model tests (...)` to `PyTorch 1.5 / Model tests (...)` for example.
+    job_name_prefix = ""
+    if ci_event.startswith("Past CI - "):
+        framework, version = ci_event.replace("Past CI - ", "").split("-")
+        framework = "PyTorch" if framework == "pytorch" else "TensorFlow"
+        job_name_prefix = f"{framework} {version}"
+
     for model in model_results.keys():
         for artifact_path in available_artifacts[f"run_all_tests_gpu_{model}_test_reports"].paths:
             artifact = retrieve_artifact(artifact_path["name"], artifact_path["gpu"])
             if "stats" in artifact:
                 # Link to the GitHub Action job
-                model_results[model]["job_link"][artifact_path["gpu"]] = github_actions_job_links.get(
-                    # The job names use `matrix.folder` which contain things like `models/bert` instead of `models_bert`
-                    f"Model tests ({model.replace('models_', 'models/')}, {artifact_path['gpu']}-gpu)"
-                )
-
+                # The job names use `matrix.folder` which contain things like `models/bert` instead of `models_bert`
+                job_name = f"Model tests ({model.replace('models_', 'models/')}, {artifact_path['gpu']}-gpu)"
+                if job_name_prefix:
+                    job_name = f"{job_name_prefix} / {job_name}"
+                model_results[model]["job_link"][artifact_path["gpu"]] = github_actions_job_links.get(job_name)
                 failed, success, time_spent = handle_test_results(artifact["stats"])
                 model_results[model]["success"] += success
                 model_results[model]["time_spent"] += time_spent[1:-1] + ", "
@@ -809,7 +858,7 @@ def add_path(self, path: str, gpu: str = None):
 
     message = Message(title, ci_title, model_results, additional_results)
 
-    # send report only if there is any failure
-    if message.n_failures:
+    # send report only if there is any failure (for push CI)
+    if message.n_failures or ci_event != "push":
         message.post()
         message.post_reply()