Fix logging of conversation level core metrics. #8030

Merged: 18 commits, Mar 5, 2021
3 changes: 3 additions & 0 deletions changelog/8000.improvement.md
@@ -0,0 +1,3 @@
Remove console logging of conversation-level F1-score and precision, since these calculations were not meaningful.

Add conversation-level accuracy to the core policy results logged to `story_report.json` after running `rasa test core` or `rasa test`.
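For illustration only, a minimal sketch of reading the new entry, assuming the default `results/` output directory of `rasa test` and the key names used in the test added below:

import json

# Hypothetical path; adjust to wherever `rasa test` wrote its reports.
with open("results/story_report.json", encoding="utf8") as f:
    report = json.load(f)

conv = report["conversation_accuracy"]  # entry added by this PR
print(f"{conv['correct']} / {conv['total']} conversations correct "
      f"(accuracy {conv['accuracy']:.3f})")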
65 changes: 35 additions & 30 deletions rasa/core/test.py
@@ -667,7 +667,7 @@ async def _collect_story_predictions(
use_e2e: bool = False,
) -> Tuple[StoryEvaluation, int]:
"""Test the stories from a file, running them through the stored model."""
from rasa.test import get_evaluation_metrics
from sklearn.metrics import accuracy_score
from tqdm import tqdm

story_eval_store = EvaluationStore()
@@ -702,25 +702,18 @@ async def _collect_story_predictions(
success.append(predicted_tracker)

logger.info("Finished collecting predictions.")
with warnings.catch_warnings():
from sklearn.exceptions import UndefinedMetricWarning

warnings.simplefilter("ignore", UndefinedMetricWarning)
report, precision, f1, accuracy = get_evaluation_metrics(
[1] * len(completed_trackers), correct_dialogues
)

in_training_data_fraction = _in_training_data_fraction(action_list)

if len(correct_dialogues):
accuracy = accuracy_score([1] * len(correct_dialogues), correct_dialogues)
else:
accuracy = 0

_log_evaluation_table(
[1] * len(completed_trackers),
"END-TO-END" if use_e2e else "CONVERSATION",
report,
precision,
f1,
accuracy,
in_training_data_fraction,
include_report=False,
)

return (
@@ -795,15 +788,26 @@ async def test(
targets, predictions = evaluation_store.serialise()

if out_directory:
report, precision, f1, accuracy = get_evaluation_metrics(
report, precision, f1, action_accuracy = get_evaluation_metrics(
targets, predictions, output_dict=True
)

# Add conversation level accuracy to story report.
num_failed = len(story_evaluation.failed_stories)
num_correct = len(story_evaluation.successful_stories)
num_convs = num_failed + num_correct
if num_convs:
conv_accuracy = num_correct / num_convs
report["conversation_accuracy"] = {
Contributor: We should test this new data is there and correct in a unit test.

Contributor Author: Totally! I'll create a unit test in tests/core/test_test.py
"accuracy": conv_accuracy,
"correct": num_correct,
"total": num_convs,
}
report_filename = os.path.join(out_directory, REPORT_STORIES_FILE)
rasa.shared.utils.io.dump_obj_as_json_to_file(report_filename, report)
logger.info(f"Stories report saved to {report_filename}.")
else:
report, precision, f1, accuracy = get_evaluation_metrics(
report, precision, f1, action_accuracy = get_evaluation_metrics(
targets, predictions, output_dict=True
)

@@ -812,12 +816,10 @@
_log_evaluation_table(
evaluation_store.action_targets,
"ACTION",
report,
precision,
f1,
accuracy,
story_evaluation.in_training_data_fraction,
include_report=False,
action_accuracy,
precision=precision,
f1=f1,
in_training_data_fraction=story_evaluation.in_training_data_fraction,
)

if not disable_plotting and out_directory:
@@ -842,7 +844,7 @@
"report": report,
"precision": precision,
"f1": f1,
"accuracy": accuracy,
"accuracy": action_accuracy,
"actions": story_evaluation.action_list,
"in_training_data_fraction": story_evaluation.in_training_data_fraction,
"is_end_to_end_evaluation": e2e,
@@ -852,22 +854,25 @@
def _log_evaluation_table(
golds: List[Any],
name: Text,
report: Dict[Text, Any],
precision: float,
f1: float,
accuracy: float,
in_training_data_fraction: float,
report: Optional[Dict[Text, Any]] = None,
precision: Optional[float] = None,
f1: Optional[float] = None,
in_training_data_fraction: Optional[float] = None,
include_report: bool = True,
) -> None: # pragma: no cover
"""Log the sklearn evaluation metrics."""
logger.info(f"Evaluation Results on {name} level:")
logger.info(f"\tCorrect: {int(len(golds) * accuracy)} / {len(golds)}")
logger.info(f"\tF1-Score: {f1:.3f}")
logger.info(f"\tPrecision: {precision:.3f}")
if f1 is not None:
logger.info(f"\tF1-Score: {f1:.3f}")
if precision is not None:
logger.info(f"\tPrecision: {precision:.3f}")
logger.info(f"\tAccuracy: {accuracy:.3f}")
logger.info(f"\tIn-data fraction: {in_training_data_fraction:.3g}")
if in_training_data_fraction is not None:
logger.info(f"\tIn-data fraction: {in_training_data_fraction:.3g}")

if include_report:
if include_report and report is not None:
logger.info(f"\tClassification report: \n{report}")


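For reference, a hedged usage sketch of the refactored helper (values are illustrative, not from the PR): with only the accuracy argument supplied, the F1, precision, in-data-fraction, and report lines are skipped because their keyword arguments stay `None`, mirroring the unit test added below.

import logging

from rasa.core.test import _log_evaluation_table

logging.basicConfig(level=logging.INFO)  # make logger.info output visible

_log_evaluation_table(
    [1, 1, 1],      # golds: one label per evaluated conversation
    "CONVERSATION",
    2.0 / 3.0,      # accuracy
)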
169 changes: 169 additions & 0 deletions tests/core/test_evaluation.py
@@ -1,5 +1,7 @@
import os
from pathlib import Path
import json
import logging
from typing import Any, Text, Dict

import pytest
@@ -299,3 +301,170 @@ async def test_retrieval_intent_wrong_prediction(

# check if the predicted entry contains full retrieval intent
assert "# predicted: chitchat/ask_name" in failed_stories


@pytest.mark.trains_model
@pytest.mark.parametrize(
"stories_yaml,expected_results",
[
[
"""
stories:
- story: story1
steps:
- intent: greet
- action: utter_greet
- story: story2
steps:
- intent: goodbye
- action: utter_goodbye
- story: story3
steps:
- intent: greet
- action: utter_greet
- intent: goodbye
- action: utter_default
""",
{
"utter_goodbye": {
"precision": 1.0,
"recall": 1.0,
"f1-score": 1.0,
"support": 1,
},
"action_listen": {
"precision": 1.0,
"recall": 0.75,
"f1-score": 0.8571428571428571,
"support": 4,
},
"utter_greet": {
"precision": 1.0,
"recall": 1.0,
"f1-score": 1.0,
"support": 2,
},
"utter_default": {
"precision": 0.0,
"recall": 0.0,
"f1-score": 0.0,
"support": 1,
},
"micro avg": {
"precision": 1.0,
"recall": 0.75,
"f1-score": 0.8571428571428571,
"support": 8,
},
"macro avg": {
"precision": 0.75,
"recall": 0.6875,
"f1-score": 0.7142857142857143,
"support": 8,
},
"weighted avg": {
"precision": 0.875,
"recall": 0.75,
"f1-score": 0.8035714285714286,
"support": 8,
},
"conversation_accuracy": {
"accuracy": 2.0 / 3.0,
"total": 3,
"correct": 2,
},
},
],
],
)
async def test_story_report(
tmpdir: Path,
core_agent: Agent,
stories_yaml: Text,
expected_results: Dict[Text, Dict[Text, Any]],
) -> None:
"""Check story_report.json file contains correct result keys/values."""

stories_path = tmpdir / "stories.yml"
stories_path.write_text(stories_yaml, "utf8")
out_directory = tmpdir / "results"
out_directory.mkdir()

await evaluate_stories(stories_path, core_agent, out_directory=out_directory)
story_report_path = out_directory / "story_report.json"
assert story_report_path.exists()

actual_results = json.loads(story_report_path.read_text("utf8"))
assert actual_results == expected_results
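A note on the expected numbers above: the classification report shows `utter_default` is never predicted correctly (precision/recall 0.0), so presumably only story1 and story2 pass, which is what the expected `conversation_accuracy` entry encodes (a hypothetical recomputation, not from the PR):

num_correct, num_failed = 2, 1   # story3 assumed to be the only failing conversation
assert {
    "accuracy": num_correct / (num_correct + num_failed),
    "correct": num_correct,
    "total": num_correct + num_failed,
} == {"accuracy": 2.0 / 3.0, "correct": 2, "total": 3}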


@pytest.mark.trains_model
async def test_story_report_with_empty_stories(
tmpdir: Path, core_agent: Agent,
) -> None:
"""Check that story_report.json contains empty dictionary when stories.yml is empty."""

stories_path = tmpdir / "stories.yml"
stories_path.write_text("", "utf8")
out_directory = tmpdir / "results"
out_directory.mkdir()

await evaluate_stories(stories_path, core_agent, out_directory=out_directory)
story_report_path = out_directory / "story_report.json"
assert story_report_path.exists()

actual_results = json.loads(story_report_path.read_text("utf8"))
assert actual_results == {}


@pytest.mark.parametrize(
"skip_field,skip_value",
[
[None, None,],
["precision", None,],
["f1", None,],
["in_training_data_fraction", None,],
["report", None,],
["include_report", False,],
],
)
def test_log_evaluation_table(caplog, skip_field, skip_value):
"""Check that _log_evaluation_table correctly omits/includes optional args."""
arr = [1, 1, 1, 0]
acc = 0.75
kwargs = {
"precision": 0.5,
"f1": 0.6,
"in_training_data_fraction": 0.1,
"report": {"macro f1": 0.7},
}
if skip_field:
kwargs[skip_field] = skip_value
caplog.set_level(logging.INFO)
rasa.core.test._log_evaluation_table(arr, "CONVERSATION", acc, **kwargs)

assert f"Correct: {int(len(arr) * acc)} / {len(arr)}" in caplog.text
assert f"Accuracy: {acc:.3f}" in caplog.text

if skip_field != "f1":
assert f"F1-Score: {kwargs['f1']:5.3f}" in caplog.text
else:
assert "F1-Score:" not in caplog.text

if skip_field != "precision":
assert f"Precision: {kwargs['precision']:5.3f}" in caplog.text
else:
assert "Precision:" not in caplog.text

if skip_field != "in_training_data_fraction":
assert (
f"In-data fraction: {kwargs['in_training_data_fraction']:.3g}"
in caplog.text
)
else:
assert "In-data fraction:" not in caplog.text

if skip_field != "report" and skip_field != "include_report":
assert f"Classification report: \n{kwargs['report']}" in caplog.text
else:
assert "Classification report:" not in caplog.text