From 38593940450a93aaac7a880782bf6bb3f5c89cdb Mon Sep 17 00:00:00 2001
From: kedz
Date: Tue, 23 Feb 2021 15:48:43 -0500
Subject: [PATCH 01/13] Fix logging of conversation level core metrics to only show conversation level accuracy. Add conversation level accuracy to story_report.json

---
 rasa/core/test.py | 34 ++++++++++++++++++----------------
 1 file changed, 18 insertions(+), 16 deletions(-)

diff --git a/rasa/core/test.py b/rasa/core/test.py
index 503cb211f4c7..71605667fd43 100644
--- a/rasa/core/test.py
+++ b/rasa/core/test.py
@@ -702,26 +702,18 @@ async def _collect_story_predictions(
             success.append(predicted_tracker)
 
     logger.info("Finished collecting predictions.")
-    with warnings.catch_warnings():
-        from sklearn.exceptions import UndefinedMetricWarning
-
-        warnings.simplefilter("ignore", UndefinedMetricWarning)
-        report, precision, f1, accuracy = get_evaluation_metrics(
-            [1] * len(completed_trackers), correct_dialogues
-        )
 
     in_training_data_fraction = _in_training_data_fraction(action_list)
 
-    _log_evaluation_table(
-        [1] * len(completed_trackers),
-        "END-TO-END" if use_e2e else "CONVERSATION",
-        report,
-        precision,
-        f1,
-        accuracy,
-        in_training_data_fraction,
-        include_report=False,
+    num_convs = len(correct_dialogues)
+    num_correct = sum(correct_dialogues)
+    accuracy = num_correct / num_convs if num_convs else 0.0
+
+    logger.info(
+        f"Evaluation Results on {'END-TO-END' if use_e2e else 'CONVERSATION'} level:"
     )
+    logger.info(f"\tCorrect: {num_correct} / {num_convs}")
+    logger.info(f"\tAccuracy: {accuracy:.3f}")
 
     return (
         StoryEvaluation(
@@ -799,6 +791,16 @@ async def test(
             targets, predictions, output_dict=True
         )
 
+        # Add conversation level accuracy to story report.
+        num_failed = len(story_evaluation.failed_stories)
+        num_correct = len(story_evaluation.successful_stories)
+        num_convs = num_failed + num_correct
+        conv_acc = num_correct / num_correct if num_correct else 0.0
+        report["conversation_accuracy"] = {
+            "accuracy": conv_acc,
+            "correct": num_correct,
+            "total": num_convs,
+        }
         report_filename = os.path.join(out_directory, REPORT_STORIES_FILE)
         rasa.shared.utils.io.dump_obj_as_json_to_file(report_filename, report)
         logger.info(f"Stories report saved to {report_filename}.")

From 5526cfc0c30dd45f4fedf80f4c9738ee1760deae Mon Sep 17 00:00:00 2001
From: kedz
Date: Wed, 24 Feb 2021 12:45:35 -0500
Subject: [PATCH 02/13] Fix typo in accuracy calculation. Modify _log_evaluation_table to take optional arguments and reuse that function. Use sklearn.metrics.accuracy_score instead of computing manually. Remove unnecessary get_evaluation_metrics import.
--- rasa/core/test.py | 65 +++++++++++++++++++++++++---------------------- 1 file changed, 34 insertions(+), 31 deletions(-) diff --git a/rasa/core/test.py b/rasa/core/test.py index 71605667fd43..3be5049a2482 100644 --- a/rasa/core/test.py +++ b/rasa/core/test.py @@ -667,7 +667,7 @@ async def _collect_story_predictions( use_e2e: bool = False, ) -> Tuple[StoryEvaluation, int]: """Test the stories from a file, running them through the stored model.""" - from rasa.test import get_evaluation_metrics + from sklearn.metrics import accuracy_score from tqdm import tqdm story_eval_store = EvaluationStore() @@ -705,15 +705,16 @@ async def _collect_story_predictions( in_training_data_fraction = _in_training_data_fraction(action_list) - num_convs = len(correct_dialogues) - num_correct = sum(correct_dialogues) - accuracy = num_correct / num_convs if num_convs else 0.0 + if len(correct_dialogues): + accuracy = accuracy_score([1] * len(correct_dialogues), correct_dialogues) + else: + accuracy = 0 - logger.info( - f"Evaluation Results on {'END-TO-END' if use_e2e else 'CONVERSATION'} level:" + _log_evaluation_table( + [1] * len(correct_dialogues), + "END-TO-END" if use_e2e else "CONVERSATION", + accuracy, ) - logger.info(f"\tCorrect: {num_correct} / {num_convs}") - logger.info(f"\tAccuracy: {accuracy:.3f}") return ( StoryEvaluation( @@ -787,7 +788,7 @@ async def test( targets, predictions = evaluation_store.serialise() if out_directory: - report, precision, f1, accuracy = get_evaluation_metrics( + report, precision, f1, action_accuracy = get_evaluation_metrics( targets, predictions, output_dict=True ) @@ -795,17 +796,18 @@ async def test( num_failed = len(story_evaluation.failed_stories) num_correct = len(story_evaluation.successful_stories) num_convs = num_failed + num_correct - conv_acc = num_correct / num_correct if num_correct else 0.0 - report["conversation_accuracy"] = { - "accuracy": conv_acc, - "correct": num_correct, - "total": num_convs, - } + if num_convs: + conv_accuracy = num_correct / num_convs + report["conversation_accuracy"] = { + "accuracy": conv_accuracy, + "correct": num_correct, + "total": num_convs, + } report_filename = os.path.join(out_directory, REPORT_STORIES_FILE) rasa.shared.utils.io.dump_obj_as_json_to_file(report_filename, report) logger.info(f"Stories report saved to {report_filename}.") else: - report, precision, f1, accuracy = get_evaluation_metrics( + report, precision, f1, action_accuracy = get_evaluation_metrics( targets, predictions, output_dict=True ) @@ -814,12 +816,10 @@ async def test( _log_evaluation_table( evaluation_store.action_targets, "ACTION", - report, - precision, - f1, - accuracy, - story_evaluation.in_training_data_fraction, - include_report=False, + action_accuracy, + precision=precision, + f1=f1, + in_training_data_fraction=story_evaluation.in_training_data_fraction, ) if not disable_plotting and out_directory: @@ -844,7 +844,7 @@ async def test( "report": report, "precision": precision, "f1": f1, - "accuracy": accuracy, + "accuracy": action_accuracy, "actions": story_evaluation.action_list, "in_training_data_fraction": story_evaluation.in_training_data_fraction, "is_end_to_end_evaluation": e2e, @@ -854,22 +854,25 @@ async def test( def _log_evaluation_table( golds: List[Any], name: Text, - report: Dict[Text, Any], - precision: float, - f1: float, accuracy: float, - in_training_data_fraction: float, + report: Optional[Dict[Text, Any]] = None, + precision: Optional[float] = None, + f1: Optional[float] = None, + in_training_data_fraction: Optional[float] 
= None, include_report: bool = True, ) -> None: # pragma: no cover """Log the sklearn evaluation metrics.""" logger.info(f"Evaluation Results on {name} level:") logger.info(f"\tCorrect: {int(len(golds) * accuracy)} / {len(golds)}") - logger.info(f"\tF1-Score: {f1:.3f}") - logger.info(f"\tPrecision: {precision:.3f}") + if f1 is not None: + logger.info(f"\tF1-Score: {f1:.3f}") + if precision is not None: + logger.info(f"\tPrecision: {precision:.3f}") logger.info(f"\tAccuracy: {accuracy:.3f}") - logger.info(f"\tIn-data fraction: {in_training_data_fraction:.3g}") + if in_training_data_fraction is not None: + logger.info(f"\tIn-data fraction: {in_training_data_fraction:.3g}") - if include_report: + if include_report and report is not None: logger.info(f"\tClassification report: \n{report}") From 666f54acb669f65e6ef7a6cc6a6a4c6a6e3979a4 Mon Sep 17 00:00:00 2001 From: kedz Date: Thu, 25 Feb 2021 14:15:11 -0500 Subject: [PATCH 03/13] Revert to original argument ([1] * len(completed_trackers)) in call to _log_evaluation_table to preserve continuity with previous code version. --- rasa/core/test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rasa/core/test.py b/rasa/core/test.py index 3be5049a2482..2541f15cf599 100644 --- a/rasa/core/test.py +++ b/rasa/core/test.py @@ -711,7 +711,7 @@ async def _collect_story_predictions( accuracy = 0 _log_evaluation_table( - [1] * len(correct_dialogues), + [1] * len(completed_trackers), "END-TO-END" if use_e2e else "CONVERSATION", accuracy, ) From b8e9b15ef076e33ee710e1697a8ba5bcd18fcedd Mon Sep 17 00:00:00 2001 From: kedz Date: Mon, 1 Mar 2021 15:43:20 -0500 Subject: [PATCH 04/13] Added a test to check rasa.core.test.test writes policy results to file. --- tests/core/test_test.py | 156 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 156 insertions(+) create mode 100644 tests/core/test_test.py diff --git a/tests/core/test_test.py b/tests/core/test_test.py new file mode 100644 index 000000000000..fc2f06e1007b --- /dev/null +++ b/tests/core/test_test.py @@ -0,0 +1,156 @@ +import asyncio +import json +import pathlib +import pytest +from typing import Text, Any, Dict + +from rasa.core.agent import Agent +from rasa.core.policies.ensemble import SimplePolicyEnsemble +from rasa.core.policies.rule_policy import RulePolicy +import rasa.core.test +from rasa.shared.core.constants import ACTION_LISTEN_NAME +from rasa.shared.core.domain import Domain +from rasa.shared.core.events import ActionExecuted, UserUttered +from rasa.shared.core.generator import TrackerWithCachedStates +from rasa.shared.nlu.interpreter import RegexInterpreter + + +@pytest.fixture(scope="session") +def out_directory(tmpdir_factory): + """Output directory for logging info.""" + fn = tmpdir_factory.mktemp("results") + return fn + + +@pytest.mark.parametrize( + "stories_yaml,expected_results", + [ + [ + """ +stories: + - story: story1 + steps: + - intent: intentA + - action: actionA + - story: story2 + steps: + - intent: intentB + - action: actionB + - story: story3 + steps: + - intent: intentA + - action: actionA + - intent: intentB + - action: actionC + """, + { + "actionB": { + "precision": 1.0, + "recall": 1.0, + "f1-score": 1.0, + "support": 1, + }, + "action_listen": { + "precision": 1.0, + "recall": 0.75, + "f1-score": 0.8571428571428571, + "support": 4, + }, + "actionA": { + "precision": 1.0, + "recall": 1.0, + "f1-score": 1.0, + "support": 2, + }, + "actionC": { + "precision": 0.0, + "recall": 0.0, + "f1-score": 0.0, + "support": 1, + }, + "micro avg": { + 
"precision": 1.0, + "recall": 0.75, + "f1-score": 0.8571428571428571, + "support": 8, + }, + "macro avg": { + "precision": 0.75, + "recall": 0.6875, + "f1-score": 0.7142857142857143, + "support": 8, + }, + "weighted avg": { + "precision": 0.875, + "recall": 0.75, + "f1-score": 0.8035714285714286, + "support": 8, + }, + "conversation_accuracy": { + "accuracy": 2.0 / 3.0, + "total": 3, + "correct": 2, + }, + }, + ], + ["", {}], + ], +) +async def test_test( + tmpdir_factory: pathlib.Path, + out_directory: pathlib.Path, + stories_yaml: Text, + expected_results: Dict[Text, Dict[Text, Any]], +) -> None: + + stories_path = tmpdir_factory.mktemp("test_rasa_core_test").join("eval_stories.yml") + stories_path.write_text(stories_yaml, "utf8") + + domain = Domain.from_yaml( + """ +intents: +- intentA +- intentB +actions: +- actionA +- actionB +- actionC +""" + ) + + policy = RulePolicy() + rt1 = TrackerWithCachedStates.from_events( + "ruleAtoA", + domain=domain, + slots=domain.slots, + evts=[ + ActionExecuted(ACTION_LISTEN_NAME), + UserUttered(intent={"name": "intentA"}), + ActionExecuted("actionA"), + ActionExecuted(ACTION_LISTEN_NAME), + ], + is_rule_tracker=True, + ) + rt2 = TrackerWithCachedStates.from_events( + "ruleBtoB", + domain=domain, + slots=domain.slots, + evts=[ + ActionExecuted(ACTION_LISTEN_NAME), + UserUttered(intent={"name": "intentB"}), + ActionExecuted("actionB"), + ActionExecuted(ACTION_LISTEN_NAME), + ], + is_rule_tracker=True, + ) + + policy.train([rt1, rt2], domain, RegexInterpreter()) + + agent = Agent(domain=domain, policies=SimplePolicyEnsemble([policy]),) + + await rasa.core.test.test(stories_path, agent, out_directory=out_directory) + story_report_path = out_directory / "story_report.json" + assert story_report_path.exists() + + actual_results = json.loads(story_report_path.read_text("utf8")) + assert actual_results == expected_results From 20e21a8420415200bbcf3fdaec009de3ed586451 Mon Sep 17 00:00:00 2001 From: kedz Date: Mon, 1 Mar 2021 16:08:09 -0500 Subject: [PATCH 05/13] Add changelog entry. --- changelog/8000.improvment.md | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 changelog/8000.improvment.md diff --git a/changelog/8000.improvment.md b/changelog/8000.improvment.md new file mode 100644 index 000000000000..db02c931ab68 --- /dev/null +++ b/changelog/8000.improvment.md @@ -0,0 +1,5 @@ +Remove console logging of conversation level F1-score and precision since these calculations were not meaningful. + +Add conversation level accuracy to core policy results logged to file in `story_report.json` after running `rasa test core` or `rasa test`. + +Add test to check that `story_report.json` contents are correct. From 45bc56aa813b0fa3e4a21d7aca080d9eafac41d8 Mon Sep 17 00:00:00 2001 From: kedz Date: Mon, 1 Mar 2021 16:47:07 -0500 Subject: [PATCH 06/13] Added test for _log_evaluation_table method. 
---
 tests/core/test_test.py | 53 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)

diff --git a/tests/core/test_test.py b/tests/core/test_test.py
index fc2f06e1007b..09982f4a57e3 100644
--- a/tests/core/test_test.py
+++ b/tests/core/test_test.py
@@ -2,6 +2,7 @@
 import json
 import pathlib
 import pytest
+import logging
 from typing import Text, Any, Dict
 
 from rasa.core.agent import Agent
@@ -154,3 +155,55 @@ async def test_test(
 
     actual_results = json.loads(story_report_path.read_text("utf8"))
     assert actual_results == expected_results
+
+
+@pytest.mark.parametrize(
+    "skip_field,skip_value",
+    [
+        [None, None,],
+        ["precision", None,],
+        ["f1", None,],
+        ["in_training_data_fraction", None,],
+        ["report", None,],
+        ["include_report", False,],
+    ],
+)
+def test_log_evaluation_table(caplog, skip_field, skip_value):
+    arr = [1, 1, 1, 0]
+    acc = 0.75
+    kwargs = {
+        "precision": 0.5,
+        "f1": 0.6,
+        "in_training_data_fraction": 0.1,
+        "report": {"macro f1": 0.7},
+    }
+    if skip_field:
+        kwargs[skip_field] = skip_value
+    caplog.set_level(logging.INFO)
+    rasa.core.test._log_evaluation_table(arr, "CONVERSATION", acc, **kwargs)
+
+    assert f"Correct: {int(len(arr) * acc)} / {len(arr)}" in caplog.text
+    assert f"Accuracy: {acc:.3f}" in caplog.text
+
+    if skip_field != "f1":
+        assert f"F1-Score: {kwargs['f1']:5.3f}" in caplog.text
+    else:
+        assert f"F1-Score:" not in caplog.text
+
+    if skip_field != "precision":
+        assert f"Precision: {kwargs['precision']:5.3f}" in caplog.text
+    else:
+        assert f"Precision:" not in caplog.text
+
+    if skip_field != "in_training_data_fraction":
+        assert (
+            f"In-data fraction: {kwargs['in_training_data_fraction']:.3g}"
+            in caplog.text
+        )
+    else:
+        assert f"In-data fraction:" not in caplog.text
+
+    if skip_field != "report" and skip_field != "include_report":
+        assert f"Classification report: \n{kwargs['report']}" in caplog.text
+    else:
+        assert f"Classification report:" not in caplog.text

From 003bf56129190415a7e7d0a9b7f856886ad1c7cf Mon Sep 17 00:00:00 2001
From: kedz
Date: Mon, 1 Mar 2021 16:53:26 -0500
Subject: [PATCH 07/13] Removed f prefix from strings without formatting.

---
 tests/core/test_test.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/core/test_test.py b/tests/core/test_test.py
index 09982f4a57e3..363828e83c35 100644
--- a/tests/core/test_test.py
+++ b/tests/core/test_test.py
@@ -188,12 +188,12 @@ def test_log_evaluation_table(caplog, skip_field, skip_value):
     if skip_field != "f1":
         assert f"F1-Score: {kwargs['f1']:5.3f}" in caplog.text
     else:
-        assert f"F1-Score:" not in caplog.text
+        assert "F1-Score:" not in caplog.text
 
     if skip_field != "precision":
         assert f"Precision: {kwargs['precision']:5.3f}" in caplog.text
     else:
-        assert f"Precision:" not in caplog.text
+        assert "Precision:" not in caplog.text
 
     if skip_field != "in_training_data_fraction":
         assert (
             f"In-data fraction: {kwargs['in_training_data_fraction']:.3g}"
             in caplog.text
         )
     else:
-        assert f"In-data fraction:" not in caplog.text
+        assert "In-data fraction:" not in caplog.text
 
     if skip_field != "report" and skip_field != "include_report":
         assert f"Classification report: \n{kwargs['report']}" in caplog.text
     else:
-        assert f"Classification report:" not in caplog.text
+        assert "Classification report:" not in caplog.text

From 553821db20b266367154026f88274addfb61d88d Mon Sep 17 00:00:00 2001
From: kedz
Date: Tue, 2 Mar 2021 13:00:34 -0500
Subject: [PATCH 08/13] Fix typo in changelog filename. Make changelog text shorter.
--- changelog/{8000.improvment.md => 8000.improvement.md} | 2 -- 1 file changed, 2 deletions(-) rename changelog/{8000.improvment.md => 8000.improvement.md} (79%) diff --git a/changelog/8000.improvment.md b/changelog/8000.improvement.md similarity index 79% rename from changelog/8000.improvment.md rename to changelog/8000.improvement.md index db02c931ab68..beedf1fd1f41 100644 --- a/changelog/8000.improvment.md +++ b/changelog/8000.improvement.md @@ -1,5 +1,3 @@ Remove console logging of conversation level F1-score and precision since these calculations were not meaningful. Add conversation level accuracy to core policy results logged to file in `story_report.json` after running `rasa test core` or `rasa test`. - -Add test to check that `story_report.json` contents are correct. From a484bfb1405269484f0c78658362323905e0edd9 Mon Sep 17 00:00:00 2001 From: kedz Date: Tue, 2 Mar 2021 13:51:17 -0500 Subject: [PATCH 09/13] Cleaned up temp path fixtures. --- tests/core/test_test.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/tests/core/test_test.py b/tests/core/test_test.py index 363828e83c35..c0ec195a2555 100644 --- a/tests/core/test_test.py +++ b/tests/core/test_test.py @@ -16,11 +16,16 @@ from rasa.shared.nlu.interpreter import RegexInterpreter -@pytest.fixture(scope="session") -def out_directory(tmpdir_factory): - """Output directory for logging info.""" - fn = tmpdir_factory.mktemp("results") - return fn +@pytest.fixture(scope="function") +def out_directory(tmpdir: pathlib.Path): + """Output directory for logging policy results.""" + return tmpdir + + +@pytest.fixture(scope="function") +def stories_path(tmpdir: pathlib.Path): + """Path for writing test stories.""" + return tmpdir / "stories.yml" @pytest.mark.parametrize( @@ -98,13 +103,12 @@ def out_directory(tmpdir_factory): ], ) async def test_test( - tmpdir_factory: pathlib.Path, + stories_path: pathlib.Path, out_directory: pathlib.Path, stories_yaml: Text, expected_results: Dict[Text, Dict[Text, Any]], ) -> None: - stories_path = tmpdir_factory.mktemp("test_rasa_core_test").join("eval_stories.yml") stories_path.write_text(stories_yaml, "utf8") domain = Domain.from_yaml( From ff9cf070cd0c59d8a78711e669887436142b3e51 Mon Sep 17 00:00:00 2001 From: kedz Date: Thu, 4 Mar 2021 14:14:59 -0500 Subject: [PATCH 10/13] Move tests to tests/core/test_evaluation. Use agent test fixture. 
--- tests/core/test_evaluation.py | 167 ++++++++++++++++++++++++++ tests/core/test_test.py | 213 ---------------------------------- 2 files changed, 167 insertions(+), 213 deletions(-) delete mode 100644 tests/core/test_test.py diff --git a/tests/core/test_evaluation.py b/tests/core/test_evaluation.py index 967afbe46c8a..57d16eff9e6b 100644 --- a/tests/core/test_evaluation.py +++ b/tests/core/test_evaluation.py @@ -1,5 +1,7 @@ import os from pathlib import Path +import json +import logging from typing import Any, Text, Dict import pytest @@ -299,3 +301,168 @@ async def test_retrieval_intent_wrong_prediction( # check if the predicted entry contains full retrieval intent assert "# predicted: chitchat/ask_name" in failed_stories + + +@pytest.mark.parametrize( + "stories_yaml,expected_results", + [ + [ + """ +stories: + - story: story1 + steps: + - intent: greet + - action: utter_greet + - story: story2 + steps: + - intent: goodbye + - action: utter_goodbye + - story: story3 + steps: + - intent: greet + - action: utter_greet + - intent: goodbye + - action: utter_default + """, + { + "utter_goodbye": { + "precision": 1.0, + "recall": 1.0, + "f1-score": 1.0, + "support": 1, + }, + "action_listen": { + "precision": 1.0, + "recall": 0.75, + "f1-score": 0.8571428571428571, + "support": 4, + }, + "utter_greet": { + "precision": 1.0, + "recall": 1.0, + "f1-score": 1.0, + "support": 2, + }, + "utter_default": { + "precision": 0.0, + "recall": 0.0, + "f1-score": 0.0, + "support": 1, + }, + "micro avg": { + "precision": 1.0, + "recall": 0.75, + "f1-score": 0.8571428571428571, + "support": 8, + }, + "macro avg": { + "precision": 0.75, + "recall": 0.6875, + "f1-score": 0.7142857142857143, + "support": 8, + }, + "weighted avg": { + "precision": 0.875, + "recall": 0.75, + "f1-score": 0.8035714285714286, + "support": 8, + }, + "conversation_accuracy": { + "accuracy": 2.0 / 3.0, + "total": 3, + "correct": 2, + }, + }, + ], + ], +) +async def test_story_results( + tmpdir: Path, + core_agent: Agent, + stories_yaml: Text, + expected_results: Dict[Text, Dict[Text, Any]], +) -> None: + """Check story_results.json file contains correct result keys/values.""" + + stories_path = tmpdir / "stories.yml" + stories_path.write_text(stories_yaml, "utf8") + out_directory = tmpdir / "results" + out_directory.mkdir() + + await evaluate_stories(stories_path, core_agent, out_directory=out_directory) + story_report_path = out_directory / "story_report.json" + assert story_report_path.exists() + + actual_results = json.loads(story_report_path.read_text("utf8")) + assert actual_results == expected_results + + +async def test_story_results_with_empty_stories( + tmpdir: Path, core_agent: Agent, +) -> None: + """Check that story_results.json contains empty dictionary when stories.yml is empty.""" + + stories_path = tmpdir / "stories.yml" + stories_path.write_text("", "utf8") + out_directory = tmpdir / "results" + out_directory.mkdir() + + await evaluate_stories(stories_path, core_agent, out_directory=out_directory) + story_report_path = out_directory / "story_report.json" + assert story_report_path.exists() + + actual_results = json.loads(story_report_path.read_text("utf8")) + assert actual_results == {} + + +@pytest.mark.parametrize( + "skip_field,skip_value", + [ + [None, None,], + ["precision", None,], + ["f1", None,], + ["in_training_data_fraction", None,], + ["report", None,], + ["include_report", False,], + ], +) +def test_log_evaluation_table(caplog, skip_field, skip_value): + """Check that _log_evaluation_table correctly 
omits/includes optional args.""" + arr = [1, 1, 1, 0] + acc = 0.75 + kwargs = { + "precision": 0.5, + "f1": 0.6, + "in_training_data_fraction": 0.1, + "report": {"macro f1": 0.7}, + } + if skip_field: + kwargs[skip_field] = skip_value + caplog.set_level(logging.INFO) + rasa.core.test._log_evaluation_table(arr, "CONVERSATION", acc, **kwargs) + + assert f"Correct: {int(len(arr) * acc)} / {len(arr)}" in caplog.text + assert f"Accuracy: {acc:.3f}" in caplog.text + + if skip_field != "f1": + assert f"F1-Score: {kwargs['f1']:5.3f}" in caplog.text + else: + assert "F1-Score:" not in caplog.text + + if skip_field != "precision": + assert f"Precision: {kwargs['precision']:5.3f}" in caplog.text + else: + assert "Precision:" not in caplog.text + + if skip_field != "in_training_data_fraction": + assert ( + f"In-data fraction: {kwargs['in_training_data_fraction']:.3g}" + in caplog.text + ) + else: + assert "In-data fraction:" not in caplog.text + + if skip_field != "report" and skip_field != "include_report": + assert f"Classification report: \n{kwargs['report']}" in caplog.text + else: + assert "Classification report:" not in caplog.text diff --git a/tests/core/test_test.py b/tests/core/test_test.py deleted file mode 100644 index c0ec195a2555..000000000000 --- a/tests/core/test_test.py +++ /dev/null @@ -1,213 +0,0 @@ -import asyncio -import json -import pathlib -import pytest -import logging -from typing import Text, Any, Dict - -from rasa.core.agent import Agent -from rasa.core.policies.ensemble import SimplePolicyEnsemble -from rasa.core.policies.rule_policy import RulePolicy -import rasa.core.test -from rasa.shared.core.constants import ACTION_LISTEN_NAME -from rasa.shared.core.domain import Domain -from rasa.shared.core.events import ActionExecuted, UserUttered -from rasa.shared.core.generator import TrackerWithCachedStates -from rasa.shared.nlu.interpreter import RegexInterpreter - - -@pytest.fixture(scope="function") -def out_directory(tmpdir: pathlib.Path): - """Output directory for logging policy results.""" - return tmpdir - - -@pytest.fixture(scope="function") -def stories_path(tmpdir: pathlib.Path): - """Path for writing test stories.""" - return tmpdir / "stories.yml" - - -@pytest.mark.parametrize( - "stories_yaml,expected_results", - [ - [ - """ -stories: - - story: story1 - steps: - - intent: intentA - - action: actionA - - story: story2 - steps: - - intent: intentB - - action: actionB - - story: story3 - steps: - - intent: intentA - - action: actionA - - intent: intentB - - action: actionC - """, - { - "actionB": { - "precision": 1.0, - "recall": 1.0, - "f1-score": 1.0, - "support": 1, - }, - "action_listen": { - "precision": 1.0, - "recall": 0.75, - "f1-score": 0.8571428571428571, - "support": 4, - }, - "actionA": { - "precision": 1.0, - "recall": 1.0, - "f1-score": 1.0, - "support": 2, - }, - "actionC": { - "precision": 0.0, - "recall": 0.0, - "f1-score": 0.0, - "support": 1, - }, - "micro avg": { - "precision": 1.0, - "recall": 0.75, - "f1-score": 0.8571428571428571, - "support": 8, - }, - "macro avg": { - "precision": 0.75, - "recall": 0.6875, - "f1-score": 0.7142857142857143, - "support": 8, - }, - "weighted avg": { - "precision": 0.875, - "recall": 0.75, - "f1-score": 0.8035714285714286, - "support": 8, - }, - "conversation_accuracy": { - "accuracy": 2.0 / 3.0, - "total": 3, - "correct": 2, - }, - }, - ], - ["", {}], - ], -) -async def test_test( - stories_path: pathlib.Path, - out_directory: pathlib.Path, - stories_yaml: Text, - expected_results: Dict[Text, Dict[Text, Any]], -) 
-> None: - - stories_path.write_text(stories_yaml, "utf8") - - domain = Domain.from_yaml( - """ -intents: -- intentA -- intentB -actions: -- actionA -- actionB -- actionC -""" - ) - - policy = RulePolicy() - rt1 = TrackerWithCachedStates.from_events( - "ruleAtoA", - domain=domain, - slots=domain.slots, - evts=[ - ActionExecuted(ACTION_LISTEN_NAME), - UserUttered(intent={"name": "intentA"}), - ActionExecuted("actionA"), - ActionExecuted(ACTION_LISTEN_NAME), - ], - is_rule_tracker=True, - ) - rt2 = TrackerWithCachedStates.from_events( - "ruleBtoB", - domain=domain, - slots=domain.slots, - evts=[ - ActionExecuted(ACTION_LISTEN_NAME), - UserUttered(intent={"name": "intentB"}), - ActionExecuted("actionB"), - ActionExecuted(ACTION_LISTEN_NAME), - ], - is_rule_tracker=True, - ) - - policy.train([rt1, rt2], domain, RegexInterpreter()) - - agent = Agent(domain=domain, policies=SimplePolicyEnsemble([policy]),) - - await rasa.core.test.test(stories_path, agent, out_directory=out_directory) - story_report_path = out_directory / "story_report.json" - assert story_report_path.exists() - - actual_results = json.loads(story_report_path.read_text("utf8")) - assert actual_results == expected_results - - -@pytest.mark.parametrize( - "skip_field,skip_value", - [ - [None, None,], - ["precision", None,], - ["f1", None,], - ["in_training_data_fraction", None,], - ["report", None,], - ["include_report", False,], - ], -) -def test_log_evaluation_table(caplog, skip_field, skip_value): - arr = [1, 1, 1, 0] - acc = 0.75 - kwargs = { - "precision": 0.5, - "f1": 0.6, - "in_training_data_fraction": 0.1, - "report": {"macro f1": 0.7}, - } - if skip_field: - kwargs[skip_field] = skip_value - caplog.set_level(logging.INFO) - rasa.core.test._log_evaluation_table(arr, "CONVERSATION", acc, **kwargs) - - assert f"Correct: {int(len(arr) * acc)} / {len(arr)}" in caplog.text - assert f"Accuracy: {acc:.3f}" in caplog.text - - if skip_field != "f1": - assert f"F1-Score: {kwargs['f1']:5.3f}" in caplog.text - else: - assert "F1-Score:" not in caplog.text - - if skip_field != "precision": - assert f"Precision: {kwargs['precision']:5.3f}" in caplog.text - else: - assert "Precision:" not in caplog.text - - if skip_field != "in_training_data_fraction": - assert ( - f"In-data fraction: {kwargs['in_training_data_fraction']:.3g}" - in caplog.text - ) - else: - assert "In-data fraction:" not in caplog.text - - if skip_field != "report" and skip_field != "include_report": - assert f"Classification report: \n{kwargs['report']}" in caplog.text - else: - assert "Classification report:" not in caplog.text From 7afc7f7b74dc005e3cf1624f052491c80f17a09d Mon Sep 17 00:00:00 2001 From: kedz Date: Thu, 4 Mar 2021 14:25:32 -0500 Subject: [PATCH 11/13] Rename tests. 
--- tests/core/test_evaluation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/core/test_evaluation.py b/tests/core/test_evaluation.py index 57d16eff9e6b..73fa85c36d66 100644 --- a/tests/core/test_evaluation.py +++ b/tests/core/test_evaluation.py @@ -376,7 +376,7 @@ async def test_retrieval_intent_wrong_prediction( ], ], ) -async def test_story_results( +async def test_story_report( tmpdir: Path, core_agent: Agent, stories_yaml: Text, @@ -397,7 +397,7 @@ async def test_story_results( assert actual_results == expected_results -async def test_story_results_with_empty_stories( +async def test_story_report_with_empty_stories( tmpdir: Path, core_agent: Agent, ) -> None: """Check that story_results.json contains empty dictionary when stories.yml is empty.""" From 140ad14611e6637e4b4ec4bec247cba8076e9610 Mon Sep 17 00:00:00 2001 From: kedz Date: Fri, 5 Mar 2021 09:01:15 -0500 Subject: [PATCH 12/13] Fixed wording in doc string. --- tests/core/test_evaluation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/core/test_evaluation.py b/tests/core/test_evaluation.py index 73fa85c36d66..ce86be545277 100644 --- a/tests/core/test_evaluation.py +++ b/tests/core/test_evaluation.py @@ -382,7 +382,7 @@ async def test_story_report( stories_yaml: Text, expected_results: Dict[Text, Dict[Text, Any]], ) -> None: - """Check story_results.json file contains correct result keys/values.""" + """Check story_report.json file contains correct result keys/values.""" stories_path = tmpdir / "stories.yml" stories_path.write_text(stories_yaml, "utf8") @@ -400,7 +400,7 @@ async def test_story_report( async def test_story_report_with_empty_stories( tmpdir: Path, core_agent: Agent, ) -> None: - """Check that story_results.json contains empty dictionary when stories.yml is empty.""" + """Check that story_report.json contains empty dictionary when stories.yml is empty.""" stories_path = tmpdir / "stories.yml" stories_path.write_text("", "utf8") From 0377d0f5b588a537e2230aaa12b919dfe2b9f7e6 Mon Sep 17 00:00:00 2001 From: kedz Date: Fri, 5 Mar 2021 09:04:13 -0500 Subject: [PATCH 13/13] Added @pytest.mark.trains_model to core policy evaluation tests. --- tests/core/test_evaluation.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/core/test_evaluation.py b/tests/core/test_evaluation.py index ce86be545277..6ebeb4573c99 100644 --- a/tests/core/test_evaluation.py +++ b/tests/core/test_evaluation.py @@ -303,6 +303,7 @@ async def test_retrieval_intent_wrong_prediction( assert "# predicted: chitchat/ask_name" in failed_stories +@pytest.mark.trains_model @pytest.mark.parametrize( "stories_yaml,expected_results", [ @@ -397,6 +398,7 @@ async def test_story_report( assert actual_results == expected_results +@pytest.mark.trains_model async def test_story_report_with_empty_stories( tmpdir: Path, core_agent: Agent, ) -> None: