From 38593940450a93aaac7a880782bf6bb3f5c89cdb Mon Sep 17 00:00:00 2001
From: kedz
Date: Tue, 23 Feb 2021 15:48:43 -0500
Subject: [PATCH 01/13] Fix logging of conversation level core metrics to only show conversation level accuracy. Add conversation level accuracy to story_report.json

---
 rasa/core/test.py | 34 ++++++++++++++++++----------------
 1 file changed, 18 insertions(+), 16 deletions(-)

diff --git a/rasa/core/test.py b/rasa/core/test.py
index 503cb211f4c7..71605667fd43 100644
--- a/rasa/core/test.py
+++ b/rasa/core/test.py
@@ -702,26 +702,18 @@ async def _collect_story_predictions(
             success.append(predicted_tracker)
 
     logger.info("Finished collecting predictions.")
-    with warnings.catch_warnings():
-        from sklearn.exceptions import UndefinedMetricWarning
-
-        warnings.simplefilter("ignore", UndefinedMetricWarning)
-        report, precision, f1, accuracy = get_evaluation_metrics(
-            [1] * len(completed_trackers), correct_dialogues
-        )
 
     in_training_data_fraction = _in_training_data_fraction(action_list)
 
-    _log_evaluation_table(
-        [1] * len(completed_trackers),
-        "END-TO-END" if use_e2e else "CONVERSATION",
-        report,
-        precision,
-        f1,
-        accuracy,
-        in_training_data_fraction,
-        include_report=False,
+    num_convs = len(correct_dialogues)
+    num_correct = sum(correct_dialogues)
+    accuracy = num_correct / num_convs if num_convs else 0.0
+
+    logger.info(
+        f"Evaluation Results on {'END-TO-END' if use_e2e else 'CONVERSATION'} level:"
     )
+    logger.info(f"\tCorrect: {num_correct} / {num_convs}")
+    logger.info(f"\tAccuracy: {accuracy:.3f}")
 
     return (
         StoryEvaluation(
@@ -799,6 +791,16 @@ async def test(
             targets, predictions, output_dict=True
         )
 
+        # Add conversation level accuracy to story report.
+        num_failed = len(story_evaluation.failed_stories)
+        num_correct = len(story_evaluation.successful_stories)
+        num_convs = num_failed + num_correct
+        conv_acc = num_correct / num_correct if num_correct else 0.0
+        report["conversation_accuracy"] = {
+            "accuracy": conv_acc,
+            "correct": num_correct,
+            "total": num_convs,
+        }
         report_filename = os.path.join(out_directory, REPORT_STORIES_FILE)
         rasa.shared.utils.io.dump_obj_as_json_to_file(report_filename, report)
         logger.info(f"Stories report saved to {report_filename}.")

From 5526cfc0c30dd45f4fedf80f4c9738ee1760deae Mon Sep 17 00:00:00 2001
From: kedz
Date: Wed, 24 Feb 2021 12:45:35 -0500
Subject: [PATCH 02/13] Fix typo in accuracy calculation. Modify _log_evaluation_table to take optional arguments and reuse that function. Use sklearn.metrics.accuracy_score instead of computing manually. Remove unnecessary get_evaluation_metrics import.
--- rasa/core/test.py | 65 +++++++++++++++++++++++++---------------------- 1 file changed, 34 insertions(+), 31 deletions(-) diff --git a/rasa/core/test.py b/rasa/core/test.py index 71605667fd43..3be5049a2482 100644 --- a/rasa/core/test.py +++ b/rasa/core/test.py @@ -667,7 +667,7 @@ async def _collect_story_predictions( use_e2e: bool = False, ) -> Tuple[StoryEvaluation, int]: """Test the stories from a file, running them through the stored model.""" - from rasa.test import get_evaluation_metrics + from sklearn.metrics import accuracy_score from tqdm import tqdm story_eval_store = EvaluationStore() @@ -705,15 +705,16 @@ async def _collect_story_predictions( in_training_data_fraction = _in_training_data_fraction(action_list) - num_convs = len(correct_dialogues) - num_correct = sum(correct_dialogues) - accuracy = num_correct / num_convs if num_convs else 0.0 + if len(correct_dialogues): + accuracy = accuracy_score([1] * len(correct_dialogues), correct_dialogues) + else: + accuracy = 0 - logger.info( - f"Evaluation Results on {'END-TO-END' if use_e2e else 'CONVERSATION'} level:" + _log_evaluation_table( + [1] * len(correct_dialogues), + "END-TO-END" if use_e2e else "CONVERSATION", + accuracy, ) - logger.info(f"\tCorrect: {num_correct} / {num_convs}") - logger.info(f"\tAccuracy: {accuracy:.3f}") return ( StoryEvaluation( @@ -787,7 +788,7 @@ async def test( targets, predictions = evaluation_store.serialise() if out_directory: - report, precision, f1, accuracy = get_evaluation_metrics( + report, precision, f1, action_accuracy = get_evaluation_metrics( targets, predictions, output_dict=True ) @@ -795,17 +796,18 @@ async def test( num_failed = len(story_evaluation.failed_stories) num_correct = len(story_evaluation.successful_stories) num_convs = num_failed + num_correct - conv_acc = num_correct / num_correct if num_correct else 0.0 - report["conversation_accuracy"] = { - "accuracy": conv_acc, - "correct": num_correct, - "total": num_convs, - } + if num_convs: + conv_accuracy = num_correct / num_convs + report["conversation_accuracy"] = { + "accuracy": conv_accuracy, + "correct": num_correct, + "total": num_convs, + } report_filename = os.path.join(out_directory, REPORT_STORIES_FILE) rasa.shared.utils.io.dump_obj_as_json_to_file(report_filename, report) logger.info(f"Stories report saved to {report_filename}.") else: - report, precision, f1, accuracy = get_evaluation_metrics( + report, precision, f1, action_accuracy = get_evaluation_metrics( targets, predictions, output_dict=True ) @@ -814,12 +816,10 @@ async def test( _log_evaluation_table( evaluation_store.action_targets, "ACTION", - report, - precision, - f1, - accuracy, - story_evaluation.in_training_data_fraction, - include_report=False, + action_accuracy, + precision=precision, + f1=f1, + in_training_data_fraction=story_evaluation.in_training_data_fraction, ) if not disable_plotting and out_directory: @@ -844,7 +844,7 @@ async def test( "report": report, "precision": precision, "f1": f1, - "accuracy": accuracy, + "accuracy": action_accuracy, "actions": story_evaluation.action_list, "in_training_data_fraction": story_evaluation.in_training_data_fraction, "is_end_to_end_evaluation": e2e, @@ -854,22 +854,25 @@ async def test( def _log_evaluation_table( golds: List[Any], name: Text, - report: Dict[Text, Any], - precision: float, - f1: float, accuracy: float, - in_training_data_fraction: float, + report: Optional[Dict[Text, Any]] = None, + precision: Optional[float] = None, + f1: Optional[float] = None, + in_training_data_fraction: Optional[float] 
= None, include_report: bool = True, ) -> None: # pragma: no cover """Log the sklearn evaluation metrics.""" logger.info(f"Evaluation Results on {name} level:") logger.info(f"\tCorrect: {int(len(golds) * accuracy)} / {len(golds)}") - logger.info(f"\tF1-Score: {f1:.3f}") - logger.info(f"\tPrecision: {precision:.3f}") + if f1 is not None: + logger.info(f"\tF1-Score: {f1:.3f}") + if precision is not None: + logger.info(f"\tPrecision: {precision:.3f}") logger.info(f"\tAccuracy: {accuracy:.3f}") - logger.info(f"\tIn-data fraction: {in_training_data_fraction:.3g}") + if in_training_data_fraction is not None: + logger.info(f"\tIn-data fraction: {in_training_data_fraction:.3g}") - if include_report: + if include_report and report is not None: logger.info(f"\tClassification report: \n{report}") From 666f54acb669f65e6ef7a6cc6a6a4c6a6e3979a4 Mon Sep 17 00:00:00 2001 From: kedz Date: Thu, 25 Feb 2021 14:15:11 -0500 Subject: [PATCH 03/13] Revert to original argument ([1] * len(completed_trackers)) in call to _log_evaluation_table to preserve continuity with previous code version. --- rasa/core/test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rasa/core/test.py b/rasa/core/test.py index 3be5049a2482..2541f15cf599 100644 --- a/rasa/core/test.py +++ b/rasa/core/test.py @@ -711,7 +711,7 @@ async def _collect_story_predictions( accuracy = 0 _log_evaluation_table( - [1] * len(correct_dialogues), + [1] * len(completed_trackers), "END-TO-END" if use_e2e else "CONVERSATION", accuracy, ) From b8e9b15ef076e33ee710e1697a8ba5bcd18fcedd Mon Sep 17 00:00:00 2001 From: kedz Date: Mon, 1 Mar 2021 15:43:20 -0500 Subject: [PATCH 04/13] Added a test to check rasa.core.test.test writes policy results to file. --- tests/core/test_test.py | 156 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 156 insertions(+) create mode 100644 tests/core/test_test.py diff --git a/tests/core/test_test.py b/tests/core/test_test.py new file mode 100644 index 000000000000..fc2f06e1007b --- /dev/null +++ b/tests/core/test_test.py @@ -0,0 +1,156 @@ +import asyncio +import json +import pathlib +import pytest +from typing import Text, Any, Dict + +from rasa.core.agent import Agent +from rasa.core.policies.ensemble import SimplePolicyEnsemble +from rasa.core.policies.rule_policy import RulePolicy +import rasa.core.test +from rasa.shared.core.constants import ACTION_LISTEN_NAME +from rasa.shared.core.domain import Domain +from rasa.shared.core.events import ActionExecuted, UserUttered +from rasa.shared.core.generator import TrackerWithCachedStates +from rasa.shared.nlu.interpreter import RegexInterpreter + + +@pytest.fixture(scope="session") +def out_directory(tmpdir_factory): + """Output directory for logging info.""" + fn = tmpdir_factory.mktemp("results") + return fn + + +@pytest.mark.parametrize( + "stories_yaml,expected_results", + [ + [ + """ +stories: + - story: story1 + steps: + - intent: intentA + - action: actionA + - story: story2 + steps: + - intent: intentB + - action: actionB + - story: story3 + steps: + - intent: intentA + - action: actionA + - intent: intentB + - action: actionC + """, + { + "actionB": { + "precision": 1.0, + "recall": 1.0, + "f1-score": 1.0, + "support": 1, + }, + "action_listen": { + "precision": 1.0, + "recall": 0.75, + "f1-score": 0.8571428571428571, + "support": 4, + }, + "actionA": { + "precision": 1.0, + "recall": 1.0, + "f1-score": 1.0, + "support": 2, + }, + "actionC": { + "precision": 0.0, + "recall": 0.0, + "f1-score": 0.0, + "support": 1, + }, + "micro avg": { + 
"precision": 1.0, + "recall": 0.75, + "f1-score": 0.8571428571428571, + "support": 8, + }, + "macro avg": { + "precision": 0.75, + "recall": 0.6875, + "f1-score": 0.7142857142857143, + "support": 8, + }, + "weighted avg": { + "precision": 0.875, + "recall": 0.75, + "f1-score": 0.8035714285714286, + "support": 8, + }, + "conversation_accuracy": { + "accuracy": 2.0 / 3.0, + "total": 3, + "correct": 2, + }, + }, + ], + ["", {}], + ], +) +async def test_test( + tmpdir_factory: pathlib.Path, + out_directory: pathlib.Path, + stories_yaml: Text, + expected_results: Dict[Text, Dict[Text, Any]], +) -> None: + + stories_path = tmpdir_factory.mktemp("test_rasa_core_test").join("eval_stories.yml") + stories_path.write_text(stories_yaml, "utf8") + + domain = Domain.from_yaml( + """ +intents: +- intentA +- intentB +actions: +- actionA +- actionB +- actionC +""" + ) + + policy = RulePolicy() + rt1 = TrackerWithCachedStates.from_events( + "ruleAtoA", + domain=domain, + slots=domain.slots, + evts=[ + ActionExecuted(ACTION_LISTEN_NAME), + UserUttered(intent={"name": "intentA"}), + ActionExecuted("actionA"), + ActionExecuted(ACTION_LISTEN_NAME), + ], + is_rule_tracker=True, + ) + rt2 = TrackerWithCachedStates.from_events( + "ruleBtoB", + domain=domain, + slots=domain.slots, + evts=[ + ActionExecuted(ACTION_LISTEN_NAME), + UserUttered(intent={"name": "intentB"}), + ActionExecuted("actionB"), + ActionExecuted(ACTION_LISTEN_NAME), + ], + is_rule_tracker=True, + ) + + policy.train([rt1, rt2], domain, RegexInterpreter()) + + agent = Agent(domain=domain, policies=SimplePolicyEnsemble([policy]),) + + await rasa.core.test.test(stories_path, agent, out_directory=out_directory) + story_report_path = out_directory / "story_report.json" + assert story_report_path.exists() + + actual_results = json.loads(story_report_path.read_text("utf8")) + assert actual_results == expected_results From 20e21a8420415200bbcf3fdaec009de3ed586451 Mon Sep 17 00:00:00 2001 From: kedz Date: Mon, 1 Mar 2021 16:08:09 -0500 Subject: [PATCH 05/13] Add changelog entry. --- changelog/8000.improvment.md | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 changelog/8000.improvment.md diff --git a/changelog/8000.improvment.md b/changelog/8000.improvment.md new file mode 100644 index 000000000000..db02c931ab68 --- /dev/null +++ b/changelog/8000.improvment.md @@ -0,0 +1,5 @@ +Remove console logging of conversation level F1-score and precision since these calculations were not meaningful. + +Add conversation level accuracy to core policy results logged to file in `story_report.json` after running `rasa test core` or `rasa test`. + +Add test to check that `story_report.json` contents are correct. From 45bc56aa813b0fa3e4a21d7aca080d9eafac41d8 Mon Sep 17 00:00:00 2001 From: kedz Date: Mon, 1 Mar 2021 16:47:07 -0500 Subject: [PATCH 06/13] Added test for _log_evaluation_table method. 
---
 tests/core/test_test.py | 53 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)

diff --git a/tests/core/test_test.py b/tests/core/test_test.py
index fc2f06e1007b..09982f4a57e3 100644
--- a/tests/core/test_test.py
+++ b/tests/core/test_test.py
@@ -2,6 +2,7 @@
 import json
 import pathlib
 import pytest
+import logging
 from typing import Text, Any, Dict
 
 from rasa.core.agent import Agent
@@ -154,3 +155,55 @@ async def test_test(
 
     actual_results = json.loads(story_report_path.read_text("utf8"))
     assert actual_results == expected_results
+
+
+@pytest.mark.parametrize(
+    "skip_field,skip_value",
+    [
+        [None, None,],
+        ["precision", None,],
+        ["f1", None,],
+        ["in_training_data_fraction", None,],
+        ["report", None,],
+        ["include_report", False,],
+    ],
+)
+def test_log_evaluation_table(caplog, skip_field, skip_value):
+    arr = [1, 1, 1, 0]
+    acc = 0.75
+    kwargs = {
+        "precision": 0.5,
+        "f1": 0.6,
+        "in_training_data_fraction": 0.1,
+        "report": {"macro f1": 0.7},
+    }
+    if skip_field:
+        kwargs[skip_field] = skip_value
+    caplog.set_level(logging.INFO)
+    rasa.core.test._log_evaluation_table(arr, "CONVERSATION", acc, **kwargs)
+
+    assert f"Correct: {int(len(arr) * acc)} / {len(arr)}" in caplog.text
+    assert f"Accuracy: {acc:.3f}" in caplog.text
+
+    if skip_field != "f1":
+        assert f"F1-Score: {kwargs['f1']:5.3f}" in caplog.text
+    else:
+        assert f"F1-Score:" not in caplog.text
+
+    if skip_field != "precision":
+        assert f"Precision: {kwargs['precision']:5.3f}" in caplog.text
+    else:
+        assert f"Precision:" not in caplog.text
+
+    if skip_field != "in_training_data_fraction":
+        assert (
+            f"In-data fraction: {kwargs['in_training_data_fraction']:.3g}"
+            in caplog.text
+        )
+    else:
+        assert f"In-data fraction:" not in caplog.text
+
+    if skip_field != "report" and skip_field != "include_report":
+        assert f"Classification report: \n{kwargs['report']}" in caplog.text
+    else:
+        assert f"Classification report:" not in caplog.text

From 003bf56129190415a7e7d0a9b7f856886ad1c7cf Mon Sep 17 00:00:00 2001
From: kedz
Date: Mon, 1 Mar 2021 16:53:26 -0500
Subject: [PATCH 07/13] Removed f prefix from strings without formatting.

---
 tests/core/test_test.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/core/test_test.py b/tests/core/test_test.py
index 09982f4a57e3..363828e83c35 100644
--- a/tests/core/test_test.py
+++ b/tests/core/test_test.py
@@ -188,12 +188,12 @@ def test_log_evaluation_table(caplog, skip_field, skip_value):
     if skip_field != "f1":
         assert f"F1-Score: {kwargs['f1']:5.3f}" in caplog.text
     else:
-        assert f"F1-Score:" not in caplog.text
+        assert "F1-Score:" not in caplog.text
 
     if skip_field != "precision":
         assert f"Precision: {kwargs['precision']:5.3f}" in caplog.text
     else:
-        assert f"Precision:" not in caplog.text
+        assert "Precision:" not in caplog.text
 
     if skip_field != "in_training_data_fraction":
         assert (
             f"In-data fraction: {kwargs['in_training_data_fraction']:.3g}"
             in caplog.text
         )
     else:
-        assert f"In-data fraction:" not in caplog.text
+        assert "In-data fraction:" not in caplog.text
 
     if skip_field != "report" and skip_field != "include_report":
         assert f"Classification report: \n{kwargs['report']}" in caplog.text
     else:
-        assert f"Classification report:" not in caplog.text
+        assert "Classification report:" not in caplog.text

From 553821db20b266367154026f88274addfb61d88d Mon Sep 17 00:00:00 2001
From: kedz
Date: Tue, 2 Mar 2021 13:00:34 -0500
Subject: [PATCH 08/13] Fix typo in changelog filename. Make changelog text shorter.
--- changelog/{8000.improvment.md => 8000.improvement.md} | 2 -- 1 file changed, 2 deletions(-) rename changelog/{8000.improvment.md => 8000.improvement.md} (79%) diff --git a/changelog/8000.improvment.md b/changelog/8000.improvement.md similarity index 79% rename from changelog/8000.improvment.md rename to changelog/8000.improvement.md index db02c931ab68..beedf1fd1f41 100644 --- a/changelog/8000.improvment.md +++ b/changelog/8000.improvement.md @@ -1,5 +1,3 @@ Remove console logging of conversation level F1-score and precision since these calculations were not meaningful. Add conversation level accuracy to core policy results logged to file in `story_report.json` after running `rasa test core` or `rasa test`. - -Add test to check that `story_report.json` contents are correct. From a484bfb1405269484f0c78658362323905e0edd9 Mon Sep 17 00:00:00 2001 From: kedz Date: Tue, 2 Mar 2021 13:51:17 -0500 Subject: [PATCH 09/13] Cleaned up temp path fixtures. --- tests/core/test_test.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/tests/core/test_test.py b/tests/core/test_test.py index 363828e83c35..c0ec195a2555 100644 --- a/tests/core/test_test.py +++ b/tests/core/test_test.py @@ -16,11 +16,16 @@ from rasa.shared.nlu.interpreter import RegexInterpreter -@pytest.fixture(scope="session") -def out_directory(tmpdir_factory): - """Output directory for logging info.""" - fn = tmpdir_factory.mktemp("results") - return fn +@pytest.fixture(scope="function") +def out_directory(tmpdir: pathlib.Path): + """Output directory for logging policy results.""" + return tmpdir + + +@pytest.fixture(scope="function") +def stories_path(tmpdir: pathlib.Path): + """Path for writing test stories.""" + return tmpdir / "stories.yml" @pytest.mark.parametrize( @@ -98,13 +103,12 @@ def out_directory(tmpdir_factory): ], ) async def test_test( - tmpdir_factory: pathlib.Path, + stories_path: pathlib.Path, out_directory: pathlib.Path, stories_yaml: Text, expected_results: Dict[Text, Dict[Text, Any]], ) -> None: - stories_path = tmpdir_factory.mktemp("test_rasa_core_test").join("eval_stories.yml") stories_path.write_text(stories_yaml, "utf8") domain = Domain.from_yaml( From ff9cf070cd0c59d8a78711e669887436142b3e51 Mon Sep 17 00:00:00 2001 From: kedz Date: Thu, 4 Mar 2021 14:14:59 -0500 Subject: [PATCH 10/13] Move tests to tests/core/test_evaluation. Use agent test fixture. 
--- tests/core/test_evaluation.py | 167 ++++++++++++++++++++++++++ tests/core/test_test.py | 213 ---------------------------------- 2 files changed, 167 insertions(+), 213 deletions(-) delete mode 100644 tests/core/test_test.py diff --git a/tests/core/test_evaluation.py b/tests/core/test_evaluation.py index 967afbe46c8a..57d16eff9e6b 100644 --- a/tests/core/test_evaluation.py +++ b/tests/core/test_evaluation.py @@ -1,5 +1,7 @@ import os from pathlib import Path +import json +import logging from typing import Any, Text, Dict import pytest @@ -299,3 +301,168 @@ async def test_retrieval_intent_wrong_prediction( # check if the predicted entry contains full retrieval intent assert "# predicted: chitchat/ask_name" in failed_stories + + +@pytest.mark.parametrize( + "stories_yaml,expected_results", + [ + [ + """ +stories: + - story: story1 + steps: + - intent: greet + - action: utter_greet + - story: story2 + steps: + - intent: goodbye + - action: utter_goodbye + - story: story3 + steps: + - intent: greet + - action: utter_greet + - intent: goodbye + - action: utter_default + """, + { + "utter_goodbye": { + "precision": 1.0, + "recall": 1.0, + "f1-score": 1.0, + "support": 1, + }, + "action_listen": { + "precision": 1.0, + "recall": 0.75, + "f1-score": 0.8571428571428571, + "support": 4, + }, + "utter_greet": { + "precision": 1.0, + "recall": 1.0, + "f1-score": 1.0, + "support": 2, + }, + "utter_default": { + "precision": 0.0, + "recall": 0.0, + "f1-score": 0.0, + "support": 1, + }, + "micro avg": { + "precision": 1.0, + "recall": 0.75, + "f1-score": 0.8571428571428571, + "support": 8, + }, + "macro avg": { + "precision": 0.75, + "recall": 0.6875, + "f1-score": 0.7142857142857143, + "support": 8, + }, + "weighted avg": { + "precision": 0.875, + "recall": 0.75, + "f1-score": 0.8035714285714286, + "support": 8, + }, + "conversation_accuracy": { + "accuracy": 2.0 / 3.0, + "total": 3, + "correct": 2, + }, + }, + ], + ], +) +async def test_story_results( + tmpdir: Path, + core_agent: Agent, + stories_yaml: Text, + expected_results: Dict[Text, Dict[Text, Any]], +) -> None: + """Check story_results.json file contains correct result keys/values.""" + + stories_path = tmpdir / "stories.yml" + stories_path.write_text(stories_yaml, "utf8") + out_directory = tmpdir / "results" + out_directory.mkdir() + + await evaluate_stories(stories_path, core_agent, out_directory=out_directory) + story_report_path = out_directory / "story_report.json" + assert story_report_path.exists() + + actual_results = json.loads(story_report_path.read_text("utf8")) + assert actual_results == expected_results + + +async def test_story_results_with_empty_stories( + tmpdir: Path, core_agent: Agent, +) -> None: + """Check that story_results.json contains empty dictionary when stories.yml is empty.""" + + stories_path = tmpdir / "stories.yml" + stories_path.write_text("", "utf8") + out_directory = tmpdir / "results" + out_directory.mkdir() + + await evaluate_stories(stories_path, core_agent, out_directory=out_directory) + story_report_path = out_directory / "story_report.json" + assert story_report_path.exists() + + actual_results = json.loads(story_report_path.read_text("utf8")) + assert actual_results == {} + + +@pytest.mark.parametrize( + "skip_field,skip_value", + [ + [None, None,], + ["precision", None,], + ["f1", None,], + ["in_training_data_fraction", None,], + ["report", None,], + ["include_report", False,], + ], +) +def test_log_evaluation_table(caplog, skip_field, skip_value): + """Check that _log_evaluation_table correctly 
omits/includes optional args.""" + arr = [1, 1, 1, 0] + acc = 0.75 + kwargs = { + "precision": 0.5, + "f1": 0.6, + "in_training_data_fraction": 0.1, + "report": {"macro f1": 0.7}, + } + if skip_field: + kwargs[skip_field] = skip_value + caplog.set_level(logging.INFO) + rasa.core.test._log_evaluation_table(arr, "CONVERSATION", acc, **kwargs) + + assert f"Correct: {int(len(arr) * acc)} / {len(arr)}" in caplog.text + assert f"Accuracy: {acc:.3f}" in caplog.text + + if skip_field != "f1": + assert f"F1-Score: {kwargs['f1']:5.3f}" in caplog.text + else: + assert "F1-Score:" not in caplog.text + + if skip_field != "precision": + assert f"Precision: {kwargs['precision']:5.3f}" in caplog.text + else: + assert "Precision:" not in caplog.text + + if skip_field != "in_training_data_fraction": + assert ( + f"In-data fraction: {kwargs['in_training_data_fraction']:.3g}" + in caplog.text + ) + else: + assert "In-data fraction:" not in caplog.text + + if skip_field != "report" and skip_field != "include_report": + assert f"Classification report: \n{kwargs['report']}" in caplog.text + else: + assert "Classification report:" not in caplog.text diff --git a/tests/core/test_test.py b/tests/core/test_test.py deleted file mode 100644 index c0ec195a2555..000000000000 --- a/tests/core/test_test.py +++ /dev/null @@ -1,213 +0,0 @@ -import asyncio -import json -import pathlib -import pytest -import logging -from typing import Text, Any, Dict - -from rasa.core.agent import Agent -from rasa.core.policies.ensemble import SimplePolicyEnsemble -from rasa.core.policies.rule_policy import RulePolicy -import rasa.core.test -from rasa.shared.core.constants import ACTION_LISTEN_NAME -from rasa.shared.core.domain import Domain -from rasa.shared.core.events import ActionExecuted, UserUttered -from rasa.shared.core.generator import TrackerWithCachedStates -from rasa.shared.nlu.interpreter import RegexInterpreter - - -@pytest.fixture(scope="function") -def out_directory(tmpdir: pathlib.Path): - """Output directory for logging policy results.""" - return tmpdir - - -@pytest.fixture(scope="function") -def stories_path(tmpdir: pathlib.Path): - """Path for writing test stories.""" - return tmpdir / "stories.yml" - - -@pytest.mark.parametrize( - "stories_yaml,expected_results", - [ - [ - """ -stories: - - story: story1 - steps: - - intent: intentA - - action: actionA - - story: story2 - steps: - - intent: intentB - - action: actionB - - story: story3 - steps: - - intent: intentA - - action: actionA - - intent: intentB - - action: actionC - """, - { - "actionB": { - "precision": 1.0, - "recall": 1.0, - "f1-score": 1.0, - "support": 1, - }, - "action_listen": { - "precision": 1.0, - "recall": 0.75, - "f1-score": 0.8571428571428571, - "support": 4, - }, - "actionA": { - "precision": 1.0, - "recall": 1.0, - "f1-score": 1.0, - "support": 2, - }, - "actionC": { - "precision": 0.0, - "recall": 0.0, - "f1-score": 0.0, - "support": 1, - }, - "micro avg": { - "precision": 1.0, - "recall": 0.75, - "f1-score": 0.8571428571428571, - "support": 8, - }, - "macro avg": { - "precision": 0.75, - "recall": 0.6875, - "f1-score": 0.7142857142857143, - "support": 8, - }, - "weighted avg": { - "precision": 0.875, - "recall": 0.75, - "f1-score": 0.8035714285714286, - "support": 8, - }, - "conversation_accuracy": { - "accuracy": 2.0 / 3.0, - "total": 3, - "correct": 2, - }, - }, - ], - ["", {}], - ], -) -async def test_test( - stories_path: pathlib.Path, - out_directory: pathlib.Path, - stories_yaml: Text, - expected_results: Dict[Text, Dict[Text, Any]], -) 
-> None: - - stories_path.write_text(stories_yaml, "utf8") - - domain = Domain.from_yaml( - """ -intents: -- intentA -- intentB -actions: -- actionA -- actionB -- actionC -""" - ) - - policy = RulePolicy() - rt1 = TrackerWithCachedStates.from_events( - "ruleAtoA", - domain=domain, - slots=domain.slots, - evts=[ - ActionExecuted(ACTION_LISTEN_NAME), - UserUttered(intent={"name": "intentA"}), - ActionExecuted("actionA"), - ActionExecuted(ACTION_LISTEN_NAME), - ], - is_rule_tracker=True, - ) - rt2 = TrackerWithCachedStates.from_events( - "ruleBtoB", - domain=domain, - slots=domain.slots, - evts=[ - ActionExecuted(ACTION_LISTEN_NAME), - UserUttered(intent={"name": "intentB"}), - ActionExecuted("actionB"), - ActionExecuted(ACTION_LISTEN_NAME), - ], - is_rule_tracker=True, - ) - - policy.train([rt1, rt2], domain, RegexInterpreter()) - - agent = Agent(domain=domain, policies=SimplePolicyEnsemble([policy]),) - - await rasa.core.test.test(stories_path, agent, out_directory=out_directory) - story_report_path = out_directory / "story_report.json" - assert story_report_path.exists() - - actual_results = json.loads(story_report_path.read_text("utf8")) - assert actual_results == expected_results - - -@pytest.mark.parametrize( - "skip_field,skip_value", - [ - [None, None,], - ["precision", None,], - ["f1", None,], - ["in_training_data_fraction", None,], - ["report", None,], - ["include_report", False,], - ], -) -def test_log_evaluation_table(caplog, skip_field, skip_value): - arr = [1, 1, 1, 0] - acc = 0.75 - kwargs = { - "precision": 0.5, - "f1": 0.6, - "in_training_data_fraction": 0.1, - "report": {"macro f1": 0.7}, - } - if skip_field: - kwargs[skip_field] = skip_value - caplog.set_level(logging.INFO) - rasa.core.test._log_evaluation_table(arr, "CONVERSATION", acc, **kwargs) - - assert f"Correct: {int(len(arr) * acc)} / {len(arr)}" in caplog.text - assert f"Accuracy: {acc:.3f}" in caplog.text - - if skip_field != "f1": - assert f"F1-Score: {kwargs['f1']:5.3f}" in caplog.text - else: - assert "F1-Score:" not in caplog.text - - if skip_field != "precision": - assert f"Precision: {kwargs['precision']:5.3f}" in caplog.text - else: - assert "Precision:" not in caplog.text - - if skip_field != "in_training_data_fraction": - assert ( - f"In-data fraction: {kwargs['in_training_data_fraction']:.3g}" - in caplog.text - ) - else: - assert "In-data fraction:" not in caplog.text - - if skip_field != "report" and skip_field != "include_report": - assert f"Classification report: \n{kwargs['report']}" in caplog.text - else: - assert "Classification report:" not in caplog.text From 7afc7f7b74dc005e3cf1624f052491c80f17a09d Mon Sep 17 00:00:00 2001 From: kedz Date: Thu, 4 Mar 2021 14:25:32 -0500 Subject: [PATCH 11/13] Rename tests. 
--- tests/core/test_evaluation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/core/test_evaluation.py b/tests/core/test_evaluation.py index 57d16eff9e6b..73fa85c36d66 100644 --- a/tests/core/test_evaluation.py +++ b/tests/core/test_evaluation.py @@ -376,7 +376,7 @@ async def test_retrieval_intent_wrong_prediction( ], ], ) -async def test_story_results( +async def test_story_report( tmpdir: Path, core_agent: Agent, stories_yaml: Text, @@ -397,7 +397,7 @@ async def test_story_results( assert actual_results == expected_results -async def test_story_results_with_empty_stories( +async def test_story_report_with_empty_stories( tmpdir: Path, core_agent: Agent, ) -> None: """Check that story_results.json contains empty dictionary when stories.yml is empty.""" From 140ad14611e6637e4b4ec4bec247cba8076e9610 Mon Sep 17 00:00:00 2001 From: kedz Date: Fri, 5 Mar 2021 09:01:15 -0500 Subject: [PATCH 12/13] Fixed wording in doc string. --- tests/core/test_evaluation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/core/test_evaluation.py b/tests/core/test_evaluation.py index 73fa85c36d66..ce86be545277 100644 --- a/tests/core/test_evaluation.py +++ b/tests/core/test_evaluation.py @@ -382,7 +382,7 @@ async def test_story_report( stories_yaml: Text, expected_results: Dict[Text, Dict[Text, Any]], ) -> None: - """Check story_results.json file contains correct result keys/values.""" + """Check story_report.json file contains correct result keys/values.""" stories_path = tmpdir / "stories.yml" stories_path.write_text(stories_yaml, "utf8") @@ -400,7 +400,7 @@ async def test_story_report( async def test_story_report_with_empty_stories( tmpdir: Path, core_agent: Agent, ) -> None: - """Check that story_results.json contains empty dictionary when stories.yml is empty.""" + """Check that story_report.json contains empty dictionary when stories.yml is empty.""" stories_path = tmpdir / "stories.yml" stories_path.write_text("", "utf8") From 0377d0f5b588a537e2230aaa12b919dfe2b9f7e6 Mon Sep 17 00:00:00 2001 From: kedz Date: Fri, 5 Mar 2021 09:04:13 -0500 Subject: [PATCH 13/13] Added @pytest.mark.trains_model to core policy evaluation tests. --- tests/core/test_evaluation.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/core/test_evaluation.py b/tests/core/test_evaluation.py index ce86be545277..6ebeb4573c99 100644 --- a/tests/core/test_evaluation.py +++ b/tests/core/test_evaluation.py @@ -303,6 +303,7 @@ async def test_retrieval_intent_wrong_prediction( assert "# predicted: chitchat/ask_name" in failed_stories +@pytest.mark.trains_model @pytest.mark.parametrize( "stories_yaml,expected_results", [ @@ -397,6 +398,7 @@ async def test_story_report( assert actual_results == expected_results +@pytest.mark.trains_model async def test_story_report_with_empty_stories( tmpdir: Path, core_agent: Agent, ) -> None: