diff --git a/changelog/8000.improvement.md b/changelog/8000.improvement.md
new file mode 100644
index 000000000000..beedf1fd1f41
--- /dev/null
+++ b/changelog/8000.improvement.md
@@ -0,0 +1,3 @@
+Remove console logging of conversation level F1-score and precision since these calculations were not meaningful.
+
+Add conversation level accuracy to core policy results logged to file in `story_report.json` after running `rasa test core` or `rasa test`.
diff --git a/rasa/core/test.py b/rasa/core/test.py
index 503cb211f4c7..2541f15cf599 100644
--- a/rasa/core/test.py
+++ b/rasa/core/test.py
@@ -667,7 +667,7 @@ async def _collect_story_predictions(
     use_e2e: bool = False,
 ) -> Tuple[StoryEvaluation, int]:
     """Test the stories from a file, running them through the stored model."""
-    from rasa.test import get_evaluation_metrics
+    from sklearn.metrics import accuracy_score
     from tqdm import tqdm
 
     story_eval_store = EvaluationStore()
@@ -702,25 +702,18 @@ async def _collect_story_predictions(
         success.append(predicted_tracker)
 
     logger.info("Finished collecting predictions.")
-    with warnings.catch_warnings():
-        from sklearn.exceptions import UndefinedMetricWarning
-
-        warnings.simplefilter("ignore", UndefinedMetricWarning)
-        report, precision, f1, accuracy = get_evaluation_metrics(
-            [1] * len(completed_trackers), correct_dialogues
-        )
 
     in_training_data_fraction = _in_training_data_fraction(action_list)
 
+    if len(correct_dialogues):
+        accuracy = accuracy_score([1] * len(correct_dialogues), correct_dialogues)
+    else:
+        accuracy = 0
+
     _log_evaluation_table(
         [1] * len(completed_trackers),
         "END-TO-END" if use_e2e else "CONVERSATION",
-        report,
-        precision,
-        f1,
         accuracy,
-        in_training_data_fraction,
-        include_report=False,
     )
 
     return (
@@ -795,15 +788,26 @@ async def test(
     targets, predictions = evaluation_store.serialise()
 
     if out_directory:
-        report, precision, f1, accuracy = get_evaluation_metrics(
+        report, precision, f1, action_accuracy = get_evaluation_metrics(
             targets, predictions, output_dict=True
         )
 
+        # Add conversation level accuracy to story report.
+        num_failed = len(story_evaluation.failed_stories)
+        num_correct = len(story_evaluation.successful_stories)
+        num_convs = num_failed + num_correct
+        if num_convs:
+            conv_accuracy = num_correct / num_convs
+            report["conversation_accuracy"] = {
+                "accuracy": conv_accuracy,
+                "correct": num_correct,
+                "total": num_convs,
+            }
         report_filename = os.path.join(out_directory, REPORT_STORIES_FILE)
         rasa.shared.utils.io.dump_obj_as_json_to_file(report_filename, report)
         logger.info(f"Stories report saved to {report_filename}.")
     else:
-        report, precision, f1, accuracy = get_evaluation_metrics(
+        report, precision, f1, action_accuracy = get_evaluation_metrics(
             targets, predictions, output_dict=True
         )
 
@@ -812,12 +816,10 @@ async def test(
     _log_evaluation_table(
         evaluation_store.action_targets,
         "ACTION",
-        report,
-        precision,
-        f1,
-        accuracy,
-        story_evaluation.in_training_data_fraction,
-        include_report=False,
+        action_accuracy,
+        precision=precision,
+        f1=f1,
+        in_training_data_fraction=story_evaluation.in_training_data_fraction,
     )
 
     if not disable_plotting and out_directory:
@@ -842,7 +844,7 @@ async def test(
         "report": report,
         "precision": precision,
         "f1": f1,
-        "accuracy": accuracy,
+        "accuracy": action_accuracy,
         "actions": story_evaluation.action_list,
         "in_training_data_fraction": story_evaluation.in_training_data_fraction,
         "is_end_to_end_evaluation": e2e,
@@ -852,22 +854,25 @@ async def test(
 def _log_evaluation_table(
     golds: List[Any],
     name: Text,
-    report: Dict[Text, Any],
-    precision: float,
-    f1: float,
     accuracy: float,
-    in_training_data_fraction: float,
+    report: Optional[Dict[Text, Any]] = None,
+    precision: Optional[float] = None,
+    f1: Optional[float] = None,
+    in_training_data_fraction: Optional[float] = None,
     include_report: bool = True,
 ) -> None:  # pragma: no cover
     """Log the sklearn evaluation metrics."""
     logger.info(f"Evaluation Results on {name} level:")
     logger.info(f"\tCorrect: {int(len(golds) * accuracy)} / {len(golds)}")
-    logger.info(f"\tF1-Score: {f1:.3f}")
-    logger.info(f"\tPrecision: {precision:.3f}")
+    if f1 is not None:
+        logger.info(f"\tF1-Score: {f1:.3f}")
+    if precision is not None:
+        logger.info(f"\tPrecision: {precision:.3f}")
     logger.info(f"\tAccuracy: {accuracy:.3f}")
-    logger.info(f"\tIn-data fraction: {in_training_data_fraction:.3g}")
+    if in_training_data_fraction is not None:
+        logger.info(f"\tIn-data fraction: {in_training_data_fraction:.3g}")
 
-    if include_report:
+    if include_report and report is not None:
         logger.info(f"\tClassification report: \n{report}")
 
 
diff --git a/tests/core/test_evaluation.py b/tests/core/test_evaluation.py
index 967afbe46c8a..6ebeb4573c99 100644
--- a/tests/core/test_evaluation.py
+++ b/tests/core/test_evaluation.py
@@ -1,5 +1,7 @@
 import os
 from pathlib import Path
+import json
+import logging
 from typing import Any, Text, Dict
 
 import pytest
@@ -299,3 +301,170 @@ async def test_retrieval_intent_wrong_prediction(
 
     # check if the predicted entry contains full retrieval intent
     assert "# predicted: chitchat/ask_name" in failed_stories
+
+
+@pytest.mark.trains_model
+@pytest.mark.parametrize(
+    "stories_yaml,expected_results",
+    [
+        [
+            """
+stories:
+  - story: story1
+    steps:
+      - intent: greet
+      - action: utter_greet
+  - story: story2
+    steps:
+      - intent: goodbye
+      - action: utter_goodbye
+  - story: story3
+    steps:
+      - intent: greet
+      - action: utter_greet
+      - intent: goodbye
+      - action: utter_default
+            """,
+            {
+                "utter_goodbye": {
+                    "precision": 1.0,
+                    "recall": 1.0,
+                    "f1-score": 1.0,
+                    "support": 1,
+                },
+                "action_listen": {
"precision": 1.0, + "recall": 0.75, + "f1-score": 0.8571428571428571, + "support": 4, + }, + "utter_greet": { + "precision": 1.0, + "recall": 1.0, + "f1-score": 1.0, + "support": 2, + }, + "utter_default": { + "precision": 0.0, + "recall": 0.0, + "f1-score": 0.0, + "support": 1, + }, + "micro avg": { + "precision": 1.0, + "recall": 0.75, + "f1-score": 0.8571428571428571, + "support": 8, + }, + "macro avg": { + "precision": 0.75, + "recall": 0.6875, + "f1-score": 0.7142857142857143, + "support": 8, + }, + "weighted avg": { + "precision": 0.875, + "recall": 0.75, + "f1-score": 0.8035714285714286, + "support": 8, + }, + "conversation_accuracy": { + "accuracy": 2.0 / 3.0, + "total": 3, + "correct": 2, + }, + }, + ], + ], +) +async def test_story_report( + tmpdir: Path, + core_agent: Agent, + stories_yaml: Text, + expected_results: Dict[Text, Dict[Text, Any]], +) -> None: + """Check story_report.json file contains correct result keys/values.""" + + stories_path = tmpdir / "stories.yml" + stories_path.write_text(stories_yaml, "utf8") + out_directory = tmpdir / "results" + out_directory.mkdir() + + await evaluate_stories(stories_path, core_agent, out_directory=out_directory) + story_report_path = out_directory / "story_report.json" + assert story_report_path.exists() + + actual_results = json.loads(story_report_path.read_text("utf8")) + assert actual_results == expected_results + + +@pytest.mark.trains_model +async def test_story_report_with_empty_stories( + tmpdir: Path, core_agent: Agent, +) -> None: + """Check that story_report.json contains empty dictionary when stories.yml is empty.""" + + stories_path = tmpdir / "stories.yml" + stories_path.write_text("", "utf8") + out_directory = tmpdir / "results" + out_directory.mkdir() + + await evaluate_stories(stories_path, core_agent, out_directory=out_directory) + story_report_path = out_directory / "story_report.json" + assert story_report_path.exists() + + actual_results = json.loads(story_report_path.read_text("utf8")) + assert actual_results == {} + + +@pytest.mark.parametrize( + "skip_field,skip_value", + [ + [None, None,], + ["precision", None,], + ["f1", None,], + ["in_training_data_fraction", None,], + ["report", None,], + ["include_report", False,], + ], +) +def test_log_evaluation_table(caplog, skip_field, skip_value): + """Check that _log_evaluation_table correctly omits/includes optional args.""" + arr = [1, 1, 1, 0] + acc = 0.75 + kwargs = { + "precision": 0.5, + "f1": 0.6, + "in_training_data_fraction": 0.1, + "report": {"macro f1": 0.7}, + } + if skip_field: + kwargs[skip_field] = skip_value + caplog.set_level(logging.INFO) + rasa.core.test._log_evaluation_table(arr, "CONVERSATION", acc, **kwargs) + + assert f"Correct: {int(len(arr) * acc)} / {len(arr)}" in caplog.text + assert f"Accuracy: {acc:.3f}" in caplog.text + + if skip_field != "f1": + assert f"F1-Score: {kwargs['f1']:5.3f}" in caplog.text + else: + assert "F1-Score:" not in caplog.text + + if skip_field != "precision": + assert f"Precision: {kwargs['precision']:5.3f}" in caplog.text + else: + assert "Precision:" not in caplog.text + + if skip_field != "in_training_data_fraction": + assert ( + f"In-data fraction: {kwargs['in_training_data_fraction']:.3g}" + in caplog.text + ) + else: + assert "In-data fraction:" not in caplog.text + + if skip_field != "report" and skip_field != "include_report": + assert f"Classification report: \n{kwargs['report']}" in caplog.text + else: + assert "Classification report:" not in caplog.text