Fix logging of conversation level core metrics. #8030

Merged: 18 commits, Mar 5, 2021
3 changes: 3 additions & 0 deletions changelog/8000.improvement.md
@@ -0,0 +1,3 @@
Remove console logging of conversation-level F1-score and precision, since these calculations were not meaningful.

Add conversation-level accuracy to the core policy results logged to `story_report.json` after running `rasa test core` or `rasa test`.
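For illustration only, a minimal sketch of reading the new entry, assuming the default `results/` output directory of `rasa test` and the key names used in the test added below:

import json

# Hypothetical path; adjust to wherever `rasa test` wrote its reports.
with open("results/story_report.json", encoding="utf8") as f:
    report = json.load(f)

conv = report["conversation_accuracy"]  # entry added by this PR
print(f"{conv['correct']} / {conv['total']} conversations correct "
      f"(accuracy {conv['accuracy']:.3f})")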
65 changes: 35 additions & 30 deletions rasa/core/test.py
@@ -667,7 +667,7 @@ async def _collect_story_predictions(
use_e2e: bool = False,
) -> Tuple[StoryEvaluation, int]:
"""Test the stories from a file, running them through the stored model."""
from rasa.test import get_evaluation_metrics
from sklearn.metrics import accuracy_score
from tqdm import tqdm

story_eval_store = EvaluationStore()
@@ -702,25 +702,18 @@ async def _collect_story_predictions(
success.append(predicted_tracker)

logger.info("Finished collecting predictions.")
with warnings.catch_warnings():
from sklearn.exceptions import UndefinedMetricWarning

warnings.simplefilter("ignore", UndefinedMetricWarning)
report, precision, f1, accuracy = get_evaluation_metrics(
[1] * len(completed_trackers), correct_dialogues
)

in_training_data_fraction = _in_training_data_fraction(action_list)

if len(correct_dialogues):
accuracy = accuracy_score([1] * len(correct_dialogues), correct_dialogues)
else:
accuracy = 0

_log_evaluation_table(
[1] * len(completed_trackers),
"END-TO-END" if use_e2e else "CONVERSATION",
report,
precision,
f1,
accuracy,
in_training_data_fraction,
include_report=False,
)

return (
@@ -795,15 +788,26 @@ async def test(
targets, predictions = evaluation_store.serialise()

if out_directory:
report, precision, f1, accuracy = get_evaluation_metrics(
report, precision, f1, action_accuracy = get_evaluation_metrics(
targets, predictions, output_dict=True
)

# Add conversation level accuracy to story report.
num_failed = len(story_evaluation.failed_stories)
num_correct = len(story_evaluation.successful_stories)
num_convs = num_failed + num_correct
if num_convs:
conv_accuracy = num_correct / num_convs
report["conversation_accuracy"] = {
Contributor: We should test this new data is there and correct in a unit test.

Contributor Author: Totally! I'll create a unit test in tests/core/test_test.py
"accuracy": conv_accuracy,
"correct": num_correct,
"total": num_convs,
}
report_filename = os.path.join(out_directory, REPORT_STORIES_FILE)
rasa.shared.utils.io.dump_obj_as_json_to_file(report_filename, report)
logger.info(f"Stories report saved to {report_filename}.")
else:
report, precision, f1, accuracy = get_evaluation_metrics(
report, precision, f1, action_accuracy = get_evaluation_metrics(
targets, predictions, output_dict=True
)

@@ -812,12 +816,10 @@
_log_evaluation_table(
evaluation_store.action_targets,
"ACTION",
report,
precision,
f1,
accuracy,
story_evaluation.in_training_data_fraction,
include_report=False,
action_accuracy,
precision=precision,
f1=f1,
in_training_data_fraction=story_evaluation.in_training_data_fraction,
)

if not disable_plotting and out_directory:
@@ -842,7 +844,7 @@
"report": report,
"precision": precision,
"f1": f1,
"accuracy": accuracy,
"accuracy": action_accuracy,
"actions": story_evaluation.action_list,
"in_training_data_fraction": story_evaluation.in_training_data_fraction,
"is_end_to_end_evaluation": e2e,
@@ -852,22 +854,25 @@
def _log_evaluation_table(
golds: List[Any],
name: Text,
report: Dict[Text, Any],
precision: float,
f1: float,
accuracy: float,
in_training_data_fraction: float,
report: Optional[Dict[Text, Any]] = None,
precision: Optional[float] = None,
f1: Optional[float] = None,
in_training_data_fraction: Optional[float] = None,
include_report: bool = True,
) -> None: # pragma: no cover
"""Log the sklearn evaluation metrics."""
logger.info(f"Evaluation Results on {name} level:")
logger.info(f"\tCorrect: {int(len(golds) * accuracy)} / {len(golds)}")
logger.info(f"\tF1-Score: {f1:.3f}")
logger.info(f"\tPrecision: {precision:.3f}")
if f1 is not None:
logger.info(f"\tF1-Score: {f1:.3f}")
if precision is not None:
logger.info(f"\tPrecision: {precision:.3f}")
logger.info(f"\tAccuracy: {accuracy:.3f}")
logger.info(f"\tIn-data fraction: {in_training_data_fraction:.3g}")
if in_training_data_fraction is not None:
logger.info(f"\tIn-data fraction: {in_training_data_fraction:.3g}")

if include_report:
if include_report and report is not None:
logger.info(f"\tClassification report: \n{report}")


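For reference, a hedged usage sketch of the refactored helper (values are illustrative, not from the PR): with only the accuracy argument supplied, the F1, precision, in-data-fraction, and report lines are skipped because their keyword arguments stay `None`, mirroring the unit test added below.

import logging

from rasa.core.test import _log_evaluation_table

logging.basicConfig(level=logging.INFO)  # make logger.info output visible

_log_evaluation_table(
    [1, 1, 1],      # golds: one label per evaluated conversation
    "CONVERSATION",
    2.0 / 3.0,      # accuracy
)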
169 changes: 169 additions & 0 deletions tests/core/test_evaluation.py
@@ -1,5 +1,7 @@
import os
from pathlib import Path
import json
import logging
from typing import Any, Text, Dict

import pytest
@@ -299,3 +301,170 @@ async def test_retrieval_intent_wrong_prediction(

# check if the predicted entry contains full retrieval intent
assert "# predicted: chitchat/ask_name" in failed_stories


@pytest.mark.trains_model
@pytest.mark.parametrize(
"stories_yaml,expected_results",
[
[
"""
stories:
- story: story1
steps:
- intent: greet
- action: utter_greet
- story: story2
steps:
- intent: goodbye
- action: utter_goodbye
- story: story3
steps:
- intent: greet
- action: utter_greet
- intent: goodbye
- action: utter_default
""",
{
"utter_goodbye": {
"precision": 1.0,
"recall": 1.0,
"f1-score": 1.0,
"support": 1,
},
"action_listen": {
"precision": 1.0,
"recall": 0.75,
"f1-score": 0.8571428571428571,
"support": 4,
},
"utter_greet": {
"precision": 1.0,
"recall": 1.0,
"f1-score": 1.0,
"support": 2,
},
"utter_default": {
"precision": 0.0,
"recall": 0.0,
"f1-score": 0.0,
"support": 1,
},
"micro avg": {
"precision": 1.0,
"recall": 0.75,
"f1-score": 0.8571428571428571,
"support": 8,
},
"macro avg": {
"precision": 0.75,
"recall": 0.6875,
"f1-score": 0.7142857142857143,
"support": 8,
},
"weighted avg": {
"precision": 0.875,
"recall": 0.75,
"f1-score": 0.8035714285714286,
"support": 8,
},
"conversation_accuracy": {
"accuracy": 2.0 / 3.0,
"total": 3,
"correct": 2,
},
},
],
],
)
async def test_story_report(
tmpdir: Path,
core_agent: Agent,
stories_yaml: Text,
expected_results: Dict[Text, Dict[Text, Any]],
) -> None:
"""Check story_report.json file contains correct result keys/values."""

stories_path = tmpdir / "stories.yml"
stories_path.write_text(stories_yaml, "utf8")
out_directory = tmpdir / "results"
out_directory.mkdir()

await evaluate_stories(stories_path, core_agent, out_directory=out_directory)
story_report_path = out_directory / "story_report.json"
assert story_report_path.exists()

actual_results = json.loads(story_report_path.read_text("utf8"))
assert actual_results == expected_results
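A note on the expected numbers above: the classification report shows `utter_default` is never predicted correctly (precision/recall 0.0), so presumably only story1 and story2 pass, which is what the expected `conversation_accuracy` entry encodes (a hypothetical recomputation, not from the PR):

num_correct, num_failed = 2, 1   # story3 assumed to be the only failing conversation
assert {
    "accuracy": num_correct / (num_correct + num_failed),
    "correct": num_correct,
    "total": num_correct + num_failed,
} == {"accuracy": 2.0 / 3.0, "correct": 2, "total": 3}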


@pytest.mark.trains_model
async def test_story_report_with_empty_stories(
tmpdir: Path, core_agent: Agent,
) -> None:
"""Check that story_report.json contains empty dictionary when stories.yml is empty."""

stories_path = tmpdir / "stories.yml"
stories_path.write_text("", "utf8")
out_directory = tmpdir / "results"
out_directory.mkdir()

await evaluate_stories(stories_path, core_agent, out_directory=out_directory)
story_report_path = out_directory / "story_report.json"
assert story_report_path.exists()

actual_results = json.loads(story_report_path.read_text("utf8"))
assert actual_results == {}


@pytest.mark.parametrize(
"skip_field,skip_value",
[
[None, None,],
["precision", None,],
["f1", None,],
["in_training_data_fraction", None,],
["report", None,],
["include_report", False,],
],
)
def test_log_evaluation_table(caplog, skip_field, skip_value):
"""Check that _log_evaluation_table correctly omits/includes optional args."""
arr = [1, 1, 1, 0]
acc = 0.75
kwargs = {
"precision": 0.5,
"f1": 0.6,
"in_training_data_fraction": 0.1,
"report": {"macro f1": 0.7},
}
if skip_field:
kwargs[skip_field] = skip_value
caplog.set_level(logging.INFO)
rasa.core.test._log_evaluation_table(arr, "CONVERSATION", acc, **kwargs)

assert f"Correct: {int(len(arr) * acc)} / {len(arr)}" in caplog.text
assert f"Accuracy: {acc:.3f}" in caplog.text

if skip_field != "f1":
assert f"F1-Score: {kwargs['f1']:5.3f}" in caplog.text
else:
assert "F1-Score:" not in caplog.text

if skip_field != "precision":
assert f"Precision: {kwargs['precision']:5.3f}" in caplog.text
else:
assert "Precision:" not in caplog.text

if skip_field != "in_training_data_fraction":
assert (
f"In-data fraction: {kwargs['in_training_data_fraction']:.3g}"
in caplog.text
)
else:
assert "In-data fraction:" not in caplog.text

if skip_field != "report" and skip_field != "include_report":
assert f"Classification report: \n{kwargs['report']}" in caplog.text
else:
assert "Classification report:" not in caplog.text