diff --git a/src/evaluators/json_evaluator.py b/src/evaluators/json_evaluator.py index 98a800b..71949ef 100644 --- a/src/evaluators/json_evaluator.py +++ b/src/evaluators/json_evaluator.py @@ -29,7 +29,7 @@ def __init__( self.result = {} def __call__(self, ground_truth, actual, eval_schema={}): - self.compare_dicts(ground_truth, actual, eval_schema) + self.compare_values(ground_truth, actual, eval_schema, None) for wrapper in self.eval_wrappers: self.result[f"{wrapper.name}.ratio"] = ( wrapper.calculate_ratio() @@ -38,42 +38,54 @@ def __call__(self, ground_truth, actual, eval_schema={}): return self.result def compare_values(self, ground_truth, actual, eval_schema, curr_key): - if isinstance(ground_truth, dict) and isinstance(actual, dict): + if isinstance(ground_truth, dict): return self.compare_dicts(ground_truth, actual, eval_schema, curr_key) - elif isinstance(ground_truth, list) and isinstance(actual, list): + elif isinstance(ground_truth, list): return self.compare_lists(ground_truth, actual, eval_schema, curr_key) else: for wrapper in self.eval_wrappers: - score = wrapper.instance( - ground_truth, - actual, - eval_schema.get(wrapper.name, None), - ) + if actual is None: + score = 0 + else: + score = wrapper.instance( + ground_truth, + actual, + eval_schema.get(wrapper.name, None), + ) wrapper.total_strings_compared += 1 self.result[f"{wrapper.name}.{curr_key}"] = score wrapper.total_score += score def compare_dicts(self, ground_truth_dict, actual_dict, eval_schema, curr_key=None): for key in ground_truth_dict: - if key not in actual_dict: - for string_evaluator in self.eval_wrappers: - string_evaluator.total_strings_compared += 1 - else: - next_key = f"{curr_key}.{key}" if curr_key is not None else key - self.compare_values( - ground_truth_dict[key], - actual_dict[key], - eval_schema.get(key, {}), - next_key, - ) + # handle defaults if is None + next_key = f"{curr_key}.{key}" if curr_key is not None else key + actual = actual_dict.get(key, None) if actual_dict is not None else None + curr_eval_schema = eval_schema.get(key, {}) if eval_schema is not None else {} + + self.compare_values( + ground_truth_dict[key], + actual, + curr_eval_schema, + next_key, + ) def compare_lists(self, ground_truth_list, actual_list, eval_schema, curr_key): - if not eval_schema: - eval_schema = [{}] * len(ground_truth_list) + for i in range(len(ground_truth_list)): + # handle defaults if is None + next_key = f"{curr_key}[{i}]" if curr_key is not None else f"[{i}]" + try: + actual = actual_list[i] + except Exception: + actual = None + try: + curr_eval_schema = eval_schema[i] + except Exception: + curr_eval_schema = {} - for i, (ground_truth_item, actual_item, schema) in enumerate( - zip(ground_truth_list, actual_list, eval_schema) - ): self.compare_values( - ground_truth_item, actual_item, schema, f"{curr_key}[{i}]" + ground_truth_list[i], + actual, + curr_eval_schema, + next_key, ) diff --git a/src/evaluators/tests/test_json_evaluator.py b/src/evaluators/tests/test_json_evaluator.py index 45d5604..67cd3cf 100644 --- a/src/evaluators/tests/test_json_evaluator.py +++ b/src/evaluators/tests/test_json_evaluator.py @@ -39,8 +39,8 @@ def test_json_evaluator_no_eval_schema(self): }, # key5 is missing }, + # key3 is missing "key4": "value10", # correct 6 - # key2 is missing } # Total correct = 6 # ratio = 6/10 = 0.6 @@ -167,3 +167,84 @@ def test_json_evaluator_no_eval_schema_with_default_config(self): result = json_evaluator(ground_truth_data, actual_data) assert result["CustomStringEvaluator.ratio"] == 0.5 assert result['FuzzStringEvaluator.ratio'] == 0.764 + + def test_json_evaluator_different_array_length_in_actual(self): + ground_truth_data = { + "key1": "value1", # value 1 + "key2": ["test1", "test2", "test3"], # Values 2, 3, 4 + } + # Total values = 4 + + actual_data = { + "key1": "value1", # correct 1 + "key2": ["test1"], # correct 2, wrong 1, wrong 2 (missing index 1, 2) + } + + evaluators = [CustomStringEvaluator()] + + # Total correct = 2 + # ratio = 2/4 = 0.5 + + json_evaluator = JsonEvaluator(evaluators) + result = json_evaluator(ground_truth_data, actual_data) + assert result["CustomStringEvaluator.ratio"] == 0.5 + assert result['CustomStringEvaluator.key1'] == 1 + assert result['CustomStringEvaluator.key2[0]'] == 1 + assert result['CustomStringEvaluator.key2[1]'] == 0 + assert result['CustomStringEvaluator.key2[2]'] == 0 + + def test_json_evaluator_handles_array_first_value(self): + ground_truth_data = [ + {"key1": "value1"}, # value 1 + {"key2": ["1", "2", "3"]}, + "array_value_3" + ] + # Total values = 5 + + actual_data = [ + {"key1": "value1"}, # correct 1 + {"key2": ["1", "wrong", "3"]}, # correct 2, wrong 1, correct 3 + "array_value_3" # correct 4 + ] + + # Total correct = 4 + # ratio = 4/5 = 0.8 + + evaluators = [CustomStringEvaluator()] + + json_evaluator = JsonEvaluator(evaluators) + result = json_evaluator(ground_truth_data, actual_data) + assert result["CustomStringEvaluator.ratio"] == 0.8 + assert result['CustomStringEvaluator.[0].key1'] == 1 + assert result['CustomStringEvaluator.[1].key2[0]'] == 1 + assert result['CustomStringEvaluator.[1].key2[1]'] == 0 + assert result['CustomStringEvaluator.[1].key2[2]'] == 1 + assert result['CustomStringEvaluator.[2]'] == 1 + + def test_json_evaluator_handles_array_dict_mismatch(self): + ground_truth_data = [ + {"key1": "value1"}, # value 1 + {"key2": ["1", "2", "3"]}, + "array_value_3" + ] + # Total values = 5 + + # all values should be wrong, as this is a dict and not an array + actual_data = { + "key1": "value1", + "key2": ["1", "wrong", "3"], + } + + # Total correct = 0 + # ratio = 0/5 = 0 + + evaluators = [CustomStringEvaluator()] + + json_evaluator = JsonEvaluator(evaluators) + result = json_evaluator(ground_truth_data, actual_data) + assert result["CustomStringEvaluator.ratio"] == 0 + assert result['CustomStringEvaluator.[0].key1'] == 0 + assert result['CustomStringEvaluator.[1].key2[0]'] == 0 + assert result['CustomStringEvaluator.[1].key2[1]'] == 0 + assert result['CustomStringEvaluator.[1].key2[2]'] == 0 + assert result['CustomStringEvaluator.[2]'] == 0 \ No newline at end of file