Azure-Samples · albertaga27 · Aug 14, 2024 · Aug 7, 2024 · Aug 7, 2024 · Aug 7, 2024
diff --git a/src/evaluators/json_evaluator.py b/src/evaluators/json_evaluator.py
@@ -29,7 +29,7 @@ def __init__(
         self.result = {}
 
     def __call__(self, ground_truth, actual, eval_schema={}):
-        self.compare_dicts(ground_truth, actual, eval_schema)
+        self.compare_values(ground_truth, actual, eval_schema, None)
         for wrapper in self.eval_wrappers:
             self.result[f"{wrapper.name}.ratio"] = (
                 wrapper.calculate_ratio()
@@ -38,42 +38,54 @@ def __call__(self, ground_truth, actual, eval_schema={}):
         return self.result
 
     def compare_values(self, ground_truth, actual, eval_schema, curr_key):
-        if isinstance(ground_truth, dict) and isinstance(actual, dict):
+        if isinstance(ground_truth, dict):
             return self.compare_dicts(ground_truth, actual, eval_schema, curr_key)
-        elif isinstance(ground_truth, list) and isinstance(actual, list):
+        elif isinstance(ground_truth, list):
             return self.compare_lists(ground_truth, actual, eval_schema, curr_key)
         else:
             for wrapper in self.eval_wrappers:
-                score = wrapper.instance(
-                    ground_truth,
-                    actual,
-                    eval_schema.get(wrapper.name, None),
-                )
+                if actual is None:
+                    score = 0
+                else:
+                    score = wrapper.instance(
+                        ground_truth,
+                        actual,
+                        eval_schema.get(wrapper.name, None),
+                    )
                 wrapper.total_strings_compared += 1
                 self.result[f"{wrapper.name}.{curr_key}"] = score
                 wrapper.total_score += score
 
     def compare_dicts(self, ground_truth_dict, actual_dict, eval_schema, curr_key=None):
         for key in ground_truth_dict:
-            if key not in actual_dict:
-                for string_evaluator in self.eval_wrappers:
-                    string_evaluator.total_strings_compared += 1
-            else:
-                next_key = f"{curr_key}.{key}" if curr_key is not None else key
-                self.compare_values(
-                    ground_truth_dict[key],
-                    actual_dict[key],
-                    eval_schema.get(key, {}),
-                    next_key,
-                )
+            # Missing keys and None inputs fall back to defaults (None / {})
+            next_key = f"{curr_key}.{key}" if curr_key is not None else key
+            actual = actual_dict.get(key, None) if actual_dict is not None else None
+            curr_eval_schema = eval_schema.get(key, {}) if eval_schema is not None else {}
+
+            self.compare_values(
+                ground_truth_dict[key],
+                actual,
+                curr_eval_schema,
+                next_key,
+            )
 
     def compare_lists(self, ground_truth_list, actual_list, eval_schema, curr_key):
-        if not eval_schema:
-            eval_schema = [{}] * len(ground_truth_list)
+        for i in range(len(ground_truth_list)):
+            # Fall back to defaults (None / {}) when indexing actual_list or eval_schema fails
+            next_key = f"{curr_key}[{i}]" if curr_key is not None else f"[{i}]"
+            try:
+                actual = actual_list[i]
+            except Exception:
+                actual = None
+            try:
+                curr_eval_schema = eval_schema[i]
+            except Exception:
+                curr_eval_schema = {}
 
-        for i, (ground_truth_item, actual_item, schema) in enumerate(
-            zip(ground_truth_list, actual_list, eval_schema)
-        ):
             self.compare_values(
-                ground_truth_item, actual_item, schema, f"{curr_key}[{i}]"
+                ground_truth_list[i],
+                actual,
+                curr_eval_schema,
+                next_key,
             )
diff --git a/src/evaluators/tests/test_json_evaluator.py b/src/evaluators/tests/test_json_evaluator.py
@@ -39,8 +39,8 @@ def test_json_evaluator_no_eval_schema(self):
                 },
                 # key5 is missing
             },
+            # key3 is missing
             "key4": "value10",  # correct 6
-            # key2 is missing
         }
         # Total correct = 6
         # ratio = 6/10 = 0.6
@@ -167,3 +167,84 @@ def test_json_evaluator_no_eval_schema_with_default_config(self):
         result = json_evaluator(ground_truth_data, actual_data)
         assert result["CustomStringEvaluator.ratio"] == 0.5
         assert result['FuzzStringEvaluator.ratio'] == 0.764
+
+    def test_json_evaluator_different_array_length_in_actual(self):
+        ground_truth_data = {
+            "key1": "value1",  # value 1
+            "key2": ["test1", "test2", "test3"],  # Values 2, 3, 4
+        }
+        # Total values = 4
+
+        actual_data = {
+            "key1": "value1",   # correct 1
+            "key2": ["test1"],  # correct 2; indices 1 and 2 are missing (wrong 1, wrong 2)
+        }
+
+        evaluators = [CustomStringEvaluator()]
+
+        # Total correct = 2
+        # ratio = 2/4 = 0.5
+
+        json_evaluator = JsonEvaluator(evaluators)
+        result = json_evaluator(ground_truth_data, actual_data)
+        assert result["CustomStringEvaluator.ratio"] == 0.5
+        assert result['CustomStringEvaluator.key1'] == 1
+        assert result['CustomStringEvaluator.key2[0]'] == 1
+        assert result['CustomStringEvaluator.key2[1]'] == 0
+        assert result['CustomStringEvaluator.key2[2]'] == 0
+
+    def test_json_evaluator_handles_array_first_value(self):
+        ground_truth_data = [
+            {"key1": "value1"},  # value 1
+            {"key2": ["1", "2", "3"]},
+            "array_value_3"
+        ]
+        # Total values = 5
+
+        actual_data = [
+            {"key1": "value1"},  # correct 1
+            {"key2": ["1", "wrong", "3"]}, # correct 2, wrong 1, correct 3
+            "array_value_3" # correct 4
+        ]
+
+        # Total correct = 4
+        # ratio = 4/5 = 0.8
+
+        evaluators = [CustomStringEvaluator()]
+
+        json_evaluator = JsonEvaluator(evaluators)
+        result = json_evaluator(ground_truth_data, actual_data)
+        assert result["CustomStringEvaluator.ratio"] == 0.8
+        assert result['CustomStringEvaluator.[0].key1'] == 1
+        assert result['CustomStringEvaluator.[1].key2[0]'] == 1
+        assert result['CustomStringEvaluator.[1].key2[1]'] == 0
+        assert result['CustomStringEvaluator.[1].key2[2]'] == 1
+        assert result['CustomStringEvaluator.[2]'] == 1
+
+    def test_json_evaluator_handles_array_dict_mismatch(self):
+        ground_truth_data = [
+            {"key1": "value1"},  # value 1
+            {"key2": ["1", "2", "3"]},
+            "array_value_3"
+        ]
+        # Total values = 5
+
+        # All values should be scored as wrong, because actual_data is a dict while ground_truth_data is a list
+        actual_data = {
+            "key1": "value1",
+            "key2": ["1", "wrong", "3"],  
+        }
+
+        # Total correct = 0
+        # ratio = 0/5 = 0
+
+        evaluators = [CustomStringEvaluator()]
+
+        json_evaluator = JsonEvaluator(evaluators)
+        result = json_evaluator(ground_truth_data, actual_data)
+        assert result["CustomStringEvaluator.ratio"] == 0
+        assert result['CustomStringEvaluator.[0].key1'] == 0
+        assert result['CustomStringEvaluator.[1].key2[0]'] == 0
+        assert result['CustomStringEvaluator.[1].key2[1]'] == 0
+        assert result['CustomStringEvaluator.[1].key2[2]'] == 0
+        assert result['CustomStringEvaluator.[2]'] == 0