From 297697511e3fb42a9b03b832f8bd055893fefdc0 Mon Sep 17 00:00:00 2001
From: IanCa <ianrcallanan@gmail.com>
Date: Thu, 10 Aug 2023 17:46:51 -0500
Subject: [PATCH 1/2] Add summary to top of schema comparisons

---
 hed/schema/schema_compare.py        | 66 ++++++++++++++++++++++++++---
 tests/schema/test_schema_compare.py | 53 +++++++++++++++++------
 2 files changed, 100 insertions(+), 19 deletions(-)

diff --git a/hed/schema/schema_compare.py b/hed/schema/schema_compare.py
index 5a64f6b7..a229ee1a 100644
--- a/hed/schema/schema_compare.py
+++ b/hed/schema/schema_compare.py
@@ -12,11 +12,21 @@
     HedSectionKey.UnitModifiers: "Unit Modifier",
     HedSectionKey.Properties: "Property",
     HedSectionKey.Attributes: "Attribute",
+}
 
+SectionEntryNamesPlural = {
+    HedSectionKey.Tags: "Tags",
+    HedSectionKey.Units: "Units",
+    HedSectionKey.UnitClasses: "Unit Classes",
+    HedSectionKey.ValueClasses: "Value Classes",
+    HedSectionKey.UnitModifiers: "Unit Modifiers",
+    HedSectionKey.Properties: "Properties",
+    HedSectionKey.Attributes: "Attributes",
 }
 
 
-def find_matching_tags(schema1, schema2, output='raw', sections=(HedSectionKey.Tags,)):
+def find_matching_tags(schema1, schema2, output='raw', sections=(HedSectionKey.Tags,),
+                       include_summary=True):
     """
     Compare the tags in two library schemas.  This finds tags with the same term.
 
@@ -28,7 +38,8 @@ def find_matching_tags(schema1, schema2, output='raw', sections=(HedSectionKey.T
                       'dict' returns a json style dictionary
         sections(list): the list of sections to compare.  By default, just the tags section.
                         If None, checks all sections including header, prologue, and epilogue.
-
+        include_summary(bool): If True, adds a 'summary' dict to the 'dict' output and prepends the summary to the
+                               'string' output.  Lists the names of the entries found in both schemas.
     Returns:
         dict, json style dict, or str: A dictionary containing matching entries in the Tags section of both schemas.
     """
@@ -37,8 +48,12 @@ def find_matching_tags(schema1, schema2, output='raw', sections=(HedSectionKey.T
     for section_key, section_dict in matches.items():
         section_dict.update(unequal_entries[section_key])
 
+    header_summary = _get_tag_name_summary((matches, unequal_entries))
+
     if output == 'string':
         final_string = ""
+        if include_summary:
+            final_string += _pretty_print_header(header_summary)
         if sections is None:
             sections = HedSectionKey
         for section_key in sections:
@@ -51,6 +66,9 @@ def find_matching_tags(schema1, schema2, output='raw', sections=(HedSectionKey.T
         return final_string
     elif output == 'dict':
         output_dict = {}
+        if include_summary:
+            output_dict["summary"] = {str(key): value for key, value in header_summary.items()}
+
         for section_name, section_entries in matches.items():
             output_dict[str(section_name)] = {}
             for key, (entry1, entry2) in section_entries.items():
@@ -59,7 +77,8 @@ def find_matching_tags(schema1, schema2, output='raw', sections=(HedSectionKey.T
     return matches
 
 
-def compare_differences(schema1, schema2, output='raw', attribute_filter=None, sections=(HedSectionKey.Tags,)):
+def compare_differences(schema1, schema2, output='raw', attribute_filter=None, sections=(HedSectionKey.Tags,),
+                        include_summary=True):
     """
     Compare the tags in two schemas, this finds any differences
 
@@ -75,6 +94,8 @@ def compare_differences(schema1, schema2, output='raw', attribute_filter=None, s
                                           If it evaluates to False, no filtering is performed.
         sections(list or None): the list of sections to compare.  By default, just the tags section.
                 If None, checks all sections including header, prologue, and epilogue.
+        include_summary(bool): If True, adds a 'summary' dict to the 'dict' output and prepends the summary to the
+                               'string' output.  Lists the names of all the nodes that are missing or different.
 
     Returns:
         tuple, str or dict: 
@@ -94,14 +115,15 @@ def compare_differences(schema1, schema2, output='raw', attribute_filter=None, s
     if sections is None:
         sections = HedSectionKey
 
+    header_summary = _get_tag_name_summary((not_in_1, not_in_2, unequal_entries))
     if output == 'string':
         final_string = ""
+        if include_summary:
+            final_string += _pretty_print_header(header_summary)
         for section_key in sections:
             val1, val2, val3 = unequal_entries[section_key], not_in_1[section_key], not_in_2[section_key]
             type_name = SectionEntryNames[section_key]
             if val1 or val2 or val3:
-                if final_string:
-                    final_string += "\n\n"
                 final_string += f"{type_name} differences:\n"
                 if val1:
                     final_string += _pretty_print_diff_all(val1, type_name=type_name) + "\n"
@@ -109,11 +131,15 @@ def compare_differences(schema1, schema2, output='raw', attribute_filter=None, s
                     final_string += _pretty_print_missing_all(val2, "Schema1", type_name) + "\n"
                 if val3:
                     final_string += _pretty_print_missing_all(val3, "Schema2", type_name) + "\n"
+                final_string += "\n\n"
         return final_string
     elif output == 'dict':
         # todo: clean this part up
         output_dict = {}
         current_section = {}
+        if include_summary:
+            output_dict["summary"] = {str(key): value for key, value in header_summary.items()}
+
         output_dict["unequal"] = current_section
         for section_name, section_entries in unequal_entries.items():
             current_section[str(section_name)] = {}
@@ -212,6 +238,36 @@ def compare_schemas(schema1, schema2, attribute_filter=HedKey.InLibrary, section
     return matches, not_in_schema1, not_in_schema2, unequal_entries
 
 
+def _get_tag_name_summary(tag_dicts):
+    """ Return {section key: [entry names]} gathered from every dict in tag_dicts, each name listed once. """
+    names_by_section = {section_key: {} for section_key in HedSectionKey}  # dict keys: ordered, no repeats
+    for tag_dict in tag_dicts:
+        for section_key, section in tag_dict.items():
+            if section_key != MiscSection:  # MiscSection is left out of the summary
+                names_by_section[section_key].update(dict.fromkeys(section.keys()))
+
+    return {section_key: list(names) for section_key, names in names_by_section.items()}
+
+
+def _pretty_print_header(summary_dict):
+    """ Format the summary of entry names as the header of a comparison string.
+
+    Parameters:
+        summary_dict (dict): Maps section keys to a list of entry names.
+
+    Returns:
+        str: One "<plural section name>: <sorted, comma separated names>" line per
+             non-empty section, separated by blank lines and followed by a blank line.
+    """
+    section_lines = []
+    for section_key, tag_names in summary_dict.items():
+        if tag_names:
+            type_name = SectionEntryNamesPlural[section_key]
+            section_lines.append(f"{type_name}: " + ", ".join(sorted(tag_names)) + "\n")
+
+    return "\n".join(section_lines) + "\n"
+
+
 def _pretty_print_entry(entry):
     """ Returns the contents of a HedSchemaEntry object as a list of strings.
 
diff --git a/tests/schema/test_schema_compare.py b/tests/schema/test_schema_compare.py
index 6543fa27..c10085fe 100644
--- a/tests/schema/test_schema_compare.py
+++ b/tests/schema/test_schema_compare.py
@@ -1,9 +1,6 @@
 import unittest
-import os
-import io
 import json
 
-
 from hed.schema import HedKey, HedSectionKey, from_string
 from hed.schema.schema_compare import compare_schemas, find_matching_tags, \
     _pretty_print_diff_all, _pretty_print_missing_all, compare_differences
@@ -11,7 +8,6 @@
 
 
 class TestSchemaComparison(unittest.TestCase):
-
     library_schema_start = """HED library="testcomparison" version="1.1.0" withStandard="8.2.0" unmerged="true"
 
 '''Prologue'''
@@ -37,7 +33,7 @@ def load_schema1(self):
                       " *TestNode2",
                       " *TestNode3",
                       " *TestNode4"
-                     ]
+                      ]
         return self._get_test_schema(test_nodes)
 
     def load_schema2(self):
@@ -45,7 +41,7 @@ def load_schema2(self):
                       " *TestNode2",
                       " **TestNode3",
                       " *TestNode5"
-                     ]
+                      ]
 
         return self._get_test_schema(test_nodes)
 
@@ -63,16 +59,28 @@ def test_find_matching_tags(self):
         self.assertNotIn("TestNode4", result[HedSectionKey.Tags])
         self.assertNotIn("TestNode5", result[HedSectionKey.Tags])
 
-        match_string = find_matching_tags(schema1, schema2, output='string')
+        # Test with include_summary=True
+        match_string = find_matching_tags(schema1, schema2, output='string', include_summary=True)
         self.assertIsInstance(match_string, str)
-        # print(match_string)
+        self.assertIn("Tags:", match_string)
+        print(match_string)
 
-        json_style_dict = find_matching_tags(schema1, schema2, output='dict')
+        json_style_dict = find_matching_tags(schema1, schema2, output='dict', include_summary=True)
         self.assertIsInstance(json_style_dict, dict)
+        self.assertIn("summary", json_style_dict)
 
         result_string = json.dumps(json_style_dict, indent=4)
         self.assertIsInstance(result_string, str)
 
+        # Test with include_summary=False
+        match_string_no_summary = find_matching_tags(schema1, schema2, output='string', include_summary=False)
+        self.assertIsInstance(match_string_no_summary, str)
+        self.assertNotIn("Tags:", match_string_no_summary)
+
+        json_style_dict_no_summary = find_matching_tags(schema1, schema2, output='dict', include_summary=False)
+        self.assertIsInstance(json_style_dict_no_summary, dict)
+        self.assertNotIn("summary", json_style_dict_no_summary)
+
     def test_compare_schemas(self):
         schema1 = self.load_schema1()
         schema2 = self.load_schema2()
@@ -109,9 +117,26 @@ def test_compare_differences(self):
         self.assertEqual(len(unequal_entries[HedSectionKey.Tags]), 1)  # No unequal entries should be found
         self.assertIn("TestNode3", unequal_entries[HedSectionKey.Tags])
 
-        diff_string = compare_differences(schema1, schema2, output='string')
-        self.assertIsInstance(diff_string, str)
-        # print(diff_string)
+        # Test with include_summary=True, string output
+        diff_string_with_summary = compare_differences(schema1, schema2, output='string', include_summary=True)
+        self.assertIsInstance(diff_string_with_summary, str)
+        self.assertIn("Tags:", diff_string_with_summary)
+        # print(diff_string_with_summary)
+
+        # Test with include_summary=True, dict output
+        diff_dict_with_summary = compare_differences(schema1, schema2, output='dict', include_summary=True)
+        self.assertIsInstance(diff_dict_with_summary, dict)
+        self.assertIn("summary", diff_dict_with_summary)
+
+        # Test with include_summary=False, string output
+        diff_string_no_summary = compare_differences(schema1, schema2, output='string', include_summary=False)
+        self.assertIsInstance(diff_string_no_summary, str)
+        self.assertNotIn("Tags:", diff_string_no_summary)
+
+        # Test with include_summary=False, dict output
+        diff_dict_no_summary = compare_differences(schema1, schema2, output='dict', include_summary=False)
+        self.assertIsInstance(diff_dict_no_summary, dict)
+        self.assertNotIn("summary", diff_dict_no_summary)
 
     def test_compare_score_lib_versions(self):
         schema1 = load_schema_version("score_1.0.0")
@@ -128,8 +153,8 @@ def test_compare_score_lib_versions(self):
         # print(diff_string)
 
         json_style_dict = compare_differences(schema1, schema2, attribute_filter=HedKey.InLibrary, output='dict',
-                                          sections=None)
+                                              sections=None)
         self.assertIsInstance(json_style_dict, dict)
 
         result_string = json.dumps(json_style_dict, indent=4)
-        self.assertIsInstance(result_string, str)
\ No newline at end of file
+        self.assertIsInstance(result_string, str)

From b0c5521a6813fe08eff7d164a9568d92affbbc91 Mon Sep 17 00:00:00 2001
From: IanCa <ianrcallanan@gmail.com>
Date: Thu, 10 Aug 2023 18:13:00 -0500
Subject: [PATCH 2/2] Simplify find_def_tags.  Remove print from test

---
 hed/models/hed_group.py             | 31 +++++++++++++++++------------
 tests/schema/test_schema_compare.py |  2 +-
 2 files changed, 19 insertions(+), 14 deletions(-)

diff --git a/hed/models/hed_group.py b/hed/models/hed_group.py
index 1f1c02d4..aa90d227 100644
--- a/hed/models/hed_group.py
+++ b/hed/models/hed_group.py
@@ -466,27 +466,32 @@ def find_def_tags(self, recursive=False, include_groups=3):
         Returns:
             list: A list of tuples. The contents depend on the values of the include_group.
         """
-        from hed.models.definition_dict import DefTagNames
         if recursive:
             groups = self.get_all_groups()
+            def_tags = []
+            for group in groups:
+                def_tags += self._get_def_tags_from_group(group)
         else:
-            groups = (self,)
-
-        def_tags = []
-        for group in groups:
-            for child in group.children:
-                if isinstance(child, HedTag):
-                    if child.short_base_tag == DefTagNames.DEF_ORG_KEY:
-                        def_tags.append((child, child, group))
-                else:
-                    for tag in child.tags():
-                        if tag.short_base_tag == DefTagNames.DEF_EXPAND_ORG_KEY:
-                            def_tags.append((tag, child, group))
+            def_tags =  self._get_def_tags_from_group(self)
 
         if include_groups == 0 or include_groups == 1 or include_groups == 2:
             return [tag[include_groups] for tag in def_tags]
         return def_tags
 
+    @staticmethod
+    def _get_def_tags_from_group(group):
+        """ List (tag, child, group) for each DEF_ORG_KEY / DEF_EXPAND_ORG_KEY tag in group's children. """
+        from hed.models.definition_dict import DefTagNames
+        found = []
+        for child in group.children:
+            if not isinstance(child, HedTag):
+                # Non-tag children are searched through child.tags() for DEF_EXPAND_ORG_KEY tags.
+                found += [(tag, child, group) for tag in child.tags()
+                          if tag.short_base_tag == DefTagNames.DEF_EXPAND_ORG_KEY]
+            elif child.short_base_tag == DefTagNames.DEF_ORG_KEY:
+                found.append((child, child, group))
+        return found
+
     def find_tags_with_term(self, term, recursive=False, include_groups=2):
         """  Find any tags that contain the given term.
 
diff --git a/tests/schema/test_schema_compare.py b/tests/schema/test_schema_compare.py
index c10085fe..ccc08b79 100644
--- a/tests/schema/test_schema_compare.py
+++ b/tests/schema/test_schema_compare.py
@@ -63,7 +63,7 @@ def test_find_matching_tags(self):
         match_string = find_matching_tags(schema1, schema2, output='string', include_summary=True)
         self.assertIsInstance(match_string, str)
         self.assertIn("Tags:", match_string)
-        print(match_string)
+        # print(match_string)
 
         json_style_dict = find_matching_tags(schema1, schema2, output='dict', include_summary=True)
         self.assertIsInstance(json_style_dict, dict)