Merge pull request #719 from hed-standard/master

Remodeling JSON summaries now have common output format.
hed-standard · Jul 4, 2023 · 57fac3a · 57fac3a
2 parents 9018eca + 96a4d8a
commit 57fac3a
Show file tree

Hide file tree

Showing 60 changed files with 1,289 additions and 696 deletions.
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
@@ -25,7 +25,7 @@ jobs:
       - uses: actions/cache@v3
         with:
           path: ${{ env.pythonLocation }}
-          key: ${{ env.pythonLocation }}-${{ hashFiles('setup.py') }}-${{ hashFiles('dev-requirements.txt') }}
+          key: ${{ env.pythonLocation }}-${{ hashFiles('setup.py') }}-${{ hashFiles('docs/requirements.txt') }}
 
       - name: Install dependencies
         run: |
@@ -85,7 +85,7 @@ jobs:
       - uses: actions/cache@v3
         with:
           path: ${{ env.pythonLocation }}
-          key: ${{ env.pythonLocation }}-${{ hashFiles('setup.py') }}-${{ hashFiles('dev-requirements.txt') }}
+          key: ${{ env.pythonLocation }}-${{ hashFiles('setup.py') }}-${{ hashFiles('docs/requirements.txt') }}
 
       - name: Install dependencies
         run: |

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,11 @@
+Release 0.3.1 July 3, 2023
+- Pinned the versions of the pydantic and inflect libraries due to a conflict.
+- Reorganized JSON output of remodeling summaries so that all are of a consistent form.
+- Fixed summarize_hed_tags_op so that tags were correctly categorized for output.
+- Minor refactoring to reduce code complexity.
+- BaseInput and Sidecar now raise HedFileError if input could not be read.
+
+
 Release 0.3.0 June 20, 2023
 - Introduction of partnered schema.
 - Improved error handling for schema validation.

diff --git a/LICENSE b/LICENSE
@@ -1,4 +1,4 @@
-MIT License
+The MIT License (MIT)
 
 Copyright (c) 2020+ HED Standard Working Group
 

diff --git a/docs/requirements.txt b/docs/requirements.txt
@@ -1,10 +1,10 @@
 defusedxml>=0.7.1
-inflect>=6.0.2
-myst-parser>=0.18.1
+inflect>=6.0.5
 numpy>=1.21.6
 openpyxl>=3.1.0
 pandas>=1.3.5
 portalocker>=2.7.0
 semantic_version>=2.10.0
 Sphinx>=5.2.2
 sphinx_rtd_theme>=1.0.0
+wordcloud==1.9.2
diff --git a/hed/errors/exceptions.py b/hed/errors/exceptions.py
@@ -2,6 +2,7 @@
 
 
 class HedExceptions:
+    GENERIC_ERROR = 'GENERIC_ERROR'
     # A list of all exceptions that can be generated by the hedtools.
     FILE_NOT_FOUND = 'fileNotFound'
     BAD_PARAMETERS = 'badParameters'
@@ -10,7 +11,7 @@ class HedExceptions:
     INVALID_EXTENSION = 'invalidExtension'
 
     INVALID_DATAFRAME = 'INVALID_DATAFRAME'
-
+    INVALID_FILE_FORMAT = 'INVALID_FILE_FORMAT'
     # These are actual schema issues, not that the file cannot be found or parsed
     SCHEMA_HEADER_MISSING = 'HED_SCHEMA_HEADER_INVALID'
     HED_SCHEMA_HEADER_INVALID = 'HED_SCHEMA_HEADER_INVALID'

diff --git a/hed/models/base_input.py b/hed/models/base_input.py
@@ -43,12 +43,10 @@ def __init__(self, file, file_type=None, worksheet_name=None, has_column_names=T
             - An invalid dataframe was passed with size 0
             - An invalid extension was provided
             - A duplicate or empty column name appears
-
-        :raises OSError:
             - Cannot open the indicated file
-
-        :raises KeyError:
             - The specified worksheet name does not exist
+            - If the sidecar file or tabular file had invalid format and could not be read.
+
          """
         if mapper is None:
             mapper = ColumnMapper()
@@ -77,14 +75,20 @@ def __init__(self, file, file_type=None, worksheet_name=None, has_column_names=T
         elif not file:
             raise HedFileError(HedExceptions.FILE_NOT_FOUND, "Empty file passed to BaseInput.", file)
         elif input_type in self.TEXT_EXTENSION:
-            self._dataframe = pandas.read_csv(file, delimiter='\t', header=pandas_header,
-                                              dtype=str, keep_default_na=True, na_values=None)
+            try:
+                self._dataframe = pandas.read_csv(file, delimiter='\t', header=pandas_header,
+                                                  dtype=str, keep_default_na=True, na_values=None)
+            except Exception as e:
+                raise HedFileError(HedExceptions.INVALID_FILE_FORMAT, str(e), self.name) from e
             # Convert nan values to a known value
             self._dataframe = self._dataframe.fillna("n/a")
         elif input_type in self.EXCEL_EXTENSION:
-            self._loaded_workbook = openpyxl.load_workbook(file)
-            loaded_worksheet = self.get_worksheet(self._worksheet_name)
-            self._dataframe = self._get_dataframe_from_worksheet(loaded_worksheet, has_column_names)
+            try:
+                self._loaded_workbook = openpyxl.load_workbook(file)
+                loaded_worksheet = self.get_worksheet(self._worksheet_name)
+                self._dataframe = self._get_dataframe_from_worksheet(loaded_worksheet, has_column_names)
+            except Exception as e:
+                raise HedFileError(HedExceptions.GENERIC_ERROR, str(e), self.name) from e
         else:
             raise HedFileError(HedExceptions.INVALID_EXTENSION, "", file)
 
@@ -94,7 +98,7 @@ def __init__(self, file, file_type=None, worksheet_name=None, has_column_names=T
         # todo: Can we get rid of this behavior now that we're using pandas?
         column_issues = ColumnMapper.check_for_blank_names(self.columns, allow_blank_names=allow_blank_names)
         if column_issues:
-            raise HedFileError(HedExceptions.BAD_COLUMN_NAMES, "Duplicate or blank columns found.  See issues.",
+            raise HedFileError(HedExceptions.BAD_COLUMN_NAMES, "Duplicate or blank columns found. See issues.",
                                self.name, issues=column_issues)
 
         self.reset_mapper(mapper)
@@ -285,7 +289,7 @@ def set_cell(self, row_number, column_number, new_string_obj, tag_form="short_ta
 
         Notes:
              Any attribute of a HedTag that returns a string is a valid value of tag_form.
-             
+
         :raises ValueError:
             - There is not a loaded dataframe
 

diff --git a/hed/models/hed_tag.py b/hed/models/hed_tag.py
@@ -602,10 +602,14 @@ def _get_tag_units_portion(self, tag_unit_classes):
     @staticmethod
     def _find_modifier_unit_entry(units, all_valid_unit_permutations):
         possible_match = all_valid_unit_permutations.get(units)
-        if not possible_match or not possible_match.has_attribute(HedKey.UnitSymbol):
-            possible_match = all_valid_unit_permutations.get(units.lower())
-            if possible_match and possible_match.has_attribute(HedKey.UnitSymbol):
-                possible_match = None
+        # If we have a match that's a unit symbol, we're done, return it.
+        if possible_match and possible_match.has_attribute(HedKey.UnitSymbol):
+            return possible_match
+
+        possible_match = all_valid_unit_permutations.get(units.lower())
+        # Unit symbols are case-sensitive: a unit-symbol match after lowercasing (e.g. M becoming m) is rejected.
+        if possible_match and possible_match.has_attribute(HedKey.UnitSymbol):
+            possible_match = None
 
         return possible_match
 

diff --git a/hed/models/sidecar.py b/hed/models/sidecar.py
@@ -127,15 +127,13 @@ def load_sidecar_file(self, file):
         if not file:
             return {}
         elif isinstance(file, str):
+            if not self.name:
+                self.name = file
             try:
                 with open(file, "r") as fp:
-                    if not self.name:
-                        self.name = file
                     return self._load_json_file(fp)
-            except FileNotFoundError as e:
-                raise HedFileError(HedExceptions.FILE_NOT_FOUND, e.strerror, file)
-            except TypeError as e:
-                raise HedFileError(HedExceptions.FILE_NOT_FOUND, str(e), file)
+            except OSError as e:
+                raise HedFileError(HedExceptions.FILE_NOT_FOUND, e.strerror, file) from e
         else:
             return self._load_json_file(file)
 
@@ -189,12 +187,11 @@ def _load_json_file(self, fp):
 
         :raises HedFileError:
             - If the file cannot be parsed.
-            
         """
         try:
             return json.load(fp)
-        except json.decoder.JSONDecodeError as e:
-            raise HedFileError(HedExceptions.CANNOT_PARSE_JSON, str(e), self.name)
+        except (json.decoder.JSONDecodeError, AttributeError) as e:
+            raise HedFileError(HedExceptions.CANNOT_PARSE_JSON, str(e), self.name) from e
 
     def extract_definitions(self, hed_schema=None, error_handler=None):
         """ Gather and validate definitions in metadata.

diff --git a/hed/models/tabular_input.py b/hed/models/tabular_input.py
@@ -13,8 +13,8 @@ def __init__(self, file=None, sidecar=None, name=None):
         """ Constructor for the TabularInput class.
 
         Parameters:
-            file (str or file like): A tsv file to open.
-            sidecar (str or Sidecar): A Sidecar filename or Sidecar
+            file (str or FileLike): A tsv file to open.
+            sidecar (str or Sidecar or FileLike): A Sidecar or source file/filename.
             name (str): The name to display for this file for error purposes.
 
         :raises HedFileError:

diff --git a/hed/schema/hed_schema.py b/hed/schema/hed_schema.py
@@ -509,7 +509,6 @@ def _find_tag_entry(self, tag, schema_namespace=""):
         clean_tag = str(tag)
         namespace = schema_namespace
         clean_tag = clean_tag[len(namespace):]
-        prefix_tag_adj = len(namespace)
         working_tag = clean_tag.lower()
 
         # Most tags are in the schema directly, so test that first
@@ -523,9 +522,26 @@ def _find_tag_entry(self, tag, schema_namespace=""):
 
             return found_entry, remainder, []
 
+        prefix_tag_adj = len(namespace)
+
+        try:
+            found_entry, current_slash_index = self._find_tag_subfunction(tag, working_tag, prefix_tag_adj)
+        except self._TagIdentifyError as e:
+            issue = e.issue
+            return None, None, issue
+
+        remainder = None
+        if current_slash_index != -1:
+            remainder = clean_tag[current_slash_index:]
+        if remainder and found_entry.takes_value_child_entry:
+            found_entry = found_entry.takes_value_child_entry
+
+        return found_entry, remainder, []
+
+    def _find_tag_subfunction(self, tag, working_tag, prefix_tag_adj):
+        """Finds the base tag and remainder from the left, raising exception on issues"""
         current_slash_index = -1
         current_entry = None
-
         # Loop left to right, checking each word.  Once we find an invalid word, we stop.
         while True:
             next_index = working_tag.find("/", current_slash_index + 1)
@@ -541,36 +557,37 @@ def _find_tag_entry(self, tag, schema_namespace=""):
                                                       tag,
                                                       index_in_tag=prefix_tag_adj,
                                                       index_in_tag_end=prefix_tag_adj + next_index)
-                    return None, None, error
+                    raise self._TagIdentifyError(error)
                 # If this is not a takes value node, validate each term in the remainder.
                 if not current_entry.takes_value_child_entry:
-                    child_names = working_tag[current_slash_index + 1:].split("/")
-                    word_start_index = current_slash_index + 1 + prefix_tag_adj
-                    for name in child_names:
-                        if self._get_tag_entry(name):
-                            error = ErrorHandler.format_error(ValidationErrors.INVALID_PARENT_NODE,
-                                                              tag,
-                                                              index_in_tag=word_start_index,
-                                                              index_in_tag_end=word_start_index + len(name),
-                                                              expected_parent_tag=self.all_tags[name].name)
-                            return None, None, error
-                        word_start_index += len(name) + 1
+                    # This will raise _TagIdentifyError on any issues
+                    self._validate_remaining_terms(tag, working_tag, prefix_tag_adj, current_slash_index)
                 break
 
             current_entry = parent_entry
             current_slash_index = next_index
             if next_index == len(working_tag):
                 break
-            continue
-
-        remainder = None
-        if current_slash_index != -1:
-            remainder = clean_tag[current_slash_index:]
-        if remainder and current_entry.takes_value_child_entry:
-            current_entry = current_entry.takes_value_child_entry
-        found_entry = current_entry
 
-        return found_entry, remainder, []
+        return current_entry, current_slash_index
+
+    def _validate_remaining_terms(self, tag, working_tag, prefix_tag_adj, current_slash_index):
+        """ Validates the terms past current_slash_index.
+        
+        :raises _TagIdentifyError:
+            - One of the extension terms already exists as a schema term.
+        """
+        child_names = working_tag[current_slash_index + 1:].split("/")
+        word_start_index = current_slash_index + 1 + prefix_tag_adj
+        for name in child_names:
+            if self._get_tag_entry(name):
+                error = ErrorHandler.format_error(ValidationErrors.INVALID_PARENT_NODE,
+                                                  tag,
+                                                  index_in_tag=word_start_index,
+                                                  index_in_tag_end=word_start_index + len(name),
+                                                  expected_parent_tag=self.all_tags[name].name)
+                raise self._TagIdentifyError(error)
+            word_start_index += len(name) + 1
 
     # ===============================================
     # Semi-private creation finalizing functions
@@ -801,3 +818,8 @@ def _add_tag_to_dict(self, long_tag_name, new_entry, key_class):
     def _create_tag_entry(self, long_tag_name, key_class):
         section = self._sections[key_class]
         return section._create_tag_entry(long_tag_name)
+
+    class _TagIdentifyError(Exception):
+        """Used internally to note when a tag cannot be identified."""
+        def __init__(self, issue):
+            self.issue = issue
diff --git a/hed/schema/schema_attribute_validators.py b/hed/schema/schema_attribute_validators.py
@@ -0,0 +1,81 @@
+"""The built-in functions to validate known attributes.
+
+Template for the functions:
+attribute_checker_template(hed_schema, tag_entry, attribute_name):
+    hed_schema (HedSchema): The schema to use for validation
+    tag_entry (HedSchemaEntry): The schema entry for this tag.
+    attribute_name (str): The name of this attribute
+Returns:
+    list: A list of issues. Each issue is a dictionary.
+"""
+
+from hed.errors.error_types import SchemaWarnings, ValidationErrors
+from hed.errors.error_reporter import ErrorHandler
+from hed.schema.hed_schema import HedSchema
+
+
+def tag_is_placeholder_check(hed_schema, tag_entry, attribute_name):
+    """ Check that the tag entry is a placeholder (its name ends with '/#').
+
+    Parameters:
+        hed_schema (HedSchema): The schema to use for validation
+        tag_entry (HedSchemaEntry): The schema entry for this tag.
+        attribute_name (str): The name of this attribute
+
+    Returns:
+        list: A list of issues. Each issue is a dictionary.
+
+    """
+    issues = []
+    if not tag_entry.name.endswith("/#"):
+        issues += ErrorHandler.format_error(SchemaWarnings.NON_PLACEHOLDER_HAS_CLASS, tag_entry.name,
+                                            attribute_name)
+
+    return issues
+
+
+def tag_exists_check(hed_schema, tag_entry, attribute_name):
+    """ Check that every tag in the comma-separated attribute value exists in the schema.
+
+    Parameters:
+        hed_schema (HedSchema): The schema to use for validation
+        tag_entry (HedSchemaEntry): The schema entry for this tag.
+        attribute_name (str): The name of this attribute
+
+    Returns:
+        list: A list of issues. Each issue is a dictionary.
+
+    """
+    issues = []
+    possible_tags = tag_entry.attributes.get(attribute_name, "")
+    split_tags = possible_tags.split(",")
+    for org_tag in split_tags:
+        if org_tag and org_tag not in hed_schema.all_tags:
+            issues += ErrorHandler.format_error(ValidationErrors.NO_VALID_TAG_FOUND,
+                                                org_tag,
+                                                index_in_tag=0,
+                                                index_in_tag_end=len(org_tag))
+
+    return issues
+
+
+def tag_exists_base_schema_check(hed_schema, tag_entry, attribute_name):
+    """ Check if the single tag is a partnered schema tag
+
+    Parameters:
+        hed_schema (HedSchema): The schema to use for validation
+        tag_entry (HedSchemaEntry): The schema entry for this tag.
+        attribute_name (str): The name of this attribute
+
+    Returns:
+        list: A list of issues. Each issue is a dictionary.
+    """
+    issues = []
+    rooted_tag = tag_entry.attributes.get(attribute_name, "")
+    if rooted_tag and rooted_tag not in hed_schema.all_tags:
+        issues += ErrorHandler.format_error(ValidationErrors.NO_VALID_TAG_FOUND,
+                                            rooted_tag,
+                                            index_in_tag=0,
+                                            index_in_tag_end=len(rooted_tag))
+
+    return issues