Skip to content

Commit

Permalink
Merge pull request #719 from hed-standard/master
Browse files Browse the repository at this point in the history
Remodeling JSON summaries now have common output format.
  • Loading branch information
VisLab authored Jul 4, 2023
2 parents 9018eca + 96a4d8a commit 57fac3a
Show file tree
Hide file tree
Showing 60 changed files with 1,289 additions and 696 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ jobs:
- uses: actions/cache@v3
with:
path: ${{ env.pythonLocation }}
key: ${{ env.pythonLocation }}-${{ hashFiles('setup.py') }}-${{ hashFiles('dev-requirements.txt') }}
key: ${{ env.pythonLocation }}-${{ hashFiles('setup.py') }}-${{ hashFiles('docs/requirements.txt') }}

- name: Install dependencies
run: |
Expand Down Expand Up @@ -85,7 +85,7 @@ jobs:
- uses: actions/cache@v3
with:
path: ${{ env.pythonLocation }}
key: ${{ env.pythonLocation }}-${{ hashFiles('setup.py') }}-${{ hashFiles('dev-requirements.txt') }}
key: ${{ env.pythonLocation }}-${{ hashFiles('setup.py') }}-${{ hashFiles('docs/requirements.txt') }}

- name: Install dependencies
run: |
Expand Down
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,11 @@
Release 0.3.1 July 3, 2023
- Pinned the version of the pydantic and inflect libraries due to a conflict.
- Reorganized JSON output of remodeling summaries so that all are of a consistent form.
- Fixed summarize_hed_tags_op so that tags were correctly categorized for output.
- Minor refactoring to reduce code complexity.
- BaseInput and Sidecar now raise HedFileError if input could not be read.


Release 0.3.0 June 20, 2023
- Introduction of partnered schema.
- Improved error handling for schema validation.
Expand Down
2 changes: 1 addition & 1 deletion LICENSE
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
MIT License
The MIT License (MIT)

Copyright (c) 2020+ HED Standard Working Group

Expand Down
4 changes: 2 additions & 2 deletions docs/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
defusedxml>=0.7.1
inflect>=6.0.2
myst-parser>=0.18.1
inflect>=6.0.5
numpy>=1.21.6
openpyxl>=3.1.0
pandas>=1.3.5
portalocker>=2.7.0
semantic_version>=2.10.0
Sphinx>=5.2.2
sphinx_rtd_theme>=1.0.0
wordcloud==1.9.2
3 changes: 2 additions & 1 deletion hed/errors/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@


class HedExceptions:
GENERIC_ERROR = 'GENERIC_ERROR'
# A list of all exceptions that can be generated by the hedtools.
FILE_NOT_FOUND = 'fileNotFound'
BAD_PARAMETERS = 'badParameters'
Expand All @@ -10,7 +11,7 @@ class HedExceptions:
INVALID_EXTENSION = 'invalidExtension'

INVALID_DATAFRAME = 'INVALID_DATAFRAME'

INVALID_FILE_FORMAT = 'INVALID_FILE_FORMAT'
# These are actual schema issues, not that the file cannot be found or parsed
SCHEMA_HEADER_MISSING = 'HED_SCHEMA_HEADER_INVALID'
HED_SCHEMA_HEADER_INVALID = 'HED_SCHEMA_HEADER_INVALID'
Expand Down
26 changes: 15 additions & 11 deletions hed/models/base_input.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,12 +43,10 @@ def __init__(self, file, file_type=None, worksheet_name=None, has_column_names=T
- An invalid dataframe was passed with size 0
- An invalid extension was provided
- A duplicate or empty column name appears
:raises OSError:
- Cannot open the indicated file
:raises KeyError:
- The specified worksheet name does not exist
- If the sidecar file or tabular file had invalid format and could not be read.
"""
if mapper is None:
mapper = ColumnMapper()
Expand Down Expand Up @@ -77,14 +75,20 @@ def __init__(self, file, file_type=None, worksheet_name=None, has_column_names=T
elif not file:
raise HedFileError(HedExceptions.FILE_NOT_FOUND, "Empty file passed to BaseInput.", file)
elif input_type in self.TEXT_EXTENSION:
self._dataframe = pandas.read_csv(file, delimiter='\t', header=pandas_header,
dtype=str, keep_default_na=True, na_values=None)
try:
self._dataframe = pandas.read_csv(file, delimiter='\t', header=pandas_header,
dtype=str, keep_default_na=True, na_values=None)
except Exception as e:
raise HedFileError(HedExceptions.INVALID_FILE_FORMAT, str(e), self.name) from e
# Convert nan values to a known value
self._dataframe = self._dataframe.fillna("n/a")
elif input_type in self.EXCEL_EXTENSION:
self._loaded_workbook = openpyxl.load_workbook(file)
loaded_worksheet = self.get_worksheet(self._worksheet_name)
self._dataframe = self._get_dataframe_from_worksheet(loaded_worksheet, has_column_names)
try:
self._loaded_workbook = openpyxl.load_workbook(file)
loaded_worksheet = self.get_worksheet(self._worksheet_name)
self._dataframe = self._get_dataframe_from_worksheet(loaded_worksheet, has_column_names)
except Exception as e:
raise HedFileError(HedExceptions.GENERIC_ERROR, str(e), self.name) from e
else:
raise HedFileError(HedExceptions.INVALID_EXTENSION, "", file)

Expand All @@ -94,7 +98,7 @@ def __init__(self, file, file_type=None, worksheet_name=None, has_column_names=T
# todo: Can we get rid of this behavior now that we're using pandas?
column_issues = ColumnMapper.check_for_blank_names(self.columns, allow_blank_names=allow_blank_names)
if column_issues:
raise HedFileError(HedExceptions.BAD_COLUMN_NAMES, "Duplicate or blank columns found. See issues.",
raise HedFileError(HedExceptions.BAD_COLUMN_NAMES, "Duplicate or blank columns found. See issues.",
self.name, issues=column_issues)

self.reset_mapper(mapper)
Expand Down Expand Up @@ -285,7 +289,7 @@ def set_cell(self, row_number, column_number, new_string_obj, tag_form="short_ta
Notes:
Any attribute of a HedTag that returns a string is a valid value of tag_form.
:raises ValueError:
- There is not a loaded dataframe
Expand Down
12 changes: 8 additions & 4 deletions hed/models/hed_tag.py
Original file line number Diff line number Diff line change
Expand Up @@ -602,10 +602,14 @@ def _get_tag_units_portion(self, tag_unit_classes):
@staticmethod
def _find_modifier_unit_entry(units, all_valid_unit_permutations):
possible_match = all_valid_unit_permutations.get(units)
if not possible_match or not possible_match.has_attribute(HedKey.UnitSymbol):
possible_match = all_valid_unit_permutations.get(units.lower())
if possible_match and possible_match.has_attribute(HedKey.UnitSymbol):
possible_match = None
# If we have a match that's a unit symbol, we're done, return it.
if possible_match and possible_match.has_attribute(HedKey.UnitSymbol):
return possible_match

possible_match = all_valid_unit_permutations.get(units.lower())
# Unit symbols must match including case, a match of a unit symbol now is something like M becoming m.
if possible_match and possible_match.has_attribute(HedKey.UnitSymbol):
possible_match = None

return possible_match

Expand Down
15 changes: 6 additions & 9 deletions hed/models/sidecar.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,15 +127,13 @@ def load_sidecar_file(self, file):
if not file:
return {}
elif isinstance(file, str):
if not self.name:
self.name = file
try:
with open(file, "r") as fp:
if not self.name:
self.name = file
return self._load_json_file(fp)
except FileNotFoundError as e:
raise HedFileError(HedExceptions.FILE_NOT_FOUND, e.strerror, file)
except TypeError as e:
raise HedFileError(HedExceptions.FILE_NOT_FOUND, str(e), file)
except OSError as e:
raise HedFileError(HedExceptions.FILE_NOT_FOUND, e.strerror, file) from e
else:
return self._load_json_file(file)

Expand Down Expand Up @@ -189,12 +187,11 @@ def _load_json_file(self, fp):
:raises HedFileError:
- If the file cannot be parsed.
"""
try:
return json.load(fp)
except json.decoder.JSONDecodeError as e:
raise HedFileError(HedExceptions.CANNOT_PARSE_JSON, str(e), self.name)
except (json.decoder.JSONDecodeError, AttributeError) as e:
raise HedFileError(HedExceptions.CANNOT_PARSE_JSON, str(e), self.name) from e

def extract_definitions(self, hed_schema=None, error_handler=None):
""" Gather and validate definitions in metadata.
Expand Down
4 changes: 2 additions & 2 deletions hed/models/tabular_input.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@ def __init__(self, file=None, sidecar=None, name=None):
""" Constructor for the TabularInput class.
Parameters:
file (str or file like): A tsv file to open.
sidecar (str or Sidecar): A Sidecar filename or Sidecar
file (str or FileLike): A tsv file to open.
sidecar (str or Sidecar or FileLike): A Sidecar or source file/filename.
name (str): The name to display for this file for error purposes.
:raises HedFileError:
Expand Down
68 changes: 45 additions & 23 deletions hed/schema/hed_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -509,7 +509,6 @@ def _find_tag_entry(self, tag, schema_namespace=""):
clean_tag = str(tag)
namespace = schema_namespace
clean_tag = clean_tag[len(namespace):]
prefix_tag_adj = len(namespace)
working_tag = clean_tag.lower()

# Most tags are in the schema directly, so test that first
Expand All @@ -523,9 +522,26 @@ def _find_tag_entry(self, tag, schema_namespace=""):

return found_entry, remainder, []

prefix_tag_adj = len(namespace)

try:
found_entry, current_slash_index = self._find_tag_subfunction(tag, working_tag, prefix_tag_adj)
except self._TagIdentifyError as e:
issue = e.issue
return None, None, issue

remainder = None
if current_slash_index != -1:
remainder = clean_tag[current_slash_index:]
if remainder and found_entry.takes_value_child_entry:
found_entry = found_entry.takes_value_child_entry

return found_entry, remainder, []

def _find_tag_subfunction(self, tag, working_tag, prefix_tag_adj):
"""Finds the base tag and remainder from the left, raising exception on issues"""
current_slash_index = -1
current_entry = None

# Loop left to right, checking each word. Once we find an invalid word, we stop.
while True:
next_index = working_tag.find("/", current_slash_index + 1)
Expand All @@ -541,36 +557,37 @@ def _find_tag_entry(self, tag, schema_namespace=""):
tag,
index_in_tag=prefix_tag_adj,
index_in_tag_end=prefix_tag_adj + next_index)
return None, None, error
raise self._TagIdentifyError(error)
# If this is not a takes value node, validate each term in the remainder.
if not current_entry.takes_value_child_entry:
child_names = working_tag[current_slash_index + 1:].split("/")
word_start_index = current_slash_index + 1 + prefix_tag_adj
for name in child_names:
if self._get_tag_entry(name):
error = ErrorHandler.format_error(ValidationErrors.INVALID_PARENT_NODE,
tag,
index_in_tag=word_start_index,
index_in_tag_end=word_start_index + len(name),
expected_parent_tag=self.all_tags[name].name)
return None, None, error
word_start_index += len(name) + 1
# This will raise _TagIdentifyError on any issues
self._validate_remaining_terms(tag, working_tag, prefix_tag_adj, current_slash_index)
break

current_entry = parent_entry
current_slash_index = next_index
if next_index == len(working_tag):
break
continue

remainder = None
if current_slash_index != -1:
remainder = clean_tag[current_slash_index:]
if remainder and current_entry.takes_value_child_entry:
current_entry = current_entry.takes_value_child_entry
found_entry = current_entry

return found_entry, remainder, []
return current_entry, current_slash_index

def _validate_remaining_terms(self, tag, working_tag, prefix_tag_adj, current_slash_index):
""" Validates the terms past current_slash_index.
:raises _TagIdentifyError:
- One of the extension terms already exists as a schema term.
"""
child_names = working_tag[current_slash_index + 1:].split("/")
word_start_index = current_slash_index + 1 + prefix_tag_adj
for name in child_names:
if self._get_tag_entry(name):
error = ErrorHandler.format_error(ValidationErrors.INVALID_PARENT_NODE,
tag,
index_in_tag=word_start_index,
index_in_tag_end=word_start_index + len(name),
expected_parent_tag=self.all_tags[name].name)
raise self._TagIdentifyError(error)
word_start_index += len(name) + 1

# ===============================================
# Semi-private creation finalizing functions
Expand Down Expand Up @@ -801,3 +818,8 @@ def _add_tag_to_dict(self, long_tag_name, new_entry, key_class):
def _create_tag_entry(self, long_tag_name, key_class):
section = self._sections[key_class]
return section._create_tag_entry(long_tag_name)

class _TagIdentifyError(Exception):
"""Used internally to note when a tag cannot be identified."""
def __init__(self, issue):
self.issue = issue
81 changes: 81 additions & 0 deletions hed/schema/schema_attribute_validators.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
"""The built-in functions to validate known attributes.

Template for the functions:
    attribute_checker_template(hed_schema, tag_entry, attribute_name):
        hed_schema (HedSchema): The schema to use for validation.
        tag_entry (HedSchemaEntry): The schema entry for this tag.
        attribute_name (str): The name of this attribute.

    Returns:
        list: A list of issues. Each issue is a dictionary.
"""

from hed.errors.error_types import SchemaWarnings, ValidationErrors
from hed.errors.error_reporter import ErrorHandler
from hed.schema.hed_schema import HedSchema


def tag_is_placeholder_check(hed_schema, tag_entry, attribute_name):
    """ Check that the tag carrying this attribute is a placeholder (its name ends with "/#").

    Parameters:
        hed_schema (HedSchema): The schema to use for validation (not consulted by this check).
        tag_entry (HedSchemaEntry): The schema entry for this tag.
        attribute_name (str): The name of this attribute.

    Returns:
        list: A list of issues. Each issue is a dictionary.
    """
    if tag_entry.name.endswith("/#"):
        return []

    # The attribute sits on a tag that is not a placeholder: report it.
    return list(ErrorHandler.format_error(SchemaWarnings.NON_PLACEHOLDER_HAS_CLASS, tag_entry.name,
                                          attribute_name))


def tag_exists_check(hed_schema, tag_entry, attribute_name):
    """ Check that every tag named in the attribute's comma-separated value exists in the schema.

    Parameters:
        hed_schema (HedSchema): The schema to use for validation.
        tag_entry (HedSchemaEntry): The schema entry for this tag.
        attribute_name (str): The name of this attribute.

    Returns:
        list: A list of issues. Each issue is a dictionary.
    """
    issues = []
    attribute_value = tag_entry.attributes.get(attribute_name, "")
    for candidate in attribute_value.split(","):
        # Empty pieces (missing attribute, trailing comma) are skipped, as are known tags.
        if not candidate or candidate in hed_schema.all_tags:
            continue
        issues.extend(ErrorHandler.format_error(ValidationErrors.NO_VALID_TAG_FOUND,
                                                candidate,
                                                index_in_tag=0,
                                                index_in_tag_end=len(candidate)))

    return issues


def tag_exists_base_schema_check(hed_schema, tag_entry, attribute_name):
    """ Check that the single tag named by the attribute's value exists in the schema.

    Parameters:
        hed_schema (HedSchema): The schema to use for validation.
        tag_entry (HedSchemaEntry): The schema entry for this tag.
        attribute_name (str): The name of this attribute.

    Returns:
        list: A list of issues. Each issue is a dictionary.
    """
    rooted_tag = tag_entry.attributes.get(attribute_name, "")
    # Nothing to report when the attribute is absent/empty or the tag is known.
    if not rooted_tag or rooted_tag in hed_schema.all_tags:
        return []

    return list(ErrorHandler.format_error(ValidationErrors.NO_VALID_TAG_FOUND,
                                          rooted_tag,
                                          index_in_tag=0,
                                          index_in_tag_end=len(rooted_tag)))
Loading

0 comments on commit 57fac3a

Please sign in to comment.