From 1bf129a51fbdd51de0893e45c936860e43dab8b6 Mon Sep 17 00:00:00 2001 From: IanCa Date: Tue, 7 May 2024 14:24:33 -0500 Subject: [PATCH 1/3] Second pass dataframe/omn schema handling --- hed/models/tabular_input.py | 1 + hed/schema/hed_schema.py | 146 ++++------- hed/schema/hed_schema_constants.py | 26 +- hed/schema/hed_schema_df_constants.py | 75 +++++- hed/schema/hed_schema_entry.py | 6 +- hed/schema/hed_schema_io.py | 265 ++++++++++--------- hed/schema/hed_schema_section.py | 23 +- hed/schema/schema_compliance.py | 14 +- hed/schema/schema_header_util.py | 2 + hed/schema/schema_io/base2schema.py | 93 +------ hed/schema/schema_io/df2schema.py | 315 +++++++---------------- hed/schema/schema_io/ontology_util.py | 351 ++++++++++++++++++++++++++ hed/schema/schema_io/owl2schema.py | 285 --------------------- hed/schema/schema_io/owl_constants.py | 50 ---- hed/schema/schema_io/schema2base.py | 49 ++-- hed/schema/schema_io/schema2df.py | 336 ++++++++++++++++++++---- hed/schema/schema_io/schema2owl.py | 313 ----------------------- hed/schema/schema_io/schema2wiki.py | 8 +- hed/schema/schema_io/schema2xml.py | 10 +- hed/schema/schema_io/schema_util.py | 15 +- hed/schema/schema_io/text_util.py | 71 ++++++ hed/schema/schema_io/wiki2schema.py | 13 +- tests/schema/test_hed_schema_io_df.py | 21 +- tests/schema/test_ontology_util.py | 157 ++++++++++++ 24 files changed, 1356 insertions(+), 1289 deletions(-) create mode 100644 hed/schema/schema_io/ontology_util.py delete mode 100644 hed/schema/schema_io/owl2schema.py delete mode 100644 hed/schema/schema_io/owl_constants.py delete mode 100644 hed/schema/schema_io/schema2owl.py create mode 100644 hed/schema/schema_io/text_util.py create mode 100644 tests/schema/test_ontology_util.py diff --git a/hed/models/tabular_input.py b/hed/models/tabular_input.py index 6ff9ce8a..f9724a01 100644 --- a/hed/models/tabular_input.py +++ b/hed/models/tabular_input.py @@ -52,6 +52,7 @@ def reset_column_mapper(self, sidecar=None): """ new_mapper = ColumnMapper(sidecar=sidecar, optional_tag_columns=[self.HED_COLUMN_NAME]) + self._sidecar = sidecar self.reset_mapper(new_mapper) diff --git a/hed/schema/hed_schema.py b/hed/schema/hed_schema.py index fa22dc69..b82b87bc 100644 --- a/hed/schema/hed_schema.py +++ b/hed/schema/hed_schema.py @@ -1,15 +1,14 @@ import json -import os -from hed.schema.hed_schema_constants import HedKey, HedSectionKey, HedKey83 +from hed.schema.hed_schema_constants import HedKey, HedSectionKey, HedKeyOld from hed.schema import hed_schema_constants as constants from hed.schema.schema_io import schema_util from hed.schema.schema_io.schema2xml import Schema2XML from hed.schema.schema_io.schema2wiki import Schema2Wiki from hed.schema.schema_io.schema2df import Schema2DF +from hed.schema.schema_io import ontology_util + -# from hed.schema.schema_io.schema2owl import Schema2Owl -# from hed.schema.schema_io.owl_constants import ext_to_format from hed.schema.hed_schema_section import (HedSchemaSection, HedSchemaTagSection, HedSchemaUnitClassSection, HedSchemaUnitSection) from hed.errors import ErrorHandler @@ -245,29 +244,9 @@ def get_as_mediawiki_string(self, save_merged=False): str: The schema as a string in mediawiki format. """ - output_strings = Schema2Wiki.process_schema(self, save_merged) + output_strings = Schema2Wiki().process_schema(self, save_merged) return '\n'.join(output_strings) - # def get_as_owl_string(self, save_merged=False, file_format="owl"): - # """ Return the schema to a mediawiki string. - # - # Parameters: - # save_merged (bool): If True, this will save the schema as a merged schema if it is a "withStandard" schema. - # If it is not a "withStandard" schema, this setting has no effect. - # file_format(str or None): Override format from filename extension. - # Accepts any value rdflib accepts(We fully support "turtle", "xml"("owl" also accepted) and "json-ld"). - # Other values should work, but aren't as fully supported. - # Returns: - # str: The schema as a string in mediawiki format. - # - # :raises rdflib.plugin.PluginException: - # - Invalid format of file_format. Make sure you use a supported RDF format. - # """ - # if file_format == "owl": - # file_format = "xml" - # rdf_data = Schema2Owl.process_schema(self, save_merged) - # return rdf_data.serialize(format=file_format) - def get_as_xml_string(self, save_merged=True): """ Return the schema to an XML string. @@ -279,32 +258,27 @@ def get_as_xml_string(self, save_merged=True): str: Return the schema as an XML string. """ - xml_tree = Schema2XML.process_schema(self, save_merged) + xml_tree = Schema2XML().process_schema(self, save_merged) return schema_util.xml_element_2_str(xml_tree) - def save_as_mediawiki(self, filename, save_merged=False): - """ Save as mediawiki to a file. + def get_as_dataframes(self, save_merged=False): + """ Get a dict of dataframes representing this file - filename: str - save location save_merged: bool If True, this will save the schema as a merged schema if it is a "withStandard" schema. If it is not a "withStandard" schema, this setting has no effect. - :raises OSError: - - File cannot be saved for some reason. + Returns: + dataframes(dict): a dict of dataframes you can load as a schema """ - output_strings = Schema2Wiki.process_schema(self, save_merged) - with open(filename, mode='w', encoding='utf-8') as opened_file: - for string in output_strings: - opened_file.write(string) - opened_file.write('\n') + output_dfs = Schema2DF().process_schema(self, save_merged) + return output_dfs - def save_as_dataframes(self, base_filename, save_merged=False): + def save_as_mediawiki(self, filename, save_merged=False): """ Save as mediawiki to a file. - base_filename: str - save filename. A suffix will be added to most, e.g. _Tag + filename: str + save location save_merged: bool If True, this will save the schema as a merged schema if it is a "withStandard" schema. If it is not a "withStandard" schema, this setting has no effect. @@ -312,39 +286,11 @@ def save_as_dataframes(self, base_filename, save_merged=False): :raises OSError: - File cannot be saved for some reason. """ - output_dfs = Schema2DF.process_schema(self, save_merged) - base, base_ext = os.path.splitext(base_filename) - for suffix, dataframe in output_dfs.items(): - filename = f"{base}_{suffix}.tsv" - with open(filename, mode='w', encoding='utf-8') as opened_file: - dataframe.to_csv(opened_file, sep='\t', index=False, header=True) - - # def save_as_owl(self, filename, save_merged=False, file_format=None): - # """ Save as json to a file. - # - # filename: str - # Save the file here - # save_merged: bool - # If True, this will save the schema as a merged schema if it is a "withStandard" schema. - # If it is not a "withStandard" schema, this setting has no effect. - # file_format(str or None): Required for owl formatted files other than the following: - # .ttl: turtle - # .owl: xml - # .json-ld: json-ld - # - # :raises OSError: - # - File cannot be saved for some reason - # - # :raises rdflib.plugin.PluginException: - # - Invalid format of file_format. Make sure you use a supported RDF format. - # """ - # ext = os.path.splitext(filename.lower())[1] - # if ext in ext_to_format and file_format is None: - # file_format = ext_to_format[ext] - # if file_format == "owl": - # file_format = "xml" - # rdf_data = Schema2Owl.process_schema(self, save_merged) - # rdf_data.serialize(filename, format=file_format) + output_strings = Schema2Wiki().process_schema(self, save_merged) + with open(filename, mode='w', encoding='utf-8') as opened_file: + for string in output_strings: + opened_file.write(string) + opened_file.write('\n') def save_as_xml(self, filename, save_merged=True): """ Save as XML to a file. @@ -358,11 +304,26 @@ def save_as_xml(self, filename, save_merged=True): :raises OSError: - File cannot be saved for some reason """ - xml_tree = Schema2XML.process_schema(self, save_merged) + xml_tree = Schema2XML().process_schema(self, save_merged) with open(filename, mode='w', encoding='utf-8') as opened_file: xml_string = schema_util.xml_element_2_str(xml_tree) opened_file.write(xml_string) + def save_as_dataframes(self, base_filename, save_merged=False): + """ Save as mediawiki to a file. + + base_filename: str + save filename. A suffix will be added to most, e.g. _Tag + save_merged: bool + If True, this will save the schema as a merged schema if it is a "withStandard" schema. + If it is not a "withStandard" schema, this setting has no effect. + + :raises OSError: + - File cannot be saved for some reason. + """ + output_dfs = Schema2DF().process_schema(self, save_merged) + ontology_util.save_dataframes(base_filename, output_dfs) + def set_schema_prefix(self, schema_namespace): """ Set library namespace associated for this schema. @@ -664,10 +625,10 @@ def get_tag_attribute_names_old(self): """ return {tag_entry.name: tag_entry for tag_entry in self._sections[HedSectionKey.Attributes].values() - if not tag_entry.has_attribute(HedKey.UnitClassProperty) - and not tag_entry.has_attribute(HedKey.UnitProperty) - and not tag_entry.has_attribute(HedKey.UnitModifierProperty) - and not tag_entry.has_attribute(HedKey.ValueClassProperty)} + if not tag_entry.has_attribute(HedKeyOld.UnitClassProperty) + and not tag_entry.has_attribute(HedKeyOld.UnitProperty) + and not tag_entry.has_attribute(HedKeyOld.UnitModifierProperty) + and not tag_entry.has_attribute(HedKeyOld.ValueClassProperty)} # =============================================== # Private utility functions @@ -693,19 +654,14 @@ def _get_modifiers_for_unit(self, unit): unit (str): A known unit. Returns: - list: List of HedSchemaEntry. + derived_unit_list(list of HedSchemaEntry): The derived units for this unit Notes: This is a lower level one that doesn't rely on the Unit entries being fully setup. - """ - # todo: could refactor this so this unit.casefold() part is in HedSchemaUnitSection.get unit_entry = self.get_tag_entry(unit, HedSectionKey.Units) if unit_entry is None: - unit_entry = self.get_tag_entry(unit.casefold(), HedSectionKey.Units) - # Unit symbols must match exactly - if unit_entry is None or unit_entry.has_attribute(HedKey.UnitSymbol): - return [] + return [] is_si_unit = unit_entry.has_attribute(HedKey.SIUnit) is_unit_symbol = unit_entry.has_attribute(HedKey.UnitSymbol) if not is_si_unit: @@ -732,7 +688,7 @@ def _get_attributes_for_section(self, key_class): Returns: dict: A dict of all the attributes for this section. """ - element_prop_key = HedKey83.ElementDomain if self.schema_83_props else HedKey.ElementProperty + element_prop_key = HedKey.ElementDomain if self.schema_83_props else HedKeyOld.ElementProperty # Common logic for Attributes and Properties if key_class in [HedSectionKey.Attributes, HedSectionKey.Properties]: @@ -744,18 +700,18 @@ def _get_attributes_for_section(self, key_class): if self.schema_83_props: attrib_classes = { - HedSectionKey.UnitClasses: HedKey83.UnitClassDomain, - HedSectionKey.Units: HedKey83.UnitDomain, - HedSectionKey.UnitModifiers: HedKey83.UnitModifierDomain, - HedSectionKey.ValueClasses: HedKey83.ValueClassDomain, - HedSectionKey.Tags: HedKey83.TagDomain + HedSectionKey.UnitClasses: HedKey.UnitClassDomain, + HedSectionKey.Units: HedKey.UnitDomain, + HedSectionKey.UnitModifiers: HedKey.UnitModifierDomain, + HedSectionKey.ValueClasses: HedKey.ValueClassDomain, + HedSectionKey.Tags: HedKey.TagDomain } else: attrib_classes = { - HedSectionKey.UnitClasses: HedKey.UnitClassProperty, - HedSectionKey.Units: HedKey.UnitProperty, - HedSectionKey.UnitModifiers: HedKey.UnitModifierProperty, - HedSectionKey.ValueClasses: HedKey.ValueClassProperty + HedSectionKey.UnitClasses: HedKeyOld.UnitClassProperty, + HedSectionKey.Units: HedKeyOld.UnitProperty, + HedSectionKey.UnitModifiers: HedKeyOld.UnitModifierProperty, + HedSectionKey.ValueClasses: HedKeyOld.ValueClassProperty } if key_class == HedSectionKey.Tags: return self.get_tag_attribute_names_old() diff --git a/hed/schema/hed_schema_constants.py b/hed/schema/hed_schema_constants.py index 2153740f..d3711fff 100644 --- a/hed/schema/hed_schema_constants.py +++ b/hed/schema/hed_schema_constants.py @@ -61,18 +61,6 @@ class HedKey: InLibrary = "inLibrary" HedID = 'hedId' - # All known properties - BoolProperty = 'boolProperty' - UnitClassProperty = 'unitClassProperty' - UnitProperty = 'unitProperty' - UnitModifierProperty = 'unitModifierProperty' - ValueClassProperty = 'valueClassProperty' - ElementProperty = 'elementProperty' - NodeProperty = 'nodeProperty' - IsInheritedProperty = 'isInheritedProperty' - - -class HedKey83: UnitClassDomain = "unitClassDomain" UnitDomain = "unitDomain" UnitModifierDomain = "unitModifierDomain" @@ -82,8 +70,6 @@ class HedKey83: AnnotationProperty = "annotationProperty" BoolRange = "boolRange" - - # Fully new below this TagRange = "tagRange" NumericRange = "numericRange" StringRange = "stringRange" @@ -92,6 +78,18 @@ class HedKey83: ValueClassRange = "valueClassRange" +class HedKeyOld: + # Fully Deprecated properties + BoolProperty = 'boolProperty' + UnitClassProperty = 'unitClassProperty' + UnitProperty = 'unitProperty' + UnitModifierProperty = 'unitModifierProperty' + ValueClassProperty = 'valueClassProperty' + ElementProperty = 'elementProperty' + NodeProperty = 'nodeProperty' + IsInheritedProperty = 'isInheritedProperty' + + VERSION_ATTRIBUTE = 'version' LIBRARY_ATTRIBUTE = 'library' WITH_STANDARD_ATTRIBUTE = "withStandard" diff --git a/hed/schema/hed_schema_df_constants.py b/hed/schema/hed_schema_df_constants.py index 6114160d..8642d914 100644 --- a/hed/schema/hed_schema_df_constants.py +++ b/hed/schema/hed_schema_df_constants.py @@ -1,7 +1,78 @@ +from hed.schema.hed_schema_constants import HedSectionKey + # Known tsv format suffixes STRUCT_KEY = "Structure" TAG_KEY = "Tag" +UNIT_KEY = "Unit" +UNIT_CLASS_KEY = "UnitClass" +UNIT_MODIFIER_KEY = "UnitModifier" +VALUE_CLASS_KEY = "ValueClass" + +ANNOTATION_KEY = "AnnotationProperty" +DATA_KEY = "DataProperty" +OBJECT_KEY = "ObjectProperty" + +ATTRIBUTE_PROPERTY_KEY = "AttributeProperty" + +PROPERTY_KEYS = [ANNOTATION_KEY, DATA_KEY, OBJECT_KEY] +DF_SUFFIXES = {TAG_KEY, STRUCT_KEY, VALUE_CLASS_KEY, + UNIT_CLASS_KEY, UNIT_KEY, UNIT_MODIFIER_KEY, + *PROPERTY_KEYS, ATTRIBUTE_PROPERTY_KEY} + +section_mapping = { + STRUCT_KEY: None, + TAG_KEY: HedSectionKey.Tags, + VALUE_CLASS_KEY: HedSectionKey.ValueClasses, + UNIT_CLASS_KEY: HedSectionKey.UnitClasses, + UNIT_KEY: HedSectionKey.Units, + UNIT_MODIFIER_KEY: HedSectionKey.UnitModifiers, + ANNOTATION_KEY: HedSectionKey.Attributes, + DATA_KEY: HedSectionKey.Attributes, + OBJECT_KEY: HedSectionKey.Attributes, + ATTRIBUTE_PROPERTY_KEY: HedSectionKey.Properties, +} + +# Spreadsheet column ids +hed_id = "hedId" +level = "Level" +name = "rdfs:label" +subclass_of = "omn:SubClassOf" +attributes = "Attributes" +description = "dc:description" +equivalent_to = "owm:EquivalentTo" +has_unit_class = "hasUnitClass" + +struct_columns = [hed_id, name, attributes, subclass_of, description] +tag_columns = [hed_id, level, name, subclass_of, attributes, description, equivalent_to] +unit_columns = [hed_id, name, subclass_of, has_unit_class, attributes, description, equivalent_to] + +# The columns for unit class, value class, and unit modifier +other_columns = [hed_id, name, subclass_of, attributes, description, equivalent_to] + +# for schema attributes +property_type = "Type" +property_domain = "omn:Domain" +property_range = "omn:Range" +properties = "Properties" +property_columns = [hed_id, name, property_type, property_domain, property_range, properties, description] + +# For the schema properties +property_columns_reduced = [hed_id, name, property_type, description] + +# HED_00X__YY where X is the library starting index, and Y is the entity number below. +struct_base_ids = { + "HedEntity": 1, + "HedStructure": 2, + "HedElement": 3, + "HedSchema": 4, + "HedTag": 5, + "HedUnitClass": 6, + "HedUnit": 7, + "HedUnitModifier": 8, + "HedValueClass": 9, + "HedHeader": 10, + "HedPrologue": 11, + "HedEpilogue": 12 +} -# todo: move more constants up here -hed_id_column = "hedId" diff --git a/hed/schema/hed_schema_entry.py b/hed/schema/hed_schema_entry.py index 467a06ea..8e0f3e7a 100644 --- a/hed/schema/hed_schema_entry.py +++ b/hed/schema/hed_schema_entry.py @@ -168,8 +168,9 @@ def finalize_entry(self, schema): schema (HedSchema): The object with the schema rules. """ - self.units = {unit_entry.name: unit_entry for unit_entry in self._units} + for unit_entry in self.units.values(): + unit_entry.unit_class_entry = self derivative_units = {} for unit_entry in self.units.values(): derivative_units.update({key: unit_entry for key in unit_entry.derivative_units.keys()}) @@ -209,9 +210,9 @@ class UnitEntry(HedSchemaEntry): """ A single unit entry with modifiers in the HedSchema. """ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self.unit_class_name = None self.unit_modifiers = [] self.derivative_units = {} + self.unit_class_entry = None def finalize_entry(self, schema): """ Called once after loading to set internal state. @@ -221,7 +222,6 @@ def finalize_entry(self, schema): """ self.unit_modifiers = schema._get_modifiers_for_unit(self.name) - derivative_units = {} if self.has_attribute(HedKey.UnitSymbol): base_plural_units = {self.name} diff --git a/hed/schema/hed_schema_io.py b/hed/schema/hed_schema_io.py index a0e09df8..04661a80 100644 --- a/hed/schema/hed_schema_io.py +++ b/hed/schema/hed_schema_io.py @@ -6,7 +6,6 @@ from hed.schema.schema_io.xml2schema import SchemaLoaderXML from hed.schema.schema_io.wiki2schema import SchemaLoaderWiki from hed.schema.schema_io.df2schema import SchemaLoaderDF -# from hed.schema.schema_io.owl2schema import SchemaLoaderOWL from hed.schema import hed_cache from hed.errors.exceptions import HedFileError, HedExceptions @@ -14,71 +13,58 @@ from hed.schema.hed_schema_group import HedSchemaGroup from hed.schema.schema_header_util import validate_version_string from collections import defaultdict -# from hed.schema.schema_io.owl_constants import ext_to_format from urllib.error import URLError MAX_MEMORY_CACHE = 40 -def from_string(schema_string, schema_format=".xml", schema_namespace=None, schema=None, name=None): - """ Create a schema from the given string. +def load_schema_version(xml_version=None, xml_folder=None): + """ Return a HedSchema or HedSchemaGroup extracted from xml_version Parameters: - schema_string (str or dict): An XML, mediawiki or OWL, file as a single long string - If tsv, Must be a dict of spreadsheets as strings. - schema_format (str): The schema format of the source schema string. - Allowed normal values: .mediawiki, .xml, .tsv - Note: tsv is in progress and has limited features - schema_namespace (str, None): The name_prefix all tags in this schema will accept. - schema(HedSchema or None): A hed schema to merge this new file into - It must be a with-standard schema with the same value. - name(str or None): User supplied identifier for this schema + xml_version (str or list): List or str specifying which official HED schemas to use. + A json str format is also supported, + based on the output of HedSchema.get_formatted_version + Basic format: `[schema_namespace:][library_name_]X.Y.Z`. + xml_folder (str): Path to a folder containing schema. Returns: - (HedSchema): The loaded schema. + HedSchema or HedSchemaGroup: The schema or schema group extracted. :raises HedFileError: - - If empty string or invalid extension is passed. - - Other fatal formatting issues with file - - Notes: - - The loading is determined by file type. - + - The xml_version is not valid. + - The specified version cannot be found or loaded + - Other fatal errors loading the schema (These are unlikely if you are not editing them locally) + - The prefix is invalid """ - if not schema_string: - raise HedFileError(HedExceptions.BAD_PARAMETERS, "Empty string passed to HedSchema.from_string", - filename=name) - - if isinstance(schema_string, str): - # Replace carriage returns with new lines since this might not be done by the caller - schema_string = schema_string.replace("\r\n", "\n") + # Check if we start and end with a square bracket, or double quote. This might be valid json + if xml_version and isinstance(xml_version, str) and \ + ((xml_version[0], xml_version[-1]) in [('[', ']'), ('"', '"')]): + try: + xml_version = json.loads(xml_version) + except json.decoder.JSONDecodeError as e: + raise HedFileError(HedExceptions.CANNOT_PARSE_JSON, str(e), xml_version) from e + if xml_version and isinstance(xml_version, list): + xml_versions = parse_version_list(xml_version) + schemas = [_load_schema_version(xml_version=version, xml_folder=xml_folder) for version in + xml_versions.values()] + if len(schemas) == 1: + return schemas[0] - if schema_format.endswith(".xml"): - hed_schema = SchemaLoaderXML.load(schema_as_string=schema_string, schema=schema, name=name) - elif schema_format.endswith(".mediawiki"): - hed_schema = SchemaLoaderWiki.load(schema_as_string=schema_string, schema=schema, name=name) - elif schema_format.endswith(".tsv"): - if schema is not None: - raise HedFileError(HedExceptions.INVALID_HED_FORMAT, "Cannot pass a schema to merge into spreadsheet loading currently.", filename=name) - hed_schema = SchemaLoaderDF.load_spreadsheet(schema_as_strings=schema_string, name=name) - # elif schema_format: - # hed_schema = SchemaLoaderOWL.load(schema_as_string=schema_string, schema=schema, file_format=schema_format, - # name=name) + name = ",".join([schema.version for schema in schemas]) + return HedSchemaGroup(schemas, name=name) else: - raise HedFileError(HedExceptions.INVALID_EXTENSION, f"Unknown schema extension {schema_format}", filename=name) - - if schema_namespace: - hed_schema.set_schema_prefix(schema_namespace=schema_namespace) - return hed_schema + return _load_schema_version(xml_version=xml_version, xml_folder=xml_folder) def load_schema(hed_path, schema_namespace=None, schema=None, name=None): """ Load a schema from the given file or URL path. Parameters: - hed_path (str or dict): A filepath or url to open a schema from. - If loading a TSV file, this can be a single filename template, or a dict of filenames. - Template: basename.tsv, where files are named basename_Struct.tsv and basename_Tag.tsv + + hed_path (str): A filepath or url to open a schema from. + If loading a TSV file, this should be a single filename where: + Template: basename.tsv, where files are named basename_Struct.tsv, basename_Tag.tsv, etc. schema_namespace (str or None): The name_prefix all tags in this schema will accept. schema(HedSchema or None): A hed schema to merge this new file into It must be a with-standard schema with the same value. @@ -104,10 +90,6 @@ def load_schema(hed_path, schema_namespace=None, schema=None, name=None): except URLError as e: raise HedFileError(HedExceptions.URL_ERROR, str(e), hed_path) from e hed_schema = from_string(file_as_string, schema_format=os.path.splitext(hed_path.lower())[1], name=name) - # elif ext in ext_to_format: - # hed_schema = SchemaLoaderOWL.load(hed_path, schema=schema, file_format=ext_to_format[ext], name=name) - # elif file_format: - # hed_schema = SchemaLoaderOWL.load(hed_path, schema=schema, file_format=file_format, name=name) elif hed_path.lower().endswith(".xml"): hed_schema = SchemaLoaderXML.load(hed_path, schema=schema, name=name) elif hed_path.lower().endswith(".mediawiki"): @@ -126,6 +108,80 @@ def load_schema(hed_path, schema_namespace=None, schema=None, name=None): return hed_schema +def from_string(schema_string, schema_format=".xml", schema_namespace=None, schema=None, name=None): + """ Create a schema from the given string. + + Parameters: + schema_string (str): An XML or mediawiki file as a single long string + schema_format (str): The schema format of the source schema string. + Allowed normal values: .mediawiki, .xml + schema_namespace (str, None): The name_prefix all tags in this schema will accept. + schema(HedSchema or None): A hed schema to merge this new file into + It must be a with-standard schema with the same value. + name(str or None): User supplied identifier for this schema + + Returns: + (HedSchema): The loaded schema. + + :raises HedFileError: + - If empty string or invalid extension is passed. + - Other fatal formatting issues with file + + Notes: + - The loading is determined by file type. + + """ + if not schema_string: + raise HedFileError(HedExceptions.BAD_PARAMETERS, "Empty string passed to HedSchema.from_string", + filename=name) + + if isinstance(schema_string, str): + # Replace carriage returns with new lines since this might not be done by the caller + schema_string = schema_string.replace("\r\n", "\n") + + if schema_format.endswith(".xml"): + hed_schema = SchemaLoaderXML.load(schema_as_string=schema_string, schema=schema, name=name) + elif schema_format.endswith(".mediawiki"): + hed_schema = SchemaLoaderWiki.load(schema_as_string=schema_string, schema=schema, name=name) + else: + raise HedFileError(HedExceptions.INVALID_EXTENSION, f"Unknown schema extension {schema_format}", filename=name) + + if schema_namespace: + hed_schema.set_schema_prefix(schema_namespace=schema_namespace) + return hed_schema + + +def from_dataframes(schema_data, schema_namespace=None, name=None): + """ Create a schema from the given string. + + Parameters: + schema_string (dict): A dict of DF_SUFFIXES:file_as_string_or_df + Should have an entry for all values of DF_SUFFIXES. + schema_namespace (str, None): The name_prefix all tags in this schema will accept. + name(str or None): User supplied identifier for this schema + + Returns: + (HedSchema): The loaded schema. + + :raises HedFileError: + - Empty/invalid parameters + + Notes: + - The loading is determined by file type. + + """ + if not schema_data or not isinstance(schema_data, dict): + raise HedFileError(HedExceptions.BAD_PARAMETERS, "Empty or non dict value passed to HedSchema.from_dataframes", + filename=name) + + hed_schema = SchemaLoaderDF.load_spreadsheet(schema_as_strings_or_df=schema_data, name=name) + + if schema_namespace: + hed_schema.set_schema_prefix(schema_namespace=schema_namespace) + + return hed_schema + + # If this is actually used, we could easily add other versions/update this one def get_hed_xml_version(xml_file_path): """ Get the version number from a HED XML file. @@ -143,6 +199,40 @@ def get_hed_xml_version(xml_file_path): return parser.schema.version +def parse_version_list(xml_version_list): + """Takes a list of xml versions and returns a dictionary split by prefix + + e.g. ["score", "testlib"] will return {"": "score, testlib"} + e.g. ["score", "testlib", "ol:otherlib"] will return {"": "score, testlib", "ol:": "otherlib"} + + Parameters: + xml_version_list (list): List of str specifying which hed schemas to use + + Returns: + HedSchema or HedSchemaGroup: The schema or schema group extracted. + """ + out_versions = defaultdict(list) + for version in xml_version_list: + schema_namespace = "" + if version and ":" in version: + schema_namespace, _, version = version.partition(":") + + if not isinstance(version, str): + raise HedFileError(HedExceptions.SCHEMA_VERSION_INVALID, + f"Must specify a schema version by number, found no version on {xml_version_list} schema.", + filename=None) + if version in out_versions[schema_namespace]: + raise HedFileError(HedExceptions.SCHEMA_DUPLICATE_LIBRARY, + f"Attempting to load the same library '{version}' twice: {out_versions[schema_namespace]}", + filename=None) + out_versions[schema_namespace].append(version) + + out_versions = {key: ",".join(value) if not key else f"{key}:" + ",".join(value) for key, value in + out_versions.items()} + + return out_versions + + @functools.lru_cache(maxsize=MAX_MEMORY_CACHE) def _load_schema_version(xml_version=None, xml_folder=None): """ Return specified version @@ -219,13 +309,13 @@ def _load_schema_version_sub(xml_version, schema_namespace="", xml_folder=None, - The prefix is invalid """ library_name = None - if not xml_version: out_name = schema_namespace if schema_namespace else "standard" raise HedFileError(HedExceptions.SCHEMA_VERSION_INVALID, f"Must specify a schema version by number, found no version on {out_name} schema.", filename=None) + save_version = xml_version if "_" in xml_version: library_name, _, xml_version = xml_version.rpartition("_") @@ -252,7 +342,7 @@ def _load_schema_version_sub(xml_version, schema_namespace="", xml_folder=None, final_hed_xml_file = hed_cache.get_hed_version_path(xml_version, library_name, xml_folder, check_prerelease=True) if not final_hed_xml_file: raise HedFileError(HedExceptions.FILE_NOT_FOUND, - f"HED version '{xml_version}' not found in cache: {hed_cache.get_cache_directory()}", + f"HED version '{save_version}' not found in cache: {hed_cache.get_cache_directory()}", filename=xml_folder) hed_schema = load_schema(final_hed_xml_file, schema=schema, name=name) else: @@ -264,74 +354,3 @@ def _load_schema_version_sub(xml_version, schema_namespace="", xml_folder=None, return hed_schema -def load_schema_version(xml_version=None, xml_folder=None): - """ Return a HedSchema or HedSchemaGroup extracted from xml_version - - Parameters: - xml_version (str or list): List or str specifying which official HED schemas to use. - A json str format is also supported, - based on the output of HedSchema.get_formatted_version - Basic format: `[schema_namespace:][library_name_]X.Y.Z`. - xml_folder (str): Path to a folder containing schema. - - Returns: - HedSchema or HedSchemaGroup: The schema or schema group extracted. - - :raises HedFileError: - - The xml_version is not valid. - - The specified version cannot be found or loaded - - Other fatal errors loading the schema (These are unlikely if you are not editing them locally) - - The prefix is invalid - """ - # Check if we start and end with a square bracket, or double quote. This might be valid json - if xml_version and isinstance(xml_version, str) and \ - ((xml_version[0], xml_version[-1]) in [('[', ']'), ('"', '"')]): - try: - xml_version = json.loads(xml_version) - except json.decoder.JSONDecodeError as e: - raise HedFileError(HedExceptions.CANNOT_PARSE_JSON, str(e), xml_version) from e - if xml_version and isinstance(xml_version, list): - xml_versions = parse_version_list(xml_version) - schemas = [_load_schema_version(xml_version=version, xml_folder=xml_folder) for version in - xml_versions.values()] - if len(schemas) == 1: - return schemas[0] - - name = ",".join([schema.version for schema in schemas]) - return HedSchemaGroup(schemas, name=name) - else: - return _load_schema_version(xml_version=xml_version, xml_folder=xml_folder) - - -def parse_version_list(xml_version_list): - """Takes a list of xml versions and returns a dictionary split by prefix - - e.g. ["score", "testlib"] will return {"": "score, testlib"} - e.g. ["score", "testlib", "ol:otherlib"] will return {"": "score, testlib", "ol:": "otherlib"} - - Parameters: - xml_version_list (list): List of str specifying which hed schemas to use - - Returns: - HedSchema or HedSchemaGroup: The schema or schema group extracted. - """ - out_versions = defaultdict(list) - for version in xml_version_list: - schema_namespace = "" - if version and ":" in version: - schema_namespace, _, version = version.partition(":") - - if not isinstance(version, str): - raise HedFileError(HedExceptions.SCHEMA_VERSION_INVALID, - f"Must specify a schema version by number, found no version on {xml_version_list} schema.", - filename=None) - if version in out_versions[schema_namespace]: - raise HedFileError(HedExceptions.SCHEMA_DUPLICATE_LIBRARY, - f"Attempting to load the same library '{version}' twice: {out_versions[schema_namespace]}", - filename=None) - out_versions[schema_namespace].append(version) - - out_versions = {key: ",".join(value) if not key else f"{key}:" + ",".join(value) for key, value in - out_versions.items()} - - return out_versions diff --git a/hed/schema/hed_schema_section.py b/hed/schema/hed_schema_section.py index f27fe6c1..4923c4ac 100644 --- a/hed/schema/hed_schema_section.py +++ b/hed/schema/hed_schema_section.py @@ -1,5 +1,5 @@ from hed.schema.hed_schema_entry import HedSchemaEntry, UnitClassEntry, UnitEntry, HedTagEntry -from hed.schema.hed_schema_constants import HedSectionKey, HedKey, HedKey83 +from hed.schema.hed_schema_constants import HedSectionKey, HedKey, HedKeyOld entries_by_section = { HedSectionKey.Properties: HedSchemaEntry, @@ -125,9 +125,10 @@ def get(self, key): key (str): The name of the key. """ - if not self.case_sensitive: - key = key.casefold() - return self.all_names.get(key) + try: + return self.__getitem__(key) + except KeyError: + return None def __eq__(self, other): if self.all_names != other.all_names: @@ -156,6 +157,16 @@ def _check_if_duplicate(self, name_key, new_entry): name_key = name_key.casefold() return super()._check_if_duplicate(name_key, new_entry) + def __getitem__(self, key): + """Check the case of the key appropriately for symbols/not symbols, and return the matching entry.""" + unit_entry = self.all_names.get(key) + if unit_entry is None: + unit_entry = self.all_names.get(key.casefold()) + # Unit symbols must match exactly + if unit_entry is None or unit_entry.has_attribute(HedKey.UnitSymbol): + return None + return unit_entry + class HedSchemaUnitClassSection(HedSchemaSection): """The schema section containing unit classes.""" @@ -256,10 +267,10 @@ def _finalize_section(self, hed_schema): attribute_section = hed_schema.attributes if hed_schema.schema_83_props: self.inheritable_attributes = [name for name, value in attribute_section.items() - if not value.has_attribute(HedKey83.AnnotationProperty)] + if not value.has_attribute(HedKey.AnnotationProperty)] else: self.inheritable_attributes = [name for name, value in attribute_section.items() - if value.has_attribute(HedKey.IsInheritedProperty)] + if value.has_attribute(HedKeyOld.IsInheritedProperty)] # Hardcode in extension allowed as it is critical for validation in older schemas if not self.inheritable_attributes: diff --git a/hed/schema/schema_compliance.py b/hed/schema/schema_compliance.py index 33b32409..f16f2002 100644 --- a/hed/schema/schema_compliance.py +++ b/hed/schema/schema_compliance.py @@ -2,7 +2,7 @@ from hed.errors.error_types import ErrorContext, SchemaErrors, ErrorSeverity, SchemaAttributeErrors, SchemaWarnings from hed.errors.error_reporter import ErrorHandler, sort_issues -from hed.schema.hed_schema import HedSchema, HedKey, HedSectionKey, HedKey83 +from hed.schema.hed_schema import HedSchema, HedKey, HedSectionKey from hed.schema import schema_attribute_validators from hed.schema.schema_validation_util import validate_schema_tag_new, validate_schema_term_new, \ get_allowed_characters_by_name, get_problem_indexes, validate_schema_description_new @@ -144,12 +144,12 @@ def check_attributes(self): attribute_entry = self.hed_schema.get_tag_entry(attribute_name, HedSectionKey.Attributes) if attribute_entry: range_validators = { - HedKey83.TagRange: [partial(schema_attribute_validators.item_exists_check, section_key=HedSectionKey.Tags)], - HedKey83.NumericRange: [schema_attribute_validators.is_numeric_value], - HedKey83.StringRange: [], # Unclear what validation should be done here. - HedKey83.UnitClassRange: [partial(schema_attribute_validators.item_exists_check, section_key=HedSectionKey.UnitClasses)], - HedKey83.UnitRange: [schema_attribute_validators.unit_exists], - HedKey83.ValueClassRange: [partial(schema_attribute_validators.item_exists_check, section_key=HedSectionKey.ValueClasses)] + HedKey.TagRange: [partial(schema_attribute_validators.item_exists_check, section_key=HedSectionKey.Tags)], + HedKey.NumericRange: [schema_attribute_validators.is_numeric_value], + HedKey.StringRange: [], # Unclear what validation should be done here. + HedKey.UnitClassRange: [partial(schema_attribute_validators.item_exists_check, section_key=HedSectionKey.UnitClasses)], + HedKey.UnitRange: [schema_attribute_validators.unit_exists], + HedKey.ValueClassRange: [partial(schema_attribute_validators.item_exists_check, section_key=HedSectionKey.ValueClasses)] } for range_attribute in attribute_entry.attributes: validators += range_validators.get(range_attribute, []) diff --git a/hed/schema/schema_header_util.py b/hed/schema/schema_header_util.py index 8902faa2..791d83db 100644 --- a/hed/schema/schema_header_util.py +++ b/hed/schema/schema_header_util.py @@ -95,3 +95,5 @@ def validate_attributes(attrib_dict, name): if constants.VERSION_ATTRIBUTE not in attrib_dict: raise HedFileError(HedExceptions.SCHEMA_VERSION_INVALID, "No version attribute found in header", filename=name) + + diff --git a/hed/schema/schema_io/base2schema.py b/hed/schema/schema_io/base2schema.py index d84d0ac3..bd3da88e 100644 --- a/hed/schema/schema_io/base2schema.py +++ b/hed/schema/schema_io/base2schema.py @@ -1,18 +1,15 @@ import copy -import re +from hed.schema.schema_io import schema_util from hed.errors.exceptions import HedFileError, HedExceptions -from hed.errors.error_types import ErrorContext -from hed.schema import HedSchema, hed_schema_constants as constants + +from hed.schema.hed_schema import HedSchema +from hed.schema import hed_schema_constants as constants from hed.schema.hed_schema_constants import HedKey from abc import abstractmethod, ABC from hed.schema import schema_header_util from hed.schema import hed_schema_constants -# Might need separate version again for wiki -header_attr_expression = "([^ ,]+?)=\"(.*?)\"" -attr_re = re.compile(header_attr_expression) - class SchemaLoader(ABC): """ Baseclass for schema loading, to handle basic errors and partnered schemas @@ -53,7 +50,7 @@ def __init__(self, filename, schema_as_string=None, schema=None, file_format=Non hed_attributes = self._get_header_attributes(self.input_data) schema_header_util.validate_attributes(hed_attributes, name=self.name) - withStandard = hed_attributes.get(hed_schema_constants.WITH_STANDARD_ATTRIBUTE, "") + with_standard = hed_attributes.get(hed_schema_constants.WITH_STANDARD_ATTRIBUTE, "") self.library = hed_attributes.get(hed_schema_constants.LIBRARY_ATTRIBUTE, "") version_number = hed_attributes.get(hed_schema_constants.VERSION_ATTRIBUTE, "") if not schema: @@ -66,9 +63,10 @@ def __init__(self, filename, schema_as_string=None, schema=None, file_format=Non "Trying to load multiple normal schemas as a merged one with the same namespace. " "Ensure schemas have the withStandard header attribute set", self.name) - elif withStandard != self._schema.with_standard: - raise HedFileError(HedExceptions.BAD_WITH_STANDARD_VERSION, - "When merging two schemas without a schema namespace, you they must have the same withStandard value.", self.name) + elif with_standard != self._schema.with_standard: + raise HedFileError(HedExceptions.BAD_WITH_STANDARD_MULTIPLE_VALUES, + "When merging two schemas without a schema namespace, you they must have the same withStandard value.", + self.name) hed_attributes[hed_schema_constants.VERSION_ATTRIBUTE] = self._schema.version_number + f",{version_number}" hed_attributes[hed_schema_constants.LIBRARY_ATTRIBUTE] = self._schema.library + f",{self.library}" if name: @@ -116,7 +114,7 @@ def _load(self): try: base_version = load_schema_version(self._schema.with_standard) except HedFileError as e: - raise HedFileError(HedExceptions.BAD_WITH_STANDARD_VERSION, + raise HedFileError(HedExceptions.BAD_WITH_STANDARD, message=f"Cannot load withStandard schema '{self._schema.with_standard}'", filename=e.filename) # Copy the non-alterable cached schema @@ -151,7 +149,8 @@ def _add_to_dict_base(self, entry, key_class): if not entry.has_attribute(HedKey.InLibrary) and self.appending_to_schema and self._schema.merged: return None - if self.library and (not self._schema.with_standard or (not self._schema.merged and self._schema.with_standard)): + if self.library and ( + not self._schema.with_standard or (not self._schema.merged and self._schema.with_standard)): # only add it if not already present - This is a rare case if not entry.has_attribute(HedKey.InLibrary): entry._set_attribute_value(HedKey.InLibrary, self.library) @@ -214,70 +213,4 @@ def find_rooted_entry(tag_entry, schema, loading_merged): def _add_fatal_error(self, line_number, line, warning_message="Schema term is empty or the line is malformed", error_code=HedExceptions.WIKI_DELIMITERS_INVALID): - self.fatal_errors += self._format_error(line_number, line, warning_message, error_code) - - - @staticmethod - def _format_error(row_number, row, warning_message="Schema term is empty or the line is malformed", - error_code=HedExceptions.GENERIC_ERROR): - error = {'code': error_code, - ErrorContext.ROW: row_number, - ErrorContext.LINE: str(row), - "message": f"{warning_message}" - } - - return [error] - - # Below here are generic string loading functions, used by wiki and spreadsheet formats. - @staticmethod - def _validate_attribute_string(attribute_string): - pattern = r'^[A-Za-z]+(=.+)?$' - match = re.fullmatch(pattern, attribute_string) - if match: - return match.group() - - def _parse_attribute_string(self, row_number, attr_string): - if attr_string: - attributes_split = [x.strip() for x in attr_string.split(',')] - - final_attributes = {} - for attribute in attributes_split: - if self._validate_attribute_string(attribute) is None: - self._add_fatal_error(row_number, attr_string, - f"Malformed attribute found {attribute}. " - f"Valid formatting is: attribute, or attribute=\"value\".") - continue - split_attribute = attribute.split("=") - if len(split_attribute) == 1: - final_attributes[split_attribute[0]] = True - else: - if split_attribute[0] in final_attributes: - final_attributes[split_attribute[0]] += "," + split_attribute[1] - else: - final_attributes[split_attribute[0]] = split_attribute[1] - return final_attributes - else: - return {} - - @staticmethod - def _parse_attributes_line(version_line): - matches = {} - unmatched = [] - last_end = 0 - - for match in attr_re.finditer(version_line): - start, end = match.span() - - # If there's unmatched content between the last match and the current one. - if start > last_end: - unmatched.append(version_line[last_end:start]) - - matches[match.group(1)] = match.group(2) - last_end = end - - # If there's unmatched content after the last match - if last_end < len(version_line): - unmatched.append(version_line[last_end:]) - - unmatched = [m.strip() for m in unmatched if m.strip()] - return matches, unmatched + self.fatal_errors += schema_util.format_error(line_number, line, warning_message, error_code) diff --git a/hed/schema/schema_io/df2schema.py b/hed/schema/schema_io/df2schema.py index 46f8e66d..27eab345 100644 --- a/hed/schema/schema_io/df2schema.py +++ b/hed/schema/schema_io/df2schema.py @@ -4,14 +4,14 @@ import io import os +import hed.schema.schema_io.ontology_util from hed.schema.hed_schema_constants import HedSectionKey, HedKey from hed.errors.exceptions import HedFileError, HedExceptions from .base2schema import SchemaLoader import pandas as pd -from hed.schema.schema_io.schema2df import Schema2DF -from hed.schema.hed_schema_df_constants import * -import copy +import hed.schema.hed_schema_df_constants as constants from hed.errors import error_reporter +from hed.schema.schema_io import text_util class SchemaLoaderDF(SchemaLoader): @@ -22,48 +22,29 @@ class SchemaLoaderDF(SchemaLoader): Note: due to supporting multiple files, this one differs from the other schema loaders """ - def __init__(self, filenames, schema_as_strings, name=""): - from hed.schema.hed_schema_io import load_schema_version - + def __init__(self, filenames, schema_as_strings_or_df, name=""): self.filenames = self.convert_filenames_to_dict(filenames) - self.schema_as_strings = schema_as_strings + self.schema_as_strings_or_df = schema_as_strings_or_df if self.filenames: - reported_filename = self.filenames.get(STRUCT_KEY) + reported_filename = self.filenames.get(constants.STRUCT_KEY) else: reported_filename = "from_strings" super().__init__(reported_filename, None, None, None, name) - # Grab the header attributes we already loaded - save_header = self._schema.header_attributes - # BFK - just load 8.3.0 for the non tag sections - version = save_header.get("withStandard", "8.3.0") - schema = copy.deepcopy(load_schema_version(version)) - - self._schema = schema - self._schema.header_attributes = save_header - - # Blow away tags section if needed. This will eventually be removed once we load all from spreadsheets. - if self._schema.merged or not self._schema.with_standard: - # todo: reset this once we load more from the spreadsheets - clear_sections(schema, [HedSectionKey.Tags]) - # clear_sections(schema, [HedSectionKey.Tags, HedSectionKey.UnitClasses, HedSectionKey.Units, - # HedSectionKey.ValueClasses, HedSectionKey.UnitModifiers, HedSectionKey.Properties, - # HedSectionKey.Attributes]) - self._schema.source_format = "spreadsheet" @classmethod - def load_spreadsheet(cls, filenames=None, schema_as_strings=None, name=""): + def load_spreadsheet(cls, filenames=None, schema_as_strings_or_df=None, name=""): """ Loads and returns the schema, including partnered schema if applicable. Parameters: filenames(str or None or dict of str): A valid set of schema spreadsheet filenames If a single filename string, assumes the standard filename suffixes. - schema_as_strings(None or dict of str): A valid set of schema spreadsheet files(tsv as strings) + schema_as_strings_or_df(None or dict of str): A valid set of schema spreadsheet files(tsv as strings) name (str): what to identify this schema as Returns: schema(HedSchema): The new schema """ - loader = cls(filenames, schema_as_strings=schema_as_strings, name=name) + loader = cls(filenames, schema_as_strings_or_df=schema_as_strings_or_df, name=name) return loader._load() @staticmethod @@ -71,22 +52,21 @@ def convert_filenames_to_dict(filenames): """Infers filename meaning based on suffix, e.g. _Tag for the tags sheet Parameters: - filenames(None or list or dict): The list to convert to a dict + filenames(str or None or list or dict): The list to convert to a dict Returns: filename_dict(str: str): The required suffix to filename mapping""" - needed_suffixes = {TAG_KEY, STRUCT_KEY} result_filenames = {} if isinstance(filenames, str): base, base_ext = os.path.splitext(filenames) - for suffix in needed_suffixes: + for suffix in constants.DF_SUFFIXES: filename = f"{base}_{suffix}.tsv" result_filenames[suffix] = filename filenames = result_filenames elif isinstance(filenames, list): for filename in filenames: remainder, suffix = filename.replace("_", "-").rsplit("-") - for needed_suffix in needed_suffixes: + for needed_suffix in constants.DF_SUFFIXES: if needed_suffix in suffix: result_filenames[needed_suffix] = filename filenames = result_filenames @@ -97,23 +77,32 @@ def _open_file(self): if self.filenames: dataframes = load_dataframes(self.filenames) else: - dataframes = load_dataframes_from_strings(self.schema_as_strings) + dataframes = load_dataframes_from_strings(self.schema_as_strings_or_df) return dataframes def _get_header_attributes(self, file_data): header_attributes = {} - for row_number, row in file_data[STRUCT_KEY].iterrows(): - cls = row["omn:SubClassOf"] - attributes = row["Attributes"] + for row_number, row in file_data[constants.STRUCT_KEY].iterrows(): + cls = row[constants.subclass_of] + attributes = row[constants.attributes] if cls == "HedHeader" and attributes: - header_attributes, _ = self._parse_attributes_line(attributes) + header_attributes, _ = text_util._parse_header_attributes_line(attributes) continue return header_attributes def _parse_data(self): self._schema.prologue, self._schema.epilogue = self._get_prologue_epilogue(self.input_data) + self._schema._initialize_attributes(HedSectionKey.Properties) + self._read_attribute_section(self.input_data[constants.ATTRIBUTE_PROPERTY_KEY], + section_key=HedSectionKey.Properties) + self._read_attributes() + self._read_section(self.input_data[constants.UNIT_MODIFIER_KEY], HedSectionKey.UnitModifiers) + self._read_section(self.input_data[constants.VALUE_CLASS_KEY], HedSectionKey.ValueClasses) + self._read_section(self.input_data[constants.UNIT_CLASS_KEY], HedSectionKey.UnitClasses) + self._read_units(self.input_data[constants.UNIT_KEY]) + # This one is a special case self._read_schema(self.input_data) if self.fatal_errors: self.fatal_errors = error_reporter.sort_issues(self.fatal_errors) @@ -124,9 +113,9 @@ def _parse_data(self): def _get_prologue_epilogue(self, file_data): prologue, epilogue = "", "" - for row_number, row in file_data[STRUCT_KEY].iterrows(): - cls = row["omn:SubClassOf"] - description = row["dc:description"] + for row_number, row in file_data[constants.STRUCT_KEY].iterrows(): + cls = row[constants.subclass_of] + description = row[constants.description] if cls == "HedPrologue" and description: prologue = description.replace("\\n", "\n") continue @@ -147,11 +136,11 @@ def _read_schema(self, dataframe): parent_tags = [] level_adj = 0 self._schema._initialize_attributes(HedSectionKey.Tags) - for row_number, row in dataframe[TAG_KEY].iterrows(): + for row_number, row in dataframe[constants.TAG_KEY].iterrows(): # skip blank rows, though there shouldn't be any if not any(row): continue - parent_tag = row["omn:SubClassOf"] + parent_tag = row[constants.subclass_of] # Return -1 by default for top level rooted tag support(they might not be in the dict) raw_level = known_tag_levels.get(parent_tag, -1) + 1 if raw_level == 0: @@ -175,6 +164,7 @@ def _read_schema(self, dataframe): known_tag_levels[tag_entry.short_tag_name] = raw_level + # todo: this is part 100% duplicated in wiki2schema try: rooted_entry = self.find_rooted_entry(tag_entry, self._schema, self._loading_merged) if rooted_entry: @@ -190,6 +180,37 @@ def _read_schema(self, dataframe): parent_tags.append(tag_entry.short_tag_name) + def _read_section(self, df, section_key): + self._schema._initialize_attributes(section_key) + + for row_number, row in df.iterrows(): + new_entry = self._create_entry(row_number, row, section_key) + self._add_to_dict(row_number, row, new_entry, section_key) + + def _read_units(self, df): + self._schema._initialize_attributes(HedSectionKey.Units) + + for row_number, row in df.iterrows(): + new_entry = self._create_entry(row_number, row, HedSectionKey.Units) + unit_class_name = row[constants.has_unit_class] + unit_class_entry = self._schema.get_tag_entry(unit_class_name, HedSectionKey.UnitClasses) + unit_class_entry.add_unit(new_entry) + self._add_to_dict(row_number, row, new_entry, HedSectionKey.Units) + + def _read_attributes(self): + self._schema._initialize_attributes(HedSectionKey.Attributes) + self._read_attribute_section(self.input_data[constants.ANNOTATION_KEY], True) + self._read_attribute_section(self.input_data[constants.OBJECT_KEY]) + self._read_attribute_section(self.input_data[constants.DATA_KEY]) + + def _read_attribute_section(self, df, annotation_property=False, section_key=HedSectionKey.Attributes): + # todo: this needs to ALSO check range/domain(and verify they match) + for row_number, row in df.iterrows(): + new_entry = self._create_entry(row_number, row, section_key) + if annotation_property: + new_entry._set_attribute_value(HedKey.AnnotationProperty, True) + self._add_to_dict(row_number, row, new_entry, section_key) + def _add_tag_line(self, parent_tags, line_number, row): """ Add a tag to the dictionaries. @@ -203,7 +224,7 @@ def _add_tag_line(self, parent_tags, line_number, row): Notes: Includes attributes and description. """ - tag_name = self._get_tag_name_from_row(row) + tag_name = self._get_name_from_row(row) if tag_name: if parent_tags: long_tag_name = "/".join(parent_tags) + "/" + tag_name @@ -215,34 +236,25 @@ def _add_tag_line(self, parent_tags, line_number, row): self._add_fatal_error(line_number, row, f"No tag name found in row.", error_code=HedExceptions.GENERIC_ERROR) - def _get_tag_name_from_row(self, row): - try: - base_tag_name = row["rdfs:label"] - if base_tag_name.endswith("-#"): - return "#" - return base_tag_name - except KeyError: - return None - - def _get_hedid_from_row(self, row): - try: - return row[hed_id_column] - except KeyError: - return None + @staticmethod + def _get_name_from_row(row): + base_tag_name = row[constants.name] + if base_tag_name.endswith("-#"): + return "#" + return base_tag_name def _create_entry(self, line_number, row, key_class, full_tag_name=None): - element_name = self._get_tag_name_from_row(row) + element_name = self._get_name_from_row(row) if full_tag_name: element_name = full_tag_name - hedID = self._get_hedid_from_row(row) - node_attributes = self._get_tag_attributes(line_number, row) - if hedID: - node_attributes[HedKey.HedID] = hedID + hed_id = row[constants.hed_id] + if hed_id: + node_attributes[HedKey.HedID] = hed_id - description = row["dc:description"] + description = row[constants.description] tag_entry = self._schema._create_tag_entry(element_name, key_class) if description: @@ -262,8 +274,10 @@ def _get_tag_attributes(self, row_number, row): Returns: dict: Dictionary of attributes. """ - attr_string = row["Attributes"] - return self._parse_attribute_string(row_number, attr_string) + try: + return hed.schema.schema_io.ontology_util.get_attributes_from_row(row) + except ValueError as e: + self._add_fatal_error(row_number, str(row), str(e)) def _add_to_dict(self, line_number, line, entry, key_class): if entry.has_attribute(HedKey.InLibrary) and not self._loading_merged and not self.appending_to_schema: @@ -274,170 +288,25 @@ def _add_to_dict(self, line_number, line, entry, key_class): return self._add_to_dict_base(entry, key_class) - - def load_dataframes(filenames): dict_filenames = SchemaLoaderDF.convert_filenames_to_dict(filenames) - return {key: pd.read_csv(filename, sep="\t", dtype=str, na_filter=False) for (key, filename) in dict_filenames.items()} - - -def load_dataframes_from_strings(data_contents): - # Assume data_contents is a list of tuples (key, tsv_string) - return {key: pd.read_csv(io.StringIO(tsv_string), sep="\t", dtype=str, na_filter=False) - for key, tsv_string in data_contents.items()} - - -def get_all_ids(df): - if hed_id_column in df.columns: - modified_df = df[hed_id_column].str.replace("HED_", "") - modified_df = pd.to_numeric(modified_df, errors="coerce").dropna().astype(int) - return set(modified_df.unique()) - return None - - -tag_index_ranges = { - "": (10000, 40000), - "score": (40000, 60000), - "lang": (60000, 80000) -} - -def _get_hedid_range(schema_name, section_key): - if section_key != HedSectionKey.Tags: - raise NotImplementedError("Cannot assign hedID's to non tag sections yet") - - starting_id, ending_id = tag_index_ranges[schema_name] - - tag_section_adj = 2000 - initial_tag_adj = 1 - starting_id += tag_section_adj + initial_tag_adj - return set(range(starting_id, ending_id)) - - -def update_dataframes_from_schema(dataframes, schema, schema_name=""): - # We're going to potentially alter the schema, so make a copy - schema = copy.deepcopy(schema) - - section_mapping = { - STRUCT_KEY: None, - TAG_KEY: HedSectionKey.Tags - } - - # todo: this needs to handle other sections eventually - for key, df in dataframes.items(): - section_key = section_mapping.get(key) - if not section_key: - continue - section = schema[section_key] - - hedid_errors = _verify_hedid_matches(section, df) - if hedid_errors: - raise HedFileError(hedid_errors[0]['code'], - f"{len(hedid_errors)} issues found with hedId mismatches. See the .issues " - f"parameter on this exception for more details.", schema.name, - issues=hedid_errors) - unused_tag_ids = _get_hedid_range(schema_name, section_key) - - # If no errors, assign new hed ID's - assign_hed_ids_section(section, unused_tag_ids, df) - - output_dfs = Schema2DF.process_schema(schema, save_merged=False) - - merge_dfs(output_dfs[TAG_KEY], dataframes[TAG_KEY]) - # Struct is special, just directly merge for now. - output_dfs[STRUCT_KEY] = pd.concat([dataframes[STRUCT_KEY], output_dfs[STRUCT_KEY]]).drop_duplicates('rdfs:label', keep='last').reset_index(drop=True) - - return output_dfs + dataframes = {} + for key, filename in dict_filenames.items(): + try: + dataframes[key] = pd.read_csv(filename, sep="\t", dtype=str, na_filter=False) + except OSError: + dataframes[key] = None + return dataframes -def _verify_hedid_matches(section, df): - """ Verify ID's in both have the same label, and verify all entries in the dataframe are already in the schema +def load_dataframes_from_strings(schema_data): + """ Load the given strings/dataframes as dataframes. Parameters: - section(HedSchemaSection): The loaded schema section to compare ID's with - df(pd.DataFrame): The loaded spreadsheet dataframe to compare with + schema_data(dict): The dict of files(strings or pd.DataFrames) key being constants like TAG_KEY Returns: - error_list(list of str): A list of errors found matching id's + schema_data(dict): A dict with the same keys as schema_data, but values are dataframes if not before """ - hedid_errors = [] - for row_number, row in df.iterrows(): - if not any(row): - continue - label = row["rdfs:label"] - if label.endswith("-#"): - label = label.replace("-#", "/#") - df_id = row[hed_id_column] - entry = section.get(label) - if not entry: - hedid_errors += SchemaLoaderDF._format_error(row_number, row, - f"'{label}' does not exist in the schema file provided, only the spreadsheet.") - continue - entry_id = entry.attributes.get(HedKey.HedID) - if entry_id and entry_id != df_id: - hedid_errors += SchemaLoaderDF._format_error(row_number, row, - f"'{label}' has hedID '{df_id}' in dataframe, but '{entry_id}' in schema.") - continue - - return hedid_errors - - -def assign_hed_ids_schema(schema): - """Note: only assigns values to TAGS section for now.""" - for section_key in HedSectionKey: - section = schema[section_key] - # Still need to add hed ranges for non tag sections - if section_key != HedSectionKey.Tags: - continue - unused_tag_ids = _get_hedid_range(schema.library, section_key) - assign_hed_ids_section(section, unused_tag_ids, None) - - -def assign_hed_ids_section(section, unused_tag_ids, df=None): - spreadsheet_label_to_hedid = {} - if df is not None: - # Remove hedIds already used in the dataframe - unused_tag_ids -= get_all_ids(df) - spreadsheet_label_to_hedid = df.set_index('rdfs:label')['hedId'].to_dict() - - # Remove hedId's already used in the schema - section_used_ids = set( - int(entry.attributes.get(HedKey.HedID, "0").replace("HED_", "")) for entry in section.all_entries) - unused_tag_ids -= section_used_ids - - sorted_unused_ids = sorted(unused_tag_ids, reverse=True) - - # Next assign hed ID to this if needed - for entry in section.all_entries: - if section.section_key == HedSectionKey.Tags: - name = entry.short_tag_name - else: - name = entry.name - current_tag_id = spreadsheet_label_to_hedid.get(name) - if not current_tag_id: - current_tag_id = f"HED_{sorted_unused_ids.pop():07d}" - entry._set_attribute_value(HedKey.HedID, current_tag_id) - - -def merge_dfs(df1, df2): - """Merges df2 into df1, adding the extra columns from the ontology to the schema df.""" - # todo: vectorize this at some point - save_df1_columns = df1.columns.copy() - for index, row in df2.iterrows(): - # Find matching index in df1 based on 'rdfs:label' - match_index = df1[df1['rdfs:label'] == row['rdfs:label']].index - if not match_index.empty: - for col in df2.columns: - if col not in save_df1_columns: - df1.at[match_index[0], col] = row[col] - - return df1 - - -def clear_sections(schema, sections_to_clear): - # Temporary function until these spreadsheet writers are finished - # Also clear prologue and epilogue - schema.prologue = "" - schema.epilogue = "" - empty_sections = schema._create_empty_sections() - for section_key in sections_to_clear: - schema._sections[section_key] = empty_sections[section_key] + return {key: value if isinstance(value, pd.DataFrame) else pd.read_csv(io.StringIO(value), sep="\t", dtype=str, na_filter=False) + for key, value in schema_data.items()} diff --git a/hed/schema/schema_io/ontology_util.py b/hed/schema/schema_io/ontology_util.py new file mode 100644 index 00000000..6280d186 --- /dev/null +++ b/hed/schema/schema_io/ontology_util.py @@ -0,0 +1,351 @@ +"""Utility functions for saving as an ontology or dataframe.""" +import os + +import pandas as pd + +from hed.schema.schema_io import schema_util +from hed.errors import HedFileError +from hed.schema import hed_schema_df_constants as constants +from hed.schema.hed_schema_constants import HedKey +from hed.schema.schema_io.text_util import parse_attribute_string + +library_index_ranges = { + "": (10000, 40000), + "score": (40000, 60000), + "lang": (60000, 80000) +} +UNKNOWN_LIBRARY_VALUE = 9910000 + +object_type_id_offset = { + constants.OBJECT_KEY: (100, 300), + constants.DATA_KEY: (300, 500), + constants.ANNOTATION_KEY: (500, 700), + constants.ATTRIBUTE_PROPERTY_KEY: (700, 900), + constants.VALUE_CLASS_KEY: (1300, 1400), + constants.UNIT_MODIFIER_KEY: (1400, 1500), + constants.UNIT_CLASS_KEY: (1500, 1600), + constants.UNIT_KEY: (1600, 1700), + constants.TAG_KEY: (2000, -1), # -1 = go to end of range +} + + +def get_library_name_and_id(schema): + """ Get the library("Standard" for the standard schema) and first id for a schema range + + Parameters: + schema(HedSchema): The schema to check + + Returns: + library_name(str): The capitalized library name + first_id(int): the first id for a given library + """ + name = schema.library + + starting_id, _ = library_index_ranges.get(name, (UNKNOWN_LIBRARY_VALUE, 0)) + + if not name: + name = "standard" + return name.capitalize(), starting_id + + +def _get_hedid_range(schema_name, df_key): + """ Get the set of HedId's for this object type/schema name. + + Parameters: + schema_name(str): The known schema name with an assigned id range + df_key(str): The dataframe range type we're interested in. a key from constants.DF_SUFFIXES + + Returns: + number_set(set): A set of all id's in the requested range + """ + if df_key == constants.STRUCT_KEY: + raise NotImplementedError("Cannot assign hed_ids struct section") + + starting_id, ending_id = library_index_ranges[schema_name] + + start_object_range, end_object_range = object_type_id_offset[df_key] + initial_tag_adj = 1 # We always skip 1 + final_start = starting_id + start_object_range + initial_tag_adj + final_end = starting_id + end_object_range + if end_object_range == -1: + final_end = ending_id + return set(range(final_start, final_end)) + + +def get_all_ids(df): + """Returns a set of all unique hedIds in the dataframe + + Parameters: + df(pd.DataFrame): The dataframe + + Returns: + numbers(Set or None): None if this has no hed column, otherwise all unique numbers as a set. + """ + if constants.hed_id in df.columns: + modified_df = df[constants.hed_id].str.removeprefix("HED_") + modified_df = pd.to_numeric(modified_df, errors="coerce").dropna().astype(int) + return set(modified_df.unique()) + return None + + +def update_dataframes_from_schema(dataframes, schema, schema_name="", get_as_ids=False): + """ Write out schema as a dataframe, then merge in extra columns from dataframes. + + Parameters: + dataframes(dict of str:pd.DataFrames): A full set of schema spreadsheet formatted dataframes + schema(HedSchema): The schema to write into the dataframes: + schema_name(str): The name to use to find the schema id range. + get_as_ids(bool): If True, replace all known references with HedIds + + Returns: + dataframes(dict of str:pd.DataFrames): The updated dataframes + These dataframes acn (potentially including extra columns) + """ + # 1. Verify existing hed ids don't conflict between schema/dataframes + for key, df in dataframes.items(): + section_key = constants.section_mapping.get(key) + if not section_key: + continue + section = schema[section_key] + + hedid_errors = _verify_hedid_matches(section, df) + if hedid_errors: + raise HedFileError(hedid_errors[0]['code'], + f"{len(hedid_errors)} issues found with hedId mismatches. See the .issues " + f"parameter on this exception for more details.", schema.name, + issues=hedid_errors) + + # 2. Get the new schema as DFs + from hed.schema.schema_io.schema2df import Schema2DF # Late import as this is recursive + output_dfs = Schema2DF(get_as_ids=get_as_ids).process_schema(schema, save_merged=False) + + # 3: Add any hed ID's as needed to these generated dfs + for df_key, df in output_dfs.items(): + if df_key == constants.STRUCT_KEY: + continue + unused_tag_ids = _get_hedid_range(schema_name, df_key) + + # If no errors, assign new hed ID's + assign_hed_ids_section(df, unused_tag_ids) + + # 4: Merge the dataframes + for df_key in output_dfs.keys(): + out_df = output_dfs[df_key] + df = dataframes[df_key] + merge_dfs(out_df, df) + + return output_dfs + + +def _verify_hedid_matches(section, df): + """ Verify ID's in both have the same label, and verify all entries in the dataframe are already in the schema + + Parameters: + section(HedSchemaSection): The loaded schema section to compare ID's with + df(pd.DataFrame): The loaded spreadsheet dataframe to compare with + + Returns: + error_list(list of str): A list of errors found matching id's + """ + hedid_errors = [] + for row_number, row in df.iterrows(): + if not any(row): + continue + label = row[constants.name] + if label.endswith("-#"): + label = label.replace("-#", "/#") + df_id = row[constants.hed_id] + entry = section.get(label) + if not entry: + hedid_errors += schema_util.format_error(row_number, row, + f"'{label}' does not exist in the schema file provided, only the spreadsheet.") + continue + entry_id = entry.attributes.get(HedKey.HedID) + if entry_id and entry_id != df_id: + hedid_errors += schema_util.format_error(row_number, row, + f"'{label}' has hedID '{df_id}' in dataframe, but '{entry_id}' in schema.") + continue + + return hedid_errors + + +def assign_hed_ids_section(df, unused_tag_ids): + """ Adds missing HedIds to dataframe. + + Parameters: + df(pd.DataFrame): The dataframe to add id's to. + unused_tag_ids(set of int): The possible hed id's to assign from + """ + # Remove already used ids + unused_tag_ids -= get_all_ids(df) + sorted_unused_ids = sorted(unused_tag_ids, reverse=True) + + for row_number, row in df.iterrows(): + hed_id = row[constants.hed_id] + # we already verified existing ones + if hed_id: + continue + hed_id = f"HED_{sorted_unused_ids.pop():07d}" + row[constants.hed_id] = hed_id + + +def merge_dfs(dest_df, source_df): + """ Merges extra columns from source_df into dest_df, adding the extra columns from the ontology to the schema df. + + Args: + dest_df: The dataframe to add extra columns to + source_df: The dataframe to get extra columns from + """ + # todo: vectorize this at some point + save_df1_columns = dest_df.columns.copy() + for index, row in source_df.iterrows(): + # Find matching index in df1 based on 'rdfs:label' + match_index = dest_df[dest_df['rdfs:label'] == row['rdfs:label']].index + if not match_index.empty: + for col in source_df.columns: + if col not in save_df1_columns: + dest_df.at[match_index[0], col] = row[col] + + +def _get_annotation_prop_ids(dataframes): + annotation_props = {key: value for key, value in zip(dataframes[constants.ANNOTATION_KEY][constants.name], + dataframes[constants.ANNOTATION_KEY][constants.hed_id])} + # Also add schema properties + annotation_props.update( + {key: value for key, value in zip(dataframes[constants.ATTRIBUTE_PROPERTY_KEY][constants.name], + dataframes[constants.ATTRIBUTE_PROPERTY_KEY][constants.hed_id])}) + + return annotation_props + + +def convert_df_to_omn(dataframes): + """ Convert the dataframe format schema to omn format. + + Parameters: + dataframes(dict): A set of dataframes representing a schema, potentially including extra columns + + Returns: + omn_file(str): A combined string representing (most of) a schema omn file. + """ + from hed.schema.hed_schema_io import from_dataframes + # Load the schema, so we can save it out with ID's + schema = from_dataframes(dataframes) + # Convert dataframes to hedId format, and add any missing hedId's(generally, they should be replaced before here) + dataframes = update_dataframes_from_schema(dataframes, schema, get_as_ids=True) + + # Write out the new dataframes in omn format + annotation_props = _get_annotation_prop_ids(dataframes) + full_text = "" + for suffix, dataframe in dataframes.items(): + if suffix == constants.STRUCT_KEY: # not handled here yet + continue + output_text = _convert_df_to_omn(dataframes[suffix], annotation_properties=annotation_props) + full_text += output_text + "\n" + + return full_text + + +def _convert_df_to_omn(df, annotation_properties=("",)): + """Takes a single df format schema and converts it to omn. + + This is one section, e.g. tags, units, etc. + + Note: This mostly assumes a fully valid df. A df missing a required column will raise an error. + + Parameters: + df(pd.DataFrame): the dataframe to turn into omn + annotation_properties(dict of str:str): Known annotation properties, with the values being their hedId. + Returns: + omn_text(str): the omn formatted text for this section + """ + output_text = "" + for index, row in df.iterrows(): + prop_type = "Class" + if constants.property_type in row.index: + prop_type = row[constants.property_type] + hed_id = row[constants.hed_id] + output_text += f"{prop_type}: hed:{hed_id}\n" + annotation_lines = [] + description = row[constants.description] + if description: + annotation_lines.append(f"\t\t{constants.description} \"{description}\"") + name = row[constants.name] + if name: + annotation_lines.append(f"\t\t{constants.name} \"{name}\"") + + # Add annotation properties(other than HedId) + attributes = get_attributes_from_row(row) + for attribute in attributes: + if attribute in annotation_properties and attribute != HedKey.HedID: + annotation_id = f"hed:{annotation_properties[attribute]}" + value = attributes[attribute] + if value is True: + value = "true" + else: + value = f'"{value}"' + annotation_lines.append(f"\t\t{annotation_id} {value}") + + if annotation_lines: + output_text += "\tAnnotations:\n" + output_text += ",\n".join(annotation_lines) + output_text += "\n" + + if prop_type != "AnnotationProperty": + if constants.property_domain in row.index: + prop_domain = row[constants.property_domain] + output_text += "\tDomain:\n" + output_text += f"\t\t{prop_domain}\n" + if constants.property_range in row.index: + prop_range = row[constants.property_range] + output_text += "\tRange:\n" + output_text += f"\t\t{prop_range}\n" + output_text += "\n" + + if constants.equivalent_to in row.index: + equivalent_to = row[constants.equivalent_to] + equivalent_to = equivalent_to.replace(" and ", "\n\t\tand ") + subclass_of = row[constants.subclass_of] + if equivalent_to and equivalent_to != subclass_of: + output_text += "\tEquivalentTo:\n" + output_text += f"\t\t{equivalent_to}" + else: + output_text += "\tSubClassOf:\n" + output_text += f"\t\t{subclass_of}" + output_text += "\n" + + output_text += "\n" + return output_text + + +def save_dataframes(base_filename, dataframe_dict): + """ Writes out the dataframes using the provided suffixes. + + Does not validate contents or suffixes. + + Parameters: + base_filename(str): The base filename to use. Output is {base_filename}_{suffix}.tsv + See DF_SUFFIXES for all expected names. + dataframe_dict(dict of str: df.DataFrame): The list of files to save out. No validation is done. + """ + base, base_ext = os.path.splitext(base_filename) + for suffix, dataframe in dataframe_dict.items(): + filename = f"{base}_{suffix}.tsv" + with open(filename, mode='w', encoding='utf-8') as opened_file: + dataframe.to_csv(opened_file, sep='\t', index=False, header=True) + + +def get_attributes_from_row(row): + """ Get the tag attributes from a line. + + Parameters: + row (pd.Series): A tag line. + Returns: + dict: Dictionary of attributes. + """ + if constants.properties in row.index: + attr_string = row[constants.properties] + elif constants.attributes in row.index: + attr_string = row[constants.attributes] + else: + attr_string = "" + return parse_attribute_string(attr_string) diff --git a/hed/schema/schema_io/owl2schema.py b/hed/schema/schema_io/owl2schema.py deleted file mode 100644 index da8970ce..00000000 --- a/hed/schema/schema_io/owl2schema.py +++ /dev/null @@ -1,285 +0,0 @@ -# """ -# This module is used to create a HedSchema object from an OWL file or graph. -# """ -# -# -# from hed.errors.exceptions import HedFileError, HedExceptions -# from hed.schema.hed_schema_constants import HedSectionKey, HedKey -# from hed.schema import schema_validation_util -# from .base2schema import SchemaLoader -# import rdflib -# from rdflib.exceptions import ParserError -# from rdflib import RDF, RDFS, URIRef, OWL -# from collections import defaultdict -# -# from hed.schema.schema_io.owl_constants import HED, HEDT, HEDU, HEDUM -# -# -# class SchemaLoaderOWL(SchemaLoader): -# """ Loads XML schemas from filenames or strings. -# -# Expected usage is SchemaLoaderXML.load(filename) -# -# SchemaLoaderXML(filename) will load just the header_attributes -# """ -# def __init__(self, filename, schema_as_string=None, schema=None, file_format=None, name=""): -# if schema_as_string and not file_format: -# raise HedFileError(HedExceptions.BAD_PARAMETERS, -# "Must pass a file_format if loading owl schema as a string.", -# name) -# super().__init__(filename, schema_as_string, schema, file_format, name) -# -# self._schema.source_format = ".owl" -# self.graph = None -# # When loading, this stores rooted tag name -> full root path pairs -# self._rooted_cache = {} -# -# def _open_file(self): -# """Parses a Turtle/owl/etc file and returns the RDF graph.""" -# -# graph = rdflib.Graph() -# try: -# if self.filename: -# graph.parse(self.filename, format=self.file_format) -# else: -# graph.parse(data=self.schema_as_string, format=self.file_format) -# except FileNotFoundError as fnf_error: -# raise HedFileError(HedExceptions.FILE_NOT_FOUND, str(fnf_error), self.name) -# except ParserError as parse_error: -# raise HedFileError(HedExceptions.CANNOT_PARSE_RDF, str(parse_error), self.name) -# -# return graph -# -# def _read_prologue(self): -# """Reads the Prologue section from the ontology.""" -# prologue = self.graph.value(subject=HED.Prologue, predicate=HED.elementValue, any=False) -# return str(prologue) if prologue else "" -# -# def _read_epilogue(self): -# """Reads the Epilogue section from the ontology.""" -# epilogue = self.graph.value(subject=HED.Epilogue, predicate=HED.elementValue, any=False) -# return str(epilogue) if epilogue else "" -# -# def _get_header_attributes(self, graph): -# """Parses header attributes from an RDF graph into a dictionary.""" -# header_attributes = {} -# for s, _, _ in graph.triples((None, RDF.type, HED.HeaderMember)): -# label = graph.value(s, RDFS.label) -# if label: -# header_attribute = graph.value(s, HED.HeaderAttribute) -# header_attributes[str(label)] = str(header_attribute) if header_attribute else None -# return header_attributes -# -# def _parse_data(self): -# self.graph = self.input_data -# self.graph.bind("hed", HED) -# self.graph.bind("hedt", HEDT) -# self.graph.bind("hedu", HEDU) -# self.graph.bind("hedum", HEDUM) -# -# self._schema.epilogue = self._read_epilogue() -# self._schema.prologue = self._read_prologue() -# self._get_header_attributes(self.graph) -# self._read_properties() -# self._read_attributes() -# self._read_units() -# self._read_section(HedSectionKey.ValueClasses, HED.HedValueClass) -# self._read_section(HedSectionKey.UnitModifiers, HED.HedUnitModifier) -# self._read_tags() -# -# def get_local_names_from_uris(parent_chain, tag_uri): -# """ -# Extracts local names from URIs using RDFlib's n3() method. -# """ -# full_names = [] -# for uri in parent_chain + [tag_uri]: -# # Serialize the URI into N3 format and extract the local name -# name = uri.n3(namespace_manager=HED.namespace_manager).split(':')[-1] -# full_names.append(name) -# -# return full_names -# -# def sort_classes_by_hierarchy(self, classes): -# """ -# Sorts all tags based on assembled full name -# -# Returns: -# list of tuples. -# Left Tag URI, right side is parent labels(not including self) -# """ -# parent_chains = [] -# full_tag_names = [] -# for tag_uri in classes: -# parent_chain = self._get_parent_chain(tag_uri) -# parent_chain = [uri.n3(namespace_manager=self.graph.namespace_manager).split(':')[-1] for uri in parent_chain + [tag_uri]] -# # parent_chain = [self.graph.value(p, RDFS.label) or p for p in parent_chain + [tag_uri]] -# full_tag_names.append("/".join(parent_chain)) -# parent_chains.append((tag_uri, parent_chain[:-1])) -# -# # Sort parent_chains by full_tag_names. -# _, parent_chains = zip(*sorted(zip(full_tag_names, parent_chains))) -# -# return parent_chains -# -# def _get_parent_chain(self, cls): -# """ Recursively builds the parent chain for a given class. """ -# parent = self.graph.value(subject=cls, predicate=HED.hasHedParent) -# if parent is None: -# return [] -# return self._get_parent_chain(parent) + [parent] -# -# def _parse_uri(self, uri, key_class, name=None): -# if name: -# label = name -# else: -# label = self.graph.value(subject=uri, predicate=RDFS.label) -# if not label: -# raise ValueError(f"Empty label value found in owl file in uri {uri}") -# label = str(label) -# -# tag_entry = self._schema._create_tag_entry(label, key_class) -# -# description = self.graph.value(subject=uri, predicate=RDFS.comment) -# if description: -# tag_entry.description = str(description) -# -# section = self._schema._sections[key_class] -# valid_attributes = section.valid_attributes -# -# new_values = defaultdict(list) -# for predicate, obj in self.graph.predicate_objects(subject=uri): -# # Convert predicate URI to a readable string, assuming it's in a known namespace -# attr_name = predicate.n3(self.graph.namespace_manager).split(':')[1] -# -# if attr_name in valid_attributes: -# if isinstance(obj, URIRef): -# attr_value = obj.n3(self.graph.namespace_manager).split(':')[1] -# else: -# attr_value = str(obj) -# -# new_values[attr_name].append(attr_value) -# -# for name, value in new_values.items(): -# value = ",".join(value) -# if value == "true": -# value = True -# tag_entry._set_attribute_value(name, value) -# -# return tag_entry -# -# def _get_classes_with_subproperty(self, subproperty_uri, base_type): -# """Iterates over all classes that have a specified rdfs:subPropertyOf.""" -# classes = set() -# for s in self.graph.subjects(RDF.type, base_type): -# if (s, RDFS.subPropertyOf, subproperty_uri) in self.graph: -# classes.add(s) -# return classes -# -# def _get_all_subclasses(self, base_type): -# """ -# Recursively finds all subclasses of the given base_type. -# """ -# subclasses = set() -# for subclass in self.graph.subjects(RDFS.subClassOf, base_type): -# subclasses.add(subclass) -# subclasses.update(self._get_all_subclasses(subclass)) -# return subclasses -# -# def _get_classes(self, base_type): -# """ -# Retrieves all instances of the given base_type, including instances of its subclasses. -# """ -# classes = set() -# # Add instances of the base type -# for s in self.graph.subjects(RDF.type, base_type): -# classes.add(s) -# # Add instances of all subclasses -# for subclass in self._get_all_subclasses(base_type): -# for s in self.graph.subjects(RDF.type, subclass): -# classes.add(s) -# return classes -# -# def _read_properties(self): -# key_class = HedSectionKey.Properties -# self._schema._initialize_attributes(key_class) -# prop_uris = self._get_classes_with_subproperty(HED.schemaProperty, OWL.AnnotationProperty) -# for uri in prop_uris: -# new_entry = self._parse_uri(uri, key_class) -# self._add_to_dict(new_entry, key_class) -# -# def _read_attributes(self): -# key_class = HedSectionKey.Attributes -# self._schema._initialize_attributes(key_class) -# prop_uris = self._get_classes_with_subproperty(HED.schemaAttributeDatatypeProperty, OWL.DatatypeProperty) -# prop_uris.update(self._get_classes_with_subproperty(HED.schemaAttributeObjectProperty, OWL.ObjectProperty)) -# -# for uri in prop_uris: -# new_entry = self._parse_uri(uri, key_class) -# self._add_to_dict(new_entry, key_class) -# -# def _read_section(self, key_class, node_uri): -# self._schema._initialize_attributes(key_class) -# classes = self._get_classes(node_uri) -# for uri in classes: -# new_entry = self._parse_uri(uri, key_class) -# self._add_to_dict(new_entry, key_class) -# -# def _read_units(self): -# self._schema._initialize_attributes(HedSectionKey.UnitClasses) -# self._schema._initialize_attributes(HedSectionKey.Units) -# key_class = HedSectionKey.UnitClasses -# classes = self._get_classes(HED.HedUnitClass) -# unit_classes = {} -# for uri in classes: -# new_entry = self._parse_uri(uri, key_class) -# self._add_to_dict(new_entry, key_class) -# unit_classes[uri] = new_entry -# -# key_class = HedSectionKey.Units -# units = self._get_classes(HED.HedUnit) -# for uri in units: -# new_entry = self._parse_uri(uri, key_class) -# self._add_to_dict(new_entry, key_class) -# unit_class_uri = self.graph.value(subject=uri, predicate=HED.unitClass) -# class_entry = unit_classes.get(unit_class_uri) -# class_entry.add_unit(new_entry) -# -# def _add_tag_internal(self, uri, parent_tags): -# tag_name = self.graph.value(uri, RDFS.label) -# if not tag_name: -# raise ValueError(f"No label for uri {uri}") -# tag_name = str(tag_name) -# parents_and_child = parent_tags + [tag_name] -# if parent_tags and parents_and_child[0] in self._rooted_cache: -# full_tag = "/".join([self._rooted_cache[parents_and_child[0]]] + parents_and_child[1:]) -# else: -# full_tag = "/".join(parents_and_child) -# -# tag_entry = self._parse_uri(uri, HedSectionKey.Tags, full_tag) -# -# rooted_entry = schema_validation_util.find_rooted_entry(tag_entry, self._schema, self._loading_merged) -# if rooted_entry: -# loading_from_chain = rooted_entry.name + "/" + tag_entry.short_tag_name -# loading_from_chain_short = tag_entry.short_tag_name -# self._rooted_cache[tag_entry.short_tag_name] = loading_from_chain -# full_tag = full_tag.replace(loading_from_chain_short, loading_from_chain) -# tag_entry = self._parse_uri(uri, HedSectionKey.Tags, full_tag) -# -# self._add_to_dict(tag_entry, HedSectionKey.Tags) -# -# def _read_tags(self): -# """Populates a dictionary of dictionaries associated with tags and their attributes.""" -# classes = self._get_classes(HED.HedTag) -# classes.update(self._get_classes(HED.HedPlaceholder)) -# sorted_classes = self.sort_classes_by_hierarchy(classes) -# self._schema._initialize_attributes(HedSectionKey.Tags) -# for uri, parents in sorted_classes: -# self._add_tag_internal(uri, parents) -# -# def _add_to_dict(self, entry, key_class): -# if entry.has_attribute(HedKey.InLibrary) and not self._loading_merged and not self.appending_to_schema: -# raise HedFileError(HedExceptions.IN_LIBRARY_IN_UNMERGED, -# "Library tag in unmerged schema has InLibrary attribute", -# self.name) -# -# return self._add_to_dict_base(entry, key_class) diff --git a/hed/schema/schema_io/owl_constants.py b/hed/schema/schema_io/owl_constants.py deleted file mode 100644 index e63b95bd..00000000 --- a/hed/schema/schema_io/owl_constants.py +++ /dev/null @@ -1,50 +0,0 @@ -# from rdflib import Namespace -# -# from hed.schema.hed_schema_constants import HedSectionKey -# -# -# # Default file associations(notably owl maps to XML format, as we already use XML) -# ext_to_format = { -# ".ttl": "turtle", -# ".owl": "xml", -# ".json-ld": "json-ld" -# } -# -# # Core schema structural types in owl -# HED = Namespace("https://purl.org/hed#") -# # Tags -# HEDT = Namespace("https://purl.org/hed/tag#") -# # Unit classes, value classes, and units -# HEDU = Namespace("https://purl.org/hed/aux#") -# # Unit Modifiers -# HEDUM = Namespace("https://purl.org/hed/aux/unit_modifier#") -# -# # Some of this stuff may be commented back in later if needed -# -# # SECTION_ELEMENT_NAME = { -# # HedSectionKey.Tags: "StartSchemaSection", -# # HedSectionKey.UnitClasses: "UnitClassSection", -# # HedSectionKey.Units: "UnitSection", -# # HedSectionKey.UnitModifiers: "UnitModifiersSection", -# # HedSectionKey.ValueClasses: "ValueClassesSection", -# # HedSectionKey.Attributes: "AttributesSection", -# # HedSectionKey.Properties: "PropertiesSection", -# # } -# # -# # SECTION_ELEMENT_TYPE = { -# # HedSectionKey.Tags: "HedStartSchemaSection", -# # HedSectionKey.UnitClasses: "HedUnitClassSection", -# # HedSectionKey.Units: "HedUnitSection", -# # HedSectionKey.UnitModifiers: "HedUnitModifiersSection", -# # HedSectionKey.ValueClasses: "HedValueClassesSection", -# # HedSectionKey.Attributes: "HedAttributesSection", -# # HedSectionKey.Properties: "HedPropertiesSection", -# # } -# -# ELEMENT_NAMES = { -# HedSectionKey.Tags: "HedTag", -# HedSectionKey.Units: "HedUnit", -# HedSectionKey.UnitClasses: "HedUnitClass", -# HedSectionKey.UnitModifiers: "HedUnitModifier", -# HedSectionKey.ValueClasses: "HedValueClass", -# } diff --git a/hed/schema/schema_io/schema2base.py b/hed/schema/schema_io/schema2base.py index 5c8e1234..a46a3b67 100644 --- a/hed/schema/schema_io/schema2base.py +++ b/hed/schema/schema_io/schema2base.py @@ -11,11 +11,11 @@ def __init__(self): self._save_base = False self._save_merged = False self._strip_out_in_library = False + self._schema = None - @classmethod - def process_schema(cls, hed_schema, save_merged=False): + def process_schema(self, hed_schema, save_merged=False): """ - Takes a HedSchema object and returns a list of strings representing its .mediawiki version. + Takes a HedSchema object and returns it in the inherited form(mediawiki, xml, etc) Parameters ---------- @@ -34,33 +34,38 @@ def process_schema(cls, hed_schema, save_merged=False): raise HedFileError(HedExceptions.SCHEMA_LIBRARY_INVALID, "Cannot save a schema merged from multiple library schemas", hed_schema.filename) - saver = cls() - saver._save_lib = False - saver._save_base = False - saver._strip_out_in_library = True + + self._initialize_output() + self._save_lib = False + self._save_base = False + self._strip_out_in_library = True + self._schema = hed_schema # This is needed to save attributes in dataframes for now if hed_schema.with_standard: - saver._save_lib = True + self._save_lib = True if save_merged: - saver._save_base = True - saver._strip_out_in_library = False + self._save_base = True + self._strip_out_in_library = False else: # Saving a standard schema or a library schema without a standard schema save_merged = True - saver._save_lib = True - saver._save_base = True + self._save_lib = True + self._save_base = True - saver._save_merged = save_merged + self._save_merged = save_merged - saver._output_header(hed_schema.get_save_header_attributes(saver._save_merged), hed_schema.prologue) - saver._output_tags(hed_schema.tags) - saver._output_units(hed_schema.unit_classes) - saver._output_section(hed_schema, HedSectionKey.UnitModifiers) - saver._output_section(hed_schema, HedSectionKey.ValueClasses) - saver._output_section(hed_schema, HedSectionKey.Attributes) - saver._output_section(hed_schema, HedSectionKey.Properties) - saver._output_footer(hed_schema.epilogue) + self._output_header(hed_schema.get_save_header_attributes(self._save_merged), hed_schema.prologue) + self._output_tags(hed_schema.tags) + self._output_units(hed_schema.unit_classes) + self._output_section(hed_schema, HedSectionKey.UnitModifiers) + self._output_section(hed_schema, HedSectionKey.ValueClasses) + self._output_section(hed_schema, HedSectionKey.Attributes) + self._output_section(hed_schema, HedSectionKey.Properties) + self._output_footer(hed_schema.epilogue) - return saver.output + return self.output + + def _initialize_output(self): + raise NotImplementedError("This needs to be defined in the subclass") def _output_header(self, attributes, prologue): raise NotImplementedError("This needs to be defined in the subclass") diff --git a/hed/schema/schema_io/schema2df.py b/hed/schema/schema_io/schema2df.py index a7389b69..7917eb2f 100644 --- a/hed/schema/schema_io/schema2df.py +++ b/hed/schema/schema_io/schema2df.py @@ -1,58 +1,95 @@ """Allows output of HedSchema objects as .mediawiki format""" from hed.schema.hed_schema_constants import HedSectionKey, HedKey +from hed.schema.schema_io.ontology_util import get_library_name_and_id from hed.schema.schema_io.schema2base import Schema2Base import pandas as pd -from hed.schema.hed_schema_df_constants import * +import hed.schema.hed_schema_df_constants as constants +from hed.schema.hed_schema_entry import HedTagEntry + +section_key_to_df = { + HedSectionKey.Tags: constants.TAG_KEY, + HedSectionKey.Units: constants.UNIT_KEY, + HedSectionKey.UnitClasses: constants.UNIT_CLASS_KEY, + HedSectionKey.UnitModifiers: constants.UNIT_MODIFIER_KEY, + HedSectionKey.ValueClasses: constants.VALUE_CLASS_KEY, + HedSectionKey.Attributes: HedSectionKey.Attributes, + HedSectionKey.Properties: HedSectionKey.Properties +} class Schema2DF(Schema2Base): - # todo: add omn:EquivalentTo" - struct_columns = ["hedId", "rdfs:label", "Attributes", "omn:SubClassOf", "dc:description"] - tag_columns = ["hedId", "Level", "rdfs:label", "omn:SubClassOf", "Attributes", "dc:description"] - def __init__(self): + def __init__(self, get_as_ids=False): + """ Constructor for schema to dataframe converter + + Parameters: + get_as_ids(bool): If true, return the hedId rather than name in most places + This is mostly relevant for creating an ontology. + """ super().__init__() - self.current_tag_string = "" - self.current_tag_extra = "" - self.output = { - STRUCT_KEY: pd.DataFrame(columns=self.struct_columns, dtype=str), - TAG_KEY: pd.DataFrame(columns=self.tag_columns, dtype=str)} + self._get_as_ids = get_as_ids + + def _get_object_name_and_id(self, object_name, include_prefix=False): + """ Get the adjusted name and ID for the given object type. + + Parameters: + object_name(str): The name of the base hed object, e.g. HedHeader, HedUnit + include_prefix(bool): If True, include the "hed:" + Returns: + object_name(str): The inherited object name, e.g. StandardHeader + hed_id(str): The full formatted hed_id + """ + prefix, obj_id = get_library_name_and_id(self._schema) + name = f"{prefix}{object_name.removeprefix('Hed')}" + full_hed_id = self._get_object_id(object_name, obj_id, include_prefix) + return name, full_hed_id + + def _get_object_id(self, object_name, base_id=0, include_prefix=False): + prefix="" + if include_prefix: + prefix = "hed:" + return f"{prefix}HED_{base_id + constants.struct_base_ids[object_name]:07d}" + # ========================================= # Required baseclass function # ========================================= - def _output_header(self, attributes, prologue): - attributes_string = self._get_attribs_string_from_schema(attributes, sep=", ") - new_row = { - "hedId": f"HED_0010010", - "rdfs:label": "StandardHeader", - "Attributes": attributes_string, - "omn:SubClassOf": "HedHeader", - "dc:description": "", - # "omn:EquivalentTo": "", + def _initialize_output(self): + self.output = { + constants.STRUCT_KEY: pd.DataFrame(columns=constants.struct_columns, dtype=str), + constants.TAG_KEY: pd.DataFrame(columns=constants.tag_columns, dtype=str), + constants.UNIT_KEY: pd.DataFrame(columns=constants.unit_columns, dtype=str), + constants.UNIT_CLASS_KEY: pd.DataFrame(columns=constants.other_columns, dtype=str), + constants.UNIT_MODIFIER_KEY: pd.DataFrame(columns=constants.other_columns, dtype=str), + constants.VALUE_CLASS_KEY: pd.DataFrame(columns=constants.other_columns, dtype=str), + constants.ANNOTATION_KEY: pd.DataFrame(columns=constants.property_columns, dtype=str), + constants.DATA_KEY: pd.DataFrame(columns=constants.property_columns, dtype=str), + constants.OBJECT_KEY: pd.DataFrame(columns=constants.property_columns, dtype=str), + constants.ATTRIBUTE_PROPERTY_KEY: pd.DataFrame(columns=constants.property_columns_reduced, dtype=str), } - self.output[STRUCT_KEY].loc[len(self.output[STRUCT_KEY])] = new_row + def _create_and_add_object_row(self, base_object, attributes="", description=""): + name, full_hed_id = self._get_object_name_and_id(base_object) new_row = { - "hedId": f"HED_0010011", - "rdfs:label": "StandardPrologue", - "Attributes": "", - "omn:SubClassOf": "HedPrologue", - "dc:description": prologue.replace("\n", "\\n"), - # "omn:EquivalentTo": "", + constants.hed_id: full_hed_id, + constants.name: name, + constants.attributes: attributes, + constants.subclass_of: base_object, + constants.description: description.replace("\n", "\\n"), } - self.output[STRUCT_KEY].loc[len(self.output[STRUCT_KEY])] = new_row + self.output[constants.STRUCT_KEY].loc[len(self.output[constants.STRUCT_KEY])] = new_row + + def _output_header(self, attributes, prologue): + base_object = "HedHeader" + attributes_string = self._get_attribs_string_from_schema(attributes, sep=", ") + self._create_and_add_object_row(base_object, attributes_string) + + base_object = "HedPrologue" + self._create_and_add_object_row(base_object, description=prologue) def _output_footer(self, epilogue): - new_row = { - "hedId": f"HED_0010012", - "rdfs:label": "StandardEpilogue", - "Attributes": "", - "omn:SubClassOf": "HedEpilogue", - "dc:description": epilogue.replace("\n", "\\n"), - # "omn:EquivalentTo": "", - } - self.output[STRUCT_KEY].loc[len(self.output[STRUCT_KEY])] = new_row + base_object = "HedEpilogue" + self._create_and_add_object_row(base_object, description=epilogue) def _start_section(self, key_class): pass @@ -61,25 +98,230 @@ def _end_tag_section(self): pass def _write_tag_entry(self, tag_entry, parent_node=None, level=0): - # ["hedID", "Level", "rdfs:label", "Parent", "Attributes", "dc:description", "omn:EquivalentTo"] tag_id = tag_entry.attributes.get(HedKey.HedID, "") new_row = { - "hedId": tag_id, - "Level": f"{level}", - "rdfs:label": tag_entry.short_tag_name if not tag_entry.has_attribute(HedKey.TakesValue) else tag_entry.short_tag_name + "-#", - "omn:SubClassOf": tag_entry.parent.short_tag_name if tag_entry.parent else "HedTag", - "Attributes": self._format_tag_attributes(tag_entry.attributes), - "dc:description": tag_entry.description, - # "omn:EquivalentTo": "", + constants.hed_id: f"{tag_id}", + constants.level: f"{level}", + constants.name: tag_entry.short_tag_name if not tag_entry.has_attribute(HedKey.TakesValue) else tag_entry.short_tag_name + "-#", + constants.subclass_of: self._get_subclass_of(tag_entry), + constants.attributes: self._format_tag_attributes(tag_entry.attributes), + constants.description: tag_entry.description, + constants.equivalent_to: self._get_tag_equivalent_to(tag_entry), } - self.output[TAG_KEY].loc[len(self.output[TAG_KEY])] = new_row + self.output[constants.TAG_KEY].loc[len(self.output[constants.TAG_KEY])] = new_row def _write_entry(self, entry, parent_node, include_props=True): - # only tags page implemented so far + df_key = section_key_to_df.get(entry.section_key) + if not df_key: + return + + # Special case + if df_key == HedSectionKey.Properties: + return self._write_property_entry(entry) + elif df_key == HedSectionKey.Attributes: + return self._write_attribute_entry(entry, include_props=include_props) + df = self.output[df_key] + tag_id = entry.attributes.get(HedKey.HedID, "") + new_row = { + constants.hed_id: f"{tag_id}", + constants.name: entry.name, + constants.subclass_of: self._get_subclass_of(entry), + constants.attributes: self._format_tag_attributes(entry.attributes), + constants.description: entry.description, + constants.equivalent_to: self._get_tag_equivalent_to(entry), + } + # Handle the special case of units, which have the extra unit class + if hasattr(entry, "unit_class_entry"): + class_entry_name = entry.unit_class_entry.name + if self._get_as_ids: + class_entry_name = f"{entry.unit_class_entry.attributes.get(constants.hed_id)}" + new_row[constants.has_unit_class] = class_entry_name + df.loc[len(df)] = new_row pass + def _write_attribute_entry(self, entry, include_props): + df_key = constants.OBJECT_KEY + property_type = "ObjectProperty" + if HedKey.AnnotationProperty in entry.attributes: + df_key = constants.ANNOTATION_KEY + property_type = "AnnotationProperty" + elif (HedKey.NumericRange in entry.attributes + or HedKey.StringRange in entry.attributes + or HedKey.BoolRange in entry.attributes): + df_key = constants.DATA_KEY + property_type = "DataProperty" + + hed_id_mapping = { + "HedTag": self._get_object_id("HedTag", include_prefix=True), + "HedUnit": self._get_object_id("HedUnit", include_prefix=True), + "HedUnitClass": self._get_object_id("HedUnitClass", include_prefix=True), + "HedUnitModifier": self._get_object_id("HedUnitModifier", include_prefix=True), + "HedValueClass": self._get_object_id("HedValueClass", include_prefix=True), + "HedElement": self._get_object_id("HedElement", include_prefix=True), + "string": "xsd:string", + "boolean": "xsd:boolean", + "float": "xsd:float" + } + + domain_attributes = { + HedKey.TagDomain: "HedTag", + HedKey.UnitDomain: "HedUnit", + HedKey.UnitClassDomain: "HedUnitClass", + HedKey.UnitModifierDomain: "HedUnitModifier", + HedKey.ValueClassDomain: "HedValueClass", + HedKey.ElementDomain: "HedElement" + } + range_attributes = { + HedKey.StringRange: "string", + HedKey.TagRange: "HedTag", + HedKey.NumericRange: "float", + HedKey.BoolRange: "boolean", + HedKey.UnitRange: "HedUnit", + HedKey.UnitClassRange: "HedUnitClass", + HedKey.ValueClassRange: "HedValueClass" + } + + domain_keys = [key for key in entry.attributes if key in domain_attributes] + range_keys = [key for key in entry.attributes if key in range_attributes] + + if self._get_as_ids: + domain_string = " or ".join(hed_id_mapping[domain_attributes[key]] for key in domain_keys) + range_string = " or ".join(hed_id_mapping[range_attributes[key]] for key in range_keys) + else: + domain_string = " or ".join(domain_attributes[key] for key in domain_keys) + range_string = " or ".join(range_attributes[key] for key in range_keys) + + df = self.output[df_key] + tag_id = entry.attributes.get(HedKey.HedID, "") + new_row = { + constants.hed_id: f"{tag_id}", + constants.name: entry.name, + constants.property_type: property_type, + constants.property_domain: domain_string, + constants.property_range: range_string, + constants.properties: self._format_tag_attributes(entry.attributes) if include_props else "", + constants.description: entry.description, + } + df.loc[len(df)] = new_row + + def _write_property_entry(self, entry): + df_key = constants.ATTRIBUTE_PROPERTY_KEY + property_type = "AnnotationProperty" + df = self.output[df_key] + tag_id = entry.attributes.get(HedKey.HedID, "") + new_row = { + constants.hed_id: f"{tag_id}", + constants.name: entry.name, + constants.property_type: property_type, + constants.description: entry.description, + } + df.loc[len(df)] = new_row + def _attribute_disallowed(self, attribute): if super()._attribute_disallowed(attribute): return True # strip out hedID in dataframe format - return attribute == HedKey.HedID + return attribute in [HedKey.HedID, HedKey.AnnotationProperty] + + def _get_tag_equivalent_to(self, tag_entry): + subclass = self._get_subclass_of(tag_entry) + + attribute_types = { + "object": "some", + "data": "value" + } + range_types = { + HedKey.TagRange: HedSectionKey.Tags, + HedKey.UnitRange: HedSectionKey.Units, + HedKey.UnitClassRange: HedSectionKey.UnitClasses, + HedKey.ValueClassRange: HedSectionKey.ValueClasses, + HedKey.NumericRange: HedKey.NumericRange + } + attribute_strings = [] + for attribute, value in tag_entry.attributes.items(): + attribute_entry = self._schema.attributes.get(attribute) + attribute_type = self._calculate_attribute_type(attribute_entry) + if self._attribute_disallowed(attribute) or attribute_type == "annotation": + continue + if isinstance(value, str): + values = value.split(",") + values = [v.strip() for v in values] + found_range = None + for range_type in range_types: + if range_type in attribute_entry.attributes: + found_range = range_types[range_type] + break + if self._get_as_ids and found_range and found_range != HedKey.NumericRange: + section = self._schema[found_range] + if any(section.get(v) is None for v in values): + raise ValueError(f"Cannot find schema entry for {v}") + for v in values: + test_id = section.get(v).attributes.get(HedKey.HedID) + if not test_id: + raise ValueError(f"Schema entry {v} has no hedId.") + + values = [f"hed:{section.get(v).attributes[HedKey.HedID]}" for v in values] + # If not a known type, add quotes. + if not found_range: + values = [f'"{v}"' for v in values] + else: + if value is True: + value = 'true' + values = [value] + for v in values: + if self._get_as_ids: + attribute = f"hed:{attribute_entry.attributes[HedKey.HedID]}" + attribute_strings.append(f"({attribute} {attribute_types[attribute_type]} {v})") + if hasattr(tag_entry, "unit_class_entry"): + class_entry_name = tag_entry.unit_class_entry.name + if self._get_as_ids: + class_entry_name = f"hed:{tag_entry.unit_class_entry.attributes.get(constants.hed_id)}" + + if self._get_as_ids: + attribute_strings.append(f"(hed:HED_0000103 some {class_entry_name})") + else: + attribute_strings.append(f"({constants.has_unit_class} some {class_entry_name})") + if hasattr(tag_entry, "parent") and not tag_entry.parent: + schema_name, schema_id = self._get_object_name_and_id("HedSchema", include_prefix=True) + if self._get_as_ids: + attribute_strings.append(f"(hed:HED_0000102 some {schema_id})") + else: + attribute_strings.append(f"(inHedSchema some {schema_name}") + + return " and ".join([subclass] + attribute_strings) + + def _get_subclass_of(self, tag_entry): + # Special case for HedTag + if isinstance(tag_entry, HedTagEntry): + if self._get_as_ids: + parent_entry = tag_entry.parent + if parent_entry: + return f"hed:{parent_entry.attributes[HedKey.HedID]}" + + # HedTag always returns as base object + return "hed:HED_0000005" + else: + return tag_entry.parent.short_tag_name if tag_entry.parent else "HedTag" + + base_objects = { + HedSectionKey.Units: f"HedUnit", + HedSectionKey.UnitClasses: f"HedUnitClass", + HedSectionKey.UnitModifiers: f"HedUnitModifier", + HedSectionKey.ValueClasses: f"HedValueClass" + } + name, obj_id = self._get_object_name_and_id(base_objects[tag_entry.section_key], include_prefix=True) + + if self._get_as_ids: + return obj_id + return name + + @staticmethod + def _calculate_attribute_type(attribute_entry): + attributes = attribute_entry.attributes + object_ranges = {HedKey.TagRange, HedKey.UnitRange, HedKey.UnitClassRange, HedKey.ValueClassRange} + if HedKey.AnnotationProperty in attributes: + return "annotation" + elif any(attribute in object_ranges for attribute in attributes): + return "object" + return "data" + diff --git a/hed/schema/schema_io/schema2owl.py b/hed/schema/schema_io/schema2owl.py deleted file mode 100644 index 3b8563a2..00000000 --- a/hed/schema/schema_io/schema2owl.py +++ /dev/null @@ -1,313 +0,0 @@ -# """Allows output of HedSchema objects as .xml format""" -# -# from hed.schema.hed_schema_constants import HedSectionKey, HedKey -# from hed.schema.schema_io import owl_constants -# from hed.schema.schema_io.schema2base import Schema2Base -# from rdflib import Graph, RDF, RDFS, Literal, URIRef, OWL, XSD -# -# from hed.schema.schema_io.owl_constants import HED, HEDT, HEDU, HEDUM -# import re -# -# -# HED_URIS = { -# None: HED, -# HedSectionKey.Tags: HEDT, -# HedSectionKey.UnitClasses: HEDU, -# HedSectionKey.Units: HEDU, -# HedSectionKey.UnitModifiers: HEDUM, -# HedSectionKey.ValueClasses: HEDU, -# HedSectionKey.Attributes: HED, -# HedSectionKey.Properties: HED, -# } -# -# HED_ATTR = { -# "unitClass": HEDU, -# "valueClass": HEDU, -# "unit": HEDU, -# "unitModifier": HEDUM, -# "property": HED, -# "suggestedTag": HEDT, -# "relatedTag": HEDT, -# "rooted": HEDT, -# } -# -# float_attributes = {"conversionFactor"} -# -# hed_keys_with_types = { -# HedKey.ExtensionAllowed: XSD["boolean"], -# HedKey.Recommended: XSD["boolean"], -# HedKey.Required: XSD["boolean"], -# HedKey.RequireChild: XSD["boolean"], -# HedKey.TagGroup: XSD["boolean"], -# HedKey.TakesValue: XSD["boolean"], -# HedKey.TopLevelTagGroup: XSD["boolean"], -# HedKey.Unique: XSD["boolean"], -# HedKey.UnitClass: HED["HedUnitClass"], -# HedKey.ValueClass: HED["HedValueClass"], -# HedKey.RelatedTag: HED["HedTag"], -# HedKey.SuggestedTag: HED["HedTag"], -# HedKey.Rooted: HED["HedTag"], -# HedKey.DeprecatedFrom: XSD["string"], -# HedKey.ConversionFactor: XSD["string"], -# HedKey.Reserved: XSD["boolean"], -# HedKey.SIUnit: XSD["boolean"], -# HedKey.UnitSymbol: XSD["boolean"], -# HedKey.DefaultUnits: HED["HedUnit"], -# HedKey.UnitPrefix: XSD["boolean"], -# HedKey.SIUnitModifier: XSD["boolean"], -# HedKey.SIUnitSymbolModifier: XSD["boolean"], -# HedKey.AllowedCharacter: XSD["string"], -# HedKey.InLibrary: XSD["string"] -# } -# -# object_properties = {key for key, value in hed_keys_with_types.items() if value.startswith(HED)} -# -# -# class Schema2Owl(Schema2Base): -# def __init__(self): -# super().__init__() -# self.owl_graph = Graph() -# self.output = self.owl_graph -# self.owl_graph.bind("hed", HED) -# self.owl_graph.bind("hedt", HEDT) -# self.owl_graph.bind("hedu", HEDU) -# self.owl_graph.bind("hedum", HEDUM) -# -# # ========================================= -# # Required baseclass function -# # ========================================= -# def _output_header(self, attributes, prologue): -# # Create a dictionary mapping label names to property URIs -# property_uris = { -# "library": HED.Library, -# "unmerged": HED.Unmerged, -# "version": HED.Version, -# "withStandard": HED.WithStandard, -# "xmlns:xsi": HED.XSI, -# "xsi:noNamespaceSchemaLocation": HED.XSINoNamespaceSchemaLocation -# } -# -# for attrib_label, attrib_value in attributes.items(): -# prop_uri = property_uris.get(attrib_label) -# if prop_uri: -# self.owl_graph.add((prop_uri, RDF.type, HED.HeaderMember)) -# self.owl_graph.add((prop_uri, RDFS.label, Literal(attrib_label))) -# self.owl_graph.add((prop_uri, HED.HeaderAttribute, Literal(attrib_value))) -# -# self.owl_graph.add((HED.Prologue, RDF.type, HED.HedElement)) -# self.owl_graph.add((HED.Prologue, RDFS.label, Literal("epilogue"))) -# if prologue: -# self.owl_graph.add((HED.Prologue, HED["elementValue"], Literal(prologue))) -# -# def _output_footer(self, epilogue): -# self.owl_graph.add((HED.Epilogue, RDF.type, HED.HedElement)) -# self.owl_graph.add((HED.Epilogue, RDFS.label, Literal("epilogue"))) -# if epilogue: -# self.owl_graph.add((HED.Epilogue, HED["elementValue"], Literal(epilogue))) -# -# def _start_section(self, key_class): -# return None -# -# def _end_tag_section(self): -# pass -# -# def _write_attributes(self, entry_uri, entry): -# for attribute, value in entry.attributes.items(): -# is_bool = entry.attribute_has_property(attribute, "boolProperty") \ -# or entry.section_key == HedSectionKey.Attributes -# -# if self._attribute_disallowed(attribute): -# continue -# -# if is_bool: -# self.owl_graph.add((entry_uri, HED[attribute], Literal(True))) -# -# elif attribute in float_attributes: -# # Treat as a string for now -# self.owl_graph.add((entry_uri, HED[attribute], Literal(value))) -# else: -# # Todo: further develop this if needed or merge into base tools -# values = value.split(",") -# for val2 in values: -# clean_value = val2 -# if attribute in HED_ATTR: -# attribute_uri = HED_ATTR[attribute][clean_value] -# else: -# attribute_uri = Literal(clean_value) -# -# self.owl_graph.add((entry_uri, HED[attribute], attribute_uri)) -# -# def _add_entry(self, base_uri, tag_name, label, comment, parent=None, entry=None, -# tag_type=HED.HedTag, unit_class_uri=None): -# is_takes_value = entry.has_attribute("takesValue") -# if is_takes_value: -# tag_type = HED.HedPlaceholder -# tag_name = entry.short_tag_name + "-Placeholder" -# label = "#" -# -# tag_name = sanitize_for_turtle(tag_name) -# uri = f"{base_uri}{tag_name}" -# hed_tag_uri = URIRef(uri) -# -# self.owl_graph.add((hed_tag_uri, RDF.type, tag_type)) -# self.owl_graph.add((hed_tag_uri, RDFS.label, Literal(label))) -# if comment: -# self.owl_graph.add((hed_tag_uri, RDFS.comment, Literal(comment))) -# # Don't store the parent in unmerged rooted nodes -# if parent is not None and (HedKey.Rooted not in entry.attributes or self._save_merged): -# parent_uri = HEDT[parent] -# self.owl_graph.add((hed_tag_uri, HED.hasHedParent, parent_uri)) -# if unit_class_uri is not None: -# self.owl_graph.add((hed_tag_uri, HED.unitClass, unit_class_uri)) -# self._write_attributes(hed_tag_uri, entry) -# return hed_tag_uri -# -# def _add_property(self, base_uri, name, label, comment, entry, -# data_type, sub_type): -# name = sanitize_for_turtle(name) -# uri = f"{base_uri}{name}" -# hed_tag_uri = URIRef(uri) -# -# self.owl_graph.add((hed_tag_uri, RDF.type, data_type)) -# self.owl_graph.add((hed_tag_uri, RDFS.subPropertyOf, sub_type)) -# self.owl_graph.add((hed_tag_uri, RDFS.range, XSD.boolean)) -# self.owl_graph.add((hed_tag_uri, RDFS.label, Literal(label))) -# self.owl_graph.add((hed_tag_uri, RDFS.comment, Literal(comment))) -# self._write_attributes(hed_tag_uri, entry) -# -# return hed_tag_uri -# -# def _get_element_domains(self, entry): -# domain_table = {HedKey.ValueClassProperty: "HedValueClass", -# HedKey.UnitModifierProperty: "HedUnitModifier", -# HedKey.UnitProperty: "HedUnit", -# HedKey.ElementProperty: "HedElement", -# HedKey.UnitClassProperty: "HedUnitClass", -# HedKey.NodeProperty: "HedTag" -# } -# domains = [] -# for attribute in entry.attributes: -# if attribute in domain_table: -# domains.append(domain_table[attribute]) -# -# if not domains: -# domains.append(domain_table[HedKey.NodeProperty]) -# -# return domains -# -# def _add_attribute(self, base_uri, name, label, comment, entry): -# domains = self._get_element_domains(entry) -# name = sanitize_for_turtle(name) -# uri = f"{base_uri}{name}" -# hed_tag_uri = URIRef(uri) -# data_type = OWL.ObjectProperty -# sub_type = HED.schemaAttributeObjectProperty -# if name not in object_properties: -# data_type = OWL.DatatypeProperty -# sub_type = HED.schemaAttributeDatatypeProperty -# self.owl_graph.add((hed_tag_uri, RDF.type, data_type)) -# for domain in domains: -# self.owl_graph.add((hed_tag_uri, RDFS.domain, HED[domain])) -# self.owl_graph.add((hed_tag_uri, RDFS.subPropertyOf, sub_type)) -# self.owl_graph.add((hed_tag_uri, RDFS.range, hed_keys_with_types[name])) -# self.owl_graph.add((hed_tag_uri, RDFS.label, Literal(label))) -# self.owl_graph.add((hed_tag_uri, RDFS.comment, Literal(comment))) -# self._write_attributes(hed_tag_uri, entry) -# -# return hed_tag_uri -# -# def _write_tag_entry(self, tag_entry, parent_node=None, level=0): -# """ -# Creates a tag node and adds it to the parent. -# -# Parameters -# ---------- -# tag_entry: HedTagEntry -# The entry for that tag we want to write out -# parent_node: Any -# Unused -# level: Any -# Unused -# -# Returns -# ------- -# SubElement -# The added node -# """ -# tag_name = tag_entry.short_tag_name -# parent = tag_entry.parent -# if parent: -# parent = parent.short_tag_name -# comment = tag_entry.description -# return self._add_entry( -# HEDT, -# tag_name=tag_name, -# label=tag_name, -# comment=comment, -# parent=parent, -# entry=tag_entry -# ) -# -# def _write_entry(self, entry, parent_node=None, include_props=True): -# """ -# Creates an entry node and adds it to the parent. -# -# Parameters: -# entry(HedSchemaEntry): The entry for that tag we want to write out -# parent_node(str): URI for unit class owner, if this is a unit -# include_props(bool): Add the description and attributes to new node. -# Returns: -# str: The added URI -# """ -# key_class = entry.section_key -# prefix = HED_URIS[key_class] -# name = entry.name -# comment = entry.description -# if key_class == HedSectionKey.Attributes: -# uri = self._add_attribute( -# prefix, -# name=name, -# label=name, -# comment=comment, -# entry=entry -# ) -# elif key_class == HedSectionKey.Properties: -# uri = self._add_property( -# prefix, -# name=name, -# label=name, -# comment=comment, -# entry=entry, -# data_type=OWL.AnnotationProperty, -# sub_type=HED.schemaProperty -# ) -# else: -# unit_class_uri = None -# if key_class == HedSectionKey.Units: -# unit_class_uri = parent_node -# uri = self._add_entry( -# prefix, -# tag_name=name, -# label=name, -# comment=comment, -# entry=entry, -# tag_type=HED[owl_constants.ELEMENT_NAMES[key_class]], -# unit_class_uri=unit_class_uri -# ) -# return uri -# -# -# def sanitize_for_turtle(name): -# """ Sanitizes a string to be a valid IRIREF in Turtle, based on the SPARQL grammar. -# -# Excludes: `control characters, space, <, >, double quote, {, }, |, ^, backtick, and backslash.` -# Replacing them with underscores -# -# Parameters: -# name (str): The string to sanitize. -# -# Returns: -# str: A sanitized string suitable for use as an IRIREF in Turtle. -# """ -# invalid_chars_pattern = r'[\x00-\x20<>"{}\|^`\\]' -# return re.sub(invalid_chars_pattern, '_', name) diff --git a/hed/schema/schema_io/schema2wiki.py b/hed/schema/schema_io/schema2wiki.py index e0d216ab..ea1e3e48 100644 --- a/hed/schema/schema_io/schema2wiki.py +++ b/hed/schema/schema_io/schema2wiki.py @@ -10,11 +10,15 @@ def __init__(self): super().__init__() self.current_tag_string = "" self.current_tag_extra = "" - self.output = [] # ========================================= # Required baseclass function # ========================================= + def _initialize_output(self): + self.current_tag_string = "" + self.current_tag_extra = "" + self.output = [] + def _output_header(self, attributes, prologue): hed_attrib_string = self._get_attribs_string_from_schema(attributes) self.current_tag_string = f"{wiki_constants.HEADER_LINE_STRING} {hed_attrib_string}" @@ -55,7 +59,7 @@ def _write_tag_entry(self, tag_entry, parent_node=None, level=0): self.current_tag_string += f"'''{tag}'''" else: short_tag = tag.split("/")[-1] - tab_char = '' # Github mangles these, so remove spacing for now. + tab_char = '' # GitHub mangles these, so remove spacing for now. # takes value tags should appear after the nowiki tag. if short_tag.endswith("#"): self.current_tag_string += f"{tab_char * level}{'*' * level} " diff --git a/hed/schema/schema_io/schema2xml.py b/hed/schema/schema_io/schema2xml.py index d1845645..f453bfed 100644 --- a/hed/schema/schema_io/schema2xml.py +++ b/hed/schema/schema_io/schema2xml.py @@ -9,13 +9,17 @@ class Schema2XML(Schema2Base): def __init__(self): super().__init__() - self.hed_node = Element('HED') - # alias this to output to match baseclass expectation. - self.output = self.hed_node + self.hed_node = None + self.output = None # ========================================= # Required baseclass function # ========================================= + def _initialize_output(self): + self.hed_node = Element('HED') + # alias this to output to match baseclass expectation. + self.output = self.hed_node + def _output_header(self, attributes, prologue): for attrib_name, attrib_value in attributes.items(): self.hed_node.set(attrib_name, attrib_value) diff --git a/hed/schema/schema_io/schema_util.py b/hed/schema/schema_io/schema_util.py index df653ae9..02d00c52 100644 --- a/hed/schema/schema_io/schema_util.py +++ b/hed/schema/schema_io/schema_util.py @@ -7,6 +7,8 @@ from xml.etree import ElementTree from semantic_version import Version +from hed.errors import HedExceptions, ErrorContext + # you can fill this in locally if you don't want to add it to environ. github_api_access_token = "" @@ -111,4 +113,15 @@ def schema_version_greater_equal(hed_schema, target_version): if Version(version) >= target_version: return True - return False \ No newline at end of file + return False + + +def format_error(row_number, row, warning_message="Schema term is empty or the line is malformed", + error_code=HedExceptions.GENERIC_ERROR): + error = {'code': error_code, + ErrorContext.ROW: row_number, + ErrorContext.LINE: str(row), + "message": f"{warning_message}" + } + + return [error] diff --git a/hed/schema/schema_io/text_util.py b/hed/schema/schema_io/text_util.py new file mode 100644 index 00000000..84820f3a --- /dev/null +++ b/hed/schema/schema_io/text_util.py @@ -0,0 +1,71 @@ +"""Functions for parsing text from dataframes/text formats""" + +import re + +# Might need separate version again for wiki +header_attr_expression = "([^ ,]+?)=\"(.*?)\"" +attr_re = re.compile(header_attr_expression) + + +def _parse_header_attributes_line(version_line): + matches = {} + unmatched = [] + last_end = 0 + + for match in attr_re.finditer(version_line): + start, end = match.span() + + # If there's unmatched content between the last match and the current one. + if start > last_end: + unmatched.append(version_line[last_end:start]) + + matches[match.group(1)] = match.group(2) + last_end = end + + # If there's unmatched content after the last match + if last_end < len(version_line): + unmatched.append(version_line[last_end:]) + + unmatched = [m.strip() for m in unmatched if m.strip()] + return matches, unmatched + + +def _validate_attribute_string(attribute_string): + """Raises ValueError on bad input""" + pattern = r'^[A-Za-z]+(=.+)?$' + match = re.fullmatch(pattern, attribute_string) + if match: + return match.group() + raise ValueError(f'Malformed attribute {attribute_string} found. Valid formatting is: attribute, or attribute="value"') + + +def parse_attribute_string(attr_string): + """ Parse attributes for a single element into a dict. + + Parameters: + attr_string(str): Formatted attributes (a=b, c=d, etc.) + + Returns: + attributes(dict): The located attributes. Can be empty. + + :raises ValueError: + - Very malformed input + """ + if attr_string: + attributes_split = [x.strip() for x in attr_string.split(',')] + + final_attributes = {} + for attribute in attributes_split: + # Raises error on very invalid + _validate_attribute_string(attribute) + split_attribute = attribute.split("=") + if len(split_attribute) == 1: + final_attributes[split_attribute[0]] = True + else: + if split_attribute[0] in final_attributes: + final_attributes[split_attribute[0]] += "," + split_attribute[1] + else: + final_attributes[split_attribute[0]] = split_attribute[1] + return final_attributes + elif attr_string == "": + return {} diff --git a/hed/schema/schema_io/wiki2schema.py b/hed/schema/schema_io/wiki2schema.py index 84078bbe..0208aeb3 100644 --- a/hed/schema/schema_io/wiki2schema.py +++ b/hed/schema/schema_io/wiki2schema.py @@ -5,10 +5,11 @@ from hed.schema.hed_schema_constants import HedSectionKey, HedKey from hed.errors.exceptions import HedFileError, HedExceptions -from hed.errors import ErrorContext, error_reporter +from hed.errors import error_reporter from hed.schema.schema_io import wiki_constants from .base2schema import SchemaLoader from .wiki_constants import HedWikiSection, SectionStarts, SectionNames +from hed.schema.schema_io import text_util extend_here_line = 'extend here' @@ -248,7 +249,7 @@ def _get_header_attributes_internal(self, version_line): if "=" not in version_line: return self._get_header_attributes_internal_old(version_line) - attributes, malformed = self._parse_attributes_line(version_line) + attributes, malformed = text_util._parse_header_attributes_line(version_line) for m in malformed: # todo: May shift this at some point to report all errors @@ -356,9 +357,11 @@ def _get_tag_attributes(self, line_number, tag_line, starting_index): """ attr_string, starting_index = SchemaLoaderWiki._get_line_section(tag_line, starting_index, '{', '}') - if attr_string is None: - return None, starting_index - return self._parse_attribute_string(line_number, attr_string), starting_index + try: + return text_util.parse_attribute_string(attr_string), starting_index + except ValueError as e: + self._add_fatal_error(line_number, attr_string, str(e)) + return {}, starting_index @staticmethod def _get_line_section(tag_line, starting_index, start_delim='[', end_delim=']'): diff --git a/tests/schema/test_hed_schema_io_df.py b/tests/schema/test_hed_schema_io_df.py index a22e1548..a82ad2e7 100644 --- a/tests/schema/test_hed_schema_io_df.py +++ b/tests/schema/test_hed_schema_io_df.py @@ -1,10 +1,10 @@ import unittest import shutil -from hed.schema import load_schema, load_schema_version, from_string -from hed.schema.hed_schema_df_constants import * +from hed.schema.hed_schema_io import load_schema, load_schema_version, from_dataframes import os +from hed.schema.schema_io.df2schema import SchemaLoaderDF class TestHedSchemaDF(unittest.TestCase): @@ -42,18 +42,23 @@ def test_saving_default_schemas(self): reloaded_schema = load_schema(self.output_folder + "test_testlib2.tsv") self.assertEqual(schema, reloaded_schema) - def test_saving_default(self): + def test_from_dataframes(self): schema = load_schema_version("8.3.0") + filename = self.output_folder + "test_8_string.tsv" schema.save_as_dataframes(self.output_folder + "test_8_string.tsv") - filenames = {STRUCT_KEY: self.output_folder + "test_8_string_Structure.tsv", - TAG_KEY: self.output_folder + "test_8_string_Tag.tsv"} - + filenames = SchemaLoaderDF.convert_filenames_to_dict(filename) new_file_strings = {} for key, value in filenames.items(): with open(value, "r") as f: all_lines = f.readlines() new_file_strings[key] = "".join(all_lines) - reloaded_schema = from_string(new_file_strings, ".tsv") - self.assertEqual(schema, reloaded_schema) \ No newline at end of file + reloaded_schema = from_dataframes(new_file_strings) + self.assertEqual(schema, reloaded_schema) + + schema = load_schema_version("8.3.0") + dfs = schema.get_as_dataframes() + reloaded_schema = from_dataframes(dfs) + self.assertEqual(schema, reloaded_schema) + diff --git a/tests/schema/test_ontology_util.py b/tests/schema/test_ontology_util.py new file mode 100644 index 00000000..6a1c51b2 --- /dev/null +++ b/tests/schema/test_ontology_util.py @@ -0,0 +1,157 @@ +import unittest +import pandas as pd +from hed import HedFileError +from hed.schema import hed_schema_df_constants as constants +from hed.schema.schema_io import ontology_util +from hed.schema.schema_io.ontology_util import get_library_name_and_id, _verify_hedid_matches, assign_hed_ids_section, \ + get_all_ids, convert_df_to_omn, update_dataframes_from_schema +from hed import load_schema_version + + +class TestLibraryFunctions(unittest.TestCase): + def setUp(self): + pass + + def test_get_library_name_and_id_default(self): + # Test default case where no library name is provided + schema = load_schema_version("8.2.0") + name, first_id = get_library_name_and_id(schema) + self.assertEqual(name, "Standard") + self.assertEqual(first_id, 10000) + + def test_get_library_name_and_id_non_default(self): + # Test non-default case + schema = load_schema_version("score_1.1.0") + name, first_id = get_library_name_and_id(schema) + self.assertEqual(name, "Score") + self.assertEqual(first_id, 40000) + + def test_get_library_name_and_id_unknown(self): + # Test for an unknown library + schema = load_schema_version("testlib_2.0.0") + name, first_id = get_library_name_and_id(schema) + self.assertEqual(name, "Testlib") + self.assertEqual(first_id, ontology_util.UNKNOWN_LIBRARY_VALUE) + + def test_get_hedid_range_normal_case(self): + id_set = ontology_util._get_hedid_range("score", constants.DATA_KEY) + self.assertTrue(40401 in id_set) + self.assertEqual(len(id_set), 200 - 1) # Check the range size + + def test_get_hedid_range_boundary(self): + # Test boundary condition where end range is -1 + id_set = ontology_util._get_hedid_range("score", constants.TAG_KEY) + self.assertTrue(42001 in id_set) + self.assertEqual(len(id_set), 18000 - 1) # From 42001 to 60000 + + def test_get_hedid_range_error(self): + with self.assertRaises(NotImplementedError): + ontology_util._get_hedid_range("lang", constants.STRUCT_KEY) + + +class TestVerifyHedIdMatches(unittest.TestCase): + def setUp(self): + self.schema_82 = load_schema_version("8.2.0") + self.schema_id = load_schema_version("8.3.0") + + def test_no_hedid(self): + df = pd.DataFrame([{'rdfs:label': 'Event', 'hedId': '001'}, {'rdfs:label': 'Age-#', 'hedId': '002'}]) + errors = _verify_hedid_matches(self.schema_82.tags, df) + self.assertEqual(len(errors), 0) + + def test_id_matches(self): + df = pd.DataFrame( + [{'rdfs:label': 'Event', 'hedId': 'HED_0012001'}, {'rdfs:label': 'Age-#', 'hedId': 'HED_0012475'}]) + errors = _verify_hedid_matches(self.schema_id.tags, df) + self.assertEqual(len(errors), 0) + + def test_label_mismatch_id(self): + df = pd.DataFrame( + [{'rdfs:label': 'Event', 'hedId': 'invalid_id'}, {'rdfs:label': 'Age-#', 'hedId': 'invalid_id'}]) + + errors = _verify_hedid_matches(self.schema_id.tags, df) + self.assertEqual(len(errors), 2) + + def test_label_no_entry(self): + df = pd.DataFrame([{'rdfs:label': 'NotARealEvent', 'hedId': 'does_not_matter'}]) + + errors = _verify_hedid_matches(self.schema_id.tags, df) + self.assertEqual(len(errors), 1) + + def test_get_all_ids_exists(self): + # Test when hedId column exists and has proper prefixed IDs + df = pd.DataFrame({ + 'hedId': ['HED_0000001', 'HED_0000002', 'HED_0000003'] + }) + result = get_all_ids(df) + self.assertEqual(result, {1, 2, 3}) + + def test_get_all_ids_not_exists(self): + # Test when hedId column does not exist + df = pd.DataFrame({ + 'otherId': [1, 2, 3] + }) + result = get_all_ids(df) + self.assertIsNone(result) + + def test_get_all_ids_mixed_invalid(self): + # Test when hedId column exists but contains invalid and non-numeric entries + df = pd.DataFrame({ + 'hedId': ['HED_0000001', 'HED_ABC', 'HED_0000003', 'HED_'] + }) + result = get_all_ids(df) + self.assertEqual(result, {1, 3}) # Should ignore non-numeric and malformed IDs + + def test_assign_hed_ids_section(self): + df = pd.DataFrame({ + 'hedId': ['HED_0000001', 'HED_0000003', None, None], + 'label': ['Label1', 'Label2', 'Label3', 'Label4'] # Adding arbitrary labels + }) + expected_result = df.copy() + expected_result.loc[2, 'hedId'] = "HED_0000002" + expected_result.loc[3, 'hedId'] = "HED_0000004" + unused_tag_ids = {2, 4, 5} # Simulate unused hedIds + assign_hed_ids_section(df, unused_tag_ids) + + self.assertTrue(df.equals(expected_result)) + + +class TestUpdateDataframes(unittest.TestCase): + def test_update_dataframes_from_schema(self): + # valid direction first + schema_dataframes = load_schema_version("8.3.0").get_as_dataframes() + schema_83 = load_schema_version("8.3.0") + # Add a test column and ensure it stays around + fixed_value = "test_column_value" + for key, df in schema_dataframes.items(): + df['test_column'] = fixed_value + + updated_dataframes = update_dataframes_from_schema(schema_dataframes, schema_83) + + for key, df in updated_dataframes.items(): + self.assertTrue((df['test_column'] == fixed_value).all()) + # this is expected to bomb horribly, since schema lacks many of the spreadsheet entries. + schema = load_schema_version("8.2.0") + schema_dataframes_new = load_schema_version("8.3.0").get_as_dataframes() + try: + updated_dataframes = update_dataframes_from_schema(schema_dataframes_new, schema) + except HedFileError as e: + self.assertEqual(len(e.issues), 86) + breakHere = 3 + + +class TestConvertOmn(unittest.TestCase): + def test_convert_df_to_omn(self): + dataframes = load_schema_version("8.3.0").get_as_dataframes() + omn_version = convert_df_to_omn(dataframes) + + # todo ian: add another check here for hed ID's being located(it's okay if it's halffassed) + # make these more robust, for now just verify it's somewhere in the result + for df_name, df in dataframes.items(): + if df_name == constants.STRUCT_KEY: + continue # Not implemented yet + for label in df['rdfs:label']: + # Verify that the label is somewhere in the OMN text + error = f"Label '{label}' from dataframe '{df_name}' was not found in the OMN output." + label_key = f'rdfs:label "{label}"' + self.assertIn(label_key, omn_version, error) From 0eaa7232159c16439adfae9f57b0f3fccb659375 Mon Sep 17 00:00:00 2001 From: IanCa Date: Tue, 7 May 2024 14:29:05 -0500 Subject: [PATCH 2/3] Fix typos --- hed/schema/schema_io/ontology_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hed/schema/schema_io/ontology_util.py b/hed/schema/schema_io/ontology_util.py index 6280d186..da68197c 100644 --- a/hed/schema/schema_io/ontology_util.py +++ b/hed/schema/schema_io/ontology_util.py @@ -99,7 +99,7 @@ def update_dataframes_from_schema(dataframes, schema, schema_name="", get_as_ids Returns: dataframes(dict of str:pd.DataFrames): The updated dataframes - These dataframes acn (potentially including extra columns) + These dataframes can potentially have extra columns """ # 1. Verify existing hed ids don't conflict between schema/dataframes for key, df in dataframes.items(): From d3191aca7d5a1989478e60cac934929bacead267 Mon Sep 17 00:00:00 2001 From: IanCa Date: Tue, 7 May 2024 14:34:22 -0500 Subject: [PATCH 3/3] slightly improve ontology test --- tests/schema/test_ontology_util.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/schema/test_ontology_util.py b/tests/schema/test_ontology_util.py index 6a1c51b2..224902dd 100644 --- a/tests/schema/test_ontology_util.py +++ b/tests/schema/test_ontology_util.py @@ -145,7 +145,6 @@ def test_convert_df_to_omn(self): dataframes = load_schema_version("8.3.0").get_as_dataframes() omn_version = convert_df_to_omn(dataframes) - # todo ian: add another check here for hed ID's being located(it's okay if it's halffassed) # make these more robust, for now just verify it's somewhere in the result for df_name, df in dataframes.items(): if df_name == constants.STRUCT_KEY: @@ -155,3 +154,10 @@ def test_convert_df_to_omn(self): error = f"Label '{label}' from dataframe '{df_name}' was not found in the OMN output." label_key = f'rdfs:label "{label}"' self.assertIn(label_key, omn_version, error) + + for hed_id in df[constants.hed_id]: + if df_name == constants.STRUCT_KEY: + continue # Not implemented yet + base_id = f": hed:{hed_id}" + error = f"HedId '{base_id}' from dataframe '{df_name}' was not found in the OMN output." + self.assertIn(base_id, omn_version, error)