From 764442952a2739e275188f636f45d61ef11ca419 Mon Sep 17 00:00:00 2001 From: IanCa Date: Sun, 12 Jun 2022 16:18:22 -0500 Subject: [PATCH] Add requested_columns to column mapper. Only check specific columns for definitions --- hed/models/base_input.py | 80 +++++++++++------------ hed/models/column_mapper.py | 40 ++++++++++-- hed/models/tabular_input.py | 3 +- hed/schema/hed_cache.py | 45 ++++++++----- hed/schema/hed_schema.py | 20 +++--- hed/schema/hed_schema_io.py | 3 +- hed/schema/{ => schema_io}/schema_util.py | 47 ------------- tests/schema/test_schema_util.py | 2 +- 8 files changed, 115 insertions(+), 125 deletions(-) rename hed/schema/{ => schema_io}/schema_util.py (67%) diff --git a/hed/models/base_input.py b/hed/models/base_input.py index 187d27dfb..dacd2ad24 100644 --- a/hed/models/base_input.py +++ b/hed/models/base_input.py @@ -28,7 +28,7 @@ class BaseInput: COMMA_DELIMITER = ',' def __init__(self, file, file_type=None, worksheet_name=None, has_column_names=True, mapper=None, def_mapper=None, - name=None): + definition_columns=None, name=None): """ Constructor for the BaseInput class. Args: @@ -39,6 +39,7 @@ def __init__(self, file, file_type=None, worksheet_name=None, has_column_names=T (Not applicable to tsv files.) has_column_names (bool): True if file has column names. mapper (ColumnMapper or None): Indicates which columns have HED tags. + definition_columns(list or None): A list of columns to check for definitions. Explicit 'None' means all. name (str or None): Optional field for how this file will report errors. Notes: @@ -54,6 +55,7 @@ def __init__(self, file, file_type=None, worksheet_name=None, has_column_names=T # This is the loaded workbook if we loaded originally from an excel file. self._loaded_workbook = None self._worksheet_name = worksheet_name + self._def_columns = definition_columns self.file_def_dict = None pandas_header = 0 if not self._has_column_names: @@ -234,43 +236,34 @@ def to_csv(self, file=None, output_processed_file=False): header=output_file._has_column_names) return csv_string_if_filename_none - def __iter__(self): - """ Iterate over the underlying dataframe. """ - return self.iter_dataframe() - - def iter_raw(self, hed_ops=None, error_handler=None, **kwargs): - """ Iterate all columns without substitutions - - Args: - hed_ops (list, func, HedOps, or None): A func, a HedOps or a list of these to apply to the - hed strings before returning. - error_handler (ErrorHandler or None): Handler to use for context or a default one if None. - kwargs: - - Yields: - - dict: A dict with column_number keys and values corresponding to the cell at that position. + @property + def columns(self): + """ Returns a list of the column names. - Notes: - - See models.hed_ops.translate_ops or the specific hed_ops for additional options. - - Primarily for altering or re-saving the original file (e.g., convert short tags to long). - - Used for initial processing when trying to find definitions. + Empty if no column names. + Returns: + columns(list): The column names. """ - if error_handler is None: - error_handler = ErrorHandler() + columns = [] + if self._dataframe is not None and self._has_column_names: + columns = list(self._dataframe.columns) + return columns - default_mapper = ColumnMapper() - return self.iter_dataframe(hed_ops=hed_ops, mapper=default_mapper, run_string_ops_on_columns=True, - error_handler=error_handler, **kwargs) + def __iter__(self): + """ Iterate over the underlying dataframe. """ + return self.iter_dataframe() - def iter_dataframe(self, hed_ops=None, mapper=None, return_string_only=True, run_string_ops_on_columns=False, - error_handler=None, expand_defs=False, remove_definitions=True, **kwargs): + def iter_dataframe(self, hed_ops=None, mapper=None, requested_columns=None, return_string_only=True, + run_string_ops_on_columns=False, error_handler=None, expand_defs=False, remove_definitions=True, + **kwargs): """ Iterate rows based on the given column mapper. Args: hed_ops (list, func, HedOps, or None): A func, a HedOps or a list of these to apply to the hed strings before returning. mapper (ColumnMapper or None): The column name to column number mapper (or internal mapper if None). + requested_columns(list or None): If this is not None, return ONLY these columns. Names or numbers allowed. return_string_only (bool): If True, do not return issues list, individual columns, attribute columns, etc. run_string_ops_on_columns (bool): If true, run all tag and string ops on columns, rather than columns then rows. @@ -289,6 +282,11 @@ def iter_dataframe(self, hed_ops=None, mapper=None, return_string_only=True, run if mapper is None: mapper = self._mapper + if requested_columns: + # Make a copy to ensure we don't alter the actual mapper + mapper = copy.deepcopy(mapper) + mapper.set_requested_columns(requested_columns) + tag_funcs, string_funcs = self._translate_ops(hed_ops, run_string_ops_on_columns=run_string_ops_on_columns, expand_defs=expand_defs, remove_definitions=remove_definitions, error_handler=error_handler, **kwargs) @@ -438,20 +436,13 @@ def _get_dataframe_from_worksheet(worksheet, has_headers): else: return pandas.DataFrame(worksheet.values, dtype=str) - def _run_validators(self, hed_ops, error_handler, run_on_raw=False, expand_defs=False, **kwargs): + def _run_validators(self, hed_ops, error_handler, expand_defs=False, **kwargs): validation_issues = [] - if run_on_raw: - for row_dict in self.iter_raw(hed_ops=hed_ops, - return_string_only=False, - error_handler=error_handler, expand_defs=expand_defs, - **kwargs): - validation_issues += row_dict[model_constants.ROW_ISSUES] - else: - for row_dict in self.iter_dataframe(hed_ops=hed_ops, - return_string_only=False, - error_handler=error_handler, expand_defs=expand_defs, - **kwargs): - validation_issues += row_dict[model_constants.ROW_ISSUES] + for row_dict in self.iter_dataframe(hed_ops=hed_ops, + return_string_only=False, + error_handler=error_handler, expand_defs=expand_defs, + **kwargs): + validation_issues += row_dict[model_constants.ROW_ISSUES] return validation_issues @@ -529,7 +520,14 @@ def extract_definitions(self, error_handler=None): error_handler = ErrorHandler() new_def_dict = DefinitionDict() hed_ops = [new_def_dict] - _ = self._run_validators(hed_ops, run_on_raw=True, error_handler=error_handler) + for _ in self.iter_dataframe(hed_ops=hed_ops, + return_string_only=False, + requested_columns=self._def_columns, + run_string_ops_on_columns=True, + remove_definitions=False, + error_handler=error_handler): + pass + return new_def_dict def update_definition_mapper(self, def_dict): diff --git a/hed/models/column_mapper.py b/hed/models/column_mapper.py index f8b3198a2..9c11568ff 100644 --- a/hed/models/column_mapper.py +++ b/hed/models/column_mapper.py @@ -18,7 +18,7 @@ class ColumnMapper: - Functions and variables column and row indexing starts at 0. """ def __init__(self, sidecar=None, tag_columns=None, column_prefix_dictionary=None, - attribute_columns=None, optional_tag_columns=None): + attribute_columns=None, optional_tag_columns=None, requested_columns=None): """ Constructor for ColumnMapper. Args: @@ -45,8 +45,10 @@ def __init__(self, sidecar=None, tag_columns=None, column_prefix_dictionary=None self.column_data = {} # Maps column number to column_entry. This is what's actually used by most code. self._final_column_map = {} + self._no_mapping_info = True self._column_map = None + self._requested_columns = [] self._tag_columns = [] self._optional_tag_columns = [] self._column_prefix_dictionary = {} @@ -57,6 +59,7 @@ def __init__(self, sidecar=None, tag_columns=None, column_prefix_dictionary=None self._set_sidecar(sidecar) self.add_columns(attribute_columns) + self.set_requested_columns(requested_columns, False) self.set_tag_columns(tag_columns, optional_tag_columns, False) self.set_column_prefix_dict(column_prefix_dictionary, False) @@ -125,6 +128,21 @@ def set_tag_columns(self, tag_columns=None, optional_tag_columns=None, finalize_ return issues return [] + def set_requested_columns(self, requested_columns, finalize_mapping=True): + """ Set to return only the columns listed in requested_columns + + Args: + requested_columns(list or None): If this is not None, return ONLY these columns. Names or numbers allowed. + finalize_mapping(bool): Finalize the mapping right now if True + + Returns: + issues(list): An empty list of mapping issues + """ + self._requested_columns = requested_columns + if finalize_mapping: + return self._finalize_mapping() + return [] + def set_column_map(self, new_column_map=None): """ Set the column number to name mapping. @@ -174,7 +192,7 @@ def _expand_column(self, column_number, input_text): """ # Default 1-1 mapping if we don't have specific behavior. - if not self._final_column_map: + if self._no_mapping_info: return HedString(input_text), False # If no entry, ignore this column. @@ -300,14 +318,17 @@ def _finalize_mapping(self): self._final_column_map = {} found_named_tag_columns = {} all_tag_columns = self._tag_columns + self._optional_tag_columns + if self._requested_columns: + all_tag_columns += self._requested_columns self._finalize_mapping_issues = [] if self._column_map is not None: for column_number, column_name in self._column_map.items(): - if column_name in self.column_data: + name_requested = self._column_name_requested(column_name) + if name_requested and column_name in self.column_data: column_entry = self.column_data[column_name] column_entry.column_name = column_name self._final_column_map[column_number] = column_entry - elif column_name in all_tag_columns: + elif name_requested and column_name in all_tag_columns: found_named_tag_columns[column_name] = column_number elif column_name.startswith(PANDAS_COLUMN_PREFIX_TO_IGNORE): continue @@ -318,13 +339,16 @@ def _finalize_mapping(self): # Add any numbered columns for column_name, column_entry in self.column_data.items(): - if isinstance(column_name, int): + if isinstance(column_name, int) and self._column_name_requested(column_name): # Convert to internal numbering format column_number = column_name self._final_column_map[column_number] = column_entry # Add any tag columns for column_number in all_tag_columns: + name_requested = self._column_name_requested(column_number) + if not name_requested: + continue if isinstance(column_number, int): if column_number not in self._final_column_map: self._final_column_map[column_number] = ColumnMetadata(ColumnType.HEDTags, column_number) @@ -340,8 +364,14 @@ def _finalize_mapping(self): for column_number, prefix in self._column_prefix_dictionary.items(): self._set_column_prefix(column_number, prefix) + self._no_mapping_info = self._requested_columns is None and not self._final_column_map return self._finalize_mapping_issues + def _column_name_requested(self, column_name): + if self._requested_columns is None: + return True + return column_name in self._requested_columns + def get_def_dicts(self): """ Return def dicts from every column description. diff --git a/hed/models/tabular_input.py b/hed/models/tabular_input.py index 5efc28614..42076ccbc 100644 --- a/hed/models/tabular_input.py +++ b/hed/models/tabular_input.py @@ -34,13 +34,14 @@ def __init__(self, file=None, sidecar=None, attribute_columns=None, extra_def_di new_mapper = ColumnMapper(sidecar=sidecar, optional_tag_columns=[self.HED_COLUMN_NAME], attribute_columns=attribute_columns) + definition_columns = [self.HED_COLUMN_NAME] self._sidecar = sidecar self._also_gather_defs = also_gather_defs self._extra_def_dicts = extra_def_dicts def_mapper = self.create_def_mapper(new_mapper, extra_def_dicts) super().__init__(file, file_type=".tsv", worksheet_name=None, has_column_names=True, mapper=new_mapper, - def_mapper=def_mapper, name=name) + def_mapper=def_mapper, name=name, definition_columns=definition_columns) if not self._has_column_names: raise ValueError("You are attempting to open a bids_old style file with no column headers provided.\n" diff --git a/hed/schema/hed_cache.py b/hed/schema/hed_cache.py index 57f34900c..5a7cf1525 100644 --- a/hed/schema/hed_cache.py +++ b/hed/schema/hed_cache.py @@ -10,7 +10,7 @@ from semantic_version import Version import portalocker import time -from hed.schema.schema_util import url_to_file +from hed.schema.schema_io.schema_util import url_to_file """Infrastructure for caching HED schema from remote repositories.""" @@ -128,13 +128,18 @@ def cache_specific_url(hed_xml_url, xml_version=None, library_name=None, cache_f filename = hed_xml_url.split('/')[-1] cache_filename = os.path.join(cache_folder, filename) + return _cache_specific_url(hed_xml_url, cache_filename) + + +def _cache_specific_url(hed_xml_url, cache_filename): + cache_folder = cache_filename.rpartition("/")[0] os.makedirs(cache_folder, exist_ok=True) temp_hed_xml_file = url_to_file(hed_xml_url) if temp_hed_xml_file: - cache_filename = _safe_copy_tmp_to_folder(temp_hed_xml_file, cache_filename) + cache_filename = _safe_move_tmp_to_folder(temp_hed_xml_file, cache_filename) + os.remove(temp_hed_xml_file) return cache_filename - else: - return None + return None def get_hed_version_path(xml_version=None, library_name=None, local_hed_directory=None): @@ -184,7 +189,7 @@ def get_path_from_hed_version(hed_version, library_name=None, local_hed_director def cache_xml_versions(hed_base_urls=DEFAULT_URL_LIST, skip_folders=DEFAULT_SKIP_FOLDERS, cache_folder=None): - """ Cache a file from a URL. + """ Cache all schemas at the given URLs. Args: hed_base_urls (str or list): Path or list of paths. @@ -298,6 +303,22 @@ def _sort_version_list(hed_versions): def _get_hed_xml_versions_from_url(hed_base_url, library_name=None, skip_folders=DEFAULT_SKIP_FOLDERS, get_libraries=False): + """ Get all available schemas and their hash values + + Args: + hed_base_url (str): A single GitHub API url to cache + library_name(str or None): If str, cache only the named library schemas + skip_folders (list): A list of subfolders to skip over when downloading. + get_libraries (bool): If true, return a dictionary of version numbers, with an entry for each library name. + + Returns: + list or dict: List of version numbers or dictionary {library_name: [versions]}. + + - The Default skip_folders is 'deprecated'. + - The HED cache folder defaults to HED_CACHE_DIRECTORY. + - The directories on Github are of the form: + https://api.github.com/repos/hed-standard/hed-specification/contents/hedxml + """ url_request = urllib.request.urlopen(hed_base_url) url_data = str(url_request.read(), 'utf-8') loaded_json = json.loads(url_data) @@ -375,7 +396,7 @@ def _calculate_sha1(filename): return None -def _safe_copy_tmp_to_folder(temp_hed_xml_file, dest_filename): +def _safe_move_tmp_to_folder(temp_hed_xml_file, dest_filename): """ Copy to destination folder and rename. Args: @@ -385,16 +406,12 @@ def _safe_copy_tmp_to_folder(temp_hed_xml_file, dest_filename): Returns: dest_filename (str): The new filename on success or None on failure. - Notes: - The file will be deleted on a successful copy. - """ _, temp_xml_file = os.path.split(temp_hed_xml_file) dest_folder, _ = os.path.split(dest_filename) temp_filename_in_cache = os.path.join(dest_folder, temp_xml_file) copyfile(temp_hed_xml_file, temp_filename_in_cache) - os.remove(temp_hed_xml_file) try: os.replace(temp_filename_in_cache, dest_filename) except OSError: @@ -413,13 +430,7 @@ def _cache_hed_version(version, library_name, version_info, cache_folder): if sha_hash == local_sha_hash: return possible_cache_filename - os.makedirs(cache_folder, exist_ok=True) - temp_hed_xml_file = url_to_file(download_url) - if temp_hed_xml_file: - cache_filename = _safe_copy_tmp_to_folder(temp_hed_xml_file, possible_cache_filename) - return cache_filename - else: - return None + return _cache_specific_url(download_url, possible_cache_filename) def _get_latest_semantic_version_in_list(semantic_version_list): diff --git a/hed/schema/hed_schema.py b/hed/schema/hed_schema.py index 772ccc307..063a58b4d 100644 --- a/hed/schema/hed_schema.py +++ b/hed/schema/hed_schema.py @@ -1,9 +1,10 @@ from hed.schema.hed_schema_constants import HedKey, HedSectionKey +from hed.schema.schema_io import schema_util from hed.schema.schema_io.schema2xml import HedSchema2XML from hed.schema.schema_io.schema2wiki import HedSchema2Wiki -from hed.schema import schema_validation_util, schema_util +from hed.schema import schema_validation_util from hed.schema.hed_schema_section import HedSchemaSection, HedSchemaTagSection from hed.errors import ErrorHandler from hed.errors.error_types import ValidationErrors @@ -15,10 +16,7 @@ class HedSchema: def __init__(self): """ Constructor for the HedSchema class. - - Returns: - HedSchema: The constructed HED schema. - + A HedSchema can be used for validation, checking tag attributes, parsing tags, etc. """ self._has_duplicate_tags = False self.header_attributes = {} @@ -31,7 +29,6 @@ def __init__(self): self._library_prefix = "" self._sections = self._create_empty_sections() - self.short_tag_mapping = {} # =============================================== # Basic schema properties @@ -42,7 +39,6 @@ def filename(self): Returns: str: The filename of this schema. - """ return self._filename @@ -51,7 +47,7 @@ def filename(self, value): """ Set the filename, if one has not already been set. Args: - value (str): The source filename for this file + value (str): The source filename for this file """ if self._filename is None: self._filename = value @@ -103,7 +99,7 @@ def valid_prefixes(self): Notes: - The return value is always length 1 if using a HedSchema. """ - return list(self._library_prefix) + return [self._library_prefix] # =============================================== # Creation and saving functions @@ -381,7 +377,7 @@ def find_tag_entry(self, tag, library_prefix=""): list: A list of errors while converting. Notes: - Works right to left (which is mostly relevant for errors). + Works left to right (which is mostly relevant for errors). """ clean_tag = str(tag) @@ -472,7 +468,7 @@ def _initialize_attributes(self, key_class): key_class (str): The section key for the section to update. """ - self._sections[key_class].valid_attributes = self._get_attributes_for_class(key_class) + self._sections[key_class].valid_attributes = self._get_attributes_for_section(key_class) # =============================================== # Getters used to write out schema primarily. @@ -624,7 +620,7 @@ def get_modifiers_for_unit(self, unit): valid_modifiers = self.unit_modifiers.get_entries_with_attribute(modifier_attribute_name) return valid_modifiers - def _get_attributes_for_class(self, key_class): + def _get_attributes_for_section(self, key_class): """ Return the valid attributes for this section. Args: diff --git a/hed/schema/hed_schema_io.py b/hed/schema/hed_schema_io.py index 7c2f67d99..1f65b9247 100644 --- a/hed/schema/hed_schema_io.py +++ b/hed/schema/hed_schema_io.py @@ -4,9 +4,10 @@ from hed.schema.schema_io.xml2schema import HedSchemaXMLParser from hed.schema.schema_io.wiki2schema import HedSchemaWikiParser -from hed.schema import hed_schema_constants, hed_cache, schema_util +from hed.schema import hed_schema_constants, hed_cache from hed.errors.exceptions import HedFileError, HedExceptions +from hed.schema.schema_io import schema_util def from_string(schema_string, file_type=".xml", library_prefix=None): diff --git a/hed/schema/schema_util.py b/hed/schema/schema_io/schema_util.py similarity index 67% rename from hed/schema/schema_util.py rename to hed/schema/schema_io/schema_util.py index 8dfc8ea87..5b9b08c13 100644 --- a/hed/schema/schema_util.py +++ b/hed/schema/schema_io/schema_util.py @@ -6,33 +6,6 @@ from xml.dom import minidom from xml.etree import ElementTree -NO_VERSION_INFO_STRING = "No version info found" - - -def get_version_from_xml(hed_xml_tree): - """Get version from root node of an XML tree. - - Args: - hed_xml_tree (Element): The root node of an XML tree. - - Returns: - str: The version of the HED schema (e.g. "8.0.0"). - - Raises: - KeyError or AttributeError: If invalid. - - TODO: This should be moved to the schema module - - """ - - if hed_xml_tree is None: - return NO_VERSION_INFO_STRING - - try: - return hed_xml_tree.attrib['version'] - except KeyError or AttributeError: - return NO_VERSION_INFO_STRING - def url_to_file(resource_url): """ Write data from a URL resource into a file. Data is decoded as unicode. @@ -65,22 +38,6 @@ def url_to_string(resource_url): return url_data -def write_errors_to_file(issues, extension=".txt"): - """ Write an array of issue dictionaries to a temporary file. - - Args: - issues (list): List of 2-element dictionaries containing code and message keys. - extension (str): Desired file extension. - - Returns: - str: The name of the temporary file. - """ - with tempfile.NamedTemporaryFile(suffix=extension, mode='w', delete=False, encoding='utf-8') as error_file: - for line in issues: - error_file.write(f"{line['code']}: {line['message']}\n") - return error_file.name - - def write_strings_to_file(output_strings, extension=None): """ Write output strings to a temporary file. @@ -111,8 +68,6 @@ def write_xml_tree_2_xml_file(xml_tree, extension=".xml"): Returns: str: Name of the temporary file. - TODO: Should this be in the schema module? - """ with tempfile.NamedTemporaryFile(suffix=extension, mode='w', delete=False, encoding='utf-8') as hed_xml_file: xml_string = _xml_element_2_str(xml_tree) @@ -129,8 +84,6 @@ def _xml_element_2_str(elem): Returns: str: An XML string representing the XML element. - TODO: Shouldn't this be with the schema? - """ rough_string = ElementTree.tostring(elem, method='xml') reparsed = minidom.parseString(rough_string) diff --git a/tests/schema/test_schema_util.py b/tests/schema/test_schema_util.py index 2331fce4a..fc75d4918 100644 --- a/tests/schema/test_schema_util.py +++ b/tests/schema/test_schema_util.py @@ -1,7 +1,7 @@ import unittest import os -from hed.schema import schema_util +from hed.schema.schema_io import schema_util class Test(unittest.TestCase):