From 764442952a2739e275188f636f45d61ef11ca419 Mon Sep 17 00:00:00 2001
From: IanCa <ianrcallanan@gmail.com>
Date: Sun, 12 Jun 2022 16:18:22 -0500
Subject: [PATCH] Add requested_columns to column mapper.  Only check specific
 columns for definitions

---
 hed/models/base_input.py                  | 80 +++++++++++------------
 hed/models/column_mapper.py               | 40 ++++++++++--
 hed/models/tabular_input.py               |  3 +-
 hed/schema/hed_cache.py                   | 45 ++++++++-----
 hed/schema/hed_schema.py                  | 20 +++---
 hed/schema/hed_schema_io.py               |  3 +-
 hed/schema/{ => schema_io}/schema_util.py | 47 -------------
 tests/schema/test_schema_util.py          |  2 +-
 8 files changed, 115 insertions(+), 125 deletions(-)
 rename hed/schema/{ => schema_io}/schema_util.py (67%)

diff --git a/hed/models/base_input.py b/hed/models/base_input.py
index 187d27dfb..dacd2ad24 100644
--- a/hed/models/base_input.py
+++ b/hed/models/base_input.py
@@ -28,7 +28,7 @@ class BaseInput:
     COMMA_DELIMITER = ','
 
     def __init__(self, file, file_type=None, worksheet_name=None, has_column_names=True, mapper=None, def_mapper=None,
-                 name=None):
+                 definition_columns=None, name=None):
         """ Constructor for the BaseInput class.
 
         Args:
@@ -39,6 +39,7 @@ def __init__(self, file, file_type=None, worksheet_name=None, has_column_names=T
                 (Not applicable to tsv files.)
             has_column_names (bool): True if file has column names.
             mapper (ColumnMapper or None):  Indicates which columns have HED tags.
+            definition_columns(list or None): A list of columns to check for definitions.  Explicit 'None' means all.
             name (str or None): Optional field for how this file will report errors.
 
         Notes:
@@ -54,6 +55,7 @@ def __init__(self, file, file_type=None, worksheet_name=None, has_column_names=T
         # This is the loaded workbook if we loaded originally from an excel file.
         self._loaded_workbook = None
         self._worksheet_name = worksheet_name
+        self._def_columns = definition_columns
         self.file_def_dict = None
         pandas_header = 0
         if not self._has_column_names:
@@ -234,43 +236,34 @@ def to_csv(self, file=None, output_processed_file=False):
                                                                     header=output_file._has_column_names)
         return csv_string_if_filename_none
 
-    def __iter__(self):
-        """ Iterate over the underlying dataframe. """
-        return self.iter_dataframe()
-
-    def iter_raw(self, hed_ops=None, error_handler=None, **kwargs):
-        """ Iterate all columns without substitutions
-
-        Args:
-            hed_ops (list, func, HedOps, or None): A func, a HedOps or a list of these to apply to the
-                hed strings before returning.
-            error_handler (ErrorHandler or None): Handler to use for context or a default one if None.
-            kwargs:
-
-        Yields:
-            - dict: A dict with column_number keys and values corresponding to the cell at that position.
+    @property
+    def columns(self):
+        """ Returns a list of the column names.
 
-        Notes:
-            - See models.hed_ops.translate_ops or the specific hed_ops for additional options.
-            - Primarily for altering or re-saving the original file (e.g., convert short tags to long).
-            - Used for initial processing when trying to find definitions.
+            Empty if no column names.
 
+        Returns:
+            columns(list): The column names.
         """
-        if error_handler is None:
-            error_handler = ErrorHandler()
+        columns = []
+        if self._dataframe is not None and self._has_column_names:
+            columns = list(self._dataframe.columns)
+        return columns
 
-        default_mapper = ColumnMapper()
-        return self.iter_dataframe(hed_ops=hed_ops, mapper=default_mapper, run_string_ops_on_columns=True,
-                                   error_handler=error_handler, **kwargs)
+    def __iter__(self):
+        """ Iterate over the underlying dataframe. """
+        return self.iter_dataframe()
 
-    def iter_dataframe(self, hed_ops=None, mapper=None, return_string_only=True, run_string_ops_on_columns=False,
-                       error_handler=None, expand_defs=False, remove_definitions=True, **kwargs):
+    def iter_dataframe(self, hed_ops=None, mapper=None, requested_columns=None, return_string_only=True,
+                       run_string_ops_on_columns=False, error_handler=None, expand_defs=False, remove_definitions=True,
+                       **kwargs):
         """ Iterate rows based on the given column mapper.
 
         Args:
             hed_ops (list, func, HedOps, or None):  A func, a HedOps or a list of these to apply to the
                                                     hed strings before returning.
             mapper (ColumnMapper or None): The column name to column number mapper (or internal mapper if None).
+            requested_columns(list or None): If this is not None, return ONLY these columns.  Names or numbers allowed.
             return_string_only (bool): If True, do not return issues list, individual columns, attribute columns, etc.
             run_string_ops_on_columns (bool):   If true, run all tag and string ops on columns,
                                                 rather than columns then rows.
@@ -289,6 +282,11 @@ def iter_dataframe(self, hed_ops=None, mapper=None, return_string_only=True, run
         if mapper is None:
             mapper = self._mapper
 
+        if requested_columns:
+            # Make a copy to ensure we don't alter the actual mapper
+            mapper = copy.deepcopy(mapper)
+            mapper.set_requested_columns(requested_columns)
+
         tag_funcs, string_funcs = self._translate_ops(hed_ops, run_string_ops_on_columns=run_string_ops_on_columns,
                                                       expand_defs=expand_defs, remove_definitions=remove_definitions,
                                                       error_handler=error_handler, **kwargs)
@@ -438,20 +436,13 @@ def _get_dataframe_from_worksheet(worksheet, has_headers):
         else:
             return pandas.DataFrame(worksheet.values, dtype=str)
 
-    def _run_validators(self, hed_ops, error_handler, run_on_raw=False, expand_defs=False, **kwargs):
+    def _run_validators(self, hed_ops, error_handler, expand_defs=False, **kwargs):
         validation_issues = []
-        if run_on_raw:
-            for row_dict in self.iter_raw(hed_ops=hed_ops,
-                                          return_string_only=False,
-                                          error_handler=error_handler, expand_defs=expand_defs,
-                                          **kwargs):
-                validation_issues += row_dict[model_constants.ROW_ISSUES]
-        else:
-            for row_dict in self.iter_dataframe(hed_ops=hed_ops,
-                                                return_string_only=False,
-                                                error_handler=error_handler, expand_defs=expand_defs,
-                                                **kwargs):
-                validation_issues += row_dict[model_constants.ROW_ISSUES]
+        for row_dict in self.iter_dataframe(hed_ops=hed_ops,
+                                            return_string_only=False,
+                                            error_handler=error_handler, expand_defs=expand_defs,
+                                            **kwargs):
+            validation_issues += row_dict[model_constants.ROW_ISSUES]
 
         return validation_issues
 
@@ -529,7 +520,14 @@ def extract_definitions(self, error_handler=None):
             error_handler = ErrorHandler()
         new_def_dict = DefinitionDict()
         hed_ops = [new_def_dict]
-        _ = self._run_validators(hed_ops, run_on_raw=True, error_handler=error_handler)
+        for _ in self.iter_dataframe(hed_ops=hed_ops,
+                                     return_string_only=False,
+                                     requested_columns=self._def_columns,
+                                     run_string_ops_on_columns=True,
+                                     remove_definitions=False,
+                                     error_handler=error_handler):
+            pass
+
         return new_def_dict
 
     def update_definition_mapper(self, def_dict):
diff --git a/hed/models/column_mapper.py b/hed/models/column_mapper.py
index f8b3198a2..9c11568ff 100644
--- a/hed/models/column_mapper.py
+++ b/hed/models/column_mapper.py
@@ -18,7 +18,7 @@ class ColumnMapper:
         - Functions and variables column and row indexing starts at 0.
     """
     def __init__(self, sidecar=None, tag_columns=None, column_prefix_dictionary=None,
-                 attribute_columns=None, optional_tag_columns=None):
+                 attribute_columns=None, optional_tag_columns=None, requested_columns=None):
         """ Constructor for ColumnMapper.
 
         Args:
@@ -45,8 +45,10 @@ def __init__(self, sidecar=None, tag_columns=None, column_prefix_dictionary=None
         self.column_data = {}
         # Maps column number to column_entry.  This is what's actually used by most code.
         self._final_column_map = {}
+        self._no_mapping_info = True
 
         self._column_map = None
+        self._requested_columns = []
         self._tag_columns = []
         self._optional_tag_columns = []
         self._column_prefix_dictionary = {}
@@ -57,6 +59,7 @@ def __init__(self, sidecar=None, tag_columns=None, column_prefix_dictionary=None
         self._set_sidecar(sidecar)
         self.add_columns(attribute_columns)
 
+        self.set_requested_columns(requested_columns, False)
         self.set_tag_columns(tag_columns, optional_tag_columns, False)
         self.set_column_prefix_dict(column_prefix_dictionary, False)
 
@@ -125,6 +128,21 @@ def set_tag_columns(self, tag_columns=None, optional_tag_columns=None, finalize_
             return issues
         return []
 
+    def set_requested_columns(self, requested_columns, finalize_mapping=True):
+        """ Set to return only the columns listed in requested_columns
+
+        Args:
+            requested_columns(list or None): If this is not None, return ONLY these columns.  Names or numbers allowed.
+            finalize_mapping(bool): Finalize the mapping right now if True
+
+        Returns:
+            issues(list): Mapping issues from finalizing, or an empty list if finalize_mapping is False.
+        """
+        self._requested_columns = requested_columns
+        if finalize_mapping:
+            return self._finalize_mapping()
+        return []
+
     def set_column_map(self, new_column_map=None):
         """ Set the column number to name mapping.
 
@@ -174,7 +192,7 @@ def _expand_column(self, column_number, input_text):
         """
 
         # Default 1-1 mapping if we don't have specific behavior.
-        if not self._final_column_map:
+        if self._no_mapping_info:
             return HedString(input_text), False
 
         # If no entry, ignore this column.
@@ -300,14 +318,17 @@ def _finalize_mapping(self):
         self._final_column_map = {}
         found_named_tag_columns = {}
         all_tag_columns = self._tag_columns + self._optional_tag_columns
+        if self._requested_columns:
+            all_tag_columns += self._requested_columns
         self._finalize_mapping_issues = []
         if self._column_map is not None:
             for column_number, column_name in self._column_map.items():
-                if column_name in self.column_data:
+                name_requested = self._column_name_requested(column_name)
+                if name_requested and column_name in self.column_data:
                     column_entry = self.column_data[column_name]
                     column_entry.column_name = column_name
                     self._final_column_map[column_number] = column_entry
-                elif column_name in all_tag_columns:
+                elif name_requested and column_name in all_tag_columns:
                     found_named_tag_columns[column_name] = column_number
                 elif column_name.startswith(PANDAS_COLUMN_PREFIX_TO_IGNORE):
                     continue
@@ -318,13 +339,16 @@ def _finalize_mapping(self):
 
         # Add any numbered columns
         for column_name, column_entry in self.column_data.items():
-            if isinstance(column_name, int):
+            if isinstance(column_name, int) and self._column_name_requested(column_name):
                 # Convert to internal numbering format
                 column_number = column_name
                 self._final_column_map[column_number] = column_entry
 
         # Add any tag columns
         for column_number in all_tag_columns:
+            name_requested = self._column_name_requested(column_number)
+            if not name_requested:
+                continue
             if isinstance(column_number, int):
                 if column_number not in self._final_column_map:
                     self._final_column_map[column_number] = ColumnMetadata(ColumnType.HEDTags, column_number)
@@ -340,8 +364,14 @@ def _finalize_mapping(self):
         for column_number, prefix in self._column_prefix_dictionary.items():
             self._set_column_prefix(column_number, prefix)
 
+        self._no_mapping_info = self._requested_columns is None and not self._final_column_map
         return self._finalize_mapping_issues
 
+    def _column_name_requested(self, column_name):
+        if self._requested_columns is None:
+            return True
+        return column_name in self._requested_columns
+
     def get_def_dicts(self):
         """ Return def dicts from every column description.
 
diff --git a/hed/models/tabular_input.py b/hed/models/tabular_input.py
index 5efc28614..42076ccbc 100644
--- a/hed/models/tabular_input.py
+++ b/hed/models/tabular_input.py
@@ -34,13 +34,14 @@ def __init__(self, file=None, sidecar=None, attribute_columns=None, extra_def_di
         new_mapper = ColumnMapper(sidecar=sidecar, optional_tag_columns=[self.HED_COLUMN_NAME],
                                   attribute_columns=attribute_columns)
 
+        definition_columns = [self.HED_COLUMN_NAME]
         self._sidecar = sidecar
         self._also_gather_defs = also_gather_defs
         self._extra_def_dicts = extra_def_dicts
         def_mapper = self.create_def_mapper(new_mapper, extra_def_dicts)
 
         super().__init__(file, file_type=".tsv", worksheet_name=None, has_column_names=True, mapper=new_mapper,
-                         def_mapper=def_mapper, name=name)
+                         def_mapper=def_mapper, name=name, definition_columns=definition_columns)
 
         if not self._has_column_names:
             raise ValueError("You are attempting to open a bids_old style file with no column headers provided.\n"
diff --git a/hed/schema/hed_cache.py b/hed/schema/hed_cache.py
index 57f34900c..5a7cf1525 100644
--- a/hed/schema/hed_cache.py
+++ b/hed/schema/hed_cache.py
@@ -10,7 +10,7 @@
 from semantic_version import Version
 import portalocker
 import time
-from hed.schema.schema_util import url_to_file
+from hed.schema.schema_io.schema_util import url_to_file
 
 """Infrastructure for caching HED schema from remote repositories."""
 
@@ -128,13 +128,18 @@ def cache_specific_url(hed_xml_url, xml_version=None, library_name=None, cache_f
     filename = hed_xml_url.split('/')[-1]
     cache_filename = os.path.join(cache_folder, filename)
 
+    return _cache_specific_url(hed_xml_url, cache_filename)
+
+
+def _cache_specific_url(hed_xml_url, cache_filename):
+    cache_folder = cache_filename.rpartition("/")[0]
     os.makedirs(cache_folder, exist_ok=True)
     temp_hed_xml_file = url_to_file(hed_xml_url)
     if temp_hed_xml_file:
-        cache_filename = _safe_copy_tmp_to_folder(temp_hed_xml_file, cache_filename)
+        cache_filename = _safe_move_tmp_to_folder(temp_hed_xml_file, cache_filename)
+        os.remove(temp_hed_xml_file)
         return cache_filename
-    else:
-        return None
+    return None
 
 
 def get_hed_version_path(xml_version=None, library_name=None, local_hed_directory=None):
@@ -184,7 +189,7 @@ def get_path_from_hed_version(hed_version, library_name=None, local_hed_director
 
 
 def cache_xml_versions(hed_base_urls=DEFAULT_URL_LIST, skip_folders=DEFAULT_SKIP_FOLDERS, cache_folder=None):
-    """ Cache a file from a URL.
+    """ Cache all schemas at the given URLs.
 
     Args:
         hed_base_urls (str or list): Path or list of paths.
@@ -298,6 +303,22 @@ def _sort_version_list(hed_versions):
 
 def _get_hed_xml_versions_from_url(hed_base_url, library_name=None,
                                    skip_folders=DEFAULT_SKIP_FOLDERS, get_libraries=False):
+    """ Get all available schemas and their hash values
+
+    Args:
+        hed_base_url (str): A single GitHub API url to cache
+        library_name(str or None): If str, cache only the named library schemas
+        skip_folders (list): A list of subfolders to skip over when downloading.
+        get_libraries (bool): If true, return a dictionary of version numbers, with an entry for each library name.
+
+    Returns:
+        list or dict: List of version numbers or dictionary {library_name: [versions]}.
+
+        - The Default skip_folders is 'deprecated'.
+        - The HED cache folder defaults to HED_CACHE_DIRECTORY.
+        - The directories on Github are of the form:
+            https://api.github.com/repos/hed-standard/hed-specification/contents/hedxml
+    """
     url_request = urllib.request.urlopen(hed_base_url)
     url_data = str(url_request.read(), 'utf-8')
     loaded_json = json.loads(url_data)
@@ -375,7 +396,7 @@ def _calculate_sha1(filename):
         return None
 
 
-def _safe_copy_tmp_to_folder(temp_hed_xml_file, dest_filename):
+def _safe_move_tmp_to_folder(temp_hed_xml_file, dest_filename):
     """ Copy to destination folder and rename.
 
     Args:
@@ -385,16 +406,12 @@ def _safe_copy_tmp_to_folder(temp_hed_xml_file, dest_filename):
     Returns:
         dest_filename (str): The new filename on success or None on failure.
 
-    Notes:
-        The file will be deleted on a successful copy.
-
     """
     _, temp_xml_file = os.path.split(temp_hed_xml_file)
     dest_folder, _ = os.path.split(dest_filename)
 
     temp_filename_in_cache = os.path.join(dest_folder, temp_xml_file)
     copyfile(temp_hed_xml_file, temp_filename_in_cache)
-    os.remove(temp_hed_xml_file)
     try:
         os.replace(temp_filename_in_cache, dest_filename)
     except OSError:
@@ -413,13 +430,7 @@ def _cache_hed_version(version, library_name, version_info, cache_folder):
     if sha_hash == local_sha_hash:
         return possible_cache_filename
 
-    os.makedirs(cache_folder, exist_ok=True)
-    temp_hed_xml_file = url_to_file(download_url)
-    if temp_hed_xml_file:
-        cache_filename = _safe_copy_tmp_to_folder(temp_hed_xml_file, possible_cache_filename)
-        return cache_filename
-    else:
-        return None
+    return _cache_specific_url(download_url, possible_cache_filename)
 
 
 def _get_latest_semantic_version_in_list(semantic_version_list):
diff --git a/hed/schema/hed_schema.py b/hed/schema/hed_schema.py
index 772ccc307..063a58b4d 100644
--- a/hed/schema/hed_schema.py
+++ b/hed/schema/hed_schema.py
@@ -1,9 +1,10 @@
 
 from hed.schema.hed_schema_constants import HedKey, HedSectionKey
+from hed.schema.schema_io import schema_util
 from hed.schema.schema_io.schema2xml import HedSchema2XML
 from hed.schema.schema_io.schema2wiki import HedSchema2Wiki
 
-from hed.schema import schema_validation_util, schema_util
+from hed.schema import schema_validation_util
 from hed.schema.hed_schema_section import HedSchemaSection, HedSchemaTagSection
 from hed.errors import ErrorHandler
 from hed.errors.error_types import ValidationErrors
@@ -15,10 +16,7 @@ class HedSchema:
     def __init__(self):
         """ Constructor for the HedSchema class.
 
-
-        Returns:
-            HedSchema:  The constructed HED schema.
-
+            A HedSchema can be used for validation, checking tag attributes, parsing tags, etc.
         """
         self._has_duplicate_tags = False
         self.header_attributes = {}
@@ -31,7 +29,6 @@ def __init__(self):
         self._library_prefix = ""
 
         self._sections = self._create_empty_sections()
-        self.short_tag_mapping = {}
 
     # ===============================================
     # Basic schema properties
@@ -42,7 +39,6 @@ def filename(self):
 
         Returns:
             str: The filename of this schema.
-
         """
         return self._filename
 
@@ -51,7 +47,7 @@ def filename(self, value):
         """ Set the filename, if one has not already been set.
 
         Args:
-         value (str): The source filename for this file
+            value (str): The source filename for this file
         """
         if self._filename is None:
             self._filename = value
@@ -103,7 +99,7 @@ def valid_prefixes(self):
         Notes:
             - The return value is always length 1 if using a HedSchema.
         """
-        return list(self._library_prefix)
+        return [self._library_prefix]
 
     # ===============================================
     # Creation and saving functions
@@ -381,7 +377,7 @@ def find_tag_entry(self, tag, library_prefix=""):
             list: A list of errors while converting.
 
         Notes:
-            Works right to left (which is mostly relevant for errors).
+            Works left to right (which is mostly relevant for errors).
 
         """
         clean_tag = str(tag)
@@ -472,7 +468,7 @@ def _initialize_attributes(self, key_class):
             key_class (str): The section key for the section to update.
 
         """
-        self._sections[key_class].valid_attributes = self._get_attributes_for_class(key_class)
+        self._sections[key_class].valid_attributes = self._get_attributes_for_section(key_class)
 
     # ===============================================
     # Getters used to write out schema primarily.
@@ -624,7 +620,7 @@ def get_modifiers_for_unit(self, unit):
         valid_modifiers = self.unit_modifiers.get_entries_with_attribute(modifier_attribute_name)
         return valid_modifiers
 
-    def _get_attributes_for_class(self, key_class):
+    def _get_attributes_for_section(self, key_class):
         """ Return the valid attributes for this section.
 
         Args:
diff --git a/hed/schema/hed_schema_io.py b/hed/schema/hed_schema_io.py
index 7c2f67d99..1f65b9247 100644
--- a/hed/schema/hed_schema_io.py
+++ b/hed/schema/hed_schema_io.py
@@ -4,9 +4,10 @@
 
 from hed.schema.schema_io.xml2schema import HedSchemaXMLParser
 from hed.schema.schema_io.wiki2schema import HedSchemaWikiParser
-from hed.schema import hed_schema_constants, hed_cache, schema_util
+from hed.schema import hed_schema_constants, hed_cache
 
 from hed.errors.exceptions import HedFileError, HedExceptions
+from hed.schema.schema_io import schema_util
 
 
 def from_string(schema_string, file_type=".xml", library_prefix=None):
diff --git a/hed/schema/schema_util.py b/hed/schema/schema_io/schema_util.py
similarity index 67%
rename from hed/schema/schema_util.py
rename to hed/schema/schema_io/schema_util.py
index 8dfc8ea87..5b9b08c13 100644
--- a/hed/schema/schema_util.py
+++ b/hed/schema/schema_io/schema_util.py
@@ -6,33 +6,6 @@
 from xml.dom import minidom
 from xml.etree import ElementTree
 
-NO_VERSION_INFO_STRING = "No version info found"
-
-
-def get_version_from_xml(hed_xml_tree):
-    """Get version from root node of an XML tree.
-
-        Args:
-            hed_xml_tree (Element):  The root node of an XML tree.
-
-        Returns:
-            str: The version of the HED schema (e.g. "8.0.0").
-
-        Raises:
-            KeyError or AttributeError: If invalid.
-
-        TODO: This should be moved to the schema module
-
-    """
-
-    if hed_xml_tree is None:
-        return NO_VERSION_INFO_STRING
-
-    try:
-        return hed_xml_tree.attrib['version']
-    except KeyError or AttributeError:
-        return NO_VERSION_INFO_STRING
-
 
 def url_to_file(resource_url):
     """ Write data from a URL resource into a file. Data is decoded as unicode.
@@ -65,22 +38,6 @@ def url_to_string(resource_url):
     return url_data
 
 
-def write_errors_to_file(issues, extension=".txt"):
-    """ Write an array of issue dictionaries to a temporary file.
-
-    Args:
-        issues (list):    List of 2-element dictionaries containing code and message keys.
-        extension (str):  Desired file extension.
-
-    Returns:
-        str: The name of the temporary file.
-    """
-    with tempfile.NamedTemporaryFile(suffix=extension, mode='w', delete=False, encoding='utf-8') as error_file:
-        for line in issues:
-            error_file.write(f"{line['code']}: {line['message']}\n")
-        return error_file.name
-
-
 def write_strings_to_file(output_strings, extension=None):
     """ Write output strings to a temporary file.
 
@@ -111,8 +68,6 @@ def write_xml_tree_2_xml_file(xml_tree, extension=".xml"):
     Returns:
         str:  Name of the temporary file.
 
-    TODO:  Should this be in the schema module?
-
     """
     with tempfile.NamedTemporaryFile(suffix=extension, mode='w', delete=False, encoding='utf-8') as hed_xml_file:
         xml_string = _xml_element_2_str(xml_tree)
@@ -129,8 +84,6 @@ def _xml_element_2_str(elem):
     Returns:
         str: An XML string representing the XML element.
 
-    TODO: Shouldn't this be with the schema?
-
     """
     rough_string = ElementTree.tostring(elem, method='xml')
     reparsed = minidom.parseString(rough_string)
diff --git a/tests/schema/test_schema_util.py b/tests/schema/test_schema_util.py
index 2331fce4a..fc75d4918 100644
--- a/tests/schema/test_schema_util.py
+++ b/tests/schema/test_schema_util.py
@@ -1,7 +1,7 @@
 import unittest
 import os
 
-from hed.schema import schema_util
+from hed.schema.schema_io import schema_util
 
 
 class Test(unittest.TestCase):