Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add requested_columns to column mapper. Only check specific columns for definitions #505

Merged
merged 1 commit
Jun 12, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 39 additions & 41 deletions hed/models/base_input.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ class BaseInput:
COMMA_DELIMITER = ','

def __init__(self, file, file_type=None, worksheet_name=None, has_column_names=True, mapper=None, def_mapper=None,
name=None):
definition_columns=None, name=None):
""" Constructor for the BaseInput class.

Args:
Expand All @@ -39,6 +39,7 @@ def __init__(self, file, file_type=None, worksheet_name=None, has_column_names=T
(Not applicable to tsv files.)
has_column_names (bool): True if file has column names.
mapper (ColumnMapper or None): Indicates which columns have HED tags.
definition_columns(list or None): A list of columns to check for definitions. Explicit 'None' means all.
name (str or None): Optional field for how this file will report errors.

Notes:
Expand All @@ -54,6 +55,7 @@ def __init__(self, file, file_type=None, worksheet_name=None, has_column_names=T
# This is the loaded workbook if we loaded originally from an excel file.
self._loaded_workbook = None
self._worksheet_name = worksheet_name
self._def_columns = definition_columns
self.file_def_dict = None
pandas_header = 0
if not self._has_column_names:
Expand Down Expand Up @@ -234,43 +236,34 @@ def to_csv(self, file=None, output_processed_file=False):
header=output_file._has_column_names)
return csv_string_if_filename_none

def __iter__(self):
""" Iterate over the underlying dataframe. """
return self.iter_dataframe()

def iter_raw(self, hed_ops=None, error_handler=None, **kwargs):
""" Iterate all columns without substitutions

Args:
hed_ops (list, func, HedOps, or None): A func, a HedOps or a list of these to apply to the
hed strings before returning.
error_handler (ErrorHandler or None): Handler to use for context or a default one if None.
kwargs:

Yields:
- dict: A dict with column_number keys and values corresponding to the cell at that position.
@property
def columns(self):
    """ Return a list of the column names.

        Empty if the file has no column names (or no dataframe is loaded).

    Returns:
        columns (list): The column names.
    """
    columns = []
    # Only report names when a dataframe is loaded AND it was opened with headers.
    if self._dataframe is not None and self._has_column_names:
        columns = list(self._dataframe.columns)
    return columns
def __iter__(self):
    """ Iterate over the rows of the underlying dataframe.

        Delegates to iter_dataframe() with all default settings.
    """
    return self.iter_dataframe()

def iter_dataframe(self, hed_ops=None, mapper=None, return_string_only=True, run_string_ops_on_columns=False,
error_handler=None, expand_defs=False, remove_definitions=True, **kwargs):
def iter_dataframe(self, hed_ops=None, mapper=None, requested_columns=None, return_string_only=True,
run_string_ops_on_columns=False, error_handler=None, expand_defs=False, remove_definitions=True,
**kwargs):
""" Iterate rows based on the given column mapper.

Args:
hed_ops (list, func, HedOps, or None): A func, a HedOps or a list of these to apply to the
hed strings before returning.
mapper (ColumnMapper or None): The column name to column number mapper (or internal mapper if None).
requested_columns(list or None): If this is not None, return ONLY these columns. Names or numbers allowed.
return_string_only (bool): If True, do not return issues list, individual columns, attribute columns, etc.
run_string_ops_on_columns (bool): If true, run all tag and string ops on columns,
rather than columns then rows.
Expand All @@ -289,6 +282,11 @@ def iter_dataframe(self, hed_ops=None, mapper=None, return_string_only=True, run
if mapper is None:
mapper = self._mapper

if requested_columns:
# Make a copy to ensure we don't alter the actual mapper
mapper = copy.deepcopy(mapper)
mapper.set_requested_columns(requested_columns)

tag_funcs, string_funcs = self._translate_ops(hed_ops, run_string_ops_on_columns=run_string_ops_on_columns,
expand_defs=expand_defs, remove_definitions=remove_definitions,
error_handler=error_handler, **kwargs)
Expand Down Expand Up @@ -438,20 +436,13 @@ def _get_dataframe_from_worksheet(worksheet, has_headers):
else:
return pandas.DataFrame(worksheet.values, dtype=str)

def _run_validators(self, hed_ops, error_handler, expand_defs=False, **kwargs):
    """ Run the given hed operations over every row and gather the issues.

    Args:
        hed_ops (list, func, or HedOps): A func, a HedOps, or a list of these
            to apply to the hed strings in each row.
        error_handler (ErrorHandler): The handler used for error context.
        expand_defs (bool): If True, expand definitions while validating.
        kwargs: Forwarded unchanged to iter_dataframe.

    Returns:
        list: The validation issues accumulated from every row.
    """
    validation_issues = []
    # return_string_only=False makes iter_dataframe yield full row dicts,
    # which include the per-row issues list under ROW_ISSUES.
    for row_dict in self.iter_dataframe(hed_ops=hed_ops,
                                        return_string_only=False,
                                        error_handler=error_handler, expand_defs=expand_defs,
                                        **kwargs):
        validation_issues += row_dict[model_constants.ROW_ISSUES]

    return validation_issues

Expand Down Expand Up @@ -529,7 +520,14 @@ def extract_definitions(self, error_handler=None):
error_handler = ErrorHandler()
new_def_dict = DefinitionDict()
hed_ops = [new_def_dict]
_ = self._run_validators(hed_ops, run_on_raw=True, error_handler=error_handler)
for _ in self.iter_dataframe(hed_ops=hed_ops,
return_string_only=False,
requested_columns=self._def_columns,
run_string_ops_on_columns=True,
remove_definitions=False,
error_handler=error_handler):
pass

return new_def_dict

def update_definition_mapper(self, def_dict):
Expand Down
40 changes: 35 additions & 5 deletions hed/models/column_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ class ColumnMapper:
- Functions and variables column and row indexing starts at 0.
"""
def __init__(self, sidecar=None, tag_columns=None, column_prefix_dictionary=None,
attribute_columns=None, optional_tag_columns=None):
attribute_columns=None, optional_tag_columns=None, requested_columns=None):
""" Constructor for ColumnMapper.

Args:
Expand All @@ -45,8 +45,10 @@ def __init__(self, sidecar=None, tag_columns=None, column_prefix_dictionary=None
self.column_data = {}
# Maps column number to column_entry. This is what's actually used by most code.
self._final_column_map = {}
self._no_mapping_info = True

self._column_map = None
self._requested_columns = []
self._tag_columns = []
self._optional_tag_columns = []
self._column_prefix_dictionary = {}
Expand All @@ -57,6 +59,7 @@ def __init__(self, sidecar=None, tag_columns=None, column_prefix_dictionary=None
self._set_sidecar(sidecar)
self.add_columns(attribute_columns)

self.set_requested_columns(requested_columns, False)
self.set_tag_columns(tag_columns, optional_tag_columns, False)
self.set_column_prefix_dict(column_prefix_dictionary, False)

Expand Down Expand Up @@ -125,6 +128,21 @@ def set_tag_columns(self, tag_columns=None, optional_tag_columns=None, finalize_
return issues
return []

def set_requested_columns(self, requested_columns, finalize_mapping=True):
    """ Restrict the mapping to return only the listed columns.

    Args:
        requested_columns (list or None): If this is not None, return ONLY these columns.
            Names or numbers allowed.
        finalize_mapping (bool): Finalize the mapping right now if True.

    Returns:
        issues (list): Mapping issues from finalizing, otherwise an empty list.
    """
    self._requested_columns = requested_columns
    return self._finalize_mapping() if finalize_mapping else []

def set_column_map(self, new_column_map=None):
""" Set the column number to name mapping.

Expand Down Expand Up @@ -174,7 +192,7 @@ def _expand_column(self, column_number, input_text):
"""

# Default 1-1 mapping if we don't have specific behavior.
if not self._final_column_map:
if self._no_mapping_info:
return HedString(input_text), False

# If no entry, ignore this column.
Expand Down Expand Up @@ -300,14 +318,17 @@ def _finalize_mapping(self):
self._final_column_map = {}
found_named_tag_columns = {}
all_tag_columns = self._tag_columns + self._optional_tag_columns
if self._requested_columns:
all_tag_columns += self._requested_columns
self._finalize_mapping_issues = []
if self._column_map is not None:
for column_number, column_name in self._column_map.items():
if column_name in self.column_data:
name_requested = self._column_name_requested(column_name)
if name_requested and column_name in self.column_data:
column_entry = self.column_data[column_name]
column_entry.column_name = column_name
self._final_column_map[column_number] = column_entry
elif column_name in all_tag_columns:
elif name_requested and column_name in all_tag_columns:
found_named_tag_columns[column_name] = column_number
elif column_name.startswith(PANDAS_COLUMN_PREFIX_TO_IGNORE):
continue
Expand All @@ -318,13 +339,16 @@ def _finalize_mapping(self):

# Add any numbered columns
for column_name, column_entry in self.column_data.items():
if isinstance(column_name, int):
if isinstance(column_name, int) and self._column_name_requested(column_name):
# Convert to internal numbering format
column_number = column_name
self._final_column_map[column_number] = column_entry

# Add any tag columns
for column_number in all_tag_columns:
name_requested = self._column_name_requested(column_number)
if not name_requested:
continue
if isinstance(column_number, int):
if column_number not in self._final_column_map:
self._final_column_map[column_number] = ColumnMetadata(ColumnType.HEDTags, column_number)
Expand All @@ -340,8 +364,14 @@ def _finalize_mapping(self):
for column_number, prefix in self._column_prefix_dictionary.items():
self._set_column_prefix(column_number, prefix)

self._no_mapping_info = self._requested_columns is None and not self._final_column_map
return self._finalize_mapping_issues

def _column_name_requested(self, column_name):
if self._requested_columns is None:
return True
return column_name in self._requested_columns

def get_def_dicts(self):
""" Return def dicts from every column description.

Expand Down
3 changes: 2 additions & 1 deletion hed/models/tabular_input.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,13 +34,14 @@ def __init__(self, file=None, sidecar=None, attribute_columns=None, extra_def_di
new_mapper = ColumnMapper(sidecar=sidecar, optional_tag_columns=[self.HED_COLUMN_NAME],
attribute_columns=attribute_columns)

definition_columns = [self.HED_COLUMN_NAME]
self._sidecar = sidecar
self._also_gather_defs = also_gather_defs
self._extra_def_dicts = extra_def_dicts
def_mapper = self.create_def_mapper(new_mapper, extra_def_dicts)

super().__init__(file, file_type=".tsv", worksheet_name=None, has_column_names=True, mapper=new_mapper,
def_mapper=def_mapper, name=name)
def_mapper=def_mapper, name=name, definition_columns=definition_columns)

if not self._has_column_names:
raise ValueError("You are attempting to open a bids_old style file with no column headers provided.\n"
Expand Down
45 changes: 28 additions & 17 deletions hed/schema/hed_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from semantic_version import Version
import portalocker
import time
from hed.schema.schema_util import url_to_file
from hed.schema.schema_io.schema_util import url_to_file

"""Infrastructure for caching HED schema from remote repositories."""

Expand Down Expand Up @@ -128,13 +128,18 @@ def cache_specific_url(hed_xml_url, xml_version=None, library_name=None, cache_f
filename = hed_xml_url.split('/')[-1]
cache_filename = os.path.join(cache_folder, filename)

return _cache_specific_url(hed_xml_url, cache_filename)


def _cache_specific_url(hed_xml_url, cache_filename):
    """ Download the schema at hed_xml_url and store it at cache_filename.

    Args:
        hed_xml_url (str): The URL to download the schema from.
        cache_filename (str): The full path where the cached copy should end up.

    Returns:
        str or None: The cached filename on success, or None if the download failed.
    """
    cache_folder = cache_filename.rpartition("/")[0]
    os.makedirs(cache_folder, exist_ok=True)
    temp_hed_xml_file = url_to_file(hed_xml_url)
    if temp_hed_xml_file:
        # _safe_move_tmp_to_folder copies into the cache folder; the temp
        # download is our responsibility to delete afterwards.
        cache_filename = _safe_move_tmp_to_folder(temp_hed_xml_file, cache_filename)
        os.remove(temp_hed_xml_file)
        return cache_filename
    return None


def get_hed_version_path(xml_version=None, library_name=None, local_hed_directory=None):
Expand Down Expand Up @@ -184,7 +189,7 @@ def get_path_from_hed_version(hed_version, library_name=None, local_hed_director


def cache_xml_versions(hed_base_urls=DEFAULT_URL_LIST, skip_folders=DEFAULT_SKIP_FOLDERS, cache_folder=None):
""" Cache a file from a URL.
""" Cache all schemas at the given URLs.

Args:
hed_base_urls (str or list): Path or list of paths.
Expand Down Expand Up @@ -298,6 +303,22 @@ def _sort_version_list(hed_versions):

def _get_hed_xml_versions_from_url(hed_base_url, library_name=None,
skip_folders=DEFAULT_SKIP_FOLDERS, get_libraries=False):
""" Get all available schemas and their hash values

Args:
hed_base_url (str): A single GitHub API url to cache
library_name(str or None): If str, cache only the named library schemas
skip_folders (list): A list of subfolders to skip over when downloading.
get_libraries (bool): If true, return a dictionary of version numbers, with an entry for each library name.

Returns:
list or dict: List of version numbers or dictionary {library_name: [versions]}.

- The Default skip_folders is 'deprecated'.
- The HED cache folder defaults to HED_CACHE_DIRECTORY.
- The directories on Github are of the form:
https://api.github.com/repos/hed-standard/hed-specification/contents/hedxml
"""
url_request = urllib.request.urlopen(hed_base_url)
url_data = str(url_request.read(), 'utf-8')
loaded_json = json.loads(url_data)
Expand Down Expand Up @@ -375,7 +396,7 @@ def _calculate_sha1(filename):
return None


def _safe_copy_tmp_to_folder(temp_hed_xml_file, dest_filename):
def _safe_move_tmp_to_folder(temp_hed_xml_file, dest_filename):
""" Copy to destination folder and rename.

Args:
Expand All @@ -385,16 +406,12 @@ def _safe_copy_tmp_to_folder(temp_hed_xml_file, dest_filename):
Returns:
dest_filename (str): The new filename on success or None on failure.

Notes:
The file will be deleted on a successful copy.

"""
_, temp_xml_file = os.path.split(temp_hed_xml_file)
dest_folder, _ = os.path.split(dest_filename)

temp_filename_in_cache = os.path.join(dest_folder, temp_xml_file)
copyfile(temp_hed_xml_file, temp_filename_in_cache)
os.remove(temp_hed_xml_file)
try:
os.replace(temp_filename_in_cache, dest_filename)
except OSError:
Expand All @@ -413,13 +430,7 @@ def _cache_hed_version(version, library_name, version_info, cache_folder):
if sha_hash == local_sha_hash:
return possible_cache_filename

os.makedirs(cache_folder, exist_ok=True)
temp_hed_xml_file = url_to_file(download_url)
if temp_hed_xml_file:
cache_filename = _safe_copy_tmp_to_folder(temp_hed_xml_file, possible_cache_filename)
return cache_filename
else:
return None
return _cache_specific_url(download_url, possible_cache_filename)


def _get_latest_semantic_version_in_list(semantic_version_list):
Expand Down
Loading