Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add requested_columns to column mapper. Only check specific columns for definitions #505

Merged
merged 1 commit
Jun 12, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 39 additions & 41 deletions hed/models/base_input.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ class BaseInput:
COMMA_DELIMITER = ','

def __init__(self, file, file_type=None, worksheet_name=None, has_column_names=True, mapper=None, def_mapper=None,
name=None):
definition_columns=None, name=None):
""" Constructor for the BaseInput class.

Args:
Expand All @@ -39,6 +39,7 @@ def __init__(self, file, file_type=None, worksheet_name=None, has_column_names=T
(Not applicable to tsv files.)
has_column_names (bool): True if file has column names.
mapper (ColumnMapper or None): Indicates which columns have HED tags.
definition_columns(list or None): A list of columns to check for definitions. Explicit 'None' means all.
name (str or None): Optional field for how this file will report errors.

Notes:
Expand All @@ -54,6 +55,7 @@ def __init__(self, file, file_type=None, worksheet_name=None, has_column_names=T
# This is the loaded workbook if we loaded originally from an excel file.
self._loaded_workbook = None
self._worksheet_name = worksheet_name
self._def_columns = definition_columns
self.file_def_dict = None
pandas_header = 0
if not self._has_column_names:
Expand Down Expand Up @@ -234,43 +236,34 @@ def to_csv(self, file=None, output_processed_file=False):
header=output_file._has_column_names)
return csv_string_if_filename_none

def __iter__(self):
""" Iterate over the underlying dataframe. """
return self.iter_dataframe()

def iter_raw(self, hed_ops=None, error_handler=None, **kwargs):
""" Iterate all columns without substitutions

Args:
hed_ops (list, func, HedOps, or None): A func, a HedOps or a list of these to apply to the
hed strings before returning.
error_handler (ErrorHandler or None): Handler to use for context or a default one if None.
kwargs:

Yields:
- dict: A dict with column_number keys and values corresponding to the cell at that position.
@property
def columns(self):
    """ Return a list of the column names.

        Empty if the file has no column names (or no dataframe is loaded).

    Returns:
        columns (list): The column names.
    """
    columns = []
    # Only report names when a dataframe is loaded AND it was opened with headers.
    if self._dataframe is not None and self._has_column_names:
        columns = list(self._dataframe.columns)
    return columns
def __iter__(self):
    """ Iterate over the rows of the underlying dataframe.

        Delegates to iter_dataframe() with all default settings.
    """
    return self.iter_dataframe()

def iter_dataframe(self, hed_ops=None, mapper=None, return_string_only=True, run_string_ops_on_columns=False,
error_handler=None, expand_defs=False, remove_definitions=True, **kwargs):
def iter_dataframe(self, hed_ops=None, mapper=None, requested_columns=None, return_string_only=True,
run_string_ops_on_columns=False, error_handler=None, expand_defs=False, remove_definitions=True,
**kwargs):
""" Iterate rows based on the given column mapper.

Args:
hed_ops (list, func, HedOps, or None): A func, a HedOps or a list of these to apply to the
hed strings before returning.
mapper (ColumnMapper or None): The column name to column number mapper (or internal mapper if None).
requested_columns(list or None): If this is not None, return ONLY these columns. Names or numbers allowed.
return_string_only (bool): If True, do not return issues list, individual columns, attribute columns, etc.
run_string_ops_on_columns (bool): If true, run all tag and string ops on columns,
rather than columns then rows.
Expand All @@ -289,6 +282,11 @@ def iter_dataframe(self, hed_ops=None, mapper=None, return_string_only=True, run
if mapper is None:
mapper = self._mapper

if requested_columns:
# Make a copy to ensure we don't alter the actual mapper
mapper = copy.deepcopy(mapper)
mapper.set_requested_columns(requested_columns)

tag_funcs, string_funcs = self._translate_ops(hed_ops, run_string_ops_on_columns=run_string_ops_on_columns,
expand_defs=expand_defs, remove_definitions=remove_definitions,
error_handler=error_handler, **kwargs)
Expand Down Expand Up @@ -438,20 +436,13 @@ def _get_dataframe_from_worksheet(worksheet, has_headers):
else:
return pandas.DataFrame(worksheet.values, dtype=str)

def _run_validators(self, hed_ops, error_handler, expand_defs=False, **kwargs):
    """ Run the given hed operations over every row and gather the issues.

    Args:
        hed_ops (list, func, or HedOps): A func, a HedOps, or a list of these
            to apply to the hed strings in each row.
        error_handler (ErrorHandler): The handler used for error context.
        expand_defs (bool): If True, expand definitions while validating.
        kwargs: Forwarded unchanged to iter_dataframe.

    Returns:
        list: The validation issues accumulated from every row.
    """
    validation_issues = []
    # return_string_only=False makes iter_dataframe yield full row dicts,
    # which include the per-row issues list under ROW_ISSUES.
    for row_dict in self.iter_dataframe(hed_ops=hed_ops,
                                        return_string_only=False,
                                        error_handler=error_handler, expand_defs=expand_defs,
                                        **kwargs):
        validation_issues += row_dict[model_constants.ROW_ISSUES]

    return validation_issues

Expand Down Expand Up @@ -529,7 +520,14 @@ def extract_definitions(self, error_handler=None):
error_handler = ErrorHandler()
new_def_dict = DefinitionDict()
hed_ops = [new_def_dict]
_ = self._run_validators(hed_ops, run_on_raw=True, error_handler=error_handler)
for _ in self.iter_dataframe(hed_ops=hed_ops,
return_string_only=False,
requested_columns=self._def_columns,
run_string_ops_on_columns=True,
remove_definitions=False,
error_handler=error_handler):
pass

return new_def_dict

def update_definition_mapper(self, def_dict):
Expand Down
40 changes: 35 additions & 5 deletions hed/models/column_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ class ColumnMapper:
- Functions and variables column and row indexing starts at 0.
"""
def __init__(self, sidecar=None, tag_columns=None, column_prefix_dictionary=None,
attribute_columns=None, optional_tag_columns=None):
attribute_columns=None, optional_tag_columns=None, requested_columns=None):
""" Constructor for ColumnMapper.

Args:
Expand All @@ -45,8 +45,10 @@ def __init__(self, sidecar=None, tag_columns=None, column_prefix_dictionary=None
self.column_data = {}
# Maps column number to column_entry. This is what's actually used by most code.
self._final_column_map = {}
self._no_mapping_info = True

self._column_map = None
self._requested_columns = []
self._tag_columns = []
self._optional_tag_columns = []
self._column_prefix_dictionary = {}
Expand All @@ -57,6 +59,7 @@ def __init__(self, sidecar=None, tag_columns=None, column_prefix_dictionary=None
self._set_sidecar(sidecar)
self.add_columns(attribute_columns)

self.set_requested_columns(requested_columns, False)
self.set_tag_columns(tag_columns, optional_tag_columns, False)
self.set_column_prefix_dict(column_prefix_dictionary, False)

Expand Down Expand Up @@ -125,6 +128,21 @@ def set_tag_columns(self, tag_columns=None, optional_tag_columns=None, finalize_
return issues
return []

def set_requested_columns(self, requested_columns, finalize_mapping=True):
    """ Restrict the mapping to return only the listed columns.

    Args:
        requested_columns (list or None): If this is not None, return ONLY these columns.
            Names or numbers allowed.
        finalize_mapping (bool): Finalize the mapping right now if True.

    Returns:
        issues (list): Mapping issues from finalizing, otherwise an empty list.
    """
    self._requested_columns = requested_columns
    return self._finalize_mapping() if finalize_mapping else []

def set_column_map(self, new_column_map=None):
""" Set the column number to name mapping.

Expand Down Expand Up @@ -174,7 +192,7 @@ def _expand_column(self, column_number, input_text):
"""

# Default 1-1 mapping if we don't have specific behavior.
if not self._final_column_map:
if self._no_mapping_info:
return HedString(input_text), False

# If no entry, ignore this column.
Expand Down Expand Up @@ -300,14 +318,17 @@ def _finalize_mapping(self):
self._final_column_map = {}
found_named_tag_columns = {}
all_tag_columns = self._tag_columns + self._optional_tag_columns
if self._requested_columns:
all_tag_columns += self._requested_columns
self._finalize_mapping_issues = []
if self._column_map is not None:
for column_number, column_name in self._column_map.items():
if column_name in self.column_data:
name_requested = self._column_name_requested(column_name)
if name_requested and column_name in self.column_data:
column_entry = self.column_data[column_name]
column_entry.column_name = column_name
self._final_column_map[column_number] = column_entry
elif column_name in all_tag_columns:
elif name_requested and column_name in all_tag_columns:
found_named_tag_columns[column_name] = column_number
elif column_name.startswith(PANDAS_COLUMN_PREFIX_TO_IGNORE):
continue
Expand All @@ -318,13 +339,16 @@ def _finalize_mapping(self):

# Add any numbered columns
for column_name, column_entry in self.column_data.items():
if isinstance(column_name, int):
if isinstance(column_name, int) and self._column_name_requested(column_name):
# Convert to internal numbering format
column_number = column_name
self._final_column_map[column_number] = column_entry

# Add any tag columns
for column_number in all_tag_columns:
name_requested = self._column_name_requested(column_number)
if not name_requested:
continue
if isinstance(column_number, int):
if column_number not in self._final_column_map:
self._final_column_map[column_number] = ColumnMetadata(ColumnType.HEDTags, column_number)
Expand All @@ -340,8 +364,14 @@ def _finalize_mapping(self):
for column_number, prefix in self._column_prefix_dictionary.items():
self._set_column_prefix(column_number, prefix)

self._no_mapping_info = self._requested_columns is None and not self._final_column_map
return self._finalize_mapping_issues

def _column_name_requested(self, column_name):
if self._requested_columns is None:
return True
return column_name in self._requested_columns

def get_def_dicts(self):
""" Return def dicts from every column description.

Expand Down
3 changes: 2 additions & 1 deletion hed/models/tabular_input.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,13 +34,14 @@ def __init__(self, file=None, sidecar=None, attribute_columns=None, extra_def_di
new_mapper = ColumnMapper(sidecar=sidecar, optional_tag_columns=[self.HED_COLUMN_NAME],
attribute_columns=attribute_columns)

definition_columns = [self.HED_COLUMN_NAME]
self._sidecar = sidecar
self._also_gather_defs = also_gather_defs
self._extra_def_dicts = extra_def_dicts
def_mapper = self.create_def_mapper(new_mapper, extra_def_dicts)

super().__init__(file, file_type=".tsv", worksheet_name=None, has_column_names=True, mapper=new_mapper,
def_mapper=def_mapper, name=name)
def_mapper=def_mapper, name=name, definition_columns=definition_columns)

if not self._has_column_names:
raise ValueError("You are attempting to open a bids_old style file with no column headers provided.\n"
Expand Down
45 changes: 28 additions & 17 deletions hed/schema/hed_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from semantic_version import Version
import portalocker
import time
from hed.schema.schema_util import url_to_file
from hed.schema.schema_io.schema_util import url_to_file

"""Infrastructure for caching HED schema from remote repositories."""

Expand Down Expand Up @@ -128,13 +128,18 @@ def cache_specific_url(hed_xml_url, xml_version=None, library_name=None, cache_f
filename = hed_xml_url.split('/')[-1]
cache_filename = os.path.join(cache_folder, filename)

return _cache_specific_url(hed_xml_url, cache_filename)


def _cache_specific_url(hed_xml_url, cache_filename):
    """ Download the schema at hed_xml_url and store it at cache_filename.

    Args:
        hed_xml_url (str): The URL to download the schema from.
        cache_filename (str): The full path where the cached copy should end up.

    Returns:
        str or None: The cached filename on success, or None if the download failed.
    """
    cache_folder = cache_filename.rpartition("/")[0]
    os.makedirs(cache_folder, exist_ok=True)
    temp_hed_xml_file = url_to_file(hed_xml_url)
    if temp_hed_xml_file:
        # _safe_move_tmp_to_folder copies into the cache folder; the temp
        # download is our responsibility to delete afterwards.
        cache_filename = _safe_move_tmp_to_folder(temp_hed_xml_file, cache_filename)
        os.remove(temp_hed_xml_file)
        return cache_filename
    return None


def get_hed_version_path(xml_version=None, library_name=None, local_hed_directory=None):
Expand Down Expand Up @@ -184,7 +189,7 @@ def get_path_from_hed_version(hed_version, library_name=None, local_hed_director


def cache_xml_versions(hed_base_urls=DEFAULT_URL_LIST, skip_folders=DEFAULT_SKIP_FOLDERS, cache_folder=None):
""" Cache a file from a URL.
""" Cache all schemas at the given URLs.

Args:
hed_base_urls (str or list): Path or list of paths.
Expand Down Expand Up @@ -298,6 +303,22 @@ def _sort_version_list(hed_versions):

def _get_hed_xml_versions_from_url(hed_base_url, library_name=None,
skip_folders=DEFAULT_SKIP_FOLDERS, get_libraries=False):
""" Get all available schemas and their hash values

Args:
hed_base_url (str): A single GitHub API url to cache
library_name(str or None): If str, cache only the named library schemas
skip_folders (list): A list of subfolders to skip over when downloading.
get_libraries (bool): If true, return a dictionary of version numbers, with an entry for each library name.

Returns:
list or dict: List of version numbers or dictionary {library_name: [versions]}.

- The Default skip_folders is 'deprecated'.
- The HED cache folder defaults to HED_CACHE_DIRECTORY.
- The directories on Github are of the form:
https://api.github.com/repos/hed-standard/hed-specification/contents/hedxml
"""
url_request = urllib.request.urlopen(hed_base_url)
url_data = str(url_request.read(), 'utf-8')
loaded_json = json.loads(url_data)
Expand Down Expand Up @@ -375,7 +396,7 @@ def _calculate_sha1(filename):
return None


def _safe_copy_tmp_to_folder(temp_hed_xml_file, dest_filename):
def _safe_move_tmp_to_folder(temp_hed_xml_file, dest_filename):
""" Copy to destination folder and rename.

Args:
Expand All @@ -385,16 +406,12 @@ def _safe_copy_tmp_to_folder(temp_hed_xml_file, dest_filename):
Returns:
dest_filename (str): The new filename on success or None on failure.

Notes:
The file will be deleted on a successful copy.

"""
_, temp_xml_file = os.path.split(temp_hed_xml_file)
dest_folder, _ = os.path.split(dest_filename)

temp_filename_in_cache = os.path.join(dest_folder, temp_xml_file)
copyfile(temp_hed_xml_file, temp_filename_in_cache)
os.remove(temp_hed_xml_file)
try:
os.replace(temp_filename_in_cache, dest_filename)
except OSError:
Expand All @@ -413,13 +430,7 @@ def _cache_hed_version(version, library_name, version_info, cache_folder):
if sha_hash == local_sha_hash:
return possible_cache_filename

os.makedirs(cache_folder, exist_ok=True)
temp_hed_xml_file = url_to_file(download_url)
if temp_hed_xml_file:
cache_filename = _safe_copy_tmp_to_folder(temp_hed_xml_file, possible_cache_filename)
return cache_filename
else:
return None
return _cache_specific_url(download_url, possible_cache_filename)


def _get_latest_semantic_version_in_list(semantic_version_list):
Expand Down
Loading