Merge pull request #955 from dandi/validator_fixes

Validator fixes: fallback to our version, skip dotdirs on windows, etc

yarikoptic committed Apr 19, 2022
2 parents 0b7cf59 + 1a6d835 commit b890375
Showing 2 changed files with 94 additions and 64 deletions.
134 changes: 84 additions & 50 deletions dandi/bids_validator_xs.py
@@ -4,7 +4,10 @@
 import os
 import re
 
-from .support.bids import schema, utils
+from . import utils
+from .support.bids import schema
 
+lgr = utils.get_logger()
+
 # The list of which entities create directories could be dynamically specified by the YAML, but for
 # now, it is not.
@@ -32,9 +35,9 @@ def _get_paths(bids_paths):
     input.
     """
     exclude_subdirs = [
-        "/.dandi",
-        "/.datalad",
-        "/.git",
+        rf"{os.sep}.dandi",
+        rf"{os.sep}.datalad",
+        rf"{os.sep}.git",
     ]
     # `.bidsignore` is not, in fact, a BIDS file, as per:
     # https://github.com/bids-standard/bids-specification/issues/980
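The switch from a hard-coded `/` to `os.sep` is what makes dot-directory skipping work on Windows, where `os.walk` yields backslash-separated paths. A minimal sketch of the idea (the substring test below is illustrative; the actual use of `exclude_subdirs` lies outside this hunk):

```python
import os

# Illustrative only: on Windows, walked roots look like r"C:\data\ds\.git\objects",
# so a hard-coded "/.git" never matches and dot-directories leak into validation.
root = r"C:\data\ds\.git\objects" if os.sep == "\\" else "/data/ds/.git/objects"
exclude_subdirs = [rf"{os.sep}.dandi", rf"{os.sep}.datalad", rf"{os.sep}.git"]
print(any(subdir in root for subdir in exclude_subdirs))  # True on both platforms
```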
@@ -107,29 +110,48 @@ def _add_entity(
     return regex_entities


+def _extension_safety(extension):
+    """
+    Making extensions formatting-safe.
+
+    Issues covered by this function are listed under “Notes”.
+
+    Parameters
+    ----------
+    extension : str
+        Extension string, as present in the BIDS YAML schema.
+
+    Returns
+    -------
+    str
+        Extension string, safe for use in validator regex formatting.
+
+    Notes
+    -----
+    * Bash-wildcard safety: https://github.com/bids-standard/bids-specification/issues/990
+    * Period safety: https://github.com/bids-standard/bids-specification/issues/1055
+    * Hopefully this function will be deprecated soon, but it will not break safe entries.
+    """
+    if extension == "None":
+        return ""
+    if "." in extension:
+        extension = extension.replace(".", "\\.")
+    if "*" in extension:
+        extension = extension.replace("*", ".*?")
+
+    return extension


 def _add_extensions(regex_string, variant):
     """Add extensions to a regex string."""
-    if len(variant["extensions"]) == 1:
-        # This only happens in `rules/datatypes/meg.yaml` once:
-        if variant["extensions"][0] == "*":
-            regex_extensions = ".*?"
-        else:
-            # Making it period-safe:
-            if variant["extensions"][0][0] == ".":
-                regex_extensions = variant["extensions"][0][1:]
-            else:
-                regex_extensions = variant["extensions"][0]
-    else:
-        # Making it period-safe:
-        fixed_variant_extensions = []
-        for variant_extension in variant["extensions"]:
-            if variant_extension[0] == ".":
-                fixed_variant_extensions.append(variant_extension[1:])
-            else:
-                fixed_variant_extensions.append(variant_extension)
-
+    fixed_variant_extensions = []
+    for variant_extension in variant["extensions"]:
+        variant_extension = _extension_safety(variant_extension)
+        fixed_variant_extensions.append(variant_extension)
+    if len(fixed_variant_extensions) > 1:
         regex_extensions = "({})".format("|".join(fixed_variant_extensions))
-    regex_string = f"{regex_string}\\.{regex_extensions}"
+    else:
+        regex_extensions = fixed_variant_extensions[0]
+    regex_string = f"{regex_string}{regex_extensions}"
 
     return regex_string
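Taken together, extensions now carry their own escaped period into the regex, instead of `_add_extensions` prepending `\.` itself. A quick sketch of the expected behavior, assuming these private helpers are imported from `dandi.bids_validator_xs` (the `variant` dict is a hand-made stand-in for a schema entry):

```python
from dandi.bids_validator_xs import _add_extensions, _extension_safety

print(_extension_safety(".nii.gz"))  # \.nii\.gz
print(_extension_safety("*"))        # .*?
print(_extension_safety("None"))     # (empty string)

# Mirrors the expectation in test__add_extensions below:
variant = {"extensions": [".jpg", ".png", ".tif"]}
print(_add_extensions("_photo", variant))  # _photo(\.jpg|\.png|\.tif)
```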

@@ -195,14 +217,8 @@ def load_top_level(
     # None value gets passed as list of strings...
     extensions = top_level_file["extensions"]
     if extensions != ["None"]:
-        periodsafe_extensions = []
-        for extension in extensions:
-            if extension[0] == ".":
-                periodsafe_extensions.append(extension[1:])
-            else:
-                periodsafe_extensions.append(extension)
-        extensions_regex = "|".join(periodsafe_extensions)
-        regex = f".*?/{top_level_filename}\\.({extensions_regex})$"
+        extensions_regex = "|".join(map(_extension_safety, extensions))
+        regex = f".*?/{top_level_filename}({extensions_regex})$"
     else:
         regex = f".*?/{top_level_filename}$"
     regex_entry = {
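With `_extension_safety` applied, the escaped period travels inside each alternative, so the literal `\.` that used to sit before the group is gone. A sketch of the resulting top-level pattern, using a hypothetical README rule with extensions `[".md", ".rst"]`:

```python
import re

regex = r".*?/README(\.md|\.rst)$"  # shape now produced by load_top_level
print(bool(re.match(regex, "/my_dataset/README.md")))  # True
print(bool(re.match(regex, "/my_dataset/READMExmd")))  # False: the period is literal
```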
@@ -520,15 +536,16 @@ def select_schema_dir(
     bids_paths,
     schema_reference_root,
     schema_version,
-    force_select=False,
+    schema_min_version="1.7.0+012+dandi001",
 ):
     """
     Select schema directory, according to a fallback logic whereby the schema path is
     either (1) `schema_version` if the value is a path, (2) a concatenation of
     `schema_reference_root` and `schema_version`, (3) a concatenation of the detected
     version specification from a `dataset_description.json` file if one is found in
-    parents of the input path, (4) the newest schema from the code distribution only
-    if `force_select` is enabled.
+    parents of the input path, (4) `schema_min_version` if no other version can be found
+    or if the detected version from `dataset_description.json` is smaller than
+    `schema_min_version`.
 
     Parameters
     ----------
@@ -548,8 +565,10 @@
         If the path starts with the string "{module_path}" it will be expanded relative to the
         module path.
         If None, the `dataset_description.json` file will be queried for the dataset schema version.
-    force_select : bool, optional
-        Whether to fall back to newest version of schema if no version is given or found.
+    schema_min_version : str, optional
+        Minimal version to use UNLESS the schema version is manually specified.
+        If the version is auto-detected and it is smaller than `schema_min_version`,
+        `schema_min_version` will be selected instead.
 
     Returns
     -------
@@ -581,31 +600,40 @@
     else:
         with open(dataset_description) as f:
             dataset_info = json.load(f)
-        if force_select:
-            try:
-                schema_version = dataset_info["BIDSVersion"]
-            except KeyError:
-                return utils.get_schema_path()
-        else:
+        try:
             schema_version = dataset_info["BIDSVersion"]
+        except KeyError:
+            lgr.warning(
+                "BIDSVersion is not specified in "
+                "`dataset_description.json`. "
+                f"Falling back to {schema_min_version}."
+            )
+            schema_version = schema_min_version
+    if schema_min_version:
+        if schema_version < schema_min_version:
+            lgr.warning(
+                f"BIDSVersion {schema_version} is less than the minimal working "
+                f"{schema_min_version}. "
+                f"Falling back to {schema_min_version}. "
+                "To force the usage of earlier versions specify them explicitly "
+                "when calling the validator."
+            )
+            schema_version = schema_min_version
     schema_dir = os.path.join(schema_reference_root, schema_version)
     if os.path.isdir(schema_dir):
         return schema_dir
-    elif force_select:
-        return utils.get_schema_path()
     else:
         raise ValueError(
             f"The expected schema directory {schema_dir} does not exist on the system. "
-            "Please ensure the file exists or use the `force_select` option, in order"
-            "to auto-select the most recent schema as a fallback."
+            "Please ensure the file exists or manually specify a schema version for "
+            "which the schemacode files are available on your system."
         )
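Note that `schema_version < schema_min_version` above is a plain string comparison, which is adequate for the `1.7.0+012+dandi001`-style identifiers targeted here but is not general version ordering. A simplified, hypothetical distillation of the fallback decision (not the dandi API):

```python
def pick_version(detected, minimum="1.7.0+012+dandi001"):
    # Lexicographic comparison, as in the code above.
    if detected is None or detected < minimum:
        return minimum
    return detected

assert pick_version(None) == "1.7.0+012+dandi001"     # missing BIDSVersion
assert pick_version("1.6.0") == "1.7.0+012+dandi001"  # too old -> fall back
assert pick_version("1.7.0+012+dandi001") == "1.7.0+012+dandi001"
```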


 def validate_bids(
     bids_paths,
     schema_reference_root="{module_path}/support/bids/schemadata/",
     schema_version=None,
-    force_select=False,
     debug=False,
     report_path=False,
 ):
@@ -628,13 +656,19 @@
         If the path starts with the string "{module_path}" it will be expanded relative to the
         module path.
         If None, the `dataset_description.json` file will be queried for the dataset schema version.
-    force_select : bool, optional
-        Whether to fall back to newest version of schema if no version is given or found.
     report_path : bool or str, optional
         If `True` a log will be written using the standard output path of `.write_report()`.
         If string, the string will be used as the output path.
         If the variable evaluates as False, no log will be written.
 
     Returns
     -------
     results : dict
         A dictionary reporting the target files for validation, the unmatched files and unmatched
         regexes, and optionally the itemwise comparison results.
+        Keys include "schema_tracking", "path_tracking", "path_listing", "match_listing", and
+        optionally "itemwise".
 
     Examples
     --------
     >>> from schemacode import validator
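With `force_select` gone, a caller that omits `schema_version` now gets the auto-detect-then-fall-back behavior by default. A hedged usage sketch (the dataset path is hypothetical):

```python
from dandi.bids_validator_xs import validate_bids

# schema_version=None: auto-detect from dataset_description.json, falling
# back to the bundled minimum schema if it is absent or too old.
result = validate_bids("/path/to/bids_dataset", report_path=True)
if not result["path_tracking"]:
    print("All files matched a BIDS regex.")
```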
24 changes: 10 additions & 14 deletions dandi/tests/test_bids_validator_xs.py
@@ -98,7 +98,8 @@ def test__add_extensions():
     assert (
         _regex_string == "sub-(?P=subject)(|_ses-(?P=session))"
         "_sample-(?P<sample>([a-z,A-Z,0-9]*?))"
-        "(|_acq-(?P<acquisition>([a-z,A-Z,0-9]*?)))_photo\\.(jpg|png|tif)"
+        "(|_acq-(?P<acquisition>([a-z,A-Z,0-9]*?)))"
+        "_photo(\\.jpg|\\.png|\\.tif)"
     )


@@ -292,30 +293,24 @@ def test_bids_datasets(bids_examples, tmp_path):
     from dandi.bids_validator_xs import validate_bids
 
     whitelist = [
-        "qmri_megre",
         "asl003",
-        "pet002",
-        "asl005",
-        "asl002",
-        "pet004",
         "eeg_cbm",
-        "pet005",
         "hcp_example_bids",
-        "asl004",
-        "qmri_tb1tfl",
-        "micr_SEM",
-        "micr_SEM-dandi",
         "micr_SPIM",
         "pet001",
         "pet003",
+        "micr_SEM",
+        "micr_SEM-dandi",
+        "qmri_tb1tfl",
+        "qmri_vfa",
     ]
     schema_path = "{module_path}/support/bids/schemadata/1.7.0+012+dandi001"
 
-    # Validate per dataset, with debugging:
+    # Validate per dataset, with automatic schema selection:
     for i in os.listdir(bids_examples):
         if i in whitelist:
             result = validate_bids(
-                os.path.join(bids_examples, i), schema_version=schema_path
+                os.path.join(bids_examples, i),
             )
             # Have all files been validated?
             assert len(result["path_tracking"]) == 0
@@ -327,10 +322,11 @@ def test_bids_datasets(bids_examples, tmp_path):
         for f in files:
             selected_path = os.path.join(root, f)
             selected_paths.append(selected_path)
-    # Does terminal debug output work?
+    # Do terminal debug output and explicit schema specification work?
     result = validate_bids(selected_paths, schema_version=schema_path, debug=True)
     # Does default log path specification work?
     result = validate_bids(selected_paths, schema_version=schema_path, report_path=True)
 
     # Does custom log path specification work?
     result = validate_bids(
         selected_paths,
