Merge pull request #1104 from TheChymera/ngff

[ENH] NGFF format support
bids-standard · Jul 7, 2022 · f2e34d4 · f2e34d4
2 parents 0749755 + 988bd2d
commit f2e34d4
Show file tree

Hide file tree

Showing 5 changed files with 68 additions and 21 deletions.
diff --git a/src/04-modality-specific-files/10-microscopy.md b/src/04-modality-specific-files/10-microscopy.md
@@ -37,7 +37,8 @@ by the [Open Microscopy Environment](https://www.openmicroscopy.org/) for whole-
 the [OME-TIFF file specifications](https://docs.openmicroscopy.org/ome-model/6.1.2/ome-tiff/file-structure.html).
 The OME-TIFF file allows for multi-page TIFF files to store multiple image planes and supports
 multi-resolution pyramidal tiled images. An OME-XML data block is also embedded inside the
-file’s header.
+file’s header. Further, OME-ZARR (sometimes referred to as OME-NGFF or NGFF) has been developed to provide improved
+access and storage for large data via chunked and compressed N-dimensional arrays.
 
 The BIDS standard accepts microscopy data in a number of file formats to accommodate datasets
 stored in 2D image formats and whole-slide imaging formats, to accommodate lossless and lossy
@@ -54,12 +55,10 @@ Microscopy raw data MUST be stored in one of the following formats:
     (`.ome.tif` for standard TIFF files or `.ome.btf` for
     [BigTIFF](https://www.awaresystems.be/imaging/tiff/bigtiff.html) files)
 
-If different from PNG, TIFF or OME-TIFF, the original unprocessed data in the native format MAY be
-stored in the [`/sourcedata` directory](../02-common-principles.md#source-vs-raw-vs-derived-data).
+-   [OME-ZARR/NGFF](https://ngff.openmicroscopy.org/latest/) (`.ome.zarr` directories)
 
-Future versions may extend this list of supported file formats, for example with the
-Next-Generation File Formats currently developed by OME ([OME-NGFF](https://ngff.openmicroscopy.org/latest/))
-as a successor to OME-TIFF for better remote sharing of large datasets.
+If different from PNG, TIFF, OME-TIFF, or OME-ZARR, the original unprocessed data in the native format MAY be
+stored in the [`/sourcedata` directory](../02-common-principles.md#source-vs-raw-vs-derived-data).
 
 ### Modality suffixes
 Microscopy data currently support the following imaging modalities:

diff --git a/src/schema/objects/extensions.yaml b/src/schema/objects/extensions.yaml
@@ -140,6 +140,13 @@
 
     Used by KIT, Yokogawa, and Ricoh MEG systems.
     Successor to the `.sqd` extension for marker files.
+.ome.zarr/:
+  name: OME Next Generation File Format
+  description: |
+    An OME-NGFF file.
+
+    OME-NGFF is a [Zarr](https://zarr.readthedocs.io)-based format, organizing data arrays in nested directories.
+    This format was developed by the Open Microscopy Environment to provide data stream access to very large data.
 .nii:
   name: NIfTI
   description: |

diff --git a/src/schema/rules/datatypes/micr.yaml b/src/schema/rules/datatypes/micr.yaml
@@ -21,6 +21,7 @@ microscopy:
   extensions:
   - .ome.tif
   - .ome.btf
+  - .ome.zarr/
   - .png
   - .tif
   - .json

diff --git a/tools/schemacode/schemacode/tests/test_validator.py b/tools/schemacode/schemacode/tests/test_validator.py
@@ -229,7 +229,7 @@ def test_load_all():
         os.path.abspath(os.path.dirname(__file__)),
         "../data/schema",
     )
-    schema_all = load_all(schema_path)
+    schema_all, _ = load_all(schema_path)
 
     # Check if expected keys are present in all entries
     for entry in schema_all:

diff --git a/tools/schemacode/schemacode/validator.py b/tools/schemacode/schemacode/validator.py
@@ -16,7 +16,10 @@
 DIR_ENTITIES = ["subject", "session"]
 
 
-def _get_paths(bids_paths):
+def _get_paths(
+    bids_paths,
+    pseudofile_suffixes=[],
+):
     """
     Get all paths from a list of directories, excluding hidden subdirectories from distribution.
 
@@ -25,6 +28,9 @@ def _get_paths(bids_paths):
     bids_paths : list or str
         Directories from which to get paths, may also contain file paths, which will remain
         unchanged.
+    pseudofile_suffixes : list of str
+        Directory suffixes prompting the validation of the directory name and limiting further
+        directory walk.
 
     Notes
     -----
@@ -47,23 +53,19 @@ def _get_paths(bids_paths):
         ".bidsignore",
         "dandiset.yaml",
     ]
-    # Inelegant hard-coded solution.
-    # Could be replaced by a maximum depth limit if BIDS root auto-detection is implemented.
-    treat_as_file_suffix = [".ngff"]
 
     path_list = []
     for bids_path in bids_paths:
         bids_path = os.path.abspath(os.path.expanduser(bids_path))
         if os.path.isfile(bids_path):
             path_list.append(bids_path)
             continue
-        for root, dirs, file_names in os.walk(bids_path, topdown=False):
-            if any(root.endswith(i) for i in treat_as_file_suffix):
-                continue
-            if any(f"{i}/" in root for i in treat_as_file_suffix):
-                continue
-            if any(f"{i}\\" in root for i in treat_as_file_suffix):
-                continue
+        for root, dirs, file_names in os.walk(bids_path, topdown=True):
+            if any(root.endswith(i) for i in pseudofile_suffixes):
+                # Add the directory name to the validation paths list.
+                path_list.append(f"{root}/")
+                # Do not index the contents of the directory.
+                dirs[:] = []
             # will break if BIDS ever puts meaningful data under `/.{dandi,datalad,git}*/`
             if any(exclude_subdir in root for exclude_subdir in exclude_subdirs):
                 continue
@@ -335,6 +337,8 @@ def load_all(
     -------
     all_regex : list of dict
         A list of dictionaries, with keys including 'regex' and 'mandatory'.
+    my_schema : dict
+        Nested dictionaries representing the full schema.
     """
 
     my_schema = schema.load_schema(schema_dir)
@@ -346,13 +350,14 @@ def load_all(
     )
     all_regex.extend(top_level_regex)
 
-    return all_regex
+    return all_regex, my_schema
 
 
 def validate_all(
     bids_paths,
     regex_schema,
     debug=False,
+    pseudofile_suffixes=[],
 ):
     """
     Validate `bids_paths` based on a `regex_schema` dictionary list, including regexes.
@@ -366,6 +371,11 @@ def validate_all(
     debug : bool, optional
         Whether to print itemwise notices for checks on the console, and include them in the
         validation result.
+    pseudofile_suffixes : list of str, optional
+        Any suffixes which identify BIDS-valid directory data.
+        These pseudo-file suffixes will be validated based on the directory name, with the
+        directory contents not being indexed for validation.
+        By default, no pseudo-file suffixes are checked.
 
     Returns
     -------
@@ -384,7 +394,7 @@ def validate_all(
     """
 
     tracking_schema = deepcopy(regex_schema)
-    paths_list = _get_paths(bids_paths)
+    paths_list = _get_paths(bids_paths, pseudofile_suffixes=pseudofile_suffixes)
     tracking_paths = deepcopy(paths_list)
     if debug:
         itemwise_results = []
@@ -658,6 +668,34 @@ def log_errors(validation_result):
         lgr.warning("The `%s` file was not matched by any regex schema entry.", i)
 
 
+def _get_directory_suffixes(my_schema):
+    """Query schema for suffixes which identify directory entities.
+
+    Parameters
+    ----------
+    my_schema : dict
+        Nested dictionary as produced by `schemacode.schema.load_schema()`.
+
+    Returns
+    -------
+    list of str
+        Directory pseudofile suffixes excluding trailing slashes.
+
+    Notes
+    -----
+    * Extracting these suffixes explicitly may seem redundant, as the trailing slash is
+        already part of the schema extension keys, but it is required because:
+        - Subdirectory names need to be dynamically excluded from validation input.
+        - Backslash directory delimiters are still in use, which is regrettable.
+    """
+    pseudofile_suffixes = []
+    for i in my_schema["objects"]["extensions"]:
+        if i.endswith("/"):
+            if i != "/":
+                pseudofile_suffixes.append(i[:-1])
+    return pseudofile_suffixes
+
+
 def validate_bids(
     bids_paths,
     schema_reference_root="/usr/share/bids-schema/",
@@ -716,11 +754,13 @@ def validate_bids(
         bids_paths = [bids_paths]
 
     bids_schema_dir = select_schema_dir(bids_paths, schema_reference_root, schema_version)
-    regex_schema = load_all(bids_schema_dir)
+    regex_schema, my_schema = load_all(bids_schema_dir)
+    pseudofile_suffixes = _get_directory_suffixes(my_schema)
     validation_result = validate_all(
         bids_paths,
         regex_schema,
         debug=debug,
+        pseudofile_suffixes=pseudofile_suffixes,
     )
 
     log_errors(validation_result)