Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Draft for BIDS organize #1404

Draft
wants to merge 3 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions dandi/cli/cmd_organize.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,12 @@
default=None,
help="How to relocate video files referenced by NWB files",
)
@click.option(
"--style",
type=str,
default="dandi",
help="Output directory style, currently `dandi`, by default, and `bids`.",
)
@click.option(
"--required-field",
"required_fields",
Expand All @@ -71,6 +77,7 @@ def organize(
files_mode: FileOperationMode,
media_files_mode: CopyMode | None,
update_external_file_paths: bool,
style: str,
jobs: int | None,
devel_debug: bool = False,
) -> None:
Expand Down Expand Up @@ -118,5 +125,6 @@ def organize(
update_external_file_paths=update_external_file_paths,
media_files_mode=media_files_mode,
required_fields=required_fields,
style=style,
jobs=jobs,
)
19 changes: 19 additions & 0 deletions dandi/consts.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,25 @@ def urls(self) -> Iterator[str]:
# Fields which would be used to compose organized filenames
# TODO: add full description into command --help etc
# Order matters!

# Once BST includes BEP032 this should be dynamically parsed from there.
# Order matters: entries are concatenated in dict order to build the organized
# filename.  "type" semantics (see _assign_bids_names in dandi/organize.py):
# "required" fields always appear; "required_if_not_empty" fields appear
# whenever a non-empty value exists; fields without a "type" are "additional"
# and only included when they disambiguate (more than one unique value).
bids_layout_fields = {
    # NWB files can be either ieeg or ephys, datatype probably inferable from:
    # metadata["acquisition"]
    # NOTE(review): "acquisition" uses a bare "{}" format yet precedes
    # "subject_id", so its value would prefix "sub-" in the filename --
    # confirm the intended position of this entity.
    "acquisition": {"format": "{}", "type": "required"},
    "subject_id": {"format": "sub-{}", "type": "required"},
    "session_id": {"format": "_ses-{}", "type": "required"},
    "slice_id": {"format": "_sample-{}"},  # no "type" -> "additional"
    # "session_description"
    "modalities": {
        "format": "_{}",
        "type": "required_if_not_empty",
        # remap NWB modality names to BIDS datatype labels
        "remap": [
            ["ecephys", "ephys"],
        ],
    },
    "extension": {"format": "{}", "type": "required"},
}
dandi_layout_fields = {
# "type" - if not defined, additional
"subject_id": {"format": "sub-{}", "type": "required"},
Expand Down
111 changes: 84 additions & 27 deletions dandi/organize.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
import ruamel.yaml

from . import __version__, get_logger
from .consts import dandi_layout_fields
from .consts import bids_layout_fields, dandi_layout_fields
from .dandiset import Dandiset
from .exceptions import OrganizeImpossibleError
from .utils import (
Expand All @@ -38,6 +38,11 @@

lgr = get_logger()

# Relative-path templates for organized files, filled via str.format() with
# keys placed on each metadata record by _assign_dandi_names /
# _assign_bids_names respectively.
DANDI_PATH = op.join("sub-{subject_id}", "{organized_filename}")
BIDS_PATH = op.join(
    "sub-{subject_id}", "ses-{session_id}", "{datatype}", "{organized_filename}"
)


class FileOperationMode(str, Enum):
DRY = "dry"
Expand Down Expand Up @@ -86,9 +91,6 @@
return self.value


dandi_path = op.join("sub-{subject_id}", "{dandi_filename}")


def filter_invalid_metadata_rows(metadata_rows):
"""Split into two lists - valid and invalid entries"""
valid, invalid = [], []
Expand All @@ -108,7 +110,9 @@


def create_unique_filenames_from_metadata(
metadata: list[dict], required_fields: Sequence[str] | None = None
metadata: list[dict],
required_fields: Sequence[str] | None = None,
style: str | None = "dandi",
) -> list[dict]:
"""Create unique filenames given metadata

Expand Down Expand Up @@ -163,7 +167,7 @@
for r in metadata:
# extract File name extension and place them into the records
r["extension"] = op.splitext(r["path"])[1]
# since those might be used in dandi_path
# since those might be used in path_format
for field in "subject_id", "session_id":
value = r.get(field, None)
if value:
Expand All @@ -174,9 +178,12 @@
if required_fields:
r.setdefault("_required_if_not_empty", []).extend(required_fields)

_assign_dandi_names(metadata)
if style is None or style == "dandi":
_assign_dandi_names(metadata)
elif style == "bids":
_assign_bids_names(metadata)

Check warning on line 184 in dandi/organize.py

View check run for this annotation

Codecov / codecov/patch

dandi/organize.py#L183-L184

Added lines #L183 - L184 were not covered by tests

non_unique = _get_non_unique_paths(metadata)
non_unique = _get_non_unique_paths(metadata, style)

additional_nonunique = []

Expand Down Expand Up @@ -204,10 +211,15 @@
if values: # helps disambiguation, but might still be non-unique
# add to all files in the group
for r in metadata:
if r["dandi_path"] == conflicting_path:
if r["organized_path"] == conflicting_path:

Check warning on line 214 in dandi/organize.py

View check run for this annotation

Codecov / codecov/patch

dandi/organize.py#L214

Added line #L214 was not covered by tests
r.setdefault("_required_if_not_empty", []).append(field)
_assign_dandi_names(metadata)
non_unique = _get_non_unique_paths(metadata)
if style is None or style == "dandi":
_assign_dandi_names(metadata)
elif style == "bids":
_assign_bids_names(metadata)

Check warning on line 219 in dandi/organize.py

View check run for this annotation

Codecov / codecov/patch

dandi/organize.py#L216-L219

Added lines #L216 - L219 were not covered by tests
else:
lgr.error("“%s” is not a valid `dandi organize` style. ", style)
non_unique = _get_non_unique_paths(metadata, style)

Check warning on line 222 in dandi/organize.py

View check run for this annotation

Codecov / codecov/patch

dandi/organize.py#L221-L222

Added lines #L221 - L222 were not covered by tests
if not non_unique:
break

Expand Down Expand Up @@ -254,9 +266,9 @@
"""
metadata = deepcopy(metadata)
for meta in metadata:
if "dandi_path" not in meta or "external_file_objects" not in meta:
if "organized_path" not in meta or "external_file_objects" not in meta:
continue
nwb_folder_name = op.splitext(op.basename(meta["dandi_path"]))[0]
nwb_folder_name = op.splitext(op.basename(meta["organized_path"]))[0]
for ext_file_dict in meta["external_file_objects"]:
renamed_path_list = []
uuid_str = ext_file_dict.get("id", str(uuid.uuid4()))
Expand Down Expand Up @@ -292,7 +304,9 @@
):
if is_url(str(name_old)):
continue
new_path = op.join(dandiset_path, op.dirname(e["dandi_path"]), name_new)
new_path = op.join(

Check warning on line 307 in dandi/organize.py

View check run for this annotation

Codecov / codecov/patch

dandi/organize.py#L307

Added line #L307 was not covered by tests
dandiset_path, op.dirname(e["organized_path"]), name_new
)
name_old_str = str(name_old)
if not op.isabs(name_old_str):
name_old_str = op.join(op.dirname(e["path"]), name_old_str)
Expand All @@ -314,7 +328,7 @@
seen_object_ids = {} # object_id: path
recent_nwb_msg = "NWB>=2.1.0 standard (supported by pynwb>=1.1.0)."
for r in metadata:
if r["dandi_path"] in non_unique:
if r["organized_path"] in non_unique:

Check warning on line 331 in dandi/organize.py

View check run for this annotation

Codecov / codecov/patch

dandi/organize.py#L331

Added line #L331 was not covered by tests
try:
object_id = get_object_id(r["path"])
except KeyError:
Expand Down Expand Up @@ -385,6 +399,46 @@
return value is None or (hasattr(value, "__len__") and not len(value))


def _assign_bids_names(metadata):
    """Compose BIDS-style filenames and paths for each record, in-place.

    For every record in ``metadata``, walks the fields declared in
    ``bids_layout_fields`` (order matters), formats each usable value with the
    field's ``format`` template, and concatenates the pieces into
    ``organized_filename``.  The relative path is then rendered from
    ``BIDS_PATH`` and stored as ``organized_path``.

    Parameters
    ----------
    metadata : list of dict
        Records are mutated in-place: each gains ``datatype``,
        ``organized_filename`` and ``organized_path`` keys, and per-field
        values may be replaced by their remapped string form.

    Raises
    ------
    OrganizeImpossibleError
        If a record carries no "modalities" value, since the BIDS datatype
        directory is derived from it.
    """
    unique_values = _get_unique_values(metadata, bids_layout_fields)
    for r in metadata:
        bids_filename = ""
        for field, field_rec in bids_layout_fields.items():
            field_format = field_rec["format"]
            field_type = field_rec.get("type", "additional")
            # A field contributes to the name when it is required, is an
            # "additional" field that actually disambiguates (more than one
            # unique value across records), or was explicitly requested via
            # _required_if_not_empty.
            if (
                field_type == "required"
                or (field_type == "additional" and len(unique_values[field]) > 1)
                or field_type == "required_if_not_empty"
                or field in r.get("_required_if_not_empty", [])
            ):
                value = r.get(field, None)
                if is_undefined(value):
                    # skip empty things
                    continue
                if isinstance(value, (list, tuple)):
                    value = "+".join(map(str, value))
                # e.g. remap NWB modality "ecephys" to BIDS datatype "ephys"
                for old, new in field_rec.get("remap", []):
                    if value == old:
                        value = new
                # store the remapped (but unsanitized) value back on the
                # record -- it may be interpolated into BIDS_PATH below
                r[field] = value
                # sanitize value to avoid undesired characters
                value = _sanitize_value(value, field)
                # Format _key-value according to the "schema"
                bids_filename += field_format.format(value)
        # NOTE(review): datatype is taken verbatim from "modalities"; this
        # does not generalize to other datatypes.
        try:
            r["datatype"] = r["modalities"]
        except KeyError:
            raise OrganizeImpossibleError(
                f"Cannot determine BIDS datatype for {r.get('path')!r}: "
                "no modalities found in its metadata"
            )
        r["organized_filename"] = bids_filename
        r["organized_path"] = BIDS_PATH.format(**r)


def _assign_dandi_names(metadata):
unique_values = _get_unique_values(metadata, dandi_layout_fields)
# unless it is required, we would not include the fields with more than a
Expand Down Expand Up @@ -413,8 +467,8 @@
# Format _key-value according to the "schema"
formatted_value = field_format.format(value)
dandi_filename += formatted_value
r["dandi_filename"] = dandi_filename
r["dandi_path"] = dandi_path.format(**r)
r["organized_filename"] = dandi_filename
r["organized_path"] = DANDI_PATH.format(**r)


def _get_unique_values(metadata, fields, filter_=False):
Expand Down Expand Up @@ -687,7 +741,7 @@
yaml.dump(rec, f)


def _get_non_unique_paths(metadata):
def _get_non_unique_paths(metadata, style):
"""Identify non-unique paths after mapping

Parameters
Expand All @@ -697,10 +751,10 @@
Returns
-------
dict:
of dandi_path: list(orig paths)
of organized_path: list(orig paths)
"""
# Verify that we got unique paths
all_paths = [m["dandi_path"] for m in metadata]
all_paths = [m["organized_path"] for m in metadata]
all_paths_unique = set(all_paths)
non_unique = {}
if not len(all_paths) == len(all_paths_unique):
Expand All @@ -710,7 +764,7 @@
for p in non_unique:
orig_paths = []
for e in metadata:
if e["dandi_path"] == p:
if e["organized_path"] == p:

Check warning on line 767 in dandi/organize.py

View check run for this annotation

Codecov / codecov/patch

dandi/organize.py#L767

Added line #L767 was not covered by tests
orig_paths.append(e["path"])
non_unique[p] = orig_paths # overload with the list instead of count
return non_unique
Expand Down Expand Up @@ -750,6 +804,7 @@
def organize(
paths: Sequence[str],
dandiset_path: str | None = None,
style: str | None = None,
invalid: OrganizeInvalid = OrganizeInvalid.FAIL,
files_mode: FileOperationMode = FileOperationMode.AUTO,
devel_debug: bool = False,
Expand Down Expand Up @@ -903,7 +958,9 @@
files_mode = detect_link_type(link_test_file, dandiset_path)

metadata = create_unique_filenames_from_metadata(
metadata, required_fields=required_fields
metadata,
required_fields=required_fields,
style=style,
)

# update metadata with external_file information:
Expand Down Expand Up @@ -954,12 +1011,12 @@
# duplicate but shouldn't hurt
existing = []
for e in metadata:
dandi_fullpath = op.join(dandiset_path, e["dandi_path"])
dandi_fullpath = op.join(dandiset_path, e["organized_path"])
if op.lexists(dandi_fullpath):
# It might be the same file, then we would not complain
if not (
op.realpath(e["path"])
== op.realpath(op.join(dandiset_path, e["dandi_path"]))
== op.realpath(op.join(dandiset_path, e["organized_path"]))
):
existing.append(dandi_fullpath)
# TODO: it might happen that with "move" we are renaming files
Expand All @@ -984,8 +1041,8 @@
skip_same = []
acted_upon = []
for e in metadata:
dandi_path = e["dandi_path"]
dandi_fullpath = op.join(dandiset_path, dandi_path)
organized_path = e["organized_path"]
dandi_fullpath = op.join(dandiset_path, organized_path)
dandi_abs_fullpath = (
op.abspath(dandi_fullpath)
if not op.isabs(dandi_fullpath)
Expand Down Expand Up @@ -1022,7 +1079,7 @@
if (
files_mode is FileOperationMode.DRY
): # TODO: this is actually a files_mode on top of modes!!!?
dry_print(f"{e_path} -> {dandi_path}")
dry_print(f"{e_path} -> {organized_path}")

Check warning on line 1082 in dandi/organize.py

View check run for this annotation

Codecov / codecov/patch

dandi/organize.py#L1082

Added line #L1082 was not covered by tests
else:
if not op.lexists(dandi_dirpath):
os.makedirs(dandi_dirpath)
Expand Down
Loading