-
Notifications
You must be signed in to change notification settings - Fork 20
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
add get_name patch/spool method #471
Changes from 4 commits
a3e2fb9
53748a3
b90bcbd
70e2e93
ec69270
1b0ab1b
eb7acdc
b9137ac
6044f67
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -46,7 +46,7 @@ | |
warn_or_raise, | ||
yield_sub_sequences, | ||
) | ||
from dascore.utils.time import to_datetime64, to_float | ||
from dascore.utils.time import to_float | ||
|
||
attr_type = dict[str, Any] | str | Sequence[str] | None | ||
|
||
|
@@ -306,7 +306,10 @@ def patches_to_df( | |
elif isinstance(patches, pd.DataFrame): | ||
df = patches | ||
else: | ||
df = pd.DataFrame([x.flat_dump() for x in scan_patches(patches)]) | ||
df = dc.scan_to_df( | ||
patches, | ||
exclude=(), | ||
) | ||
if df.empty: # create empty df with appropriate columns | ||
cols = list(dc.PatchAttrs().model_dump()) | ||
df = pd.DataFrame(columns=cols).assign(patch=None, history=None) | ||
|
@@ -418,24 +421,6 @@ def _get_new_coord(df, merge_dim, coords): | |
return [new_dict] | ||
|
||
|
||
def scan_patches(patches: PatchType | Sequence[PatchType]) -> list[dc.PatchAttrs]:
    """
    Gather the attrs of one or more patches into a list.

    The summary dicts have the following fields:
    {fields}

    Parameters
    ----------
    patches
        Either a lone patch or an iterable of patches.

    Returns
    -------
    A list holding the ``attrs`` of each patch, in iteration order.
    """
    # A bare patch is wrapped in a list so the comprehension always iterates.
    patch_iter = [patches] if isinstance(patches, dc.Patch) else patches
    return [patch.attrs for patch in patch_iter]
|
||
|
||
def get_start_stop_step(patch: PatchType, dim): | ||
"""Convenience method for getting start, stop, step for a given coord.""" | ||
assert dim in patch.dims, f"{dim} is not in Patch dimensions of {patch.dims}" | ||
|
@@ -446,21 +431,90 @@ def get_start_stop_step(patch: PatchType, dim): | |
return start, stop, step | ||
|
||
|
||
def get_default_patch_name(patch):
    """
    Generates the name of the node.

    The name is built from the patch's network, station, tag, time_min and
    time_max attrs; any attr that is missing is replaced by an empty string.
    """

    def _format_datetime64(dt):
        """Format the datetime string in a sensible way."""
        out = str(to_datetime64(dt))
        return out.replace(":", "_").replace("-", "_").replace(".", "_")

    attrs = patch.attrs
    start = _format_datetime64(attrs.get("time_min", ""))
    end = _format_datetime64(attrs.get("time_max", ""))
    net = attrs.get("network", "")
    sta = attrs.get("station", "")
    tag = attrs.get("tag", "")
    return f"DAS__{net}__{sta}__{tag}__{start}__{end}"


def get_patch_names(
    patch_data: pd.DataFrame | dc.Patch | dc.BaseSpool,
    prefix: str = "DAS",
    attrs: Sequence[str] = ("network", "station", "tag"),
    coords: Sequence[str] = ("time",),
    sep: str = "__",
) -> pd.Series:
    """
    Generates the default names of patch data.

    Parameters
    ----------
    patch_data
        A container with patch data (a dataframe, a patch, or a spool).
    prefix
        A string to prefix the names.
    attrs
        The attribute columns to include in the names, in order. Columns
        which are not found in the patch data are skipped.
    coords
        The coordinate ranges to use for names. For each coordinate the
        ``<coord>_min`` and ``<coord>_max`` columns are used, if present.
    sep
        The separator for the strings.

    Returns
    -------
    A series of names with one entry per row of the dataframe produced by
    ``dc.scan_to_df``. A series is returned even when a single patch is
    given so that the return type does not depend on the input type.

    Notes
    -----
    There are two special cases where the default logic is overwritten.
    The first one is when a column called "name" already exists. This
    will simply be returned (converted to strings).

    The second is when a column called "path" exists. In this case, the
    output will be the file name with the extension removed (everything
    from the first "." in the file name onward is dropped). The path must
    use / as a delimiter.

    Examples
    --------
    >>> import dascore as dc
    >>> from dascore.utils.patch import get_patch_names
    >>> patch = dc.get_example_patch()
    >>> names = get_patch_names(patch)
    >>> # ``names`` is a one-element series; its value looks like
    >>> # DAS_______random__2017_09_18__2017_09_18T00_00_07
    """

    def _format_time_column(ser):
        """Format the time column."""
        # One pass replaces ":" and "-" with "_" and the date/time space
        # with "T", after dropping anything following the first ".".
        table = str.maketrans({":": "_", "-": "_", " ": "T"})
        return ser.astype(str).str.split(".", n=1).str[0].str.translate(table)

    def _format_time_columns(df):
        """Format the datetime string in a sensible way."""
        sub = df.select_dtypes(include=["datetime64", "timedelta64"])
        out = {col: _format_time_column(df[col]) for col in sub.columns}
        return df.assign(**out)

    def _get_filename(path_ser):
        """Get the file name from a path series."""
        # NOTE(review): a reviewer proposed an optional ``keep_extension``
        # argument so callers can keep the file extension; not implemented.
        ser = path_ser.astype(str)
        file_names = [x[-1].split(".")[0] for x in ser.str.split("/")]
        # Keep the input index so the result lines up with the source rows,
        # like the other return paths of get_patch_names.
        return pd.Series(file_names, index=path_ser.index)

    # Ensure we are working with a dataframe.
    df = dc.scan_to_df(
        patch_data,
        exclude=(),
    )
    if df.empty:
        return pd.Series(dtype=str)
    col_set = set(df.columns)
    # Handle special cases.
    if "name" in col_set:
        return df["name"].astype(str)
    if "path" in col_set:
        return _get_filename(df["path"])
    # Determine the requested fields and get the ones that are there.
    # Each coordinate contributes its min and max column, in that order;
    # this works for any number of coordinates (including none).
    coord_fields = [
        f"{coord}{suffix}" for coord in coords for suffix in ("_min", "_max")
    ]
    requested_fields = [*attrs, *coord_fields]
    fields = [x for x in requested_fields if x in col_set]
    head = f"{prefix}_{sep}"
    if not fields:
        # Nothing to append; avoid an IndexError and return the bare prefix.
        return pd.Series(head, index=df.index, dtype=object)
    # Get a sub dataframe and convert any datetime things to strings.
    sub = df[fields].pipe(_format_time_columns).fillna("").astype(str)
    return head + sub[fields[0]].str.cat(sub[fields[1:]], sep=sep)
|
||
|
||
def get_dim_axis_value( | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -385,6 +385,15 @@ def one_file_dir(tmp_path_factory, random_patch): | |
return ex.spool_to_directory(spool, path=out) | ||
|
||
|
||
@pytest.fixture(scope="session")
def random_spool_directory(tmp_path_factory):
    """Return an updated spool built from a directory of example patch files."""
    # Write the "random_das" example spool into a fresh temporary directory.
    directory = Path(tmp_path_factory.mktemp("one_file_file_spool"))
    written_path = ex.spool_to_directory(
        ex.get_example_spool("random_das"),
        path=directory,
    )
    # Open a spool on the written path and call update() on it.
    return dc.spool(written_path).update()
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It would be nice to have this as an example in dascore.examples, as all of the example spools are memory spools. |
||
|
||
|
||
@pytest.fixture(scope="class") | ||
def two_patch_directory(tmp_path_factory, terra15_das_example_path, random_patch): | ||
"""Create a directory of DAS files for testing.""" | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Need test(s) for the optional inputs. For example,
`get_patch_names(patch, patch)`
does not raise an error.