Skip to content

Commit

Permalink
Refactor (dilated) distance profile.
Browse files Browse the repository at this point in the history
Unify the `dilated_distance_profile` and `distance_profile` functions
to simplify the API.
  • Loading branch information
isaksamsten committed Jan 8, 2024
1 parent f49a0d5 commit b5afc7d
Show file tree
Hide file tree
Showing 4 changed files with 96 additions and 139 deletions.
2 changes: 0 additions & 2 deletions src/wildboar/distance/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
argmin_subsequence_distance,
check_metric,
check_subsequence_metric,
dilated_distance_profile,
distance_profile,
paired_distance,
paired_subsequence_distance,
Expand All @@ -35,7 +34,6 @@
"argmin_subsequence_distance",
"check_metric",
"check_subsequence_metric",
"dilated_distance_profile",
"distance_profile",
"pairwise_subsequence_distance",
"paired_subsequence_distance",
Expand Down
40 changes: 19 additions & 21 deletions src/wildboar/distance/_cdistance.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1626,35 +1626,26 @@ def _paired_distance(
return out.base


cdef class _SubsequenceDistanceProfile:
cdef Subsequence *subsequence
cdef Py_ssize_t n_subsequences
cdef class _DistanceProfile:
cdef double[:, :] distance_profile
cdef TSArray x
cdef TSArray y
cdef Py_ssize_t dim
cdef SubsequenceMetric metric

def __cinit__(
self,
np.ndarray y,
TSArray y,
TSArray x,
Py_ssize_t dim,
SubsequenceMetric metric,
double[:, :] distance_profile,
):
self.distance_profile = distance_profile
self.x = x
self.y = y
self.metric = metric
self.dim = dim
self.subsequence = <Subsequence*> malloc(sizeof(Subsequence))
self.metric.reset(x)
self.metric.from_array(self.subsequence, (self.dim, y))

def __dealloc__(self):
if self.subsequence != NULL:
self.metric.free_persistent(self.subsequence)
free(self.subsequence)
self.subsequence = NULL

@property
def n_work(self):
Expand All @@ -1663,31 +1654,38 @@ cdef class _SubsequenceDistanceProfile:
def __call__(self, Py_ssize_t job_id, Py_ssize_t offset, Py_ssize_t batch_size):
cdef Py_ssize_t i, j
cdef SubsequenceMetric metric = deepcopy(self.metric)
cdef SubsequenceView view

with nogil:
metric.reset(self.x)
for i in range(offset, offset + batch_size):
metric.persistent_profile(
self.subsequence,
self.x,
i,
metric.init_transient(self.y, &view, i, 0, self.y.shape[2], 0)
metric._distance_profile(
&self.y[i, 0, 0],
self.y.shape[2],
view.mean,
view.std,
view.extra,
&self.x[i, 0, 0],
self.x.shape[2],
&self.distance_profile[i, 0],
)
metric.free_transient(&view)


def _subsequence_distance_profile(
np.ndarray y,
def _distance_profile(
TSArray y,
TSArray x,
Py_ssize_t dim,
SubsequenceMetric metric,
n_jobs=None
):
cdef double[:, :] out = np.ones(
(x.shape[0], x.shape[2] - y.shape[0] + 1), dtype=float
(x.shape[0], x.shape[2] - y.shape[2] + 1), dtype=float
)

run_in_parallel(
_SubsequenceDistanceProfile(
_DistanceProfile(
y,
x,
dim,
Expand Down
188 changes: 74 additions & 114 deletions src/wildboar/distance/_distance.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,13 @@
_argmin_distance,
_argmin_subsequence_distance,
_dilated_distance_profile,
_distance_profile,
_paired_distance,
_paired_subsequence_distance,
_paired_subsequence_match,
_pairwise_distance,
_pairwise_subsequence_distance,
_singleton_pairwise_distance,
_subsequence_distance_profile,
_subsequence_match,
)
from ._elastic import (
Expand Down Expand Up @@ -1212,6 +1212,11 @@ def argmin_distance(
{
"y": ["array-like"],
"x": ["array-like"],
"dilation": [Interval(numbers.Integral, 1, None, closed="left")],
"padding": [
Interval(numbers.Integral, 0, None, closed="left"),
StrOptions({"same"}),
],
"dim": [Interval(numbers.Integral, 0, None, closed="left")],
"metric": [callable, StrOptions(_SUBSEQUENCE_METRICS.keys())],
"metric_params": [None, dict],
Expand All @@ -1221,7 +1226,16 @@ def argmin_distance(
prefer_skip_nested_validation=True,
)
def distance_profile(
y, x, *, dim=0, metric="mass", metric_params=None, scale=False, n_jobs=None
y,
x,
*,
dilation=1,
padding=0,
dim=0,
metric="mass",
metric_params=None,
scale=False,
n_jobs=None,
):
"""
Compute the distance profile.
Expand All @@ -1233,10 +1247,16 @@ def distance_profile(
Parameters
----------
y : array-like of shape (yn_timestep, )
The subsequence.
The subsequences.
x : ndarray of shape (n_timestep, ), (n_samples, n_timestep)\
or (n_samples, n_dims, n_timestep)
The input data.
The samples. If `x.ndim` is 1, we will broadcast it to have the same
number of samples as `y`.
dilation : int, optional
The dilation, i.e., the spacing between points in the subsequences.
padding : int or {"same"}, optional
The amount of padding applied to the input time series. If "same", the output
size is the same as the input size.
dim : int, optional
The dim to search for shapelets.
metric : str or callable, optional
Expand All @@ -1250,16 +1270,14 @@ def distance_profile(
:ref:`User guide <list_of_subsequence_metrics>`.
scale : bool, optional
If True, scale the subsequences before distance computation.
.. versionadded:: 1.3
n_jobs : int, optional
The number of parallel jobs to run.
Returns
-------
ndarray of shape (n_samples, n_timestep - yn_timestep + 1) or\
(n_timestep - yn_timestep + 1, )
The distance between every subsequence in `x` to `y`.
ndarray of shape (n_samples, output_size)
The distance profile. `output_size` is given by:
`x.shape[-1] + 2 * padding - (y.shape[-1] - 1) * dilation + 1) + 1`.
Warnings
--------
Expand All @@ -1275,27 +1293,58 @@ def distance_profile(
array([14.00120332, 14.41943788, 14.81597243, ..., 4.75219094,
5.72681005, 6.70155561])
"""
y = _validate_subsequence(y)
if len(y) > 1:
raise ValueError("A single subsequence expected, got %d" % len(y))
y = check_array(y, dtype=float, ensure_2d=False, ensure_ts_array=True)
if x.ndim == 1:
x = np.broadcast_to(x, shape=(y.shape[0], x.shape[0]))

y = y[0]
x = check_array(x, allow_3d=True, ensure_2d=False, dtype=float)
x = check_array(x, allow_3d=True, dtype=float, ensure_ts_array=True)

if y.shape[0] > x.shape[-1]:
shapelet_size = (y.shape[2] - 1) * dilation + 1
if padding == "same":
if y.shape[2] % 2 == 0:
raise ValueError(
"padding='same' is only supported for odd subsequence length"
)
padding = shapelet_size // 2

input_size = x.shape[2] + 2 * padding

if shapelet_size > input_size:
raise ValueError("subsequence in y is larger than input in x")

if y.shape[0] != x.shape[0]:
raise ValueError("y and x must be paired")

if dim >= x.shape[1]:
raise ValueError(
"Invalid subsequnce shape (%d > %d)" % (y.shape[0], x.shape[-1])
f"The parameter dim must be dim ({dim}) < n_dims ({x.shape[1]})"
)
n_dims = x.shape[1] if x.ndim == 3 else 1
if dim >= n_dims:
raise ValueError(f"The parameter dim must be dim ({dim}) < n_dims ({n_dims})")

Metric = check_subsequence_metric(metric, scale=scale)
metric_params = metric_params if metric_params is not None else {}
if dilation == 1 and padding == 0:
Metric = check_subsequence_metric(metric, scale=scale)
metric_params = metric_params if metric_params is not None else {}
dp = _distance_profile(y, x, dim, Metric(**metric_params), n_jobs)
else:
# HACK
if metric == "mass":
metric = "scaled_euclidean"

dp = _subsequence_distance_profile(
y, _check_ts_array(x), dim, Metric(**metric_params), n_jobs
)
scale = (isinstance(metric, str) and metric.startswith("scaled_")) or scale
if isinstance(metric, str) and metric.startswith("scaled_"):
metric = metric[7:]

Metric = check_metric(metric)
metric_params = metric_params if metric_params is not None else {}

if scale:
std = np.std(y, axis=-1, keepdims=True)
mean = np.mean(y, axis=-1, keepdims=True)
std[std < 1e-13] = 1 # NOTE! see EPSILON in _cdistance.pxd
y = (y - mean) / std

dp = _dilated_distance_profile(
y, x, dim, Metric(**metric_params), dilation, padding, scale, n_jobs
)

# TODO: fix the output
if dp.shape[0] == 1:
Expand All @@ -1318,7 +1367,7 @@ def distance_profile(
},
prefer_skip_nested_validation=True,
)
def argmin_subsequence_distance(
def argmin_subsequence_distance( # noqa: PLR0912
y,
x,
*,
Expand Down Expand Up @@ -1444,92 +1493,3 @@ def argmin_subsequence_distance(
return indices, distances
else:
return indices


def dilated_distance_profile(
y,
x,
*,
dilation=1,
padding=0,
dim=0,
metric="euclidean",
metric_params=None,
scale=False,
n_jobs=None,
):
"""
Compute the dilated distance profile.
The distance profile is computed between the paired subsequences
and samples in `y` and `x`, using the provided dilation and padding.
Parameters
----------
y : array-like of shape (n_samples, m_timestep)
The subsequences.
x : array-like of shape (n_timestep, ) or (n_samples, n_timestep)
The samples. If `x.ndim` is 1, we will broadcast it to have the same
number of samples as `y`.
dilation : int, optional
The dilation, i.e., the spacing between points in the subsequences.
padding : int, optional
The amount of padding applied to the input time series.
dim : int, optional
The dimension in `x` for which the distance profile is computed.
metric : str, optional
The metric.
See ``_SUBSEQUENCE_METRICS.keys()`` for a list of supported metrics.
metric_params : dict, optional
Parameters to the metric.
Read more about the parameters in the
:ref:`User guide <list_of_subsequence_metrics>`.
scale : bool, optional
If True, scale the subsequences before distance computation.
n_jobs : int, optional
The number of parallel jobs.
Returns
-------
ndarray of shape (n_samples, output_size)
The distance profile. The size of the output is given by:
`x.shape[-1] + 2 * padding - (y.shape[-1] - 1) * dilation + 1) + 1`.
"""
y = check_array(y, dtype=float, ensure_2d=False, ensure_ts_array=True)
if x.ndim == 1:
x = np.broadcast_to(x, shape=(y.shape[0], x.shape[0]))

x = check_array(x, dtype=float, ensure_ts_array=True, allow_3d=True)
shapelet_size = (y.shape[2] - 1) * dilation + 1
input_size = x.shape[2] + 2 * padding

if shapelet_size > input_size:
raise ValueError("subsequence in y is larger than input in x")

if y.shape[0] != x.shape[0]:
raise ValueError("y and x must be paired")

if dim >= x.shape[1]:
raise ValueError(
f"The parameter dim must be dim ({dim}) < n_dims ({x.shape[1]})"
)

scale = (isinstance(metric, str) and metric.startswith("scaled_")) or scale
if isinstance(metric, str) and metric.startswith("scaled_"):
metric = metric[7:]

Metric = check_metric(metric)
metric_params = metric_params if metric_params is not None else {}

if scale:
std = np.std(y, axis=-1, keepdims=True)
mean = np.mean(y, axis=-1, keepdims=True)
std[std < 1e-13] = 1 # see EPSILON in _cdistance.pxd
y = (y - mean) / std

# TODO: fix the output
return _dilated_distance_profile(
y, x, dim, Metric(**metric_params), dilation, padding, scale, n_jobs
)
5 changes: 3 additions & 2 deletions tests/wildboar/distance/test_distance.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
from wildboar.distance import (
argmin_distance,
argmin_subsequence_distance,
dilated_distance_profile,
distance_profile,
paired_distance,
paired_subsequence_distance,
Expand Down Expand Up @@ -1566,7 +1565,9 @@ def test_argmin_subsequence_distance(metric, k):

def test_dilated_distance_profile():
X, y = load_two_lead_ecg()
dp = dilated_distance_profile(X[0:2, 9:21], X[2:4], dilation=3, padding=4)
dp = distance_profile(
X[0:2, 9:21], X[2:4], dilation=3, padding=4, metric="euclidean"
)
# fmt: off
expected = np.array([[
2.11436574, 2.24116065, 2.49773792, 3.27044544, 3.51998601,
Expand Down

0 comments on commit b5afc7d

Please sign in to comment.