Refactor (dilated) distance profile.

Unify the `dilated_distance_profile` and `distance_profile` functions to simplify the API.
wildboar-foundation · Jan 8, 2024 · b5afc7d · b5afc7d
1 parent f49a0d5
commit b5afc7d
Show file tree

Hide file tree

Showing 4 changed files with 96 additions and 139 deletions.
diff --git a/src/wildboar/distance/__init__.py b/src/wildboar/distance/__init__.py
@@ -17,7 +17,6 @@
     argmin_subsequence_distance,
     check_metric,
     check_subsequence_metric,
-    dilated_distance_profile,
     distance_profile,
     paired_distance,
     paired_subsequence_distance,
@@ -35,7 +34,6 @@
     "argmin_subsequence_distance",
     "check_metric",
     "check_subsequence_metric",
-    "dilated_distance_profile",
     "distance_profile",
     "pairwise_subsequence_distance",
     "paired_subsequence_distance",

diff --git a/src/wildboar/distance/_cdistance.pyx b/src/wildboar/distance/_cdistance.pyx
@@ -1626,35 +1626,26 @@ def _paired_distance(
     return out.base
 
 
-cdef class _SubsequenceDistanceProfile:
-    cdef Subsequence *subsequence
-    cdef Py_ssize_t n_subsequences
+cdef class _DistanceProfile:
     cdef double[:, :] distance_profile
     cdef TSArray x
+    cdef TSArray y
     cdef Py_ssize_t dim
     cdef SubsequenceMetric metric
 
     def __cinit__(
         self,
-        np.ndarray y,
+        TSArray y,
         TSArray x,
         Py_ssize_t dim,
         SubsequenceMetric metric,
         double[:, :] distance_profile,
     ):
         self.distance_profile = distance_profile
         self.x = x
+        self.y = y
         self.metric = metric
         self.dim = dim
-        self.subsequence = <Subsequence*> malloc(sizeof(Subsequence))
-        self.metric.reset(x)
-        self.metric.from_array(self.subsequence, (self.dim, y))
-
-    def __dealloc__(self):
-        if self.subsequence != NULL:
-            self.metric.free_persistent(self.subsequence)
-            free(self.subsequence)
-            self.subsequence = NULL
 
     @property
     def n_work(self):
@@ -1663,31 +1654,38 @@ cdef class _SubsequenceDistanceProfile:
     def __call__(self, Py_ssize_t job_id, Py_ssize_t offset, Py_ssize_t batch_size):
         cdef Py_ssize_t i, j
         cdef SubsequenceMetric metric = deepcopy(self.metric)
+        cdef SubsequenceView view
 
         with nogil:
             metric.reset(self.x)
             for i in range(offset, offset + batch_size):
-                metric.persistent_profile(
-                    self.subsequence,
-                    self.x,
-                    i,
+                metric.init_transient(self.y, &view, i, 0, self.y.shape[2], 0)
+                metric._distance_profile(
+                    &self.y[i, 0, 0],
+                    self.y.shape[2],
+                    view.mean,
+                    view.std,
+                    view.extra,
+                    &self.x[i, 0, 0],
+                    self.x.shape[2],
                     &self.distance_profile[i, 0],
                 )
+                metric.free_transient(&view)
 
 
-def _subsequence_distance_profile(
-    np.ndarray y,
+def _distance_profile(
+    TSArray y,
     TSArray x,
     Py_ssize_t dim,
     SubsequenceMetric metric,
     n_jobs=None
 ):
     cdef double[:, :] out = np.ones(
-        (x.shape[0], x.shape[2] - y.shape[0] + 1), dtype=float
+        (x.shape[0], x.shape[2] - y.shape[2] + 1), dtype=float
     )
 
     run_in_parallel(
-        _SubsequenceDistanceProfile(
+        _DistanceProfile(
             y,
             x,
             dim,

diff --git a/src/wildboar/distance/_distance.py b/src/wildboar/distance/_distance.py
@@ -19,13 +19,13 @@
     _argmin_distance,
     _argmin_subsequence_distance,
     _dilated_distance_profile,
+    _distance_profile,
     _paired_distance,
     _paired_subsequence_distance,
     _paired_subsequence_match,
     _pairwise_distance,
     _pairwise_subsequence_distance,
     _singleton_pairwise_distance,
-    _subsequence_distance_profile,
     _subsequence_match,
 )
 from ._elastic import (
@@ -1212,6 +1212,11 @@ def argmin_distance(
     {
         "y": ["array-like"],
         "x": ["array-like"],
+        "dilation": [Interval(numbers.Integral, 1, None, closed="left")],
+        "padding": [
+            Interval(numbers.Integral, 0, None, closed="left"),
+            StrOptions({"same"}),
+        ],
         "dim": [Interval(numbers.Integral, 0, None, closed="left")],
         "metric": [callable, StrOptions(_SUBSEQUENCE_METRICS.keys())],
         "metric_params": [None, dict],
@@ -1221,7 +1226,16 @@ def argmin_distance(
     prefer_skip_nested_validation=True,
 )
 def distance_profile(
-    y, x, *, dim=0, metric="mass", metric_params=None, scale=False, n_jobs=None
+    y,
+    x,
+    *,
+    dilation=1,
+    padding=0,
+    dim=0,
+    metric="mass",
+    metric_params=None,
+    scale=False,
+    n_jobs=None,
 ):
     """
     Compute the distance profile.
@@ -1233,10 +1247,16 @@ def distance_profile(
     Parameters
     ----------
     y : array-like of shape (yn_timestep, )
-        The subsequence.
+        The subsequences.
     x : ndarray of shape (n_timestep, ), (n_samples, n_timestep)\
     or (n_samples, n_dims, n_timestep)
-        The input data.
+        The samples. If `x.ndim` is 1, we will broadcast it to have the same
+        number of samples as `y`.
+    dilation : int, optional
+        The dilation, i.e., the spacing between points in the subsequences.
+    padding : int or {"same"}, optional
+        The amount of padding applied to the input time series. If "same", the output
+        size is the same as the input size.
     dim : int, optional
         The dim to search for shapelets.
     metric : str or callable, optional
@@ -1250,16 +1270,14 @@ def distance_profile(
         :ref:`User guide <list_of_subsequence_metrics>`.
     scale : bool, optional
         If True, scale the subsequences before distance computation.
-
-        .. versionadded:: 1.3
     n_jobs : int, optional
         The number of parallel jobs to run.
 
     Returns
     -------
-    ndarray of shape (n_samples, n_timestep - yn_timestep + 1) or\
-            (n_timestep - yn_timestep + 1, )
-        The distance between every subsequence in `x` to `y`.
+    ndarray of shape (n_samples, output_size)
+        The distance profile. `output_size` is given by:
+        `x.shape[-1] + 2 * padding - (y.shape[-1] - 1) * dilation + 1) + 1`.
 
     Warnings
     --------
@@ -1275,27 +1293,58 @@ def distance_profile(
     array([14.00120332, 14.41943788, 14.81597243, ...,  4.75219094,
            5.72681005,  6.70155561])
     """
-    y = _validate_subsequence(y)
-    if len(y) > 1:
-        raise ValueError("A single subsequence expected, got %d" % len(y))
+    y = check_array(y, dtype=float, ensure_2d=False, ensure_ts_array=True)
+    if x.ndim == 1:
+        x = np.broadcast_to(x, shape=(y.shape[0], x.shape[0]))
 
-    y = y[0]
-    x = check_array(x, allow_3d=True, ensure_2d=False, dtype=float)
+    x = check_array(x, allow_3d=True, dtype=float, ensure_ts_array=True)
 
-    if y.shape[0] > x.shape[-1]:
+    shapelet_size = (y.shape[2] - 1) * dilation + 1
+    if padding == "same":
+        if y.shape[2] % 2 == 0:
+            raise ValueError(
+                "padding='same' is only supported for odd subsequence length"
+            )
+        padding = shapelet_size // 2
+
+    input_size = x.shape[2] + 2 * padding
+
+    if shapelet_size > input_size:
+        raise ValueError("subsequence in y is larger than input in x")
+
+    if y.shape[0] != x.shape[0]:
+        raise ValueError("y and x must be paired")
+
+    if dim >= x.shape[1]:
         raise ValueError(
-            "Invalid subsequnce shape (%d > %d)" % (y.shape[0], x.shape[-1])
+            f"The parameter dim must be dim ({dim}) < n_dims ({x.shape[1]})"
         )
-    n_dims = x.shape[1] if x.ndim == 3 else 1
-    if dim >= n_dims:
-        raise ValueError(f"The parameter dim must be dim ({dim}) < n_dims ({n_dims})")
 
-    Metric = check_subsequence_metric(metric, scale=scale)
-    metric_params = metric_params if metric_params is not None else {}
+    if dilation == 1 and padding == 0:
+        Metric = check_subsequence_metric(metric, scale=scale)
+        metric_params = metric_params if metric_params is not None else {}
+        dp = _distance_profile(y, x, dim, Metric(**metric_params), n_jobs)
+    else:
+        # HACK
+        if metric == "mass":
+            metric = "scaled_euclidean"
 
-    dp = _subsequence_distance_profile(
-        y, _check_ts_array(x), dim, Metric(**metric_params), n_jobs
-    )
+        scale = (isinstance(metric, str) and metric.startswith("scaled_")) or scale
+        if isinstance(metric, str) and metric.startswith("scaled_"):
+            metric = metric[7:]
+
+        Metric = check_metric(metric)
+        metric_params = metric_params if metric_params is not None else {}
+
+        if scale:
+            std = np.std(y, axis=-1, keepdims=True)
+            mean = np.mean(y, axis=-1, keepdims=True)
+            std[std < 1e-13] = 1  # NOTE! see EPSILON in _cdistance.pxd
+            y = (y - mean) / std
+
+        dp = _dilated_distance_profile(
+            y, x, dim, Metric(**metric_params), dilation, padding, scale, n_jobs
+        )
 
     # TODO: fix the output
     if dp.shape[0] == 1:
@@ -1318,7 +1367,7 @@ def distance_profile(
     },
     prefer_skip_nested_validation=True,
 )
-def argmin_subsequence_distance(
+def argmin_subsequence_distance(  # noqa: PLR0912
     y,
     x,
     *,
@@ -1444,92 +1493,3 @@ def argmin_subsequence_distance(
         return indices, distances
     else:
         return indices
-
-
-def dilated_distance_profile(
-    y,
-    x,
-    *,
-    dilation=1,
-    padding=0,
-    dim=0,
-    metric="euclidean",
-    metric_params=None,
-    scale=False,
-    n_jobs=None,
-):
-    """
-    Compute the dilated distance profile.
-
-    The distance profile is computed between the paired subsequences
-    and samples in `y` and `x`, using the provided dilation and padding.
-
-    Parameters
-    ----------
-    y : array-like of shape (n_samples, m_timestep)
-        The subsequences.
-    x : array-like of shape (n_timestep, ) or (n_samples, n_timestep)
-        The samples. If `x.ndim` is 1, we will broadcast it to have the same
-        number of samples as `y`.
-    dilation : int, optional
-        The dilation, i.e., the spacing between points in the subsequences.
-    padding : int, optional
-        The amount of padding applied to the input time series.
-    dim : int, optional
-        The dimension in `x` for which the distance profile is computed.
-    metric : str, optional
-        The metric.
-
-        See ``_SUBSEQUENCE_METRICS.keys()`` for a list of supported metrics.
-    metric_params : dict, optional
-        Parameters to the metric.
-
-        Read more about the parameters in the
-        :ref:`User guide <list_of_subsequence_metrics>`.
-    scale : bool, optional
-        If True, scale the subsequences before distance computation.
-    n_jobs : int, optional
-       The number of parallel jobs.
-
-    Returns
-    -------
-    ndarray of shape (n_samples, output_size)
-        The distance profile. The size of the output is given by:
-        `x.shape[-1] + 2 * padding - (y.shape[-1] - 1) * dilation + 1) + 1`.
-    """
-    y = check_array(y, dtype=float, ensure_2d=False, ensure_ts_array=True)
-    if x.ndim == 1:
-        x = np.broadcast_to(x, shape=(y.shape[0], x.shape[0]))
-
-    x = check_array(x, dtype=float, ensure_ts_array=True, allow_3d=True)
-    shapelet_size = (y.shape[2] - 1) * dilation + 1
-    input_size = x.shape[2] + 2 * padding
-
-    if shapelet_size > input_size:
-        raise ValueError("subsequence in y is larger than input in x")
-
-    if y.shape[0] != x.shape[0]:
-        raise ValueError("y and x must be paired")
-
-    if dim >= x.shape[1]:
-        raise ValueError(
-            f"The parameter dim must be dim ({dim}) < n_dims ({x.shape[1]})"
-        )
-
-    scale = (isinstance(metric, str) and metric.startswith("scaled_")) or scale
-    if isinstance(metric, str) and metric.startswith("scaled_"):
-        metric = metric[7:]
-
-    Metric = check_metric(metric)
-    metric_params = metric_params if metric_params is not None else {}
-
-    if scale:
-        std = np.std(y, axis=-1, keepdims=True)
-        mean = np.mean(y, axis=-1, keepdims=True)
-        std[std < 1e-13] = 1  # see EPSILON in _cdistance.pxd
-        y = (y - mean) / std
-
-    # TODO: fix the output
-    return _dilated_distance_profile(
-        y, x, dim, Metric(**metric_params), dilation, padding, scale, n_jobs
-    )
diff --git a/tests/wildboar/distance/test_distance.py b/tests/wildboar/distance/test_distance.py
@@ -8,7 +8,6 @@
 from wildboar.distance import (
     argmin_distance,
     argmin_subsequence_distance,
-    dilated_distance_profile,
     distance_profile,
     paired_distance,
     paired_subsequence_distance,
@@ -1566,7 +1565,9 @@ def test_argmin_subsequence_distance(metric, k):
 
 def test_dilated_distance_profile():
     X, y = load_two_lead_ecg()
-    dp = dilated_distance_profile(X[0:2, 9:21], X[2:4], dilation=3, padding=4)
+    dp = distance_profile(
+        X[0:2, 9:21], X[2:4], dilation=3, padding=4, metric="euclidean"
+    )
     # fmt: off
     expected = np.array([[
         2.11436574, 2.24116065, 2.49773792, 3.27044544, 3.51998601,