✅ Cross-tested CPU/GPU DBSCAN for finding lake clusters
Towards a properly tested and reproducible pipeline for finding and analyzing subglacial lake clusters in Antarctica!

Moving the find_clusters DBSCAN algorithm from atlxi_lake.ipynb into a properly tested and documented function inside lake_algorithms.py. Importing cudf/pandas as xpd to enable cross-device (GPU/CPU) use, and ensuring that an xpd.Series is returned.

Test scenarios (written in Gherkin) are in the subglacial_lakes.feature file, and integration tests with Given-When-Then statements are in test_subglacial_lakes.py. There was quite a bit of boilerplate BDD code to set up, including the empty context class for passing contextual objects between steps. The test parquet file is somewhat hardcoded to a GitHub URL, and is cached using fsspec's simplecache mechanism (ideally this would use intake-parquet, if it worked).

Still need some more robust tests, as the cuml and scikit-learn DBSCAN algorithms return a different number of clusters for whillans_upstream.
Showing 8 changed files with 201 additions and 51 deletions.
lake_algorithms.py (new file, 68 additions):
""" | ||
Custom algorithms for helping to detect active subglacial lakes. | ||
""" | ||
try: | ||
import cudf as xpd | ||
except ImportError: | ||
import pandas as xpd | ||
|
||
import numpy as np | ||
|
||
|
||
def find_clusters( | ||
X: xpd.DataFrame, | ||
eps: float = 3000, | ||
min_samples: int = 250, | ||
output_colname: str = "cluster_id", | ||
) -> xpd.Series: | ||
""" | ||
Classify a point cloud into several groups, with each group being assigned | ||
a positive integer label like 1, 2, 3, etc. Unclassified noise points are | ||
labelled as NaN. | ||
Uses Density-based spatial clustering of applications with noise (DBSCAN). | ||
See also https://www.naftaliharris.com/blog/visualizing-dbscan-clustering | ||
*** ** 111 NN | ||
** ** * 11 22 N | ||
* **** --> 1 2222 | ||
** ** 33 22 | ||
****** 333333 | ||
Parameters | ||
---------- | ||
X : cudf.DataFrame or pandas.DataFrame | ||
A table of X, Y, Z points to run the clustering algorithm on. | ||
eps : float | ||
The maximum distance between 2 points such they reside in the same | ||
neighborhood. Default is 3000 (metres). | ||
min_samples : int | ||
The number of samples in a neighborhood such that this group can be | ||
considered as an important core point (including the point itself). | ||
Default is 250 (sample points). | ||
output_colname : str | ||
The name of the column for the output Series. Default is 'cluster_id'. | ||
Returns | ||
------- | ||
cluster_labels : cudf.Series or pd.Series | ||
Which cluster each datapoint belongs to. Noisy samples are labeled as | ||
NaN. | ||
""" | ||
try: | ||
from cuml.cluster import DBSCAN | ||
except ImportError: | ||
from sklearn.cluster import DBSCAN | ||
|
||
# Run DBSCAN using {eps} m distance, and minimum of {min_samples} points | ||
dbscan = DBSCAN(eps=eps, min_samples=min_samples) | ||
dbscan.fit(X=X) | ||
|
||
cluster_labels = dbscan.labels_ + 1 # noise points -1 becomes 0 | ||
if isinstance(cluster_labels, np.ndarray): | ||
cluster_labels = xpd.Series(data=cluster_labels, dtype=xpd.Int32Dtype()) | ||
cluster_labels = cluster_labels.mask(cond=cluster_labels == 0) # turn 0 to NaN | ||
cluster_labels.index = X.index # let labels have same index as input data | ||
cluster_labels.name = output_colname | ||
|
||
return cluster_labels |
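As a quick illustration (not part of the commit), here is a minimal sketch of calling find_clusters on a synthetic point cloud. The toy eps/min_samples values and the two-blob data are assumptions chosen for demonstration; the 3000 m / 250 point defaults are meant for the real altimetry data:

import numpy as np
import deepicedrain

try:
    import cudf as xpd
except ImportError:
    import pandas as xpd

# Two synthetic blobs of points roughly 7 km apart, with a constant dhdt_slope
rng = np.random.default_rng(seed=42)
xy = np.concatenate(
    [
        rng.normal(loc=0, scale=100, size=(50, 2)),
        rng.normal(loc=5000, scale=100, size=(50, 2)),
    ]
)
points = xpd.DataFrame(
    {"x": xy[:, 0], "y": xy[:, 1], "dhdt_slope": np.full(100, 1.0)}
)

# eps/min_samples scaled down for this toy example
labels = deepicedrain.find_clusters(X=points, eps=500, min_samples=10)
points[labels.name] = labels  # attach the 'cluster_id' column back onto the table
print(labels.value_counts())  # expect two clusters, labelled 1 and 2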
conftest.py (new file, 16 additions):
""" | ||
This module contains shared fixtures, steps, and hooks. | ||
""" | ||
import pytest | ||
|
||
|
||
@pytest.fixture | ||
def context(): | ||
""" | ||
An empty context class, used for passing arbitrary objects between tests. | ||
""" | ||
|
||
class Context: | ||
pass | ||
|
||
return Context() |
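For background on the pattern: pytest-bdd has no built-in equivalent of behave's context object, so this empty class instance acts as a mutable namespace that any step can write to or read from by requesting the context fixture. A hypothetical sketch, with made-up step names:

from pytest_bdd import then, when


@when("a value is computed")
def compute_value(context):
    context.value = 42  # store an arbitrary object on the shared context


@then("the value is remembered")
def check_value(context):
    assert context.value == 42  # a later step reads it back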
subglacial_lakes.feature (new file, 14 additions):
# language: en
Feature: Mapping Antarctic subglacial lakes
  In order to understand the flow of subglacial water in Antarctica
  As a glaciologist,
  We want to see how active subglacial lakes are behaving over time

  Scenario Outline: Subglacial Lake Finder
    Given some altimetry data at <placename>
    When it is passed through an unsupervised clustering algorithm
    Then <this_many> potential subglacial lakes are found

    Examples:
      | placename           | this_many |
      | whillans_downstream | 7         |
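Each row of the Examples table becomes one parametrized test case, with <placename> and <this_many> converted to Python types by the example_converters mapping in test_subglacial_lakes.py below. Only whillans_downstream is listed, since (as the commit message notes) cuml and scikit-learn DBSCAN currently disagree on the cluster count for whillans_upstream.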
test_subglacial_lakes.py (new file, 78 additions):
""" | ||
Feature tests for analyzing Active Subglacial Lakes in Antactica. | ||
""" | ||
try: | ||
import cudf as xpd | ||
except ImportError: | ||
import pandas as xpd | ||
import deepicedrain | ||
import fsspec | ||
from pytest_bdd import given, scenario, then, when | ||
|
||
|
||
@scenario( | ||
feature_name="features/subglacial_lakes.feature", | ||
scenario_name="Subglacial Lake Finder", | ||
example_converters=dict(placename=str, this_many=int), | ||
) | ||
def test_subglacial_lake_finder(): | ||
"""Find active subglacial lakes at some place""" | ||
pass | ||
|
||
|
||
@given("some altimetry data at <placename>", target_fixture="dataframe") | ||
def altimetry_data(placename): | ||
""" | ||
Load up some pre-processed ICESat-2 ATL11 altimetry data with x, y, | ||
dhdt_slope and referencegroundtrack columns from a Parquet file. | ||
""" | ||
# TODO use intake_parquet after https://github.com/intake/intake-parquet/issues/18 | ||
with fsspec.open( | ||
f"simplecache::https://github.com/weiji14/deepicedrain/releases/download/v0.3.1/df_dhdt_{placename}.parquet", | ||
simplecache=dict(cache_storage="ATLXI", same_names=True), | ||
) as openfile: | ||
_dataframe: xpd.DataFrame = xpd.read_parquet( | ||
openfile, columns=["x", "y", "dhdt_slope", "referencegroundtrack"] | ||
) | ||
# Take only 1/4 of the data for speed | ||
_dataframe: xpd.DataFrame = _dataframe.loc[: len(_dataframe) / 4] | ||
|
||
# Filter to points > 2 * Median(dhdt) | ||
abs_dhdt: xpd.Series = _dataframe.dhdt_slope.abs() | ||
dataframe: xpd.DataFrame = _dataframe.loc[abs_dhdt > 2 * abs_dhdt.median()] | ||
|
||
return dataframe | ||
|
||
|
||
@when("it is passed through an unsupervised clustering algorithm") | ||
def run_unsupervised_clustering(dataframe, context): | ||
""" | ||
Find draining and filling lake clusters by pass a point cloud through the | ||
DBSCAN unsupervised clustering algorithm. | ||
""" | ||
X = dataframe | ||
cluster_vars = ["x", "y", "dhdt_slope"] | ||
|
||
draining_lake_labels: xpd.Series = -deepicedrain.find_clusters( | ||
X=X.loc[X.dhdt_slope < 0][cluster_vars] | ||
) | ||
filling_lake_labels: xpd.Series = deepicedrain.find_clusters( | ||
X=X.loc[X.dhdt_slope > 0][cluster_vars] | ||
) | ||
lake_labels = xpd.concat(objs=[draining_lake_labels, filling_lake_labels]) | ||
context.lake_labels: xpd.Series = lake_labels.sort_index() | ||
|
||
return context.lake_labels | ||
|
||
|
||
@then("<this_many> potential subglacial lakes are found") | ||
def verify_subglacial_lake_labels(this_many, context): | ||
""" | ||
Ensure that the lake_labels column is of int32 type, and that we are | ||
getting a specific number of unique lake clusters. | ||
""" | ||
assert context.lake_labels.dtype.name.lower() == "int32" | ||
clusters: xpd.Series = context.lake_labels.unique() | ||
assert this_many == len(clusters) - 1 | ||
|
||
return clusters |
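One possible direction for the more robust tests mentioned in the commit message (a hypothetical sketch, not part of the commit): instead of pinning an exact cluster count, assert structural properties of the labels that hold regardless of whether cuml or scikit-learn produced them:

def check_label_invariants(lake_labels: xpd.Series) -> None:
    """
    Device-agnostic sanity checks: noise points are NaN (never the 0 or -1
    sentinels), draining lakes carry negative labels, filling lakes positive
    ones counting up from 1.
    """
    valid = lake_labels.dropna()
    assert (valid != 0).all()  # 0 would mean a noise point slipped through
    assert valid.abs().min() >= 1  # labels start at 1 (or -1 for draining)

The then-step above could call check_label_invariants(context.lake_labels) before comparing cluster counts.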