✅ Cross-tested CPU/GPU DBSCAN for finding lake clusters
Towards a properly tested and reproducible pipeline for finding and analyzing subglacial lake clusters in Antarctica!

Moving the find_clusters DBSCAN algorithm from atlxi_lake.ipynb into a properly tested and documented function inside lake_algorithms.py. Importing cudf/pandas as xpd to enable cross-device (GPU/CPU) use, and ensuring that an xpd.Series is returned.

Test scenarios (written in Gherkin) are in the subglacial_lakes.feature file, and integration tests with Given-When-Then statements are in test_subglacial_lakes.py. There was quite a bit of boilerplate BDD code to set up, including the empty context class for passing contextual objects between steps. The test parquet file is somewhat hardcoded to a GitHub URL, and is cached using fsspec's simplecache mechanism (ideally this would use intake-parquet, if it worked).

Still need some more robust tests, as the cuml and scikit-learn DBSCAN algorithms return a different number of clusters for whillans_upstream.
Showing 8 changed files with 201 additions and 51 deletions.
lake_algorithms.py (new file, 68 additions):
""" | ||
Custom algorithms for helping to detect active subglacial lakes. | ||
""" | ||
try: | ||
import cudf as xpd | ||
except ImportError: | ||
import pandas as xpd | ||
|
||
import numpy as np | ||
|
||
|
||
def find_clusters( | ||
X: xpd.DataFrame, | ||
eps: float = 3000, | ||
min_samples: int = 250, | ||
output_colname: str = "cluster_id", | ||
) -> xpd.Series: | ||
""" | ||
Classify a point cloud into several groups, with each group being assigned | ||
a positive integer label like 1, 2, 3, etc. Unclassified noise points are | ||
labelled as NaN. | ||
Uses Density-based spatial clustering of applications with noise (DBSCAN). | ||
See also https://www.naftaliharris.com/blog/visualizing-dbscan-clustering | ||
*** ** 111 NN | ||
** ** * 11 22 N | ||
* **** --> 1 2222 | ||
** ** 33 22 | ||
****** 333333 | ||
Parameters | ||
---------- | ||
X : cudf.DataFrame or pandas.DataFrame | ||
A table of X, Y, Z points to run the clustering algorithm on. | ||
eps : float | ||
The maximum distance between 2 points such they reside in the same | ||
neighborhood. Default is 3000 (metres). | ||
min_samples : int | ||
The number of samples in a neighborhood such that this group can be | ||
considered as an important core point (including the point itself). | ||
Default is 250 (sample points). | ||
output_colname : str | ||
The name of the column for the output Series. Default is 'cluster_id'. | ||
Returns | ||
------- | ||
cluster_labels : cudf.Series or pd.Series | ||
Which cluster each datapoint belongs to. Noisy samples are labeled as | ||
NaN. | ||
""" | ||
try: | ||
from cuml.cluster import DBSCAN | ||
except ImportError: | ||
from sklearn.cluster import DBSCAN | ||
|
||
# Run DBSCAN using {eps} m distance, and minimum of {min_samples} points | ||
dbscan = DBSCAN(eps=eps, min_samples=min_samples) | ||
dbscan.fit(X=X) | ||
|
||
cluster_labels = dbscan.labels_ + 1 # noise points -1 becomes 0 | ||
if isinstance(cluster_labels, np.ndarray): | ||
cluster_labels = xpd.Series(data=cluster_labels, dtype=xpd.Int32Dtype()) | ||
cluster_labels = cluster_labels.mask(cond=cluster_labels == 0) # turn 0 to NaN | ||
cluster_labels.index = X.index # let labels have same index as input data | ||
cluster_labels.name = output_colname | ||
|
||
return cluster_labels |
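As a quick illustration (not part of the commit), here is a minimal sketch of calling find_clusters on a synthetic point cloud. The toy eps/min_samples values and the two-blob data are assumptions chosen for demonstration; the 3000 m / 250 point defaults are meant for the real altimetry data:

import numpy as np
import deepicedrain

try:
    import cudf as xpd
except ImportError:
    import pandas as xpd

# Two synthetic blobs of points roughly 7 km apart, with a constant dhdt_slope
rng = np.random.default_rng(seed=42)
xy = np.concatenate(
    [
        rng.normal(loc=0, scale=100, size=(50, 2)),
        rng.normal(loc=5000, scale=100, size=(50, 2)),
    ]
)
points = xpd.DataFrame(
    {"x": xy[:, 0], "y": xy[:, 1], "dhdt_slope": np.full(100, 1.0)}
)

# eps/min_samples scaled down for this toy example
labels = deepicedrain.find_clusters(X=points, eps=500, min_samples=10)
points[labels.name] = labels  # attach the 'cluster_id' column back onto the table
print(labels.value_counts())  # expect two clusters, labelled 1 and 2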
conftest.py (new file, 16 additions):
""" | ||
This module contains shared fixtures, steps, and hooks. | ||
""" | ||
import pytest | ||
|
||
|
||
@pytest.fixture | ||
def context(): | ||
""" | ||
An empty context class, used for passing arbitrary objects between tests. | ||
""" | ||
|
||
class Context: | ||
pass | ||
|
||
return Context() |
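For background on the pattern: pytest-bdd has no built-in equivalent of behave's context object, so this empty class instance acts as a mutable namespace that any step can write to or read from by requesting the context fixture. A hypothetical sketch, with made-up step names:

from pytest_bdd import then, when


@when("a value is computed")
def compute_value(context):
    context.value = 42  # store an arbitrary object on the shared context


@then("the value is remembered")
def check_value(context):
    assert context.value == 42  # a later step reads it back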
subglacial_lakes.feature (new file, 14 additions):
# language: en
Feature: Mapping Antarctic subglacial lakes
  In order to understand the flow of subglacial water in Antarctica
  As a glaciologist,
  We want to see how active subglacial lakes are behaving over time

  Scenario Outline: Subglacial Lake Finder
    Given some altimetry data at <placename>
    When it is passed through an unsupervised clustering algorithm
    Then <this_many> potential subglacial lakes are found

    Examples:
      | placename           | this_many |
      | whillans_downstream | 7         |
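Each row of the Examples table becomes one parametrized test case, with <placename> and <this_many> converted to Python types by the example_converters mapping in test_subglacial_lakes.py below. Only whillans_downstream is listed, since (as the commit message notes) cuml and scikit-learn DBSCAN currently disagree on the cluster count for whillans_upstream.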
test_subglacial_lakes.py (new file, 78 additions):
""" | ||
Feature tests for analyzing Active Subglacial Lakes in Antactica. | ||
""" | ||
try: | ||
import cudf as xpd | ||
except ImportError: | ||
import pandas as xpd | ||
import deepicedrain | ||
import fsspec | ||
from pytest_bdd import given, scenario, then, when | ||
|
||
|
||
@scenario( | ||
feature_name="features/subglacial_lakes.feature", | ||
scenario_name="Subglacial Lake Finder", | ||
example_converters=dict(placename=str, this_many=int), | ||
) | ||
def test_subglacial_lake_finder(): | ||
"""Find active subglacial lakes at some place""" | ||
pass | ||
|
||
|
||
@given("some altimetry data at <placename>", target_fixture="dataframe") | ||
def altimetry_data(placename): | ||
""" | ||
Load up some pre-processed ICESat-2 ATL11 altimetry data with x, y, | ||
dhdt_slope and referencegroundtrack columns from a Parquet file. | ||
""" | ||
# TODO use intake_parquet after https://github.com/intake/intake-parquet/issues/18 | ||
with fsspec.open( | ||
f"simplecache::https://github.com/weiji14/deepicedrain/releases/download/v0.3.1/df_dhdt_{placename}.parquet", | ||
simplecache=dict(cache_storage="ATLXI", same_names=True), | ||
) as openfile: | ||
_dataframe: xpd.DataFrame = xpd.read_parquet( | ||
openfile, columns=["x", "y", "dhdt_slope", "referencegroundtrack"] | ||
) | ||
# Take only 1/4 of the data for speed | ||
_dataframe: xpd.DataFrame = _dataframe.loc[: len(_dataframe) / 4] | ||
|
||
# Filter to points > 2 * Median(dhdt) | ||
abs_dhdt: xpd.Series = _dataframe.dhdt_slope.abs() | ||
dataframe: xpd.DataFrame = _dataframe.loc[abs_dhdt > 2 * abs_dhdt.median()] | ||
|
||
return dataframe | ||
|
||
|
||
@when("it is passed through an unsupervised clustering algorithm") | ||
def run_unsupervised_clustering(dataframe, context): | ||
""" | ||
Find draining and filling lake clusters by pass a point cloud through the | ||
DBSCAN unsupervised clustering algorithm. | ||
""" | ||
X = dataframe | ||
cluster_vars = ["x", "y", "dhdt_slope"] | ||
|
||
draining_lake_labels: xpd.Series = -deepicedrain.find_clusters( | ||
X=X.loc[X.dhdt_slope < 0][cluster_vars] | ||
) | ||
filling_lake_labels: xpd.Series = deepicedrain.find_clusters( | ||
X=X.loc[X.dhdt_slope > 0][cluster_vars] | ||
) | ||
lake_labels = xpd.concat(objs=[draining_lake_labels, filling_lake_labels]) | ||
context.lake_labels: xpd.Series = lake_labels.sort_index() | ||
|
||
return context.lake_labels | ||
|
||
|
||
@then("<this_many> potential subglacial lakes are found") | ||
def verify_subglacial_lake_labels(this_many, context): | ||
""" | ||
Ensure that the lake_labels column is of int32 type, and that we are | ||
getting a specific number of unique lake clusters. | ||
""" | ||
assert context.lake_labels.dtype.name.lower() == "int32" | ||
clusters: xpd.Series = context.lake_labels.unique() | ||
assert this_many == len(clusters) - 1 | ||
|
||
return clusters |
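One possible direction for the more robust tests mentioned in the commit message (a hypothetical sketch, not part of the commit): instead of pinning an exact cluster count, assert structural properties of the labels that hold regardless of whether cuml or scikit-learn produced them:

def check_label_invariants(lake_labels: xpd.Series) -> None:
    """
    Device-agnostic sanity checks: noise points are NaN (never the 0 or -1
    sentinels), draining lakes carry negative labels, filling lakes positive
    ones counting up from 1.
    """
    valid = lake_labels.dropna()
    assert (valid != 0).all()  # 0 would mean a noise point slipped through
    assert valid.abs().min() >= 1  # labels start at 1 (or -1 for draining)

The then-step above could call check_label_invariants(context.lake_labels) before comparing cluster counts.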