diff --git a/atlxi_lake.ipynb b/atlxi_lake.ipynb
index bb70b45..3801a98 100644
--- a/atlxi_lake.ipynb
+++ b/atlxi_lake.ipynb
@@ -204,35 +204,15 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "lines_to_next_cell": 2
+   },
    "source": [
     "# Find Active Subglacial Lake clusters\n",
     "\n",
     "Uses Density-based spatial clustering of applications with noise (DBSCAN)."
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": 9,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def find_clusters(X: cudf.core.dataframe.DataFrame) -> cudf.core.series.Series:\n",
-    "    \"\"\"\n",
-    "    Density-based spatial clustering of applications with noise (DBSCAN)\n",
-    "    See also https://www.naftaliharris.com/blog/visualizing-dbscan-clustering\n",
-    "    \"\"\"\n",
-    "    # Run DBSCAN using 3000 m distance, and minimum of 250 points\n",
-    "    dbscan = cuml.DBSCAN(eps=3000, min_samples=250)\n",
-    "    dbscan.fit(X=X)\n",
-    "\n",
-    "    cluster_labels = dbscan.labels_ + 1  # noise points -1 becomes 0\n",
-    "    cluster_labels = cluster_labels.mask(cond=cluster_labels == 0)  # turn 0 to NaN\n",
-    "    cluster_labels.index = X.index  # let labels have same index as input data\n",
-    "\n",
-    "    return cluster_labels"
-   ]
-  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -757,11 +737,14 @@
     "    # Filling lake points have positive labels (e.g. 1, 2, 3),\n",
     "    # Noise points have NaN labels (i.e. NaN)\n",
     "    cluster_vars = [\"x\", \"y\", \"dhdt_slope\"]\n",
-    "    draining_lake_labels = -find_clusters(X=X.loc[X.dhdt_slope < 0][cluster_vars])\n",
-    "    filling_lake_labels = find_clusters(X=X.loc[X.dhdt_slope > 0][cluster_vars])\n",
+    "    draining_lake_labels = -deepicedrain.find_clusters(\n",
+    "        X=X.loc[X.dhdt_slope < 0][cluster_vars]\n",
+    "    )\n",
+    "    filling_lake_labels = deepicedrain.find_clusters(\n",
+    "        X=X.loc[X.dhdt_slope > 0][cluster_vars]\n",
+    "    )\n",
     "    lake_labels = cudf.concat(objs=[draining_lake_labels, filling_lake_labels])\n",
     "    lake_labels: cudf.Series = lake_labels.sort_index()\n",
-    "    lake_labels.name = \"cluster_label\"\n",
     "\n",
     "    # Checking all potential subglacial lakes in a basin\n",
     "    clusters: cudf.Series = lake_labels.unique()\n",
@@ -877,8 +860,8 @@
    "source": [
     "# Plot clusters on a map in colour, noise points/outliers as small dots\n",
     "fig = pygmt.Figure()\n",
-    "n_clusters_ = len(X_.cluster_label.unique()) - 1  # No. of clusters minus noise (NaN)\n",
-    "sizes = (X_.cluster_label.isna()).map(arg={True: 0.01, False: 0.1})\n",
+    "n_clusters_ = len(X_.cluster_id.unique()) - 1  # No. of clusters minus noise (NaN)\n",
+    "sizes = (X_.cluster_id.isna()).map(arg={True: 0.01, False: 0.1})\n",
     "if n_clusters_:\n",
     "    pygmt.makecpt(cmap=\"polar+h0\", series=(-1.5, 1.5, 1), reverse=True, D=True)\n",
     "else:\n",
@@ -888,7 +871,7 @@
     "    y=X_.y,\n",
     "    sizes=sizes,\n",
     "    style=\"cc\",\n",
-    "    color=X_.cluster_label,\n",
+    "    color=X_.cluster_id,\n",
     "    cmap=True,\n",
     "    frame=[\n",
     "        f'WSne+t\"Estimated number of lake clusters at {basin.NAME}: {n_clusters_}\"',\n",
diff --git a/atlxi_lake.py b/atlxi_lake.py
index 07806b5..aca35f2 100644
--- a/atlxi_lake.py
+++ b/atlxi_lake.py
@@ -134,22 +134,6 @@
 #
 # Uses Density-based spatial clustering of applications with noise (DBSCAN).
-# %%
-def find_clusters(X: cudf.core.dataframe.DataFrame) -> cudf.core.series.Series:
-    """
-    Density-based spatial clustering of applications with noise (DBSCAN)
-    See also https://www.naftaliharris.com/blog/visualizing-dbscan-clustering
-    """
-    # Run DBSCAN using 3000 m distance, and minimum of 250 points
-    dbscan = cuml.DBSCAN(eps=3000, min_samples=250)
-    dbscan.fit(X=X)
-
-    cluster_labels = dbscan.labels_ + 1  # noise points -1 becomes 0
-    cluster_labels = cluster_labels.mask(cond=cluster_labels == 0)  # turn 0 to NaN
-    cluster_labels.index = X.index  # let labels have same index as input data
-
-    return cluster_labels
-
 # %% [markdown]
 # ### Subglacial Lake Finder algorithm
@@ -212,11 +196,14 @@ def find_clusters(X: cudf.core.dataframe.DataFrame) -> cudf.core.series.Series:
     # Filling lake points have positive labels (e.g. 1, 2, 3),
     # Noise points have NaN labels (i.e. NaN)
     cluster_vars = ["x", "y", "dhdt_slope"]
-    draining_lake_labels = -find_clusters(X=X.loc[X.dhdt_slope < 0][cluster_vars])
-    filling_lake_labels = find_clusters(X=X.loc[X.dhdt_slope > 0][cluster_vars])
+    draining_lake_labels = -deepicedrain.find_clusters(
+        X=X.loc[X.dhdt_slope < 0][cluster_vars]
+    )
+    filling_lake_labels = deepicedrain.find_clusters(
+        X=X.loc[X.dhdt_slope > 0][cluster_vars]
+    )
     lake_labels = cudf.concat(objs=[draining_lake_labels, filling_lake_labels])
     lake_labels: cudf.Series = lake_labels.sort_index()
-    lake_labels.name = "cluster_label"
 
     # Checking all potential subglacial lakes in a basin
     clusters: cudf.Series = lake_labels.unique()
@@ -314,8 +301,8 @@ def find_clusters(X: cudf.core.dataframe.DataFrame) -> cudf.core.series.Series:
 # %%
 # Plot clusters on a map in colour, noise points/outliers as small dots
 fig = pygmt.Figure()
-n_clusters_ = len(X_.cluster_label.unique()) - 1  # No. of clusters minus noise (NaN)
-sizes = (X_.cluster_label.isna()).map(arg={True: 0.01, False: 0.1})
+n_clusters_ = len(X_.cluster_id.unique()) - 1  # No. of clusters minus noise (NaN)
+sizes = (X_.cluster_id.isna()).map(arg={True: 0.01, False: 0.1})
 if n_clusters_:
     pygmt.makecpt(cmap="polar+h0", series=(-1.5, 1.5, 1), reverse=True, D=True)
 else:
@@ -325,7 +312,7 @@ def find_clusters(X: cudf.core.dataframe.DataFrame) -> cudf.core.series.Series:
     y=X_.y,
     sizes=sizes,
     style="cc",
-    color=X_.cluster_label,
+    color=X_.cluster_id,
     cmap=True,
     frame=[
         f'WSne+t"Estimated number of lake clusters at {basin.NAME}: {n_clusters_}"',
diff --git a/deepicedrain/README.md b/deepicedrain/README.md
index 17c6af7..929afbc 100644
--- a/deepicedrain/README.md
+++ b/deepicedrain/README.md
@@ -24,6 +24,9 @@ Contents:
     - ndarray_to_parquet - Turns an n-dimensional xarray/zarr array into an a parquet columnar format
     - wide_to_long - Turns a pandas dataframe table with many columns into one with many rows
 
+- :droplet: lake_algorithms.py - Custom algorithms for detecting and filtering active subglacial lakes
+    - find_clusters - Density-based clustering algorithm (DBSCAN) to group points into lakes
+
 - :world_map: vizplots.py - Makes interactive dashboard plots and publication quality figures
     - IceSat2Explorer - Dashboard for interacting with ICESat-2 point clouds on a 2D map
     - plot_alongtrack - Makes a 2D along track figure of height measurements taken at different cycle times
diff --git a/deepicedrain/__init__.py b/deepicedrain/__init__.py
index 8e5d3fa..1040ccd 100644
--- a/deepicedrain/__init__.py
+++ b/deepicedrain/__init__.py
@@ -6,6 +6,7 @@ import deepicedrain
 from deepicedrain.deltamath import calculate_delta, nan_linregress, nanptp
 from deepicedrain.extraload import array_to_dataframe, ndarray_to_parquet, wide_to_long
+from deepicedrain.lake_algorithms import find_clusters
 from deepicedrain.spatiotemporal import (
     Region,
     deltatime_to_utctime,
diff --git a/deepicedrain/lake_algorithms.py b/deepicedrain/lake_algorithms.py
new file mode 100644
index 0000000..e134700
--- /dev/null
+++ b/deepicedrain/lake_algorithms.py
@@ -0,0 +1,68 @@
+"""
+Custom algorithms for helping to detect active subglacial lakes.
+"""
+try:
+    import cudf as xpd
+except ImportError:
+    import pandas as xpd
+
+import numpy as np
+
+
+def find_clusters(
+    X: xpd.DataFrame,
+    eps: float = 3000,
+    min_samples: int = 250,
+    output_colname: str = "cluster_id",
+) -> xpd.Series:
+    """
+    Classify a point cloud into several groups, with each group being assigned
+    a positive integer label like 1, 2, 3, etc. Unclassified noise points are
+    labelled as NaN.
+
+    Uses Density-based spatial clustering of applications with noise (DBSCAN).
+    See also https://www.naftaliharris.com/blog/visualizing-dbscan-clustering
+
+    ***        **      111       NN
+    **  **   *        11    22   N
+    *  ****     -->   1   2222
+      **  **             33  22
+        ******             333333
+
+    Parameters
+    ----------
+    X : cudf.DataFrame or pandas.DataFrame
+        A table of X, Y, Z points to run the clustering algorithm on.
+    eps : float
+        The maximum distance between 2 points such that they reside in the
+        same neighborhood. Default is 3000 (metres).
+    min_samples : int
+        The number of samples in a neighborhood such that this group can be
+        considered an important core point (including the point itself).
+        Default is 250 (sample points).
+    output_colname : str
+        The name of the column for the output Series. Default is 'cluster_id'.
+
+    Returns
+    -------
+    cluster_labels : cudf.Series or pandas.Series
+        Which cluster each datapoint belongs to. Noisy samples are labeled as
+        NaN.
+ """ + try: + from cuml.cluster import DBSCAN + except ImportError: + from sklearn.cluster import DBSCAN + + # Run DBSCAN using {eps} m distance, and minimum of {min_samples} points + dbscan = DBSCAN(eps=eps, min_samples=min_samples) + dbscan.fit(X=X) + + cluster_labels = dbscan.labels_ + 1 # noise points -1 becomes 0 + if isinstance(cluster_labels, np.ndarray): + cluster_labels = xpd.Series(data=cluster_labels, dtype=xpd.Int32Dtype()) + cluster_labels = cluster_labels.mask(cond=cluster_labels == 0) # turn 0 to NaN + cluster_labels.index = X.index # let labels have same index as input data + cluster_labels.name = output_colname + + return cluster_labels diff --git a/deepicedrain/tests/conftest.py b/deepicedrain/tests/conftest.py new file mode 100644 index 0000000..bfd91a7 --- /dev/null +++ b/deepicedrain/tests/conftest.py @@ -0,0 +1,16 @@ +""" +This module contains shared fixtures, steps, and hooks. +""" +import pytest + + +@pytest.fixture +def context(): + """ + An empty context class, used for passing arbitrary objects between tests. + """ + + class Context: + pass + + return Context() diff --git a/deepicedrain/tests/features/subglacial_lakes.feature b/deepicedrain/tests/features/subglacial_lakes.feature new file mode 100644 index 0000000..6f7d5f6 --- /dev/null +++ b/deepicedrain/tests/features/subglacial_lakes.feature @@ -0,0 +1,14 @@ +# language: en +Feature: Mapping Antarctic subglacial lakes + In order to understand the flow of subglacial water in Antarctica + As a glaciologist, + We want to see how active subglacial lakes are behaving over time + + Scenario Outline: Subglacial Lake Finder + Given some altimetry data at + When it is passed through an unsupervised clustering algorithm + Then potential subglacial lakes are found + + Examples: + | placename | this_many | + | whillans_downstream | 7 | diff --git a/deepicedrain/tests/test_subglacial_lakes.py b/deepicedrain/tests/test_subglacial_lakes.py new file mode 100644 index 0000000..fa5efd5 --- /dev/null +++ b/deepicedrain/tests/test_subglacial_lakes.py @@ -0,0 +1,78 @@ +""" +Feature tests for analyzing Active Subglacial Lakes in Antactica. +""" +try: + import cudf as xpd +except ImportError: + import pandas as xpd +import deepicedrain +import fsspec +from pytest_bdd import given, scenario, then, when + + +@scenario( + feature_name="features/subglacial_lakes.feature", + scenario_name="Subglacial Lake Finder", + example_converters=dict(placename=str, this_many=int), +) +def test_subglacial_lake_finder(): + """Find active subglacial lakes at some place""" + pass + + +@given("some altimetry data at ", target_fixture="dataframe") +def altimetry_data(placename): + """ + Load up some pre-processed ICESat-2 ATL11 altimetry data with x, y, + dhdt_slope and referencegroundtrack columns from a Parquet file. 
+ """ + # TODO use intake_parquet after https://github.com/intake/intake-parquet/issues/18 + with fsspec.open( + f"simplecache::https://github.com/weiji14/deepicedrain/releases/download/v0.3.1/df_dhdt_{placename}.parquet", + simplecache=dict(cache_storage="ATLXI", same_names=True), + ) as openfile: + _dataframe: xpd.DataFrame = xpd.read_parquet( + openfile, columns=["x", "y", "dhdt_slope", "referencegroundtrack"] + ) + # Take only 1/4 of the data for speed + _dataframe: xpd.DataFrame = _dataframe.loc[: len(_dataframe) / 4] + + # Filter to points > 2 * Median(dhdt) + abs_dhdt: xpd.Series = _dataframe.dhdt_slope.abs() + dataframe: xpd.DataFrame = _dataframe.loc[abs_dhdt > 2 * abs_dhdt.median()] + + return dataframe + + +@when("it is passed through an unsupervised clustering algorithm") +def run_unsupervised_clustering(dataframe, context): + """ + Find draining and filling lake clusters by pass a point cloud through the + DBSCAN unsupervised clustering algorithm. + """ + X = dataframe + cluster_vars = ["x", "y", "dhdt_slope"] + + draining_lake_labels: xpd.Series = -deepicedrain.find_clusters( + X=X.loc[X.dhdt_slope < 0][cluster_vars] + ) + filling_lake_labels: xpd.Series = deepicedrain.find_clusters( + X=X.loc[X.dhdt_slope > 0][cluster_vars] + ) + lake_labels = xpd.concat(objs=[draining_lake_labels, filling_lake_labels]) + context.lake_labels: xpd.Series = lake_labels.sort_index() + + return context.lake_labels + + +@then(" potential subglacial lakes are found") +def verify_subglacial_lake_labels(this_many, context): + """ + Ensure that the lake_labels column is of int32 type, and that we are + getting a specific number of unique lake clusters. + """ + assert context.lake_labels.dtype.name.lower() == "int32" + clusters: xpd.Series = context.lake_labels.unique() + assert this_many == len(clusters) - 1 + + return clusters