🔀 Merge branch 'dependencies/pytest-bdd-4.0.1' (#187)
Closes #187. Parallelized Behaviour-Driven Development testing.
weiji14 committed Oct 31, 2020
2 parents 8453eb1 + c66d137 commit 742a98c
Showing 10 changed files with 376 additions and 72 deletions.
41 changes: 12 additions & 29 deletions atlxi_lake.ipynb
@@ -204,35 +204,15 @@
},
{
"cell_type": "markdown",
"metadata": {},
"metadata": {
"lines_to_next_cell": 2
},
"source": [
"# Find Active Subglacial Lake clusters\n",
"\n",
"Uses Density-based spatial clustering of applications with noise (DBSCAN)."
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"def find_clusters(X: cudf.core.dataframe.DataFrame) -> cudf.core.series.Series:\n",
" \"\"\"\n",
" Density-based spatial clustering of applications with noise (DBSCAN)\n",
" See also https://www.naftaliharris.com/blog/visualizing-dbscan-clustering\n",
" \"\"\"\n",
" # Run DBSCAN using 3000 m distance, and minimum of 250 points\n",
" dbscan = cuml.DBSCAN(eps=3000, min_samples=250)\n",
" dbscan.fit(X=X)\n",
"\n",
" cluster_labels = dbscan.labels_ + 1 # noise points -1 becomes 0\n",
" cluster_labels = cluster_labels.mask(cond=cluster_labels == 0) # turn 0 to NaN\n",
" cluster_labels.index = X.index # let labels have same index as input data\n",
"\n",
" return cluster_labels"
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -757,11 +737,14 @@
" # Filling lake points have positive labels (e.g. 1, 2, 3),\n",
" # Noise points have NaN labels (i.e. NaN)\n",
" cluster_vars = [\"x\", \"y\", \"dhdt_slope\"]\n",
" draining_lake_labels = -find_clusters(X=X.loc[X.dhdt_slope < 0][cluster_vars])\n",
" filling_lake_labels = find_clusters(X=X.loc[X.dhdt_slope > 0][cluster_vars])\n",
" draining_lake_labels = -deepicedrain.find_clusters(\n",
" X=X.loc[X.dhdt_slope < 0][cluster_vars]\n",
" )\n",
" filling_lake_labels = deepicedrain.find_clusters(\n",
" X=X.loc[X.dhdt_slope > 0][cluster_vars]\n",
" )\n",
" lake_labels = cudf.concat(objs=[draining_lake_labels, filling_lake_labels])\n",
" lake_labels: cudf.Series = lake_labels.sort_index()\n",
" lake_labels.name = \"cluster_label\"\n",
"\n",
" # Checking all potential subglacial lakes in a basin\n",
" clusters: cudf.Series = lake_labels.unique()\n",
@@ -877,8 +860,8 @@
"source": [
"# Plot clusters on a map in colour, noise points/outliers as small dots\n",
"fig = pygmt.Figure()\n",
"n_clusters_ = len(X_.cluster_label.unique()) - 1 # No. of clusters minus noise (NaN)\n",
"sizes = (X_.cluster_label.isna()).map(arg={True: 0.01, False: 0.1})\n",
"n_clusters_ = len(X_.cluster_id.unique()) - 1 # No. of clusters minus noise (NaN)\n",
"sizes = (X_.cluster_id.isna()).map(arg={True: 0.01, False: 0.1})\n",
"if n_clusters_:\n",
" pygmt.makecpt(cmap=\"polar+h0\", series=(-1.5, 1.5, 1), reverse=True, D=True)\n",
"else:\n",
@@ -888,7 +871,7 @@
" y=X_.y,\n",
" sizes=sizes,\n",
" style=\"cc\",\n",
" color=X_.cluster_label,\n",
" color=X_.cluster_id,\n",
" cmap=True,\n",
" frame=[\n",
" f'WSne+t\"Estimated number of lake clusters at {basin.NAME}: {n_clusters_}\"',\n",
31 changes: 9 additions & 22 deletions atlxi_lake.py
@@ -134,22 +134,6 @@
#
# Uses Density-based spatial clustering of applications with noise (DBSCAN).

# %%
def find_clusters(X: cudf.core.dataframe.DataFrame) -> cudf.core.series.Series:
"""
Density-based spatial clustering of applications with noise (DBSCAN)
See also https://www.naftaliharris.com/blog/visualizing-dbscan-clustering
"""
# Run DBSCAN using 3000 m distance, and minimum of 250 points
dbscan = cuml.DBSCAN(eps=3000, min_samples=250)
dbscan.fit(X=X)

cluster_labels = dbscan.labels_ + 1 # noise points -1 becomes 0
cluster_labels = cluster_labels.mask(cond=cluster_labels == 0) # turn 0 to NaN
cluster_labels.index = X.index # let labels have same index as input data

return cluster_labels


# %% [markdown]
# ### Subglacial Lake Finder algorithm
@@ -212,11 +196,14 @@ def find_clusters(X: cudf.core.dataframe.DataFrame) -> cudf.core.series.Series:
# Filling lake points have positive labels (e.g. 1, 2, 3),
# Noise points have NaN labels (i.e. NaN)
cluster_vars = ["x", "y", "dhdt_slope"]
draining_lake_labels = -find_clusters(X=X.loc[X.dhdt_slope < 0][cluster_vars])
filling_lake_labels = find_clusters(X=X.loc[X.dhdt_slope > 0][cluster_vars])
draining_lake_labels = -deepicedrain.find_clusters(
X=X.loc[X.dhdt_slope < 0][cluster_vars]
)
filling_lake_labels = deepicedrain.find_clusters(
X=X.loc[X.dhdt_slope > 0][cluster_vars]
)
lake_labels = cudf.concat(objs=[draining_lake_labels, filling_lake_labels])
lake_labels: cudf.Series = lake_labels.sort_index()
lake_labels.name = "cluster_label"

# Checking all potential subglacial lakes in a basin
clusters: cudf.Series = lake_labels.unique()
@@ -314,8 +301,8 @@ def find_clusters(X: cudf.core.dataframe.DataFrame) -> cudf.core.series.Series:
# %%
# Plot clusters on a map in colour, noise points/outliers as small dots
fig = pygmt.Figure()
n_clusters_ = len(X_.cluster_label.unique()) - 1 # No. of clusters minus noise (NaN)
sizes = (X_.cluster_label.isna()).map(arg={True: 0.01, False: 0.1})
n_clusters_ = len(X_.cluster_id.unique()) - 1 # No. of clusters minus noise (NaN)
sizes = (X_.cluster_id.isna()).map(arg={True: 0.01, False: 0.1})
if n_clusters_:
pygmt.makecpt(cmap="polar+h0", series=(-1.5, 1.5, 1), reverse=True, D=True)
else:
Expand All @@ -325,7 +312,7 @@ def find_clusters(X: cudf.core.dataframe.DataFrame) -> cudf.core.series.Series:
y=X_.y,
sizes=sizes,
style="cc",
color=X_.cluster_label,
color=X_.cluster_id,
cmap=True,
frame=[
f'WSne+t"Estimated number of lake clusters at {basin.NAME}: {n_clusters_}"',
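As an aside, the label sign convention in the atlxi_lake.py changes above can be illustrated with a tiny, hypothetical example (plain pandas, made-up labels and indices): negating the draining-lake labels keeps them distinct from the positive filling-lake labels once both series are concatenated and re-sorted.

```python
import pandas as pd

# Hypothetical mini-example of the label sign convention used above
draining = -pd.Series(data=[1, 2], index=[0, 3], dtype="Int32")  # becomes -1, -2
filling = pd.Series(data=[1], index=[1], dtype="Int32")          # stays 1
lake_labels = pd.concat(objs=[draining, filling]).sort_index()
print(lake_labels.tolist())  # [-1, 1, -2]
```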
3 changes: 3 additions & 0 deletions deepicedrain/README.md
@@ -24,6 +24,9 @@ Contents:
- ndarray_to_parquet - Turns an n-dimensional xarray/zarr array into a Parquet columnar format
- wide_to_long - Turns a pandas dataframe table with many columns into one with many rows

- :droplet: lakealgorithms.py - Custom algorithms for detecting and filtering active subglacial lakes
- find_clusters - Density-based clustering algorithm (DBSCAN) to group points into lakes

- :world_map: vizplots.py - Makes interactive dashboard plots and publication quality figures
- IceSat2Explorer - Dashboard for interacting with ICESat-2 point clouds on a 2D map
- plot_alongtrack - Makes a 2D along track figure of height measurements taken at different cycle times
1 change: 1 addition & 0 deletions deepicedrain/__init__.py
@@ -6,6 +6,7 @@
import deepicedrain
from deepicedrain.deltamath import calculate_delta, nan_linregress, nanptp
from deepicedrain.extraload import array_to_dataframe, ndarray_to_parquet, wide_to_long
from deepicedrain.lake_algorithms import find_clusters
from deepicedrain.spatiotemporal import (
Region,
deltatime_to_utctime,
68 changes: 68 additions & 0 deletions deepicedrain/lake_algorithms.py
@@ -0,0 +1,68 @@
"""
Custom algorithms for helping to detect active subglacial lakes.
"""
try:
import cudf as xpd
except ImportError:
import pandas as xpd

import numpy as np


def find_clusters(
X: xpd.DataFrame,
eps: float = 3000,
min_samples: int = 250,
output_colname: str = "cluster_id",
) -> xpd.Series:
"""
Classify a point cloud into several groups, with each group being assigned
a positive integer label like 1, 2, 3, etc. Unclassified noise points are
labelled as NaN.
Uses Density-based spatial clustering of applications with noise (DBSCAN).
See also https://www.naftaliharris.com/blog/visualizing-dbscan-clustering

     ***  **              111  NN
    ** ** *              11 22  N
    *  ****      -->     1  2222
    ** **                33  22
     ******              333333

Parameters
----------
X : cudf.DataFrame or pandas.DataFrame
A table of X, Y, Z points to run the clustering algorithm on.
eps : float
        The maximum distance between 2 points such that they reside in the
        same neighborhood. Default is 3000 (metres).
min_samples : int
        The number of samples in a neighborhood for a point to be considered
        a core point (including the point itself). Default is 250 (sample
        points).
output_colname : str
The name of the column for the output Series. Default is 'cluster_id'.

    Returns
    -------
cluster_labels : cudf.Series or pd.Series
Which cluster each datapoint belongs to. Noisy samples are labeled as
NaN.
"""
try:
from cuml.cluster import DBSCAN
except ImportError:
from sklearn.cluster import DBSCAN

# Run DBSCAN using {eps} m distance, and minimum of {min_samples} points
dbscan = DBSCAN(eps=eps, min_samples=min_samples)
dbscan.fit(X=X)

cluster_labels = dbscan.labels_ + 1 # noise points -1 becomes 0
if isinstance(cluster_labels, np.ndarray):
cluster_labels = xpd.Series(data=cluster_labels, dtype=xpd.Int32Dtype())
cluster_labels = cluster_labels.mask(cond=cluster_labels == 0) # turn 0 to NaN
cluster_labels.index = X.index # let labels have same index as input data
cluster_labels.name = output_colname

return cluster_labels
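For context, here is a minimal sketch of exercising the new find_clusters function on synthetic data, assuming the pandas + scikit-learn fallback path (no GPU/cudf needed); the blob coordinates and random seed are made up for illustration:

```python
import numpy as np
import pandas as pd

from deepicedrain import find_clusters

rng = np.random.default_rng(seed=42)
# Two tight blobs of 300 points each (well within eps of one another), plus
# two isolated points far from everything that should come back as NaN noise
blob1 = rng.normal(loc=(0, 0, 1), scale=50, size=(300, 3))
blob2 = rng.normal(loc=(10_000, 10_000, -1), scale=50, size=(300, 3))
noise = np.array([[50_000.0, -50_000.0, 0.0], [-50_000.0, 50_000.0, 0.0]])
X = pd.DataFrame(
    data=np.vstack([blob1, blob2, noise]), columns=["x", "y", "dhdt_slope"]
)

labels = find_clusters(X=X, eps=3000, min_samples=250)
print(labels.value_counts(dropna=False))  # clusters 1 and 2, plus 2 NaN noise points
```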
16 changes: 16 additions & 0 deletions deepicedrain/tests/conftest.py
@@ -0,0 +1,16 @@
"""
This module contains shared fixtures, steps, and hooks.
"""
import pytest


@pytest.fixture
def context():
"""
An empty context class, used for passing arbitrary objects between tests.
"""

class Context:
pass

return Context()
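The empty Context class is just a mutable namespace; a hypothetical pair of steps (not part of this commit) shows how one step can stash a result on the fixture for a later step to assert on:

```python
from pytest_bdd import then, when


@when("a value is computed")
def compute_value(context):
    context.result = 40 + 2  # stash an arbitrary object on the shared fixture


@then("that value can be checked")
def check_value(context):
    assert context.result == 42  # the same fixture instance is injected here
```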
14 changes: 14 additions & 0 deletions deepicedrain/tests/features/subglacial_lakes.feature
@@ -0,0 +1,14 @@
# language: en
Feature: Mapping Antarctic subglacial lakes
In order to understand the flow of subglacial water in Antarctica
As a glaciologist,
We want to see how active subglacial lakes are behaving over time

Scenario Outline: Subglacial Lake Finder
Given some altimetry data at <placename>
When it is passed through an unsupervised clustering algorithm
Then <this_many> potential subglacial lakes are found

Examples:
| placename | this_many |
| whillans_downstream | 7 |
78 changes: 78 additions & 0 deletions deepicedrain/tests/test_subglacial_lakes.py
@@ -0,0 +1,78 @@
"""
Feature tests for analyzing Active Subglacial Lakes in Antarctica.
"""
try:
import cudf as xpd
except ImportError:
import pandas as xpd
import deepicedrain
import fsspec
from pytest_bdd import given, scenario, then, when


@scenario(
feature_name="features/subglacial_lakes.feature",
scenario_name="Subglacial Lake Finder",
example_converters=dict(placename=str, this_many=int),
)
def test_subglacial_lake_finder():
"""Find active subglacial lakes at some place"""
pass


@given("some altimetry data at <placename>", target_fixture="dataframe")
def altimetry_data(placename):
"""
Load up some pre-processed ICESat-2 ATL11 altimetry data with x, y,
dhdt_slope and referencegroundtrack columns from a Parquet file.
"""
# TODO use intake_parquet after https://github.com/intake/intake-parquet/issues/18
with fsspec.open(
f"simplecache::https://github.com/weiji14/deepicedrain/releases/download/v0.3.1/df_dhdt_{placename}.parquet",
simplecache=dict(cache_storage="ATLXI", same_names=True),
) as openfile:
_dataframe: xpd.DataFrame = xpd.read_parquet(
openfile, columns=["x", "y", "dhdt_slope", "referencegroundtrack"]
)
# Take only 1/4 of the data for speed
_dataframe: xpd.DataFrame = _dataframe.loc[: len(_dataframe) / 4]

# Filter to points > 2 * Median(dhdt)
abs_dhdt: xpd.Series = _dataframe.dhdt_slope.abs()
dataframe: xpd.DataFrame = _dataframe.loc[abs_dhdt > 2 * abs_dhdt.median()]

return dataframe


@when("it is passed through an unsupervised clustering algorithm")
def run_unsupervised_clustering(dataframe, context):
"""
    Find draining and filling lake clusters by passing a point cloud through
    the DBSCAN unsupervised clustering algorithm.
"""
X = dataframe
cluster_vars = ["x", "y", "dhdt_slope"]

draining_lake_labels: xpd.Series = -deepicedrain.find_clusters(
X=X.loc[X.dhdt_slope < 0][cluster_vars]
)
filling_lake_labels: xpd.Series = deepicedrain.find_clusters(
X=X.loc[X.dhdt_slope > 0][cluster_vars]
)
lake_labels = xpd.concat(objs=[draining_lake_labels, filling_lake_labels])
context.lake_labels: xpd.Series = lake_labels.sort_index()

return context.lake_labels


@then("<this_many> potential subglacial lakes are found")
def verify_subglacial_lake_labels(this_many, context):
"""
Ensure that the lake_labels column is of int32 type, and that we are
getting a specific number of unique lake clusters.
"""
assert context.lake_labels.dtype.name.lower() == "int32"
clusters: xpd.Series = context.lake_labels.unique()
assert this_many == len(clusters) - 1

return clusters
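A side note on the final assertion: unique() on a nullable integer series keeps the NaN/`<NA>` noise label as one extra entry, which is why the expected lake count is `len(clusters) - 1`. A small pandas illustration (assuming cudf's nullable dtype behaves the same way):

```python
import pandas as pd

labels = pd.Series(data=[1, 1, 2, pd.NA, 2, 3, pd.NA], dtype="Int32")
print(labels.dtype.name)         # "Int32", which lowercases to "int32"
print(list(labels.unique()))     # [1, 2, <NA>, 3]
print(len(labels.unique()) - 1)  # 3 clusters, discounting the <NA> noise label
```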