Skip to content

Commit

Permalink
♻️ Refactor to enable plotting drain/fill lakes in one figure
Browse files Browse the repository at this point in the history
Combining draining/filling active lake cluster labels, which allows us to reduce the amount of for-loop nesting in the active subglacial lake finder code, and plot both draining/filling lakes in the same figure! Cluster labels are now negative integers for draining lakes, positive integers for filling lakes, and NaN for noise points. Lake cluster plot now uses red (draining) and blue (filling) 'polar' colormap, with unclassified noise points in black as before. Code still takes 11 seconds to run for the entire Antarctic continent, which is awesome! Also made a minor change to the deepicedrain/__init__.py script to disable loading the IceSat2Explorer dashboard script, as otherwise `import deepicedrain` will load stuff into GPU memory!
  • Loading branch information
weiji14 committed Sep 13, 2020
1 parent 8efb862 commit 8982919
Show file tree
Hide file tree
Showing 5 changed files with 139 additions and 93 deletions.
3 changes: 2 additions & 1 deletion atlxi_dhdt.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
"import dask\n",
"import datashader\n",
"import deepicedrain\n",
"import deepicedrain.vizplots\n",
"import holoviews as hv\n",
"import hvplot.cudf # comment out if no GPU\n",
"import hvplot.pandas\n",
Expand Down Expand Up @@ -942,7 +943,7 @@
"source": [
"# Interactive holoviews scatter plot to find referencegroundtrack needed\n",
"# Tip: Hover over the points, and find those with high 'dhdt_slope' values\n",
"viewer = deepicedrain.IceSat2Explorer(name=\"ICESat-2 Explorer\")\n",
"viewer = deepicedrain.vizplots.IceSat2Explorer(name=\"ICESat-2 Explorer\")\n",
"dashboard: pn.layout.Column = pn.Column(viewer.widgets, viewer.view)\n",
"# dashboard"
]
Expand Down
3 changes: 2 additions & 1 deletion atlxi_dhdt.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
import dask
import datashader
import deepicedrain
import deepicedrain.vizplots
import holoviews as hv
import hvplot.cudf # comment out if no GPU
import hvplot.pandas
Expand Down Expand Up @@ -431,7 +432,7 @@
# %%
# Interactive holoviews scatter plot to find referencegroundtrack needed
# Tip: Hover over the points, and find those with high 'dhdt_slope' values
viewer = deepicedrain.IceSat2Explorer(name="ICESat-2 Explorer")
viewer = deepicedrain.vizplots.IceSat2Explorer(name="ICESat-2 Explorer")
dashboard: pn.layout.Column = pn.Column(viewer.widgets, viewer.view)
# dashboard

Expand Down
110 changes: 67 additions & 43 deletions atlxi_lake.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,11 @@
" dbscan = cuml.DBSCAN(eps=2500, min_samples=250)\n",
" dbscan.fit(X=X)\n",
"\n",
" return dbscan.labels_"
" cluster_labels = dbscan.labels_ + 1 # noise points -1 becomes 0\n",
" cluster_labels = cluster_labels.mask(cond=cluster_labels == 0) # turn 0 to NaN\n",
" cluster_labels.index = X.index # let labels have same index as input data\n",
"\n",
" return cluster_labels"
]
},
{
Expand All @@ -177,6 +181,9 @@
" \"refgtracks\": [],\n",
" \"geometry\": [],\n",
"}\n",
"# for basin_index in tqdm.tqdm(\n",
"# iterable=drainage_basins[drainage_basins.NAME.str.startswith(\"Whillans\")].index\n",
"# ):\n",
"for basin_index in tqdm.tqdm(iterable=drainage_basins.index):\n",
" # Initial data cleaning, filter to rows that are in the drainage basin\n",
" basin = drainage_basins.loc[basin_index]\n",
Expand All @@ -186,41 +193,47 @@
" print(f\"{len(X)} rows at {basin.NAME}\")\n",
"\n",
" # Run unsupervised clustering separately on draining and filling lakes\n",
" for activity, X_ in (\n",
" (\"draining\", X.loc[X.dhdt_slope < -1]),\n",
" (\"filling\", X.loc[X.dhdt_slope > 1]),\n",
" ):\n",
" labels_ = find_clusters(X=X_[[\"x\", \"y\", \"dhdt_slope\"]])\n",
" n_clusters_ = len(labels_.unique()) - 1 # No. of clusters minus noise (-1)\n",
" print(f\"{n_clusters_} {activity} lakes found\")\n",
" # Draining lake points have negative labels (e.g. -1, -2, 3),\n",
" # Filling lake points have positive labels (e.g. 1, 2, 3),\n",
" # Noise points have NaN labels (i.e. NaN)\n",
" cluster_vars = [\"x\", \"y\", \"dhdt_slope\"]\n",
" draining_lake_labels = -find_clusters(X=X.loc[X.dhdt_slope < 0][cluster_vars])\n",
" filling_lake_labels = find_clusters(X=X.loc[X.dhdt_slope > 0][cluster_vars])\n",
" lake_labels = cudf.concat(objs=[draining_lake_labels, filling_lake_labels])\n",
" lake_labels.name = \"cluster_label\"\n",
"\n",
" clusters: cudf.Series = lake_labels.unique()\n",
" print(\n",
" f\"{(clusters < 0).sum()} draining and {(clusters > 0).sum()} filling lakes found\"\n",
" )\n",
"\n",
" for cluster_label in clusters.to_array():\n",
" # Store attribute and geometry information of each active lake\n",
" for i in range(n_clusters_):\n",
" lake_points: cudf.DataFrame = X_.loc[labels_ == i]\n",
" lake_points: cudf.DataFrame = X.loc[lake_labels == cluster_label]\n",
"\n",
" try:\n",
" assert len(lake_points) > 2\n",
" except AssertionError:\n",
" continue\n",
" try:\n",
" assert len(lake_points) > 2\n",
" except AssertionError:\n",
" continue\n",
"\n",
" multipoint: shapely.geometry.MultiPoint = shapely.geometry.MultiPoint(\n",
" points=lake_points[[\"x\", \"y\"]].as_matrix()\n",
" )\n",
" convexhull: shapely.geometry.Polygon = multipoint.convex_hull\n",
" multipoint: shapely.geometry.MultiPoint = shapely.geometry.MultiPoint(\n",
" points=lake_points[[\"x\", \"y\"]].as_matrix()\n",
" )\n",
" convexhull: shapely.geometry.Polygon = multipoint.convex_hull\n",
"\n",
" maxabsdhdt: float = (\n",
" lake_points.dhdt_slope.max()\n",
" if activity == \"filling\"\n",
" else lake_points.dhdt_slope.min()\n",
" )\n",
" refgtracks: str = \"|\".join(\n",
" map(str, lake_points.referencegroundtrack.unique().to_pandas())\n",
" )\n",
" maxabsdhdt: float = (\n",
" lake_points.dhdt_slope.max()\n",
" if cluster_label > 0 # positive label = filling\n",
" else lake_points.dhdt_slope.min() # negative label = draining\n",
" )\n",
" refgtracks: str = \"|\".join(\n",
" map(str, lake_points.referencegroundtrack.unique().to_pandas())\n",
" )\n",
"\n",
" activelakes[\"basin_name\"].append(basin.NAME)\n",
" activelakes[\"maxabsdhdt\"].append(maxabsdhdt)\n",
" activelakes[\"refgtracks\"].append(refgtracks)\n",
" activelakes[\"geometry\"].append(convexhull)\n",
" activelakes[\"basin_name\"].append(basin.NAME)\n",
" activelakes[\"maxabsdhdt\"].append(maxabsdhdt)\n",
" activelakes[\"refgtracks\"].append(refgtracks)\n",
" activelakes[\"geometry\"].append(convexhull)\n",
"\n",
"if len(activelakes[\"geometry\"]) >= 1:\n",
" gdf = gpd.GeoDataFrame(activelakes, crs=\"EPSG:3031\")\n",
Expand All @@ -243,32 +256,43 @@
"lines_to_next_cell": 2
},
"outputs": [],
"source": [
"# Concatenate XY points with labels, and move data from GPU to CPU\n",
"X: cudf.DataFrame = cudf.concat(objs=[X, lake_labels], axis=\"columns\")\n",
"X_ = X.to_pandas()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Plot clusters on a map in colour, noise points/outliers as small dots\n",
"X_cpu = X_.to_pandas()\n",
"\n",
"fig = pygmt.Figure()\n",
"labels_cpu = labels_.to_pandas().astype(np.int32)\n",
"sizes = (labels_cpu < 0).map(arg={True: 0.02, False: 0.2})\n",
"n_clusters_ = len(X_.cluster_label.unique()) - 1 # No. of clusters minus noise (NaN)\n",
"sizes = (X_.cluster_label.isna()).map(arg={True: 0.01, False: 0.1})\n",
"if n_clusters_:\n",
" pygmt.makecpt(cmap=\"categorical\", series=(-0.5, n_clusters_ - 0.5, 1))\n",
" pygmt.makecpt(cmap=\"polar+h0\", series=(-1.5, 1.5, 1), reverse=True, D=True)\n",
"else:\n",
" pygmt.makecpt(cmap=\"gray\")\n",
"fig.plot(\n",
" x=X_cpu.x,\n",
" y=X_cpu.y,\n",
" x=X_.x,\n",
" y=X_.y,\n",
" sizes=sizes,\n",
" style=\"cc\",\n",
" color=labels_cpu,\n",
" color=X_.cluster_label,\n",
" cmap=True,\n",
" frame=[\n",
" f'WSne+t\"Estimated number of clusters at {basin.NAME}: {n_clusters_}\"',\n",
" 'xaf+l\"Polar Stereographic X (km)\"',\n",
" 'yaf+l\"Polar Stereographic Y (km)\"',\n",
" f'WSne+t\"Estimated number of lake clusters at {basin.NAME}: {n_clusters_}\"',\n",
" 'xaf+l\"Polar Stereographic X (m)\"',\n",
" 'yaf+l\"Polar Stereographic Y (m)\"',\n",
" ],\n",
")\n",
"fig.colorbar(frame='af+l\"Cluster Number\"')\n",
"fig.savefig(fname=f\"figures/subglacial_lakes_at_{basin.NAME}.png\")\n",
"basinx, basiny = basin.geometry.exterior.coords.xy\n",
"fig.plot(x=basinx, y=basiny, pen=\"thinnest,-\")\n",
"fig.colorbar(frame='af+l\"Draining/Filling\"', position='JBC+n\"Unclassified\"')\n",
"fig.savefig(fname=f\"figures/subglacial_lake_clusters_at_{basin.NAME}.png\")\n",
"fig.show()"
]
},
Expand Down
113 changes: 66 additions & 47 deletions atlxi_lake.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,11 @@ def find_clusters(X: cudf.core.dataframe.DataFrame) -> cudf.core.series.Series:
dbscan = cuml.DBSCAN(eps=2500, min_samples=250)
dbscan.fit(X=X)

return dbscan.labels_
cluster_labels = dbscan.labels_ + 1 # noise points -1 becomes 0
cluster_labels = cluster_labels.mask(cond=cluster_labels == 0) # turn 0 to NaN
cluster_labels.index = X.index # let labels have same index as input data

return cluster_labels


# %%
Expand All @@ -127,6 +131,9 @@ def find_clusters(X: cudf.core.dataframe.DataFrame) -> cudf.core.series.Series:
"refgtracks": [],
"geometry": [],
}
# for basin_index in tqdm.tqdm(
# iterable=drainage_basins[drainage_basins.NAME.str.startswith("Whillans")].index
# ):
for basin_index in tqdm.tqdm(iterable=drainage_basins.index):
# Initial data cleaning, filter to rows that are in the drainage basin
basin = drainage_basins.loc[basin_index]
Expand All @@ -136,41 +143,47 @@ def find_clusters(X: cudf.core.dataframe.DataFrame) -> cudf.core.series.Series:
print(f"{len(X)} rows at {basin.NAME}")

# Run unsupervised clustering separately on draining and filling lakes
for activity, X_ in (
("draining", X.loc[X.dhdt_slope < -1]),
("filling", X.loc[X.dhdt_slope > 1]),
):
labels_ = find_clusters(X=X_[["x", "y", "dhdt_slope"]])
n_clusters_ = len(labels_.unique()) - 1 # No. of clusters minus noise (-1)
print(f"{n_clusters_} {activity} lakes found")
# Draining lake points have negative labels (e.g. -1, -2, -3),
# Filling lake points have positive labels (e.g. 1, 2, 3),
# Noise points have NaN labels (i.e. NaN)
cluster_vars = ["x", "y", "dhdt_slope"]
draining_lake_labels = -find_clusters(X=X.loc[X.dhdt_slope < 0][cluster_vars])
filling_lake_labels = find_clusters(X=X.loc[X.dhdt_slope > 0][cluster_vars])
lake_labels = cudf.concat(objs=[draining_lake_labels, filling_lake_labels])
lake_labels.name = "cluster_label"

clusters: cudf.Series = lake_labels.unique()
print(
f"{(clusters < 0).sum()} draining and {(clusters > 0).sum()} filling lakes found"
)

for cluster_label in clusters.to_array():
# Store attribute and geometry information of each active lake
for i in range(n_clusters_):
lake_points: cudf.DataFrame = X_.loc[labels_ == i]

try:
assert len(lake_points) > 2
except AssertionError:
continue

multipoint: shapely.geometry.MultiPoint = shapely.geometry.MultiPoint(
points=lake_points[["x", "y"]].as_matrix()
)
convexhull: shapely.geometry.Polygon = multipoint.convex_hull

maxabsdhdt: float = (
lake_points.dhdt_slope.max()
if activity == "filling"
else lake_points.dhdt_slope.min()
)
refgtracks: str = "|".join(
map(str, lake_points.referencegroundtrack.unique().to_pandas())
)

activelakes["basin_name"].append(basin.NAME)
activelakes["maxabsdhdt"].append(maxabsdhdt)
activelakes["refgtracks"].append(refgtracks)
activelakes["geometry"].append(convexhull)
lake_points: cudf.DataFrame = X.loc[lake_labels == cluster_label]

try:
assert len(lake_points) > 2
except AssertionError:
continue

multipoint: shapely.geometry.MultiPoint = shapely.geometry.MultiPoint(
points=lake_points[["x", "y"]].as_matrix()
)
convexhull: shapely.geometry.Polygon = multipoint.convex_hull

maxabsdhdt: float = (
lake_points.dhdt_slope.max()
if cluster_label > 0 # positive label = filling
else lake_points.dhdt_slope.min() # negative label = draining
)
refgtracks: str = "|".join(
map(str, lake_points.referencegroundtrack.unique().to_pandas())
)

activelakes["basin_name"].append(basin.NAME)
activelakes["maxabsdhdt"].append(maxabsdhdt)
activelakes["refgtracks"].append(refgtracks)
activelakes["geometry"].append(convexhull)

if len(activelakes["geometry"]) >= 1:
gdf = gpd.GeoDataFrame(activelakes, crs="EPSG:3031")
Expand All @@ -182,31 +195,37 @@ def find_clusters(X: cudf.core.dataframe.DataFrame) -> cudf.core.series.Series:
# ## Visualize lakes

# %%
# Plot clusters on a map in colour, noise points/outliers as small dots
X_cpu = X_.to_pandas()
# Concatenate XY points with labels, and move data from GPU to CPU
X: cudf.DataFrame = cudf.concat(objs=[X, lake_labels], axis="columns")
X_ = X.to_pandas()


# %%
# Plot clusters on a map in colour, noise points/outliers as small dots
fig = pygmt.Figure()
labels_cpu = labels_.to_pandas().astype(np.int32)
sizes = (labels_cpu < 0).map(arg={True: 0.02, False: 0.2})
n_clusters_ = len(X_.cluster_label.unique()) - 1 # No. of clusters minus noise (NaN)
sizes = (X_.cluster_label.isna()).map(arg={True: 0.01, False: 0.1})
if n_clusters_:
pygmt.makecpt(cmap="categorical", series=(-0.5, n_clusters_ - 0.5, 1))
pygmt.makecpt(cmap="polar+h0", series=(-1.5, 1.5, 1), reverse=True, D=True)
else:
pygmt.makecpt(cmap="gray")
fig.plot(
x=X_cpu.x,
y=X_cpu.y,
x=X_.x,
y=X_.y,
sizes=sizes,
style="cc",
color=labels_cpu,
color=X_.cluster_label,
cmap=True,
frame=[
f'WSne+t"Estimated number of clusters at {basin.NAME}: {n_clusters_}"',
'xaf+l"Polar Stereographic X (km)"',
'yaf+l"Polar Stereographic Y (km)"',
f'WSne+t"Estimated number of lake clusters at {basin.NAME}: {n_clusters_}"',
'xaf+l"Polar Stereographic X (m)"',
'yaf+l"Polar Stereographic Y (m)"',
],
)
fig.colorbar(frame='af+l"Cluster Number"')
fig.savefig(fname=f"figures/subglacial_lakes_at_{basin.NAME}.png")
basinx, basiny = basin.geometry.exterior.coords.xy
fig.plot(x=basinx, y=basiny, pen="thinnest,-")
fig.colorbar(frame='af+l"Draining/Filling"', position='JBC+n"Unclassified"')
fig.savefig(fname=f"figures/subglacial_lake_clusters_at_{basin.NAME}.png")
fig.show()


Expand Down
3 changes: 2 additions & 1 deletion deepicedrain/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@
lonlat_to_xy,
point_in_polygon_gpu,
)
from deepicedrain.vizplots import IceSat2Explorer

# from deepicedrain.vizplots import IceSat2Explorer

__version__: str = "0.2.1"

Expand Down

0 comments on commit 8982919

Please sign in to comment.