Skip to content

Commit

Permalink
♻️ Refactor to enable plotting drain/fill lakes in one figure
Browse files Browse the repository at this point in the history
Combining draining/filling active lake cluster labels, which allows us to reduce the amount of for-loop nesting in the active subglacial lake finder code, and plot both draining/filling lakes in the same figure! Cluster labels are now negative integers for draining lakes, positive integers for filling lakes, and NaN for noise points. Lake cluster plot now uses red (draining) and blue (filling) 'polar' colormap, with unclassified noise points in black as before. Code still takes 11 seconds to run for the entire Antarctic continent which is awesome! Also made a minor change to deepicedrain/__init__.py script to disable loading the IceSat2Explorer dashboard script, since otherwise `import deepicedrain` will load stuff into GPU memory!
  • Loading branch information
weiji14 committed Sep 15, 2020
1 parent 3193f7a commit 9142e87
Show file tree
Hide file tree
Showing 5 changed files with 139 additions and 93 deletions.
3 changes: 2 additions & 1 deletion atlxi_dhdt.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
"import dask\n",
"import datashader\n",
"import deepicedrain\n",
"import deepicedrain.vizplots\n",
"import holoviews as hv\n",
"import hvplot.cudf # comment out if no GPU\n",
"import hvplot.pandas\n",
Expand Down Expand Up @@ -942,7 +943,7 @@
"source": [
"# Interactive holoviews scatter plot to find referencegroundtrack needed\n",
"# Tip: Hover over the points, and find those with high 'dhdt_slope' values\n",
"viewer = deepicedrain.IceSat2Explorer(name=\"ICESat-2 Explorer\")\n",
"viewer = deepicedrain.vizplots.IceSat2Explorer(name=\"ICESat-2 Explorer\")\n",
"dashboard: pn.layout.Column = pn.Column(viewer.widgets, viewer.view)\n",
"# dashboard"
]
Expand Down
3 changes: 2 additions & 1 deletion atlxi_dhdt.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
import dask
import datashader
import deepicedrain
import deepicedrain.vizplots
import holoviews as hv
import hvplot.cudf # comment out if no GPU
import hvplot.pandas
Expand Down Expand Up @@ -431,7 +432,7 @@
# %%
# Interactive holoviews scatter plot to find referencegroundtrack needed
# Tip: Hover over the points, and find those with high 'dhdt_slope' values
viewer = deepicedrain.IceSat2Explorer(name="ICESat-2 Explorer")
viewer = deepicedrain.vizplots.IceSat2Explorer(name="ICESat-2 Explorer")
dashboard: pn.layout.Column = pn.Column(viewer.widgets, viewer.view)
# dashboard

Expand Down
110 changes: 67 additions & 43 deletions atlxi_lake.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,11 @@
" dbscan = cuml.DBSCAN(eps=2500, min_samples=250)\n",
" dbscan.fit(X=X)\n",
"\n",
" return dbscan.labels_"
" cluster_labels = dbscan.labels_ + 1 # noise points -1 becomes 0\n",
" cluster_labels = cluster_labels.mask(cond=cluster_labels == 0) # turn 0 to NaN\n",
" cluster_labels.index = X.index # let labels have same index as input data\n",
"\n",
" return cluster_labels"
]
},
{
Expand All @@ -177,6 +181,9 @@
" \"refgtracks\": [],\n",
" \"geometry\": [],\n",
"}\n",
"# for basin_index in tqdm.tqdm(\n",
"# iterable=drainage_basins[drainage_basins.NAME.str.startswith(\"Whillans\")].index\n",
"# ):\n",
"for basin_index in tqdm.tqdm(iterable=drainage_basins.index):\n",
" # Initial data cleaning, filter to rows that are in the drainage basin\n",
" basin = drainage_basins.loc[basin_index]\n",
Expand All @@ -186,41 +193,47 @@
" print(f\"{len(X)} rows at {basin.NAME}\")\n",
"\n",
" # Run unsupervised clustering separately on draining and filling lakes\n",
" for activity, X_ in (\n",
" (\"draining\", X.loc[X.dhdt_slope < -1]),\n",
" (\"filling\", X.loc[X.dhdt_slope > 1]),\n",
" ):\n",
" labels_ = find_clusters(X=X_[[\"x\", \"y\", \"dhdt_slope\"]])\n",
" n_clusters_ = len(labels_.unique()) - 1 # No. of clusters minus noise (-1)\n",
" print(f\"{n_clusters_} {activity} lakes found\")\n",
"    # Draining lake points have negative labels (e.g. -1, -2, -3),\n",
" # Filling lake points have positive labels (e.g. 1, 2, 3),\n",
" # Noise points have NaN labels (i.e. NaN)\n",
" cluster_vars = [\"x\", \"y\", \"dhdt_slope\"]\n",
" draining_lake_labels = -find_clusters(X=X.loc[X.dhdt_slope < 0][cluster_vars])\n",
" filling_lake_labels = find_clusters(X=X.loc[X.dhdt_slope > 0][cluster_vars])\n",
" lake_labels = cudf.concat(objs=[draining_lake_labels, filling_lake_labels])\n",
" lake_labels.name = \"cluster_label\"\n",
"\n",
" clusters: cudf.Series = lake_labels.unique()\n",
" print(\n",
" f\"{(clusters < 0).sum()} draining and {(clusters > 0).sum()} filling lakes found\"\n",
" )\n",
"\n",
" for cluster_label in clusters.to_array():\n",
" # Store attribute and geometry information of each active lake\n",
" for i in range(n_clusters_):\n",
" lake_points: cudf.DataFrame = X_.loc[labels_ == i]\n",
" lake_points: cudf.DataFrame = X.loc[lake_labels == cluster_label]\n",
"\n",
" try:\n",
" assert len(lake_points) > 2\n",
" except AssertionError:\n",
" continue\n",
" try:\n",
" assert len(lake_points) > 2\n",
" except AssertionError:\n",
" continue\n",
"\n",
" multipoint: shapely.geometry.MultiPoint = shapely.geometry.MultiPoint(\n",
" points=lake_points[[\"x\", \"y\"]].as_matrix()\n",
" )\n",
" convexhull: shapely.geometry.Polygon = multipoint.convex_hull\n",
" multipoint: shapely.geometry.MultiPoint = shapely.geometry.MultiPoint(\n",
" points=lake_points[[\"x\", \"y\"]].as_matrix()\n",
" )\n",
" convexhull: shapely.geometry.Polygon = multipoint.convex_hull\n",
"\n",
" maxabsdhdt: float = (\n",
" lake_points.dhdt_slope.max()\n",
" if activity == \"filling\"\n",
" else lake_points.dhdt_slope.min()\n",
" )\n",
" refgtracks: str = \"|\".join(\n",
" map(str, lake_points.referencegroundtrack.unique().to_pandas())\n",
" )\n",
" maxabsdhdt: float = (\n",
" lake_points.dhdt_slope.max()\n",
" if cluster_label > 0 # positive label = filling\n",
" else lake_points.dhdt_slope.min() # negative label = draining\n",
" )\n",
" refgtracks: str = \"|\".join(\n",
" map(str, lake_points.referencegroundtrack.unique().to_pandas())\n",
" )\n",
"\n",
" activelakes[\"basin_name\"].append(basin.NAME)\n",
" activelakes[\"maxabsdhdt\"].append(maxabsdhdt)\n",
" activelakes[\"refgtracks\"].append(refgtracks)\n",
" activelakes[\"geometry\"].append(convexhull)\n",
" activelakes[\"basin_name\"].append(basin.NAME)\n",
" activelakes[\"maxabsdhdt\"].append(maxabsdhdt)\n",
" activelakes[\"refgtracks\"].append(refgtracks)\n",
" activelakes[\"geometry\"].append(convexhull)\n",
"\n",
"if len(activelakes[\"geometry\"]) >= 1:\n",
" gdf = gpd.GeoDataFrame(activelakes, crs=\"EPSG:3031\")\n",
Expand All @@ -243,32 +256,43 @@
"lines_to_next_cell": 2
},
"outputs": [],
"source": [
"# Concatenate XY points with labels, and move data from GPU to CPU\n",
"X: cudf.DataFrame = cudf.concat(objs=[X, lake_labels], axis=\"columns\")\n",
"X_ = X.to_pandas()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Plot clusters on a map in colour, noise points/outliers as small dots\n",
"X_cpu = X_.to_pandas()\n",
"\n",
"fig = pygmt.Figure()\n",
"labels_cpu = labels_.to_pandas().astype(np.int32)\n",
"sizes = (labels_cpu < 0).map(arg={True: 0.02, False: 0.2})\n",
"n_clusters_ = len(X_.cluster_label.unique()) - 1 # No. of clusters minus noise (NaN)\n",
"sizes = (X_.cluster_label.isna()).map(arg={True: 0.01, False: 0.1})\n",
"if n_clusters_:\n",
" pygmt.makecpt(cmap=\"categorical\", series=(-0.5, n_clusters_ - 0.5, 1))\n",
" pygmt.makecpt(cmap=\"polar+h0\", series=(-1.5, 1.5, 1), reverse=True, D=True)\n",
"else:\n",
" pygmt.makecpt(cmap=\"gray\")\n",
"fig.plot(\n",
" x=X_cpu.x,\n",
" y=X_cpu.y,\n",
" x=X_.x,\n",
" y=X_.y,\n",
" sizes=sizes,\n",
" style=\"cc\",\n",
" color=labels_cpu,\n",
" color=X_.cluster_label,\n",
" cmap=True,\n",
" frame=[\n",
" f'WSne+t\"Estimated number of clusters at {basin.NAME}: {n_clusters_}\"',\n",
" 'xaf+l\"Polar Stereographic X (km)\"',\n",
" 'yaf+l\"Polar Stereographic Y (km)\"',\n",
" f'WSne+t\"Estimated number of lake clusters at {basin.NAME}: {n_clusters_}\"',\n",
" 'xaf+l\"Polar Stereographic X (m)\"',\n",
" 'yaf+l\"Polar Stereographic Y (m)\"',\n",
" ],\n",
")\n",
"fig.colorbar(frame='af+l\"Cluster Number\"')\n",
"fig.savefig(fname=f\"figures/subglacial_lakes_at_{basin.NAME}.png\")\n",
"basinx, basiny = basin.geometry.exterior.coords.xy\n",
"fig.plot(x=basinx, y=basiny, pen=\"thinnest,-\")\n",
"fig.colorbar(frame='af+l\"Draining/Filling\"', position='JBC+n\"Unclassified\"')\n",
"fig.savefig(fname=f\"figures/subglacial_lake_clusters_at_{basin.NAME}.png\")\n",
"fig.show()"
]
},
Expand Down
113 changes: 66 additions & 47 deletions atlxi_lake.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,11 @@ def find_clusters(X: cudf.core.dataframe.DataFrame) -> cudf.core.series.Series:
dbscan = cuml.DBSCAN(eps=2500, min_samples=250)
dbscan.fit(X=X)

return dbscan.labels_
cluster_labels = dbscan.labels_ + 1 # noise points -1 becomes 0
cluster_labels = cluster_labels.mask(cond=cluster_labels == 0) # turn 0 to NaN
cluster_labels.index = X.index # let labels have same index as input data

return cluster_labels


# %%
Expand All @@ -127,6 +131,9 @@ def find_clusters(X: cudf.core.dataframe.DataFrame) -> cudf.core.series.Series:
"refgtracks": [],
"geometry": [],
}
# for basin_index in tqdm.tqdm(
# iterable=drainage_basins[drainage_basins.NAME.str.startswith("Whillans")].index
# ):
for basin_index in tqdm.tqdm(iterable=drainage_basins.index):
# Initial data cleaning, filter to rows that are in the drainage basin
basin = drainage_basins.loc[basin_index]
Expand All @@ -136,41 +143,47 @@ def find_clusters(X: cudf.core.dataframe.DataFrame) -> cudf.core.series.Series:
print(f"{len(X)} rows at {basin.NAME}")

# Run unsupervised clustering separately on draining and filling lakes
for activity, X_ in (
("draining", X.loc[X.dhdt_slope < -1]),
("filling", X.loc[X.dhdt_slope > 1]),
):
labels_ = find_clusters(X=X_[["x", "y", "dhdt_slope"]])
n_clusters_ = len(labels_.unique()) - 1 # No. of clusters minus noise (-1)
print(f"{n_clusters_} {activity} lakes found")
    # Draining lake points have negative labels (e.g. -1, -2, -3),
# Filling lake points have positive labels (e.g. 1, 2, 3),
# Noise points have NaN labels (i.e. NaN)
cluster_vars = ["x", "y", "dhdt_slope"]
draining_lake_labels = -find_clusters(X=X.loc[X.dhdt_slope < 0][cluster_vars])
filling_lake_labels = find_clusters(X=X.loc[X.dhdt_slope > 0][cluster_vars])
lake_labels = cudf.concat(objs=[draining_lake_labels, filling_lake_labels])
lake_labels.name = "cluster_label"

clusters: cudf.Series = lake_labels.unique()
print(
f"{(clusters < 0).sum()} draining and {(clusters > 0).sum()} filling lakes found"
)

for cluster_label in clusters.to_array():
# Store attribute and geometry information of each active lake
for i in range(n_clusters_):
lake_points: cudf.DataFrame = X_.loc[labels_ == i]

try:
assert len(lake_points) > 2
except AssertionError:
continue

multipoint: shapely.geometry.MultiPoint = shapely.geometry.MultiPoint(
points=lake_points[["x", "y"]].as_matrix()
)
convexhull: shapely.geometry.Polygon = multipoint.convex_hull

maxabsdhdt: float = (
lake_points.dhdt_slope.max()
if activity == "filling"
else lake_points.dhdt_slope.min()
)
refgtracks: str = "|".join(
map(str, lake_points.referencegroundtrack.unique().to_pandas())
)

activelakes["basin_name"].append(basin.NAME)
activelakes["maxabsdhdt"].append(maxabsdhdt)
activelakes["refgtracks"].append(refgtracks)
activelakes["geometry"].append(convexhull)
lake_points: cudf.DataFrame = X.loc[lake_labels == cluster_label]

try:
assert len(lake_points) > 2
except AssertionError:
continue

multipoint: shapely.geometry.MultiPoint = shapely.geometry.MultiPoint(
points=lake_points[["x", "y"]].as_matrix()
)
convexhull: shapely.geometry.Polygon = multipoint.convex_hull

maxabsdhdt: float = (
lake_points.dhdt_slope.max()
if cluster_label > 0 # positive label = filling
else lake_points.dhdt_slope.min() # negative label = draining
)
refgtracks: str = "|".join(
map(str, lake_points.referencegroundtrack.unique().to_pandas())
)

activelakes["basin_name"].append(basin.NAME)
activelakes["maxabsdhdt"].append(maxabsdhdt)
activelakes["refgtracks"].append(refgtracks)
activelakes["geometry"].append(convexhull)

if len(activelakes["geometry"]) >= 1:
gdf = gpd.GeoDataFrame(activelakes, crs="EPSG:3031")
Expand All @@ -182,31 +195,37 @@ def find_clusters(X: cudf.core.dataframe.DataFrame) -> cudf.core.series.Series:
# ## Visualize lakes

# %%
# Plot clusters on a map in colour, noise points/outliers as small dots
X_cpu = X_.to_pandas()
# Concatenate XY points with labels, and move data from GPU to CPU
X: cudf.DataFrame = cudf.concat(objs=[X, lake_labels], axis="columns")
X_ = X.to_pandas()


# %%
# Plot clusters on a map in colour, noise points/outliers as small dots
fig = pygmt.Figure()
labels_cpu = labels_.to_pandas().astype(np.int32)
sizes = (labels_cpu < 0).map(arg={True: 0.02, False: 0.2})
n_clusters_ = len(X_.cluster_label.unique()) - 1 # No. of clusters minus noise (NaN)
sizes = (X_.cluster_label.isna()).map(arg={True: 0.01, False: 0.1})
if n_clusters_:
pygmt.makecpt(cmap="categorical", series=(-0.5, n_clusters_ - 0.5, 1))
pygmt.makecpt(cmap="polar+h0", series=(-1.5, 1.5, 1), reverse=True, D=True)
else:
pygmt.makecpt(cmap="gray")
fig.plot(
x=X_cpu.x,
y=X_cpu.y,
x=X_.x,
y=X_.y,
sizes=sizes,
style="cc",
color=labels_cpu,
color=X_.cluster_label,
cmap=True,
frame=[
f'WSne+t"Estimated number of clusters at {basin.NAME}: {n_clusters_}"',
'xaf+l"Polar Stereographic X (km)"',
'yaf+l"Polar Stereographic Y (km)"',
f'WSne+t"Estimated number of lake clusters at {basin.NAME}: {n_clusters_}"',
'xaf+l"Polar Stereographic X (m)"',
'yaf+l"Polar Stereographic Y (m)"',
],
)
fig.colorbar(frame='af+l"Cluster Number"')
fig.savefig(fname=f"figures/subglacial_lakes_at_{basin.NAME}.png")
basinx, basiny = basin.geometry.exterior.coords.xy
fig.plot(x=basinx, y=basiny, pen="thinnest,-")
fig.colorbar(frame='af+l"Draining/Filling"', position='JBC+n"Unclassified"')
fig.savefig(fname=f"figures/subglacial_lake_clusters_at_{basin.NAME}.png")
fig.show()


Expand Down
3 changes: 2 additions & 1 deletion deepicedrain/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@
lonlat_to_xy,
point_in_polygon_gpu,
)
from deepicedrain.vizplots import IceSat2Explorer

# from deepicedrain.vizplots import IceSat2Explorer

__version__: str = "0.2.1"

Expand Down

0 comments on commit 9142e87

Please sign in to comment.