diff --git a/atlxi_dhdt.ipynb b/atlxi_dhdt.ipynb index aea20db..37b502c 100644 --- a/atlxi_dhdt.ipynb +++ b/atlxi_dhdt.ipynb @@ -44,6 +44,7 @@ "import dask\n", "import datashader\n", "import deepicedrain\n", + "import deepicedrain.vizplots\n", "import holoviews as hv\n", "import hvplot.cudf # comment out if no GPU\n", "import hvplot.pandas\n", @@ -942,7 +943,7 @@ "source": [ "# Interactive holoviews scatter plot to find referencegroundtrack needed\n", "# Tip: Hover over the points, and find those with high 'dhdt_slope' values\n", - "viewer = deepicedrain.IceSat2Explorer(name=\"ICESat-2 Explorer\")\n", + "viewer = deepicedrain.vizplots.IceSat2Explorer(name=\"ICESat-2 Explorer\")\n", "dashboard: pn.layout.Column = pn.Column(viewer.widgets, viewer.view)\n", "# dashboard" ] diff --git a/atlxi_dhdt.py b/atlxi_dhdt.py index 0cac79d..67c4896 100644 --- a/atlxi_dhdt.py +++ b/atlxi_dhdt.py @@ -47,6 +47,7 @@ import dask import datashader import deepicedrain +import deepicedrain.vizplots import holoviews as hv import hvplot.cudf # comment out if no GPU import hvplot.pandas @@ -431,7 +432,7 @@ # %% # Interactive holoviews scatter plot to find referencegroundtrack needed # Tip: Hover over the points, and find those with high 'dhdt_slope' values -viewer = deepicedrain.IceSat2Explorer(name="ICESat-2 Explorer") +viewer = deepicedrain.vizplots.IceSat2Explorer(name="ICESat-2 Explorer") dashboard: pn.layout.Column = pn.Column(viewer.widgets, viewer.view) # dashboard diff --git a/atlxi_lake.ipynb b/atlxi_lake.ipynb index 04add6b..77fc88c 100644 --- a/atlxi_lake.ipynb +++ b/atlxi_lake.ipynb @@ -161,7 +161,11 @@ " dbscan = cuml.DBSCAN(eps=2500, min_samples=250)\n", " dbscan.fit(X=X)\n", "\n", - " return dbscan.labels_" + " cluster_labels = dbscan.labels_ + 1 # noise points -1 becomes 0\n", + " cluster_labels = cluster_labels.mask(cond=cluster_labels == 0) # turn 0 to NaN\n", + " cluster_labels.index = X.index # let labels have same index as input data\n", + "\n", + " return 
cluster_labels" ] }, { @@ -177,6 +181,9 @@ " \"refgtracks\": [],\n", " \"geometry\": [],\n", "}\n", + "# for basin_index in tqdm.tqdm(\n", + "# iterable=drainage_basins[drainage_basins.NAME.str.startswith(\"Whillans\")].index\n", + "# ):\n", "for basin_index in tqdm.tqdm(iterable=drainage_basins.index):\n", " # Initial data cleaning, filter to rows that are in the drainage basin\n", " basin = drainage_basins.loc[basin_index]\n", @@ -186,41 +193,47 @@ " print(f\"{len(X)} rows at {basin.NAME}\")\n", "\n", " # Run unsupervised clustering separately on draining and filling lakes\n", - " for activity, X_ in (\n", - " (\"draining\", X.loc[X.dhdt_slope < -1]),\n", - " (\"filling\", X.loc[X.dhdt_slope > 1]),\n", - " ):\n", - " labels_ = find_clusters(X=X_[[\"x\", \"y\", \"dhdt_slope\"]])\n", - " n_clusters_ = len(labels_.unique()) - 1 # No. of clusters minus noise (-1)\n", - " print(f\"{n_clusters_} {activity} lakes found\")\n", + " # Draining lake points have negative labels (e.g. -1, -2, -3),\n", + " # Filling lake points have positive labels (e.g. 1, 2, 3),\n", + " # Noise points have NaN labels (i.e. 
NaN)\n", + " cluster_vars = [\"x\", \"y\", \"dhdt_slope\"]\n", + " draining_lake_labels = -find_clusters(X=X.loc[X.dhdt_slope < 0][cluster_vars])\n", + " filling_lake_labels = find_clusters(X=X.loc[X.dhdt_slope > 0][cluster_vars])\n", + " lake_labels = cudf.concat(objs=[draining_lake_labels, filling_lake_labels])\n", + " lake_labels.name = \"cluster_label\"\n", + "\n", + " clusters: cudf.Series = lake_labels.unique()\n", + " print(\n", + " f\"{(clusters < 0).sum()} draining and {(clusters > 0).sum()} filling lakes found\"\n", + " )\n", "\n", + " for cluster_label in clusters.to_array():\n", " # Store attribute and geometry information of each active lake\n", - " for i in range(n_clusters_):\n", - " lake_points: cudf.DataFrame = X_.loc[labels_ == i]\n", + " lake_points: cudf.DataFrame = X.loc[lake_labels == cluster_label]\n", "\n", - " try:\n", - " assert len(lake_points) > 2\n", - " except AssertionError:\n", - " continue\n", + " try:\n", + " assert len(lake_points) > 2\n", + " except AssertionError:\n", + " continue\n", "\n", - " multipoint: shapely.geometry.MultiPoint = shapely.geometry.MultiPoint(\n", - " points=lake_points[[\"x\", \"y\"]].as_matrix()\n", - " )\n", - " convexhull: shapely.geometry.Polygon = multipoint.convex_hull\n", + " multipoint: shapely.geometry.MultiPoint = shapely.geometry.MultiPoint(\n", + " points=lake_points[[\"x\", \"y\"]].as_matrix()\n", + " )\n", + " convexhull: shapely.geometry.Polygon = multipoint.convex_hull\n", "\n", - " maxabsdhdt: float = (\n", - " lake_points.dhdt_slope.max()\n", - " if activity == \"filling\"\n", - " else lake_points.dhdt_slope.min()\n", - " )\n", - " refgtracks: str = \"|\".join(\n", - " map(str, lake_points.referencegroundtrack.unique().to_pandas())\n", - " )\n", + " maxabsdhdt: float = (\n", + " lake_points.dhdt_slope.max()\n", + " if cluster_label > 0 # positive label = filling\n", + " else lake_points.dhdt_slope.min() # negative label = draining\n", + " )\n", + " refgtracks: str = \"|\".join(\n", + " 
map(str, lake_points.referencegroundtrack.unique().to_pandas())\n", + " )\n", "\n", - " activelakes[\"basin_name\"].append(basin.NAME)\n", - " activelakes[\"maxabsdhdt\"].append(maxabsdhdt)\n", - " activelakes[\"refgtracks\"].append(refgtracks)\n", - " activelakes[\"geometry\"].append(convexhull)\n", + " activelakes[\"basin_name\"].append(basin.NAME)\n", + " activelakes[\"maxabsdhdt\"].append(maxabsdhdt)\n", + " activelakes[\"refgtracks\"].append(refgtracks)\n", + " activelakes[\"geometry\"].append(convexhull)\n", "\n", "if len(activelakes[\"geometry\"]) >= 1:\n", " gdf = gpd.GeoDataFrame(activelakes, crs=\"EPSG:3031\")\n", @@ -245,30 +258,33 @@ "outputs": [], "source": [ "# Plot clusters on a map in colour, noise points/outliers as small dots\n", - "X_cpu = X_.to_pandas()\n", + "X: cudf.DataFrame = cudf.concat(objs=[X, lake_labels], axis=\"columns\")\n", + "X_ = X.to_pandas() # move data from GPU to CPU\n", "\n", "fig = pygmt.Figure()\n", - "labels_cpu = labels_.to_pandas().astype(np.int32)\n", - "sizes = (labels_cpu < 0).map(arg={True: 0.02, False: 0.2})\n", + "n_clusters_ = len(X_.cluster_label.unique()) - 1 # No. 
of clusters minus noise (NaN)\n", + "sizes = (X_.cluster_label.isna()).map(arg={True: 0.01, False: 0.1})\n", "if n_clusters_:\n", - " pygmt.makecpt(cmap=\"categorical\", series=(-0.5, n_clusters_ - 0.5, 1))\n", + " pygmt.makecpt(cmap=\"polar+h0\", series=(-1.5, 1.5, 1), reverse=True, D=True)\n", "else:\n", " pygmt.makecpt(cmap=\"gray\")\n", "fig.plot(\n", - " x=X_cpu.x,\n", - " y=X_cpu.y,\n", + " x=X_.x,\n", + " y=X_.y,\n", " sizes=sizes,\n", " style=\"cc\",\n", - " color=labels_cpu,\n", + " color=X_.cluster_label,\n", " cmap=True,\n", " frame=[\n", - " f'WSne+t\"Estimated number of clusters at {basin.NAME}: {n_clusters_}\"',\n", + " f'WSne+t\"Estimated number of lake clusters at {basin.NAME}: {n_clusters_}\"',\n", " 'xaf+l\"Polar Stereographic X (km)\"',\n", " 'yaf+l\"Polar Stereographic Y (km)\"',\n", " ],\n", ")\n", - "fig.colorbar(frame='af+l\"Cluster Number\"')\n", - "fig.savefig(fname=f\"figures/subglacial_lakes_at_{basin.NAME}.png\")\n", + "basinx, basiny = basin.geometry.exterior.coords.xy\n", + "fig.plot(x=basinx, y=basiny, pen=\"thinnest,-\")\n", + "fig.colorbar(frame='af+l\"Draining/Filling\"', position='JBC+n\"Unclassified\"')\n", + "fig.savefig(fname=f\"figures/subglacial_lake_clusters_at_{basin.NAME}.png\")\n", "fig.show()" ] }, diff --git a/atlxi_lake.py b/atlxi_lake.py index 3e5093f..a73e1d5 100644 --- a/atlxi_lake.py +++ b/atlxi_lake.py @@ -116,7 +116,11 @@ def find_clusters(X: cudf.core.dataframe.DataFrame) -> cudf.core.series.Series: dbscan = cuml.DBSCAN(eps=2500, min_samples=250) dbscan.fit(X=X) - return dbscan.labels_ + cluster_labels = dbscan.labels_ + 1 # noise points -1 becomes 0 + cluster_labels = cluster_labels.mask(cond=cluster_labels == 0) # turn 0 to NaN + cluster_labels.index = X.index # let labels have same index as input data + + return cluster_labels # %% @@ -127,6 +131,9 @@ def find_clusters(X: cudf.core.dataframe.DataFrame) -> cudf.core.series.Series: "refgtracks": [], "geometry": [], } +# for basin_index in tqdm.tqdm( +# 
iterable=drainage_basins[drainage_basins.NAME.str.startswith("Whillans")].index +# ): for basin_index in tqdm.tqdm(iterable=drainage_basins.index): # Initial data cleaning, filter to rows that are in the drainage basin basin = drainage_basins.loc[basin_index] @@ -136,41 +143,47 @@ def find_clusters(X: cudf.core.dataframe.DataFrame) -> cudf.core.series.Series: print(f"{len(X)} rows at {basin.NAME}") # Run unsupervised clustering separately on draining and filling lakes - for activity, X_ in ( - ("draining", X.loc[X.dhdt_slope < -1]), - ("filling", X.loc[X.dhdt_slope > 1]), - ): - labels_ = find_clusters(X=X_[["x", "y", "dhdt_slope"]]) - n_clusters_ = len(labels_.unique()) - 1 # No. of clusters minus noise (-1) - print(f"{n_clusters_} {activity} lakes found") + # Draining lake points have negative labels (e.g. -1, -2, -3), + # Filling lake points have positive labels (e.g. 1, 2, 3), + # Noise points have NaN labels (i.e. NaN) + cluster_vars = ["x", "y", "dhdt_slope"] + draining_lake_labels = -find_clusters(X=X.loc[X.dhdt_slope < 0][cluster_vars]) + filling_lake_labels = find_clusters(X=X.loc[X.dhdt_slope > 0][cluster_vars]) + lake_labels = cudf.concat(objs=[draining_lake_labels, filling_lake_labels]) + lake_labels.name = "cluster_label" + + clusters: cudf.Series = lake_labels.unique() + print( + f"{(clusters < 0).sum()} draining and {(clusters > 0).sum()} filling lakes found" + ) + for cluster_label in clusters.to_array(): # Store attribute and geometry information of each active lake - for i in range(n_clusters_): - lake_points: cudf.DataFrame = X_.loc[labels_ == i] - - try: - assert len(lake_points) > 2 - except AssertionError: - continue - - multipoint: shapely.geometry.MultiPoint = shapely.geometry.MultiPoint( - points=lake_points[["x", "y"]].as_matrix() - ) - convexhull: shapely.geometry.Polygon = multipoint.convex_hull - - maxabsdhdt: float = ( - lake_points.dhdt_slope.max() - if activity == "filling" - else lake_points.dhdt_slope.min() - ) - refgtracks: str = 
"|".join( - map(str, lake_points.referencegroundtrack.unique().to_pandas()) - ) - - activelakes["basin_name"].append(basin.NAME) - activelakes["maxabsdhdt"].append(maxabsdhdt) - activelakes["refgtracks"].append(refgtracks) - activelakes["geometry"].append(convexhull) + lake_points: cudf.DataFrame = X.loc[lake_labels == cluster_label] + + try: + assert len(lake_points) > 2 + except AssertionError: + continue + + multipoint: shapely.geometry.MultiPoint = shapely.geometry.MultiPoint( + points=lake_points[["x", "y"]].as_matrix() + ) + convexhull: shapely.geometry.Polygon = multipoint.convex_hull + + maxabsdhdt: float = ( + lake_points.dhdt_slope.max() + if cluster_label > 0 # positive label = filling + else lake_points.dhdt_slope.min() # negative label = draining + ) + refgtracks: str = "|".join( + map(str, lake_points.referencegroundtrack.unique().to_pandas()) + ) + + activelakes["basin_name"].append(basin.NAME) + activelakes["maxabsdhdt"].append(maxabsdhdt) + activelakes["refgtracks"].append(refgtracks) + activelakes["geometry"].append(convexhull) if len(activelakes["geometry"]) >= 1: gdf = gpd.GeoDataFrame(activelakes, crs="EPSG:3031") @@ -183,30 +196,33 @@ def find_clusters(X: cudf.core.dataframe.DataFrame) -> cudf.core.series.Series: # %% # Plot clusters on a map in colour, noise points/outliers as small dots -X_cpu = X_.to_pandas() +X: cudf.DataFrame = cudf.concat(objs=[X, lake_labels], axis="columns") +X_ = X.to_pandas() # move data from GPU to CPU fig = pygmt.Figure() -labels_cpu = labels_.to_pandas().astype(np.int32) -sizes = (labels_cpu < 0).map(arg={True: 0.02, False: 0.2}) +n_clusters_ = len(X_.cluster_label.unique()) - 1 # No. 
of clusters minus noise (NaN) +sizes = (X_.cluster_label.isna()).map(arg={True: 0.01, False: 0.1}) if n_clusters_: - pygmt.makecpt(cmap="categorical", series=(-0.5, n_clusters_ - 0.5, 1)) + pygmt.makecpt(cmap="polar+h0", series=(-1.5, 1.5, 1), reverse=True, D=True) else: pygmt.makecpt(cmap="gray") fig.plot( - x=X_cpu.x, - y=X_cpu.y, + x=X_.x, + y=X_.y, sizes=sizes, style="cc", - color=labels_cpu, + color=X_.cluster_label, cmap=True, frame=[ - f'WSne+t"Estimated number of clusters at {basin.NAME}: {n_clusters_}"', + f'WSne+t"Estimated number of lake clusters at {basin.NAME}: {n_clusters_}"', 'xaf+l"Polar Stereographic X (km)"', 'yaf+l"Polar Stereographic Y (km)"', ], ) -fig.colorbar(frame='af+l"Cluster Number"') -fig.savefig(fname=f"figures/subglacial_lakes_at_{basin.NAME}.png") +basinx, basiny = basin.geometry.exterior.coords.xy +fig.plot(x=basinx, y=basiny, pen="thinnest,-") +fig.colorbar(frame='af+l"Draining/Filling"', position='JBC+n"Unclassified"') +fig.savefig(fname=f"figures/subglacial_lake_clusters_at_{basin.NAME}.png") fig.show() diff --git a/deepicedrain/__init__.py b/deepicedrain/__init__.py index 97b12a2..d029e66 100644 --- a/deepicedrain/__init__.py +++ b/deepicedrain/__init__.py @@ -11,7 +11,8 @@ lonlat_to_xy, point_in_polygon_gpu, ) -from deepicedrain.vizplots import IceSat2Explorer + +# from deepicedrain.vizplots import IceSat2Explorer __version__: str = "0.2.1"