fix: handle small cluster dataset reduction plot by switching to PCA and reduce build to only linux/amd64
mbari-org · Jul 31, 2024 · 16ab4de · 16ab4de
1 parent 09542e1
commit 16ab4de
Show file tree

Hide file tree

Showing 2 changed files with 20 additions and 11 deletions.
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
@@ -85,7 +85,7 @@ jobs:
           docker buildx create --name mybuilder --platform linux/amd64,linux/arm64 --use
           # Running out of space for this build so commenting out for now
           #docker buildx build --push --platform linux/amd64 -t mbari/sdcat:$RELEASE_VERSION-cuda124 --label GIT_VERSION=$RELEASE_VERSION --label IMAGE_URI=mbari/sdcat:$RELEASE_VERSION-cuda124 -f docker/Dockerfile.cuda .
-          docker buildx build --push --platform linux/amd64,linux/arm64 -t mbari/sdcat:$RELEASE_VERSION --label GIT_VERSION=$RELEASE_VERSION --label IMAGE_URI=mbari/sdcat:$RELEASE_VERSION -f docker/Dockerfile .
+          docker buildx build --push --platform linux/amd64 -t mbari/sdcat:$RELEASE_VERSION --label GIT_VERSION=$RELEASE_VERSION --label IMAGE_URI=mbari/sdcat:$RELEASE_VERSION -f docker/Dockerfile .
   push_readme_to_dockerhub:
     runs-on: ubuntu-latest
     name: Push README to Docker Hub

diff --git a/sdcat/cluster/cluster.py b/sdcat/cluster/cluster.py
@@ -214,17 +214,26 @@ def _run_hdbscan_assign(
         init = 'spectral'
 
     # Reduce the embeddings to 2 dimensions to visualize the clusters, using UMAP, or PCA when there are too few samples for UMAP
-    if have_gpu:
-        xx = cuUMAP(init=init,
-                    n_components=2,
-                    n_neighbors=3,
-                    min_dist=0.1,
-                    metric='euclidean').fit_transform(df.values)
+    n_neighbors = min(15, df.values.shape[0] - 1)
+    info(f'Using {n_neighbors} neighbors for dimensional reduction')
+    if n_neighbors < 2:
+        warn('Using PCA instead of UMAP')
+        from sklearn.decomposition import PCA
+        pca = PCA(n_components=2)
+        xx = pca.fit_transform(df.values)
     else:
-        xx = UMAP(init=init,
-                  n_components=2,
-                  metric='cosine',
-                  low_memory=True).fit_transform(df.values)
+        if have_gpu:
+            xx = cuUMAP(init=init,
+                        n_components=2,
+                        n_neighbors=n_neighbors,
+                        min_dist=0.1,
+                        metric='euclidean').fit_transform(df.values)
+        else:
+            xx = UMAP(init=init,
+                      n_components=2,
+                      n_neighbors=n_neighbors,
+                      metric='cosine',
+                      low_memory=True).fit_transform(df.values)
 
     df = pd.DataFrame({'x': xx[clustered, 0], 'y': xx[clustered, 1], 'labels': labels[clustered]})
     p = sns.jointplot(data=df, x='x', y='y', hue='labels')