Merge pull request #742 from jh83775/master
Potential fix for issue #741 ("Transform not giving expected results for similar data")
lmcinnes authored Jul 29, 2021
2 parents c10a82d + 984591c commit dd415cc
Showing 17 changed files with 192 additions and 91 deletions.
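In substance (everything else in this commit is black-style reformatting), the fix removes the layout optimizers' habit of inferring whether the reference embedding may move. Each optimize_layout_* function in umap/layouts.py previously computed

    move_other = head_embedding.shape[0] == tail_embedding.shape[0]

so a transform() call whose input happened to have exactly as many rows as the training set silently moved the fixed training embedding too; that is the likely source of the symptom reported in #741. The optimizers now take move_other as an explicit keyword argument (default False), and callers state their intent, e.g. AlignedUMAP.fit passes move_other=True. A minimal sketch of the old failure mode, with hypothetical array sizes:

    import numpy as np

    def inferred_move_other(head_embedding, tail_embedding):
        # The old heuristic: move the reference ("tail") points whenever
        # the two embeddings happen to have the same number of rows.
        return head_embedding.shape[0] == tail_embedding.shape[0]

    train_embedding = np.zeros((100, 2), dtype=np.float32)  # fitted points, must stay put
    new_embedding = np.zeros((100, 2), dtype=np.float32)    # points being transformed

    # transform() should leave the training embedding fixed, but because both
    # arrays have 100 rows the heuristic enables movement anyway.
    assert inferred_move_other(new_embedding, train_embedding)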
6 changes: 4 additions & 2 deletions doc/conf.py
@@ -222,5 +222,7 @@
 
 
 def setup(app):
-    app.add_javascript("https://cdnjs.cloudflare.com/ajax/libs/require.js/2.1.10/require.min.js")
-    app.add_javascript("https://cdn.plot.ly/plotly-latest.min.js")
+    app.add_javascript(
+        "https://cdnjs.cloudflare.com/ajax/libs/require.js/2.1.10/require.min.js"
+    )
+    app.add_javascript("https://cdn.plot.ly/plotly-latest.min.js")
9 changes: 7 additions & 2 deletions examples/inverse_transform_example.py
@@ -6,7 +6,7 @@
 
 import umap
 
-mnist = fetch_openml('Fashion-MNIST', version=1)
+mnist = fetch_openml("Fashion-MNIST", version=1)
 
 
 trans = umap.UMAP(
@@ -19,7 +19,12 @@
 ).fit(mnist.data)
 
 corners = np.array(
-    [[-5.1, 2.9], [-1.9, 6.4], [-5.4, -6.3], [8.3, 4.0],]  # 7  # 4  # 1  # 0
+    [
+        [-5.1, 2.9],
+        [-1.9, 6.4],
+        [-5.4, -6.3],
+        [8.3, 4.0],
+    ]  # 7  # 4  # 1  # 0
 )
 
 test_pts = np.array(
9 changes: 5 additions & 4 deletions umap/__init__.py
@@ -8,11 +8,10 @@
 except ImportError:
     warn(
         "Tensorflow not installed; ParametricUMAP will be unavailable",
-        category=ImportWarning
+        category=ImportWarning,
     )
     # Add a dummy class to raise an error
-    class ParametricUMAP (object):
-
+    class ParametricUMAP(object):
         def __init__(self, **kwds):
             warn(
                 """The umap.parametric_umap package requires Tensorflow > 2.0 to be installed.
@@ -25,7 +24,9 @@ def __init__(self, **kwds):
                 """
             )
             raise ImportError(
-                "umap.parametric_umap requires Tensorflow >= 2.0") from None
+                "umap.parametric_umap requires Tensorflow >= 2.0"
+            ) from None
+
 
 from .aligned_umap import AlignedUMAP
 
49 changes: 35 additions & 14 deletions umap/aligned_umap.py
@@ -209,7 +209,11 @@ def init_from_existing(previous_embedding, graph, relations):
     for key, val in relations.items():
         typed_relations[np.int32(key)] = np.int32(val)
     return init_from_existing_internal(
-        previous_embedding, graph.indptr, graph.indices, graph.data, typed_relations,
+        previous_embedding,
+        graph.indptr,
+        graph.indices,
+        graph.data,
+        typed_relations,
     )
 
 
@@ -354,24 +358,36 @@ def fit(self, X, y=None, **fit_params):
 
         rng_state_transform = np.random.RandomState(self.transform_seed)
         regularisation_weights = build_neighborhood_similarities(
-            indptr_list, indices_list, relations,
+            indptr_list,
+            indices_list,
+            relations,
         )
         first_init = spectral_layout(
-            self.mappers_[0]._raw_data, self.mappers_[0].graph_, self.n_components,
+            self.mappers_[0]._raw_data,
+            self.mappers_[0].graph_,
+            self.n_components,
             rng_state_transform,
         )
         expansion = 10.0 / np.abs(first_init).max()
-        first_embedding = (first_init * expansion).astype(np.float32, order="C",)
+        first_embedding = (first_init * expansion).astype(
+            np.float32,
+            order="C",
+        )
 
         embeddings = numba.typed.List.empty_list(numba.types.float32[:, ::1])
         embeddings.append(first_embedding)
         for i in range(1, self.n_models_):
             next_init = spectral_layout(
-                self.mappers_[i]._raw_data, self.mappers_[i].graph_, self.n_components,
+                self.mappers_[i]._raw_data,
+                self.mappers_[i].graph_,
+                self.n_components,
                 rng_state_transform,
             )
             expansion = 10.0 / np.abs(next_init).max()
-            next_embedding = (next_init * expansion).astype(np.float32, order="C",)
+            next_embedding = (next_init * expansion).astype(
+                np.float32,
+                order="C",
+            )
             anchor_data = relations[i][window_size - 1]
             left_anchors = anchor_data[anchor_data >= 0]
             right_anchors = np.where(anchor_data >= 0)[0]
@@ -383,9 +399,9 @@ def fit(self, X, y=None, **fit_params):
                 )
             )
 
-        seed_triplet = rng_state_transform.randint(
-            INT32_MIN, INT32_MAX, 3
-        ).astype(np.int64)
+        seed_triplet = rng_state_transform.randint(INT32_MIN, INT32_MAX, 3).astype(
+            np.int64
+        )
         self.embeddings_ = optimize_layout_aligned_euclidean(
             embeddings,
             embeddings,
@@ -397,10 +413,13 @@ def fit(self, X, y=None, **fit_params):
             relations,
             seed_triplet,
             lambda_=self.alignment_regularisation,
+            move_other=True,
         )
 
         for i, embedding in enumerate(self.embeddings_):
-            disconnected_vertices = np.array(self.mappers_[i].graph_.sum(axis=1)).flatten() == 0
+            disconnected_vertices = (
+                np.array(self.mappers_[i].graph_.sum(axis=1)).flatten() == 0
+            )
             embedding[disconnected_vertices] = np.full(self.n_components, np.nan)
 
         return self
@@ -475,7 +494,9 @@ def update(self, X, y=None, **fit_params):
 
         new_relations = expand_relations(self.dict_relations_)
         new_regularisation_weights = build_neighborhood_similarities(
-            indptr_list, indices_list, new_relations,
+            indptr_list,
+            indices_list,
+            new_relations,
         )
 
         new_embedding = init_from_existing(
@@ -485,9 +506,9 @@ def update(self, X, y=None, **fit_params):
         self.embeddings_.append(new_embedding)
 
         rng_state_transform = np.random.RandomState(self.transform_seed)
-        seed_triplet = rng_state_transform.randint(
-            INT32_MIN, INT32_MAX, 3
-        ).astype(np.int64)
+        seed_triplet = rng_state_transform.randint(INT32_MIN, INT32_MAX, 3).astype(
+            np.int64
+        )
         self.embeddings_ = optimize_layout_aligned_euclidean(
             self.embeddings_,
             self.embeddings_,
25 changes: 20 additions & 5 deletions umap/layouts.py
@@ -182,7 +182,14 @@ def _optimize_layout_euclidean_single_epoch(
 
 
 def _optimize_layout_euclidean_densmap_epoch_init(
-    head_embedding, tail_embedding, head, tail, a, b, re_sum, phi_sum,
+    head_embedding,
+    tail_embedding,
+    head,
+    tail,
+    a,
+    b,
+    re_sum,
+    phi_sum,
 ):
     re_sum.fill(0)
     phi_sum.fill(0)
@@ -225,6 +232,7 @@ def optimize_layout_euclidean(
     verbose=False,
     densmap=False,
     densmap_kwds={},
+    move_other=False,
 ):
     """Improve an embedding using stochastic gradient descent to minimize the
     fuzzy set cross entropy between the 1-skeletons of the high dimensional
@@ -273,14 +281,15 @@ def optimize_layout_euclidean(
         Whether to use the density-augmented densMAP objective
     densmap_kwds: dict (optional, default {})
         Auxiliary data for densMAP
+    move_other: bool (optional, default False)
+        Whether to adjust tail_embedding alongside head_embedding
     Returns
     -------
     embedding: array of shape (n_samples, n_components)
         The optimized embedding.
     """
 
     dim = head_embedding.shape[1]
-    move_other = head_embedding.shape[0] == tail_embedding.shape[0]
     alpha = initial_alpha
 
     epochs_per_negative_sample = epochs_per_sample / negative_sample_rate
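The call sites that set the new flag are in files further down the diff that did not render on this page, so as a hedged illustration only: the intended contract is that fit-style calls pass move_other=True while transform-style calls pass move_other=False, gating movement of the reference points inside the SGD loop. A toy sketch (hypothetical helper, not code from this PR):

    import numpy as np

    def attractive_update_sketch(head_emb, tail_emb, i, j, grad, alpha, move_other):
        # The "head" point always moves; the "tail" point moves only when the
        # caller explicitly allows it, never as a side effect of the two
        # embeddings happening to have matching shapes.
        head_emb[i] += alpha * grad
        if move_other:
            tail_emb[j] -= alpha * grad

    head = np.zeros((5, 2), dtype=np.float32)  # points being placed by transform()
    tail = np.ones((5, 2), dtype=np.float32)   # fixed training embedding
    grad = np.array([0.1, -0.2], dtype=np.float32)
    attractive_update_sketch(head, tail, 0, 3, grad, 1.0, move_other=False)
    assert np.all(tail == 1.0)  # reference embedding untouched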
@@ -397,6 +406,7 @@ def optimize_layout_generic(
     output_metric=dist.euclidean,
     output_metric_kwds=(),
     verbose=False,
+    move_other=False,
 ):
     """Improve an embedding using stochastic gradient descent to minimize the
     fuzzy set cross entropy between the 1-skeletons of the high dimensional
@@ -455,14 +465,16 @@ def optimize_layout_generic(
     verbose: bool (optional, default False)
         Whether to report information on the current progress of the algorithm.
+    move_other: bool (optional, default False)
+        Whether to adjust tail_embedding alongside head_embedding
     Returns
     -------
     embedding: array of shape (n_samples, n_components)
         The optimized embedding.
     """
 
     dim = head_embedding.shape[1]
-    move_other = head_embedding.shape[0] == tail_embedding.shape[0]
     alpha = initial_alpha
 
     epochs_per_negative_sample = epochs_per_sample / negative_sample_rate
@@ -561,6 +573,7 @@ def optimize_layout_inverse(
     output_metric=dist.euclidean,
     output_metric_kwds=(),
     verbose=False,
+    move_other=False,
 ):
     """Improve an embedding using stochastic gradient descent to minimize the
     fuzzy set cross entropy between the 1-skeletons of the high dimensional
@@ -619,14 +632,16 @@ def optimize_layout_inverse(
     verbose: bool (optional, default False)
         Whether to report information on the current progress of the algorithm.
+    move_other: bool (optional, default False)
+        Whether to adjust tail_embedding alongside head_embedding
     Returns
     -------
     embedding: array of shape (n_samples, n_components)
         The optimized embedding.
     """
 
     dim = head_embedding.shape[1]
-    move_other = head_embedding.shape[0] == tail_embedding.shape[0]
     alpha = initial_alpha
 
     epochs_per_negative_sample = epochs_per_sample / negative_sample_rate
@@ -873,9 +888,9 @@ def optimize_layout_aligned_euclidean(
     negative_sample_rate=5.0,
     parallel=True,
     verbose=False,
+    move_other=False,
 ):
     dim = head_embeddings[0].shape[1]
-    move_other = head_embeddings[0].shape[0] == tail_embeddings[0].shape[0]
     alpha = initial_alpha
 
     epochs_per_negative_sample = numba.typed.List.empty_list(numba.types.float32[::1])
9 changes: 4 additions & 5 deletions umap/parametric_umap.py
@@ -785,8 +785,7 @@ def loss(placeholder_y, embed_to_from):
 
 
 def distance_loss_corr(x, z_x):
-    """ Loss based on the distance between elements in a batch
-    """
+    """Loss based on the distance between elements in a batch"""
 
     # flatten data
     x = tf.keras.layers.Flatten()(x)
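The body of distance_loss_corr is mostly cut off by the diff view. Purely as an illustration of the idea the docstring names, and not this function's actual implementation, a correlation-based distance loss can be written along these lines:

    import tensorflow as tf

    def distance_loss_corr_sketch(x, z_x):
        # Flatten both batches to (batch, features).
        x = tf.keras.layers.Flatten()(x)
        z_x = tf.keras.layers.Flatten()(z_x)

        # Pairwise Euclidean distances within the batch, in data space
        # and in embedding space.
        dx = tf.norm(x[:, None, :] - x[None, :, :], axis=-1)
        dz = tf.norm(z_x[:, None, :] - z_x[None, :, :], axis=-1)

        # Negative Pearson correlation between the two distance matrices,
        # so minimizing the loss maximizes their agreement.
        dx_c = dx - tf.reduce_mean(dx)
        dz_c = dz - tf.reduce_mean(dz)
        corr = tf.reduce_sum(dx_c * dz_c) / (tf.norm(dx_c) * tf.norm(dz_c) + 1e-12)
        return -corr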
@@ -959,7 +958,7 @@ def make_sham_generator():
     """
     The sham generator is a placeholder when all data is already intrinsic to
     the model, but keras wants some input data. Used for non-parametric
-    embedding. 
+    embedding.
     """
 
     def sham_generator():
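As another hedged sketch rather than the real body: a generator of this kind only needs to keep Keras's input pipeline fed with dummy tensors, since the training signal is computed inside the model itself.

    import tensorflow as tf

    def make_sham_generator_sketch():
        def sham_generator():
            # Constant (input, target) pairs, yielded forever; the values
            # are never used by the loss.
            while True:
                yield tf.zeros(1), tf.zeros(1)

        return sham_generator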
@@ -1119,8 +1118,8 @@ def load_ParametricUMAP(save_location, verbose=True):
 
 class GradientClippedModel(tf.keras.Model):
     """
-    We need to define a custom keras model here for gradient clipping,
-    to stabilize training.
+    We need to define a custom keras model here for gradient clipping,
+    to stabilize training.
     """
 
     def train_step(self, data):
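The train_step body is cut off below. A sketch of the standard TF 2.x pattern the docstring describes (the clipping bounds here are an assumption, not taken from this diff):

    import tensorflow as tf

    class GradientClippedModelSketch(tf.keras.Model):
        def train_step(self, data):
            x, y = data
            with tf.GradientTape() as tape:
                y_pred = self(x, training=True)
                loss = self.compiled_loss(y, y_pred, regularization_losses=self.losses)
            grads = tape.gradient(loss, self.trainable_variables)
            # Clip each gradient to a fixed range to avoid exploding updates.
            grads = [tf.clip_by_value(g, -4.0, 4.0) for g in grads]
            self.optimizer.apply_gradients(zip(grads, self.trainable_variables))
            self.compiled_metrics.update_state(y, y_pred)
            return {m.name: m.result() for m in self.metrics}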
(Diffs for the remaining 11 of the 17 changed files did not render on this page.)