Merge pull request #742 from jh83775/master
Potential fix for issue #741 ("Transform not giving expected results for similar data")
lmcinnes authored Jul 29, 2021
2 parents c10a82d + 984591c commit dd415cc
Showing 17 changed files with 192 additions and 91 deletions.
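In substance (everything else in this commit is black-style reformatting), the fix removes the layout optimizers' habit of inferring whether the reference embedding may move. Each optimize_layout_* function in umap/layouts.py previously computed

    move_other = head_embedding.shape[0] == tail_embedding.shape[0]

so a transform() call whose input happened to have exactly as many rows as the training set silently moved the fixed training embedding too; that is the likely source of the symptom reported in #741. The optimizers now take move_other as an explicit keyword argument (default False), and callers state their intent, e.g. AlignedUMAP.fit passes move_other=True. A minimal sketch of the old failure mode, with hypothetical array sizes:

    import numpy as np

    def inferred_move_other(head_embedding, tail_embedding):
        # The old heuristic: move the reference ("tail") points whenever
        # the two embeddings happen to have the same number of rows.
        return head_embedding.shape[0] == tail_embedding.shape[0]

    train_embedding = np.zeros((100, 2), dtype=np.float32)  # fitted points, must stay put
    new_embedding = np.zeros((100, 2), dtype=np.float32)    # points being transformed

    # transform() should leave the training embedding fixed, but because both
    # arrays have 100 rows the heuristic enables movement anyway.
    assert inferred_move_other(new_embedding, train_embedding)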
6 changes: 4 additions & 2 deletions doc/conf.py
@@ -222,5 +222,7 @@
 
 
 def setup(app):
-    app.add_javascript("https://cdnjs.cloudflare.com/ajax/libs/require.js/2.1.10/require.min.js")
-    app.add_javascript("https://cdn.plot.ly/plotly-latest.min.js")
+    app.add_javascript(
+        "https://cdnjs.cloudflare.com/ajax/libs/require.js/2.1.10/require.min.js"
+    )
+    app.add_javascript("https://cdn.plot.ly/plotly-latest.min.js")
9 changes: 7 additions & 2 deletions examples/inverse_transform_example.py
@@ -6,7 +6,7 @@
 
 import umap
 
-mnist = fetch_openml('Fashion-MNIST', version=1)
+mnist = fetch_openml("Fashion-MNIST", version=1)
 
 
 trans = umap.UMAP(
@@ -19,7 +19,12 @@
 ).fit(mnist.data)
 
 corners = np.array(
-    [[-5.1, 2.9], [-1.9, 6.4], [-5.4, -6.3], [8.3, 4.0],]  # 7  # 4  # 1  # 0
+    [
+        [-5.1, 2.9],
+        [-1.9, 6.4],
+        [-5.4, -6.3],
+        [8.3, 4.0],
+    ]  # 7  # 4  # 1  # 0
 )
 
 test_pts = np.array(
9 changes: 5 additions & 4 deletions umap/__init__.py
@@ -8,11 +8,10 @@
 except ImportError:
     warn(
         "Tensorflow not installed; ParametricUMAP will be unavailable",
-        category=ImportWarning
+        category=ImportWarning,
     )
     # Add a dummy class to raise an error
-    class ParametricUMAP (object):
-
+    class ParametricUMAP(object):
         def __init__(self, **kwds):
             warn(
                 """The umap.parametric_umap package requires Tensorflow > 2.0 to be installed.
@@ -25,7 +24,9 @@ def __init__(self, **kwds):
                 """
             )
             raise ImportError(
-                "umap.parametric_umap requires Tensorflow >= 2.0") from None
+                "umap.parametric_umap requires Tensorflow >= 2.0"
+            ) from None
+
 
 from .aligned_umap import AlignedUMAP
 
49 changes: 35 additions & 14 deletions umap/aligned_umap.py
@@ -209,7 +209,11 @@ def init_from_existing(previous_embedding, graph, relations):
     for key, val in relations.items():
         typed_relations[np.int32(key)] = np.int32(val)
     return init_from_existing_internal(
-        previous_embedding, graph.indptr, graph.indices, graph.data, typed_relations,
+        previous_embedding,
+        graph.indptr,
+        graph.indices,
+        graph.data,
+        typed_relations,
     )
 
 
@@ -354,24 +358,36 @@ def fit(self, X, y=None, **fit_params):
 
         rng_state_transform = np.random.RandomState(self.transform_seed)
         regularisation_weights = build_neighborhood_similarities(
-            indptr_list, indices_list, relations,
+            indptr_list,
+            indices_list,
+            relations,
         )
         first_init = spectral_layout(
-            self.mappers_[0]._raw_data, self.mappers_[0].graph_, self.n_components,
+            self.mappers_[0]._raw_data,
+            self.mappers_[0].graph_,
+            self.n_components,
             rng_state_transform,
         )
         expansion = 10.0 / np.abs(first_init).max()
-        first_embedding = (first_init * expansion).astype(np.float32, order="C",)
+        first_embedding = (first_init * expansion).astype(
+            np.float32,
+            order="C",
+        )
 
         embeddings = numba.typed.List.empty_list(numba.types.float32[:, ::1])
         embeddings.append(first_embedding)
         for i in range(1, self.n_models_):
             next_init = spectral_layout(
-                self.mappers_[i]._raw_data, self.mappers_[i].graph_, self.n_components,
+                self.mappers_[i]._raw_data,
+                self.mappers_[i].graph_,
+                self.n_components,
                 rng_state_transform,
             )
             expansion = 10.0 / np.abs(next_init).max()
-            next_embedding = (next_init * expansion).astype(np.float32, order="C",)
+            next_embedding = (next_init * expansion).astype(
+                np.float32,
+                order="C",
+            )
             anchor_data = relations[i][window_size - 1]
             left_anchors = anchor_data[anchor_data >= 0]
             right_anchors = np.where(anchor_data >= 0)[0]
@@ -383,9 +399,9 @@ def fit(self, X, y=None, **fit_params):
                 )
             )
 
-        seed_triplet = rng_state_transform.randint(
-            INT32_MIN, INT32_MAX, 3
-        ).astype(np.int64)
+        seed_triplet = rng_state_transform.randint(INT32_MIN, INT32_MAX, 3).astype(
+            np.int64
+        )
         self.embeddings_ = optimize_layout_aligned_euclidean(
             embeddings,
             embeddings,
@@ -397,10 +413,13 @@ def fit(self, X, y=None, **fit_params):
             relations,
             seed_triplet,
             lambda_=self.alignment_regularisation,
+            move_other=True,
         )
 
         for i, embedding in enumerate(self.embeddings_):
-            disconnected_vertices = np.array(self.mappers_[i].graph_.sum(axis=1)).flatten() == 0
+            disconnected_vertices = (
+                np.array(self.mappers_[i].graph_.sum(axis=1)).flatten() == 0
+            )
             embedding[disconnected_vertices] = np.full(self.n_components, np.nan)
 
         return self
@@ -475,7 +494,9 @@ def update(self, X, y=None, **fit_params):
 
         new_relations = expand_relations(self.dict_relations_)
         new_regularisation_weights = build_neighborhood_similarities(
-            indptr_list, indices_list, new_relations,
+            indptr_list,
+            indices_list,
+            new_relations,
         )
 
         new_embedding = init_from_existing(
@@ -485,9 +506,9 @@ def update(self, X, y=None, **fit_params):
         self.embeddings_.append(new_embedding)
 
         rng_state_transform = np.random.RandomState(self.transform_seed)
-        seed_triplet = rng_state_transform.randint(
-            INT32_MIN, INT32_MAX, 3
-        ).astype(np.int64)
+        seed_triplet = rng_state_transform.randint(INT32_MIN, INT32_MAX, 3).astype(
+            np.int64
+        )
         self.embeddings_ = optimize_layout_aligned_euclidean(
             self.embeddings_,
             self.embeddings_,
25 changes: 20 additions & 5 deletions umap/layouts.py
@@ -182,7 +182,14 @@ def _optimize_layout_euclidean_single_epoch(
 
 
 def _optimize_layout_euclidean_densmap_epoch_init(
-    head_embedding, tail_embedding, head, tail, a, b, re_sum, phi_sum,
+    head_embedding,
+    tail_embedding,
+    head,
+    tail,
+    a,
+    b,
+    re_sum,
+    phi_sum,
 ):
     re_sum.fill(0)
     phi_sum.fill(0)
@@ -225,6 +232,7 @@ def optimize_layout_euclidean(
     verbose=False,
     densmap=False,
     densmap_kwds={},
+    move_other=False,
 ):
     """Improve an embedding using stochastic gradient descent to minimize the
     fuzzy set cross entropy between the 1-skeletons of the high dimensional
@@ -273,14 +281,15 @@ def optimize_layout_euclidean(
         Whether to use the density-augmented densMAP objective
     densmap_kwds: dict (optional, default {})
         Auxiliary data for densMAP
+    move_other: bool (optional, default False)
+        Whether to adjust tail_embedding alongside head_embedding
     Returns
     -------
     embedding: array of shape (n_samples, n_components)
         The optimized embedding.
     """
 
     dim = head_embedding.shape[1]
-    move_other = head_embedding.shape[0] == tail_embedding.shape[0]
     alpha = initial_alpha
 
     epochs_per_negative_sample = epochs_per_sample / negative_sample_rate
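The call sites that set the new flag are in files further down the diff that did not render on this page, so as a hedged illustration only: the intended contract is that fit-style calls pass move_other=True while transform-style calls pass move_other=False, gating movement of the reference points inside the SGD loop. A toy sketch (hypothetical helper, not code from this PR):

    import numpy as np

    def attractive_update_sketch(head_emb, tail_emb, i, j, grad, alpha, move_other):
        # The "head" point always moves; the "tail" point moves only when the
        # caller explicitly allows it, never as a side effect of the two
        # embeddings happening to have matching shapes.
        head_emb[i] += alpha * grad
        if move_other:
            tail_emb[j] -= alpha * grad

    head = np.zeros((5, 2), dtype=np.float32)  # points being placed by transform()
    tail = np.ones((5, 2), dtype=np.float32)   # fixed training embedding
    grad = np.array([0.1, -0.2], dtype=np.float32)
    attractive_update_sketch(head, tail, 0, 3, grad, 1.0, move_other=False)
    assert np.all(tail == 1.0)  # reference embedding untouched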
@@ -397,6 +406,7 @@ def optimize_layout_generic(
     output_metric=dist.euclidean,
     output_metric_kwds=(),
     verbose=False,
+    move_other=False,
 ):
     """Improve an embedding using stochastic gradient descent to minimize the
     fuzzy set cross entropy between the 1-skeletons of the high dimensional
@@ -455,14 +465,16 @@ def optimize_layout_generic(
     verbose: bool (optional, default False)
         Whether to report information on the current progress of the algorithm.
+    move_other: bool (optional, default False)
+        Whether to adjust tail_embedding alongside head_embedding
     Returns
     -------
     embedding: array of shape (n_samples, n_components)
         The optimized embedding.
     """
 
     dim = head_embedding.shape[1]
-    move_other = head_embedding.shape[0] == tail_embedding.shape[0]
     alpha = initial_alpha
 
     epochs_per_negative_sample = epochs_per_sample / negative_sample_rate
@@ -561,6 +573,7 @@ def optimize_layout_inverse(
     output_metric=dist.euclidean,
     output_metric_kwds=(),
     verbose=False,
+    move_other=False,
 ):
     """Improve an embedding using stochastic gradient descent to minimize the
     fuzzy set cross entropy between the 1-skeletons of the high dimensional
@@ -619,14 +632,16 @@ def optimize_layout_inverse(
     verbose: bool (optional, default False)
         Whether to report information on the current progress of the algorithm.
+    move_other: bool (optional, default False)
+        Whether to adjust tail_embedding alongside head_embedding
     Returns
     -------
     embedding: array of shape (n_samples, n_components)
         The optimized embedding.
     """
 
     dim = head_embedding.shape[1]
-    move_other = head_embedding.shape[0] == tail_embedding.shape[0]
     alpha = initial_alpha
 
     epochs_per_negative_sample = epochs_per_sample / negative_sample_rate
@@ -873,9 +888,9 @@ def optimize_layout_aligned_euclidean(
     negative_sample_rate=5.0,
     parallel=True,
     verbose=False,
+    move_other=False,
 ):
     dim = head_embeddings[0].shape[1]
-    move_other = head_embeddings[0].shape[0] == tail_embeddings[0].shape[0]
     alpha = initial_alpha
 
     epochs_per_negative_sample = numba.typed.List.empty_list(numba.types.float32[::1])
9 changes: 4 additions & 5 deletions umap/parametric_umap.py
@@ -785,8 +785,7 @@ def loss(placeholder_y, embed_to_from):
 
 
 def distance_loss_corr(x, z_x):
-    """ Loss based on the distance between elements in a batch
-    """
+    """Loss based on the distance between elements in a batch"""
 
     # flatten data
     x = tf.keras.layers.Flatten()(x)
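The body of distance_loss_corr is mostly cut off by the diff view. Purely as an illustration of the idea the docstring names, and not this function's actual implementation, a correlation-based distance loss can be written along these lines:

    import tensorflow as tf

    def distance_loss_corr_sketch(x, z_x):
        # Flatten both batches to (batch, features).
        x = tf.keras.layers.Flatten()(x)
        z_x = tf.keras.layers.Flatten()(z_x)

        # Pairwise Euclidean distances within the batch, in data space
        # and in embedding space.
        dx = tf.norm(x[:, None, :] - x[None, :, :], axis=-1)
        dz = tf.norm(z_x[:, None, :] - z_x[None, :, :], axis=-1)

        # Negative Pearson correlation between the two distance matrices,
        # so minimizing the loss maximizes their agreement.
        dx_c = dx - tf.reduce_mean(dx)
        dz_c = dz - tf.reduce_mean(dz)
        corr = tf.reduce_sum(dx_c * dz_c) / (tf.norm(dx_c) * tf.norm(dz_c) + 1e-12)
        return -corr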
@@ -959,7 +958,7 @@ def make_sham_generator():
     """
     The sham generator is a placeholder when all data is already intrinsic to
     the model, but keras wants some input data. Used for non-parametric
-    embedding. 
+    embedding.
     """
 
     def sham_generator():
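As another hedged sketch rather than the real body: a generator of this kind only needs to keep Keras's input pipeline fed with dummy tensors, since the training signal is computed inside the model itself.

    import tensorflow as tf

    def make_sham_generator_sketch():
        def sham_generator():
            # Constant (input, target) pairs, yielded forever; the values
            # are never used by the loss.
            while True:
                yield tf.zeros(1), tf.zeros(1)

        return sham_generator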
@@ -1119,8 +1118,8 @@ def load_ParametricUMAP(save_location, verbose=True):
 
 class GradientClippedModel(tf.keras.Model):
     """
-    We need to define a custom keras model here for gradient clipping,
-    to stabilize training.
+    We need to define a custom keras model here for gradient clipping,
+    to stabilize training.
     """
 
     def train_step(self, data):
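The train_step body is cut off below. A sketch of the standard TF 2.x pattern the docstring describes (the clipping bounds here are an assumption, not taken from this diff):

    import tensorflow as tf

    class GradientClippedModelSketch(tf.keras.Model):
        def train_step(self, data):
            x, y = data
            with tf.GradientTape() as tape:
                y_pred = self(x, training=True)
                loss = self.compiled_loss(y, y_pred, regularization_losses=self.losses)
            grads = tape.gradient(loss, self.trainable_variables)
            # Clip each gradient to a fixed range to avoid exploding updates.
            grads = [tf.clip_by_value(g, -4.0, 4.0) for g in grads]
            self.optimizer.apply_gradients(zip(grads, self.trainable_variables))
            self.compiled_metrics.update_state(y, y_pred)
            return {m.name: m.result() for m in self.metrics}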
(Diffs for the remaining 11 of the 17 changed files did not render on this page.)