From 8aec6ec793cebd6ed6a4e66352218206537f6aca Mon Sep 17 00:00:00 2001 From: MaartenGr Date: Mon, 1 Apr 2024 19:36:49 +0200 Subject: [PATCH 1/2] Addresses #1866 --- bertopic/_bertopic.py | 58 +++++++++++++++++++++---------------------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/bertopic/_bertopic.py b/bertopic/_bertopic.py index 822586a0..36a7be21 100644 --- a/bertopic/_bertopic.py +++ b/bertopic/_bertopic.py @@ -3237,41 +3237,41 @@ def merge_models(cls, models, min_similarity: float = .7, embedding_model=None): # Extract new topics new_topics = sorted([index - selected_topics["_outliers"] for index, sim in enumerate(sims) if sim < min_similarity]) - max_topic = max(set(merged_topics["topics"])) + max_topic = max(set(merged_topics["topics"])) + 1 # Merge Topic Representations new_topics_dict = {} - new_topic_val = max_topic + 1 - for index, new_topic in enumerate(new_topics): - new_topic_val = max_topic + index + 1 - new_topics_dict[new_topic] = new_topic_val - merged_topics["topic_representations"][str(new_topic_val)] = selected_topics["topic_representations"][str(new_topic)] - merged_topics["topic_labels"][str(new_topic_val)] = selected_topics["topic_labels"][str(new_topic)] - - # Add new aspects - if selected_topics["topic_aspects"]: - aspects_1 = set(merged_topics["topic_aspects"].keys()) - aspects_2 = set(selected_topics["topic_aspects"].keys()) - aspects_diff = aspects_2.difference(aspects_1) - if aspects_diff: - for aspect in aspects_diff: - merged_topics["topic_aspects"][aspect] = {} - - # If the original model does not have topic aspects but the to be added model does - if not merged_topics.get("topic_aspects"): - merged_topics["topic_aspects"] = selected_topics["topic_aspects"] - - # If they both contain topic aspects, add to the existing set of aspects - else: - for aspect, values in selected_topics["topic_aspects"].items(): - merged_topics["topic_aspects"][aspect][str(new_topic_val)] = values[str(new_topic)] + for new_topic in 
new_topics: + if new_topic != -1: + max_topic += 1 + new_topics_dict[new_topic] = max_topic + merged_topics["topic_representations"][str(max_topic)] = selected_topics["topic_representations"][str(new_topic)] + merged_topics["topic_labels"][str(max_topic)] = selected_topics["topic_labels"][str(new_topic)] + + # Add new aspects + if selected_topics["topic_aspects"]: + aspects_1 = set(merged_topics["topic_aspects"].keys()) + aspects_2 = set(selected_topics["topic_aspects"].keys()) + aspects_diff = aspects_2.difference(aspects_1) + if aspects_diff: + for aspect in aspects_diff: + merged_topics["topic_aspects"][aspect] = {} + + # If the original model does not have topic aspects but the to be added model does + if not merged_topics.get("topic_aspects"): + merged_topics["topic_aspects"] = selected_topics["topic_aspects"] + + # If they both contain topic aspects, add to the existing set of aspects + else: + for aspect, values in selected_topics["topic_aspects"].items(): + merged_topics["topic_aspects"][aspect][str(max_topic)] = values[str(new_topic)] - # Add new embeddings - new_tensors = tensors[new_topic + selected_topics["_outliers"]] - merged_tensors = np.vstack([merged_tensors, new_tensors]) + # Add new embeddings + new_tensors = tensors[new_topic + selected_topics["_outliers"]] + merged_tensors = np.vstack([merged_tensors, new_tensors]) # Topic Mapper - merged_topics["topic_mapper"] = TopicMapper(list(range(-1, new_topic_val+1, 1))).mappings_ + merged_topics["topic_mapper"] = TopicMapper(list(range(-1, max_topic+1, 1))).mappings_ # Find similar topics and re-assign those from the new models sims_idx = np.argmax(sim_matrix, axis=1) From 8dfbe49becb5a5d2efc5e81c458cba0a7b981abc Mon Sep 17 00:00:00 2001 From: MaartenGr Date: Wed, 10 Apr 2024 11:33:32 +0200 Subject: [PATCH 2/2] Fix lazy mistake --- bertopic/_bertopic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bertopic/_bertopic.py b/bertopic/_bertopic.py index 36a7be21..568894d2 100644 --- 
a/bertopic/_bertopic.py +++ b/bertopic/_bertopic.py @@ -3237,7 +3237,7 @@ def merge_models(cls, models, min_similarity: float = .7, embedding_model=None): # Extract new topics new_topics = sorted([index - selected_topics["_outliers"] for index, sim in enumerate(sims) if sim < min_similarity]) - max_topic = max(set(merged_topics["topics"])) + 1 + max_topic = max(set(merged_topics["topics"])) # Merge Topic Representations new_topics_dict = {}