diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..9bea433
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+.DS_Store
diff --git a/requirements.txt b/requirements.txt
index a8c415a..be1d31d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,3 +10,5 @@ tensorflow_text
 torch
 sentence_transformers
 hnswlib
+# GPU support (cuml, cudf) is installed through conda rather than pip, e.g.:
+# conda create -n rapids-22.02 -c rapidsai -c nvidia -c conda-forge rapids=22.02 python=3.8 cudatoolkit=11.4 dask-sql
diff --git a/setup.py b/setup.py
index 32c4c97..30bcaf6 100644
--- a/setup.py
+++ b/setup.py
@@ -43,6 +43,10 @@
         'torch',
         'sentence_transformers',
     ],
+    'gpu_support': [
+        'cuml',
+        'cudf',
+    ],
     'indexing': [
         'hnswlib',
     ],
diff --git a/top2vec/Top2Vec.py b/top2vec/Top2Vec.py
index edb85b4..3a10193 100644
--- a/top2vec/Top2Vec.py
+++ b/top2vec/Top2Vec.py
@@ -18,6 +18,16 @@ from sklearn.preprocessing import normalize
 from scipy.special import softmax
 
+# Optional GPU (RAPIDS) libraries; only required when use_gpu=True
+try:
+    import cuml
+    import cudf
+    from cuml.cluster import HDBSCAN as cuml_hdbscan
+    _HAVE_CUML = True
+except ImportError:
+    _HAVE_CUML = False
+
+
 try:
     import hnswlib
@@ -324,6 +334,11 @@ class Top2Vec:
         functions only document ids will be returned, not the actual
         documents.
 
+    use_gpu: bool (Optional, default False)
+        If set to True, dimension reduction and clustering run on the GPU
+        through the RAPIDS cuml library. Document vectors are converted to
+        cuDF dataframes, which cuml parallelizes on the GPU.
+
     workers: int (Optional)
         The amount of worker threads to be used in training the model.
         Larger amount will lead to faster training.
@@ -372,7 +387,8 @@ def __init__(self,
                  use_embedding_model_tokenizer=False,
                  umap_args=None,
                  hdbscan_args=None,
-                 verbose=True
+                 verbose=True,
+                 use_gpu=False
                  ):
 
         if verbose:
@@ -603,18 +619,32 @@ def return_doc(doc):
             umap_args = {'n_neighbors': 15,
                          'n_components': 5,
                          'metric': 'cosine'}
 
-        umap_model = umap.UMAP(**umap_args).fit(self.document_vectors)
+        if use_gpu:
+            if not _HAVE_CUML:
+                raise ImportError("use_gpu=True requires the RAPIDS cuml and cudf libraries.")
+            # cuml.UMAP does not support the same metric options as umap-learn,
+            # so drop the metric argument and use cuml's default
+            umap_args.pop('metric', None)
+            docvecs_cudf = cudf.DataFrame(self._get_document_vectors(norm=False))
+            umap_model = cuml.UMAP(**umap_args).fit(docvecs_cudf)
+        else:
+            umap_model = umap.UMAP(**umap_args).fit(self._get_document_vectors(norm=False))
 
         # find dense areas of document vectors
         logger.info('Finding dense areas of documents')
 
         if hdbscan_args is None:
             hdbscan_args = {'min_cluster_size': 15,
                             'metric': 'euclidean',
                             'cluster_selection_method': 'eom'}
 
-        cluster = hdbscan.HDBSCAN(**hdbscan_args).fit(umap_model.embedding_)
+        if use_gpu:
+            cluster = cuml_hdbscan(**hdbscan_args).fit(umap_model.embedding_)
+        else:
+            cluster = hdbscan.HDBSCAN(**hdbscan_args).fit(umap_model.embedding_)
 
         # calculate topic vectors from dense areas of documents
         logger.info('Finding topics')
@@ -790,13 +820,35 @@ def _embed_query(self, query):
         return self._l2_normalize(np.array(self.embed([query])[0]))
 
+    def _set_document_vectors(self, document_vectors):
+        if self.embedding_model == 'doc2vec':
+            self.model.docvecs.vectors_docs = document_vectors
+        else:
+            self.document_vectors = document_vectors
+
+    def _get_document_vectors(self, norm=True):
+        if self.embedding_model == 'doc2vec':
+            if norm:
+                self.model.docvecs.init_sims()
+                return self.model.docvecs.vectors_docs_norm
+            else:
+                return self.model.docvecs.vectors_docs
+        else:
+            return self.document_vectors
+
     def _create_topic_vectors(self, cluster_labels):
+        # cuml returns cluster labels as a cudf Series; convert before indexing
+        if hasattr(cluster_labels, 'to_pandas'):
+            cluster_labels = cluster_labels.to_pandas()
         unique_labels = set(cluster_labels)
         if -1 in unique_labels:
             unique_labels.remove(-1)
         self.topic_vectors = self._l2_normalize(
-            np.vstack([self.document_vectors[np.where(cluster_labels == label)[0]]
-                       .mean(axis=0) for label in unique_labels]))
+            np.vstack([self._get_document_vectors(norm=False)[np.where(cluster_labels == label)[0]]
+                       .mean(axis=0) for label in unique_labels]))
 
     def _deduplicate_topics(self):
         core_samples, labels = dbscan(X=self.topic_vectors,
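
For reference, a minimal usage sketch of the new flag, not part of the diff itself. It assumes a RAPIDS environment such as the conda environment noted in requirements.txt above; the document list is a placeholder for a real corpus.

    from top2vec import Top2Vec

    # Placeholder corpus: Top2Vec needs a reasonably large document
    # collection for clustering to find meaningful topics.
    docs = ["first document text ...", "second document text ..."]

    # use_gpu=True routes UMAP and HDBSCAN through cuml/cudf and raises
    # ImportError if the RAPIDS libraries are not installed.
    model = Top2Vec(documents=docs, embedding_model='doc2vec', use_gpu=True)

    print(model.get_num_topics())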