Let’s evaluate the tinytopics topic model training speed on CPU vs. GPU on mainstream consumer hardware using simulated data. We will compare the time consumed under combinations of the three key parameters defining the problem size:
- Number of documents (`n`).
- Vocabulary size (`m`).
- Number of topics (`k`).

Experiment environment:
Our expectations:

- Training time will grow as the number of documents (`n`) grows, on both CPU and GPU.
- Training time will grow as the number of topics (`k`) grows.
- With `n` and `k` fixed, as the vocabulary size (`m`) grows, CPU time will grow linearly while GPU time stays constant. For `m` larger than a certain threshold, training on GPU will be faster than on CPU.

Import the dependencies:

```python
import time
import torch
import pandas as pd
import matplotlib.pyplot as plt
from tinytopics.fit import fit_model
from tinytopics.utils import generate_synthetic_data, set_random_seed
from tinytopics.colors import scale_color_tinytopics
```
Set seed for reproducibility:
```python
set_random_seed(42)
```

Define parameter grids:
```python
n_values = [1000, 5000]  # Number of documents
m_values = [1000, 5000, 10000, 20000]  # Vocabulary size
k_values = [10, 50, 100]  # Number of topics
avg_doc_length = 256 * 256
```
Create a data frame to store the benchmark results, and define a helper that fits the model once and returns the elapsed wall time:
```python
benchmark_results = pd.DataFrame()


def benchmark(X, k, device):
    # Fit the model on the given device and return the wall time in seconds.
    start_time = time.time()
    model, losses = fit_model(X, k, device=device)
    elapsed_time = time.time() - start_time

    return elapsed_time
```
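Since CUDA launches kernels asynchronously, a wall-clock timer can in principle stop before queued GPU work has finished. If `fit_model` returns only after losses are computed on the host this is a non-issue, but a more defensive variant (a sketch; `benchmark_synced` is a hypothetical helper, not part of tinytopics) would synchronize explicitly:

```python
def benchmark_synced(X, k, device):
    # Drain any queued GPU work before starting and stopping the clock,
    # so the measurement covers exactly this training run.
    if device.type == "cuda":
        torch.cuda.synchronize(device)
    start_time = time.time()
    model, losses = fit_model(X, k, device=device)
    if device.type == "cuda":
        torch.cuda.synchronize(device)
    return time.time() - start_time
```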
Run the benchmark over the full parameter grid:

```python
for n in n_values:
    for m in m_values:
        for k in k_values:
            print(f"Benchmarking for n={n}, m={m}, k={k}...")

            X, true_L, true_F = generate_synthetic_data(n, m, k, avg_doc_length=avg_doc_length)

            # Benchmark on CPU
            cpu_time = benchmark(X, k, torch.device("cpu"))
            cpu_result = pd.DataFrame([{"n": n, "m": m, "k": k, "device": "CPU", "time": cpu_time}])

            # Skip rows with missing values to avoid pandas concat warnings
            if not cpu_result.isna().all().any():
                benchmark_results = pd.concat([benchmark_results, cpu_result], ignore_index=True)

            # Benchmark on GPU if available
            if torch.cuda.is_available():
                gpu_time = benchmark(X, k, torch.device("cuda"))
                gpu_result = pd.DataFrame([{"n": n, "m": m, "k": k, "device": "GPU", "time": gpu_time}])

                if not gpu_result.isna().all().any():
                    benchmark_results = pd.concat([benchmark_results, gpu_result], ignore_index=True)
```
Save results to a CSV file:
```python
# The output file name below is an assumption; use any path you like.
benchmark_results.to_csv("benchmark-results.csv", index=False)
```

Plot the number of terms (`m`) against the time consumed, conditioning on the number of documents (`n`), for each number of topics (`k`):
```python
unique_series = len(n_values) * (2 if torch.cuda.is_available() else 1)
colormap = scale_color_tinytopics(unique_series)
colors_list = [colormap(i) for i in range(unique_series)]

for k in k_values:
    plt.figure(figsize=(7, 4.3), dpi=300)

    color_idx = 0
    for n in n_values:
        subset = benchmark_results[
            (benchmark_results["n"] == n) & (benchmark_results["k"] == k)
        ]

        # Plot CPU results with a specific color
        plt.plot(
            subset[subset["device"] == "CPU"]["m"],
            subset[subset["device"] == "CPU"]["time"],
            label=f"CPU (n={n})",
            linestyle="--",
            marker="o",
            color=colors_list[color_idx],
        )
        color_idx += 1

        # Plot GPU results if available
        if torch.cuda.is_available():
            plt.plot(
                subset[subset["device"] == "GPU"]["m"],
                subset[subset["device"] == "GPU"]["time"],
                label=f"GPU (n={n})",
                linestyle="-",
                marker="x",
                color=colors_list[color_idx],
            )
            color_idx += 1

    plt.xlabel("Vocabulary size (m)")
    plt.ylabel("Training time (seconds)")
    plt.title(f"Training time vs. vocabulary size (k={k})")
    plt.legend()
    plt.grid(True)
    plt.savefig(f"training-time-k-{k}.png", dpi=300)
    plt.close()
```
Let’s walk through a canonical tinytopics workflow using a synthetic dataset.
```python
from tinytopics.fit import fit_model
from tinytopics.plot import plot_loss, plot_structure, plot_top_terms
from tinytopics.utils import (
    set_random_seed,
    generate_synthetic_data,
    align_topics,
    sort_documents,
)
```
Set random seed for reproducibility:
```python
set_random_seed(42)
```

Generate a synthetic dataset:
```python
n, m, k = 5000, 1000, 10
X, true_L, true_F = generate_synthetic_data(n, m, k, avg_doc_length=256 * 256)
```
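For intuition, a generator like this typically draws document-topic and topic-term distributions from Dirichlet priors and then samples term counts from multinomials. A plausible sketch (illustrative assumptions throughout; not necessarily how `generate_synthetic_data` is implemented):

```python
import numpy as np

def generate_synthetic_data_sketch(n, m, k, avg_doc_length, seed=42):
    rng = np.random.default_rng(seed)
    true_L = rng.dirichlet(np.ones(k), size=n)  # n x k document-topic weights
    true_F = rng.dirichlet(np.ones(m), size=k)  # k x m topic-term probabilities
    doc_lengths = rng.poisson(avg_doc_length, size=n)
    # Each document's term counts are a multinomial draw from its row of L @ F.
    X = np.stack(
        [rng.multinomial(length, probs) for length, probs in zip(doc_lengths, true_L @ true_F)]
    )
    return X, true_L, true_F
```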
Fit the topic model and plot the loss curve. There will be a progress bar.
```python
model, losses = fit_model(X, k)
plot_loss(losses, output_file="loss.png")
```

Tip
By default, tinytopics uses the AdamW optimizer with weight decay and a cosine annealing with warm restarts learning rate scheduler. This combination should reduce the need for extensive manual tuning of hyperparameters such as the learning rate. For optimal performance, exploring the possible tuning parameter space is still recommended.
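For reference, this optimizer-scheduler pairing looks roughly like the following in plain PyTorch (a generic sketch, not tinytopics' internal training loop; the model, loss, and hyperparameter values are illustrative):

```python
import torch

model = torch.nn.Linear(100, 10)  # stand-in for the actual model parameters
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-2, weight_decay=1e-5)
scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=20)

for epoch in range(200):
    optimizer.zero_grad()
    loss = model(torch.randn(32, 100)).pow(2).mean()  # placeholder loss
    loss.backward()
    optimizer.step()
    scheduler.step()  # cosine-anneals the learning rate, restarting every T_0 epochs
```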
Get the learned L and F matrices from the fitted topic model:

```python
learned_L = model.get_normalized_L().numpy()
learned_F = model.get_normalized_F().numpy()
```

To make it easier to inspect the results visually, we should try to “align” the learned topics with the ground truth topics by their term similarity.
```python
aligned_indices = align_topics(true_F, learned_F)
learned_F_aligned = learned_F[aligned_indices]
learned_L_aligned = learned_L[:, aligned_indices]
```
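As a sketch of what such an alignment can do (an illustrative implementation using cosine similarity and SciPy's Hungarian solver; `align_topics`' actual logic may differ):

```python
import numpy as np
from scipy.optimize import linear_sum_assignment

def align_topics_sketch(F_ref, F_est):
    # Cosine similarity between every reference topic and estimated topic.
    ref = F_ref / np.linalg.norm(F_ref, axis=1, keepdims=True)
    est = F_est / np.linalg.norm(F_est, axis=1, keepdims=True)
    similarity = ref @ est.T
    # One-to-one matching that maximizes total similarity.
    _, col_ind = linear_sum_assignment(-similarity)
    return col_ind  # reorders estimated topics to follow the reference order
```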
Sort the documents in both the true document-topic matrix and the learned document-topic matrix, grouped by dominant topics.
```python
sorted_indices = sort_documents(true_L)
# Apply the same document order to both matrices so the plots are comparable.
true_L_sorted = true_L[sorted_indices]
learned_L_sorted = learned_L_aligned[sorted_indices]
```
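For intuition, such a sort can be implemented by grouping documents by their highest-weight topic and ordering within each group by that weight (a hypothetical sketch, not necessarily the library's logic):

```python
import numpy as np

def sort_documents_sketch(L):
    dominant = L.argmax(axis=1)                  # dominant topic per document
    weight = L[np.arange(L.shape[0]), dominant]  # weight on that topic
    # Primary key: dominant topic; secondary key: descending weight.
    return np.lexsort((-weight, dominant))
```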
Note

The alignment step mainly applies to simulation studies, because we usually don't know the ground truth L and F for real datasets.
We can use a “Structure plot” to visualize and compare the document-topic distributions.
```python
plot_structure(
    true_L_sorted,
    normalize_rows=True,
    title="True document-topic distributions (sorted)",
    output_file="L-true.png",
)

plot_structure(
    learned_L_sorted,
    normalize_rows=True,
    title="Learned document-topic distributions (sorted and aligned)",
    output_file="L-learned.png",
)
```
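Conceptually, a Structure plot renders each document as a thin vertical bar whose stacked segments are the document's topic proportions. A minimal matplotlib sketch of the idea (not tinytopics' `plot_structure` implementation):

```python
import numpy as np
import matplotlib.pyplot as plt

def structure_plot_sketch(L, output_file=None):
    n, k = L.shape
    bottoms = np.zeros(n)
    plt.figure(figsize=(10, 3))
    for topic in range(k):
        # Stack each topic's proportion on top of the previous ones.
        plt.bar(np.arange(n), L[:, topic], bottom=bottoms, width=1.0)
        bottoms += L[:, topic]
    plt.xlabel("Documents")
    plt.ylabel("Topic proportion")
    if output_file:
        plt.savefig(output_file, dpi=300)
    plt.close()
```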
We can also plot the top terms for each topic using bar charts.
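For example (a sketch mirroring the `title`/`output_file` style of `plot_structure`; `n_top_terms` and the file names are assumptions, so check the `plot_top_terms` signature in your installed version):

```python
plot_top_terms(
    true_F,
    n_top_terms=15,  # assumed parameter name
    title="Top terms of true F (sorted)",
    output_file="F-top-terms-true.png",
)

plot_top_terms(
    learned_F_aligned,
    n_top_terms=15,
    title="Top terms of learned F (sorted and aligned)",
    output_file="F-top-terms-learned.png",
)
```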
Tip
Prerequisite: run example-text.R to get the count data and the model fitted with fastTopics for comparison.
You can also run the code from this article as a standalone Python script.
We show a minimal example of text data topic modeling using tinytopics. The NIPS dataset contains a count matrix for 2483 research papers on 14036 terms. More details about the dataset can be found in this GitHub repo.
```python
import numpy as np
import pandas as pd
import torch
from pyreadr import read_r
from tinytopics.fit import fit_model
from tinytopics.plot import plot_loss, plot_structure, plot_top_terms
from tinytopics.utils import (
    set_random_seed,
    align_topics,
    sort_documents,
)
```
```python
def read_rds_numpy(file_path):
    # Read an RDS file and return its first (only) object as a NumPy array.
    X0 = read_r(file_path)
    X = X0[list(X0.keys())[0]]
    return X.to_numpy()


def read_rds_torch(file_path):
    X = read_rds_numpy(file_path)
    return torch.from_numpy(X)
```
```python
X = read_rds_torch("counts.rds")

with open("terms.txt", "r") as file:
    terms = [line.strip() for line in file]
```
```python
set_random_seed(42)

k = 10
model, losses = fit_model(X, k)
plot_loss(losses, output_file="loss.png")
```
We first load the L and F matrices fitted by fastTopics and then compare them with the tinytopics model. For easier visual comparison, we will try to “align” the topics fitted by tinytopics with those from fastTopics, and sort the documents grouped by dominant topics.
```python
L_tt = model.get_normalized_L().numpy()
F_tt = model.get_normalized_F().numpy()

L_ft = read_rds_numpy("L_fastTopics.rds")
F_ft = read_rds_numpy("F_fastTopics.rds")

aligned_indices = align_topics(F_ft, F_tt)
F_aligned_tt = F_tt[aligned_indices]
L_aligned_tt = L_tt[:, aligned_indices]

sorted_indices_ft = sort_documents(L_ft)
L_sorted_ft = L_ft[sorted_indices_ft]
sorted_indices_tt = sort_documents(L_aligned_tt)
L_sorted_tt = L_aligned_tt[sorted_indices_tt]
```
Use a Structure plot to check the document-topic distributions:
```python
plot_structure(
    L_sorted_ft,
    title="fastTopics document-topic distributions (sorted)",
    output_file="L-fastTopics.png",
)

plot_structure(
    L_sorted_tt,
    title="tinytopics document-topic distributions (sorted and aligned)",
    output_file="L-tinytopics.png",
)
```
Plot the probabilities of the top 15 terms in each topic from both models to inspect their concordance:
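A hedged sketch of those calls (mirroring the earlier usage; `n_top_terms` and `term_names` are assumed parameter names, and the file names are placeholders):

```python
plot_top_terms(
    F_ft,
    n_top_terms=15,    # assumed parameter name
    term_names=terms,  # assumed parameter name for term labels
    title="fastTopics top terms",
    output_file="F-top-terms-fastTopics.png",
)

plot_top_terms(
    F_aligned_tt,
    n_top_terms=15,
    term_names=terms,
    title="tinytopics top terms (aligned)",
    output_file="F-top-terms-tinytopics.png",
)
```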