docs: Refine example plot_cache_mechanism (#1211)
Fix some minor typos, make some parts clearer, add some sections, etc.
sylvaincom authored Jan 23, 2025
1 parent bb64834 commit 2a321e2
Showing 6 changed files with 146 additions and 95 deletions.
18 changes: 2 additions & 16 deletions examples/getting_started/plot_skore_getting_started.py
@@ -253,7 +253,7 @@
# .. seealso::
#
# For more information about the motivation and usage of
-# :class:`skore.CrossValidationReport`, see :ref:`_example_use_case_employee_salaries`.
+# :class:`skore.CrossValidationReport`, see :ref:`example_use_case_employee_salaries`.

# %%
# Train-test split with skore
@@ -262,21 +262,7 @@
# Skore has implemented a :func:`skore.train_test_split` function that wraps
# scikit-learn's :func:`sklearn.model_selection.train_test_split`.
#
-# For example, it can raise warnings when there is class imbalance in the data to
-# provide methodological advice:
-
-# %%
-X = np.arange(400).reshape((200, 2))
-y = [0] * 150 + [1] * 50
-
-X_train, X_test, y_train, y_test = skore.train_test_split(
-    X=X, y=y, test_size=0.2, random_state=0
-)
-
-# %%
-# In particular, there is a ``HighClassImbalanceWarning``.
-#
-# Now, let us load a dataset containing some time series data:
+# Let us load a dataset containing some time series data:

# %%
from skrub.datasets import fetch_employee_salaries
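The snippet deleted above remains a handy standalone illustration of the wrapper. A minimal sketch, assuming only numpy and skore are installed (the ``HighClassImbalanceWarning`` name comes from the deleted lines):

import numpy as np
import skore

# A synthetic dataset with a deliberately imbalanced binary target
# (150 samples of class 0 versus 50 of class 1).
X = np.arange(400).reshape((200, 2))
y = [0] * 150 + [1] * 50

# skore's wrapper mirrors scikit-learn's train_test_split but accepts
# keyword arguments; on this data it should raise a class-imbalance warning.
X_train, X_test, y_train, y_test = skore.train_test_split(
    X=X, y=y, test_size=0.2, random_state=0
)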
2 changes: 1 addition & 1 deletion examples/model_evaluation/plot_train_test_split.py
Expand Up @@ -12,7 +12,7 @@
# %%
# Creating and loading the skore project
# ======================================
-# %%
+#
# We create and load the skore project from the current directory:

# %%
4 changes: 2 additions & 2 deletions examples/technical_details/README.txt
@@ -1,5 +1,5 @@
Technical details
-----------------

-These examples shows some technical details at the core of `skore` to better understand
-some of the mechanic under the hood.
+These examples show some technical details at the core of skore to better understand
+some of the mechanics under the hood.
77 changes: 56 additions & 21 deletions examples/technical_details/plot_cache_mechanism.py
@@ -18,6 +18,8 @@
os.environ["POLARS_ALLOW_FORKING_THREAD"] = "1"

# %%
+# Loading some data
+# =================
#
# First, we load a dataset from `skrub`. Our goal is to predict if a company paid a
# physician. The ultimate goal is to detect potential conflict of interest when it comes
@@ -33,11 +35,22 @@

TableReport(df)

+# %%
+import pandas as pd
+
+TableReport(pd.DataFrame(y))
+
# %%
#
-# The dataset has over 70,000 records with only categorical features. Some categories
-# are not well-defined. We use `skrub` to create a simple predictive model that handles
-# this.
+# The dataset has over 70,000 records with only categorical features.
+# Some categories are not well defined.
+
+# %%
+# Caching with :class:`~skore.EstimatorReport` and :class:`~skore.CrossValidationReport`
+# ======================================================================================
+#
+# We use `skrub` to create a simple predictive model that handles our dataset's
+# challenges.
from skrub import tabular_learner

model = tabular_learner("classifier")
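For context, a sketch of what the one-liner above is expected to build, assuming skrub's documented behaviour (the exact pipeline steps depend on the installed skrub version):

from skrub import tabular_learner

model = tabular_learner("classifier")
# Printing the model should show a scikit-learn Pipeline, typically a
# TableVectorizer encoding the messy categorical columns followed by a
# gradient-boosted classifier.
print(model)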
@@ -52,6 +65,11 @@
X_train, X_test, y_train, y_test = train_test_split(df, y, random_state=42)

# %%
+# Caching the predictions for fast metric computation
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+#
+# First, let us focus on :class:`~skore.EstimatorReport`, as the same philosophy will
+# apply to :class:`~skore.CrossValidationReport`.
#
# Let's explore how :class:`~skore.EstimatorReport` uses caching to speed up
# predictions. We start by training the model:
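The construction of the report is collapsed in this view; a minimal sketch of the typical call, assuming skore's keyword-based constructor:

from skore import EstimatorReport

# The report fits the model on the train split and keeps the test split
# at hand so that metrics can be computed, and cached, on demand.
report = EstimatorReport(
    model,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)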
@@ -64,7 +82,7 @@

# %%
#
-# Let's compute the accuracy on our test set and measure how long it takes:
+# We compute the accuracy on our test set and measure how long it takes:
import time

start = time.time()
@@ -90,8 +108,10 @@

# %%
#
-# Both approaches take similar time. Now watch what happens when we compute accuracy
-# again:
+# Both approaches take similar time.
+#
+# Now, watch what happens when we compute the accuracy again with our skore estimator
+# report:
start = time.time()
result = report.metrics.accuracy()
end = time.time()
@@ -107,9 +127,9 @@
report._cache

# %%
-#
-# The cache stores predictions by type and data source. This means metrics that use
-# the same type of predictions will be faster. Let's try the precision metric:
+# The cache stores predictions by type and data source. This means that computing
+# metrics that use the same type of predictions will be faster.
+# Let's try the precision metric:
start = time.time()
result = report.metrics.precision()
end = time.time()
@@ -121,15 +141,20 @@
# %%
# We observe that it takes only a few milliseconds to compute the precision because we
# don't need to re-compute the predictions and only have to compute the precision
-# metric itself. Since the predictions are the bottleneck in terms of time, we observe
+# metric itself.
+# Since the predictions are the bottleneck in terms of computation time, we observe
# an interesting speedup.

# %%
+# Caching all the possible predictions at once
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
#
# We can pre-compute all predictions at once using parallel processing:
report.cache_predictions(n_jobs=2)

# %%
#
-# Now all possible predictions are stored. Any metric calculation will be much faster,
+# Now, all possible predictions are stored. Any metric calculation will be much faster,
# even on different data (like the training set):
start = time.time()
result = report.metrics.log_loss(data_source="train")
@@ -140,6 +165,8 @@
print(f"Time taken: {end - start:.2f} seconds")

# %%
+# Caching external data
+# ^^^^^^^^^^^^^^^^^^^^^
#
# The report can also work with external data. We use `data_source="X_y"` to indicate
# that we want to pass those external data.
@@ -153,9 +180,9 @@

# %%
#
-# The first calculation is slower than when using the internal train or test sets
-# because it needs to compute a hash of the new data for later retrieval. Let's
-# calculate it again:
+# The first calculation of the above cell is slower than when using the internal train
+# or test sets because it needs to compute a hash of the new data for later retrieval.
+# Let's calculate it again:
start = time.time()
result = report.metrics.log_loss(data_source="X_y", X=X_test, y=y_test)
end = time.time()
@@ -166,8 +193,9 @@

# %%
#
-# Much faster! The remaining time is related to the hash computation. Let's compute the
-# ROC AUC on the same data:
+# It is much faster the second time because the predictions are cached!
+# The remaining time corresponds to the hash computation.
+# Let's compute the ROC AUC on the same data:
start = time.time()
result = report.metrics.roc_auc(data_source="X_y", X=X_test, y=y_test)
end = time.time()
@@ -178,8 +206,12 @@

# %%
# We observe that the computation is already efficient because it boils down to two
-# computations: the hash of the data and the ROC-AUC metric. We save a lot of time
-# because we don't need to re-compute the predictions.
+# computations: the hash of the data and the ROC-AUC metric.
+# We save a lot of time because we don't need to re-compute the predictions.
+
+# %%
+# Caching for plotting
+# ^^^^^^^^^^^^^^^^^^^^
#
# The cache also speeds up plots. Let's create a ROC curve:
import matplotlib.pyplot as plt
@@ -220,6 +252,9 @@
#
# It means that nothing is stored anymore in the cache.
#
+# Caching with :class:`~skore.CrossValidationReport`
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+#
# :class:`~skore.CrossValidationReport` uses the same caching system for each fold
# in cross-validation by leveraging the previous :class:`~skore.EstimatorReport`:
from skore import CrossValidationReport
@@ -234,8 +269,8 @@

# %%
#
-# Now all possible predictions are stored. Any metric calculation will be much faster,
-# even on different data as we show for the :class:`~skore.EstimatorReport`.
+# Now, all possible predictions are stored. Any metric calculation will be much faster,
+# even on different data, as we showed for the :class:`~skore.EstimatorReport`.
start = time.time()
result = report.metrics.report_metrics(aggregate=["mean", "std"])
end = time.time()
@@ -246,4 +281,4 @@

# %%
#
-# So we observe the same type of behaviour as we previously exposed.
+# Hence, we observe the same type of behaviour as previously described.
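The construction of the cross-validation report is collapsed above; a minimal sketch of the full flow, assuming the constructor takes the estimator, the full dataset, and a cv_splitter parameter (that parameter name is an assumption):

from skore import CrossValidationReport

# One EstimatorReport is built per fold, each with its own cache.
cv_report = CrossValidationReport(model, X=df, y=y, cv_splitter=5)  # cv_splitter: assumed name

# Warm every fold's cache in parallel, then aggregate metrics across folds.
cv_report.cache_predictions(n_jobs=2)
result = cv_report.metrics.report_metrics(aggregate=["mean", "std"])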
4 changes: 2 additions & 2 deletions examples/use_cases/README.txt
@@ -1,9 +1,9 @@
End-to-end data science use cases
---------------------------------

-These examples show `skore` in action on real use case. We aimed at showing `skore`
+These examples show skore in action on real use cases. We aim to show skore's
ability to:

-- be compatible with `scikit-learn`
+- be compatible with scikit-learn
- reduce boilerplate code for some standard *de facto* data science analysis
- speed-up exploration by optimizing some internal computation