diff --git a/docs/source/notebooks/clv/clv_quickstart.ipynb b/docs/source/notebooks/clv/clv_quickstart.ipynb
index 2061d48cb..4cd8347a6 100644
--- a/docs/source/notebooks/clv/clv_quickstart.ipynb
+++ b/docs/source/notebooks/clv/clv_quickstart.ipynb
@@ -67,10 +67,10 @@
"* `customer_id` represents a unique identifier for each customer.\n",
"* `frequency` represents the number of _repeat_ purchases that a customer has made, i.e. one less than the total number of purchases.\n",
"* `T` represents a customer's \"age\", i.e. the duration between a customer's first purchase and the end of the period of study. In this example notebook, the units of time are in weeks.\n",
- "* `recency` represents the timepoint when a customer made their most recent purchase. This is also equal to the duration between a customer’s first non-repeat purchase (usually time 0) and last purchase. If a customer has made only 1 purchase, their recency is 0;\n",
+ "* `recency` represents the time period when a customer made their most recent purchase. This is equal to the duration between a customer’s first and last purchase. If a customer has made only 1 purchase, their recency is 0.\n",
"* `monetary_value` represents the average value of a given customer’s repeat purchases. Customers who have only made a single purchase have monetary values of zero.\n",
"\n",
- "If working with raw transaction data, the `rfm_summary` function can be used to preprocess data for modeling:"
+ "The `rfm_summary` function can be used to preprocess raw transaction data for modeling:"
]
},
{
@@ -339,6 +339,8 @@
"id": "514ee548",
"metadata": {},
"source": [
+ "It is important to note these definitions differ from that used in RFM segmentation, where the first purchase is included, `T` is not used, and `recency` is the number of time periods since a customer's most recent purchase.\n",
+ "\n",
"To visualize data in RFM format, we can plot the recency and T of the customers with the `plot_customer_exposure` function. We see a large chunk (>60%) of customers haven't made another purchase in a while."
]
},
@@ -2579,7 +2581,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.9.18"
+ "version": "3.10.14"
},
"toc": {
"base_numbering": 1,
diff --git a/docs/source/notebooks/clv/dev/utilities_plotting.ipynb b/docs/source/notebooks/clv/dev/utilities_plotting.ipynb
index a797237eb..ec5156c86 100644
--- a/docs/source/notebooks/clv/dev/utilities_plotting.ipynb
+++ b/docs/source/notebooks/clv/dev/utilities_plotting.ipynb
@@ -5,15 +5,7 @@
"execution_count": 1,
"id": "435ed203-5c3c-4efc-93d1-abac66ce7187",
"metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "WARNING (pytensor.tensor.blas): Using NumPy C-API based implementation for BLAS functions.\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"from pymc_marketing.clv import utils\n",
"\n",
@@ -30,7 +22,7 @@
},
{
"cell_type": "code",
- "execution_count": 69,
+ "execution_count": 2,
"id": "7de7f396-1d5b-4457-916b-c29ed90aa132",
"metadata": {},
"outputs": [],
@@ -66,7 +58,7 @@
},
{
"cell_type": "code",
- "execution_count": 70,
+ "execution_count": 3,
"id": "932e8db6-78cf-49df-aa4a-83ee6584e5dd",
"metadata": {},
"outputs": [
@@ -196,7 +188,7 @@
"13 6 2015-02-02 True"
]
},
- "execution_count": 70,
+ "execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
@@ -223,7 +215,7 @@
},
{
"cell_type": "code",
- "execution_count": 74,
+ "execution_count": 4,
"id": "4c0a7de5-8825-40af-84e5-6cd0ad26a0e3",
"metadata": {},
"outputs": [
@@ -259,42 +251,42 @@
"
\n",
" 0 | \n",
" 1 | \n",
- " 1.0 | \n",
+ " 2.0 | \n",
" 5.0 | \n",
" 5.0 | \n",
- " 2.0 | \n",
+ " 1.5 | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
- " 0.0 | \n",
+ " 1.0 | \n",
" 0.0 | \n",
" 5.0 | \n",
- " 0.0 | \n",
+ " 2.0 | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
- " 1.0 | \n",
+ " 2.0 | \n",
" 1.0 | \n",
" 5.0 | \n",
- " 5.0 | \n",
+ " 4.5 | \n",
"
\n",
" \n",
" 3 | \n",
" 4 | \n",
- " 1.0 | \n",
+ " 2.0 | \n",
" 3.0 | \n",
" 3.0 | \n",
- " 8.0 | \n",
+ " 7.0 | \n",
"
\n",
" \n",
" 4 | \n",
" 5 | \n",
- " 0.0 | \n",
+ " 1.0 | \n",
" 0.0 | \n",
" 3.0 | \n",
- " 0.0 | \n",
+ " 12.0 | \n",
"
\n",
" \n",
"\n",
@@ -302,14 +294,14 @@
],
"text/plain": [
" customer_id frequency recency T monetary_value\n",
- "0 1 1.0 5.0 5.0 2.0\n",
- "1 2 0.0 0.0 5.0 0.0\n",
- "2 3 1.0 1.0 5.0 5.0\n",
- "3 4 1.0 3.0 3.0 8.0\n",
- "4 5 0.0 0.0 3.0 0.0"
+ "0 1 2.0 5.0 5.0 1.5\n",
+ "1 2 1.0 0.0 5.0 2.0\n",
+ "2 3 2.0 1.0 5.0 4.5\n",
+ "3 4 2.0 3.0 3.0 7.0\n",
+ "4 5 1.0 0.0 3.0 12.0"
]
},
- "execution_count": 74,
+ "execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
@@ -323,7 +315,7 @@
" observation_period_end = \"2015-02-06\",\n",
" datetime_format = \"%Y-%m-%d\",\n",
" time_unit = \"W\",\n",
- " include_first_transaction=False,\n",
+ " include_first_transaction=True,\n",
")\n",
"\n",
"rfm_df.head()"
@@ -339,7 +331,7 @@
},
{
"cell_type": "code",
- "execution_count": 76,
+ "execution_count": 5,
"id": "761edfe9-1b69-4966-83bf-4f1242eda2d5",
"metadata": {},
"outputs": [
@@ -450,7 +442,7 @@
"4 0.0 5.0 "
]
},
- "execution_count": 76,
+ "execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
@@ -467,13 +459,137 @@
"train_test.head()"
]
},
+ {
+ "cell_type": "markdown",
+ "id": "73dc1b93-6a4f-4171-b838-30759b2c1e0e",
+ "metadata": {},
+ "source": [
+ "`rfm_segments` will assign customer to segments based on their recency, frequency, and monetary value. It uses a quartile-based RFM score approach that is very computationally efficient, but defining custom segments is a rather subjective exercise. The returned dataframe also cannot be used for modeling because it does not zero out the initial transactions."
+ ]
+ },
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 40,
"id": "c7b3f800-8dfb-4e5a-b939-5f908281563c",
"metadata": {},
"outputs": [],
- "source": []
+ "source": [
+ "segments = utils.rfm_segments(\n",
+ " test_data, \n",
+ " customer_id_col = \"id\", \n",
+ " datetime_col = \"date\", \n",
+ " monetary_value_col = \"monetary_value\",\n",
+ " observation_period_end = \"2015-02-06\",\n",
+ " datetime_format = \"%Y-%m-%d\",\n",
+ " time_unit = \"W\",\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "id": "932ac4e5-361e-42fa-97d3-d8e508128944",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " customer_id | \n",
+ " frequency | \n",
+ " recency | \n",
+ " monetary_value | \n",
+ " segment | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 2.0 | \n",
+ " 0.0 | \n",
+ " 1.5 | \n",
+ " Other | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 1.0 | \n",
+ " 5.0 | \n",
+ " 2.0 | \n",
+ " Inactive Customer | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 2.0 | \n",
+ " 4.0 | \n",
+ " 4.5 | \n",
+ " At Risk Customer | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 4 | \n",
+ " 2.0 | \n",
+ " 0.0 | \n",
+ " 7.0 | \n",
+ " Top Spender | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 5 | \n",
+ " 1.0 | \n",
+ " 3.0 | \n",
+ " 12.0 | \n",
+ " At Risk Customer | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " 6 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 5.0 | \n",
+ " Top Spender | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " customer_id frequency recency monetary_value segment\n",
+ "0 1 2.0 0.0 1.5 Other\n",
+ "1 2 1.0 5.0 2.0 Inactive Customer\n",
+ "2 3 2.0 4.0 4.5 At Risk Customer\n",
+ "3 4 2.0 0.0 7.0 Top Spender\n",
+ "4 5 1.0 3.0 12.0 At Risk Customer\n",
+ "5 6 1.0 0.0 5.0 Top Spender"
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "segments"
+ ]
}
],
"metadata": {
@@ -492,7 +608,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.9.18"
+ "version": "3.10.14"
}
},
"nbformat": 4,
diff --git a/pymc_marketing/clv/__init__.py b/pymc_marketing/clv/__init__.py
index f0b3c847e..57a4f4826 100644
--- a/pymc_marketing/clv/__init__.py
+++ b/pymc_marketing/clv/__init__.py
@@ -25,6 +25,7 @@
)
from pymc_marketing.clv.utils import (
customer_lifetime_value,
+ rfm_segments,
rfm_summary,
rfm_train_test_split,
)
@@ -39,6 +40,7 @@
"plot_customer_exposure",
"plot_frequency_recency_matrix",
"plot_probability_alive_matrix",
+ "rfm_segments",
"rfm_summary",
"rfm_train_test_split",
)
diff --git a/pymc_marketing/clv/utils.py b/pymc_marketing/clv/utils.py
index 1abcc7933..055b03dd6 100644
--- a/pymc_marketing/clv/utils.py
+++ b/pymc_marketing/clv/utils.py
@@ -22,6 +22,7 @@
__all__ = [
"to_xarray",
"customer_lifetime_value",
+ "rfm_segments",
"rfm_summary",
"rfm_train_test_split",
]
@@ -304,9 +305,14 @@ def rfm_summary(
This transforms a DataFrame of transaction data of the form:
customer_id, datetime [, monetary_value]
- to a DataFrame of the form:
+ to a DataFrame for CLV modeling:
customer_id, frequency, recency, T [, monetary_value]
+ If the `include_first_transaction = True` argument is specified, a DataFrame for RFM segmentation is returned:
+ customer_id, frequency, recency, monetary_value
+
+ This function is not required if using the `rfm_segments` utility.
+
Adapted from lifetimes package
https://github.com/CamDavidsonPilon/lifetimes/blob/41e394923ad72b17b5da93e88cfabab43f51abe2/lifetimes/utils.py#L230
@@ -320,7 +326,7 @@ def rfm_summary(
Column in the transactions DataFrame that denotes the datetime the purchase was made.
monetary_value_col: string, optional
Column in the transactions DataFrame that denotes the monetary value of the transaction.
- Optional; only needed for spend estimation models like the Gamma-Gamma model.
+ Optional; only needed for RFM segmentation and spend estimation models like the Gamma-Gamma model.
observation_period_end: Union[str, pd.Period, datetime], optional
A string or datetime to denote the final date of the study.
Events after this date are truncated. If not given, defaults to the max 'datetime_col'.
@@ -387,25 +393,33 @@ def rfm_summary(
datetime_col
].agg(["min", "max", "count"])
- if not include_first_transaction:
- # subtract 1 from count, as we ignore their first order.
- customers["frequency"] = customers["count"] - 1
- else:
- customers["frequency"] = customers["count"]
+ # subtract 1 from count, as we ignore the first order.
+ customers["frequency"] = customers["count"] - 1
- customers["T"] = (
- (observation_period_end_ts - customers["min"])
+ customers["recency"] = (
+ (pd.to_datetime(customers["max"]) - pd.to_datetime(customers["min"])) # type: ignore
/ np.timedelta64(1, time_unit)
/ time_scaler
)
- customers["recency"] = (
- (pd.to_datetime(customers["max"]) - pd.to_datetime(customers["min"])) # type: ignore
+
+ customers["T"] = (
+ (observation_period_end_ts - customers["min"])
/ np.timedelta64(1, time_unit)
/ time_scaler
)
summary_columns = ["frequency", "recency", "T"]
+ if include_first_transaction:
+ # add the first order back to the frequency count
+ customers["frequency"] = customers["frequency"] + 1
+
+ # change recency to segmentation definition
+ customers["recency"] = customers["T"] - customers["recency"]
+
+ # T column is not used for segmentation
+ summary_columns = ["frequency", "recency"]
+
if monetary_value_col:
if not include_first_transaction:
# create an index of all the first purchases
@@ -597,3 +611,202 @@ def rfm_train_test_split(
train_test_rfm_data["test_T"] = time_delta / time_scaler # type: ignore
return train_test_rfm_data
+
+
+def rfm_segments(
+ transactions: pd.DataFrame,
+ customer_id_col: str,
+ datetime_col: str,
+ monetary_value_col: str,
+ segment_config: dict | None = None,
+ observation_period_end: str | pd.Period | datetime | None = None,
+ datetime_format: str | None = None,
+ time_unit: str = "D",
+ time_scaler: float | None = 1,
+ sort_transactions: bool | None = True,
+) -> pd.DataFrame:
+ """
+ Assign customers to segments based on spending behavior derived from RFM scores.
+
+ This transforms a DataFrame of transaction data of the form:
+ customer_id, datetime, monetary_value
+ to a DataFrame of the form:
+ customer_id, frequency, recency, monetary_value, rfm_score, segment
+
+ Customer purchasing data is aggregated into three variables: `recency`, `frequency`, and `monetary_value`.
+ Quartiles are estimated for each variable, and a three-digit RFM score is then assigned to each customer.
+    For example, a customer with a score of '234' scored 2 for `recency`, 3 for `frequency`, and 4 for
+    `monetary_value`. Note that recency scores are reversed: a higher score means a more recent purchase.
+ RFM scores corresponding to segments such as "Top Spender", "Frequent Buyer", or "At-Risk" are determined, and
+ customers are then segmented based on their RFM score.
+
+ By default, the following segments are created:
+ "Premium Customer": Customers in top 2 quartiles for all variables.
+ "Repeat Customer": Customers in top 2 quartiles for frequency, and either recency or monetary value.
+ "Top Spender": Customers in top 2 quartiles for monetary value, and either frequency or recency.
+ "At-Risk Customer": Customers in bottom 2 quartiles for two or more variables.
+ "Inactive Customer": Customers in bottom quartile for two or more variables.
+ Customers with unspecified RFM scores will be assigned to a segment named "Other".
+
+ If an alternative segmentation approach is desired, use
+ `rfm_summary(include_first_transaction=True, *args, **kwargs)` instead to preprocess data for segmentation.
+ In either case, the returned DataFrame cannot be used for modeling.
+ If assigning model predictions to RFM segments, create a separate DataFrame for modeling and join by Customer ID.
+
+ Parameters
+ ----------
+ transactions: :obj: DataFrame
+ A Pandas DataFrame that contains the customer_id col and the datetime col.
+ customer_id_col: string
+ Column in the transactions DataFrame that denotes the customer_id.
+ datetime_col: string
+ Column in the transactions DataFrame that denotes the datetime the purchase was made.
+    monetary_value_col: string
+        Column in the transactions DataFrame that denotes the monetary value of the transaction.
+        Required here because monetary value is one of the three variables used for RFM scoring.
+ segment_config: dict, optional
+ Dictionary containing segment names and list of RFM score assignments;
+ key/value pairs should be formatted as `{"segment": ['111', '123', '321'], ...}`.
+ If not provided, default segment names and definitions are applied.
+ observation_period_end: Union[str, pd.Period, datetime, None], optional
+ A string or datetime to denote the final date of the study.
+ Events after this date are truncated. If not given, defaults to the max 'datetime_col'.
+ datetime_format: string, optional
+ A string that represents the timestamp format. Useful if Pandas can't understand
+ the provided format.
+ time_unit: string, optional
+ Time granularity for study.
+ Default: 'D' for days. Possible values listed here:
+ https://numpy.org/devdocs/reference/arrays.datetime.html#datetime-units
+    time_scaler: float, optional
+        Default: 1. Useful for scaling recency & T to a different time granularity. Example:
+        With time_unit='D' and time_scaler=1, we get recency=591 and T=632
+        With time_unit='h' and time_scaler=24, we get recency=590.125 and T=631.375
+ This is useful if predictions in a different time granularity are desired,
+ and can also help with model convergence for study periods of many years.
+ sort_transactions: bool, optional
+ Default: True
+ If raw data is already sorted in chronological order, set to `False` to improve computational efficiency.
+
+ Returns
+ -------
+    :obj: DataFrame:
+        customer_id, frequency, recency, monetary_value, rfm_score, segment
+ """
+
+ rfm_data = rfm_summary(
+ transactions,
+ customer_id_col=customer_id_col,
+ datetime_col=datetime_col,
+ monetary_value_col=monetary_value_col,
+ observation_period_end=observation_period_end,
+ datetime_format=datetime_format,
+ time_unit=time_unit,
+ time_scaler=time_scaler,
+ include_first_transaction=True,
+ sort_transactions=sort_transactions,
+ )
+
+ # iteratively assign quartile labels for each row/variable
+ for column_name in zip(
+ ["r_quartile", "f_quartile", "m_quartile"],
+ ["recency", "frequency", "monetary_value"],
+ strict=False,
+ ):
+ # If data has many repeat values, fewer than 4 bins will be returned.
+ # These try blocks will modify labelling for fewer bins.
+ try:
+ labels = _rfm_quartile_labels(column_name[0], 5)
+ rfm_data[column_name[0]] = pd.qcut(
+ rfm_data[column_name[1]], q=4, labels=labels, duplicates="drop"
+ ).astype(str)
+ except ValueError:
+ try:
+ labels = _rfm_quartile_labels(column_name[0], 4)
+ rfm_data[column_name[0]] = pd.qcut(
+ rfm_data[column_name[1]], q=4, labels=labels, duplicates="drop"
+ ).astype(str)
+ except ValueError:
+ labels = _rfm_quartile_labels(column_name[0], 3)
+ rfm_data[column_name[0]] = pd.qcut(
+ rfm_data[column_name[1]], q=4, labels=labels, duplicates="drop"
+ ).astype(str)
+ warnings.warn(
+ f"RFM score will not exceed 2 for {column_name[0]}. Specify a custom segment_config",
+ UserWarning,
+ stacklevel=1,
+ )
+
+ rfm_data = pd.eval( # type: ignore
+ "rfm_score = rfm_data.r_quartile + rfm_data.f_quartile + rfm_data.m_quartile",
+ target=rfm_data,
+ )
+
+ if segment_config is None:
+ segment_config = _default_rfm_segment_config
+
+ segment_names = list(segment_config.keys())
+
+ # create catch-all "Other" segment and assign defined segments from config
+ rfm_data["segment"] = "Other"
+
+ for key in segment_names:
+ rfm_data.loc[rfm_data["rfm_score"].isin(segment_config[key]), "segment"] = key
+
+ # drop unnecessary columns
+ rfm_data = rfm_data.drop(columns=["r_quartile", "f_quartile", "m_quartile"])
+
+ return rfm_data
+
+
+def _rfm_quartile_labels(column_name, max_label_range):
+ """called internally by rfm_segments to label quartiles for each variable"""
+ # recency labels must be reversed because lower values are more desirable
+ if column_name == "r_quartile":
+ return list(range(max_label_range - 1, 0, -1))
+ else:
+ return range(1, max_label_range)
+
+
+_default_rfm_segment_config = {
+ "Premium Customer": [
+ "334",
+ "443",
+ "444",
+ "344",
+ "434",
+ "433",
+ "343",
+ "333",
+ ],
+ "Repeat Customer": ["244", "234", "232", "332", "143", "233", "243"],
+ "Top Spender": [
+ "424",
+ "414",
+ "144",
+ "314",
+ "324",
+ "124",
+ "224",
+ "423",
+ "413",
+ "133",
+ "323",
+ "313",
+ "134",
+ ],
+ "At Risk Customer": [
+ "422",
+ "223",
+ "212",
+ "122",
+ "222",
+ "132",
+ "322",
+ "312",
+ "412",
+ "123",
+ "214",
+ ],
+ "Inactive Customer": ["411", "111", "113", "114", "112", "211", "311"],
+}
diff --git a/tests/clv/test_utils.py b/tests/clv/test_utils.py
index 46323b6b0..f6711534b 100644
--- a/tests/clv/test_utils.py
+++ b/tests/clv/test_utils.py
@@ -23,8 +23,10 @@
from pymc_marketing.clv import BetaGeoModel, GammaGammaModel, ParetoNBDModel
from pymc_marketing.clv.utils import (
_find_first_transactions,
+ _rfm_quartile_labels,
clv_summary,
customer_lifetime_value,
+ rfm_segments,
rfm_summary,
rfm_train_test_split,
to_xarray,
@@ -350,20 +352,20 @@ def transaction_data(self) -> pd.DataFrame:
[5, "2015-01-18", 8],
[6, "2015-02-02", 5],
]
- return pd.DataFrame(d, columns=["id", "date", "monetary_value"])
+ return pd.DataFrame(d, columns=["identifier", "date", "monetary_value"])
def test_find_first_transactions_test_period_end_none(self, transaction_data):
max_date = transaction_data["date"].max()
pd.testing.assert_frame_equal(
left=_find_first_transactions(
transactions=transaction_data,
- customer_id_col="id",
+ customer_id_col="identifier",
datetime_col="date",
observation_period_end=None,
),
right=_find_first_transactions(
transactions=transaction_data,
- customer_id_col="id",
+ customer_id_col="identifier",
datetime_col="date",
observation_period_end=max_date,
),
@@ -382,7 +384,7 @@ def test_find_first_transactions_returns_correct_results(
actual = _find_first_transactions(
transaction_data,
- "id",
+ "identifier",
"date",
observation_period_end=today,
)
@@ -402,7 +404,7 @@ def test_find_first_transactions_returns_correct_results(
[5, pd.Period("2015-01-18", "D"), False],
[6, pd.Period("2015-02-02", "D"), True],
],
- columns=["id", "date", "first"],
+ columns=["identifier", "date", "first"],
index=[0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13],
) # row indices are skipped for time periods with multiple transactions
assert_frame_equal(actual, expected)
@@ -420,7 +422,7 @@ def test_find_first_transactions_with_specific_non_daily_frequency(
actual = _find_first_transactions(
transaction_data,
- "id",
+ "identifier",
"date",
observation_period_end=today,
time_unit="W",
@@ -437,7 +439,7 @@ def test_find_first_transactions_with_specific_non_daily_frequency(
[5, pd.Period("2015-01-12/2015-01-18", "W-SUN"), True],
[6, pd.Period("2015-02-02/2015-02-08", "W-SUN"), True],
],
- columns=["id", "date", "first"],
+ columns=["identifier", "date", "first"],
index=actual.index,
) # we shouldn't really care about row ordering or indexing, but assert_frame_equals is strict about it
assert_frame_equal(actual, expected)
@@ -455,7 +457,7 @@ def test_find_first_transactions_with_monetary_values(
actual = _find_first_transactions(
transaction_data,
- "id",
+ "identifier",
"date",
"monetary_value",
observation_period_end=today,
@@ -476,7 +478,7 @@ def test_find_first_transactions_with_monetary_values(
[5, pd.Period("2015-01-18", "D"), 8, False],
[6, pd.Period("2015-02-02", "D"), 5, True],
],
- columns=["id", "date", "monetary_value", "first"],
+ columns=["identifier", "date", "monetary_value", "first"],
)
assert_frame_equal(actual, expected)
@@ -493,7 +495,7 @@ def test_find_first_transactions_with_monetary_values_with_specific_non_daily_fr
actual = _find_first_transactions(
transaction_data,
- "id",
+ "identifier",
"date",
"monetary_value",
observation_period_end=today,
@@ -511,7 +513,7 @@ def test_find_first_transactions_with_monetary_values_with_specific_non_daily_fr
[5, pd.Period("2015-01-12/2015-01-18", "W-SUN"), 12, True],
[6, pd.Period("2015-02-02/2015-02-08", "W-SUN"), 5, True],
],
- columns=["id", "date", "monetary_value", "first"],
+ columns=["identifier", "date", "monetary_value", "first"],
)
assert_frame_equal(actual, expected)
@@ -525,7 +527,7 @@ def test_rfm_summary_returns_correct_results(self, transaction_data, today):
# https://github.com/CamDavidsonPilon/lifetimes/blob/aae339c5437ec31717309ba0ec394427e19753c4/tests/test_utils.py#L239
actual = rfm_summary(
- transaction_data, "id", "date", observation_period_end=today
+ transaction_data, "identifier", "date", observation_period_end=today
)
expected = pd.DataFrame(
[
@@ -552,8 +554,8 @@ def test_rfm_summary_works_with_string_customer_ids(self):
["Y", "2015-01-02"],
["Y", "2015-01-05"],
]
- df = pd.DataFrame(d, columns=["id", "date"])
- rfm_summary(df, "id", "date")
+ df = pd.DataFrame(d, columns=["identifier", "date"])
+ rfm_summary(df, "identifier", "date")
def test_rfm_summary_works_with_int_customer_ids_and_doesnt_coerce_to_float(self):
# Test adapted from
@@ -567,8 +569,8 @@ def test_rfm_summary_works_with_int_customer_ids_and_doesnt_coerce_to_float(self
[2, "2015-01-02"],
[2, "2015-01-05"],
]
- df = pd.DataFrame(d, columns=["id", "date"])
- actual = rfm_summary(df, "id", "date")
+ df = pd.DataFrame(d, columns=["identifier", "date"])
+ actual = rfm_summary(df, "identifier", "date")
assert actual.index.dtype == "int64"
def test_rfm_summary_with_specific_datetime_format(
@@ -585,7 +587,7 @@ def test_rfm_summary_with_specific_datetime_format(
today = "20150207"
actual = rfm_summary(
transaction_data,
- "id",
+ "identifier",
"date",
observation_period_end=today,
datetime_format=format,
@@ -614,7 +616,7 @@ def test_rfm_summary_non_daily_frequency(
today = "20150207"
actual = rfm_summary(
transaction_data,
- "id",
+ "identifier",
"date",
observation_period_end=today,
time_unit="W",
@@ -642,7 +644,7 @@ def test_rfm_summary_monetary_values_and_first_transactions(
today = "20150207"
actual = rfm_summary(
transaction_data,
- "id",
+ "identifier",
"date",
monetary_value_col="monetary_value",
observation_period_end=today,
@@ -662,7 +664,7 @@ def test_rfm_summary_monetary_values_and_first_transactions(
actual_first_trans = rfm_summary(
transaction_data,
- "id",
+ "identifier",
"date",
monetary_value_col="monetary_value",
observation_period_end=today,
@@ -670,14 +672,14 @@ def test_rfm_summary_monetary_values_and_first_transactions(
)
expected_first_trans = pd.DataFrame(
[
- [1, 2.0, 36.0, 37.0, 1.5],
- [2, 1.0, 0.0, 37.0, 2],
- [3, 3.0, 4.0, 37.0, 3],
- [4, 3.0, 20.0, 22.0, 6],
- [5, 3.0, 2.0, 22.0, 4],
- [6, 1.0, 0.0, 5.0, 5],
+ [1, 2.0, 1.0, 1.5],
+ [2, 1.0, 37.0, 2],
+ [3, 3.0, 33.0, 3],
+ [4, 3.0, 2.0, 6],
+ [5, 3.0, 20.0, 4],
+ [6, 1.0, 5.0, 5],
],
- columns=["customer_id", "frequency", "recency", "T", "monetary_value"],
+ columns=["customer_id", "frequency", "recency", "monetary_value"],
)
assert_frame_equal(actual_first_trans, expected_first_trans)
@@ -695,9 +697,11 @@ def test_rfm_summary_will_choose_the_correct_first_order_to_drop_in_monetary_tra
)
sales = pd.Series([10, 20, 25])
transaction_data = pd.DataFrame(
- {"date": dates_ordered, "id": cust, "sales": sales}
+ {"date": dates_ordered, "identifier": cust, "sales": sales}
+ )
+ summary_ordered_data = rfm_summary(
+ transaction_data, "identifier", "date", "sales"
)
- summary_ordered_data = rfm_summary(transaction_data, "id", "date", "sales")
dates_unordered = pd.to_datetime(
pd.Series(
@@ -706,9 +710,11 @@ def test_rfm_summary_will_choose_the_correct_first_order_to_drop_in_monetary_tra
)
sales = pd.Series([20, 10, 25])
transaction_data = pd.DataFrame(
- {"date": dates_unordered, "id": cust, "sales": sales}
+ {"date": dates_unordered, "identifier": cust, "sales": sales}
+ )
+ summary_unordered_data = rfm_summary(
+ transaction_data, "identifier", "date", "sales"
)
- summary_unordered_data = rfm_summary(transaction_data, "id", "date", "sales")
assert_frame_equal(summary_ordered_data, summary_unordered_data)
assert summary_ordered_data["monetary_value"].loc[0] == 22.5
@@ -744,21 +750,21 @@ def test_rfm_summary_squashes_period_purchases_to_one_purchase(self):
# https://github.com/CamDavidsonPilon/lifetimes/blob/aae339c5437ec31717309ba0ec394427e19753c4/tests/test_utils.py#L472
transactions = pd.DataFrame(
- [[1, "2015-01-01"], [1, "2015-01-01"]], columns=["id", "t"]
+ [[1, "2015-01-01"], [1, "2015-01-01"]], columns=["identifier", "t"]
)
- actual = rfm_summary(transactions, "id", "t", time_unit="W")
+ actual = rfm_summary(transactions, "identifier", "t", time_unit="W")
assert actual.loc[0]["frequency"] == 1.0 - 1.0
def test_clv_summary_warning(self, transaction_data):
with pytest.warns(UserWarning, match="clv_summary was renamed to rfm_summary"):
- clv_summary(transaction_data, "id", "date")
+ clv_summary(transaction_data, "identifier", "date")
def test_rfm_train_test_split(self, transaction_data):
# Test adapted from
# https://github.com/CamDavidsonPilon/lifetimes/blob/aae339c5437ec31717309ba0ec394427e19753c4/tests/test_utils.py#L374
train_end = "2015-02-01"
- actual = rfm_train_test_split(transaction_data, "id", "date", train_end)
+ actual = rfm_train_test_split(transaction_data, "identifier", "date", train_end)
assert actual.loc[0]["test_frequency"] == 1
assert actual.loc[1]["test_frequency"] == 0
@@ -781,7 +787,11 @@ def test_rfm_train_test_split_throws_better_error_if_test_period_end_is_too_earl
with pytest.raises(ValueError, match=error_msg):
rfm_train_test_split(
- transaction_data, "id", "date", train_end, test_period_end=test_end
+ transaction_data,
+ "identifier",
+ "date",
+ train_end,
+ test_period_end=test_end,
)
def test_rfm_train_test_split_works_with_specific_frequency(self, transaction_data):
@@ -792,7 +802,7 @@ def test_rfm_train_test_split_works_with_specific_frequency(self, transaction_da
train_end = "2015-02-01"
actual = rfm_train_test_split(
transaction_data,
- "id",
+ "identifier",
"date",
train_end,
test_period_end=test_end,
@@ -824,7 +834,7 @@ def test_rfm_train_test_split_gives_correct_date_boundaries(self, transaction_da
actual = rfm_train_test_split(
transaction_data,
- "id",
+ "identifier",
"date",
train_period_end="2015-02-01",
test_period_end="2015-02-04",
@@ -840,7 +850,7 @@ def test_rfm_train_test_split_with_monetary_value(self, transaction_data):
train_end = "2015-02-01"
actual = rfm_train_test_split(
transaction_data,
- "id",
+ "identifier",
"date",
train_end,
test_period_end=test_end,
@@ -849,4 +859,70 @@ def test_rfm_train_test_split_with_monetary_value(self, transaction_data):
assert (actual["monetary_value"] == [0, 0, 3, 0, 4.5]).all()
assert (actual["test_monetary_value"] == [2, 0, 0, 6, 0]).all()
- # check test_monetary_value is being aggregated correctly for time periods with multiple purchases
+ @pytest.mark.parametrize("config", (None, "custom"))
+ def test_rfm_segmentation_config(self, transaction_data, config):
+ if config is not None:
+ config = {
+ "Test Segment": [
+ "111",
+ "222",
+ "333",
+ "444",
+ ]
+ }
+ expected = ["Other", "Test Segment", "Other", "Other", "Other", "Other"]
+ else:
+ expected = [
+ "Other",
+ "Inactive Customer",
+ "At Risk Customer",
+ "Premium Customer",
+ "Repeat Customer",
+ "Top Spender",
+ ]
+
+ actual = rfm_segments(
+ transaction_data,
+ "identifier",
+ "date",
+ "monetary_value",
+ segment_config=config,
+ )
+
+ assert (actual["segment"] == expected).all()
+
+ def test_rfm_segmentation_warning(self):
+ # this data will only return two bins for the frequency variable
+ d = [
+ [1, "2015-01-01", 1],
+ [1, "2015-02-06", 2],
+ [2, "2015-01-01", 2],
+ [3, "2015-01-02", 1],
+ [3, "2015-01-05", 3],
+ [4, "2015-01-16", 4],
+ [4, "2015-02-05", 5],
+ [5, "2015-01-17", 1],
+ [5, "2015-01-18", 2],
+ [5, "2015-01-19", 2],
+ ]
+ repetitive_data = pd.DataFrame(d, columns=["id", "date", "monetary_value"])
+
+ with pytest.warns(
+ UserWarning,
+ match="RFM score will not exceed 2 for f_quartile. Specify a custom segment_config",
+ ):
+ rfm_segments(
+ repetitive_data,
+ "id",
+ "date",
+ "monetary_value",
+ )
+
+ def test_rfm_quartile_labels(self):
+ # assert recency labels are in reverse order
+ recency = _rfm_quartile_labels("r_quartile", 5)
+ assert recency == [4, 3, 2, 1]
+
+        # assert max_label_range = 4 returns a range covering three labels
+ frequency = _rfm_quartile_labels("f_quartile", 4)
+ assert frequency == range(1, 4)