diff --git a/docs/source/notebooks/clv/clv_quickstart.ipynb b/docs/source/notebooks/clv/clv_quickstart.ipynb index 2061d48cb..4cd8347a6 100644 --- a/docs/source/notebooks/clv/clv_quickstart.ipynb +++ b/docs/source/notebooks/clv/clv_quickstart.ipynb @@ -67,10 +67,10 @@ "* `customer_id` represents a unique identifier for each customer.\n", "* `frequency` represents the number of _repeat_ purchases that a customer has made, i.e. one less than the total number of purchases.\n", "* `T` represents a customer's \"age\", i.e. the duration between a customer's first purchase and the end of the period of study. In this example notebook, the units of time are in weeks.\n", - "* `recency` represents the timepoint when a customer made their most recent purchase. This is also equal to the duration between a customer’s first non-repeat purchase (usually time 0) and last purchase. If a customer has made only 1 purchase, their recency is 0;\n", + "* `recency` represents the time period when a customer made their most recent purchase. This is equal to the duration between a customer’s first and last purchase. If a customer has made only 1 purchase, their recency is 0.\n", "* `monetary_value` represents the average value of a given customer’s repeat purchases. Customers who have only made a single purchase have monetary values of zero.\n", "\n", - "If working with raw transaction data, the `rfm_summary` function can be used to preprocess data for modeling:" + "The `rfm_summary` function can be used to preprocess raw transaction data for modeling:" ] }, { @@ -339,6 +339,8 @@ "id": "514ee548", "metadata": {}, "source": [ + "It is important to note these definitions differ from that used in RFM segmentation, where the first purchase is included, `T` is not used, and `recency` is the number of time periods since a customer's most recent purchase.\n", + "\n", "To visualize data in RFM format, we can plot the recency and T of the customers with the `plot_customer_exposure` function. 
We see a large chunk (>60%) of customers haven't made another purchase in a while." ] }, @@ -2579,7 +2581,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.18" + "version": "3.10.14" }, "toc": { "base_numbering": 1, diff --git a/docs/source/notebooks/clv/dev/utilities_plotting.ipynb b/docs/source/notebooks/clv/dev/utilities_plotting.ipynb index a797237eb..ec5156c86 100644 --- a/docs/source/notebooks/clv/dev/utilities_plotting.ipynb +++ b/docs/source/notebooks/clv/dev/utilities_plotting.ipynb @@ -5,15 +5,7 @@ "execution_count": 1, "id": "435ed203-5c3c-4efc-93d1-abac66ce7187", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING (pytensor.tensor.blas): Using NumPy C-API based implementation for BLAS functions.\n" - ] - } - ], + "outputs": [], "source": [ "from pymc_marketing.clv import utils\n", "\n", @@ -30,7 +22,7 @@ }, { "cell_type": "code", - "execution_count": 69, + "execution_count": 2, "id": "7de7f396-1d5b-4457-916b-c29ed90aa132", "metadata": {}, "outputs": [], @@ -66,7 +58,7 @@ }, { "cell_type": "code", - "execution_count": 70, + "execution_count": 3, "id": "932e8db6-78cf-49df-aa4a-83ee6584e5dd", "metadata": {}, "outputs": [ @@ -196,7 +188,7 @@ "13 6 2015-02-02 True" ] }, - "execution_count": 70, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -223,7 +215,7 @@ }, { "cell_type": "code", - "execution_count": 74, + "execution_count": 4, "id": "4c0a7de5-8825-40af-84e5-6cd0ad26a0e3", "metadata": {}, "outputs": [ @@ -259,42 +251,42 @@ " \n", " 0\n", " 1\n", - " 1.0\n", + " 2.0\n", " 5.0\n", " 5.0\n", - " 2.0\n", + " 1.5\n", " \n", " \n", " 1\n", " 2\n", - " 0.0\n", + " 1.0\n", " 0.0\n", " 5.0\n", - " 0.0\n", + " 2.0\n", " \n", " \n", " 2\n", " 3\n", - " 1.0\n", + " 2.0\n", " 1.0\n", " 5.0\n", - " 5.0\n", + " 4.5\n", " \n", " \n", " 3\n", " 4\n", - " 1.0\n", + " 2.0\n", " 3.0\n", " 3.0\n", - " 8.0\n", + " 7.0\n", " \n", " \n", " 4\n", 
" 5\n", - " 0.0\n", + " 1.0\n", " 0.0\n", " 3.0\n", - " 0.0\n", + " 12.0\n", " \n", " \n", "\n", @@ -302,14 +294,14 @@ ], "text/plain": [ " customer_id frequency recency T monetary_value\n", - "0 1 1.0 5.0 5.0 2.0\n", - "1 2 0.0 0.0 5.0 0.0\n", - "2 3 1.0 1.0 5.0 5.0\n", - "3 4 1.0 3.0 3.0 8.0\n", - "4 5 0.0 0.0 3.0 0.0" + "0 1 2.0 5.0 5.0 1.5\n", + "1 2 1.0 0.0 5.0 2.0\n", + "2 3 2.0 1.0 5.0 4.5\n", + "3 4 2.0 3.0 3.0 7.0\n", + "4 5 1.0 0.0 3.0 12.0" ] }, - "execution_count": 74, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -323,7 +315,7 @@ " observation_period_end = \"2015-02-06\",\n", " datetime_format = \"%Y-%m-%d\",\n", " time_unit = \"W\",\n", - " include_first_transaction=False,\n", + " include_first_transaction=True,\n", ")\n", "\n", "rfm_df.head()" @@ -339,7 +331,7 @@ }, { "cell_type": "code", - "execution_count": 76, + "execution_count": 5, "id": "761edfe9-1b69-4966-83bf-4f1242eda2d5", "metadata": {}, "outputs": [ @@ -450,7 +442,7 @@ "4 0.0 5.0 " ] }, - "execution_count": 76, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -467,13 +459,137 @@ "train_test.head()" ] }, + { + "cell_type": "markdown", + "id": "73dc1b93-6a4f-4171-b838-30759b2c1e0e", + "metadata": {}, + "source": [ + "`rfm_segments` will assign customer to segments based on their recency, frequency, and monetary value. It uses a quartile-based RFM score approach that is very computationally efficient, but defining custom segments is a rather subjective exercise. The returned dataframe also cannot be used for modeling because it does not zero out the initial transactions." 
+ ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 40, "id": "c7b3f800-8dfb-4e5a-b939-5f908281563c", "metadata": {}, "outputs": [], - "source": [] + "source": [ + "segments = utils.rfm_segments(\n", + " test_data, \n", + " customer_id_col = \"id\", \n", + " datetime_col = \"date\", \n", + " monetary_value_col = \"monetary_value\",\n", + " observation_period_end = \"2015-02-06\",\n", + " datetime_format = \"%Y-%m-%d\",\n", + " time_unit = \"W\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "932ac4e5-361e-42fa-97d3-d8e508128944", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customer_idfrequencyrecencymonetary_valuesegment
012.00.01.5Other
121.05.02.0Inactive Customer
232.04.04.5At Risk Customer
342.00.07.0Top Spender
451.03.012.0At Risk Customer
561.00.05.0Top Spender
\n", + "
" + ], + "text/plain": [ + " customer_id frequency recency monetary_value segment\n", + "0 1 2.0 0.0 1.5 Other\n", + "1 2 1.0 5.0 2.0 Inactive Customer\n", + "2 3 2.0 4.0 4.5 At Risk Customer\n", + "3 4 2.0 0.0 7.0 Top Spender\n", + "4 5 1.0 3.0 12.0 At Risk Customer\n", + "5 6 1.0 0.0 5.0 Top Spender" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "segments" + ] } ], "metadata": { @@ -492,7 +608,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.18" + "version": "3.10.14" } }, "nbformat": 4, diff --git a/pymc_marketing/clv/__init__.py b/pymc_marketing/clv/__init__.py index f0b3c847e..57a4f4826 100644 --- a/pymc_marketing/clv/__init__.py +++ b/pymc_marketing/clv/__init__.py @@ -25,6 +25,7 @@ ) from pymc_marketing.clv.utils import ( customer_lifetime_value, + rfm_segments, rfm_summary, rfm_train_test_split, ) @@ -39,6 +40,7 @@ "plot_customer_exposure", "plot_frequency_recency_matrix", "plot_probability_alive_matrix", + "rfm_segments", "rfm_summary", "rfm_train_test_split", ) diff --git a/pymc_marketing/clv/utils.py b/pymc_marketing/clv/utils.py index 1abcc7933..055b03dd6 100644 --- a/pymc_marketing/clv/utils.py +++ b/pymc_marketing/clv/utils.py @@ -22,6 +22,7 @@ __all__ = [ "to_xarray", "customer_lifetime_value", + "rfm_segments", "rfm_summary", "rfm_train_test_split", ] @@ -304,9 +305,14 @@ def rfm_summary( This transforms a DataFrame of transaction data of the form: customer_id, datetime [, monetary_value] - to a DataFrame of the form: + to a DataFrame for CLV modeling: customer_id, frequency, recency, T [, monetary_value] + If the `include_first_transaction = True` argument is specified, a DataFrame for RFM segmentation is returned: + customer_id, frequency, recency, monetary_value + + This function is not required if using the `rfm_segments` utility. 
+ Adapted from lifetimes package https://github.com/CamDavidsonPilon/lifetimes/blob/41e394923ad72b17b5da93e88cfabab43f51abe2/lifetimes/utils.py#L230 @@ -320,7 +326,7 @@ def rfm_summary( Column in the transactions DataFrame that denotes the datetime the purchase was made. monetary_value_col: string, optional Column in the transactions DataFrame that denotes the monetary value of the transaction. - Optional; only needed for spend estimation models like the Gamma-Gamma model. + Optional; only needed for RFM segmentation and spend estimation models like the Gamma-Gamma model. observation_period_end: Union[str, pd.Period, datetime], optional A string or datetime to denote the final date of the study. Events after this date are truncated. If not given, defaults to the max 'datetime_col'. @@ -387,25 +393,33 @@ def rfm_summary( datetime_col ].agg(["min", "max", "count"]) - if not include_first_transaction: - # subtract 1 from count, as we ignore their first order. - customers["frequency"] = customers["count"] - 1 - else: - customers["frequency"] = customers["count"] + # subtract 1 from count, as we ignore the first order. 
+ customers["frequency"] = customers["count"] - 1 - customers["T"] = ( - (observation_period_end_ts - customers["min"]) + customers["recency"] = ( + (pd.to_datetime(customers["max"]) - pd.to_datetime(customers["min"])) # type: ignore / np.timedelta64(1, time_unit) / time_scaler ) - customers["recency"] = ( - (pd.to_datetime(customers["max"]) - pd.to_datetime(customers["min"])) # type: ignore + + customers["T"] = ( + (observation_period_end_ts - customers["min"]) / np.timedelta64(1, time_unit) / time_scaler ) summary_columns = ["frequency", "recency", "T"] + if include_first_transaction: + # add the first order back to the frequency count + customers["frequency"] = customers["frequency"] + 1 + + # change recency to segmentation definition + customers["recency"] = customers["T"] - customers["recency"] + + # T column is not used for segmentation + summary_columns = ["frequency", "recency"] + if monetary_value_col: if not include_first_transaction: # create an index of all the first purchases @@ -597,3 +611,202 @@ def rfm_train_test_split( train_test_rfm_data["test_T"] = time_delta / time_scaler # type: ignore return train_test_rfm_data + + +def rfm_segments( + transactions: pd.DataFrame, + customer_id_col: str, + datetime_col: str, + monetary_value_col: str, + segment_config: dict | None = None, + observation_period_end: str | pd.Period | datetime | None = None, + datetime_format: str | None = None, + time_unit: str = "D", + time_scaler: float | None = 1, + sort_transactions: bool | None = True, +) -> pd.DataFrame: + """ + Assign customers to segments based on spending behavior derived from RFM scores. + + This transforms a DataFrame of transaction data of the form: + customer_id, datetime, monetary_value + to a DataFrame of the form: + customer_id, frequency, recency, monetary_value, rfm_score, segment + + Customer purchasing data is aggregated into three variables: `recency`, `frequency`, and `monetary_value`. 
+    Quartiles are estimated for each variable, and a three-digit RFM score is then assigned to each customer.
+    For example, a customer with a score of '234' is in the second quartile for `recency`, third quartile for
+    `frequency`, and fourth quartile for `monetary_value`.
+    RFM scores corresponding to segments such as "Top Spender", "Repeat Customer", or "At Risk Customer" are determined, and
+    customers are then segmented based on their RFM score.
+
+    By default, the following segments are created:
+    "Premium Customer": Customers in top 2 quartiles for all variables.
+    "Repeat Customer": Customers in top 2 quartiles for frequency, and either recency or monetary value.
+    "Top Spender": Customers in top 2 quartiles for monetary value, and either frequency or recency.
+    "At Risk Customer": Customers in bottom 2 quartiles for two or more variables.
+    "Inactive Customer": Customers in bottom quartile for two or more variables.
+    Customers with unspecified RFM scores will be assigned to a segment named "Other".
+
+    If an alternative segmentation approach is desired, use
+    `rfm_summary(include_first_transaction=True, *args, **kwargs)` instead to preprocess data for segmentation.
+    In either case, the returned DataFrame cannot be used for modeling.
+    If assigning model predictions to RFM segments, create a separate DataFrame for modeling and join by Customer ID.
+
+    Parameters
+    ----------
+    transactions: :obj: DataFrame
+        A Pandas DataFrame that contains the customer_id col and the datetime col.
+    customer_id_col: string
+        Column in the transactions DataFrame that denotes the customer_id.
+    datetime_col: string
+        Column in the transactions DataFrame that denotes the datetime the purchase was made.
+    monetary_value_col: string
+        Column in the transactions DataFrame that denotes the monetary value of the transaction.
+        Required for RFM segmentation. 
+    segment_config: dict, optional
+        Dictionary containing segment names and list of RFM score assignments;
+        key/value pairs should be formatted as `{"segment": ['111', '123', '321'], ...}`.
+        If not provided, default segment names and definitions are applied.
+    observation_period_end: Union[str, pd.Period, datetime, None], optional
+        A string or datetime to denote the final date of the study.
+        Events after this date are truncated. If not given, defaults to the max 'datetime_col'.
+    datetime_format: string, optional
+        A string that represents the timestamp format. Useful if Pandas can't understand
+        the provided format.
+    time_unit: string, optional
+        Time granularity for study.
+        Default: 'D' for days. Possible values listed here:
+        https://numpy.org/devdocs/reference/arrays.datetime.html#datetime-units
+    time_scaler: float, optional
+        Default: 1. Useful for scaling recency & T to a different time granularity. Example:
+        With time_unit='D' and time_scaler=1, we get recency=591 and T=632
+        With time_unit='h' and time_scaler=24, we get recency=590.125 and T=631.375
+        This is useful if predictions in a different time granularity are desired,
+        and can also help with model convergence for study periods of many years.
+    sort_transactions: bool, optional
+        Default: True
+        If raw data is already sorted in chronological order, set to `False` to improve computational efficiency.
+ 
+    Returns
+    -------
+    :obj: DataFrame:
+        customer_id, frequency, recency, monetary_value, rfm_score, segment
+    """
+
+    rfm_data = rfm_summary(
+        transactions,
+        customer_id_col=customer_id_col,
+        datetime_col=datetime_col,
+        monetary_value_col=monetary_value_col,
+        observation_period_end=observation_period_end,
+        datetime_format=datetime_format,
+        time_unit=time_unit,
+        time_scaler=time_scaler,
+        include_first_transaction=True,
+        sort_transactions=sort_transactions,
+    )
+
+    # iteratively assign quartile labels for each row/variable
+    for column_name in zip(
+        ["r_quartile", "f_quartile", "m_quartile"],
+        ["recency", "frequency", "monetary_value"],
+        strict=False,
+    ):
+        # If data has many repeat values, fewer than 4 bins will be returned.
+        # These try blocks will modify labelling for fewer bins.
+        try:
+            labels = _rfm_quartile_labels(column_name[0], 5)
+            rfm_data[column_name[0]] = pd.qcut(
+                rfm_data[column_name[1]], q=4, labels=labels, duplicates="drop"
+            ).astype(str)
+        except ValueError:
+            try:
+                labels = _rfm_quartile_labels(column_name[0], 4)
+                rfm_data[column_name[0]] = pd.qcut(
+                    rfm_data[column_name[1]], q=4, labels=labels, duplicates="drop"
+                ).astype(str)
+            except ValueError:
+                labels = _rfm_quartile_labels(column_name[0], 3)
+                rfm_data[column_name[0]] = pd.qcut(
+                    rfm_data[column_name[1]], q=4, labels=labels, duplicates="drop"
+                ).astype(str)
+                warnings.warn(
+                    f"RFM score will not exceed 2 for {column_name[0]}. 
Specify a custom segment_config", + UserWarning, + stacklevel=1, + ) + + rfm_data = pd.eval( # type: ignore + "rfm_score = rfm_data.r_quartile + rfm_data.f_quartile + rfm_data.m_quartile", + target=rfm_data, + ) + + if segment_config is None: + segment_config = _default_rfm_segment_config + + segment_names = list(segment_config.keys()) + + # create catch-all "Other" segment and assign defined segments from config + rfm_data["segment"] = "Other" + + for key in segment_names: + rfm_data.loc[rfm_data["rfm_score"].isin(segment_config[key]), "segment"] = key + + # drop unnecessary columns + rfm_data = rfm_data.drop(columns=["r_quartile", "f_quartile", "m_quartile"]) + + return rfm_data + + +def _rfm_quartile_labels(column_name, max_label_range): + """called internally by rfm_segments to label quartiles for each variable""" + # recency labels must be reversed because lower values are more desirable + if column_name == "r_quartile": + return list(range(max_label_range - 1, 0, -1)) + else: + return range(1, max_label_range) + + +_default_rfm_segment_config = { + "Premium Customer": [ + "334", + "443", + "444", + "344", + "434", + "433", + "343", + "333", + ], + "Repeat Customer": ["244", "234", "232", "332", "143", "233", "243"], + "Top Spender": [ + "424", + "414", + "144", + "314", + "324", + "124", + "224", + "423", + "413", + "133", + "323", + "313", + "134", + ], + "At Risk Customer": [ + "422", + "223", + "212", + "122", + "222", + "132", + "322", + "312", + "412", + "123", + "214", + ], + "Inactive Customer": ["411", "111", "113", "114", "112", "211", "311"], +} diff --git a/tests/clv/test_utils.py b/tests/clv/test_utils.py index 46323b6b0..f6711534b 100644 --- a/tests/clv/test_utils.py +++ b/tests/clv/test_utils.py @@ -23,8 +23,10 @@ from pymc_marketing.clv import BetaGeoModel, GammaGammaModel, ParetoNBDModel from pymc_marketing.clv.utils import ( _find_first_transactions, + _rfm_quartile_labels, clv_summary, customer_lifetime_value, + rfm_segments, rfm_summary, 
rfm_train_test_split, to_xarray, @@ -350,20 +352,20 @@ def transaction_data(self) -> pd.DataFrame: [5, "2015-01-18", 8], [6, "2015-02-02", 5], ] - return pd.DataFrame(d, columns=["id", "date", "monetary_value"]) + return pd.DataFrame(d, columns=["identifier", "date", "monetary_value"]) def test_find_first_transactions_test_period_end_none(self, transaction_data): max_date = transaction_data["date"].max() pd.testing.assert_frame_equal( left=_find_first_transactions( transactions=transaction_data, - customer_id_col="id", + customer_id_col="identifier", datetime_col="date", observation_period_end=None, ), right=_find_first_transactions( transactions=transaction_data, - customer_id_col="id", + customer_id_col="identifier", datetime_col="date", observation_period_end=max_date, ), @@ -382,7 +384,7 @@ def test_find_first_transactions_returns_correct_results( actual = _find_first_transactions( transaction_data, - "id", + "identifier", "date", observation_period_end=today, ) @@ -402,7 +404,7 @@ def test_find_first_transactions_returns_correct_results( [5, pd.Period("2015-01-18", "D"), False], [6, pd.Period("2015-02-02", "D"), True], ], - columns=["id", "date", "first"], + columns=["identifier", "date", "first"], index=[0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13], ) # row indices are skipped for time periods with multiple transactions assert_frame_equal(actual, expected) @@ -420,7 +422,7 @@ def test_find_first_transactions_with_specific_non_daily_frequency( actual = _find_first_transactions( transaction_data, - "id", + "identifier", "date", observation_period_end=today, time_unit="W", @@ -437,7 +439,7 @@ def test_find_first_transactions_with_specific_non_daily_frequency( [5, pd.Period("2015-01-12/2015-01-18", "W-SUN"), True], [6, pd.Period("2015-02-02/2015-02-08", "W-SUN"), True], ], - columns=["id", "date", "first"], + columns=["identifier", "date", "first"], index=actual.index, ) # we shouldn't really care about row ordering or indexing, but assert_frame_equals is strict 
about it assert_frame_equal(actual, expected) @@ -455,7 +457,7 @@ def test_find_first_transactions_with_monetary_values( actual = _find_first_transactions( transaction_data, - "id", + "identifier", "date", "monetary_value", observation_period_end=today, @@ -476,7 +478,7 @@ def test_find_first_transactions_with_monetary_values( [5, pd.Period("2015-01-18", "D"), 8, False], [6, pd.Period("2015-02-02", "D"), 5, True], ], - columns=["id", "date", "monetary_value", "first"], + columns=["identifier", "date", "monetary_value", "first"], ) assert_frame_equal(actual, expected) @@ -493,7 +495,7 @@ def test_find_first_transactions_with_monetary_values_with_specific_non_daily_fr actual = _find_first_transactions( transaction_data, - "id", + "identifier", "date", "monetary_value", observation_period_end=today, @@ -511,7 +513,7 @@ def test_find_first_transactions_with_monetary_values_with_specific_non_daily_fr [5, pd.Period("2015-01-12/2015-01-18", "W-SUN"), 12, True], [6, pd.Period("2015-02-02/2015-02-08", "W-SUN"), 5, True], ], - columns=["id", "date", "monetary_value", "first"], + columns=["identifier", "date", "monetary_value", "first"], ) assert_frame_equal(actual, expected) @@ -525,7 +527,7 @@ def test_rfm_summary_returns_correct_results(self, transaction_data, today): # https://github.com/CamDavidsonPilon/lifetimes/blob/aae339c5437ec31717309ba0ec394427e19753c4/tests/test_utils.py#L239 actual = rfm_summary( - transaction_data, "id", "date", observation_period_end=today + transaction_data, "identifier", "date", observation_period_end=today ) expected = pd.DataFrame( [ @@ -552,8 +554,8 @@ def test_rfm_summary_works_with_string_customer_ids(self): ["Y", "2015-01-02"], ["Y", "2015-01-05"], ] - df = pd.DataFrame(d, columns=["id", "date"]) - rfm_summary(df, "id", "date") + df = pd.DataFrame(d, columns=["identifier", "date"]) + rfm_summary(df, "identifier", "date") def test_rfm_summary_works_with_int_customer_ids_and_doesnt_coerce_to_float(self): # Test adapted from @@ -567,8 
+569,8 @@ def test_rfm_summary_works_with_int_customer_ids_and_doesnt_coerce_to_float(self [2, "2015-01-02"], [2, "2015-01-05"], ] - df = pd.DataFrame(d, columns=["id", "date"]) - actual = rfm_summary(df, "id", "date") + df = pd.DataFrame(d, columns=["identifier", "date"]) + actual = rfm_summary(df, "identifier", "date") assert actual.index.dtype == "int64" def test_rfm_summary_with_specific_datetime_format( @@ -585,7 +587,7 @@ def test_rfm_summary_with_specific_datetime_format( today = "20150207" actual = rfm_summary( transaction_data, - "id", + "identifier", "date", observation_period_end=today, datetime_format=format, @@ -614,7 +616,7 @@ def test_rfm_summary_non_daily_frequency( today = "20150207" actual = rfm_summary( transaction_data, - "id", + "identifier", "date", observation_period_end=today, time_unit="W", @@ -642,7 +644,7 @@ def test_rfm_summary_monetary_values_and_first_transactions( today = "20150207" actual = rfm_summary( transaction_data, - "id", + "identifier", "date", monetary_value_col="monetary_value", observation_period_end=today, @@ -662,7 +664,7 @@ def test_rfm_summary_monetary_values_and_first_transactions( actual_first_trans = rfm_summary( transaction_data, - "id", + "identifier", "date", monetary_value_col="monetary_value", observation_period_end=today, @@ -670,14 +672,14 @@ def test_rfm_summary_monetary_values_and_first_transactions( ) expected_first_trans = pd.DataFrame( [ - [1, 2.0, 36.0, 37.0, 1.5], - [2, 1.0, 0.0, 37.0, 2], - [3, 3.0, 4.0, 37.0, 3], - [4, 3.0, 20.0, 22.0, 6], - [5, 3.0, 2.0, 22.0, 4], - [6, 1.0, 0.0, 5.0, 5], + [1, 2.0, 1.0, 1.5], + [2, 1.0, 37.0, 2], + [3, 3.0, 33.0, 3], + [4, 3.0, 2.0, 6], + [5, 3.0, 20.0, 4], + [6, 1.0, 5.0, 5], ], - columns=["customer_id", "frequency", "recency", "T", "monetary_value"], + columns=["customer_id", "frequency", "recency", "monetary_value"], ) assert_frame_equal(actual_first_trans, expected_first_trans) @@ -695,9 +697,11 @@ def 
test_rfm_summary_will_choose_the_correct_first_order_to_drop_in_monetary_tra ) sales = pd.Series([10, 20, 25]) transaction_data = pd.DataFrame( - {"date": dates_ordered, "id": cust, "sales": sales} + {"date": dates_ordered, "identifier": cust, "sales": sales} + ) + summary_ordered_data = rfm_summary( + transaction_data, "identifier", "date", "sales" ) - summary_ordered_data = rfm_summary(transaction_data, "id", "date", "sales") dates_unordered = pd.to_datetime( pd.Series( @@ -706,9 +710,11 @@ def test_rfm_summary_will_choose_the_correct_first_order_to_drop_in_monetary_tra ) sales = pd.Series([20, 10, 25]) transaction_data = pd.DataFrame( - {"date": dates_unordered, "id": cust, "sales": sales} + {"date": dates_unordered, "identifier": cust, "sales": sales} + ) + summary_unordered_data = rfm_summary( + transaction_data, "identifier", "date", "sales" ) - summary_unordered_data = rfm_summary(transaction_data, "id", "date", "sales") assert_frame_equal(summary_ordered_data, summary_unordered_data) assert summary_ordered_data["monetary_value"].loc[0] == 22.5 @@ -744,21 +750,21 @@ def test_rfm_summary_squashes_period_purchases_to_one_purchase(self): # https://github.com/CamDavidsonPilon/lifetimes/blob/aae339c5437ec31717309ba0ec394427e19753c4/tests/test_utils.py#L472 transactions = pd.DataFrame( - [[1, "2015-01-01"], [1, "2015-01-01"]], columns=["id", "t"] + [[1, "2015-01-01"], [1, "2015-01-01"]], columns=["identifier", "t"] ) - actual = rfm_summary(transactions, "id", "t", time_unit="W") + actual = rfm_summary(transactions, "identifier", "t", time_unit="W") assert actual.loc[0]["frequency"] == 1.0 - 1.0 def test_clv_summary_warning(self, transaction_data): with pytest.warns(UserWarning, match="clv_summary was renamed to rfm_summary"): - clv_summary(transaction_data, "id", "date") + clv_summary(transaction_data, "identifier", "date") def test_rfm_train_test_split(self, transaction_data): # Test adapted from # 
https://github.com/CamDavidsonPilon/lifetimes/blob/aae339c5437ec31717309ba0ec394427e19753c4/tests/test_utils.py#L374 train_end = "2015-02-01" - actual = rfm_train_test_split(transaction_data, "id", "date", train_end) + actual = rfm_train_test_split(transaction_data, "identifier", "date", train_end) assert actual.loc[0]["test_frequency"] == 1 assert actual.loc[1]["test_frequency"] == 0 @@ -781,7 +787,11 @@ def test_rfm_train_test_split_throws_better_error_if_test_period_end_is_too_earl with pytest.raises(ValueError, match=error_msg): rfm_train_test_split( - transaction_data, "id", "date", train_end, test_period_end=test_end + transaction_data, + "identifier", + "date", + train_end, + test_period_end=test_end, ) def test_rfm_train_test_split_works_with_specific_frequency(self, transaction_data): @@ -792,7 +802,7 @@ def test_rfm_train_test_split_works_with_specific_frequency(self, transaction_da train_end = "2015-02-01" actual = rfm_train_test_split( transaction_data, - "id", + "identifier", "date", train_end, test_period_end=test_end, @@ -824,7 +834,7 @@ def test_rfm_train_test_split_gives_correct_date_boundaries(self, transaction_da actual = rfm_train_test_split( transaction_data, - "id", + "identifier", "date", train_period_end="2015-02-01", test_period_end="2015-02-04", @@ -840,7 +850,7 @@ def test_rfm_train_test_split_with_monetary_value(self, transaction_data): train_end = "2015-02-01" actual = rfm_train_test_split( transaction_data, - "id", + "identifier", "date", train_end, test_period_end=test_end, @@ -849,4 +859,70 @@ def test_rfm_train_test_split_with_monetary_value(self, transaction_data): assert (actual["monetary_value"] == [0, 0, 3, 0, 4.5]).all() assert (actual["test_monetary_value"] == [2, 0, 0, 6, 0]).all() - # check test_monetary_value is being aggregated correctly for time periods with multiple purchases + @pytest.mark.parametrize("config", (None, "custom")) + def test_rfm_segmentation_config(self, transaction_data, config): + if config is not None: 
+ config = { + "Test Segment": [ + "111", + "222", + "333", + "444", + ] + } + expected = ["Other", "Test Segment", "Other", "Other", "Other", "Other"] + else: + expected = [ + "Other", + "Inactive Customer", + "At Risk Customer", + "Premium Customer", + "Repeat Customer", + "Top Spender", + ] + + actual = rfm_segments( + transaction_data, + "identifier", + "date", + "monetary_value", + segment_config=config, + ) + + assert (actual["segment"] == expected).all() + + def test_rfm_segmentation_warning(self): + # this data will only return two bins for the frequency variable + d = [ + [1, "2015-01-01", 1], + [1, "2015-02-06", 2], + [2, "2015-01-01", 2], + [3, "2015-01-02", 1], + [3, "2015-01-05", 3], + [4, "2015-01-16", 4], + [4, "2015-02-05", 5], + [5, "2015-01-17", 1], + [5, "2015-01-18", 2], + [5, "2015-01-19", 2], + ] + repetitive_data = pd.DataFrame(d, columns=["id", "date", "monetary_value"]) + + with pytest.warns( + UserWarning, + match="RFM score will not exceed 2 for f_quartile. Specify a custom segment_config", + ): + rfm_segments( + repetitive_data, + "id", + "date", + "monetary_value", + ) + + def test_rfm_quartile_labels(self): + # assert recency labels are in reverse order + recency = _rfm_quartile_labels("r_quartile", 5) + assert recency == [4, 3, 2, 1] + + # assert max_quartile_range = 4 returns a range function for three labels + frequency = _rfm_quartile_labels("f_quartile", 4) + assert frequency == range(1, 4)