|
import random
import unittest

import pandas as pd

from tests.integration.feature_repos.test_repo_configuration import (
    Environment,
    parametrize_online_test,
)

| 11 | + |
@parametrize_online_test
def test_online_retrieval(environment: Environment):
    """Check that online feature retrieval agrees with the source dataframes.

    Samples ten driver and ten customer entities from ``environment``, pairs
    them into entity rows, requests five driver/customer features through
    ``get_online_features`` and verifies that:

    * the response holds exactly the requested features plus the two entity
      key columns, named according to ``full_feature_names``;
    * every returned value equals the value found in the most recent
      dataframe row for that entity;
    * the entity row ``{"driver": 0, "customer_id": 0}`` yields ``None`` for
      every feature (presumably because these ids are not in the store --
      confirm against the environment's data generation).
    """
    fs = environment.feature_store
    full_feature_names = environment.test_repo_config.full_feature_names

    sample_drivers = random.sample(environment.driver_entities, 10)
    drivers_df = environment.driver_df[
        environment.driver_df["driver_id"].isin(sample_drivers)
    ]

    sample_customers = random.sample(environment.customer_entities, 10)
    customers_df = environment.customer_df[
        environment.customer_df["customer_id"].isin(sample_customers)
    ]

    # Pair the i-th sampled driver with the i-th sampled customer.
    entity_rows = [
        {"driver": d, "customer_id": c}
        for (d, c) in zip(sample_drivers, sample_customers)
    ]

    feature_refs = [
        "driver_stats:conv_rate",
        "driver_stats:avg_daily_trips",
        "customer_profile:current_balance",
        "customer_profile:avg_passenger_count",
        "customer_profile:lifetime_trip_count",
    ]
    unprefixed_feature_refs = [f.rsplit(":", 1)[-1] for f in feature_refs]

    online_features = fs.get_online_features(
        features=feature_refs,
        entity_rows=entity_rows,
        full_feature_names=full_feature_names,
    )
    assert online_features is not None

    # Convert the response once and reuse it for both the key checks and the
    # per-row value checks below.
    online_features_dict = online_features.to_dict()
    keys = online_features_dict.keys()
    # Add two for the driver id and the customer id entity keys.
    assert len(keys) == len(feature_refs) + 2, f"unexpected keys: {list(keys)}"
    for feature in feature_refs:
        if full_feature_names:
            assert feature.replace(":", "__") in keys
        else:
            assert feature.rsplit(":", 1)[-1] in keys
    # A bare feature view name must never show up as a response key.
    assert "driver_stats" not in keys and "customer_profile" not in keys

    tc = unittest.TestCase()
    for i, entity_row in enumerate(entity_rows):
        df_features = get_latest_feature_values_from_dataframes(
            drivers_df, customers_df, entity_row
        )

        assert df_features["customer_id"] == online_features_dict["customer_id"][i]
        assert df_features["driver_id"] == online_features_dict["driver_id"][i]
        for unprefixed_feature_ref in unprefixed_feature_refs:
            tc.assertEqual(
                df_features[unprefixed_feature_ref],
                online_features_dict[
                    response_feature_name(unprefixed_feature_ref, full_feature_names)
                ][i],
            )

    # Check what happens for missing values
    missing_responses_dict = fs.get_online_features(
        features=feature_refs,
        entity_rows=[{"driver": 0, "customer_id": 0}],
        full_feature_names=full_feature_names,
    ).to_dict()
    assert missing_responses_dict is not None
    for unprefixed_feature_ref in unprefixed_feature_refs:
        tc.assertIsNone(
            missing_responses_dict[
                response_feature_name(unprefixed_feature_ref, full_feature_names)
            ][0]
        )
| 89 | + |
| 90 | + |
def response_feature_name(feature: str, full_feature_names: bool) -> str:
    """Return the key under which ``feature`` appears in an online response.

    Args:
        feature: Unprefixed feature name, e.g. ``"conv_rate"``.
        full_feature_names: Whether the response was requested with
            feature-view-prefixed names.

    Returns:
        ``"customer_profile__<feature>"`` or ``"driver_stats__<feature>"`` when
        ``full_feature_names`` is true and ``feature`` is one of the features
        listed for that view below; otherwise ``feature`` unchanged.
    """
    # Without full feature names the response key is the bare feature name.
    if not full_feature_names:
        return feature

    if feature in {"current_balance", "avg_passenger_count", "lifetime_trip_count"}:
        return f"customer_profile__{feature}"

    if feature in {"conv_rate", "avg_daily_trips"}:
        return f"driver_stats__{feature}"

    # Features not listed above are passed through untouched.
    return feature
| 102 | + |
| 103 | + |
def get_latest_feature_values_from_dataframes(
    driver_df: pd.DataFrame, customer_df: pd.DataFrame, entity_row: dict
) -> dict:
    """Return the most recent driver and customer values for one entity row.

    Args:
        driver_df: Frame with at least ``driver_id`` and ``event_timestamp``
            columns.
        customer_df: Frame with at least ``customer_id`` and
            ``event_timestamp`` columns.
        entity_row: Mapping with a ``"driver"`` key (matched against
            ``driver_id``) and a ``"customer_id"`` key.

    Returns:
        A dict mapping column name to value, built from the customer row with
        the greatest ``event_timestamp`` and then updated with the driver row
        with the greatest ``event_timestamp``. For columns present in both
        frames (e.g. ``event_timestamp``) the driver value wins.

    Raises:
        ValueError: If no driver row or no customer row matches ``entity_row``.
    """
    driver_rows = driver_df[driver_df["driver_id"] == entity_row["driver"]]
    latest_driver_row = _latest_row_as_dict(
        driver_rows, f"driver_id == {entity_row['driver']!r}"
    )
    customer_rows = customer_df[customer_df["customer_id"] == entity_row["customer_id"]]
    latest_customer_row = _latest_row_as_dict(
        customer_rows, f"customer_id == {entity_row['customer_id']!r}"
    )

    # Driver values are applied last, so they override shared column names.
    latest_customer_row.update(latest_driver_row)
    return latest_customer_row


def _latest_row_as_dict(rows: pd.DataFrame, description: str) -> dict:
    """Return the row of ``rows`` with the greatest ``event_timestamp`` as a dict."""
    if rows.empty:
        raise ValueError(f"no rows found for {description}")
    # Index labels can repeat, in which case a label-based ``.loc`` lookup
    # returns several rows and ``to_dict`` yields a nested mapping. Renumbering
    # makes the label returned by ``idxmax`` identify exactly one row.
    rows = rows.reset_index(drop=True)
    return rows.loc[rows["event_timestamp"].idxmax()].to_dict()
0 commit comments