```diff
@@ -1,48 +1,35 @@
-from datetime import datetime, timedelta
-from pathlib import Path
-
-from pyspark.sql import SparkSession
-
-from feast.driver_test_data import (
-    create_customer_daily_profile_df,
-    create_driver_hourly_stats_df,
-)
-
-CURRENT_DIR = Path(__file__).parent
-DRIVER_ENTITIES = [1001, 1002, 1003]
-CUSTOMER_ENTITIES = [201, 202, 203]
-START_DATE = datetime.strptime("2022-01-01", "%Y-%m-%d")
-END_DATE = START_DATE + timedelta(days=7)
-
-
 def bootstrap():
     # Bootstrap() will automatically be called from the init_repo() during `feast init`
-    generate_example_data(
-        spark_session=SparkSession.builder.getOrCreate(), base_dir=str(CURRENT_DIR),
-    )
-
+    import pathlib
+    from datetime import datetime, timedelta
 
-def example_data_exists(base_dir: str) -> bool:
-    for path in [
-        Path(base_dir) / "data" / "driver_hourly_stats",
-        Path(base_dir) / "data" / "customer_daily_profile",
-    ]:
-        if not path.exists():
-            return False
-    return True
+    from feast.driver_test_data import (
+        create_customer_daily_profile_df,
+        create_driver_hourly_stats_df,
+    )
 
+    repo_path = pathlib.Path(__file__).parent.absolute()
+    data_path = repo_path / "data"
+    data_path.mkdir(exist_ok=True)
 
-def generate_example_data(spark_session: SparkSession, base_dir: str) -> None:
-    spark_session.createDataFrame(
-        data=create_driver_hourly_stats_df(DRIVER_ENTITIES, START_DATE, END_DATE)
-    ).write.parquet(
-        path=str(Path(base_dir) / "data" / "driver_hourly_stats"), mode="overwrite",
+    driver_entities = [1001, 1002, 1003]
+    end_date = datetime.now().replace(microsecond=0, second=0, minute=0)
+    start_date = end_date - timedelta(days=15)
+    driver_stats_df = create_driver_hourly_stats_df(
+        driver_entities, start_date, end_date
+    )
+    driver_stats_df.to_parquet(
+        path=str(data_path / "driver_hourly_stats.parquet"),
+        allow_truncated_timestamps=True,
     )
 
-    spark_session.createDataFrame(
-        data=create_customer_daily_profile_df(CUSTOMER_ENTITIES, START_DATE, END_DATE)
-    ).write.parquet(
-        path=str(Path(base_dir) / "data" / "customer_daily_profile"), mode="overwrite",
+    customer_entities = [201, 202, 203]
+    customer_profile_df = create_customer_daily_profile_df(
+        customer_entities, start_date, end_date
+    )
+    customer_profile_df.to_parquet(
+        path=str(data_path / "customer_daily_profile.parquet"),
+        allow_truncated_timestamps=True,
     )
 
 
```
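The rewrite drops the PySpark session entirely: Feast's test-data helpers already return pandas DataFrames, so `to_parquet` writes them directly, and `allow_truncated_timestamps=True` is forwarded to the pyarrow engine so timestamp columns can be coerced without raising on lost sub-second precision. A read-back check of the generated files could look like the sketch below (not part of the commit; `my_feature_repo` is a hypothetical path standing in for wherever `feast init` created the repo, and the `event_timestamp` column is assumed from Feast's test-data generators):

```python
import pathlib

import pandas as pd

# Hypothetical repo location created by `feast init`.
repo_path = pathlib.Path("my_feature_repo")

for name in ("driver_hourly_stats", "customer_daily_profile"):
    df = pd.read_parquet(repo_path / "data" / f"{name}.parquet")
    # The new bootstrap writes a 15-day window ending at the current hour.
    print(name, df.shape, df["event_timestamp"].min(), df["event_timestamp"].max())
```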