diff --git a/src/uni2ts/data/builder/simple.py b/src/uni2ts/data/builder/simple.py index 1a35007..b33178c 100644 --- a/src/uni2ts/data/builder/simple.py +++ b/src/uni2ts/data/builder/simple.py @@ -37,9 +37,22 @@ def _from_long_dataframe( df: pd.DataFrame, offset: Optional[int] = None, date_offset: Optional[pd.Timestamp] = None, + freq: str = "H", ) -> tuple[GenFunc, Features]: items = df.item_id.unique() + # Infer the frequency from the index and print which value will be used + inferred_freq = pd.infer_freq(df.index) + + if inferred_freq is not None: + print( + f"Inferred frequency: {inferred_freq}. Using this value for the 'freq' parameter." + ) + else: + print( + f"Inferred frequency is None. Using predefined {freq} for the 'freq' parameter." + ) + def example_gen_func() -> Generator[dict[str, Any], None, None]: for item_id in items: item_df = df.query(f'item_id == "{item_id}"').drop("item_id", axis=1) @@ -50,7 +63,11 @@ def example_gen_func() -> Generator[dict[str, Any], None, None]: yield { "target": item_df.to_numpy(), "start": item_df.index[0], - "freq": pd.infer_freq(item_df.index), + "freq": ( + pd.infer_freq(df.index) + if pd.infer_freq(df.index) is not None + else freq + ), "item_id": item_id, } @@ -70,6 +87,7 @@ def _from_wide_dataframe( df: pd.DataFrame, offset: Optional[int] = None, date_offset: Optional[pd.Timestamp] = None, + freq: str = "H", ) -> tuple[GenFunc, Features]: if offset is not None: df = df.iloc[:offset] @@ -78,12 +96,28 @@ def _from_wide_dataframe( print(df) + # Infer the frequency from the index and print which value will be used + inferred_freq = pd.infer_freq(df.index) + + if inferred_freq is not None: + print( + f"Inferred frequency: {inferred_freq}. Using this value for the 'freq' parameter." + ) + else: + print( + f"Inferred frequency is None. Using predefined {freq} for the 'freq' parameter." 
+ ) + def example_gen_func() -> Generator[dict[str, Any], None, None]: for i in range(len(df.columns)): yield { "target": df.iloc[:, i].to_numpy(), "start": df.index[0], - "freq": pd.infer_freq(df.index), + "freq": ( + pd.infer_freq(df.index) + if pd.infer_freq(df.index) is not None + else freq + ), "item_id": f"item_{i}", } @@ -103,17 +137,32 @@ def _from_wide_dataframe_multivariate( df: pd.DataFrame, offset: Optional[int] = None, date_offset: Optional[pd.Timestamp] = None, + freq: str = "H", ) -> tuple[GenFunc, Features]: if offset is not None: df = df.iloc[:offset] elif date_offset is not None: df = df[df.index <= date_offset] + # Infer the frequency from the index and print which value will be used + inferred_freq = pd.infer_freq(df.index) + + if inferred_freq is not None: + print( + f"Inferred frequency: {inferred_freq}. Using this value for the 'freq' parameter." + ) + else: + print( + f"Inferred frequency is None. Using predefined {freq} for the 'freq' parameter." + ) + def example_gen_func() -> Generator[dict[str, Any], None, None]: yield { "target": df.to_numpy().T, "start": df.index[0], - "freq": pd.infer_freq(df.index), + "freq": ( + pd.infer_freq(df.index) if pd.infer_freq(df.index) is not None else freq + ), "item_id": "item_0", } @@ -145,6 +194,7 @@ def build_dataset( dataset_type: str, offset: Optional[int] = None, date_offset: Optional[pd.Timestamp] = None, + freq: str = "H", ): assert offset is None or date_offset is None, ( "One or neither offset and date_offset must be specified, but not both. 
" @@ -166,7 +216,7 @@ def build_dataset( ) example_gen_func, features = _from_dataframe( - df, offset=offset, date_offset=date_offset + df, freq=freq, offset=offset, date_offset=date_offset ) hf_dataset = datasets.Dataset.from_generator( example_gen_func, features=features @@ -203,7 +253,7 @@ class SimpleEvalDatasetBuilder(DatasetBuilder): def __post_init__(self): self.storage_path = Path(self.storage_path) - def build_dataset(self, file: Path, dataset_type: str): + def build_dataset(self, file: Path, dataset_type: str, freq: str = "H"): df = pd.read_csv(file, index_col=0, parse_dates=True) if dataset_type == "long": @@ -218,7 +268,7 @@ def build_dataset(self, file: Path, dataset_type: str): " Valid options are 'long', 'wide', and 'wide_multivariate'." ) - example_gen_func, features = _from_dataframe(df) + example_gen_func, features = _from_dataframe(df, freq=freq) hf_dataset = datasets.Dataset.from_generator( example_gen_func, features=features ) @@ -289,6 +339,13 @@ def generate_eval_builders( type=str, default=None, ) + # Define the `--freq` argument with a default value. It is used as the fallback when the frequency cannot be inferred from the data. + parser.add_argument( + "--freq", + default="H", # Set the default value + help="The user specified frequency", + ) + args = parser.parse_args() SimpleDatasetBuilder(dataset=args.dataset_name).build_dataset( @@ -296,6 +353,7 @@ def generate_eval_builders( dataset_type=args.dataset_type, offset=args.offset, date_offset=pd.Timestamp(args.date_offset) if args.date_offset else None, + freq=args.freq, ) if args.offset is not None or args.date_offset is not None: @@ -307,4 +365,6 @@ def generate_eval_builders( prediction_length=None, context_length=None, patch_size=None, - ).build_dataset(file=Path(args.file_path), dataset_type=args.dataset_type) + ).build_dataset( + file=Path(args.file_path), dataset_type=args.dataset_type, freq=args.freq + )