From acfbdce016a9ac7452dec43dee7a9d5bbc17c024 Mon Sep 17 00:00:00 2001 From: Qiao Zhongzheng Date: Mon, 19 Aug 2024 15:48:09 +0800 Subject: [PATCH 1/2] Manually set 'freq' for specific datasets --- src/uni2ts/data/builder/simple.py | 36 +++++++++++++++++++++++++------ 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/src/uni2ts/data/builder/simple.py b/src/uni2ts/data/builder/simple.py index 1a35007..c283c54 100644 --- a/src/uni2ts/data/builder/simple.py +++ b/src/uni2ts/data/builder/simple.py @@ -14,6 +14,7 @@ # limitations under the License. import argparse +from collections import defaultdict from dataclasses import dataclass from itertools import product from pathlib import Path @@ -26,15 +27,24 @@ from uni2ts.common.env import env from uni2ts.common.typing import GenFunc +from uni2ts.data.builder._base import DatasetBuilder from uni2ts.data.dataset import EvalDataset, SampleTimeSeriesType, TimeSeriesDataset from uni2ts.data.indexer import HuggingFaceDatasetIndexer from uni2ts.transform import Transformation -from ._base import DatasetBuilder +# Manually set the freq of the datasets whose freq can be inferred automatically. Default freq is H. 
+freq_dict = defaultdict( + lambda: "H", + { + "weather": "10T", + "weather_eval": "10T", + }, +) def _from_long_dataframe( df: pd.DataFrame, + dataset: str, offset: Optional[int] = None, date_offset: Optional[pd.Timestamp] = None, ) -> tuple[GenFunc, Features]: @@ -50,7 +60,11 @@ def example_gen_func() -> Generator[dict[str, Any], None, None]: yield { "target": item_df.to_numpy(), "start": item_df.index[0], - "freq": pd.infer_freq(item_df.index), + "freq": ( + pd.infer_freq(df.index) + if pd.infer_freq(df.index) is not None + else freq_dict[dataset] + ), "item_id": item_id, } @@ -68,6 +82,7 @@ def example_gen_func() -> Generator[dict[str, Any], None, None]: def _from_wide_dataframe( df: pd.DataFrame, + dataset: str, offset: Optional[int] = None, date_offset: Optional[pd.Timestamp] = None, ) -> tuple[GenFunc, Features]: @@ -83,7 +98,11 @@ def example_gen_func() -> Generator[dict[str, Any], None, None]: yield { "target": df.iloc[:, i].to_numpy(), "start": df.index[0], - "freq": pd.infer_freq(df.index), + "freq": ( + pd.infer_freq(df.index) + if pd.infer_freq(df.index) is not None + else freq_dict[dataset] + ), "item_id": f"item_{i}", } @@ -101,6 +120,7 @@ def example_gen_func() -> Generator[dict[str, Any], None, None]: def _from_wide_dataframe_multivariate( df: pd.DataFrame, + dataset: str, offset: Optional[int] = None, date_offset: Optional[pd.Timestamp] = None, ) -> tuple[GenFunc, Features]: @@ -113,7 +133,11 @@ def example_gen_func() -> Generator[dict[str, Any], None, None]: yield { "target": df.to_numpy().T, "start": df.index[0], - "freq": pd.infer_freq(df.index), + "freq": ( + pd.infer_freq(df.index) + if pd.infer_freq(df.index) is not None + else freq_dict[dataset] + ), "item_id": "item_0", } @@ -166,7 +190,7 @@ def build_dataset( ) example_gen_func, features = _from_dataframe( - df, offset=offset, date_offset=date_offset + df, dataset=self.dataset, offset=offset, date_offset=date_offset ) hf_dataset = datasets.Dataset.from_generator( example_gen_func, 
features=features @@ -218,7 +242,7 @@ def build_dataset(self, file: Path, dataset_type: str): " Valid options are 'long', 'wide', and 'wide_multivariate'." ) - example_gen_func, features = _from_dataframe(df) + example_gen_func, features = _from_dataframe(df, dataset=self.dataset) hf_dataset = datasets.Dataset.from_generator( example_gen_func, features=features ) From 1c5af9a2f5963664298fd7cbe2833305966280a8 Mon Sep 17 00:00:00 2001 From: Qiao Zhongzheng Date: Tue, 20 Aug 2024 11:19:29 +0800 Subject: [PATCH 2/2] Use arg to set 'freq' & Add user prompt. --- src/uni2ts/data/builder/simple.py | 80 ++++++++++++++++++++++--------- 1 file changed, 58 insertions(+), 22 deletions(-) diff --git a/src/uni2ts/data/builder/simple.py b/src/uni2ts/data/builder/simple.py index c283c54..b33178c 100644 --- a/src/uni2ts/data/builder/simple.py +++ b/src/uni2ts/data/builder/simple.py @@ -14,7 +14,6 @@ # limitations under the License. import argparse -from collections import defaultdict from dataclasses import dataclass from itertools import product from pathlib import Path @@ -27,29 +26,33 @@ from uni2ts.common.env import env from uni2ts.common.typing import GenFunc -from uni2ts.data.builder._base import DatasetBuilder from uni2ts.data.dataset import EvalDataset, SampleTimeSeriesType, TimeSeriesDataset from uni2ts.data.indexer import HuggingFaceDatasetIndexer from uni2ts.transform import Transformation -# Manually set the freq of the datasets whose freq can be inferred automatically. Default freq is H. 
-freq_dict = defaultdict( - lambda: "H", - { - "weather": "10T", - "weather_eval": "10T", - }, -) +from ._base import DatasetBuilder def _from_long_dataframe( df: pd.DataFrame, - dataset: str, offset: Optional[int] = None, date_offset: Optional[pd.Timestamp] = None, + freq: str = "H", ) -> tuple[GenFunc, Features]: items = df.item_id.unique() + # Infer the freq and generate the prompt + inferred_freq = pd.infer_freq(df.index) + + if inferred_freq is not None: + print( + f"Inferred frequency: {inferred_freq}. Using this value for the 'freq' parameter." + ) + else: + print( + f"Inferred frequency is None. Using predefined {freq} for the 'freq' parameter." + ) + def example_gen_func() -> Generator[dict[str, Any], None, None]: for item_id in items: item_df = df.query(f'item_id == "{item_id}"').drop("item_id", axis=1) @@ -63,7 +66,7 @@ def example_gen_func() -> Generator[dict[str, Any], None, None]: "freq": ( pd.infer_freq(df.index) if pd.infer_freq(df.index) is not None - else freq_dict[dataset] + else freq ), "item_id": item_id, } @@ -82,9 +85,9 @@ def example_gen_func() -> Generator[dict[str, Any], None, None]: def _from_wide_dataframe( df: pd.DataFrame, - dataset: str, offset: Optional[int] = None, date_offset: Optional[pd.Timestamp] = None, + freq: str = "H", ) -> tuple[GenFunc, Features]: if offset is not None: df = df.iloc[:offset] @@ -93,6 +96,18 @@ def _from_wide_dataframe( print(df) + # Infer the freq and generate the prompt + inferred_freq = pd.infer_freq(df.index) + + if inferred_freq is not None: + print( + f"Inferred frequency: {inferred_freq}. Using this value for the 'freq' parameter." + ) + else: + print( + f"Inferred frequency is None. Using predefined {freq} for the 'freq' parameter." 
+ ) + def example_gen_func() -> Generator[dict[str, Any], None, None]: for i in range(len(df.columns)): yield { @@ -101,7 +116,7 @@ def example_gen_func() -> Generator[dict[str, Any], None, None]: "freq": ( pd.infer_freq(df.index) if pd.infer_freq(df.index) is not None - else freq_dict[dataset] + else freq ), "item_id": f"item_{i}", } @@ -120,23 +135,33 @@ def example_gen_func() -> Generator[dict[str, Any], None, None]: def _from_wide_dataframe_multivariate( df: pd.DataFrame, - dataset: str, offset: Optional[int] = None, date_offset: Optional[pd.Timestamp] = None, + freq: str = "H", ) -> tuple[GenFunc, Features]: if offset is not None: df = df.iloc[:offset] elif date_offset is not None: df = df[df.index <= date_offset] + # Infer the freq and generate the prompt + inferred_freq = pd.infer_freq(df.index) + + if inferred_freq is not None: + print( + f"Inferred frequency: {inferred_freq}. Using this value for the 'freq' parameter." + ) + else: + print( + f"Inferred frequency is None. Using predefined {freq} for the 'freq' parameter." + ) + def example_gen_func() -> Generator[dict[str, Any], None, None]: yield { "target": df.to_numpy().T, "start": df.index[0], "freq": ( - pd.infer_freq(df.index) - if pd.infer_freq(df.index) is not None - else freq_dict[dataset] + pd.infer_freq(df.index) if pd.infer_freq(df.index) is not None else freq ), "item_id": "item_0", } @@ -169,6 +194,7 @@ def build_dataset( dataset_type: str, offset: Optional[int] = None, date_offset: Optional[pd.Timestamp] = None, + freq: str = "H", ): assert offset is None or date_offset is None, ( "One or neither offset and date_offset must be specified, but not both. 
" @@ -190,7 +216,7 @@ def build_dataset( ) example_gen_func, features = _from_dataframe( - df, dataset=self.dataset, offset=offset, date_offset=date_offset + df, freq=freq, offset=offset, date_offset=date_offset ) hf_dataset = datasets.Dataset.from_generator( example_gen_func, features=features @@ -227,7 +253,7 @@ class SimpleEvalDatasetBuilder(DatasetBuilder): def __post_init__(self): self.storage_path = Path(self.storage_path) - def build_dataset(self, file: Path, dataset_type: str): + def build_dataset(self, file: Path, dataset_type: str, freq: str = "H"): df = pd.read_csv(file, index_col=0, parse_dates=True) if dataset_type == "long": @@ -242,7 +268,7 @@ def build_dataset(self, file: Path, dataset_type: str): " Valid options are 'long', 'wide', and 'wide_multivariate'." ) - example_gen_func, features = _from_dataframe(df, dataset=self.dataset) + example_gen_func, features = _from_dataframe(df, freq=freq) hf_dataset = datasets.Dataset.from_generator( example_gen_func, features=features ) @@ -313,6 +339,13 @@ def generate_eval_builders( type=str, default=None, ) + # Define the `freq` argument with a default value. It is used as the fallback 'freq' only when the frequency cannot be inferred from the data. + parser.add_argument( + "--freq", + default="H", # Set the default value + help="The user specified frequency", + ) + args = parser.parse_args() SimpleDatasetBuilder(dataset=args.dataset_name).build_dataset( @@ -320,6 +353,7 @@ def generate_eval_builders( dataset_type=args.dataset_type, offset=args.offset, date_offset=pd.Timestamp(args.date_offset) if args.date_offset else None, + freq=args.freq, ) if args.offset is not None or args.date_offset is not None: @@ -331,4 +365,6 @@ def generate_eval_builders( prediction_length=None, context_length=None, patch_size=None, - ).build_dataset(file=Path(args.file_path), dataset_type=args.dataset_type) + ).build_dataset( + file=Path(args.file_path), dataset_type=args.dataset_type, freq=args.freq + )