Skip to content

Commit

Permalink
Allow manually setting 'freq' for specific datasets with simple data …
Browse files Browse the repository at this point in the history
…builder (#110)

Fix the bug in issue #107. Allow `freq` to be set manually for
datasets where `pd.infer_freq` returns `None`.
  • Loading branch information
zqiao11 authored Aug 22, 2024
1 parent 27616d9 commit 901a987
Showing 1 changed file with 67 additions and 7 deletions.
74 changes: 67 additions & 7 deletions src/uni2ts/data/builder/simple.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,22 @@ def _from_long_dataframe(
df: pd.DataFrame,
offset: Optional[int] = None,
date_offset: Optional[pd.Timestamp] = None,
freq: str = "H",
) -> tuple[GenFunc, Features]:
items = df.item_id.unique()

# Infer the freq and generate the prompt
inferred_freq = pd.infer_freq(df.index)

if inferred_freq is not None:
print(
f"Inferred frequency: {inferred_freq}. Using this value for the 'freq' parameter."
)
else:
print(
f"Inferred frequency is None. Using predefined {freq} for the 'freq' parameter."
)

def example_gen_func() -> Generator[dict[str, Any], None, None]:
for item_id in items:
item_df = df.query(f'item_id == "{item_id}"').drop("item_id", axis=1)
Expand All @@ -50,7 +63,11 @@ def example_gen_func() -> Generator[dict[str, Any], None, None]:
yield {
"target": item_df.to_numpy(),
"start": item_df.index[0],
"freq": pd.infer_freq(item_df.index),
"freq": (
pd.infer_freq(df.index)
if pd.infer_freq(df.index) is not None
else freq
),
"item_id": item_id,
}

Expand All @@ -70,6 +87,7 @@ def _from_wide_dataframe(
df: pd.DataFrame,
offset: Optional[int] = None,
date_offset: Optional[pd.Timestamp] = None,
freq: str = "H",
) -> tuple[GenFunc, Features]:
if offset is not None:
df = df.iloc[:offset]
Expand All @@ -78,12 +96,28 @@ def _from_wide_dataframe(

print(df)

# Infer the freq and generate the prompt
inferred_freq = pd.infer_freq(df.index)

if inferred_freq is not None:
print(
f"Inferred frequency: {inferred_freq}. Using this value for the 'freq' parameter."
)
else:
print(
f"Inferred frequency is None. Using predefined {freq} for the 'freq' parameter."
)

def example_gen_func() -> Generator[dict[str, Any], None, None]:
for i in range(len(df.columns)):
yield {
"target": df.iloc[:, i].to_numpy(),
"start": df.index[0],
"freq": pd.infer_freq(df.index),
"freq": (
pd.infer_freq(df.index)
if pd.infer_freq(df.index) is not None
else freq
),
"item_id": f"item_{i}",
}

Expand All @@ -103,17 +137,32 @@ def _from_wide_dataframe_multivariate(
df: pd.DataFrame,
offset: Optional[int] = None,
date_offset: Optional[pd.Timestamp] = None,
freq: str = "H",
) -> tuple[GenFunc, Features]:
if offset is not None:
df = df.iloc[:offset]
elif date_offset is not None:
df = df[df.index <= date_offset]

# Infer the freq and generate the prompt
inferred_freq = pd.infer_freq(df.index)

if inferred_freq is not None:
print(
f"Inferred frequency: {inferred_freq}. Using this value for the 'freq' parameter."
)
else:
print(
f"Inferred frequency is None. Using predefined {freq} for the 'freq' parameter."
)

def example_gen_func() -> Generator[dict[str, Any], None, None]:
yield {
"target": df.to_numpy().T,
"start": df.index[0],
"freq": pd.infer_freq(df.index),
"freq": (
pd.infer_freq(df.index) if pd.infer_freq(df.index) is not None else freq
),
"item_id": "item_0",
}

Expand Down Expand Up @@ -145,6 +194,7 @@ def build_dataset(
dataset_type: str,
offset: Optional[int] = None,
date_offset: Optional[pd.Timestamp] = None,
freq: str = "H",
):
assert offset is None or date_offset is None, (
"One or neither offset and date_offset must be specified, but not both. "
Expand All @@ -166,7 +216,7 @@ def build_dataset(
)

example_gen_func, features = _from_dataframe(
df, offset=offset, date_offset=date_offset
df, freq=freq, offset=offset, date_offset=date_offset
)
hf_dataset = datasets.Dataset.from_generator(
example_gen_func, features=features
Expand Down Expand Up @@ -203,7 +253,7 @@ class SimpleEvalDatasetBuilder(DatasetBuilder):
def __post_init__(self):
self.storage_path = Path(self.storage_path)

def build_dataset(self, file: Path, dataset_type: str):
def build_dataset(self, file: Path, dataset_type: str, freq: str = "H"):
df = pd.read_csv(file, index_col=0, parse_dates=True)

if dataset_type == "long":
Expand All @@ -218,7 +268,7 @@ def build_dataset(self, file: Path, dataset_type: str):
" Valid options are 'long', 'wide', and 'wide_multivariate'."
)

example_gen_func, features = _from_dataframe(df)
example_gen_func, features = _from_dataframe(df, freq=freq)
hf_dataset = datasets.Dataset.from_generator(
example_gen_func, features=features
)
Expand Down Expand Up @@ -289,13 +339,21 @@ def generate_eval_builders(
type=str,
default=None,
)
# Define the `freq` argument with a default value. It is used as 'freq' only when the frequency inferred from the data is None.
parser.add_argument(
"--freq",
default="H", # Set the default value
help="The user specified frequency",
)

args = parser.parse_args()

SimpleDatasetBuilder(dataset=args.dataset_name).build_dataset(
file=Path(args.file_path),
dataset_type=args.dataset_type,
offset=args.offset,
date_offset=pd.Timestamp(args.date_offset) if args.date_offset else None,
freq=args.freq,
)

if args.offset is not None or args.date_offset is not None:
Expand All @@ -307,4 +365,6 @@ def generate_eval_builders(
prediction_length=None,
context_length=None,
patch_size=None,
).build_dataset(file=Path(args.file_path), dataset_type=args.dataset_type)
).build_dataset(
file=Path(args.file_path), dataset_type=args.dataset_type, freq=args.freq
)

0 comments on commit 901a987

Please sign in to comment.