Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Manually set 'freq' for specific datasets #110

Merged
merged 2 commits into from
Aug 22, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 67 additions & 7 deletions src/uni2ts/data/builder/simple.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,22 @@ def _from_long_dataframe(
df: pd.DataFrame,
offset: Optional[int] = None,
date_offset: Optional[pd.Timestamp] = None,
freq: str = "H",
) -> tuple[GenFunc, Features]:
items = df.item_id.unique()

# Infer the freq from the index and report which value will be used
inferred_freq = pd.infer_freq(df.index)

if inferred_freq is not None:
print(
f"Inferred frequency: {inferred_freq}. Using this value for the 'freq' parameter."
)
else:
print(
f"Inferred frequency is None. Using predefined {freq} for the 'freq' parameter."
)

def example_gen_func() -> Generator[dict[str, Any], None, None]:
for item_id in items:
item_df = df.query(f'item_id == "{item_id}"').drop("item_id", axis=1)
Expand All @@ -50,7 +63,11 @@ def example_gen_func() -> Generator[dict[str, Any], None, None]:
yield {
"target": item_df.to_numpy(),
"start": item_df.index[0],
"freq": pd.infer_freq(item_df.index),
"freq": (
pd.infer_freq(df.index)
if pd.infer_freq(df.index) is not None
else freq
),
"item_id": item_id,
}

Expand All @@ -70,6 +87,7 @@ def _from_wide_dataframe(
df: pd.DataFrame,
offset: Optional[int] = None,
date_offset: Optional[pd.Timestamp] = None,
freq: str = "H",
) -> tuple[GenFunc, Features]:
if offset is not None:
df = df.iloc[:offset]
Expand All @@ -78,12 +96,28 @@ def _from_wide_dataframe(

print(df)

# Infer the freq from the index and report which value will be used
inferred_freq = pd.infer_freq(df.index)

if inferred_freq is not None:
print(
f"Inferred frequency: {inferred_freq}. Using this value for the 'freq' parameter."
)
else:
print(
f"Inferred frequency is None. Using predefined {freq} for the 'freq' parameter."
)

def example_gen_func() -> Generator[dict[str, Any], None, None]:
for i in range(len(df.columns)):
yield {
"target": df.iloc[:, i].to_numpy(),
"start": df.index[0],
"freq": pd.infer_freq(df.index),
"freq": (
pd.infer_freq(df.index)
if pd.infer_freq(df.index) is not None
else freq
),
"item_id": f"item_{i}",
}

Expand All @@ -103,17 +137,32 @@ def _from_wide_dataframe_multivariate(
df: pd.DataFrame,
offset: Optional[int] = None,
date_offset: Optional[pd.Timestamp] = None,
freq: str = "H",
) -> tuple[GenFunc, Features]:
if offset is not None:
df = df.iloc[:offset]
elif date_offset is not None:
df = df[df.index <= date_offset]

# Infer the freq from the index and report which value will be used
inferred_freq = pd.infer_freq(df.index)

if inferred_freq is not None:
print(
f"Inferred frequency: {inferred_freq}. Using this value for the 'freq' parameter."
)
else:
print(
f"Inferred frequency is None. Using predefined {freq} for the 'freq' parameter."
)

def example_gen_func() -> Generator[dict[str, Any], None, None]:
yield {
"target": df.to_numpy().T,
"start": df.index[0],
"freq": pd.infer_freq(df.index),
"freq": (
pd.infer_freq(df.index) if pd.infer_freq(df.index) is not None else freq
),
"item_id": "item_0",
}

Expand Down Expand Up @@ -145,6 +194,7 @@ def build_dataset(
dataset_type: str,
offset: Optional[int] = None,
date_offset: Optional[pd.Timestamp] = None,
freq: str = "H",
):
assert offset is None or date_offset is None, (
"One or neither offset and date_offset must be specified, but not both. "
Expand All @@ -166,7 +216,7 @@ def build_dataset(
)

example_gen_func, features = _from_dataframe(
df, offset=offset, date_offset=date_offset
df, freq=freq, offset=offset, date_offset=date_offset
)
hf_dataset = datasets.Dataset.from_generator(
example_gen_func, features=features
Expand Down Expand Up @@ -203,7 +253,7 @@ class SimpleEvalDatasetBuilder(DatasetBuilder):
def __post_init__(self):
self.storage_path = Path(self.storage_path)

def build_dataset(self, file: Path, dataset_type: str):
def build_dataset(self, file: Path, dataset_type: str, freq: str = "H"):
df = pd.read_csv(file, index_col=0, parse_dates=True)

if dataset_type == "long":
Expand All @@ -218,7 +268,7 @@ def build_dataset(self, file: Path, dataset_type: str):
" Valid options are 'long', 'wide', and 'wide_multivariate'."
)

example_gen_func, features = _from_dataframe(df)
example_gen_func, features = _from_dataframe(df, freq=freq)
hf_dataset = datasets.Dataset.from_generator(
example_gen_func, features=features
)
Expand Down Expand Up @@ -289,13 +339,21 @@ def generate_eval_builders(
type=str,
default=None,
)
# Define the `freq` argument with a default value. It is used as 'freq' only when the frequency cannot be inferred from the data (i.e. pd.infer_freq returns None).
parser.add_argument(
"--freq",
default="H", # Set the default value
help="The user specified frequency",
)

args = parser.parse_args()

SimpleDatasetBuilder(dataset=args.dataset_name).build_dataset(
file=Path(args.file_path),
dataset_type=args.dataset_type,
offset=args.offset,
date_offset=pd.Timestamp(args.date_offset) if args.date_offset else None,
freq=args.freq,
)

if args.offset is not None or args.date_offset is not None:
Expand All @@ -307,4 +365,6 @@ def generate_eval_builders(
prediction_length=None,
context_length=None,
patch_size=None,
).build_dataset(file=Path(args.file_path), dataset_type=args.dataset_type)
).build_dataset(
file=Path(args.file_path), dataset_type=args.dataset_type, freq=args.freq
)
Loading