Commit

update testset
peterdudfield committed Dec 18, 2023
1 parent 5689bd3 commit a5fa6ab
Showing 4 changed files with 2,567 additions and 2,515 deletions.
47 changes: 42 additions & 5 deletions quartz_solar_forecast/dataset/make_test_set.py
@@ -2,13 +2,19 @@
Make a random test set
This takes a random subset of times for various pv ids and makes a test set
There is an option to omit timestamps that don't exist in the ICON dataset:
https://huggingface.co/datasets/openclimatefix/dwd-icon-eu/tree/main/data
"""
import os
from typing import Optional

import numpy as np
import pandas as pd

from quartz_solar_forecast.eval.utils import make_hf_filename
from huggingface_hub import HfFileSystem

test_start_date = pd.Timestamp("2021-01-01")
test_end_date = pd.Timestamp("2022-01-01")

@@ -69,7 +75,7 @@
np.random.seed(42)


def make_test_set(output_file_name: Optional[str] = None, number_of_samples_per_system: int = 50):
def make_test_set(output_file_name: Optional[str] = None, number_of_samples_per_system: int = 50, check_hf_files: bool = False):
"""
Make a test set of random times and pv ids
@@ -81,16 +87,47 @@ def make_test_set(output_file_name: Optional[str] = None, number_of_samples_per_
# get the folder where this file is
output_file_name = os.path.dirname(os.path.abspath(__file__)) + "/testset.csv"

ts = pd.date_range(start=test_start_date, end=test_end_date, freq="15min")

# check that the files are in HF for ICON
if check_hf_files:
ts = filter_timestamps_if_hf_files_exists(ts)

test_set = []
for pv_id in pv_ids:
ts = pd.date_range(start=test_start_date, end=test_end_date, freq="15min")
ts = ts[np.random.choice(len(ts), size=number_of_samples_per_system, replace=False)]
test_set.append(pd.DataFrame({"pv_id": pv_id, "timestamp": ts}))
ts_choice = ts[np.random.choice(len(ts), size=number_of_samples_per_system, replace=False)]
test_set.append(pd.DataFrame({"pv_id": pv_id, "timestamp": ts_choice}))
test_set = pd.concat(test_set)
test_set.to_csv(output_file_name, index=False)

return test_set


def filter_timestamps_if_hf_files_exists(timestamps_full: pd.DatetimeIndex):
"""
Filter the timestamps if the huggingface files exist
We check whether the timestamps, rounded down to the nearest 6 hours,
exist in
https://huggingface.co/datasets/openclimatefix/dwd-icon-eu/tree/main/data
"""
timestamps = []
fs = HfFileSystem()
# print(fs.ls("datasets/openclimatefix/dwd-icon-eu/data/2022/4/11/", detail=False))
for timestamp in timestamps_full:
timestamp_floor = timestamp.floor("6H")
_, huggingface_file = make_hf_filename(timestamp_floor)
huggingface_file = huggingface_file[14:]

if fs.exists(huggingface_file):
timestamps.append(timestamp)
else:
print(f"Skipping {timestamp} because {huggingface_file} does not exist")

timestamps = pd.DatetimeIndex(timestamps)
return timestamps


# To run the script, uncomment the following line and run this file
# make_test_set()
# make_test_set(check_hf_files=True)
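
A minimal usage sketch of the updated entry point (assuming the package is importable locally and the machine has network access to Hugging Face; the parameter values shown are the defaults from this diff):

from quartz_solar_forecast.dataset.make_test_set import make_test_set

# Regenerate testset.csv next to make_test_set.py, keeping only timestamps whose
# 6-hourly ICON files exist on Hugging Face (the check_hf_files path is network-bound)
test_set = make_test_set(check_hf_files=True, number_of_samples_per_system=50)
print(test_set.head())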
