From 3c8fb7e8c9ef030db0a0c5c0f3d4def75214bb9b Mon Sep 17 00:00:00 2001 From: nankolena <145366880+nankolena@users.noreply.github.com> Date: Tue, 7 Jan 2025 15:41:57 -0500 Subject: [PATCH] chore: add back special datatype for timestamp (#733) * Revert "Revert "feat: add special datatype for timestamp (#730)" (#732)" This reverts commit 5e3863cb2a9e586a31b8726aee4cbfa73d7154a9. * support iso format by default, refactor tz localization --- docs/reference/experimental/index.md | 3 + .../multiclass/upload_results.py | 2 +- kolena/_experimental/special_data_type.py | 83 ++++++++++++++++ kolena/_utils/datatypes.py | 3 + kolena/annotation.py | 1 + tests/integration/dataset/test_dataset.py | 11 ++- .../_experimental/test_special_data_type.py | 99 +++++++++++++++++++ 7 files changed, 198 insertions(+), 4 deletions(-) create mode 100644 kolena/_experimental/special_data_type.py create mode 100644 tests/unit/_experimental/test_special_data_type.py diff --git a/docs/reference/experimental/index.md b/docs/reference/experimental/index.md index cd9ea994c..9a5c16933 100644 --- a/docs/reference/experimental/index.md +++ b/docs/reference/experimental/index.md @@ -24,3 +24,6 @@ options: members: ["download_results_by_tag"] show_root_heading: true +::: kolena._experimental.special_data_type + options: + show_root_heading: true diff --git a/examples/dataset/classification/classification/multiclass/upload_results.py b/examples/dataset/classification/classification/multiclass/upload_results.py index 459ccc61b..f7f08ff26 100644 --- a/examples/dataset/classification/classification/multiclass/upload_results.py +++ b/examples/dataset/classification/classification/multiclass/upload_results.py @@ -22,9 +22,9 @@ from classification.multiclass.constants import DATASET from classification.multiclass.constants import ID_FIELDS +from kolena.annotation import ScoredClassificationLabel from kolena.dataset import download_dataset from kolena.dataset import upload_results -from 
kolena.workflow.annotation import ScoredClassificationLabel MODELS = ["resnet50v2", "inceptionv3"] diff --git a/kolena/_experimental/special_data_type.py b/kolena/_experimental/special_data_type.py new file mode 100644 index 000000000..ab44ffc33 --- /dev/null +++ b/kolena/_experimental/special_data_type.py @@ -0,0 +1,83 @@ +# Copyright 2021-2024 Kolena Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Special data types supported on the Kolena platform. + +""" # noqa: E501 +from abc import ABCMeta +from datetime import datetime +from typing import Optional + +import pytz + +from kolena._utils.datatypes import DataCategory +from kolena._utils.datatypes import DataType +from kolena._utils.datatypes import TypedDataObject +from kolena._utils.pydantic_v1.dataclasses import dataclass +from kolena._utils.validators import ValidatorConfig + + +class _SpecialDataType(DataType): + TIMESTAMP = "TIMESTAMP" + + @staticmethod + def _data_category() -> DataCategory: + return DataCategory.SPECIAL + + +@dataclass(frozen=True, config=ValidatorConfig) +class SpecialDataType(TypedDataObject[_SpecialDataType], metaclass=ABCMeta): + """The base class for all special data types.""" + + +@dataclass(frozen=True, config=ValidatorConfig) +class Timestamp(SpecialDataType): + """ + !!! note "Experimental" + This class is considered **experimental** + + Timestamp data type. + """ + + epoch_time: Optional[float] = None + """The epoch time of the timestamp. 
If `value` and `format` are specified, the `epoch_time` will be calculated.""" + + value: Optional[str] = None + """ + The timestamp in a string representation. Note that GMT timezone is assumed unless the offset is specified in the + string. + """ + + format: Optional[str] = None + """ + The format of the `value` string following the + [python format codes](https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes). If not + provided, the `value` will be parsed using + [python's `fromisoformat()`](https://docs.python.org/3/library/datetime.html#datetime.datetime.fromisoformat). + """ + + @staticmethod + def _data_type() -> _SpecialDataType: + return _SpecialDataType.TIMESTAMP + + def __post_init__(self) -> None: + if self.value: + if not self.format: + time_obj = datetime.fromisoformat(self.value) + else: + time_obj = datetime.strptime(self.value, self.format) + # assume GMT if timezone is not provided + if not time_obj.tzinfo: + time_obj = pytz.utc.localize(time_obj) + object.__setattr__(self, "epoch_time", time_obj.timestamp()) diff --git a/kolena/_utils/datatypes.py b/kolena/_utils/datatypes.py index 1eec79b06..2850d9041 100644 --- a/kolena/_utils/datatypes.py +++ b/kolena/_utils/datatypes.py @@ -85,6 +85,7 @@ class DataCategory(str, Enum): METRICS = "METRICS" ASSET = "ASSET" ANNOTATION = "ANNOTATION" + SPECIAL = "SPECIAL" def data_category_to_module_name(self) -> str: if self == DataCategory.TEST_SAMPLE: @@ -97,6 +98,8 @@ def data_category_to_module_name(self) -> str: return "kolena.asset" if self == DataCategory.ANNOTATION: return "kolena.annotation" + if self == DataCategory.SPECIAL: + return "kolena._experimental.special_data_type" raise ValueError(f"Must specify module name for data category: {self}") diff --git a/kolena/annotation.py b/kolena/annotation.py index 02f8fceed..424d3fdf0 100644 --- a/kolena/annotation.py +++ b/kolena/annotation.py @@ -62,6 +62,7 @@ class _AnnotationType(DataType): TIME_SEGMENT = "TIME_SEGMENT" 
TEXT_SEGMENT = "TEXT_SEGMENT" CUSTOM = "CUSTOM" + TIMESTAMP = "TIMESTAMP" @staticmethod def _data_category() -> DataCategory: diff --git a/tests/integration/dataset/test_dataset.py b/tests/integration/dataset/test_dataset.py index d053b1dc2..dfc0b2a12 100644 --- a/tests/integration/dataset/test_dataset.py +++ b/tests/integration/dataset/test_dataset.py @@ -21,6 +21,9 @@ import pytest from kolena._api.v2.dataset import CommitData +from kolena._experimental.special_data_type import Timestamp +from kolena.annotation import BoundingBox +from kolena.annotation import LabeledBoundingBox from kolena.dataset import download_dataset from kolena.dataset import list_datasets from kolena.dataset import upload_dataset @@ -28,8 +31,6 @@ from kolena.dataset.dataset import _load_dataset_metadata from kolena.errors import InputValidationError from kolena.errors import NotFoundError -from kolena.workflow.annotation import BoundingBox -from kolena.workflow.annotation import LabeledBoundingBox from tests.integration.helper import assert_frame_equal from tests.integration.helper import fake_locator from tests.integration.helper import upload_extracted_properties @@ -83,6 +84,8 @@ def test__upload_dataset() -> None: LabeledBoundingBox(label="cat", top_left=[i, i], bottom_right=[i + 10, i + 10]), LabeledBoundingBox(label="dog", top_left=[i + 5, i + 5], bottom_right=[i + 20, i + 20]), ], + time_str=Timestamp(value=f"12/31/2024, 00:00:{'{:02d}'.format(i)}", format="%m/%d/%Y, %H:%M:%S"), + time_num=Timestamp(epoch_time=1735689600 + i), ) for i in range(20) ] @@ -96,10 +99,12 @@ def test__upload_dataset() -> None: BoundingBox(label=bbox.label, top_left=bbox.top_left, bottom_right=bbox.bottom_right) for bbox in dp["bboxes"] ], + time_str=dp["time_str"], + time_num=dp["time_num"], ) for dp in datapoints ] - columns = ["locator", "width", "height", "city", "bboxes"] + columns = ["locator", "width", "height", "city", "bboxes", "time_str", "time_num"] upload_dataset(name, 
pd.DataFrame(datapoints[:10], columns=columns), id_fields=["locator"]) diff --git a/tests/unit/_experimental/test_special_data_type.py b/tests/unit/_experimental/test_special_data_type.py new file mode 100644 index 000000000..804942270 --- /dev/null +++ b/tests/unit/_experimental/test_special_data_type.py @@ -0,0 +1,99 @@ +# Copyright 2021-2024 Kolena Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Any +from typing import Dict +from typing import Optional + +import pytest + +from kolena._experimental.special_data_type import _SpecialDataType +from kolena._experimental.special_data_type import Timestamp +from kolena._utils.datatypes import DATA_TYPE_FIELD + + +@pytest.mark.parametrize( + "object, json_data", + [ + ( + Timestamp(epoch_time=1700000000), + { + "epoch_time": 1700000000, + "value": None, + "format": None, + }, + ), + ( + Timestamp(value="12/31/2024, 00:00:00", format="%m/%d/%Y, %H:%M:%S"), + { + "epoch_time": 1735603200, + "value": "12/31/2024, 00:00:00", + "format": "%m/%d/%Y, %H:%M:%S", + }, + ), + ], +) +def test__serde__timestamp(object: Timestamp, json_data: Dict[str, Any]) -> None: + object_dict = object._to_dict() + assert object_dict == { + **json_data, + DATA_TYPE_FIELD: f"{_SpecialDataType._data_category().value}/{_SpecialDataType.TIMESTAMP.value}", + } + assert Timestamp._from_dict(object_dict) == object + + +@pytest.mark.parametrize( + "value, format, epoch_time", + [ + ("12/31/2024, 00:00:00", "%m/%d/%Y, %H:%M:%S", 
1735603200), + ("25/05/99 02:35:5.523", "%d/%m/%y %H:%M:%S.%f", 927599705.523), + ("2021/05/25", "%Y/%m/%d", 1621900800), + ("2021-05-25 02:35:15", "%Y-%m-%d %H:%M:%S", 1621910115), + ("Tuesday, December 31, 2024 5:00:00 AM", "%A, %B %d, %Y %H:%M:%S %p", 1735621200), + ("Tuesday, December 31, 2024 00:00:00 AM GMT-05:00", "%A, %B %d, %Y %H:%M:%S %p %Z%z", 1735621200), + ("Tuesday, December 31, 2024 00:00:00 AM UTC-05:00", "%A, %B %d, %Y %H:%M:%S %p %Z%z", 1735621200), + ], +) +def test__timestamp_epoch_conversion_with_format(value: str, format: str, epoch_time: float) -> None: + timestamp_object = Timestamp(value=value, format=format) + assert epoch_time == timestamp_object.epoch_time + + +@pytest.mark.parametrize( + "value, epoch_time", + [ + ("2024-12-31", 1735603200), + ("2024-12-31 00:00:00", 1735603200), + ("2024-12-31 12:00:00+00:00", 1735646400), + ("2024-12-31 12:00:00-00:00", 1735646400), + ("2024-12-31 12:00:00+05:00", 1735628400), + ("2024-12-31 12:00:00-05:00", 1735664400), + ], +) +def test__timestamp_epoch_conversion_iso(value: str, epoch_time: float) -> None: + timestamp_object = Timestamp(value=value) + assert epoch_time == timestamp_object.epoch_time + + +@pytest.mark.parametrize( + "value, format", + [ + # value without format and not following ISO 8601 format + ("12/31/2024, 00:00:00", None), + # format inconsistent with value + ("12/31/2024, 00:00:00", "%m/%d/%Y, %s"), + ], +) +def test__timestamp_validation(value: str, format: Optional[str]) -> None: + with pytest.raises(ValueError): + Timestamp(value=value, format=format)