From 75ffdb5664f242b1bfe2ba8d69de280a664684af Mon Sep 17 00:00:00 2001 From: Florian Jetter Date: Tue, 16 Mar 2021 15:50:49 +0100 Subject: [PATCH] Set default for dates_as_object to True (#436) --- CHANGES.rst | 1 + kartothek/io/dask/bag.py | 14 +++++++------- kartothek/io/dask/dataframe.py | 2 +- kartothek/io/dask/delayed.py | 4 ++-- kartothek/io/eager.py | 6 +++--- kartothek/io/iter.py | 4 ++-- kartothek/io/testing/read.py | 14 ++------------ kartothek/io_components/metapartition.py | 8 +++++++- tests/io/eager/test_commit.py | 12 +++++------- tests/io/eager/test_read.py | 4 +--- tests/io_components/test_metapartition.py | 14 ++------------ 11 files changed, 33 insertions(+), 50 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 8dc011ca..39de0ad8 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -27,6 +27,7 @@ This is a major release of kartothek with breaking API changes. * Remove `output_dataset_uuid` and `df_serializer` from :func:`kartothek.io.eager.commit_dataset` since these arguments didn't have any effect * Remove `metadata`, `df_serializer`, `overwrite`, `metadata_merger` from :func:`kartothek.io.eager.write_single_partition` * :func:`~kartothek.io.eager.store_dataframes_as_dataset` now requires a list as an input +* Default value for argument `dates_as_object` is now universally set to ``True``. 
The behaviour for `False` will be deprecated and removed in the next major release Version 3.20.0 (2021-03-15) =========================== diff --git a/kartothek/io/dask/bag.py b/kartothek/io/dask/bag.py index 1d0641fc..be5eea1b 100644 --- a/kartothek/io/dask/bag.py +++ b/kartothek/io/dask/bag.py @@ -54,7 +54,7 @@ def read_dataset_as_metapartitions_bag( columns=None, predicate_pushdown_to_io=True, categoricals=None, - dates_as_object=False, + dates_as_object: bool = True, predicates=None, factory=None, dispatch_by=None, @@ -79,10 +79,10 @@ def read_dataset_as_metapartitions_bag( mps = dispatch_metapartitions_from_factory( dataset_factory=ds_factory, predicates=predicates, dispatch_by=dispatch_by, ) - mps = db.from_sequence(mps, partition_size=partition_size) + mp_bag = db.from_sequence(mps, partition_size=partition_size) if dispatch_by is not None: - mps = mps.map( + mp_bag = mp_bag.map( _load_and_concat_metapartitions_inner, store=store, columns=columns, @@ -92,7 +92,7 @@ def read_dataset_as_metapartitions_bag( predicates=predicates, ) else: - mps = mps.map( + mp_bag = mp_bag.map( MetaPartition.load_dataframes, store=store, columns=columns, @@ -108,14 +108,14 @@ def read_dataset_as_metapartitions_bag( if categoricals_from_index: - mps = mps.map( + mp_bag = mp_bag.map( MetaPartition.apply, func=partial( _cast_categorical_to_index_cat, categories=categoricals_from_index ), type_safe=True, ) - return mps + return mp_bag @default_docs @@ -125,7 +125,7 @@ def read_dataset_as_dataframe_bag( columns=None, predicate_pushdown_to_io=True, categoricals=None, - dates_as_object=False, + dates_as_object: bool = True, predicates=None, factory=None, dispatch_by=None, diff --git a/kartothek/io/dask/dataframe.py b/kartothek/io/dask/dataframe.py index 38787ff4..610bea30 100644 --- a/kartothek/io/dask/dataframe.py +++ b/kartothek/io/dask/dataframe.py @@ -65,7 +65,7 @@ def read_dataset_as_ddf( columns=None, predicate_pushdown_to_io=True, categoricals: Optional[Sequence[str]] = None, - 
dates_as_object=False, + dates_as_object: bool = True, predicates=None, factory=None, dask_index_on=None, diff --git a/kartothek/io/dask/delayed.py b/kartothek/io/dask/delayed.py index b0aad8fe..bbf8c002 100644 --- a/kartothek/io/dask/delayed.py +++ b/kartothek/io/dask/delayed.py @@ -149,7 +149,7 @@ def read_dataset_as_delayed_metapartitions( columns=None, predicate_pushdown_to_io=True, categoricals: Optional[Sequence[str]] = None, - dates_as_object=False, + dates_as_object: bool = True, predicates=None, factory=None, dispatch_by=None, @@ -224,7 +224,7 @@ def read_dataset_as_delayed( columns=None, predicate_pushdown_to_io=True, categoricals=None, - dates_as_object=False, + dates_as_object: bool = True, predicates=None, factory=None, dispatch_by=None, diff --git a/kartothek/io/eager.py b/kartothek/io/eager.py index 0ee21abe..6ac0f531 100644 --- a/kartothek/io/eager.py +++ b/kartothek/io/eager.py @@ -97,7 +97,7 @@ def read_dataset_as_dataframes( columns: Dict[str, List[str]] = None, predicate_pushdown_to_io: bool = True, categoricals: List[str] = None, - dates_as_object: bool = False, + dates_as_object: bool = True, predicates: Optional[List[List[Tuple[str, str, Any]]]] = None, factory: Optional[DatasetFactory] = None, dispatch_by: Optional[List[str]] = None, @@ -153,7 +153,7 @@ def read_dataset_as_metapartitions( columns=None, predicate_pushdown_to_io=True, categoricals=None, - dates_as_object=False, + dates_as_object: bool = True, predicates=None, factory=None, dispatch_by=None, @@ -211,7 +211,7 @@ def read_table( columns: Dict[str, List[str]] = None, predicate_pushdown_to_io: bool = True, categoricals: List[str] = None, - dates_as_object: bool = False, + dates_as_object: bool = True, predicates: Optional[List[List[Tuple[str, str, Any]]]] = None, factory: Optional[DatasetFactory] = None, ) -> pd.DataFrame: diff --git a/kartothek/io/iter.py b/kartothek/io/iter.py index 72592463..df0ebce1 100644 --- a/kartothek/io/iter.py +++ b/kartothek/io/iter.py @@ -42,7 +42,7 @@ 
def read_dataset_as_metapartitions__iterator( columns=None, predicate_pushdown_to_io=True, categoricals=None, - dates_as_object=False, + dates_as_object: bool = True, predicates=None, factory=None, dispatch_by=None, @@ -105,7 +105,7 @@ def read_dataset_as_dataframes__iterator( columns=None, predicate_pushdown_to_io=True, categoricals=None, - dates_as_object=False, + dates_as_object: bool = True, predicates=None, factory=None, dispatch_by=None, diff --git a/kartothek/io/testing/read.py b/kartothek/io/testing/read.py index dcb048bf..266d93cd 100644 --- a/kartothek/io/testing/read.py +++ b/kartothek/io/testing/read.py @@ -234,12 +234,7 @@ def test_read_dataset_as_dataframes_predicate( core_result = pd.concat(result) expected_core = pd.DataFrame( - { - "P": [2], - "L": [2], - "TARGET": [2], - "DATE": pd.to_datetime([datetime.date(2009, 12, 31)]), - } + {"P": [2], "L": [2], "TARGET": [2], "DATE": [datetime.date(2009, 12, 31)]} ) pdt.assert_frame_equal( core_result, expected_core, check_dtype=False, check_like=True @@ -275,12 +270,7 @@ def test_read_dataset_as_dataframes_predicate_with_partition_keys( core_result = pd.concat(result) expected_core = pd.DataFrame( - { - "P": [2], - "L": [2], - "TARGET": [2], - "DATE": pd.to_datetime([datetime.date(2009, 12, 31)]), - } + {"P": [2], "L": [2], "TARGET": [2], "DATE": [datetime.date(2009, 12, 31)]} ) pdt.assert_frame_equal( core_result, expected_core, check_dtype=False, check_like=True diff --git a/kartothek/io_components/metapartition.py b/kartothek/io_components/metapartition.py index 7d4109e9..9701de6a 100644 --- a/kartothek/io_components/metapartition.py +++ b/kartothek/io_components/metapartition.py @@ -3,6 +3,7 @@ import logging import os import time +import warnings from collections import namedtuple from functools import wraps from typing import ( @@ -595,7 +596,7 @@ def load_dataframes( columns: Optional[Sequence[str]] = None, predicate_pushdown_to_io: bool = True, categoricals: Optional[Sequence[str]] = None, - 
dates_as_object: bool = False, + dates_as_object: bool = True, predicates: PredicatesType = None, ) -> "MetaPartition": """ @@ -630,6 +631,11 @@ def load_dataframes( if categoricals is None: categoricals = [] + if not dates_as_object: + warnings.warn( + "The argument `date_as_object` is set to False. This argument will be deprecated and the future behaviour will be as if the paramere was set to `True`. Please migrate your code accordingly ahead of time.", + DeprecationWarning, + ) LOGGER.debug("Loading internal dataframes of %s", self.label) if not self.file: diff --git a/tests/io/eager/test_commit.py b/tests/io/eager/test_commit.py index 52e91683..ee749dce 100644 --- a/tests/io/eager/test_commit.py +++ b/tests/io/eager/test_commit.py @@ -65,13 +65,11 @@ def test_commit_dataset_from_metapartition(dataset_function, store): [ ( "DATE", - pd.to_datetime( - [ - datetime.date(2016, 3, 23), - datetime.date(2010, 1, 1), - datetime.date(2009, 12, 31), - ] - ), + [ + datetime.date(2016, 3, 23), + datetime.date(2010, 1, 1), + datetime.date(2009, 12, 31), + ], ), ("L", [5, 1, 2]), ("P", [5, 1, 2]), diff --git a/tests/io/eager/test_read.py b/tests/io/eager/test_read.py index fb04670f..01278221 100644 --- a/tests/io/eager/test_read.py +++ b/tests/io/eager/test_read.py @@ -67,9 +67,7 @@ def test_read_table_eager(dataset, store_session, use_categoricals): "P": [1, 2], "L": [1, 2], "TARGET": [1, 2], - "DATE": pd.to_datetime( - [datetime.date(2010, 1, 1), datetime.date(2009, 12, 31)] - ), + "DATE": [datetime.date(2010, 1, 1), datetime.date(2009, 12, 31)], } ) if categories: diff --git a/tests/io_components/test_metapartition.py b/tests/io_components/test_metapartition.py index a19d9c5b..9e824584 100644 --- a/tests/io_components/test_metapartition.py +++ b/tests/io_components/test_metapartition.py @@ -86,12 +86,7 @@ def test_load_dataframes( ): expected_df = pd.DataFrame( OrderedDict( - [ - ("P", [1]), - ("L", [1]), - ("TARGET", [1]), - ("DATE", pd.to_datetime([date(2010, 1, 1)])), 
- ] + [("P", [1]), ("L", [1]), ("TARGET", [1]), ("DATE", [date(2010, 1, 1)])] ) ) mp = meta_partitions_files_only[0] @@ -122,12 +117,7 @@ def test_remove_dataframes(meta_partitions_files_only, store_session): def test_load_dataframes_selective(meta_partitions_files_only, store_session): expected_df = pd.DataFrame( OrderedDict( - [ - ("P", [1]), - ("L", [1]), - ("TARGET", [1]), - ("DATE", pd.to_datetime([date(2010, 1, 1)])), - ] + [("P", [1]), ("L", [1]), ("TARGET", [1]), ("DATE", [date(2010, 1, 1)])] ) ) mp = meta_partitions_files_only[0]