Skip to content

Commit

Permalink
Set default for dates_as_object to True (#436)
Browse files Browse the repository at this point in the history
  • Loading branch information
fjetter authored Mar 16, 2021
1 parent b1e2535 commit 75ffdb5
Show file tree
Hide file tree
Showing 11 changed files with 33 additions and 50 deletions.
1 change: 1 addition & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ This is a major release of kartothek with breaking API changes.
* Remove `output_dataset_uuid` and `df_serializer` from :func:`kartothek.io.eager.commit_dataset` since these arguments didn't have any effect
* Remove `metadata`, `df_serializer`, `overwrite`, `metadata_merger` from :func:`kartothek.io.eager.write_single_partition`
* :func:`~kartothek.io.eager.store_dataframes_as_dataset` now requires a list as an input
* Default value for argument `dates_as_object` is now universally set to ``True``. The behaviour for `False` will be deprecated and removed in the next major release

Version 3.20.0 (2021-03-15)
===========================
Expand Down
14 changes: 7 additions & 7 deletions kartothek/io/dask/bag.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def read_dataset_as_metapartitions_bag(
columns=None,
predicate_pushdown_to_io=True,
categoricals=None,
dates_as_object=False,
dates_as_object: bool = True,
predicates=None,
factory=None,
dispatch_by=None,
Expand All @@ -79,10 +79,10 @@ def read_dataset_as_metapartitions_bag(
mps = dispatch_metapartitions_from_factory(
dataset_factory=ds_factory, predicates=predicates, dispatch_by=dispatch_by,
)
mps = db.from_sequence(mps, partition_size=partition_size)
mp_bag = db.from_sequence(mps, partition_size=partition_size)

if dispatch_by is not None:
mps = mps.map(
mp_bag = mp_bag.map(
_load_and_concat_metapartitions_inner,
store=store,
columns=columns,
Expand All @@ -92,7 +92,7 @@ def read_dataset_as_metapartitions_bag(
predicates=predicates,
)
else:
mps = mps.map(
mp_bag = mp_bag.map(
MetaPartition.load_dataframes,
store=store,
columns=columns,
Expand All @@ -108,14 +108,14 @@ def read_dataset_as_metapartitions_bag(

if categoricals_from_index:

mps = mps.map(
mp_bag = mp_bag.map(
MetaPartition.apply,
func=partial(
_cast_categorical_to_index_cat, categories=categoricals_from_index
),
type_safe=True,
)
return mps
return mp_bag


@default_docs
Expand All @@ -125,7 +125,7 @@ def read_dataset_as_dataframe_bag(
columns=None,
predicate_pushdown_to_io=True,
categoricals=None,
dates_as_object=False,
dates_as_object: bool = True,
predicates=None,
factory=None,
dispatch_by=None,
Expand Down
2 changes: 1 addition & 1 deletion kartothek/io/dask/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def read_dataset_as_ddf(
columns=None,
predicate_pushdown_to_io=True,
categoricals: Optional[Sequence[str]] = None,
dates_as_object=False,
dates_as_object: bool = True,
predicates=None,
factory=None,
dask_index_on=None,
Expand Down
4 changes: 2 additions & 2 deletions kartothek/io/dask/delayed.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ def read_dataset_as_delayed_metapartitions(
columns=None,
predicate_pushdown_to_io=True,
categoricals: Optional[Sequence[str]] = None,
dates_as_object=False,
dates_as_object: bool = True,
predicates=None,
factory=None,
dispatch_by=None,
Expand Down Expand Up @@ -224,7 +224,7 @@ def read_dataset_as_delayed(
columns=None,
predicate_pushdown_to_io=True,
categoricals=None,
dates_as_object=False,
dates_as_object: bool = True,
predicates=None,
factory=None,
dispatch_by=None,
Expand Down
6 changes: 3 additions & 3 deletions kartothek/io/eager.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ def read_dataset_as_dataframes(
columns: Dict[str, List[str]] = None,
predicate_pushdown_to_io: bool = True,
categoricals: List[str] = None,
dates_as_object: bool = False,
dates_as_object: bool = True,
predicates: Optional[List[List[Tuple[str, str, Any]]]] = None,
factory: Optional[DatasetFactory] = None,
dispatch_by: Optional[List[str]] = None,
Expand Down Expand Up @@ -153,7 +153,7 @@ def read_dataset_as_metapartitions(
columns=None,
predicate_pushdown_to_io=True,
categoricals=None,
dates_as_object=False,
dates_as_object: bool = True,
predicates=None,
factory=None,
dispatch_by=None,
Expand Down Expand Up @@ -211,7 +211,7 @@ def read_table(
columns: Dict[str, List[str]] = None,
predicate_pushdown_to_io: bool = True,
categoricals: List[str] = None,
dates_as_object: bool = False,
dates_as_object: bool = True,
predicates: Optional[List[List[Tuple[str, str, Any]]]] = None,
factory: Optional[DatasetFactory] = None,
) -> pd.DataFrame:
Expand Down
4 changes: 2 additions & 2 deletions kartothek/io/iter.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def read_dataset_as_metapartitions__iterator(
columns=None,
predicate_pushdown_to_io=True,
categoricals=None,
dates_as_object=False,
dates_as_object: bool = True,
predicates=None,
factory=None,
dispatch_by=None,
Expand Down Expand Up @@ -105,7 +105,7 @@ def read_dataset_as_dataframes__iterator(
columns=None,
predicate_pushdown_to_io=True,
categoricals=None,
dates_as_object=False,
dates_as_object: bool = True,
predicates=None,
factory=None,
dispatch_by=None,
Expand Down
14 changes: 2 additions & 12 deletions kartothek/io/testing/read.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,12 +234,7 @@ def test_read_dataset_as_dataframes_predicate(
core_result = pd.concat(result)

expected_core = pd.DataFrame(
{
"P": [2],
"L": [2],
"TARGET": [2],
"DATE": pd.to_datetime([datetime.date(2009, 12, 31)]),
}
{"P": [2], "L": [2], "TARGET": [2], "DATE": [datetime.date(2009, 12, 31)]}
)
pdt.assert_frame_equal(
core_result, expected_core, check_dtype=False, check_like=True
Expand Down Expand Up @@ -275,12 +270,7 @@ def test_read_dataset_as_dataframes_predicate_with_partition_keys(
core_result = pd.concat(result)

expected_core = pd.DataFrame(
{
"P": [2],
"L": [2],
"TARGET": [2],
"DATE": pd.to_datetime([datetime.date(2009, 12, 31)]),
}
{"P": [2], "L": [2], "TARGET": [2], "DATE": [datetime.date(2009, 12, 31)]}
)
pdt.assert_frame_equal(
core_result, expected_core, check_dtype=False, check_like=True
Expand Down
8 changes: 7 additions & 1 deletion kartothek/io_components/metapartition.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import logging
import os
import time
import warnings
from collections import namedtuple
from functools import wraps
from typing import (
Expand Down Expand Up @@ -595,7 +596,7 @@ def load_dataframes(
columns: Optional[Sequence[str]] = None,
predicate_pushdown_to_io: bool = True,
categoricals: Optional[Sequence[str]] = None,
dates_as_object: bool = False,
dates_as_object: bool = True,
predicates: PredicatesType = None,
) -> "MetaPartition":
"""
Expand Down Expand Up @@ -630,6 +631,11 @@ def load_dataframes(

if categoricals is None:
categoricals = []
if not dates_as_object:
warnings.warn(
                "The argument `dates_as_object` is set to False. This argument will be deprecated and the future behaviour will be as if the parameter was set to `True`. Please migrate your code accordingly ahead of time.",
DeprecationWarning,
)

LOGGER.debug("Loading internal dataframes of %s", self.label)
if not self.file:
Expand Down
12 changes: 5 additions & 7 deletions tests/io/eager/test_commit.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,13 +65,11 @@ def test_commit_dataset_from_metapartition(dataset_function, store):
[
(
"DATE",
pd.to_datetime(
[
datetime.date(2016, 3, 23),
datetime.date(2010, 1, 1),
datetime.date(2009, 12, 31),
]
),
[
datetime.date(2016, 3, 23),
datetime.date(2010, 1, 1),
datetime.date(2009, 12, 31),
],
),
("L", [5, 1, 2]),
("P", [5, 1, 2]),
Expand Down
4 changes: 1 addition & 3 deletions tests/io/eager/test_read.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,9 +67,7 @@ def test_read_table_eager(dataset, store_session, use_categoricals):
"P": [1, 2],
"L": [1, 2],
"TARGET": [1, 2],
"DATE": pd.to_datetime(
[datetime.date(2010, 1, 1), datetime.date(2009, 12, 31)]
),
"DATE": [datetime.date(2010, 1, 1), datetime.date(2009, 12, 31)],
}
)
if categories:
Expand Down
14 changes: 2 additions & 12 deletions tests/io_components/test_metapartition.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,12 +86,7 @@ def test_load_dataframes(
):
expected_df = pd.DataFrame(
OrderedDict(
[
("P", [1]),
("L", [1]),
("TARGET", [1]),
("DATE", pd.to_datetime([date(2010, 1, 1)])),
]
[("P", [1]), ("L", [1]), ("TARGET", [1]), ("DATE", [date(2010, 1, 1)])]
)
)
mp = meta_partitions_files_only[0]
Expand Down Expand Up @@ -122,12 +117,7 @@ def test_remove_dataframes(meta_partitions_files_only, store_session):
def test_load_dataframes_selective(meta_partitions_files_only, store_session):
expected_df = pd.DataFrame(
OrderedDict(
[
("P", [1]),
("L", [1]),
("TARGET", [1]),
("DATE", pd.to_datetime([date(2010, 1, 1)])),
]
[("P", [1]), ("L", [1]), ("TARGET", [1]), ("DATE", [date(2010, 1, 1)])]
)
)
mp = meta_partitions_files_only[0]
Expand Down

0 comments on commit 75ffdb5

Please sign in to comment.