Ensure that neither cube nor plain dataset updates corrupt indices #398

Merged: 8 commits, Feb 8, 2021
6 changes: 6 additions & 0 deletions CHANGES.rst
@@ -2,6 +2,12 @@
Changelog
=========

Version 3.18.1 (2021-02-XY)
===========================

* Fix an issue where updates on cubes or on datasets using dask.dataframe
  might not update all secondary indices, resulting in a corrupt state after
  the update

Version 3.18.0 (2021-01-25)
===========================
46 changes: 45 additions & 1 deletion docs/guide/mutating_datasets.rst
@@ -1,4 +1,4 @@

.. _mutating_datasets:

Mutating Datasets
=================
@@ -286,6 +286,8 @@ consists of two rows corresponding to ``B=2013-01-02`` (from ``df``) and four ro
Thus, the original partition with the two rows corresponding to ``B=2013-01-03`` from ``df``
has been completely replaced.



Garbage collection
------------------

@@ -324,3 +326,45 @@ When garbage collection is called, the files are removed.
files_before.difference(store.keys()) # Show files removed

.. _storefact: https://github.com/blue-yonder/storefact


Mutating indexed datasets
-------------------------

The mutating operation will update all indices that currently exist for the dataset. This holds true even if the update does not specify any indices at all, or specifies only a subset of them. Consider the following example:

.. ipython:: python

df = pd.DataFrame({"payload": range(10), "i1": 0, "i2": ["a"] * 5 + ["b"] * 5})
dm = store_dataframes_as_dataset(
store_url, "indexed_dataset", [df], secondary_indices=["i1", "i2"]
)
dm = dm.load_all_indices(store_url)
dm.indices["i1"].observed_values()
dm.indices["i2"].observed_values()

new_df = pd.DataFrame({"payload": range(10), "i1": 1, "i2": "c"})

If we do not specify any indices, kartothek will infer the existing ones and update them correctly:

.. ipython:: python

dm = update_dataset_from_dataframes([new_df], store=store_url, dataset_uuid=dm.uuid)

dm = dm.load_all_indices(store_url)
dm.indices["i1"].observed_values()
dm.indices["i2"].observed_values()


This holds true even if only a subset of the indices is given:

.. ipython:: python

new_df = pd.DataFrame({"payload": range(10), "i1": 2, "i2": "d"})
dm = update_dataset_from_dataframes(
[new_df], store=store_url, dataset_uuid=dm.uuid, secondary_indices="i1"
)

dm = dm.load_all_indices(store_url)
dm.indices["i1"].observed_values()
dm.indices["i2"].observed_values()
2 changes: 1 addition & 1 deletion kartothek/api/consistency.py
@@ -200,7 +200,7 @@ def _check_indices(datasets: Dict[str, DatasetMetadata], cube: Cube) -> None:
For all datasets the primary indices must be equal to ``ds.partition_keys``. For the seed dataset, secondary
indices for all dimension columns except ``cube.suppress_index_on`` are expected.

Additional indices are accepted and will not bew reported as error.
Additional indices are accepted and will not be reported as error.

Parameters
----------
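
To make the rule above concrete, an illustrative sketch (editorial note, not part of the diff; the field names come from the surrounding code, but the exact ``Cube`` constructor call is an assumption):

    from kartothek.core.cube.cube import Cube

    # A cube over dimensions "x" and "y", partitioned by "p", with the index
    # on "y" suppressed: every dataset must then be partitioned on ["p"], the
    # seed dataset must carry a secondary index on "x", and any additional
    # secondary index is accepted without error.
    cube = Cube(
        dimension_columns=["x", "y"],
        partition_columns=["p"],
        uuid_prefix="my_cube",
        suppress_index_on=["y"],
    )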
40 changes: 21 additions & 19 deletions kartothek/io/dask/_shuffle.py
@@ -1,5 +1,5 @@
from functools import partial
from typing import List, Optional, Sequence, Union
from typing import List, Optional, Sequence, cast

import dask.array as da
import dask.dataframe as dd
@@ -9,6 +9,7 @@
from kartothek.core.typing import StoreFactory
from kartothek.io.dask.compression import pack_payload, unpack_payload_pandas
from kartothek.io_components.metapartition import MetaPartition
from kartothek.io_components.utils import InferredIndices
from kartothek.io_components.write import write_partition
from kartothek.serialization import DataFrameSerializer

@@ -35,7 +36,7 @@ def _hash_bucket(df: pd.DataFrame, subset: Optional[Sequence[str]], num_buckets:
def shuffle_store_dask_partitions(
ddf: dd.DataFrame,
table: str,
secondary_indices: Optional[Union[str, Sequence[str]]],
secondary_indices: Optional[InferredIndices],
metadata_version: int,
partition_on: List[str],
store_factory: StoreFactory,
@@ -109,28 +110,29 @@ def shuffle_store_dask_partitions(
unpacked_meta = ddf._meta

ddf = pack_payload(ddf, group_key=group_cols)
ddf = ddf.groupby(by=group_cols)
ddf = ddf.apply(
partial(
_unpack_store_partition,
secondary_indices=secondary_indices,
sort_partitions_by=sort_partitions_by,
table=table,
dataset_uuid=dataset_uuid,
partition_on=partition_on,
store_factory=store_factory,
df_serializer=df_serializer,
metadata_version=metadata_version,
unpacked_meta=unpacked_meta,
),
meta=("MetaPartition", "object"),
ddf_grouped = ddf.groupby(by=group_cols)

unpack = partial(
_unpack_store_partition,
secondary_indices=secondary_indices,
sort_partitions_by=sort_partitions_by,
table=table,
dataset_uuid=dataset_uuid,
partition_on=partition_on,
store_factory=store_factory,
df_serializer=df_serializer,
metadata_version=metadata_version,
unpacked_meta=unpacked_meta,
)
return cast(
da.Array, # Output type depends on meta but mypy cannot infer this easily.
ddf_grouped.apply(unpack, meta=("MetaPartition", "object")),
)
return ddf


def _unpack_store_partition(
df: pd.DataFrame,
secondary_indices: List[str],
secondary_indices: Optional[InferredIndices],
sort_partitions_by: List[str],
table: str,
dataset_uuid: str,
4 changes: 4 additions & 0 deletions kartothek/io/dask/bag_cube.py
@@ -442,6 +442,10 @@ def update_cube_from_bag(
metadata_dict: dask.bag.Bag
A dask bag object containing the compute graph to append to the cube returning the dict of dataset metadata
objects. The bag has a single partition with a single element.

See Also
--------
:ref:`mutating_datasets`
"""
return append_to_cube_from_bag_internal(
data=data,
38 changes: 38 additions & 0 deletions kartothek/io/dask/common_cube.py
@@ -3,6 +3,7 @@
"""
from collections import defaultdict
from functools import partial
from typing import Dict

import dask.bag as db

@@ -13,6 +14,8 @@
KTK_CUBE_METADATA_STORAGE_FORMAT,
KTK_CUBE_METADATA_VERSION,
)
from kartothek.core.cube.cube import Cube
from kartothek.core.dataset import DatasetMetadataBase
from kartothek.io_components.cube.append import check_existing_datasets
from kartothek.io_components.cube.common import check_blocksize, check_store_factory
from kartothek.io_components.cube.query import load_group, plan_query, quick_concat
@@ -34,6 +37,7 @@
parse_input_to_metapartition,
)
from kartothek.io_components.update import update_dataset_from_partitions
from kartothek.io_components.utils import _ensure_compatible_indices
from kartothek.io_components.write import (
raise_if_dataset_exists,
store_dataset_from_partitions,
@@ -48,6 +52,37 @@
)


def ensure_valid_cube_indices(
existing_datasets: Dict[str, DatasetMetadataBase], cube: Cube
) -> Cube:
"""
Parse all existing datasets and infer the required set of indices. We do not
allow indices to be removed or added in update steps at the moment and
need to make sure that existing ones are updated properly.
The returned `Cube` instance will be a copy of the input with
`index_columns` and `suppress_index_on` fields adjusted to reflect the
existing datasets.
"""
required_indices = set(cube.index_columns)
suppress_index_on = set(cube.suppress_index_on)
for ds in existing_datasets.values():
for internal_table in ds.table_meta:
dataset_columns = set(ds.table_meta[internal_table].names)
table_indices = required_indices & dataset_columns
compatible_indices = _ensure_compatible_indices(ds, table_indices)
if compatible_indices:
dataset_indices = set(compatible_indices)
suppress_index_on -= dataset_indices
required_indices |= dataset_indices
# Need to remove the dimension columns since they technically *are* indices,
# but the cube interface class declares them as not indexed and only adds
# them later on, provided they are not suppressed.
return cube.copy(
index_columns=required_indices - set(cube.dimension_columns),
suppress_index_on=suppress_index_on,
)
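
A minimal usage sketch of the new helper (editorial note; ``my_cube`` and ``my_store`` are hypothetical placeholders, the call pattern mirrors the one added to ``build_cube_from_bag_internal`` below):

    existing_datasets = discover_datasets_unchecked(my_cube.uuid_prefix, my_store)
    my_cube = ensure_valid_cube_indices(existing_datasets, my_cube)
    # my_cube now declares every secondary index that already exists on disk,
    # so the subsequent update cannot silently drop any of them.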


def build_cube_from_bag_internal(
data, cube, store, ktk_cube_dataset_ids, metadata, overwrite, partition_on
):
@@ -90,6 +125,7 @@ def build_cube_from_bag_internal(
existing_datasets = discover_datasets_unchecked(cube.uuid_prefix, store)
check_datasets_prebuild(ktk_cube_dataset_ids, cube, existing_datasets)
partition_on = prepare_ktk_partition_on(cube, ktk_cube_dataset_ids, partition_on)
cube = ensure_valid_cube_indices(existing_datasets, cube)

data = (
data.map(multiplex_user_input, cube=cube)
@@ -167,6 +203,7 @@ def extend_cube_from_bag_internal(
partition_on = prepare_ktk_partition_on(cube, ktk_cube_dataset_ids, partition_on)

existing_datasets = discover_datasets(cube, store)
cube = ensure_valid_cube_indices(existing_datasets, cube)
if overwrite:
existing_datasets_cut = {
ktk_cube_dataset_id: ds
@@ -337,6 +374,7 @@ def append_to_cube_from_bag_internal(
metadata = check_provided_metadata_dict(metadata, ktk_cube_dataset_ids)

existing_datasets = discover_datasets(cube, store)
cube = ensure_valid_cube_indices(existing_datasets, cube)
# existing_payload is set to empty because we're not checking against any existing payload. ktk will account for the
# compat check within 1 dataset
existing_payload = set()
12 changes: 9 additions & 3 deletions kartothek/io/dask/dataframe.py
@@ -32,6 +32,7 @@
from kartothek.io_components.read import dispatch_metapartitions_from_factory
from kartothek.io_components.update import update_dataset_from_partitions
from kartothek.io_components.utils import (
InferredIndices,
_ensure_compatible_indices,
check_single_table_dataset,
normalize_arg,
@@ -321,7 +322,7 @@ def _write_dataframe_partitions(
store: StoreFactory,
dataset_uuid: str,
table: str,
secondary_indices: List[str],
secondary_indices: Optional[InferredIndices],
shuffle: bool,
repartition_ratio: Optional[SupportsFloat],
num_buckets: int,
@@ -397,6 +398,10 @@ def update_dataset_from_ddf(
):
"""
Update a dataset from a dask.dataframe.

See Also
--------
:ref:`mutating_datasets`
"""
partition_on = normalize_arg("partition_on", partition_on)
secondary_indices = normalize_arg("secondary_indices", secondary_indices)
@@ -415,7 +420,8 @@
ds_factory=factory,
)

_ensure_compatible_indices(ds_factory, secondary_indices)
inferred_indices = _ensure_compatible_indices(ds_factory, secondary_indices)
del secondary_indices

if ds_factory is not None:
check_single_table_dataset(ds_factory, table)
@@ -425,7 +431,7 @@
store=store,
dataset_uuid=dataset_uuid or ds_factory.dataset_uuid,
table=table,
secondary_indices=secondary_indices,
secondary_indices=inferred_indices,
shuffle=shuffle,
repartition_ratio=repartition_ratio,
num_buckets=num_buckets,
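
For context, a hedged sketch of the code path this change affects (editorial note): updating an indexed dataset from a ``dask.dataframe`` without re-declaring its indices should now refresh all existing secondary indices. The dataset name and ``store_url`` reuse the documentation example above; passing a store URL directly and calling ``.compute()`` on the returned object are assumptions about the API surface:

    import dask.dataframe as dd
    import pandas as pd

    from kartothek.io.dask.dataframe import update_dataset_from_ddf

    new_df = pd.DataFrame({"payload": range(10), "i1": 3, "i2": "e"})
    ddf = dd.from_pandas(new_df, npartitions=1)

    # No secondary_indices are passed; the indices already stored for "i1" and
    # "i2" are inferred from the existing dataset and updated during the write.
    update_dataset_from_ddf(
        ddf, store=store_url, dataset_uuid="indexed_dataset", table="table"
    ).compute()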
4 changes: 4 additions & 0 deletions kartothek/io/dask/delayed.py
@@ -464,6 +464,10 @@ def update_dataset_from_delayed(

Parameters
----------

See Also
--------
:ref:`mutating_datasets`
"""
partition_on = normalize_arg("partition_on", partition_on)
store = normalize_arg("store", store)
13 changes: 9 additions & 4 deletions kartothek/io/eager.py
@@ -728,6 +728,10 @@ def update_dataset_from_dataframes(
Returns
-------
The dataset metadata object (:class:`~kartothek.core.dataset.DatasetMetadata`).

See Also
--------
:ref:`mutating_datasets`
"""
if load_dynamic_metadata is not True:
warnings.warn(
@@ -749,12 +753,13 @@
partition_on=partition_on,
)

secondary_indices = _ensure_compatible_indices(ds_factory, secondary_indices)
inferred_indices = _ensure_compatible_indices(ds_factory, secondary_indices)
del secondary_indices

mp = parse_input_to_metapartition(
df_list,
metadata_version=metadata_version,
expected_secondary_indices=secondary_indices,
expected_secondary_indices=inferred_indices,
)

if sort_partitions_by:
@@ -763,8 +768,8 @@
if partition_on:
mp = mp.partition_on(partition_on)

if secondary_indices:
mp = mp.build_indices(secondary_indices)
if inferred_indices:
mp = mp.build_indices(inferred_indices)

mp = mp.store_dataframes(
store=store, dataset_uuid=dataset_uuid, df_serializer=df_serializer
4 changes: 4 additions & 0 deletions kartothek/io/iter.py
@@ -220,6 +220,10 @@ def update_dataset_from_dataframes__iter(
Returns
-------
The dataset metadata object (:class:`~kartothek.core.dataset.DatasetMetadata`).

See Also
--------
:ref:`mutating_datasets`
"""
if load_dynamic_metadata is not True:
warnings.warn(
1 change: 1 addition & 0 deletions kartothek/io/testing/build_cube.py
@@ -1119,6 +1119,7 @@ def test_overwrite_rollback_ktk(driver, function_store):
store=function_store,
dataset_uuid=cube.ktk_dataset_uuid(cube.seed_dataset),
metadata_version=KTK_CUBE_METADATA_VERSION,
secondary_indices=["i1", "i2"],
)

df_source2 = pd.DataFrame(