From 678656bf339f6a9cfddb15fb13e79d2e1a18ab80 Mon Sep 17 00:00:00 2001 From: Stefan Krawczyk Date: Thu, 15 Sep 2022 17:43:33 -0700 Subject: [PATCH 1/5] Refactors base typing imports Importing `typing` and using that as a prefix was getting unsightly. So moved to importing the types explicitly. --- hamilton/base.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/hamilton/base.py b/hamilton/base.py index 469643a7..a578ecbe 100644 --- a/hamilton/base.py +++ b/hamilton/base.py @@ -5,7 +5,7 @@ import abc import collections import inspect -import typing +from typing import Any, Dict, List, Tuple, Type import numpy as np import pandas as pd @@ -23,7 +23,7 @@ class ResultMixin(object): @staticmethod @abc.abstractmethod - def build_result(**outputs: typing.Dict[str, typing.Any]) -> typing.Any: + def build_result(**outputs: Dict[str, Any]) -> Any: """This function builds the result given the computed values.""" pass @@ -32,7 +32,7 @@ class DictResult(ResultMixin): """Simple function that returns the dict of column -> value results.""" @staticmethod - def build_result(**outputs: typing.Dict[str, typing.Any]) -> typing.Dict: + def build_result(**outputs: Dict[str, Any]) -> Dict: """This function builds a simple dict of output -> computed values.""" return outputs @@ -41,7 +41,7 @@ class PandasDataFrameResult(ResultMixin): """Mixin for building a pandas dataframe from the result""" @staticmethod - def build_result(**outputs: typing.Dict[str, typing.Any]) -> pd.DataFrame: + def build_result(**outputs: Dict[str, Any]) -> pd.DataFrame: # TODO check inputs are pd.Series, arrays, or scalars -- else error # TODO do a basic index check across pd.Series and flag where mismatches occur? 
if len(outputs) == 1: @@ -61,7 +61,7 @@ class NumpyMatrixResult(ResultMixin): """ @staticmethod - def build_result(**outputs: typing.Dict[str, typing.Any]) -> np.matrix: + def build_result(**outputs: Dict[str, Any]) -> np.matrix: """Builds a numpy matrix from the passed in, inputs. :param outputs: function_name -> np.array. @@ -108,7 +108,7 @@ class HamiltonGraphAdapter(ResultMixin): @staticmethod @abc.abstractmethod - def check_input_type(node_type: typing.Type, input_value: typing.Any) -> bool: + def check_input_type(node_type: Type, input_value: Any) -> bool: """Used to check whether the user inputs match what the execution strategy & functions can handle. :param node_type: The type of the node. @@ -119,7 +119,7 @@ def check_input_type(node_type: typing.Type, input_value: typing.Any) -> bool: @staticmethod @abc.abstractmethod - def check_node_type_equivalence(node_type: typing.Type, input_type: typing.Type) -> bool: + def check_node_type_equivalence(node_type: Type, input_type: Type) -> bool: """Used to check whether two types are equivalent. This is used when the function graph is being created and we're statically type checking the annotations @@ -132,7 +132,7 @@ def check_node_type_equivalence(node_type: typing.Type, input_type: typing.Type) pass @abc.abstractmethod - def execute_node(self, node: node.Node, kwargs: typing.Dict[str, typing.Any]) -> typing.Any: + def execute_node(self, node: node.Node, kwargs: Dict[str, Any]) -> Any: """Given a node that represents a hamilton function, execute it. Note, in some adapters this might just return some type of "future". @@ -147,8 +147,8 @@ class SimplePythonDataFrameGraphAdapter(HamiltonGraphAdapter, PandasDataFrameRes """This is the default (original Hamilton) graph adapter. 
It uses plain python and builds a dataframe result.""" @staticmethod - def check_input_type(node_type: typing.Type, input_value: typing.Any) -> bool: - if node_type == typing.Any: + def check_input_type(node_type: Type, input_value: Any) -> bool: + if node_type == Any: return True elif inspect.isclass(node_type) and isinstance(input_value, node_type): return True @@ -171,10 +171,10 @@ def check_input_type(node_type: typing.Type, input_value: typing.Any) -> bool: return False @staticmethod - def check_node_type_equivalence(node_type: typing.Type, input_type: typing.Type) -> bool: + def check_node_type_equivalence(node_type: Type, input_type: Type) -> bool: return node_type == input_type - def execute_node(self, node: node.Node, kwargs: typing.Dict[str, typing.Any]) -> typing.Any: + def execute_node(self, node: node.Node, kwargs: Dict[str, Any]) -> Any: return node.callable(**kwargs) @@ -186,6 +186,6 @@ def __init__(self, result_builder: ResultMixin): if self.result_builder is None: raise ValueError("You must provide a ResultMixin object for `result_builder`.") - def build_result(self, **outputs: typing.Dict[str, typing.Any]) -> typing.Any: + def build_result(self, **outputs: Dict[str, Any]) -> Any: """Delegates to the result builder function supplied.""" return self.result_builder.build_result(**outputs) From d4bf48e37e48e5d7f268076da8463d8a20fd88d9 Mon Sep 17 00:00:00 2001 From: Stefan Krawczyk Date: Thu, 22 Sep 2022 12:10:48 -0700 Subject: [PATCH 2/5] Adds some basic index type checking to df creation Related to issue #191, this commit is to help surface index type issues. Specifically: 1. Warn if there are index type mismatches. 2. Require you to set your logger to debug if you want to see more details. 3. Provide a "ResultBuilder" class that uses strict index type matching so if you want to error on index type mismatches, this is the results builder to use. 
I don't think we should build anything more custom unless there's a clear common use case - user contributed result builders sound like an interesting idea. --- hamilton/base.py | 123 +++++++++++++++++++++++++++- tests/test_base.py | 194 ++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 315 insertions(+), 2 deletions(-) diff --git a/hamilton/base.py b/hamilton/base.py index a578ecbe..2c43df77 100644 --- a/hamilton/base.py +++ b/hamilton/base.py @@ -5,14 +5,18 @@ import abc import collections import inspect +import logging from typing import Any, Dict, List, Tuple, Type import numpy as np import pandas as pd import typing_inspect +from pandas.core.indexes import extension as pd_extension from . import node +logger = logging.getLogger(__name__) + class ResultMixin(object): """Base class housing the static function. @@ -40,10 +44,96 @@ def build_result(**outputs: Dict[str, Any]) -> Dict: class PandasDataFrameResult(ResultMixin): """Mixin for building a pandas dataframe from the result""" + @staticmethod + def pandas_index_types( + outputs: Dict[str, Any] + ) -> Tuple[Dict[str, List[str]], Dict[str, List[str]], Dict[str, List[str]]]: + """This function creates three dictionaries according to whether there is an index type or not. + + The three dicts we create are: + 1. Dict of index type to list of outputs that match it. + 2. Dict of time series / categorical index types to list of outputs that match it. + 3. Dict of `no-index` key to list of outputs with no index type. + + :param outputs: the dict we're trying to create a result from. 
+ :return: dict of all index types, dict of time series/categorical index types, dict if there is no index + """ + all_index_types = collections.defaultdict(list) + time_indexes = collections.defaultdict(list) + no_indexes = collections.defaultdict(list) + for output_name, output_value in outputs.items(): + if isinstance(output_value, (pd.DataFrame, pd.Series)): + dict_key = f"{output_value.index.__class__.__name__}:::{output_value.index.dtype}" + if isinstance(output_value.index, pd_extension.NDArrayBackedExtensionIndex): + # it's a time index -- these will produce garbage if not aligned properly. + time_indexes[dict_key].append(output_name) + else: + dict_key = "no-index" + no_indexes[dict_key].append(output_name) + all_index_types[dict_key].append(output_name) + return all_index_types, time_indexes, no_indexes + + @staticmethod + def check_pandas_index_types_match( + all_index_types: Dict[str, List[str]], + time_indexes: Dict[str, List[str]], + no_indexes: Dict[str, List[str]], + ) -> bool: + """Checks that pandas index types match. + + This only logs warning errors, and if debug is enabled, a debug statement to list index types. + """ + no_index_length = len(no_indexes) + time_indexes_length = len(time_indexes) + all_indexes_length = len(all_index_types) + number_with_indexes = all_indexes_length - no_index_length + types_match = True # default to True + # if there is more than one time index + if time_indexes_length > 1: + logger.warning( + f"WARNING: Time/Categorical index type mismatches detected - check output to ensure Pandas " + f"is doing what you intend to do. Else change the index types to match. Set logger to debug " + f"to see index types." 
+ ) + types_match = False + # if there is more than one index type and it's not explained by the time indexes then + if number_with_indexes > 1 and all_indexes_length > time_indexes_length: + logger.warning( + f"WARNING: Multiple index types detected - check output to ensure Pandas is " + f"doing what you intend to do. Else change the index types to match. Set logger to debug to " + f"see index types." + ) + types_match = False + elif number_with_indexes == 1 and no_index_length > 0: + logger.warning( + f"WARNING: a single pandas index was found, but there are also {no_index_length} outputs without " + f"an index. Those values will be made constants throughout the values of the index." + ) + # Strictly speaking the index types match -- there is only one -- so setting to True. + types_match = True + # if all indexes matches no indexes + elif no_index_length == all_indexes_length: + logger.warning( + "It appears no Pandas index type was detected. This will likely break when trying to " + "create a DataFrame. E.g. are you requesting all scalar values? Use a different result " + "builder or return at least one Pandas object with an index." + ) + types_match = False + if logger.isEnabledFor(logging.DEBUG): + import pprint + + pretty_string = pprint.pformat(dict(all_index_types)) + logger.debug(f"Index types encountered:\n{pretty_string}.") + return types_match + @staticmethod def build_result(**outputs: Dict[str, Any]) -> pd.DataFrame: # TODO check inputs are pd.Series, arrays, or scalars -- else error - # TODO do a basic index check across pd.Series and flag where mismatches occur? + output_index_type_tuple = PandasDataFrameResult.pandas_index_types(outputs) + # this next line just log warnings + # we don't actually care about the result since this is the current default behavior. + PandasDataFrameResult.check_pandas_index_types_match(*output_index_type_tuple) + if len(outputs) == 1: (value,) = outputs.values() # this works because it's length 1. 
if isinstance(value, pd.DataFrame): @@ -54,6 +144,37 @@ def build_result(**outputs: Dict[str, Any]) -> pd.DataFrame: return pd.DataFrame(outputs) +class StrictIndexTypePandasDataFrameResult(PandasDataFrameResult): + """A ResultBuilder that produces a dataframe only if the index types match exactly. + + Note: If there is no index type on some outputs, e.g. the value is a scalar, as long as there exists a single pandas + index type, no error will be thrown, because a dataframe can be easily created. + + To use: + from hamilton import base, driver + strict_builder = base.StrictIndexTypePandasDataFrameResult() + adapter = base.SimplePythonGraphAdapter(strict_builder) + ... + dr = driver.Driver(config, *modules, adapter=adapter) + df = dr.execute(...) # this will now error if index types mismatch. + """ + + @staticmethod + def build_result(**outputs: Dict[str, Any]) -> pd.DataFrame: + # TODO check inputs are pd.Series, arrays, or scalars -- else error + output_index_type_tuple = PandasDataFrameResult.pandas_index_types(outputs) + indexes_match = PandasDataFrameResult.check_pandas_index_types_match( + *output_index_type_tuple + ) + if not indexes_match: + raise ValueError( + "Error: pandas index types did not match exactly. " + f"Found the following indexes:\n{dict(output_index_type_tuple[0])}" + ) + + return PandasDataFrameResult.build_result(**outputs) + + class NumpyMatrixResult(ResultMixin): """Mixin for building a Numpy Matrix from the result of walking the graph. 
diff --git a/tests/test_base.py b/tests/test_base.py index 7f239ff6..d769a60e 100644 --- a/tests/test_base.py +++ b/tests/test_base.py @@ -177,7 +177,199 @@ def test_PandasDataFrameResult_build_result(outputs, expected_result): ], ) def test_PandasDataFrameResult_build_result_errors(outputs): - """Tests the happy case of PandasDataFrameResult.build_result()""" + """Tests the error case of PandasDataFrameResult.build_result()""" pdfr = base.PandasDataFrameResult() with pytest.raises(ValueError): pdfr.build_result(**outputs) + + +@pytest.mark.parametrize( + "outputs,expected_result", + [ + ({"a": pd.Series([1, 2, 3])}, ({"RangeIndex:::int64": ["a"]}, {}, {})), + ( + {"a": pd.Series([1, 2, 3]), "b": pd.Series([3, 4, 5])}, + ({"RangeIndex:::int64": ["a", "b"]}, {}, {}), + ), + ( + { + "b": pd.Series( + [3, 4, 5], index=pd.DatetimeIndex(["2022-01", "2022-02", "2022-03"], freq="MS") + ) + }, + ( + {"DatetimeIndex:::datetime64[ns]": ["b"]}, + {"DatetimeIndex:::datetime64[ns]": ["b"]}, + {}, + ), + ), + ({"c": 1}, ({"no-index": ["c"]}, {}, {"no-index": ["c"]})), + ( + { + "a": pd.Series([1, 2, 3]), + "b": 1, + "c": pd.Series( + [3, 4, 5], index=pd.DatetimeIndex(["2022-01", "2022-02", "2022-03"], freq="MS") + ), + }, + ( + { + "DatetimeIndex:::datetime64[ns]": ["c"], + "RangeIndex:::int64": ["a"], + "no-index": ["b"], + }, + {"DatetimeIndex:::datetime64[ns]": ["c"]}, + {"no-index": ["b"]}, + ), + ), + ({"a": pd.DataFrame({"a": [1, 2, 3]})}, ({"RangeIndex:::int64": ["a"]}, {}, {})), + ], + ids=[ + "int-index", + "int-index-double", + "ts-index", + "no-index", + "multiple-different-indexes", + "df-index", + ], +) +def test_PandasDataFrameResult_pandas_index_types(outputs, expected_result): + """Tests exercising the function to return pandas index types from outputs""" + pdfr = base.PandasDataFrameResult() + actual = pdfr.pandas_index_types(outputs) + assert dict(actual[0]) == expected_result[0] + assert dict(actual[1]) == expected_result[1] + assert dict(actual[2]) == 
expected_result[2] + + +@pytest.mark.parametrize( + "all_index_types,time_indexes,no_indexes,expected_result", + [ + ({"foo": ["a", "b", "c"]}, {}, {}, True), + ({"int-index": ["a"], "no-index": ["b"]}, {}, {"no-index": ["b"]}, True), + ({"ts-1": ["a"], "ts-2": ["b"]}, {"ts-1": ["a"], "ts-2": ["b"]}, {}, False), + ({"float-index": ["a"], "int-index": ["b"]}, {}, {}, False), + ({"no-index": ["a", "b"]}, {}, {"no-index": ["a", "b"]}, False), + ], + ids=[ + "all-the-same", # True + "single-index-with-no-index", # True + "multiple-ts", # False + "multiple-indexes-not-ts", # False + "no-indexes-at-all", # False4 + ], +) +def test_PandasDataFrameResult_check_pandas_index_types_match( + all_index_types, time_indexes, no_indexes, expected_result +): + """Tests exercising the function to determine whether pandas index types match""" + # setup to test conditional if statement on logger level + import logging + + logger = logging.getLogger("hamilton.base") # get logger of base module. + logger.setLevel(logging.DEBUG) + pdfr = base.PandasDataFrameResult() + actual = pdfr.check_pandas_index_types_match(all_index_types, time_indexes, no_indexes) + assert actual == expected_result + + +@pytest.mark.parametrize( + "outputs,expected_result", + [ + ({"a": pd.Series([1, 2, 3])}, pd.DataFrame({"a": pd.Series([1, 2, 3])})), + ( + { + "a": pd.Series( + [1, 2, 3], index=pd.DatetimeIndex(["2022-01", "2022-02", "2022-03"], freq="MS") + ), + "b": pd.Series( + [3, 4, 5], index=pd.DatetimeIndex(["2022-01", "2022-02", "2022-03"], freq="MS") + ), + }, + pd.DataFrame( + { + "a": pd.Series( + [1, 2, 3], + index=pd.DatetimeIndex(["2022-01", "2022-02", "2022-03"], freq="MS"), + ), + "b": pd.Series( + [3, 4, 5], + index=pd.DatetimeIndex(["2022-01", "2022-02", "2022-03"], freq="MS"), + ), + } + ), + ), + ( + { + "a": pd.Series( + [1, 2, 3], index=pd.DatetimeIndex(["2022-01", "2022-02", "2022-03"], freq="MS") + ), + "b": 4, + }, + pd.DataFrame( + { + "a": pd.Series( + [1, 2, 3], + 
index=pd.DatetimeIndex(["2022-01", "2022-02", "2022-03"], freq="MS"), + ), + "b": 4, + } + ), + ), + ], + ids=[ + "test-same-index-simple", + "test-same-index-ts", + "test-index-with-scalar", + ], +) +def test_StrictIndexTypePandasDataFrameResult_build_result(outputs, expected_result): + """Tests the happy case of StrictIndexTypePandasDataFrameResult.build_result()""" + sitpdfr = base.StrictIndexTypePandasDataFrameResult() + actual = sitpdfr.build_result(**outputs) + pd.testing.assert_frame_equal(actual, expected_result) + + +@pytest.mark.parametrize( + "outputs", + [ + ( + { + "a": pd.Series([1, 2, 3], index=[0, 1, 2]), + "b": pd.Series([1, 2, 3], index=[0.0, 1.0, 2.0]), + } + ), + ( + { + "series1": pd.Series( + [1, 2, 3], index=pd.DatetimeIndex(["2022-01", "2022-02", "2022-03"], freq="MS") + ), + "series2": pd.Series( + [4, 5, 6], + index=pd.PeriodIndex(year=[2022, 2022, 2022], month=[1, 2, 3], freq="M"), + ), + "series3": pd.Series( + [4, 5, 6], + index=pd.PeriodIndex( + year=[2022, 2022, 2022], month=[1, 1, 1], day=[3, 4, 5], freq="B" + ), + ), + "series4": pd.Series( + [4, 5, 6], + index=pd.PeriodIndex( + year=[2022, 2022, 2022], month=[1, 1, 1], day=[4, 11, 18], freq="W" + ), + ), + } + ), + ], + ids=[ + "test-int-float", + "test-different-ts-indexes", + ], +) +def test_StrictIndexTypePandasDataFrameResult_build_result_errors(outputs): + """Tests the error case of StrictIndexTypePandasDataFrameResult.build_result()""" + sitpdfr = base.StrictIndexTypePandasDataFrameResult() + with pytest.raises(ValueError): + sitpdfr.build_result(**outputs) From 23eb68587291d3d47fead34749dde1fc884b90f7 Mon Sep 17 00:00:00 2001 From: Stefan Krawczyk Date: Thu, 22 Sep 2022 12:22:55 -0700 Subject: [PATCH 3/5] Adds time index type check compatible with py3.6 Pandas dropped support for python 3.6 in something like 1.2. So pandas 1.1.5 is what we're using in our CI system, and that does not have the `NDArrayBackedExtensionIndex` type. 
So I'm guessing here, but looking at the 1.1.5 source, we instead want `ExtensionIndex`. --- hamilton/base.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/hamilton/base.py b/hamilton/base.py index 2c43df77..1b30a428 100644 --- a/hamilton/base.py +++ b/hamilton/base.py @@ -64,7 +64,11 @@ def pandas_index_types( for output_name, output_value in outputs.items(): if isinstance(output_value, (pd.DataFrame, pd.Series)): dict_key = f"{output_value.index.__class__.__name__}:::{output_value.index.dtype}" - if isinstance(output_value.index, pd_extension.NDArrayBackedExtensionIndex): + try: + index_type = getattr(pd_extension, "NDArrayBackedExtensionIndex") + except AttributeError: # for python 3.6 & pandas 1.1.5 + index_type = getattr(pd_extension, "ExtensionIndex") + if isinstance(output_value.index, index_type): # it's a time index -- these will produce garbage if not aligned properly. time_indexes[dict_key].append(output_name) else: From 22b49df727620ddb34d2bc70c4286adea0712251 Mon Sep 17 00:00:00 2001 From: Stefan Krawczyk Date: Fri, 23 Sep 2022 14:19:48 -0700 Subject: [PATCH 4/5] Adds unit test for pandas index with DF creation TIL: you can create a dataframe and pass in an index object and it'll happily use it as a column. So this test should exist for dataframe creation since it's a valid case. But for the index type checking, I'm adding it here even though it does not have an explicit index. Therefore, one could make the argument it doesn't qualify here. But, I'd rather push people to be explicit in their code, e.g. if they want to be strict on indexes, then they should make the index a series, rather than passing an Index object. 
--- hamilton/base.py | 4 ++++ tests/test_base.py | 13 +++++++++++++ 2 files changed, 17 insertions(+) diff --git a/hamilton/base.py b/hamilton/base.py index 1b30a428..0e8bee67 100644 --- a/hamilton/base.py +++ b/hamilton/base.py @@ -71,6 +71,10 @@ def pandas_index_types( if isinstance(output_value.index, index_type): # it's a time index -- these will produce garbage if not aligned properly. time_indexes[dict_key].append(output_name) + elif isinstance(output_value, pd.Index): + # there is no index on this - so it's just an integer one. + int_index = pd.Series([1, 2, 3], index=[0, 1, 2]) + dict_key = f"{int_index.index.__class__.__name__}:::{int_index.dtype}" else: dict_key = "no-index" no_indexes[dict_key].append(output_name) diff --git a/tests/test_base.py b/tests/test_base.py index d769a60e..2c6206e9 100644 --- a/tests/test_base.py +++ b/tests/test_base.py @@ -137,12 +137,23 @@ def test_SimplePythonDataFrameGraphAdapter_check_input_type_mismatch(node_type, {"a": pd.Series([1, 2, 3]), "b": pd.Series([11, 12, 13]), "c": pd.Series([1, 1, 1])} ), ), + ( + { + "a": pd.Series([1, 2, 3]), + "b": pd.Series([11, 12, 13]), + "c": pd.Series([11, 12, 13]).index, + }, + pd.DataFrame( + {"a": pd.Series([1, 2, 3]), "b": pd.Series([11, 12, 13]), "c": pd.Series([0, 1, 2])} + ), + ), ], ids=[ "test-single-series", "test-single-dataframe", "test-multiple-series", "test-multiple-series-with-scalar", + "test-multiple-series-with-index", ], ) def test_PandasDataFrameResult_build_result(outputs, expected_result): @@ -223,6 +234,7 @@ def test_PandasDataFrameResult_build_result_errors(outputs): ), ), ({"a": pd.DataFrame({"a": [1, 2, 3]})}, ({"RangeIndex:::int64": ["a"]}, {}, {})), + ({"a": pd.Series([1, 2, 3]).index}, ({"Int64Index:::int64": ["a"]}, {}, {})), ], ids=[ "int-index", @@ -231,6 +243,7 @@ def test_PandasDataFrameResult_build_result_errors(outputs): "no-index", "multiple-different-indexes", "df-index", + "index-object", ], ) def 
test_PandasDataFrameResult_pandas_index_types(outputs, expected_result): From d7bc3f50a050b069ca86c4f3792a9a05d854e044 Mon Sep 17 00:00:00 2001 From: Stefan Krawczyk Date: Fri, 23 Sep 2022 14:35:04 -0700 Subject: [PATCH 5/5] Refactors pandas_index_types to be more legible So that way it's clearer what's going on and why. I decided to nest the helpers as private functions inside the static method because I don't really want them used outside of that function. --- hamilton/base.py | 48 +++++++++++++++++++++++++++++++++++------------- 1 file changed, 35 insertions(+), 13 deletions(-) diff --git a/hamilton/base.py b/hamilton/base.py index 0e8bee67..ddbcdd49 100644 --- a/hamilton/base.py +++ b/hamilton/base.py @@ -6,7 +6,7 @@ import collections import inspect import logging -from typing import Any, Dict, List, Tuple, Type +from typing import Any, Dict, List, Tuple, Type, Union import numpy as np import pandas as pd @@ -61,20 +61,39 @@ def pandas_index_types( all_index_types = collections.defaultdict(list) time_indexes = collections.defaultdict(list) no_indexes = collections.defaultdict(list) + + def index_key_name(pd_object: Union[pd.DataFrame, pd.Series]) -> str: + """Creates a string helping identify the index and its type. + Useful for disambiguating time related indexes.""" + return f"{pd_object.index.__class__.__name__}:::{pd_object.index.dtype}" + + def get_parent_time_index_type(): + """Helper to pull the right time index parent class.""" + if hasattr( + pd_extension, "NDArrayBackedExtensionIndex" + ): # for python 3.7+ & pandas >= 1.2 + index_type = pd_extension.NDArrayBackedExtensionIndex + elif hasattr(pd_extension, "ExtensionIndex"): # for python 3.6 & pandas < 1.2 + index_type = pd_extension.ExtensionIndex + else: + index_type = None # weird case, but not worth breaking for. 
+ return index_type + for output_name, output_value in outputs.items(): - if isinstance(output_value, (pd.DataFrame, pd.Series)): - dict_key = f"{output_value.index.__class__.__name__}:::{output_value.index.dtype}" - try: - index_type = getattr(pd_extension, "NDArrayBackedExtensionIndex") - except AttributeError: # for python 3.6 & pandas 1.1.5 - index_type = getattr(pd_extension, "ExtensionIndex") - if isinstance(output_value.index, index_type): + if isinstance( + output_value, (pd.DataFrame, pd.Series) + ): # if it has an index -- let's grab its type + dict_key = index_key_name(output_value) + if isinstance(output_value.index, get_parent_time_index_type()): # it's a time index -- these will produce garbage if not aligned properly. time_indexes[dict_key].append(output_name) - elif isinstance(output_value, pd.Index): - # there is no index on this - so it's just an integer one. - int_index = pd.Series([1, 2, 3], index=[0, 1, 2]) - dict_key = f"{int_index.index.__class__.__name__}:::{int_index.dtype}" + elif isinstance( + output_value, pd.Index + ): # there is no index on this - so it's just an integer one. + int_index = pd.Series( + [1, 2, 3], index=[0, 1, 2] + ) # dummy to get right values for string. + dict_key = index_key_name(int_index) else: dict_key = "no-index" no_indexes[dict_key].append(output_name) @@ -175,9 +194,12 @@ def build_result(**outputs: Dict[str, Any]) -> pd.DataFrame: *output_index_type_tuple ) if not indexes_match: + import pprint + + pretty_string = pprint.pformat(dict(output_index_type_tuple[0])) raise ValueError( "Error: pandas index types did not match exactly. " - f"Found the following indexes:\n{dict(output_index_type_tuple[0])}" + f"Found the following indexes:\n{pretty_string}" ) return PandasDataFrameResult.build_result(**outputs)