Skip to content
This repository was archived by the owner on Jul 3, 2023. It is now read-only.

Commit

Permalink
PandasDataFrameResult: Convert non-list values to single row frame (#243
Browse files Browse the repository at this point in the history
)

This fixes behavior around scalars and returning single items.

We will now:

1. handle single scalar/object values when only they are present when creating a dataframe.
2. correctly handle a dictionary value as the only result in a dataframe (this fixes a regression).
3. Otherwise no other behavior should be impacted.
4. More unit test coverage added here.

Pandas `is_list_like` is key here, and covers objects that aren't pandas types, like dictionaries, lists, which provide some "indexing" with which to build a dataframe from.

Thanks @ianhoffman for finding and fixing this!

Contents of squashed commits that make this commit up:
* Handle scalar values in PandasDataFrameResult
* handle requests for all scalar values
* PR feedback:
* Use `is_list_like` instead of explicitly checking for dataframes and series
* Add a bunch of tests for building a result from different types, such as scalars, lists, numpy arrays, mixtures of arrays and dicts, etc.
* omit test for nested arrays since they break in Python 3.6. Would be nice to have this at some point though.
* PR Feedback:
* Tests for objects and scalars + dicts
* Improve comment to mention objects as well as scalars
* PR feedback: use yield from
  • Loading branch information
ianhoffman authored Dec 14, 2022
1 parent 0bd1d05 commit bcdbe13
Show file tree
Hide file tree
Showing 2 changed files with 85 additions and 5 deletions.
9 changes: 6 additions & 3 deletions hamilton/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,9 +166,12 @@ def build_result(**outputs: Dict[str, Any]) -> pd.DataFrame:
(value,) = outputs.values() # this works because it's length 1.
if isinstance(value, pd.DataFrame):
return value
elif isinstance(value, pd.Series):
return pd.DataFrame(outputs)
raise ValueError(f"Cannot build result. Cannot handle type {value}.")

if not any(pd.api.types.is_list_like(value) for value in outputs.values()):
# If we're dealing with all values that don't have any "index" that could be created
# (i.e. scalars, objects) coerce the output to a single-row, multi-column dataframe.
return pd.DataFrame([outputs])

return pd.DataFrame(outputs)


Expand Down
81 changes: 79 additions & 2 deletions tests/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,14 +119,36 @@ def test_SimplePythonDataFrameGraphAdapter_check_input_type_mismatch(node_type,
assert actual is False


def _gen_ints(n: int) -> typing.Iterator[int]:
"""Simple function to test that we can build results including generators."""
yield from range(n)


class _Foo:
"""Dummy object used for testing."""

def __init__(self, name: str):
self.name = name

def __eq__(self, other: typing.Any) -> bool:
return isinstance(other, _Foo) and other.name == self.name


@pytest.mark.parametrize(
"outputs,expected_result",
[
({"a": 1}, pd.DataFrame([{"a": 1}])),
({"a": pd.Series([1, 2, 3])}, pd.DataFrame({"a": pd.Series([1, 2, 3])})),
(
{"a": pd.DataFrame({"a": [1, 2, 3], "b": [11, 12, 13]})},
pd.DataFrame({"a": pd.Series([1, 2, 3]), "b": pd.Series([11, 12, 13])}),
),
({"a": [1, 2, 3]}, pd.DataFrame({"a": [1, 2, 3]})),
({"a": np.array([1, 2, 3])}, pd.DataFrame({"a": pd.Series([1, 2, 3])})),
({"a": {"b": 1, "c": "foo"}}, pd.DataFrame({"a": {"b": 1, "c": "foo"}})),
({"a": _gen_ints(3)}, pd.DataFrame({"a": pd.Series([0, 1, 2])})),
({"a": _Foo("bar")}, pd.DataFrame([{"a": _Foo("bar")}])),
({"a": 1, "bar": 2}, pd.DataFrame([{"a": 1, "bar": 2}])),
(
{"a": pd.Series([1, 2, 3]), "b": pd.Series([11, 12, 13])},
pd.DataFrame({"a": pd.Series([1, 2, 3]), "b": pd.Series([11, 12, 13])}),
Expand All @@ -147,13 +169,62 @@ def test_SimplePythonDataFrameGraphAdapter_check_input_type_mismatch(node_type,
{"a": pd.Series([1, 2, 3]), "b": pd.Series([11, 12, 13]), "c": pd.Series([0, 1, 2])}
),
),
(
{"a": [1, 2, 3], "b": [4, 5, 6]},
pd.DataFrame({"a": pd.Series([1, 2, 3]), "b": pd.Series([4, 5, 6])}),
),
(
{"a": np.array([1, 2, 3]), "b": np.array([4, 5, 6])},
pd.DataFrame({"a": pd.Series([1, 2, 3]), "b": pd.Series([4, 5, 6])}),
),
(
{"a": {"b": 1, "c": "foo"}, "d": {"b": 2}},
pd.DataFrame({"a": pd.Series({"b": 1, "c": "foo"}), "d": pd.Series({"b": 2})}),
),
(
{"a": _gen_ints(3), "b": _gen_ints(3)},
pd.DataFrame({"a": pd.Series([0, 1, 2]), "b": pd.Series([0, 1, 2])}),
),
(
{"a": pd.Series([1, 2, 3]), "b": 4},
pd.DataFrame({"a": pd.Series([1, 2, 3]), "b": pd.Series([4, 4, 4])}),
),
(
{"a": [1, 2, 3], "b": 4},
pd.DataFrame({"a": pd.Series([1, 2, 3]), "b": pd.Series([4, 4, 4])}),
),
(
{"a": {"foo": 1, "bar": 2}, "b": 4},
pd.DataFrame({"a": pd.Series([2, 1]), "b": pd.Series([4, 4])}).rename(
index=lambda i: ["bar", "foo"][i]
),
),
(
{"a": pd.Series([1, 2, 3]), "b": pd.Series([4, 5, 6])},
pd.DataFrame({"a": pd.Series([1, 2, 3]), "b": pd.Series([4, 5, 6])}),
),
],
ids=[
"test-single-scalar",
"test-single-series",
"test-single-dataframe",
"test-single-list",
"test-single-array",
"test-single-dict",
"test-single-generator",
"test-single-object",
"test-multiple-scalars",
"test-multiple-series",
"test-multiple-series-with-scalar",
"test-multiple-series-with-index",
"test-multiple-lists",
"test-multiple-arrays",
"test-multiple-dicts",
"test-multiple-generators",
"test-scalar-and-series",
"test-scalar-and-list",
"test-scalar-and-dict",
"test-series-and-list",
],
)
def test_PandasDataFrameResult_build_result(outputs, expected_result):
Expand All @@ -166,7 +237,6 @@ def test_PandasDataFrameResult_build_result(outputs, expected_result):
@pytest.mark.parametrize(
"outputs",
[
({"a": 1}),
(
{
"a": pd.DataFrame({"a": [1, 2, 3], "b": [11, 12, 13]}),
Expand All @@ -180,11 +250,18 @@ def test_PandasDataFrameResult_build_result(outputs, expected_result):
"c": pd.DataFrame({"d": [0, 0, 0]}),
}
),
({"a": [1, 2], "b": {"foo": "bar"}}),
({"a": [1, 2], "b": [3, 4, 5]}),
({"a": np.array([1, 2]), "b": np.array([3, 4, 5])}),
({"a": _gen_ints(3), "b": _gen_ints(4)}),
],
ids=[
"test-single-value",
"test-multiple-dataframes",
"test-multiple-series-with-dataframe",
"test-lists-and-dicts",
"test-mismatched-lists",
"test-mismatched-arrays",
"test-mismatched-generators",
],
)
def test_PandasDataFrameResult_build_result_errors(outputs):
Expand Down

0 comments on commit bcdbe13

Please sign in to comment.