PandasDataFrameResult: Convert non-list values to single row frame (#243

) This fixes behavior around scalars and returning single items. We will now: 1. handle single scalar/object values when only they are present when creating a dataframe. 2. correctly handle a dictionary value as the only result in a dataframe (this fixes a regression). 3. Otherwise no other behavior should be impacted. 4. More unit test coverage added here. Pandas `is_list_like` is key here, and covers objects that aren't pandas types, like dictionaries, lists, which provide some "indexing" with which to build a dataframe from. Thanks @ianhoffman for finding and fixing this! Contents of squashed commits that make this commit up: * Handle scalar values in PandasDataFrameResult * handle requests for all scalar values * PR feedback: * Use `is_list_like` instead of explicitly checking for dataframes and series * Add a bunch of tests for building a result from different types, such as scalars, lists, numpy arrays, mixtures of arrays and dicts, etc. * omit test for nested arrays since they break in Python 3.6. Would be nice to have this at some point though. * PR Feedback: * Tests for objects and scalars + dicts * Improve comment to mention objects as well as scalars * PR feedback: use yield from
stitchfix · Dec 14, 2022 · bcdbe13 · bcdbe13
1 parent 0bd1d05
commit bcdbe13
Show file tree

Hide file tree

Showing 2 changed files with 85 additions and 5 deletions.
diff --git a/hamilton/base.py b/hamilton/base.py
@@ -166,9 +166,12 @@ def build_result(**outputs: Dict[str, Any]) -> pd.DataFrame:
             (value,) = outputs.values()  # this works because it's length 1.
             if isinstance(value, pd.DataFrame):
                 return value
-            elif isinstance(value, pd.Series):
-                return pd.DataFrame(outputs)
-            raise ValueError(f"Cannot build result. Cannot handle type {value}.")
+
+        if not any(pd.api.types.is_list_like(value) for value in outputs.values()):
+            # If we're dealing with all values that don't have any "index" that could be created
+            # (i.e. scalars, objects) coerce the output to a single-row, multi-column dataframe.
+            return pd.DataFrame([outputs])
+
         return pd.DataFrame(outputs)
 
 

diff --git a/tests/test_base.py b/tests/test_base.py
@@ -119,14 +119,36 @@ def test_SimplePythonDataFrameGraphAdapter_check_input_type_mismatch(node_type,
     assert actual is False
 
 
+def _gen_ints(n: int) -> typing.Iterator[int]:
+    """Simple function to test that we can build results including generators."""
+    yield from range(n)
+
+
+class _Foo:
+    """Dummy object used for testing."""
+
+    def __init__(self, name: str):
+        self.name = name
+
+    def __eq__(self, other: typing.Any) -> bool:
+        return isinstance(other, _Foo) and other.name == self.name
+
+
 @pytest.mark.parametrize(
     "outputs,expected_result",
     [
+        ({"a": 1}, pd.DataFrame([{"a": 1}])),
         ({"a": pd.Series([1, 2, 3])}, pd.DataFrame({"a": pd.Series([1, 2, 3])})),
         (
             {"a": pd.DataFrame({"a": [1, 2, 3], "b": [11, 12, 13]})},
             pd.DataFrame({"a": pd.Series([1, 2, 3]), "b": pd.Series([11, 12, 13])}),
         ),
+        ({"a": [1, 2, 3]}, pd.DataFrame({"a": [1, 2, 3]})),
+        ({"a": np.array([1, 2, 3])}, pd.DataFrame({"a": pd.Series([1, 2, 3])})),
+        ({"a": {"b": 1, "c": "foo"}}, pd.DataFrame({"a": {"b": 1, "c": "foo"}})),
+        ({"a": _gen_ints(3)}, pd.DataFrame({"a": pd.Series([0, 1, 2])})),
+        ({"a": _Foo("bar")}, pd.DataFrame([{"a": _Foo("bar")}])),
+        ({"a": 1, "bar": 2}, pd.DataFrame([{"a": 1, "bar": 2}])),
         (
             {"a": pd.Series([1, 2, 3]), "b": pd.Series([11, 12, 13])},
             pd.DataFrame({"a": pd.Series([1, 2, 3]), "b": pd.Series([11, 12, 13])}),
@@ -147,13 +169,62 @@ def test_SimplePythonDataFrameGraphAdapter_check_input_type_mismatch(node_type,
                 {"a": pd.Series([1, 2, 3]), "b": pd.Series([11, 12, 13]), "c": pd.Series([0, 1, 2])}
             ),
         ),
+        (
+            {"a": [1, 2, 3], "b": [4, 5, 6]},
+            pd.DataFrame({"a": pd.Series([1, 2, 3]), "b": pd.Series([4, 5, 6])}),
+        ),
+        (
+            {"a": np.array([1, 2, 3]), "b": np.array([4, 5, 6])},
+            pd.DataFrame({"a": pd.Series([1, 2, 3]), "b": pd.Series([4, 5, 6])}),
+        ),
+        (
+            {"a": {"b": 1, "c": "foo"}, "d": {"b": 2}},
+            pd.DataFrame({"a": pd.Series({"b": 1, "c": "foo"}), "d": pd.Series({"b": 2})}),
+        ),
+        (
+            {"a": _gen_ints(3), "b": _gen_ints(3)},
+            pd.DataFrame({"a": pd.Series([0, 1, 2]), "b": pd.Series([0, 1, 2])}),
+        ),
+        (
+            {"a": pd.Series([1, 2, 3]), "b": 4},
+            pd.DataFrame({"a": pd.Series([1, 2, 3]), "b": pd.Series([4, 4, 4])}),
+        ),
+        (
+            {"a": [1, 2, 3], "b": 4},
+            pd.DataFrame({"a": pd.Series([1, 2, 3]), "b": pd.Series([4, 4, 4])}),
+        ),
+        (
+            {"a": {"foo": 1, "bar": 2}, "b": 4},
+            pd.DataFrame({"a": pd.Series([2, 1]), "b": pd.Series([4, 4])}).rename(
+                index=lambda i: ["bar", "foo"][i]
+            ),
+        ),
+        (
+            {"a": pd.Series([1, 2, 3]), "b": pd.Series([4, 5, 6])},
+            pd.DataFrame({"a": pd.Series([1, 2, 3]), "b": pd.Series([4, 5, 6])}),
+        ),
     ],
     ids=[
+        "test-single-scalar",
         "test-single-series",
         "test-single-dataframe",
+        "test-single-list",
+        "test-single-array",
+        "test-single-dict",
+        "test-single-generator",
+        "test-single-object",
+        "test-multiple-scalars",
         "test-multiple-series",
         "test-multiple-series-with-scalar",
         "test-multiple-series-with-index",
+        "test-multiple-lists",
+        "test-multiple-arrays",
+        "test-multiple-dicts",
+        "test-multiple-generators",
+        "test-scalar-and-series",
+        "test-scalar-and-list",
+        "test-scalar-and-dict",
+        "test-series-and-list",
     ],
 )
 def test_PandasDataFrameResult_build_result(outputs, expected_result):
@@ -166,7 +237,6 @@ def test_PandasDataFrameResult_build_result(outputs, expected_result):
 @pytest.mark.parametrize(
     "outputs",
     [
-        ({"a": 1}),
         (
             {
                 "a": pd.DataFrame({"a": [1, 2, 3], "b": [11, 12, 13]}),
@@ -180,11 +250,18 @@ def test_PandasDataFrameResult_build_result(outputs, expected_result):
                 "c": pd.DataFrame({"d": [0, 0, 0]}),
             }
         ),
+        ({"a": [1, 2], "b": {"foo": "bar"}}),
+        ({"a": [1, 2], "b": [3, 4, 5]}),
+        ({"a": np.array([1, 2]), "b": np.array([3, 4, 5])}),
+        ({"a": _gen_ints(3), "b": _gen_ints(4)}),
     ],
     ids=[
-        "test-single-value",
         "test-multiple-dataframes",
         "test-multiple-series-with-dataframe",
+        "test-lists-and-dicts",
+        "test-mismatched-lists",
+        "test-mismatched-arrays",
+        "test-mismatched-generators",
     ],
 )
 def test_PandasDataFrameResult_build_result_errors(outputs):