From 89374d93ef1a0a6fdd5efe34c8e3c02991c619b8 Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Mon, 17 Jun 2024 00:15:34 +0200 Subject: [PATCH 1/2] Remove inference from data types --- .../polars/_utils/construction/dataframe.py | 38 ++++--------------- .../unit/constructors/test_constructors.py | 11 ------ 2 files changed, 8 insertions(+), 41 deletions(-) diff --git a/py-polars/polars/_utils/construction/dataframe.py b/py-polars/polars/_utils/construction/dataframe.py index 91af9f0ca685..90a9d2727d4b 100644 --- a/py-polars/polars/_utils/construction/dataframe.py +++ b/py-polars/polars/_utils/construction/dataframe.py @@ -537,11 +537,14 @@ def _sequence_of_sequence_to_pydf( infer_schema_length: int | None, ) -> PyDataFrame: if orient is None: - orient = _infer_data_orientation( - first_element, - len_data=len(data), - len_schema=len(schema) if schema is not None else None, - ) + if schema is None: + orient = "col" + else: + # Try to infer orientation from schema length and data dimensions + is_row_oriented = (len(schema) == len(first_element)) and ( + len(schema) != len(data) + ) + orient = "row" if is_row_oriented else "col" if orient == "row": column_names, schema_overrides = _unpack_schema( @@ -599,31 +602,6 @@ def _sequence_of_sequence_to_pydf( raise ValueError(msg) -def _infer_data_orientation( - first_element: Sequence[Any] | np.ndarray[Any, Any], - len_data: int, - len_schema: int | None = None, -) -> Orientation: - # Check if element types in the first 'row' resolve to a single dtype. - # Note: limit type-checking to smaller data; larger values are much more - # likely to indicate col orientation anyway, so minimize extra checks. - if len(first_element) <= 1000 and (len_schema is None or len_schema == len_data): - row_types = {type(value) for value in first_element if value is not None} - if int in row_types and float in row_types: - row_types.discard(int) - return "row" if len(row_types) > 1 else "col" - - elif ( - len_schema is not None - and len_schema == len(first_element) - and len_schema != len_data - ): - return "row" - - else: - return "col" - - def _sequence_of_series_to_pydf( first_element: Series, data: Sequence[Any], diff --git a/py-polars/tests/unit/constructors/test_constructors.py b/py-polars/tests/unit/constructors/test_constructors.py index a0df56e0299d..1144ca419df0 100644 --- a/py-polars/tests/unit/constructors/test_constructors.py +++ b/py-polars/tests/unit/constructors/test_constructors.py @@ -13,7 +13,6 @@ from pydantic import BaseModel, Field, TypeAdapter import polars as pl -from polars._utils.construction.dataframe import _infer_data_orientation from polars._utils.construction.utils import try_get_type_hints from polars.datatypes import PolarsDataType, numpy_char_code_to_dtype from polars.dependencies import dataclasses, pydantic @@ -1625,13 +1624,3 @@ def test_array_construction() -> None: df = pl.from_dicts(rows, schema=schema) assert df.schema == schema assert df.rows() == [("a", [1, 2, 3]), ("b", [2, 3, 4])] - - -def test_infer_data_orientation() -> None: - assert _infer_data_orientation([1], 1) == "col" - assert _infer_data_orientation([1, 2], 2) == "col" - assert _infer_data_orientation([1, 2], 2, 2) == "col" - assert _infer_data_orientation([1, 2, 3], 2) == "col" - assert _infer_data_orientation([1, 2, 3], 2, 2) == "col" - assert _infer_data_orientation([1, 2, 3], 2, 3) == "row" - assert _infer_data_orientation([1, "x"], 2) == "row" From 49e1b6c557df04f6163d61fef5b783642a6a57b8 Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Mon, 17 Jun 2024 00:53:18 +0200 Subject: [PATCH 2/2] Add warning to row inference --- .../docs/source/reference/exceptions.rst | 1 + py-polars/polars/__init__.py | 2 + .../polars/_utils/construction/dataframe.py | 10 ++++- py-polars/polars/exceptions.py | 41 +++++++++++++++++++ .../tests/unit/constructors/test_dataframe.py | 6 +++ 5 files changed, 59 insertions(+), 1 deletion(-) diff --git a/py-polars/docs/source/reference/exceptions.rst b/py-polars/docs/source/reference/exceptions.rst index fd1f95dfab8d..e5bcf6d7bce3 100644 --- a/py-polars/docs/source/reference/exceptions.rst +++ b/py-polars/docs/source/reference/exceptions.rst @@ -45,6 +45,7 @@ Warnings CategoricalRemappingWarning ChronoFormatWarning CustomUFuncWarning + DataOrientationWarning MapWithoutReturnDtypeWarning PerformanceWarning PolarsInefficientMapWarning diff --git a/py-polars/polars/__init__.py b/py-polars/polars/__init__.py index ba6728b1ed71..06bee4feb90b 100644 --- a/py-polars/polars/__init__.py +++ b/py-polars/polars/__init__.py @@ -83,6 +83,7 @@ ColumnNotFoundError, ComputeError, CustomUFuncWarning, + DataOrientationWarning, DuplicateError, InvalidAssert, InvalidOperationError, @@ -273,6 +274,7 @@ "CategoricalRemappingWarning", "ChronoFormatWarning", "CustomUFuncWarning", + "DataOrientationWarning", "MapWithoutReturnDtypeWarning", "PerformanceWarning", "PolarsInefficientMapWarning", diff --git a/py-polars/polars/_utils/construction/dataframe.py b/py-polars/polars/_utils/construction/dataframe.py index 90a9d2727d4b..7f6a41d2fe97 100644 --- a/py-polars/polars/_utils/construction/dataframe.py +++ b/py-polars/polars/_utils/construction/dataframe.py @@ -30,6 +30,7 @@ from polars._utils.various import ( _is_generator, arrlen, + issue_warning, parse_version, ) from polars._utils.wrap import wrap_df, wrap_s @@ -53,7 +54,7 @@ from polars.dependencies import numpy as np from polars.dependencies import pandas as pd from polars.dependencies import pyarrow as pa -from polars.exceptions import ShapeError +from polars.exceptions import DataOrientationWarning, ShapeError from polars.meta import thread_pool_size with contextlib.suppress(ImportError): # Module not available when building docs @@ -546,6 +547,13 @@ def _sequence_of_sequence_to_pydf( ) orient = "row" if is_row_oriented else "col" + if is_row_oriented: + issue_warning( + "Row orientation inferred during DataFrame construction." + ' Explicitly specify the orientation by passing `orient="row"` to silence this warning.', + DataOrientationWarning, + ) + if orient == "row": column_names, schema_overrides = _unpack_schema( schema, schema_overrides=schema_overrides, n_expected=len(first_element) diff --git a/py-polars/polars/exceptions.py b/py-polars/polars/exceptions.py index 356ea03cad4e..118a690444c5 100644 --- a/py-polars/polars/exceptions.py +++ b/py-polars/polars/exceptions.py @@ -149,6 +149,46 @@ class CustomUFuncWarning(PolarsWarning): # type: ignore[misc] """Warning issued when a custom ufunc is handled differently than numpy ufunc would.""" # noqa: W505 +class DataOrientationWarning(PolarsWarning): # type: ignore[misc] + """ + Warning issued to indicate row orientation was inferred from the inputs. + + Occurs when constructing a DataFrame from a list of rows without explicitly + specifying row orientation. Polars is usually able to infer the data orientation + from the data and schema, but there are cases where this is not possible. This is a + common source of confusion. Use the `orient` parameter to be explicit about the + data orientation. + + Examples + -------- + >>> pl.DataFrame([(1, 2, 3), (4, 5, 6)], schema=["a", "b", "c"]) # doctest: +SKIP + DataOrientationWarning: Row orientation inferred during DataFrame construction. + Explicitly specify the orientation by passing `orient="row"` to silence this warning. + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 4 ┆ 5 ┆ 6 │ + └─────┴─────┴─────┘ + + Pass `orient="row"` to silence the warning. + + >>> pl.DataFrame([[1, 2, 3], [4, 5, 6]], schema=["a", "b", "c"], orient="row") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 4 ┆ 5 ┆ 6 │ + └─────┴─────┴─────┘ + """ # noqa: W505 + + class PolarsInefficientMapWarning(PerformanceWarning): # type: ignore[misc] """Warning issued when a potentially slow `map_*` operation is performed.""" @@ -187,6 +227,7 @@ class UnstableWarning(PolarsWarning): # type: ignore[misc] "CategoricalRemappingWarning", "ChronoFormatWarning", "CustomUFuncWarning", + "DataOrientationWarning", "MapWithoutReturnDtypeWarning", "PerformanceWarning", "PolarsInefficientMapWarning", diff --git a/py-polars/tests/unit/constructors/test_dataframe.py b/py-polars/tests/unit/constructors/test_dataframe.py index 3248b7c3ebe1..b2c59a147218 100644 --- a/py-polars/tests/unit/constructors/test_dataframe.py +++ b/py-polars/tests/unit/constructors/test_dataframe.py @@ -7,6 +7,7 @@ import pytest import polars as pl +from polars.exceptions import DataOrientationWarning def test_df_mixed_dtypes_string() -> None: @@ -196,3 +197,8 @@ def test_df_init_schema_object() -> None: assert df.columns == schema.names() assert df.dtypes == schema.dtypes() + + +def test_df_init_data_orientation_inference_warning() -> None: + with pytest.warns(DataOrientationWarning): + pl.from_records([[1, 2, 3], [4, 5, 6]], schema=["a", "b", "c"])