Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(python)!: Change data orientation inference logic for DataFrame construction and warn when row orientation is inferred #16976

Merged
merged 2 commits into from
Jun 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions py-polars/docs/source/reference/exceptions.rst
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ Warnings
CategoricalRemappingWarning
ChronoFormatWarning
CustomUFuncWarning
DataOrientationWarning
MapWithoutReturnDtypeWarning
PerformanceWarning
PolarsInefficientMapWarning
Expand Down
2 changes: 2 additions & 0 deletions py-polars/polars/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@
ColumnNotFoundError,
ComputeError,
CustomUFuncWarning,
DataOrientationWarning,
DuplicateError,
InvalidAssert,
InvalidOperationError,
Expand Down Expand Up @@ -273,6 +274,7 @@
"CategoricalRemappingWarning",
"ChronoFormatWarning",
"CustomUFuncWarning",
"DataOrientationWarning",
"MapWithoutReturnDtypeWarning",
"PerformanceWarning",
"PolarsInefficientMapWarning",
Expand Down
48 changes: 17 additions & 31 deletions py-polars/polars/_utils/construction/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
from polars._utils.various import (
_is_generator,
arrlen,
issue_warning,
parse_version,
)
from polars._utils.wrap import wrap_df, wrap_s
Expand All @@ -53,7 +54,7 @@
from polars.dependencies import numpy as np
from polars.dependencies import pandas as pd
from polars.dependencies import pyarrow as pa
from polars.exceptions import ShapeError
from polars.exceptions import DataOrientationWarning, ShapeError
from polars.meta import thread_pool_size

with contextlib.suppress(ImportError): # Module not available when building docs
Expand Down Expand Up @@ -537,11 +538,21 @@ def _sequence_of_sequence_to_pydf(
infer_schema_length: int | None,
) -> PyDataFrame:
if orient is None:
orient = _infer_data_orientation(
first_element,
len_data=len(data),
len_schema=len(schema) if schema is not None else None,
)
if schema is None:
orient = "col"
else:
# Try to infer orientation from schema length and data dimensions
is_row_oriented = (len(schema) == len(first_element)) and (
len(schema) != len(data)
)
orient = "row" if is_row_oriented else "col"

if is_row_oriented:
issue_warning(
"Row orientation inferred during DataFrame construction."
' Explicitly specify the orientation by passing `orient="row"` to silence this warning.',
DataOrientationWarning,
)

if orient == "row":
column_names, schema_overrides = _unpack_schema(
Expand Down Expand Up @@ -599,31 +610,6 @@ def _sequence_of_sequence_to_pydf(
raise ValueError(msg)


def _infer_data_orientation(
first_element: Sequence[Any] | np.ndarray[Any, Any],
len_data: int,
len_schema: int | None = None,
) -> Orientation:
# Check if element types in the first 'row' resolve to a single dtype.
# Note: limit type-checking to smaller data; larger values are much more
# likely to indicate col orientation anyway, so minimize extra checks.
if len(first_element) <= 1000 and (len_schema is None or len_schema == len_data):
row_types = {type(value) for value in first_element if value is not None}
if int in row_types and float in row_types:
row_types.discard(int)
return "row" if len(row_types) > 1 else "col"

elif (
len_schema is not None
and len_schema == len(first_element)
and len_schema != len_data
):
return "row"

else:
return "col"


def _sequence_of_series_to_pydf(
first_element: Series,
data: Sequence[Any],
Expand Down
41 changes: 41 additions & 0 deletions py-polars/polars/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,46 @@ class CustomUFuncWarning(PolarsWarning): # type: ignore[misc]
"""Warning issued when a custom ufunc is handled differently than numpy ufunc would.""" # noqa: W505


class DataOrientationWarning(PolarsWarning):  # type: ignore[misc]
    """
    Warning issued to indicate row orientation was inferred from the inputs.

    Occurs when constructing a DataFrame from a list of rows without explicitly
    specifying row orientation. Polars is usually able to infer the data orientation
    from the data and schema, but there are cases where this is not possible. This is a
    common source of confusion. Use the `orient` parameter to be explicit about the
    data orientation.

    Examples
    --------
    >>> pl.DataFrame([(1, 2, 3), (4, 5, 6)], schema=["a", "b", "c"])  # doctest: +SKIP
    DataOrientationWarning: Row orientation inferred during DataFrame construction.
    Explicitly specify the orientation by passing `orient="row"` to silence this warning.
    shape: (2, 3)
    ┌─────┬─────┬─────┐
    │ a   ┆ b   ┆ c   │
    │ --- ┆ --- ┆ --- │
    │ i64 ┆ i64 ┆ i64 │
    ╞═════╪═════╪═════╡
    │ 1   ┆ 2   ┆ 3   │
    │ 4   ┆ 5   ┆ 6   │
    └─────┴─────┴─────┘

    Pass `orient="row"` to silence the warning.

    >>> pl.DataFrame([[1, 2, 3], [4, 5, 6]], schema=["a", "b", "c"], orient="row")
    shape: (2, 3)
    ┌─────┬─────┬─────┐
    │ a   ┆ b   ┆ c   │
    │ --- ┆ --- ┆ --- │
    │ i64 ┆ i64 ┆ i64 │
    ╞═════╪═════╪═════╡
    │ 1   ┆ 2   ┆ 3   │
    │ 4   ┆ 5   ┆ 6   │
    └─────┴─────┴─────┘
    """  # noqa: W505


class PolarsInefficientMapWarning(PerformanceWarning): # type: ignore[misc]
"""Warning issued when a potentially slow `map_*` operation is performed."""

Expand Down Expand Up @@ -187,6 +227,7 @@ class UnstableWarning(PolarsWarning): # type: ignore[misc]
"CategoricalRemappingWarning",
"ChronoFormatWarning",
"CustomUFuncWarning",
"DataOrientationWarning",
"MapWithoutReturnDtypeWarning",
"PerformanceWarning",
"PolarsInefficientMapWarning",
Expand Down
11 changes: 0 additions & 11 deletions py-polars/tests/unit/constructors/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
from pydantic import BaseModel, Field, TypeAdapter

import polars as pl
from polars._utils.construction.dataframe import _infer_data_orientation
from polars._utils.construction.utils import try_get_type_hints
from polars.datatypes import PolarsDataType, numpy_char_code_to_dtype
from polars.dependencies import dataclasses, pydantic
Expand Down Expand Up @@ -1625,13 +1624,3 @@ def test_array_construction() -> None:
df = pl.from_dicts(rows, schema=schema)
assert df.schema == schema
assert df.rows() == [("a", [1, 2, 3]), ("b", [2, 3, 4])]


def test_infer_data_orientation() -> None:
    """Exercise both the type-based and the schema-based inference branches."""
    # Each case: (first_element, len_data, len_schema, expected orientation).
    cases = [
        ([1], 1, None, "col"),
        ([1, 2], 2, None, "col"),
        ([1, 2], 2, 2, "col"),
        ([1, 2, 3], 2, None, "col"),
        ([1, 2, 3], 2, 2, "col"),
        ([1, 2, 3], 2, 3, "row"),
        ([1, "x"], 2, None, "row"),
        # int/float mixes and None values must not count as multiple dtypes
        ([1, 2.5, None], 3, None, "col"),
        # more than 1000 elements skips the type check entirely
        (list(range(1001)), 2, None, "col"),
        (list(range(1001)), 2, 1001, "row"),
    ]
    for first_element, len_data, len_schema, expected in cases:
        result = _infer_data_orientation(first_element, len_data, len_schema)
        assert result == expected, (len(first_element), len_data, len_schema)
6 changes: 6 additions & 0 deletions py-polars/tests/unit/constructors/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import pytest

import polars as pl
from polars.exceptions import DataOrientationWarning


def test_df_mixed_dtypes_string() -> None:
Expand Down Expand Up @@ -196,3 +197,8 @@ def test_df_init_schema_object() -> None:

assert df.columns == schema.names()
assert df.dtypes == schema.dtypes()


def test_df_init_data_orientation_inference_warning() -> None:
    """Row-shaped records given without `orient` must trigger the warning."""
    rows = [[1, 2, 3], [4, 5, 6]]
    # The schema length matches the inner length (3) but not the outer length
    # (2), so row orientation is inferred; `match` pins the message text so an
    # unrelated DataOrientationWarning cannot satisfy the test.
    with pytest.warns(DataOrientationWarning, match="Row orientation inferred"):
        pl.from_records(rows, schema=["a", "b", "c"])