Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: skew #1173

Open
wants to merge 14 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/api-reference/expr.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
- sample
- shift
- sort
- skew
- std
- sum
- tail
Expand Down
1 change: 1 addition & 0 deletions docs/api-reference/series.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
- shape
- shift
- sort
- skew
- std
- sum
- tail
Expand Down
3 changes: 3 additions & 0 deletions narwhals/_arrow/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,9 @@ def n_unique(self) -> Self:
def std(self, ddof: int = 1) -> Self:
return reuse_series_implementation(self, "std", ddof=ddof, returns_scalar=True)

def skew(self) -> Self:
    """Return an expression that applies the series-level ``skew`` method.

    Delegates to ``reuse_series_implementation`` with ``returns_scalar=True``.
    """
    skew_expr = reuse_series_implementation(self, "skew", returns_scalar=True)
    return skew_expr

def cast(self, dtype: DType) -> Self:
return reuse_series_implementation(self, "cast", dtype)

Expand Down
5 changes: 5 additions & 0 deletions narwhals/_arrow/namespace.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,6 +325,11 @@ def mean(self, *column_names: str) -> ArrowExpr:
*column_names, backend_version=self._backend_version, dtypes=self._dtypes
).mean()

def skew(self, *column_names: str) -> ArrowExpr:
    """Select ``column_names`` and chain ``.skew()`` on the resulting expression.

    The selection is built with ``ArrowExpr.from_column_names``, forwarding this
    namespace's backend version and dtypes.
    """
    selected = ArrowExpr.from_column_names(
        *column_names, backend_version=self._backend_version, dtypes=self._dtypes
    )
    return selected.skew()

CarloLepelaars marked this conversation as resolved.
Show resolved Hide resolved
def max(self, *column_names: str) -> ArrowExpr:
return ArrowExpr.from_column_names(
*column_names, backend_version=self._backend_version, dtypes=self._dtypes
Expand Down
12 changes: 12 additions & 0 deletions narwhals/_arrow/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from narwhals._arrow.utils import narwhals_to_native_dtype
from narwhals._arrow.utils import native_to_narwhals_dtype
from narwhals._arrow.utils import validate_column_comparand
from narwhals.dependencies import get_numpy
from narwhals.utils import Implementation
from narwhals.utils import generate_unique_token

Expand Down Expand Up @@ -298,6 +299,17 @@ def std(self, ddof: int = 1) -> int:

return pc.stddev(self._native_series, ddof=ddof) # type: ignore[no-any-return]

def skew(self) -> float:
    """Return the population (biased) skewness of the series.

    The result is ``g1 = m3 / m2 ** 1.5``, where ``m2`` and ``m3`` are the
    second and third central moments of the non-null values.

    Returns:
        The skewness as a Python float, or NaN when fewer than three
        non-null values are available.
    """
    import pyarrow.compute as pc  # ignore-banned-import()

    # Nulls are dropped up front so they do not poison the moments.
    non_null = pc.drop_null(self._native_series)
    # Guard before doing any arithmetic: nothing is computed for tiny inputs.
    if len(non_null) < 3:
        return float("nan")

    # Everything below uses pyarrow compute kernels, so no NumPy round-trip
    # is needed.
    delta = pc.subtract(non_null, pc.mean(non_null))
    m2 = pc.mean(pc.power(delta, 2))
    m3 = pc.mean(pc.power(delta, 3))
    # Floating-point division: a zero-variance input yields NaN, not an error.
    return float(pc.divide(m3, pc.power(m2, 1.5)).as_py())

def count(self) -> int:
import pyarrow.compute as pc # ignore-banned-import()

Expand Down
3 changes: 3 additions & 0 deletions narwhals/_pandas_like/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,9 @@ def mean(self) -> Self:
def std(self, *, ddof: int = 1) -> Self:
return reuse_series_implementation(self, "std", ddof=ddof, returns_scalar=True)

def skew(self) -> Self:
    """Return an expression that applies the series-level ``skew`` method.

    Delegates to ``reuse_series_implementation`` with ``returns_scalar=True``.
    """
    skew_expr = reuse_series_implementation(self, "skew", returns_scalar=True)
    return skew_expr

def any(self) -> Self:
return reuse_series_implementation(self, "any", returns_scalar=True)

Expand Down
8 changes: 8 additions & 0 deletions narwhals/_pandas_like/namespace.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,14 @@ def mean(self, *column_names: str) -> PandasLikeExpr:
dtypes=self._dtypes,
).mean()

def skew(self, *column_names: str) -> PandasLikeExpr:
    """Select ``column_names`` and chain ``.skew()`` on the resulting expression.

    The selection is built with ``PandasLikeExpr.from_column_names``, forwarding
    this namespace's implementation, backend version and dtypes.
    """
    selected = PandasLikeExpr.from_column_names(
        *column_names,
        implementation=self._implementation,
        backend_version=self._backend_version,
        dtypes=self._dtypes,
    )
    return selected.skew()
CarloLepelaars marked this conversation as resolved.
Show resolved Hide resolved

def max(self, *column_names: str) -> PandasLikeExpr:
return PandasLikeExpr.from_column_names(
*column_names,
Expand Down
19 changes: 19 additions & 0 deletions narwhals/_pandas_like/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
from narwhals._pandas_like.utils import set_axis
from narwhals._pandas_like.utils import to_datetime
from narwhals._pandas_like.utils import validate_column_comparand
from narwhals.dependencies import get_numpy
from narwhals.dependencies import get_pandas
from narwhals.utils import Implementation

if TYPE_CHECKING:
Expand Down Expand Up @@ -424,6 +426,23 @@ def std(
ser = self._native_series
return ser.std(ddof=ddof)

def skew(self) -> Any:
    """Return the population (biased) skewness of the series.

    The result is ``g1 = m3 / m2 ** 1.5``, where ``m2`` and ``m3`` are the
    second and third central moments of the non-missing values.

    The native ``Series.skew`` is deliberately not used: pandas applies a
    bias correction there, which would make the result differ from the
    population formula used for the other backends.

    Returns:
        The skewness as a Python float, or NaN when fewer than three
        non-missing values are available or when the variance is zero.
    """
    # Only generic series operations are used, so the same code path serves
    # every dtype and no pandas/NumPy module lookup is required.
    non_null = self._native_series.dropna()
    if len(non_null) < 3:
        return float("nan")

    delta = non_null - non_null.mean()
    m2 = float((delta**2).mean())
    m3 = float((delta**3).mean())
    # Explicit guard: with plain Python floats 0.0 / 0.0 would raise
    # ZeroDivisionError instead of producing NaN.
    if m2 == 0:
        return float("nan")
    return m3 / m2**1.5

def len(self) -> Any:
return len(self._native_series)

Expand Down
9 changes: 8 additions & 1 deletion narwhals/_polars/namespace.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,14 @@
from typing import Sequence

from narwhals._expression_parsing import parse_into_exprs
from narwhals._polars.expr import PolarsExpr
from narwhals._polars.utils import extract_args_kwargs
from narwhals._polars.utils import narwhals_to_native_dtype
from narwhals.utils import Implementation

if TYPE_CHECKING:
from narwhals._polars.dataframe import PolarsDataFrame
from narwhals._polars.dataframe import PolarsLazyFrame
from narwhals._polars.expr import PolarsExpr
from narwhals._polars.typing import IntoPolarsExpr
from narwhals.dtypes import DType
from narwhals.typing import DTypes
Expand Down Expand Up @@ -98,6 +98,13 @@ def mean(self, *column_names: str) -> PolarsExpr:
return PolarsExpr(pl.mean([*column_names]), dtypes=self._dtypes) # type: ignore[arg-type]
return PolarsExpr(pl.mean(*column_names), dtypes=self._dtypes)

def skew(self, *column_names: str) -> PolarsExpr:
    """Build a ``PolarsExpr`` that calls ``Expr.skew`` on the named columns.

    Arguments:
        column_names: Names of the columns to select.

    Returns:
        A ``PolarsExpr`` wrapping ``pl.col(column_names).skew()``.
    """
    import polars as pl  # ignore-banned-import()

    # Polars has no top-level `pl.skew` function; skewness is only available
    # as an expression method. Passing the names as a list to `pl.col` avoids
    # any backend-version branching.
    return PolarsExpr(pl.col([*column_names]).skew(), dtypes=self._dtypes)
CarloLepelaars marked this conversation as resolved.
Show resolved Hide resolved

def mean_horizontal(self, *exprs: IntoPolarsExpr) -> PolarsExpr:
import polars as pl # ignore-banned-import()

Expand Down
39 changes: 38 additions & 1 deletion narwhals/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -399,7 +399,7 @@ def std(self, *, ddof: int = 1) -> Self:
Get standard deviation.

Arguments:
ddof: Delta Degrees of Freedom: the divisor used in the calculation is N - ddof,
ddof: "Delta Degrees of Freedom": the divisor used in the calculation is N - ddof,
where N represents the number of elements. By default ddof is 1.

Examples:
Expand Down Expand Up @@ -433,6 +433,43 @@ def std(self, *, ddof: int = 1) -> Self:
"""
return self.__class__(lambda plx: self._call(plx).std(ddof=ddof))

def skew(self) -> Self:
FBruzzesi marked this conversation as resolved.
Show resolved Hide resolved
"""
Calculate the sample skewness of a column.

Returns:
An expression representing the sample skewness of the column.

Examples:
>>> import pandas as pd
>>> import polars as pl
>>> import narwhals as nw
>>> df_pd = pd.DataFrame({"a": [1, 2, 3, 4, 5], "b": [1, 1, 2, 10, 100]})
>>> df_pl = pl.DataFrame({"a": [1, 2, 3, 4, 5], "b": [1, 1, 2, 10, 100]})

Let's define a dataframe-agnostic function:

>>> @nw.narwhalify
... def func(df):
... return df.select(nw.col("a", "b").skew())

We can then pass either pandas or Polars to `func`:
FBruzzesi marked this conversation as resolved.
Show resolved Hide resolved

>>> func(df_pd)
a b
0 0.0 2.194964
>>> func(df_pl)
shape: (1, 2)
┌─────┬──────────┐
│ a ┆ b │
│ --- ┆ --- │
│ f64 ┆ f64 │
╞═════╪══════════╡
│ 0.0 ┆ 1.472427 │
└─────┴──────────┘
CarloLepelaars marked this conversation as resolved.
Show resolved Hide resolved
"""
return self.__class__(lambda plx: self._call(plx).skew())

def sum(self) -> Expr:
"""
Return the sum value.
Expand Down
34 changes: 34 additions & 0 deletions narwhals/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -519,6 +519,40 @@ def mean(self) -> Any:
"""
return self._compliant_series.mean()

def skew(self) -> Any:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same as for `Expr.skew`: Polars exposes a `bias` parameter.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See conversation in narwhals/expr.py

"""
Calculate the sample skewness of the Series.

Returns:
The sample skewness of the Series.

Examples:
>>> import pandas as pd
>>> import polars as pl
>>> import narwhals as nw
>>> s = [1, 2, 3, 4, 5]
CarloLepelaars marked this conversation as resolved.
Show resolved Hide resolved
>>> s_pd = pd.Series(s)
>>> s_pl = pl.Series(s)

We define a library agnostic function:

>>> @nw.narwhalify
... def func(s):
... return s.skew()

We can then pass either pandas or Polars to `func`:

>>> func(s_pd)
0.0
>>> func(s_pl)
0.0

Notes:
The skewness is a measure of the asymmetry of the probability distribution.
A perfectly symmetric distribution has a skewness of 0.
"""
return self._compliant_series.skew()

def count(self) -> Any:
"""
Returns the number of non-null elements in the Series.
Expand Down
14 changes: 10 additions & 4 deletions tests/expr_and_series/unary_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,19 @@ def test_unary(constructor: Constructor, request: pytest.FixtureRequest) -> None
.with_columns(
a_mean=nw.col("a").mean(),
a_sum=nw.col("a").sum(),
a_skew=nw.col("a").skew(),
b_nunique=nw.col("b").n_unique(),
z_min=nw.col("z").min(),
z_max=nw.col("z").max(),
)
.select(nw.col("a_mean", "a_sum", "b_nunique", "z_min", "z_max").unique())
.select(
nw.col("a_mean", "a_sum", "a_skew", "b_nunique", "z_min", "z_max").unique()
)
)
expected = {
"a_mean": [2],
"a_sum": [6],
"a_skew": [0.0],
"b_nunique": [2],
"z_min": [7],
"z_max": [9],
Expand All @@ -38,15 +42,17 @@ def test_unary_series(constructor_eager: Any) -> None:
result = {
"a_mean": [df["a"].mean()],
"a_sum": [df["a"].sum()],
"a_skew": [df["a"].skew()],
"b_nunique": [df["b"].n_unique()],
"z_min": [df["z"].min()],
"z_max": [df["z"].max()],
}
expected = {
"a_mean": [2],
"a_mean": [2.0],
"a_sum": [6],
"a_skew": [0.0],
"b_nunique": [2],
"z_min": [7],
"z_max": [9],
"z_min": [7.0],
"z_max": [9.0],
}
compare_dicts(result, expected)
Loading