From 90d9742f38f1feeba80ed27e4f0326477df7491b Mon Sep 17 00:00:00 2001 From: Carlo Lepelaars Date: Mon, 14 Oct 2024 17:48:36 +0200 Subject: [PATCH 01/12] Implement skew for Arrow, Pandas-like and Polars --- docs/api-reference/expr.md | 1 + docs/api-reference/series.md | 1 + narwhals/_arrow/expr.py | 3 +++ narwhals/_arrow/namespace.py | 5 ++++ narwhals/_arrow/series.py | 12 ++++++++++ narwhals/_pandas_like/expr.py | 3 +++ narwhals/_pandas_like/namespace.py | 8 +++++++ narwhals/_pandas_like/series.py | 19 +++++++++++++++ narwhals/_polars/namespace.py | 9 ++++++- narwhals/expr.py | 37 +++++++++++++++++++++++++++++ narwhals/series.py | 34 ++++++++++++++++++++++++++ tests/expr_and_series/unary_test.py | 14 +++++++---- 12 files changed, 141 insertions(+), 5 deletions(-) diff --git a/docs/api-reference/expr.md b/docs/api-reference/expr.md index 7188b2c36..4c884147a 100644 --- a/docs/api-reference/expr.md +++ b/docs/api-reference/expr.md @@ -40,6 +40,7 @@ - sample - shift - sort + - skew - std - sum - tail diff --git a/docs/api-reference/series.md b/docs/api-reference/series.md index e8572dda8..a4e3c4786 100644 --- a/docs/api-reference/series.md +++ b/docs/api-reference/series.md @@ -50,6 +50,7 @@ - shape - shift - sort + - skew - std - sum - tail diff --git a/narwhals/_arrow/expr.py b/narwhals/_arrow/expr.py index 55c529d30..fa3237c67 100644 --- a/narwhals/_arrow/expr.py +++ b/narwhals/_arrow/expr.py @@ -213,6 +213,9 @@ def n_unique(self) -> Self: def std(self, ddof: int = 1) -> Self: return reuse_series_implementation(self, "std", ddof=ddof, returns_scalar=True) + def skew(self) -> Self: + return reuse_series_implementation(self, "skew", returns_scalar=True) + def cast(self, dtype: DType) -> Self: return reuse_series_implementation(self, "cast", dtype) diff --git a/narwhals/_arrow/namespace.py b/narwhals/_arrow/namespace.py index 5eae258fc..f823b2e31 100644 --- a/narwhals/_arrow/namespace.py +++ b/narwhals/_arrow/namespace.py @@ -325,6 +325,11 @@ def mean(self, *column_names: str) -> ArrowExpr: *column_names, backend_version=self._backend_version, dtypes=self._dtypes ).mean() + def skew(self, *column_names: str) -> ArrowExpr: + return ArrowExpr.from_column_names( + *column_names, backend_version=self._backend_version, dtypes=self._dtypes + ).skew() + def max(self, *column_names: str) -> ArrowExpr: return ArrowExpr.from_column_names( *column_names, backend_version=self._backend_version, dtypes=self._dtypes diff --git a/narwhals/_arrow/series.py b/narwhals/_arrow/series.py index 65a393ca9..e4e2c2f5a 100644 --- a/narwhals/_arrow/series.py +++ b/narwhals/_arrow/series.py @@ -13,6 +13,7 @@ from narwhals._arrow.utils import narwhals_to_native_dtype from narwhals._arrow.utils import native_to_narwhals_dtype from narwhals._arrow.utils import validate_column_comparand +from narwhals.dependencies import get_numpy from narwhals.utils import Implementation from narwhals.utils import generate_unique_token @@ -298,6 +299,17 @@ def std(self, ddof: int = 1) -> int: return pc.stddev(self._native_series, ddof=ddof) # type: ignore[no-any-return] + def skew(self) -> float: + values = self._native_series.to_numpy() + np = get_numpy() + m = np.mean(values) + s = np.std(values) + n = len(values) + if n < 3: + return float("nan") + g1 = np.sum((values - m) ** 3) / (n * s**3) + return float(g1) # Population skewness + def count(self) -> int: import pyarrow.compute as pc # ignore-banned-import() diff --git a/narwhals/_pandas_like/expr.py b/narwhals/_pandas_like/expr.py index 07ba3e56d..e47e7c677 100644 --- a/narwhals/_pandas_like/expr.py +++ b/narwhals/_pandas_like/expr.py @@ -221,6 +221,9 @@ def mean(self) -> Self: def std(self, *, ddof: int = 1) -> Self: return reuse_series_implementation(self, "std", ddof=ddof, returns_scalar=True) + def skew(self) -> Self: + return reuse_series_implementation(self, "skew", returns_scalar=True) + def any(self) -> Self: return reuse_series_implementation(self, "any", returns_scalar=True) diff --git a/narwhals/_pandas_like/namespace.py b/narwhals/_pandas_like/namespace.py index 63d3454a5..99622dc9c 100644 --- a/narwhals/_pandas_like/namespace.py +++ b/narwhals/_pandas_like/namespace.py @@ -177,6 +177,14 @@ def mean(self, *column_names: str) -> PandasLikeExpr: dtypes=self._dtypes, ).mean() + def skew(self, *column_names: str) -> PandasLikeExpr: + return PandasLikeExpr.from_column_names( + *column_names, + implementation=self._implementation, + backend_version=self._backend_version, + dtypes=self._dtypes, + ).skew() + def max(self, *column_names: str) -> PandasLikeExpr: return PandasLikeExpr.from_column_names( *column_names, diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py index 2fe53b22a..a76b0ecb1 100644 --- a/narwhals/_pandas_like/series.py +++ b/narwhals/_pandas_like/series.py @@ -15,6 +15,8 @@ from narwhals._pandas_like.utils import set_axis from narwhals._pandas_like.utils import to_datetime from narwhals._pandas_like.utils import validate_column_comparand +from narwhals.dependencies import get_numpy +from narwhals.dependencies import get_pandas from narwhals.utils import Implementation if TYPE_CHECKING: @@ -424,6 +426,23 @@ def std( ser = self._native_series return ser.std(ddof=ddof) + def skew(self) -> Any: + np = get_numpy() + pd = get_pandas() + ser = self._native_series + if hasattr(ser, "skew") and not isinstance(ser.dtype, pd.ArrowDtype): + return float(ser.skew()) + else: + values = ser.to_numpy() + n = len(values) + if n < 3: + return float("nan") + m = np.mean(values) + m2 = np.mean((values - m) ** 2) + m3 = np.mean((values - m) ** 3) + g1 = m3 / (m2**1.5) + return float(g1) # Population skewness + def len(self) -> Any: return len(self._native_series) diff --git a/narwhals/_polars/namespace.py b/narwhals/_polars/namespace.py index 4eb8451b7..1f1e18150 100644 --- a/narwhals/_polars/namespace.py +++ b/narwhals/_polars/namespace.py @@ -7,6 +7,7 @@ from typing import Sequence from narwhals._expression_parsing import parse_into_exprs +from narwhals._polars.expr import PolarsExpr from narwhals._polars.utils import extract_args_kwargs from narwhals._polars.utils import narwhals_to_native_dtype from narwhals.utils import Implementation @@ -14,7 +15,6 @@ if TYPE_CHECKING: from narwhals._polars.dataframe import PolarsDataFrame from narwhals._polars.dataframe import PolarsLazyFrame - from narwhals._polars.expr import PolarsExpr from narwhals._polars.typing import IntoPolarsExpr from narwhals.dtypes import DType from narwhals.typing import DTypes @@ -98,6 +98,13 @@ def mean(self, *column_names: str) -> PolarsExpr: return PolarsExpr(pl.mean([*column_names]), dtypes=self._dtypes) # type: ignore[arg-type] return PolarsExpr(pl.mean(*column_names), dtypes=self._dtypes) + def skew(self, *column_names: str) -> PolarsExpr: + import polars as pl # ignore-banned-import() + + if self._backend_version < (0, 20, 4): # pragma: no cover + return PolarsExpr(pl.skew([*column_names]), dtypes=self._dtypes) + return PolarsExpr(pl.skew(*column_names), dtypes=self._dtypes) + def mean_horizontal(self, *exprs: IntoPolarsExpr) -> PolarsExpr: import polars as pl # ignore-banned-import() diff --git a/narwhals/expr.py b/narwhals/expr.py index edd52b305..b3425f714 100644 --- a/narwhals/expr.py +++ b/narwhals/expr.py @@ -433,6 +433,43 @@ def std(self, *, ddof: int = 1) -> Self: """ return self.__class__(lambda plx: self._call(plx).std(ddof=ddof)) + def skew(self) -> Self: + """ + Calculate the sample skewness of a column. + + Returns: + An expression representing the sample skewness of the column. + + Examples: + >>> import pandas as pd + >>> import polars as pl + >>> import narwhals as nw + >>> df_pd = pd.DataFrame({"a": [1, 2, 3, 4, 5], "b": [1, 1, 2, 10, 100]}) + >>> df_pl = pl.DataFrame({"a": [1, 2, 3, 4, 5], "b": [1, 1, 2, 10, 100]}) + + Let's define a dataframe-agnostic function: + + >>> @nw.narwhalify + ... def func(df): + ... return df.select(nw.col("a", "b").skew()) + + We can then pass either pandas or Polars to `func`: + + >>> func(df_pd) + a b + 0 0.000000 2.194964 + >>> func(df_pl) + shape: (1, 2) + ┌──────────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════════╡ + │ 0.000000 ┆ 2.194964 │ + └──────────┴──────────┘ + """ + return self.__class__(lambda plx: self._call(plx).skew()) + def sum(self) -> Expr: """ Return the sum value. diff --git a/narwhals/series.py b/narwhals/series.py index 0115ac34f..c8bfa0447 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -519,6 +519,40 @@ def mean(self) -> Any: """ return self._compliant_series.mean() + def skew(self) -> Any: + """ + Calculate the sample skewness of the Series. + + Returns: + The sample skewness of the Series. + + Examples: + >>> import pandas as pd + >>> import polars as pl + >>> import narwhals as nw + >>> s = [1, 2, 3, 4, 5] + >>> s_pd = pd.Series(s) + >>> s_pl = pl.Series(s) + + We define a library agnostic function: + + >>> @nw.narwhalify + ... def func(s): + ... return s.skew() + + We can then pass either pandas or Polars to `func`: + + >>> func(s_pd) + 0.0 + >>> func(s_pl) + 0.0 + + Notes: + The skewness is a measure of the asymmetry of the probability distribution. + A perfectly symmetric distribution has a skewness of 0. + """ + return self._compliant_series.skew() + def count(self) -> Any: """ Returns the number of non-null elements in the Series. diff --git a/tests/expr_and_series/unary_test.py b/tests/expr_and_series/unary_test.py index dabab0c03..76579690b 100644 --- a/tests/expr_and_series/unary_test.py +++ b/tests/expr_and_series/unary_test.py @@ -16,15 +16,19 @@ def test_unary(constructor: Constructor, request: pytest.FixtureRequest) -> None .with_columns( a_mean=nw.col("a").mean(), a_sum=nw.col("a").sum(), + a_skew=nw.col("a").skew(), b_nunique=nw.col("b").n_unique(), z_min=nw.col("z").min(), z_max=nw.col("z").max(), ) - .select(nw.col("a_mean", "a_sum", "b_nunique", "z_min", "z_max").unique()) + .select( + nw.col("a_mean", "a_sum", "a_skew", "b_nunique", "z_min", "z_max").unique() + ) ) expected = { "a_mean": [2], "a_sum": [6], + "a_skew": [0.0], "b_nunique": [2], "z_min": [7], "z_max": [9], @@ -38,15 +42,17 @@ def test_unary_series(constructor_eager: Any) -> None: result = { "a_mean": [df["a"].mean()], "a_sum": [df["a"].sum()], + "a_skew": [df["a"].skew()], "b_nunique": [df["b"].n_unique()], "z_min": [df["z"].min()], "z_max": [df["z"].max()], } expected = { - "a_mean": [2], + "a_mean": [2.0], "a_sum": [6], + "a_skew": [0.0], "b_nunique": [2], - "z_min": [7], - "z_max": [9], + "z_min": [7.0], + "z_max": [9.0], } compare_dicts(result, expected) From c82fec1cd2267c81db28160ab49fbd0a5c1e6c09 Mon Sep 17 00:00:00 2001 From: Carlo Lepelaars Date: Mon, 14 Oct 2024 18:28:03 +0200 Subject: [PATCH 02/12] Fix doctests --- narwhals/expr.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/narwhals/expr.py b/narwhals/expr.py index b3425f714..3ec25518f 100644 --- a/narwhals/expr.py +++ b/narwhals/expr.py @@ -399,7 +399,7 @@ def std(self, *, ddof: int = 1) -> Self: Get standard deviation. Arguments: - ddof: “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + ddof: "Delta Degrees of Freedom": the divisor used in the calculation is N - ddof, where N represents the number of elements. By default ddof is 1. Examples: @@ -456,17 +456,17 @@ def skew(self) -> Self: We can then pass either pandas or Polars to `func`: >>> func(df_pd) - a b - 0 0.000000 2.194964 + a b + 0 0.0 2.194964 >>> func(df_pl) shape: (1, 2) - ┌──────────┬──────────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞══════════╪══════════╡ - │ 0.000000 ┆ 2.194964 │ - └──────────┴──────────┘ + ┌─────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════╡ + │ 0.0 ┆ 1.472427 │ + └─────┴──────────┘ """ return self.__class__(lambda plx: self._call(plx).skew()) From e118e4d210efb084d5a371e022afc9122caeec31 Mon Sep 17 00:00:00 2001 From: Carlo Lepelaars Date: Mon, 14 Oct 2024 21:43:29 +0200 Subject: [PATCH 03/12] Remove skew in namespace. Remove n > 3 requirement. Fix expr doc --- narwhals/_arrow/namespace.py | 5 ----- narwhals/_arrow/series.py | 6 ++---- narwhals/_pandas_like/namespace.py | 8 -------- narwhals/_pandas_like/series.py | 3 --- narwhals/_polars/namespace.py | 9 +-------- narwhals/expr.py | 2 +- 6 files changed, 4 insertions(+), 29 deletions(-) diff --git a/narwhals/_arrow/namespace.py b/narwhals/_arrow/namespace.py index f823b2e31..5eae258fc 100644 --- a/narwhals/_arrow/namespace.py +++ b/narwhals/_arrow/namespace.py @@ -325,11 +325,6 @@ def mean(self, *column_names: str) -> ArrowExpr: *column_names, backend_version=self._backend_version, dtypes=self._dtypes ).mean() - def skew(self, *column_names: str) -> ArrowExpr: - return ArrowExpr.from_column_names( - *column_names, backend_version=self._backend_version, dtypes=self._dtypes - ).skew() - def max(self, *column_names: str) -> ArrowExpr: return ArrowExpr.from_column_names( *column_names, backend_version=self._backend_version, dtypes=self._dtypes diff --git a/narwhals/_arrow/series.py b/narwhals/_arrow/series.py index e4e2c2f5a..671248192 100644 --- a/narwhals/_arrow/series.py +++ b/narwhals/_arrow/series.py @@ -13,7 +13,6 @@ from narwhals._arrow.utils import narwhals_to_native_dtype from narwhals._arrow.utils import native_to_narwhals_dtype from narwhals._arrow.utils import validate_column_comparand -from narwhals.dependencies import get_numpy from narwhals.utils import Implementation from narwhals.utils import generate_unique_token @@ -300,13 +299,12 @@ def std(self, ddof: int = 1) -> int: return pc.stddev(self._native_series, ddof=ddof) # type: ignore[no-any-return] def skew(self) -> float: + import numpy as np # ignore-banned-import + values = self._native_series.to_numpy() - np = get_numpy() m = np.mean(values) s = np.std(values) n = len(values) - if n < 3: - return float("nan") g1 = np.sum((values - m) ** 3) / (n * s**3) return float(g1) # Population skewness diff --git a/narwhals/_pandas_like/namespace.py b/narwhals/_pandas_like/namespace.py index 99622dc9c..63d3454a5 100644 --- a/narwhals/_pandas_like/namespace.py +++ b/narwhals/_pandas_like/namespace.py @@ -177,14 +177,6 @@ def mean(self, *column_names: str) -> PandasLikeExpr: dtypes=self._dtypes, ).mean() - def skew(self, *column_names: str) -> PandasLikeExpr: - return PandasLikeExpr.from_column_names( - *column_names, - implementation=self._implementation, - backend_version=self._backend_version, - dtypes=self._dtypes, - ).skew() - def max(self, *column_names: str) -> PandasLikeExpr: return PandasLikeExpr.from_column_names( *column_names, diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py index a76b0ecb1..6fc19359f 100644 --- a/narwhals/_pandas_like/series.py +++ b/narwhals/_pandas_like/series.py @@ -434,9 +434,6 @@ def skew(self) -> Any: return float(ser.skew()) else: values = ser.to_numpy() - n = len(values) - if n < 3: - return float("nan") m = np.mean(values) m2 = np.mean((values - m) ** 2) m3 = np.mean((values - m) ** 3) diff --git a/narwhals/_polars/namespace.py b/narwhals/_polars/namespace.py index 1f1e18150..4eb8451b7 100644 --- a/narwhals/_polars/namespace.py +++ b/narwhals/_polars/namespace.py @@ -7,7 +7,6 @@ from typing import Sequence from narwhals._expression_parsing import parse_into_exprs -from narwhals._polars.expr import PolarsExpr from narwhals._polars.utils import extract_args_kwargs from narwhals._polars.utils import narwhals_to_native_dtype from narwhals.utils import Implementation @@ -15,6 +14,7 @@ if TYPE_CHECKING: from narwhals._polars.dataframe import PolarsDataFrame from narwhals._polars.dataframe import PolarsLazyFrame + from narwhals._polars.expr import PolarsExpr from narwhals._polars.typing import IntoPolarsExpr from narwhals.dtypes import DType from narwhals.typing import DTypes @@ -98,13 +98,6 @@ def mean(self, *column_names: str) -> PolarsExpr: return PolarsExpr(pl.mean([*column_names]), dtypes=self._dtypes) # type: ignore[arg-type] return PolarsExpr(pl.mean(*column_names), dtypes=self._dtypes) - def skew(self, *column_names: str) -> PolarsExpr: - import polars as pl # ignore-banned-import() - - if self._backend_version < (0, 20, 4): # pragma: no cover - return PolarsExpr(pl.skew([*column_names]), dtypes=self._dtypes) - return PolarsExpr(pl.skew(*column_names), dtypes=self._dtypes) - def mean_horizontal(self, *exprs: IntoPolarsExpr) -> PolarsExpr: import polars as pl # ignore-banned-import() diff --git a/narwhals/expr.py b/narwhals/expr.py index 3ec25518f..67e281b76 100644 --- a/narwhals/expr.py +++ b/narwhals/expr.py @@ -457,7 +457,7 @@ def skew(self) -> Self: >>> func(df_pd) a b - 0 0.0 2.194964 + 0 0.0 1.472427 >>> func(df_pl) shape: (1, 2) ┌─────┬──────────┐ From 2530f81d2d0aeebcddaebe6ccdcd77159ac1c90f Mon Sep 17 00:00:00 2001 From: Carlo Lepelaars Date: Mon, 14 Oct 2024 21:57:01 +0200 Subject: [PATCH 04/12] Use biases population skewness --- narwhals/_arrow/series.py | 8 ++++---- narwhals/_pandas_like/series.py | 17 ++++++----------- tests/expr_and_series/unary_test.py | 8 +++++++- 3 files changed, 17 insertions(+), 16 deletions(-) diff --git a/narwhals/_arrow/series.py b/narwhals/_arrow/series.py index 671248192..1570d964d 100644 --- a/narwhals/_arrow/series.py +++ b/narwhals/_arrow/series.py @@ -303,10 +303,10 @@ def skew(self) -> float: values = self._native_series.to_numpy() m = np.mean(values) - s = np.std(values) - n = len(values) - g1 = np.sum((values - m) ** 3) / (n * s**3) - return float(g1) # Population skewness + m2 = np.mean((values - m) ** 2) + m3 = np.mean((values - m) ** 3) + g1 = m3 / (m2**1.5) + return float(g1) # Biased population skewness def count(self) -> int: import pyarrow.compute as pc # ignore-banned-import() diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py index 6fc19359f..63f3224e3 100644 --- a/narwhals/_pandas_like/series.py +++ b/narwhals/_pandas_like/series.py @@ -16,7 +16,6 @@ from narwhals._pandas_like.utils import to_datetime from narwhals._pandas_like.utils import validate_column_comparand from narwhals.dependencies import get_numpy -from narwhals.dependencies import get_pandas from narwhals.utils import Implementation if TYPE_CHECKING: @@ -428,17 +427,13 @@ def std( def skew(self) -> Any: np = get_numpy() - pd = get_pandas() ser = self._native_series - if hasattr(ser, "skew") and not isinstance(ser.dtype, pd.ArrowDtype): - return float(ser.skew()) - else: - values = ser.to_numpy() - m = np.mean(values) - m2 = np.mean((values - m) ** 2) - m3 = np.mean((values - m) ** 3) - g1 = m3 / (m2**1.5) - return float(g1) # Population skewness + values = ser.to_numpy() + m = np.mean(values) + m2 = np.mean((values - m) ** 2) + m3 = np.mean((values - m) ** 3) + g1 = m3 / (m2**1.5) + return float(g1) # Biased population skewness def len(self) -> Any: return len(self._native_series) diff --git a/tests/expr_and_series/unary_test.py b/tests/expr_and_series/unary_test.py index 76579690b..cd8909e15 100644 --- a/tests/expr_and_series/unary_test.py +++ b/tests/expr_and_series/unary_test.py @@ -18,11 +18,14 @@ def test_unary(constructor: Constructor, request: pytest.FixtureRequest) -> None a_sum=nw.col("a").sum(), a_skew=nw.col("a").skew(), b_nunique=nw.col("b").n_unique(), + b_skew=nw.col("b").skew(), z_min=nw.col("z").min(), z_max=nw.col("z").max(), ) .select( - nw.col("a_mean", "a_sum", "a_skew", "b_nunique", "z_min", "z_max").unique() + nw.col( + "a_mean", "a_sum", "a_skew", "b_nunique", "b_skew", "z_min", "z_max" + ).unique() ) ) expected = { @@ -30,6 +33,7 @@ def test_unary(constructor: Constructor, request: pytest.FixtureRequest) -> None "a_sum": [6], "a_skew": [0.0], "b_nunique": [2], + "b_skew": [0.7071067811865465], "z_min": [7], "z_max": [9], } @@ -44,6 +48,7 @@ def test_unary_series(constructor_eager: Any) -> None: "a_sum": [df["a"].sum()], "a_skew": [df["a"].skew()], "b_nunique": [df["b"].n_unique()], + "b_skew": [df["b"].skew()], "z_min": [df["z"].min()], "z_max": [df["z"].max()], } @@ -52,6 +57,7 @@ def test_unary_series(constructor_eager: Any) -> None: "a_sum": [6], "a_skew": [0.0], "b_nunique": [2], + "b_skew": [0.7071067811865465], "z_min": [7.0], "z_max": [9.0], } From fc375291a2be1e0d9e938a3d550543ca60ba6522 Mon Sep 17 00:00:00 2001 From: Carlo Lepelaars Date: Tue, 15 Oct 2024 18:09:12 +0200 Subject: [PATCH 05/12] Add pyarrow example for skew Expr --- narwhals/expr.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/narwhals/expr.py b/narwhals/expr.py index 67e281b76..3371afaf6 100644 --- a/narwhals/expr.py +++ b/narwhals/expr.py @@ -443,9 +443,11 @@ def skew(self) -> Self: Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> df_pd = pd.DataFrame({"a": [1, 2, 3, 4, 5], "b": [1, 1, 2, 10, 100]}) >>> df_pl = pl.DataFrame({"a": [1, 2, 3, 4, 5], "b": [1, 1, 2, 10, 100]}) + >>> df_pa = pa.Table.from_pandas(df_pd) Let's define a dataframe-agnostic function: @@ -453,7 +455,7 @@ def skew(self) -> Self: ... def func(df): ... return df.select(nw.col("a", "b").skew()) - We can then pass either pandas or Polars to `func`: + We can then pass pandas, Polars, or PyArrow to `func`: >>> func(df_pd) a b @@ -467,6 +469,13 @@ def skew(self) -> Self: ╞═════╪══════════╡ │ 0.0 ┆ 1.472427 │ └─────┴──────────┘ + >>> func(df_pa) + pyarrow.Table + a: double + b: double + ---- + a: [[0]] + b: [[1.4724267269058975]] """ return self.__class__(lambda plx: self._call(plx).skew()) From 02fdb4c40d882eebee6a569b7de4ff7faf03c05f Mon Sep 17 00:00:00 2001 From: Carlo Lepelaars Date: Tue, 15 Oct 2024 20:23:23 +0200 Subject: [PATCH 06/12] Fix: Add a_skew to schema --- tests/expr_and_series/unary_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/expr_and_series/unary_test.py b/tests/expr_and_series/unary_test.py index f89b75509..7d31f1a4c 100644 --- a/tests/expr_and_series/unary_test.py +++ b/tests/expr_and_series/unary_test.py @@ -18,8 +18,8 @@ def test_unary(constructor: Constructor) -> None: z_min=nw.col("z").min(), z_max=nw.col("z").max(), ) - .unique(["a_mean", "a_sum", "b_nunique", "b_skew", "z_min", "z_max"]) - .select(["a_mean", "a_sum", "b_nunique", "b_skew", "z_min", "z_max"]) + .unique(["a_mean", "a_sum", "a_skew", "b_nunique", "b_skew", "z_min", "z_max"]) + .select(["a_mean", "a_sum", "a_skew", "b_nunique", "b_skew", "z_min", "z_max"]) ) expected = { "a_mean": [2], From 895be9cca347ea2b78737ec85c9c80331b0c45d9 Mon Sep 17 00:00:00 2001 From: Carlo Lepelaars Date: Thu, 17 Oct 2024 13:20:03 +0200 Subject: [PATCH 07/12] Use native operation for PandasLikeSeries skew. Dask skew expr --- narwhals/_dask/expr.py | 7 +++++++ narwhals/_pandas_like/series.py | 11 ++++------- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/narwhals/_dask/expr.py b/narwhals/_dask/expr.py index 693fcad5e..1137ab0c8 100644 --- a/narwhals/_dask/expr.py +++ b/narwhals/_dask/expr.py @@ -401,6 +401,13 @@ def std(self, ddof: int = 1) -> Self: returns_scalar=True, ) + def skew(self) -> Any: + return self._from_call( + lambda _input: _input.skew(), + "skew", + returns_scalar=True, + ) + def shift(self, n: int) -> Self: return self._from_call( lambda _input, n: _input.shift(n), diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py index 63f3224e3..16d61f5ce 100644 --- a/narwhals/_pandas_like/series.py +++ b/narwhals/_pandas_like/series.py @@ -15,7 +15,6 @@ from narwhals._pandas_like.utils import set_axis from narwhals._pandas_like.utils import to_datetime from narwhals._pandas_like.utils import validate_column_comparand -from narwhals.dependencies import get_numpy from narwhals.utils import Implementation if TYPE_CHECKING: @@ -426,13 +425,11 @@ def std( return ser.std(ddof=ddof) def skew(self) -> Any: - np = get_numpy() ser = self._native_series - values = ser.to_numpy() - m = np.mean(values) - m2 = np.mean((values - m) ** 2) - m3 = np.mean((values - m) ** 3) - g1 = m3 / (m2**1.5) + m = ser.mean() + m2 = ((ser - m) ** 2).mean() + m3 = ((ser - m) ** 3).mean() + g1 = m3 / (m2**1.5) if m2 != 0 else 0 return float(g1) # Biased population skewness def len(self) -> Any: From a3b71bc7238b8eba8da2deadd84b1882d02a3dc3 Mon Sep 17 00:00:00 2001 From: Carlo Lepelaars Date: Thu, 17 Oct 2024 13:36:21 +0200 Subject: [PATCH 08/12] Use native pyarrow operations for skew --- narwhals/_arrow/series.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/narwhals/_arrow/series.py b/narwhals/_arrow/series.py index 1570d964d..444dc39d8 100644 --- a/narwhals/_arrow/series.py +++ b/narwhals/_arrow/series.py @@ -298,14 +298,20 @@ def std(self, ddof: int = 1) -> int: return pc.stddev(self._native_series, ddof=ddof) # type: ignore[no-any-return] - def skew(self) -> float: - import numpy as np # ignore-banned-import + def skew(self) -> Any: + import pyarrow.compute as pc # ignore-banned-import() - values = self._native_series.to_numpy() - m = np.mean(values) - m2 = np.mean((values - m) ** 2) - m3 = np.mean((values - m) ** 3) - g1 = m3 / (m2**1.5) + ser = self._native_series + m = pc.mean(ser) + m2 = pc.mean(pc.multiply(pc.subtract(ser, m), pc.subtract(ser, m))) + m3 = pc.mean( + pc.multiply( + pc.multiply(pc.subtract(ser, m), pc.subtract(ser, m)), pc.subtract(ser, m) + ) + ) + m2_py = m2.as_py() + m3_py = m3.as_py() + g1 = float(m3_py) / (float(m2_py) ** 1.5) if float(m2_py) != 0 else 0 return float(g1) # Biased population skewness def count(self) -> int: From 4ff077df1fd1091ecb528764788b2716d3622a5a Mon Sep 17 00:00:00 2001 From: Carlo Lepelaars Date: Fri, 18 Oct 2024 12:30:20 +0200 Subject: [PATCH 09/12] Simplify arrow skew. non-trivial example for series.skew. --- narwhals/_arrow/series.py | 10 +++------- narwhals/series.py | 12 ++++++++---- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/narwhals/_arrow/series.py b/narwhals/_arrow/series.py index 444dc39d8..a0ddfa308 100644 --- a/narwhals/_arrow/series.py +++ b/narwhals/_arrow/series.py @@ -302,13 +302,9 @@ def skew(self) -> Any: import pyarrow.compute as pc # ignore-banned-import() ser = self._native_series - m = pc.mean(ser) - m2 = pc.mean(pc.multiply(pc.subtract(ser, m), pc.subtract(ser, m))) - m3 = pc.mean( - pc.multiply( - pc.multiply(pc.subtract(ser, m), pc.subtract(ser, m)), pc.subtract(ser, m) - ) - ) + m = pc.subtract(ser, pc.mean(ser)) + m2 = pc.mean(pc.power(m, 2)) + m3 = pc.mean(pc.power(m, 3)) m2_py = m2.as_py() m3_py = m3.as_py() g1 = float(m3_py) / (float(m2_py) ** 1.5) if float(m2_py) != 0 else 0 diff --git a/narwhals/series.py b/narwhals/series.py index 1af0bf1fa..56ddfe428 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -529,10 +529,12 @@ def skew(self) -> Any: Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw - >>> s = [1, 2, 3, 4, 5] + >>> s = [1, 1, 2, 10, 100] >>> s_pd = pd.Series(s) >>> s_pl = pl.Series(s) + >>> s_pa = pa.array(s) We define a library agnostic function: @@ -540,12 +542,14 @@ def skew(self) -> Any: ... def func(s): ... return s.skew() - We can then pass either pandas or Polars to `func`: + We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: >>> func(s_pd) - 0.0 + 1.472427 >>> func(s_pl) - 0.0 + 1.472427 + >>> func(s_pa) + 1.472427 Notes: The skewness is a measure of the asymmetry of the probability distribution. From 11efd49c8b36cda4a9e857b02119c67f295a3364 Mon Sep 17 00:00:00 2001 From: Carlo Lepelaars Date: Fri, 18 Oct 2024 17:25:05 +0200 Subject: [PATCH 10/12] unary_test with nan data. 2 element and 1 element unary tests --- narwhals/_arrow/series.py | 2 + narwhals/_pandas_like/series.py | 2 + tests/expr_and_series/unary_test.py | 113 +++++++++++++++++++++++++++- 3 files changed, 115 insertions(+), 2 deletions(-) diff --git a/narwhals/_arrow/series.py b/narwhals/_arrow/series.py index a0ddfa308..72ff25b4e 100644 --- a/narwhals/_arrow/series.py +++ b/narwhals/_arrow/series.py @@ -302,6 +302,8 @@ def skew(self) -> Any: import pyarrow.compute as pc # ignore-banned-import() ser = self._native_series + if len(ser) <= 1: + return float("nan") m = pc.subtract(ser, pc.mean(ser)) m2 = pc.mean(pc.power(m, 2)) m3 = pc.mean(pc.power(m, 3)) diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py index 1358772ce..90301a2a3 100644 --- a/narwhals/_pandas_like/series.py +++ b/narwhals/_pandas_like/series.py @@ -426,6 +426,8 @@ def std( def skew(self) -> Any: ser = self._native_series + if len(ser) <= 1 or ser.isna().any(): + return float("nan") m = ser.mean() m2 = ((ser - m) ** 2).mean() m3 = ((ser - m) ** 3).mean() diff --git a/tests/expr_and_series/unary_test.py b/tests/expr_and_series/unary_test.py index 6f6d95562..d68247101 100644 --- a/tests/expr_and_series/unary_test.py +++ b/tests/expr_and_series/unary_test.py @@ -1,5 +1,7 @@ from __future__ import annotations +import warnings + import narwhals.stable.v1 as nw from tests.utils import Constructor from tests.utils import ConstructorEager @@ -7,13 +9,20 @@ def test_unary(constructor: Constructor) -> None: - data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} + data = { + "a": [1, 3, 2], + "b": [4, 4, 6], + "c": [7.0, 8.0, float("nan")], + "z": [7.0, 8, 9], + } result = nw.from_native(constructor(data)).select( a_mean=nw.col("a").mean(), a_sum=nw.col("a").sum(), a_skew=nw.col("a").skew(), b_nunique=nw.col("b").n_unique(), b_skew=nw.col("b").skew(), + c_nunique=nw.col("c").n_unique(), + c_skew=nw.col("c").skew(), z_min=nw.col("z").min(), z_max=nw.col("z").max(), ) @@ -23,6 +32,8 @@ def test_unary(constructor: Constructor) -> None: "a_skew": [0.0], "b_nunique": [2], "b_skew": [0.7071067811865465], + "c_nunique": [3], + "c_skew": [float("nan")], "z_min": [7], "z_max": [9], } @@ -30,7 +41,12 @@ def test_unary(constructor: Constructor) -> None: def test_unary_series(constructor_eager: ConstructorEager) -> None: - data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} + data = { + "a": [1, 3, 2], + "b": [4, 4, 6], + "c": [7.0, 8.0, float("nan")], + "z": [7.0, 8, 9], + } df = nw.from_native(constructor_eager(data), eager_only=True) result = { "a_mean": [df["a"].mean()], @@ -38,6 +54,8 @@ def test_unary_series(constructor_eager: ConstructorEager) -> None: "a_skew": [df["a"].skew()], "b_nunique": [df["b"].n_unique()], "b_skew": [df["b"].skew()], + "c_nunique": [df["c"].n_unique()], + "c_skew": [df["c"].skew()], "z_min": [df["z"].min()], "z_max": [df["z"].max()], } @@ -47,7 +65,98 @@ def test_unary_series(constructor_eager: ConstructorEager) -> None: "a_skew": [0.0], "b_nunique": [2], "b_skew": [0.7071067811865465], + "c_nunique": [3], + "c_skew": [float("nan")], "z_min": [7.0], "z_max": [9.0], } compare_dicts(result, expected) + + +def test_unary_two_elements(constructor: Constructor) -> None: + data = {"a": [1, 2], "b": [2, 10], "c": [2.0, float("nan")]} + result = nw.from_native(constructor(data)).select( + a_nunique=nw.col("a").n_unique(), + a_skew=nw.col("a").skew(), + b_nunique=nw.col("b").n_unique(), + b_skew=nw.col("b").skew(), + c_nunique=nw.col("c").n_unique(), + c_skew=nw.col("c").skew(), + ) + expected = { + "a_nunique": [2], + "a_skew": [0.0], + "b_nunique": [2], + "b_skew": [0.0], + "c_nunique": [2], + "c_skew": [float("nan")], + } + compare_dicts(result, expected) + + +def test_unary_two_elements_series(constructor_eager: ConstructorEager) -> None: + data = {"a": [1, 2], "b": [2, 10], "c": [2.0, float("nan")]} + df = nw.from_native(constructor_eager(data), eager_only=True) + result = { + "a_nunique": [df["a"].n_unique()], + "a_skew": [df["a"].skew()], + "b_nunique": [df["b"].n_unique()], + "b_skew": [df["b"].skew()], + "c_nunique": [df["c"].n_unique()], + "c_skew": [df["c"].skew()], + } + expected = { + "a_nunique": [2], + "a_skew": [0.0], + "b_nunique": [2], + "b_skew": [0.0], + "c_nunique": [2], + "c_skew": [float("nan")], + } + compare_dicts(result, expected) + + +def test_unary_one_element(constructor: Constructor) -> None: + data = {"a": [1], "b": [2], "c": [float("nan")]} + # Dask runs into a divide by zero RuntimeWarning for 1 element skew. + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + result = nw.from_native(constructor(data)).select( + a_nunique=nw.col("a").n_unique(), + a_skew=nw.col("a").skew(), + b_nunique=nw.col("b").n_unique(), + b_skew=nw.col("b").skew(), + c_nunique=nw.col("c").n_unique(), + c_skew=nw.col("c").skew(), + ) + expected = { + "a_nunique": [1], + "a_skew": [float("nan")], + "b_nunique": [1], + "b_skew": [float("nan")], + "c_nunique": [1], + "c_skew": [float("nan")], + } + compare_dicts(result, expected) + + +def test_unary_one_element_series(constructor_eager: ConstructorEager) -> None: + data = {"a": [1], "b": [2], "c": [float("nan")]} + df = nw.from_native(constructor_eager(data)) + result = { + "a_nunique": [df["a"].n_unique()], + "a_skew": [df["a"].skew()], + "b_nunique": [df["b"].n_unique()], + "b_skew": [df["b"].skew()], + "c_nunique": [df["c"].n_unique()], + "c_skew": [df["c"].skew()], + } + expected = { + "a_nunique": [1], + "a_skew": [float("nan")], + "b_nunique": [1], + "b_skew": [float("nan")], + "c_nunique": [1], + "c_skew": [float("nan")], + } + compare_dicts(result, expected) From 26a64f8971c46865237359914d5c66d5b39816c3 Mon Sep 17 00:00:00 2001 From: Carlo Lepelaars Date: Fri, 18 Oct 2024 17:39:43 +0200 Subject: [PATCH 11/12] Fix doctest for Series skew --- narwhals/series.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/narwhals/series.py b/narwhals/series.py index 56ddfe428..a101b1fde 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -545,11 +545,9 @@ def skew(self) -> Any: We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: >>> func(s_pd) - 1.472427 + 1.4724267269058975 >>> func(s_pl) - 1.472427 - >>> func(s_pa) - 1.472427 + 1.4724267269058975 Notes: The skewness is a measure of the asymmetry of the probability distribution. From 2014036acd99be345a4763e6c9276089c7132a97 Mon Sep 17 00:00:00 2001 From: Carlo Lepelaars Date: Wed, 23 Oct 2024 15:16:12 +0200 Subject: [PATCH 12/12] Make skew nan policy consistent with Polars --- narwhals/_arrow/series.py | 22 ++++++++++++++-------- narwhals/_pandas_like/series.py | 18 ++++++++++++------ tests/expr_and_series/unary_test.py | 12 ++++++------ 3 files changed, 32 insertions(+), 20 deletions(-) diff --git a/narwhals/_arrow/series.py b/narwhals/_arrow/series.py index 72ff25b4e..7b153e95f 100644 --- a/narwhals/_arrow/series.py +++ b/narwhals/_arrow/series.py @@ -302,15 +302,21 @@ def skew(self) -> Any: import pyarrow.compute as pc # ignore-banned-import() ser = self._native_series - if len(ser) <= 1: + ser_not_null = pc.drop_null(ser) + if len(ser_not_null) == 0: + return None + elif len(ser_not_null) == 1: return float("nan") - m = pc.subtract(ser, pc.mean(ser)) - m2 = pc.mean(pc.power(m, 2)) - m3 = pc.mean(pc.power(m, 3)) - m2_py = m2.as_py() - m3_py = m3.as_py() - g1 = float(m3_py) / (float(m2_py) ** 1.5) if float(m2_py) != 0 else 0 - return float(g1) # Biased population skewness + elif len(ser_not_null) == 2: + return 0.0 + else: + m = pc.subtract(ser_not_null, pc.mean(ser_not_null)) + m2 = pc.mean(pc.power(m, 2)) + m3 = pc.mean(pc.power(m, 3)) + m2_py = m2.as_py() + m3_py = m3.as_py() + g1 = float(m3_py) / (float(m2_py) ** 1.5) if float(m2_py) != 0 else 0 + return float(g1) # Biased population skewness def count(self) -> int: import pyarrow.compute as pc # ignore-banned-import() diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py index 90301a2a3..8eff7198a 100644 --- a/narwhals/_pandas_like/series.py +++ b/narwhals/_pandas_like/series.py @@ -426,13 +426,19 @@ def std( def skew(self) -> Any: ser = self._native_series - if len(ser) <= 1 or ser.isna().any(): + ser_not_null = ser.dropna() + if len(ser_not_null) == 0: + return None + elif len(ser_not_null) == 1: return float("nan") - m = ser.mean() - m2 = ((ser - m) ** 2).mean() - m3 = ((ser - m) ** 3).mean() - g1 = m3 / (m2**1.5) if m2 != 0 else 0 - return float(g1) # Biased population skewness + elif len(ser_not_null) == 2: + return 0.0 + else: + m = ser_not_null.mean() + m2 = ((ser_not_null - m) ** 2).mean() + m3 = ((ser_not_null - m) ** 3).mean() + g1 = m3 / (m2**1.5) if m2 != 0 else 0 + return float(g1) # Biased population skewness def len(self) -> Any: return len(self._native_series) diff --git a/tests/expr_and_series/unary_test.py b/tests/expr_and_series/unary_test.py index d68247101..a4235a6fd 100644 --- a/tests/expr_and_series/unary_test.py +++ b/tests/expr_and_series/unary_test.py @@ -12,7 +12,7 @@ def test_unary(constructor: Constructor) -> None: data = { "a": [1, 3, 2], "b": [4, 4, 6], - "c": [7.0, 8.0, float("nan")], + "c": [7.0, 8.0, None], "z": [7.0, 8, 9], } result = nw.from_native(constructor(data)).select( @@ -33,7 +33,7 @@ def test_unary(constructor: Constructor) -> None: "b_nunique": [2], "b_skew": [0.7071067811865465], "c_nunique": [3], - "c_skew": [float("nan")], + "c_skew": [0.0], "z_min": [7], "z_max": [9], } @@ -44,7 +44,7 @@ def test_unary_series(constructor_eager: ConstructorEager) -> None: data = { "a": [1, 3, 2], "b": [4, 4, 6], - "c": [7.0, 8.0, float("nan")], + "c": [7.0, 8.0, None], "z": [7.0, 8, 9], } df = nw.from_native(constructor_eager(data), eager_only=True) @@ -66,7 +66,7 @@ def test_unary_series(constructor_eager: ConstructorEager) -> None: "b_nunique": [2], "b_skew": [0.7071067811865465], "c_nunique": [3], - "c_skew": [float("nan")], + "c_skew": [0.0], "z_min": [7.0], "z_max": [9.0], } @@ -74,7 +74,7 @@ def test_unary_series(constructor_eager: ConstructorEager) -> None: def test_unary_two_elements(constructor: Constructor) -> None: - data = {"a": [1, 2], "b": [2, 10], "c": [2.0, float("nan")]} + data = {"a": [1, 2], "b": [2, 10], "c": [2.0, None]} result = nw.from_native(constructor(data)).select( a_nunique=nw.col("a").n_unique(), a_skew=nw.col("a").skew(), @@ -95,7 +95,7 @@ def test_unary_two_elements(constructor: Constructor) -> None: def test_unary_two_elements_series(constructor_eager: ConstructorEager) -> None: - data = {"a": [1, 2], "b": [2, 10], "c": [2.0, float("nan")]} + data = {"a": [1, 2], "b": [2, 10], "c": [2.0, None]} df = nw.from_native(constructor_eager(data), eager_only=True) result = { "a_nunique": [df["a"].n_unique()],