narwhals-dev · CarloLepelaars · Oct 14, 2024 · Oct 14, 2024 · Oct 14, 2024 · Oct 14, 2024
diff --git a/docs/api-reference/expr.md b/docs/api-reference/expr.md
@@ -40,6 +40,7 @@
         - sample
         - shift
         - sort
+        - skew
         - std
         - sum
         - tail

diff --git a/docs/api-reference/series.md b/docs/api-reference/series.md
@@ -50,6 +50,7 @@
         - shape
         - shift
         - sort
+        - skew
         - std
         - sum
         - tail

diff --git a/narwhals/_arrow/expr.py b/narwhals/_arrow/expr.py
@@ -213,6 +213,9 @@ def n_unique(self) -> Self:
     def std(self, ddof: int = 1) -> Self:
         return reuse_series_implementation(self, "std", ddof=ddof, returns_scalar=True)
 
+    def skew(self) -> Self:  # scalar result; presumably reuses the series `skew`
+        return reuse_series_implementation(self, "skew", returns_scalar=True)
+
     def cast(self, dtype: DType) -> Self:
         return reuse_series_implementation(self, "cast", dtype)
 

diff --git a/narwhals/_arrow/namespace.py b/narwhals/_arrow/namespace.py
@@ -325,6 +325,11 @@ def mean(self, *column_names: str) -> ArrowExpr:
             *column_names, backend_version=self._backend_version, dtypes=self._dtypes
         ).mean()
 
+    def skew(self, *column_names: str) -> ArrowExpr:  # skew of the named columns
+        return ArrowExpr.from_column_names(
+            *column_names, backend_version=self._backend_version, dtypes=self._dtypes
+        ).skew()
+
     def max(self, *column_names: str) -> ArrowExpr:
         return ArrowExpr.from_column_names(
             *column_names, backend_version=self._backend_version, dtypes=self._dtypes

diff --git a/narwhals/_arrow/series.py b/narwhals/_arrow/series.py
@@ -13,6 +13,7 @@
 from narwhals._arrow.utils import narwhals_to_native_dtype
 from narwhals._arrow.utils import native_to_narwhals_dtype
 from narwhals._arrow.utils import validate_column_comparand
+from narwhals.dependencies import get_numpy
 from narwhals.utils import Implementation
 from narwhals.utils import generate_unique_token
 
@@ -298,6 +299,17 @@ def std(self, ddof: int = 1) -> int:
 
         return pc.stddev(self._native_series, ddof=ddof)  # type: ignore[no-any-return]
 
+    def skew(self) -> float:
+        """Biased (population) skewness g1 = m3 / m2**1.5 of the non-null values."""
+        np = get_numpy()
+        values = self._native_series.drop_null().to_numpy()
+        if len(values) < 3:  # too few observations: NaN, as in the pandas-like path
+            return float("nan")
+        centered = values - np.mean(values)
+        m2 = np.mean(centered**2)
+        m3 = np.mean(centered**3)
+        return float(m3 / m2**1.5) if m2 != 0 else float("nan")
+
     def count(self) -> int:
         import pyarrow.compute as pc  # ignore-banned-import()
 

diff --git a/narwhals/_pandas_like/expr.py b/narwhals/_pandas_like/expr.py
@@ -221,6 +221,9 @@ def mean(self) -> Self:
     def std(self, *, ddof: int = 1) -> Self:
         return reuse_series_implementation(self, "std", ddof=ddof, returns_scalar=True)
 
+    def skew(self) -> Self:  # scalar result; presumably reuses the series `skew`
+        return reuse_series_implementation(self, "skew", returns_scalar=True)
+
     def any(self) -> Self:
         return reuse_series_implementation(self, "any", returns_scalar=True)
 

diff --git a/narwhals/_pandas_like/namespace.py b/narwhals/_pandas_like/namespace.py
@@ -177,6 +177,14 @@ def mean(self, *column_names: str) -> PandasLikeExpr:
             dtypes=self._dtypes,
         ).mean()
 
+    def skew(self, *column_names: str) -> PandasLikeExpr:  # skew of the named columns
+        return PandasLikeExpr.from_column_names(
+            *column_names,
+            implementation=self._implementation,
+            backend_version=self._backend_version,
+            dtypes=self._dtypes,
+        ).skew()
+
     def max(self, *column_names: str) -> PandasLikeExpr:
         return PandasLikeExpr.from_column_names(
             *column_names,

diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py
@@ -15,6 +15,8 @@
 from narwhals._pandas_like.utils import set_axis
 from narwhals._pandas_like.utils import to_datetime
 from narwhals._pandas_like.utils import validate_column_comparand
+from narwhals.dependencies import get_numpy
+from narwhals.dependencies import get_pandas
 from narwhals.utils import Implementation
 
 if TYPE_CHECKING:
@@ -424,6 +426,23 @@ def std(
         ser = self._native_series
         return ser.std(ddof=ddof)
 
+    def skew(self) -> Any:
+        """Return the biased (population) skewness, m3 / m2**1.5, of non-null values.
+
+        Native `Series.skew()` is avoided on purpose: pandas bias-corrects it, so the
+        result would disagree with the Arrow backend's population formula. Fewer than
+        three values, or zero variance, give NaN.
+        """
+        ser = self._native_series.dropna()
+        if len(ser) < 3:
+            return float("nan")
+        centered = ser - ser.mean()
+        m2 = (centered**2).mean()
+        m3 = (centered**3).mean()
+        if m2 == 0:
+            return float("nan")
+        return float(m3 / m2**1.5)
+
     def len(self) -> Any:
         return len(self._native_series)
 

diff --git a/narwhals/_polars/namespace.py b/narwhals/_polars/namespace.py
@@ -7,14 +7,14 @@
 from typing import Sequence
 
 from narwhals._expression_parsing import parse_into_exprs
+from narwhals._polars.expr import PolarsExpr
 from narwhals._polars.utils import extract_args_kwargs
 from narwhals._polars.utils import narwhals_to_native_dtype
 from narwhals.utils import Implementation
 
 if TYPE_CHECKING:
     from narwhals._polars.dataframe import PolarsDataFrame
     from narwhals._polars.dataframe import PolarsLazyFrame
-    from narwhals._polars.expr import PolarsExpr
     from narwhals._polars.typing import IntoPolarsExpr
     from narwhals.dtypes import DType
     from narwhals.typing import DTypes
@@ -98,6 +98,13 @@ def mean(self, *column_names: str) -> PolarsExpr:
             return PolarsExpr(pl.mean([*column_names]), dtypes=self._dtypes)  # type: ignore[arg-type]
         return PolarsExpr(pl.mean(*column_names), dtypes=self._dtypes)
 
+    def skew(self, *column_names: str) -> PolarsExpr:
+        """Return an expression computing the skewness of each named column."""
+        import polars as pl  # ignore-banned-import()
+
+        # Polars has no top-level `pl.skew`; skewness is only exposed as `Expr.skew`.
+        return PolarsExpr(pl.col([*column_names]).skew(), dtypes=self._dtypes)
+
     def mean_horizontal(self, *exprs: IntoPolarsExpr) -> PolarsExpr:
         import polars as pl  # ignore-banned-import()
 

diff --git a/narwhals/expr.py b/narwhals/expr.py
@@ -399,7 +399,7 @@ def std(self, *, ddof: int = 1) -> Self:
         Get standard deviation.
 
         Arguments:
-            ddof: “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof,
+            ddof: "Delta Degrees of Freedom": the divisor used in the calculation is N - ddof,
                      where N represents the number of elements. By default ddof is 1.
 
         Examples:
@@ -433,6 +433,43 @@ def std(self, *, ddof: int = 1) -> Self:
         """
         return self.__class__(lambda plx: self._call(plx).std(ddof=ddof))
 
+    def skew(self) -> Self:
+        """
+        Calculate the skewness of a column.
+
+        Returns:
+            An expression representing the skewness of the column.
+
+        Examples:
+            >>> import pandas as pd
+            >>> import polars as pl
+            >>> import narwhals as nw
+            >>> df_pd = pd.DataFrame({"a": [1, 2, 3, 4, 5], "b": [1, 1, 2, 3, 3]})
+            >>> df_pl = pl.DataFrame({"a": [1, 2, 3, 4, 5], "b": [1, 1, 2, 3, 3]})
+
+            Let's define a dataframe-agnostic function:
+
+            >>> @nw.narwhalify
+            ... def func(df):
+            ...     return df.select(nw.col("a", "b").skew())
+
+            We can then pass either pandas or Polars to `func`:
+
+            >>> func(df_pd)
+                 a    b
+            0  0.0  0.0
+            >>> func(df_pl)
+            shape: (1, 2)
+            ┌─────┬─────┐
+            │ a   ┆ b   │
+            │ --- ┆ --- │
+            │ f64 ┆ f64 │
+            ╞═════╪═════╡
+            │ 0.0 ┆ 0.0 │
+            └─────┴─────┘
+        """
+        return self.__class__(lambda plx: self._call(plx).skew())
+
     def sum(self) -> Expr:
         """
         Return the sum value.

diff --git a/narwhals/series.py b/narwhals/series.py
@@ -519,6 +519,40 @@ def mean(self) -> Any:
         """
         return self._compliant_series.mean()
 
+    def skew(self) -> Any:
+        """
+        Calculate the skewness of the Series.
+
+        Returns:
+            The skewness of the Series.
+
+        Examples:
+            >>> import pandas as pd
+            >>> import polars as pl
+            >>> import narwhals as nw
+            >>> s = [1, 2, 3, 4, 5]
+            >>> s_pd = pd.Series(s)
+            >>> s_pl = pl.Series(s)
+
+            We define a library agnostic function:
+
+            >>> @nw.narwhalify
+            ... def func(s):
+            ...     return s.skew()
+
+            We can then pass either pandas or Polars to `func`:
+
+            >>> func(s_pd)
+            0.0
+            >>> func(s_pl)
+            0.0
+
+        Notes:
+            The skewness is a measure of the asymmetry of the probability distribution.
+            A perfectly symmetric distribution has a skewness of 0.
+        """
+        return self._compliant_series.skew()
+
     def count(self) -> Any:
         """
         Returns the number of non-null elements in the Series.

diff --git a/tests/expr_and_series/unary_test.py b/tests/expr_and_series/unary_test.py
@@ -16,15 +16,19 @@ def test_unary(constructor: Constructor, request: pytest.FixtureRequest) -> None
         .with_columns(
             a_mean=nw.col("a").mean(),
             a_sum=nw.col("a").sum(),
+            a_skew=nw.col("a").skew(),
             b_nunique=nw.col("b").n_unique(),
             z_min=nw.col("z").min(),
             z_max=nw.col("z").max(),
         )
-        .select(nw.col("a_mean", "a_sum", "b_nunique", "z_min", "z_max").unique())
+        .select(
+            nw.col("a_mean", "a_sum", "a_skew", "b_nunique", "z_min", "z_max").unique()
+        )
     )
     expected = {
         "a_mean": [2],
         "a_sum": [6],
+        "a_skew": [0.0],
         "b_nunique": [2],
         "z_min": [7],
         "z_max": [9],
@@ -38,15 +42,17 @@ def test_unary_series(constructor_eager: Any) -> None:
     result = {
         "a_mean": [df["a"].mean()],
         "a_sum": [df["a"].sum()],
+        "a_skew": [df["a"].skew()],
         "b_nunique": [df["b"].n_unique()],
         "z_min": [df["z"].min()],
         "z_max": [df["z"].max()],
     }
     expected = {
-        "a_mean": [2],
+        "a_mean": [2.0],
         "a_sum": [6],
+        "a_skew": [0.0],
         "b_nunique": [2],
-        "z_min": [7],
-        "z_max": [9],
+        "z_min": [7.0],
+        "z_max": [9.0],
     }
     compare_dicts(result, expected)