Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add support for median #1212

Draft
wants to merge 5 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/api-reference/expr.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
- len
- max
- mean
- median
- min
- mode
- null_count
Expand Down
1 change: 1 addition & 0 deletions docs/api-reference/narwhals.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ Here are the top-level functions available in Narwhals.
- maybe_set_index
- mean
- mean_horizontal
- median
- min
- min_horizontal
- narwhalify
Expand Down
1 change: 1 addition & 0 deletions docs/api-reference/series.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
- len
- max
- mean
- median
- min
- mode
- name
Expand Down
2 changes: 2 additions & 0 deletions narwhals/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
from narwhals.expr import max_horizontal
from narwhals.expr import mean
from narwhals.expr import mean_horizontal
from narwhals.expr import median
from narwhals.expr import min
from narwhals.expr import min_horizontal
from narwhals.expr import nth
Expand Down Expand Up @@ -97,6 +98,7 @@
"max_horizontal",
"mean",
"mean_horizontal",
"median",
"min",
"min_horizontal",
"nth",
Expand Down
3 changes: 3 additions & 0 deletions narwhals/_arrow/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,9 @@ def filter(self, *predicates: IntoArrowExpr) -> Self:
def mean(self) -> Self:
    """Build the `mean` reduction by calling `reuse_series_implementation` with `returns_scalar=True`."""
    method_name = "mean"
    return reuse_series_implementation(
        self,
        method_name,
        returns_scalar=True,
    )

def median(self) -> Self:
    """Build the `median` reduction by calling `reuse_series_implementation` with `returns_scalar=True`."""
    method_name = "median"
    return reuse_series_implementation(
        self,
        method_name,
        returns_scalar=True,
    )

def count(self) -> Self:
    """Build the `count` reduction by calling `reuse_series_implementation` with `returns_scalar=True`."""
    method_name = "count"
    return reuse_series_implementation(
        self,
        method_name,
        returns_scalar=True,
    )

Expand Down
5 changes: 5 additions & 0 deletions narwhals/_arrow/namespace.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,6 +325,11 @@ def mean(self, *column_names: str) -> ArrowExpr:
*column_names, backend_version=self._backend_version, dtypes=self._dtypes
).mean()

def median(self, *column_names: str) -> ArrowExpr:
    """Create an `ArrowExpr` over `column_names` and apply `.median()` to it.

    Arguments:
        column_names: Names of the columns to aggregate.

    Returns:
        The result of calling `.median()` on the column-name expression.
    """
    selected = ArrowExpr.from_column_names(
        *column_names,
        backend_version=self._backend_version,
        dtypes=self._dtypes,
    )
    return selected.median()

def max(self, *column_names: str) -> ArrowExpr:
return ArrowExpr.from_column_names(
*column_names, backend_version=self._backend_version, dtypes=self._dtypes
Expand Down
5 changes: 5 additions & 0 deletions narwhals/_arrow/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,11 @@ def mean(self) -> int:

return pc.mean(self._native_series) # type: ignore[no-any-return]

def median(self) -> int:
    """Return the median of the native series via `pyarrow.compute.approximate_median`.

    NOTE(review): `approximate_median` is, per its name, an approximation, so
    the value may differ from an exact median. The `int` annotation mirrors
    the neighbouring reductions; the call presumably yields a PyArrow scalar
    rather than a Python int — confirm.
    """
    import pyarrow.compute as pc  # ignore-banned-import()

    native = self._native_series
    return pc.approximate_median(native)  # type: ignore[no-any-return]

def min(self) -> int:
import pyarrow.compute as pc # ignore-banned-import()

Expand Down
7 changes: 7 additions & 0 deletions narwhals/_dask/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -381,6 +381,13 @@ def mean(self) -> Self:
returns_scalar=True,
)

def median(self) -> Self:
    """Build the `median` reduction through `_from_call`.

    The callable handed to `_from_call` invokes `median_approximate()` on its
    input, so the value is an approximation rather than an exact median.
    """
    expr_name = "median"
    return self._from_call(
        lambda _input: _input.median_approximate(),
        expr_name,
        returns_scalar=True,
    )

def min(self) -> Self:
return self._from_call(
lambda _input: _input.min(),
Expand Down
5 changes: 5 additions & 0 deletions narwhals/_dask/namespace.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,11 @@ def mean(self, *column_names: str) -> DaskExpr:
*column_names, backend_version=self._backend_version, dtypes=self._dtypes
).mean()

def median(self, *column_names: str) -> DaskExpr:
    """Create a `DaskExpr` over `column_names` and apply `.median()` to it.

    Arguments:
        column_names: Names of the columns to aggregate.

    Returns:
        The result of calling `.median()` on the column-name expression.
    """
    selected = DaskExpr.from_column_names(
        *column_names,
        backend_version=self._backend_version,
        dtypes=self._dtypes,
    )
    return selected.median()

def sum(self, *column_names: str) -> DaskExpr:
return DaskExpr.from_column_names(
*column_names, backend_version=self._backend_version, dtypes=self._dtypes
Expand Down
3 changes: 3 additions & 0 deletions narwhals/_pandas_like/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,9 @@ def count(self) -> Self:
def mean(self) -> Self:
    """Build the `mean` reduction by calling `reuse_series_implementation` with `returns_scalar=True`."""
    method_name = "mean"
    return reuse_series_implementation(
        self,
        method_name,
        returns_scalar=True,
    )

def median(self) -> Self:
    """Build the `median` reduction by calling `reuse_series_implementation` with `returns_scalar=True`."""
    method_name = "median"
    return reuse_series_implementation(
        self,
        method_name,
        returns_scalar=True,
    )

def std(self, *, ddof: int = 1) -> Self:
    """Build the `std` reduction by calling `reuse_series_implementation`.

    Arguments:
        ddof: Forwarded unchanged as the `ddof` keyword argument.
    """
    method_name = "std"
    return reuse_series_implementation(
        self,
        method_name,
        ddof=ddof,
        returns_scalar=True,
    )

Expand Down
8 changes: 8 additions & 0 deletions narwhals/_pandas_like/namespace.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,14 @@ def mean(self, *column_names: str) -> PandasLikeExpr:
dtypes=self._dtypes,
).mean()

def median(self, *column_names: str) -> PandasLikeExpr:
    """Create a `PandasLikeExpr` over `column_names` and apply `.median()` to it.

    Arguments:
        column_names: Names of the columns to aggregate.

    Returns:
        The result of calling `.median()` on the column-name expression.
    """
    selected = PandasLikeExpr.from_column_names(
        *column_names,
        implementation=self._implementation,
        backend_version=self._backend_version,
        dtypes=self._dtypes,
    )
    return selected.median()

def max(self, *column_names: str) -> PandasLikeExpr:
return PandasLikeExpr.from_column_names(
*column_names,
Expand Down
4 changes: 4 additions & 0 deletions narwhals/_pandas_like/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -421,6 +421,10 @@ def mean(self) -> Any:
ser = self._native_series
return ser.mean()

def median(self) -> Any:
    """Return whatever the native series' own `median()` returns."""
    return self._native_series.median()

def std(
self,
*,
Expand Down
7 changes: 7 additions & 0 deletions narwhals/_polars/namespace.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,13 @@ def mean_horizontal(self, *exprs: IntoPolarsExpr) -> PolarsExpr:
dtypes=self._dtypes,
)

def median(self, *column_names: str) -> PolarsExpr:
    """Wrap Polars' native ``pl.median`` over the given columns in a ``PolarsExpr``.

    Arguments:
        column_names: Names of the columns to aggregate.

    Returns:
        A ``PolarsExpr`` built from ``pl.median`` and this namespace's ``_dtypes``.
    """
    import polars as pl  # ignore-banned-import()

    from narwhals._polars.expr import PolarsExpr

    # NOTE(review): the names are passed as one list rather than unpacked, hence
    # the `arg-type` ignore — presumably to stay compatible with Polars versions
    # whose `pl.median` takes a single argument; confirm before changing.
    return PolarsExpr(pl.median([*column_names]), dtypes=self._dtypes)  # type: ignore[arg-type]

def concat_str(
self,
exprs: Iterable[IntoPolarsExpr],
Expand Down
96 changes: 96 additions & 0 deletions narwhals/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -447,6 +447,52 @@ def mean(self) -> Self:
"""
return self.__class__(lambda plx: self._call(plx).mean())

def median(self) -> Self:
    """
    Get median value.

    Notes:
        Results might slightly differ across backends due to differences in the
        underlying algorithms used to compute the median.

    Returns:
        A new expression of the same class whose call applies `.median()` to
        the result of this expression.

    Examples:
        >>> import pandas as pd
        >>> import polars as pl
        >>> import pyarrow as pa
        >>> import narwhals as nw
        >>> df_pd = pd.DataFrame({"a": [1, 8, 3], "b": [4, 5, 2]})
        >>> df_pl = pl.DataFrame({"a": [1, 8, 3], "b": [4, 5, 2]})
        >>> df_pa = pa.table({"a": [1, 8, 3], "b": [4, 5, 2]})

        Let's define a dataframe-agnostic function:

        >>> @nw.narwhalify
        ... def func(df):
        ...     return df.select(nw.col("a", "b").median())

        We can then pass any supported library such as pandas, Polars, or PyArrow to `func`:

        >>> func(df_pd)
             a    b
        0  3.0  4.0
        >>> func(df_pl)
        shape: (1, 2)
        ┌─────┬─────┐
        │ a   ┆ b   │
        │ --- ┆ --- │
        │ f64 ┆ f64 │
        ╞═════╪═════╡
        │ 3.0 ┆ 4.0 │
        └─────┴─────┘
        >>> func(df_pa)
        pyarrow.Table
        a: double
        b: double
        ----
        a: [[3]]
        b: [[4]]
    """
    return self.__class__(lambda plx: self._call(plx).median())

def std(self, *, ddof: int = 1) -> Self:
"""
Get standard deviation.
Expand Down Expand Up @@ -4548,6 +4594,56 @@ def mean(*columns: str) -> Expr:
return Expr(lambda plx: plx.mean(*columns))


def median(*columns: str) -> Expr:
    """
    Get the median value.

    Notes:
        - Syntactic sugar for ``nw.col(columns).median()``
        - Results might slightly differ across backends due to differences in the
          underlying algorithms used to compute the median.

    Arguments:
        columns: Name(s) of the columns to use in the aggregation function

    Returns:
        An expression that calls `median(*columns)` on the backend namespace.

    Examples:
        >>> import pandas as pd
        >>> import polars as pl
        >>> import pyarrow as pa
        >>> import narwhals as nw
        >>> df_pd = pd.DataFrame({"a": [4, 5, 2]})
        >>> df_pl = pl.DataFrame({"a": [4, 5, 2]})
        >>> df_pa = pa.table({"a": [4, 5, 2]})

        Let's define a dataframe agnostic function:

        >>> @nw.narwhalify
        ... def func(df):
        ...     return df.select(nw.median("a"))

        We can then pass any supported library such as pandas, Polars, or PyArrow to `func`:

        >>> func(df_pd)
             a
        0  4.0
        >>> func(df_pl)
        shape: (1, 1)
        ┌─────┐
        │ a   │
        │ --- │
        │ f64 │
        ╞═════╡
        │ 4.0 │
        └─────┘
        >>> func(df_pa)
        pyarrow.Table
        a: double
        ----
        a: [[4]]
    """
    return Expr(lambda plx: plx.median(*columns))


def min(*columns: str) -> Expr:
"""
Return the minimum value.
Expand Down
34 changes: 34 additions & 0 deletions narwhals/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -521,6 +521,40 @@ def mean(self) -> Any:
"""
return self._compliant_series.mean()

def median(self) -> Any:
    """
    Compute the median of this Series, returning a single value.

    Notes:
        Backends use different algorithms to compute the median, so results
        might differ slightly from one backend to another.

    Examples:
        >>> import pandas as pd
        >>> import polars as pl
        >>> import pyarrow as pa
        >>> import narwhals as nw
        >>> s = [5, 3, 8]
        >>> s_pd = pd.Series(s)
        >>> s_pl = pl.Series(s)
        >>> s_pa = pa.chunked_array([s])

        Let's define a library agnostic function:

        >>> @nw.narwhalify
        ... def func(s):
        ...     return s.median()

        We can then pass any supported library such as pandas, Polars, or PyArrow to `func`:

        >>> func(s_pd)
        np.float64(5.0)
        >>> func(s_pl)
        5.0
        >>> func(s_pa)
        <pyarrow.DoubleScalar: 5.0>
    """
    compliant = self._compliant_series
    return compliant.median()

def count(self) -> Any:
"""
Returns the number of non-null elements in the Series.
Expand Down
50 changes: 50 additions & 0 deletions narwhals/stable/v1/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -1357,6 +1357,55 @@ def mean(*columns: str) -> Expr:
return _stableify(nw.mean(*columns))


def median(*columns: str) -> Expr:
    """
    Get the median value.

    Notes:
        - Syntactic sugar for ``nw.col(columns).median()``
        - Results might slightly differ across backends due to differences in the
          underlying algorithms used to compute the median.

    Arguments:
        columns: Name(s) of the columns to use in the aggregation function

    Returns:
        The result of passing ``nw.median(*columns)`` through ``_stableify``.

    Examples:
        >>> import pandas as pd
        >>> import polars as pl
        >>> import pyarrow as pa
        >>> import narwhals.stable.v1 as nw
        >>> df_pd = pd.DataFrame({"a": [4, 5, 2]})
        >>> df_pl = pl.DataFrame({"a": [4, 5, 2]})
        >>> df_pa = pa.table({"a": [4, 5, 2]})

        Let's define a dataframe agnostic function:

        >>> @nw.narwhalify
        ... def func(df):
        ...     return df.select(nw.median("a"))

        We can then pass any supported library such as pandas, Polars, or PyArrow to `func`:

        >>> func(df_pd)
             a
        0  4.0
        >>> func(df_pl)
        shape: (1, 1)
        ┌─────┐
        │ a   │
        │ --- │
        │ f64 │
        ╞═════╡
        │ 4.0 │
        └─────┘
        >>> func(df_pa)
        pyarrow.Table
        a: double
        ----
        a: [[4]]
    """
    return _stableify(nw.median(*columns))


def sum(*columns: str) -> Expr:
"""
Sum all values.
Expand Down Expand Up @@ -2461,6 +2510,7 @@ def from_dict(
"max_horizontal",
"mean",
"mean_horizontal",
"median",
"min",
"min_horizontal",
"sum",
Expand Down
Loading
Loading