Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

enh: add dt.timestamp #1220

Merged
merged 29 commits into from
Oct 21, 2024
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
487a525
add timestamp method
raisadz Oct 17, 2024
b1e568c
add docstring example for series
raisadz Oct 17, 2024
5ecbed9
add docstring example for expr
raisadz Oct 17, 2024
7ed770d
update example to use time unit
raisadz Oct 17, 2024
d3f76cb
add tests
raisadz Oct 17, 2024
c9a8ec0
Merge remote-tracking branch 'upstream/main' into add-dt-timestamp
raisadz Oct 17, 2024
839891b
preserve pyarrow types, add tests
raisadz Oct 17, 2024
485c1e1
fix dtype comparisons, add test for dates
raisadz Oct 18, 2024
bf909ad
add parametrization
raisadz Oct 18, 2024
5301aa4
resolve conflicts after merge
raisadz Oct 18, 2024
a37e5cd
parametrize for other time units
raisadz Oct 18, 2024
fee2422
move common functions to utils, add tests for invalid inputs, add das…
raisadz Oct 18, 2024
ee760e6
Merge remote-tracking branch 'upstream/main' into add-dt-timestamp
MarcoGorelli Oct 20, 2024
dc86689
use more elif/else statements
MarcoGorelli Oct 20, 2024
70d6462
add timestamp_test
MarcoGorelli Oct 20, 2024
b1e97c9
version compat
MarcoGorelli Oct 20, 2024
124d588
pandas versions compat
MarcoGorelli Oct 20, 2024
6277094
coverage
MarcoGorelli Oct 20, 2024
d158f22
improve type hints
MarcoGorelli Oct 20, 2024
49d235c
insert a time zone for good measure
MarcoGorelli Oct 20, 2024
674b912
set time zone to utc first
MarcoGorelli Oct 20, 2024
263f093
split time zone into separate test
MarcoGorelli Oct 20, 2024
532b07f
more version-dependent xfails
MarcoGorelli Oct 20, 2024
186f926
xfail strict=False for these
MarcoGorelli Oct 20, 2024
a5ed5f6
coverage
MarcoGorelli Oct 20, 2024
a022148
dask xfail
MarcoGorelli Oct 20, 2024
3a572e6
modin xfail
MarcoGorelli Oct 20, 2024
a01d9ad
Merge remote-tracking branch 'upstream/main' into add-dt-timestamp
MarcoGorelli Oct 21, 2024
32bb502
typing
MarcoGorelli Oct 21, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 10 additions & 9 deletions docs/api-reference/expr_dt.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,22 +6,23 @@
members:
- convert_time_zone
- date
- year
- month
- day
- ordinal_day
- hour
- minute
- second
- millisecond
- microsecond
- millisecond
- minute
- month
- nanosecond
- ordinal_day
- replace_time_zone
- total_minutes
- total_seconds
- total_milliseconds
- second
- timestamp
- total_microseconds
- total_milliseconds
- total_minutes
- total_nanoseconds
- total_seconds
- to_string
- year
show_source: false
show_bases: false
19 changes: 10 additions & 9 deletions docs/api-reference/series_dt.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,22 +6,23 @@
members:
- convert_time_zone
- date
- year
- month
- day
- ordinal_day
- hour
- minute
- second
- millisecond
- microsecond
- millisecond
- minute
- month
- nanosecond
- ordinal_day
- replace_time_zone
- total_minutes
- total_seconds
- total_milliseconds
- second
- timestamp
- total_microseconds
- total_milliseconds
- total_minutes
- total_nanoseconds
- total_seconds
- to_string
- year
show_source: false
show_bases: false
5 changes: 5 additions & 0 deletions narwhals/_arrow/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -420,6 +420,11 @@ def convert_time_zone(self: Self, time_zone: str) -> ArrowExpr:
self._expr, "dt", "convert_time_zone", time_zone
)

def timestamp(self: Self, time_unit: Literal["ns", "us", "ms"] = "us") -> ArrowExpr:
    """Expression-level entry point for `dt.timestamp`.

    Hands the wrapped expression to `reuse_series_namespace_implementation`
    together with the namespace name `"dt"`, the attribute name
    `"timestamp"` and `time_unit` as the single positional argument.

    Arguments:
        time_unit: Requested unit of the result ("ns", "us" or "ms");
            defaults to "us".

    Returns:
        Whatever `reuse_series_namespace_implementation` returns
        (annotated as `ArrowExpr`).
    """
    # NOTE(review): presumably the helper applies the series-level
    # `dt.timestamp` to every series the expression yields - confirm
    # against the helper's definition, which is not visible here.
    return reuse_series_namespace_implementation(
        self._expr, "dt", "timestamp", time_unit
    )

def date(self: Self) -> ArrowExpr:
    """Expression-level entry point for `dt.date`.

    Calls `reuse_series_namespace_implementation` with the wrapped
    expression, the namespace name `"dt"` and the attribute name `"date"`;
    no extra arguments are forwarded.

    Returns:
        Whatever `reuse_series_namespace_implementation` returns
        (annotated as `ArrowExpr`).
    """
    return reuse_series_namespace_implementation(self._expr, "dt", "date")

Expand Down
53 changes: 53 additions & 0 deletions narwhals/_arrow/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -780,6 +780,59 @@ def convert_time_zone(self: Self, time_zone: str) -> ArrowSeries:

return self._arrow_series._from_native_series(result)

def timestamp(self: Self, time_unit: Literal["ns", "us", "ms"] = "us") -> ArrowSeries:
    """Convert a Date or Datetime series into integer timestamps.

    Datetime input is cast to int64 and rescaled from its own time unit to
    `time_unit`: scaling up uses `pc.multiply`, scaling down uses
    `floordiv_compat`, and equal units return the cast values untouched.
    Date input is cast to int32, multiplied by 86400 and then by the factor
    for `time_unit`.

    Arguments:
        time_unit: Unit of the result. Any value other than "ns" or "us"
            is handled as "ms".

    Returns:
        A new series wrapping the computed values.

    Raises:
        AssertionError: If a Datetime dtype reports a time unit other than
            "ns", "us", "ms" or "s".
        TypeError: If the series is neither Date nor Datetime.
    """
    import pyarrow as pa  # ignore-banned-import
    import pyarrow.compute as pc  # ignore-banned-import

    native = self._arrow_series._native_series
    dtype = self._arrow_series.dtype
    # Power of ten that turns seconds into the requested unit.
    target_exp = {"ns": 9, "us": 6}.get(time_unit, 3)

    if dtype == self._arrow_series._dtypes.Datetime:
        unit = dtype.time_unit  # type: ignore[attr-defined]
        as_int = native.cast(pa.int64())
        source_exp = {"ns": 9, "us": 6, "ms": 3, "s": 0}.get(unit)
        if source_exp is None:
            msg = f"unexpected time unit {unit}, please report an issue at https://github.com/narwhals-dev/narwhals"
            raise AssertionError(msg)
        shift = target_exp - source_exp
        if shift > 0:
            # Going to a finer unit: multiply.
            result = pc.multiply(as_int, 10**shift)
        elif shift < 0:
            # Going to a coarser unit: floor-divide.
            result = floordiv_compat(as_int, 10 ** (-shift))
        else:
            result = as_int
    elif dtype == self._arrow_series._dtypes.Date:
        seconds = pc.multiply(native.cast(pa.int32()), 86400)
        result = pc.multiply(seconds, 10**target_exp)
    else:
        msg = "Input should be either of Date or Datetime type"
        raise TypeError(msg)
    return self._arrow_series._from_native_series(result)

def date(self: Self) -> ArrowSeries:
import pyarrow as pa # ignore-banned-import()

Expand Down
32 changes: 32 additions & 0 deletions narwhals/_dask/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
from narwhals._dask.utils import add_row_index
from narwhals._dask.utils import maybe_evaluate
from narwhals._dask.utils import narwhals_to_native_dtype
from narwhals._pandas_like.utils import calculate_timestamp_date
from narwhals._pandas_like.utils import calculate_timestamp_datetime
from narwhals._pandas_like.utils import native_to_narwhals_dtype
from narwhals.utils import generate_unique_token

Expand Down Expand Up @@ -953,6 +955,36 @@ def func(s: dask_expr.Series, time_zone: str) -> dask_expr.Series:
returns_scalar=False,
)

def timestamp(self, time_unit: Literal["ns", "us", "ms"] = "us") -> DaskExpr:
    """Build an expression returning timestamps in `time_unit`.

    The wrapped function converts a Date or Datetime series to integers,
    rescales them with `calculate_timestamp_date` /
    `calculate_timestamp_datetime`, and blanks out the positions that were
    null in the input via `result.where(~mask_na)`.

    Arguments:
        time_unit: Unit of the result; forwarded unchanged to the
            `calculate_timestamp_*` helpers.

    Returns:
        The expression produced by `self._expr._from_call`.

    Raises:
        TypeError: Raised by the wrapped function when the series is
            neither Date nor Datetime.
    """

    def func(
        s: dask_expr.Series, time_unit: Literal["ns", "us", "ms"] = "us"
    ) -> dask_expr.Series:
        dtype = native_to_narwhals_dtype(s, self._expr._dtypes)
        # Detect a pyarrow-backed column from the *native* dtype, exactly as
        # the pandas-like implementation does. The previous check looked at
        # the string form of the Narwhals dtype instead, so the pyarrow
        # integer cast below was presumably never selected.
        is_pyarrow_dtype = "pyarrow" in str(s.dtype)
        # Remember the nulls before casting so they can be restored at the end.
        mask_na = s.isna()
        if dtype == self._expr._dtypes.Date:
            s_cast = s.astype("Int32[pyarrow]") * 86_400
            result = calculate_timestamp_date(s_cast, time_unit)
        elif dtype == self._expr._dtypes.Datetime:
            original_time_unit = dtype.time_unit  # type: ignore[attr-defined]
            s_cast = (
                s.astype("Int64[pyarrow]") if is_pyarrow_dtype else s.astype("int64")
            )
            result = calculate_timestamp_datetime(
                s_cast, original_time_unit, time_unit
            )
        else:
            msg = "Input should be either of Date or Datetime type"
            raise TypeError(msg)
        return result.where(~mask_na)

    # NOTE(review): the label passed here is "datetime" although the method
    # is `timestamp` - kept as-is; confirm whether `_from_call` expects the
    # method name.
    return self._expr._from_call(
        func,
        "datetime",
        time_unit,
        returns_scalar=False,
    )

def total_minutes(self) -> DaskExpr:
return self._expr._from_call(
lambda _input: _input.dt.total_seconds() // 60,
Expand Down
5 changes: 5 additions & 0 deletions narwhals/_pandas_like/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -582,6 +582,11 @@ def convert_time_zone(self, time_zone: str) -> PandasLikeExpr:
self._expr, "dt", "convert_time_zone", time_zone
)

def timestamp(self, time_unit: Literal["ns", "us", "ms"] = "us") -> PandasLikeExpr:
    """Expression-level entry point for `dt.timestamp`.

    Hands the wrapped expression to `reuse_series_namespace_implementation`
    together with the namespace name `"dt"`, the attribute name
    `"timestamp"` and `time_unit` as the single positional argument.

    Arguments:
        time_unit: Requested unit of the result ("ns", "us" or "ms");
            defaults to "us".

    Returns:
        Whatever `reuse_series_namespace_implementation` returns
        (annotated as `PandasLikeExpr`).
    """
    return reuse_series_namespace_implementation(
        self._expr, "dt", "timestamp", time_unit
    )


class PandasLikeExprNameNamespace:
def __init__(self: Self, expr: PandasLikeExpr) -> None:
Expand Down
20 changes: 20 additions & 0 deletions narwhals/_pandas_like/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
from typing import Sequence
from typing import overload

from narwhals._pandas_like.utils import calculate_timestamp_date
from narwhals._pandas_like.utils import calculate_timestamp_datetime
from narwhals._pandas_like.utils import int_dtype_mapper
from narwhals._pandas_like.utils import narwhals_to_native_dtype
from narwhals._pandas_like.utils import native_series_from_iterable
Expand Down Expand Up @@ -944,3 +946,21 @@ def convert_time_zone(self, time_zone: str) -> PandasLikeSeries:
else:
result = self._pandas_series._native_series.dt.tz_convert(time_zone)
return self._pandas_series._from_native_series(result)

def timestamp(self, time_unit: Literal["ns", "us", "ms"] = "us") -> PandasLikeSeries:
    """Convert a Date or Datetime series into timestamps in `time_unit`.

    Date input is cast to "Int32[pyarrow]" and multiplied by 86_400 before
    being scaled by `calculate_timestamp_date`. Datetime input is cast to
    "Int64[pyarrow]" when the native dtype string mentions "pyarrow" and to
    "int64" otherwise, then rescaled by `calculate_timestamp_datetime`.
    Positions that were null in the input are set to `None` in the result.

    Arguments:
        time_unit: Unit of the result; forwarded unchanged to the
            `calculate_timestamp_*` helpers.

    Returns:
        A new series wrapping the computed values.

    Raises:
        TypeError: If the series is neither Date nor Datetime.
    """
    native = self._pandas_series._native_series
    dtype = self._pandas_series.dtype
    backed_by_pyarrow = "pyarrow" in str(native.dtype)
    # Capture nulls up front; the integer casts below do not keep them.
    null_mask = native.isna()

    if dtype == self._pandas_series._dtypes.Date:
        seconds = native.astype("Int32[pyarrow]") * 86_400
        result = calculate_timestamp_date(seconds, time_unit)
    elif dtype == self._pandas_series._dtypes.Datetime:
        source_unit = dtype.time_unit  # type: ignore[attr-defined]
        if backed_by_pyarrow:
            as_int = native.astype("Int64[pyarrow]")
        else:
            as_int = native.astype("int64")
        result = calculate_timestamp_datetime(as_int, source_unit, time_unit)
    else:
        msg = "Input should be either of Date or Datetime type"
        raise TypeError(msg)

    result[null_mask] = None
    return self._pandas_series._from_native_series(result)
47 changes: 47 additions & 0 deletions narwhals/_pandas_like/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -542,3 +542,50 @@ def convert_str_slice_to_int_slice(
stop = columns.get_loc(str_slice.stop) + 1 if str_slice.stop is not None else None
step = str_slice.step
return (start, stop, step)


def calculate_timestamp_datetime(
    s_cast: int, original_time_unit: str, time_unit: str
) -> Any:
    """Rescale integer timestamps from one time unit to another.

    Works on anything supporting `*` and `//` with an int; the callers in
    this project pass integer-cast series.

    Arguments:
        s_cast: Timestamp value(s) expressed in `original_time_unit`.
        original_time_unit: Unit of the input: "ns", "us", "ms" or "s".
        time_unit: Unit of the result. Any value other than "ns" or "us"
            is handled as "ms".

    Returns:
        `s_cast` itself when both units match, `s_cast` multiplied by a
        power of ten when the target unit is finer, or floor-divided by one
        when it is coarser.

    Raises:
        AssertionError: If `original_time_unit` is not one of the four
            supported units.
    """
    # Exponent of ten that converts seconds into each unit.
    source_exp = {"ns": 9, "us": 6, "ms": 3, "s": 0}.get(original_time_unit)
    if source_exp is None:
        msg = f"unexpected time unit {original_time_unit}, please report a bug at https://github.com/narwhals-dev/narwhals"
        raise AssertionError(msg)

    target_exp = {"ns": 9, "us": 6}.get(time_unit, 3)
    shift = target_exp - source_exp
    if shift == 0:
        return s_cast
    if shift > 0:
        return s_cast * 10**shift
    return s_cast // 10 ** (-shift)


def calculate_timestamp_date(s_cast: int, time_unit: str) -> Any:
    """Scale a value expressed in seconds to the requested time unit.

    Arguments:
        s_cast: Value(s) in seconds; the callers in this project pass a
            date series already multiplied by 86_400.
        time_unit: Unit of the result. Any value other than "ns" or "us"
            is handled as "ms".

    Returns:
        `s_cast` multiplied by 1_000_000_000 ("ns"), 1_000_000 ("us") or
        1_000 (everything else).
    """
    factor = {"ns": 1_000_000_000, "us": 1_000_000}.get(time_unit, 1_000)
    return s_cast * factor
66 changes: 66 additions & 0 deletions narwhals/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -3621,6 +3621,72 @@ def convert_time_zone(self, time_zone: str) -> Expr:
lambda plx: self._expr._call(plx).dt.convert_time_zone(time_zone)
)

def timestamp(self, time_unit: Literal["ns", "us", "ms"] = "us") -> Expr:
    """
    Return a timestamp in the given time unit.

    Arguments:
        time_unit: {'ns', 'us', 'ms'}
            Time unit.

    Raises:
        ValueError: If `time_unit` is not one of 'ns', 'us' or 'ms'.

    Examples:
        >>> from datetime import date
        >>> import narwhals as nw
        >>> import pandas as pd
        >>> import polars as pl
        >>> import pyarrow as pa
        >>> data = {"date": [date(2001, 1, 1), None, date(2001, 1, 3)]}
        >>> df_pd = pd.DataFrame(data, dtype="datetime64[ns]")
        >>> df_pl = pl.DataFrame(data)
        >>> df_pa = pa.table(data)

        Let's define a dataframe-agnostic function:

        >>> @nw.narwhalify
        ... def func(df):
        ...     return df.with_columns(
        ...         nw.col("date").dt.timestamp().alias("timestamp_us"),
        ...         nw.col("date").dt.timestamp("ms").alias("timestamp_ms"),
        ...     )

        We can then pass pandas / PyArrow / Polars / any other supported library:

        >>> func(df_pd)
                date  timestamp_us  timestamp_ms
        0 2001-01-01  9.783072e+14  9.783072e+11
        1        NaT           NaN           NaN
        2 2001-01-03  9.784800e+14  9.784800e+11
        >>> func(df_pl)
        shape: (3, 3)
        ┌────────────┬─────────────────┬──────────────┐
        │ date       ┆ timestamp_us    ┆ timestamp_ms │
        │ ---        ┆ ---             ┆ ---          │
        │ date       ┆ i64             ┆ i64          │
        ╞════════════╪═════════════════╪══════════════╡
        │ 2001-01-01 ┆ 978307200000000 ┆ 978307200000 │
        │ null       ┆ null            ┆ null         │
        │ 2001-01-03 ┆ 978480000000000 ┆ 978480000000 │
        └────────────┴─────────────────┴──────────────┘
        >>> func(df_pa)
        pyarrow.Table
        date: date32[day]
        timestamp_us: int64
        timestamp_ms: int64
        ----
        date: [[2001-01-01,null,2001-01-03]]
        timestamp_us: [[978307200000000,null,978480000000000]]
        timestamp_ms: [[978307200000,null,978480000000]]
    """
    supported_units = {"ns", "us", "ms"}
    if time_unit in supported_units:
        # Defer to the backend's own `dt.timestamp` once the unit is known
        # to be valid.
        return self._expr.__class__(
            lambda plx: self._expr._call(plx).dt.timestamp(time_unit)
        )
    msg = f"invalid `time_unit`\n\nExpected one of {{'ns', 'us', 'ms'}}, got {time_unit!r}."
    raise ValueError(msg)


class ExprNameNamespace:
def __init__(self: Self, expr: Expr) -> None:
Expand Down
Loading
Loading