From 487a52563c63b0deb92194c9576edcaf7facf53b Mon Sep 17 00:00:00 2001 From: raisadz <34237447+raisadz@users.noreply.github.com> Date: Thu, 17 Oct 2024 10:02:48 +0100 Subject: [PATCH 01/25] add timestamp method --- docs/api-reference/expr_dt.md | 19 ++++++++------- docs/api-reference/series_dt.md | 19 ++++++++------- narwhals/_arrow/expr.py | 5 ++++ narwhals/_arrow/series.py | 43 +++++++++++++++++++++++++++++++++ narwhals/_dask/expr.py | 24 ++++++++++++++++++ narwhals/_pandas_like/expr.py | 5 ++++ narwhals/_pandas_like/series.py | 15 ++++++++++++ narwhals/expr.py | 20 +++++++++++++++ narwhals/series.py | 20 +++++++++++++++ 9 files changed, 152 insertions(+), 18 deletions(-) diff --git a/docs/api-reference/expr_dt.md b/docs/api-reference/expr_dt.md index 5c9ab41f3..604ac4abf 100644 --- a/docs/api-reference/expr_dt.md +++ b/docs/api-reference/expr_dt.md @@ -6,22 +6,23 @@ members: - convert_time_zone - date - - year - - month - day - - ordinal_day - hour - - minute - - second - - millisecond - microsecond + - millisecond + - minute + - month - nanosecond + - ordinal_day - replace_time_zone - - total_minutes - - total_seconds - - total_milliseconds + - second + - timestamp - total_microseconds + - total_milliseconds + - total_minutes - total_nanoseconds + - total_seconds - to_string + - year show_source: false show_bases: false diff --git a/docs/api-reference/series_dt.md b/docs/api-reference/series_dt.md index c92592411..23d4817cb 100644 --- a/docs/api-reference/series_dt.md +++ b/docs/api-reference/series_dt.md @@ -6,22 +6,23 @@ members: - convert_time_zone - date - - year - - month - day - - ordinal_day - hour - - minute - - second - - millisecond - microsecond + - millisecond + - minute + - month - nanosecond + - ordinal_day - replace_time_zone - - total_minutes - - total_seconds - - total_milliseconds + - second + - timestamp - total_microseconds + - total_milliseconds + - total_minutes - total_nanoseconds + - total_seconds - to_string + - year show_source: false show_bases: false diff --git a/narwhals/_arrow/expr.py b/narwhals/_arrow/expr.py index 55c529d30..c9ee160e4 100644 --- a/narwhals/_arrow/expr.py +++ b/narwhals/_arrow/expr.py @@ -420,6 +420,11 @@ def convert_time_zone(self: Self, time_zone: str) -> ArrowExpr: self._expr, "dt", "convert_time_zone", time_zone ) + def timestamp(self: Self, time_unit: Literal["ns", "us", "ms"] = "us") -> ArrowExpr: + return reuse_series_namespace_implementation( + self._expr, "dt", "timestamp", time_unit + ) + def date(self: Self) -> ArrowExpr: return reuse_series_namespace_implementation(self._expr, "dt", "date") diff --git a/narwhals/_arrow/series.py b/narwhals/_arrow/series.py index 65a393ca9..a21acac1c 100644 --- a/narwhals/_arrow/series.py +++ b/narwhals/_arrow/series.py @@ -780,6 +780,49 @@ def convert_time_zone(self: Self, time_zone: str) -> ArrowSeries: return self._arrow_series._from_native_series(result) + def timestamp(self: Self, time_unit: Literal["ns", "us", "ms"] = "us") -> ArrowSeries: + import pyarrow as pa # ignore-banned-import + import pyarrow.compute as pc # ignore-banned-import + + s = self._arrow_series._native_series + dtype = s.type + if isinstance(dtype, pa.TimestampType): + unit = dtype.unit + s_cast = s.cast(pa.int64()) + if unit == "ns": + if time_unit == "ns": + result = s_cast + if time_unit == "us": + result = pc.divide(s_cast, 1_000) + if time_unit == "ms": + result = pc.divide(s_cast, 1_000_000) + if unit == "us": + if time_unit == "ns": + result = pc.multiply(s_cast, 1_000) + if time_unit == "us": + result = s_cast + if time_unit == "ms": + result = pc.divide(s_cast, 1_000) + if unit == "ms": + if time_unit == "ns": + result = pc.multiply(s_cast, 1_000_000) + if time_unit == "us": + result = pc.multiply(s_cast, 1_000) + if time_unit == "ms": + result = s_cast + elif dtype == pa.date32(): + time_s = pc.multiply(s.cast(pa.int32()), 86400) + if time_unit == "ns": + result = pc.multiply(time_s, 1_000_000_000) + if time_unit == "us": + result = pc.multiply(time_s, 1_000_000) + if time_unit == "ms": + result = pc.multiply(time_s, 1_000) + else: + msg = "Input should be either of TimeStamp or Date type" + raise TypeError(msg) + return self._arrow_series._from_native_series(result) + def date(self: Self) -> ArrowSeries: import pyarrow as pa # ignore-banned-import() diff --git a/narwhals/_dask/expr.py b/narwhals/_dask/expr.py index 693fcad5e..0326657b3 100644 --- a/narwhals/_dask/expr.py +++ b/narwhals/_dask/expr.py @@ -953,6 +953,30 @@ def func(s: dask_expr.Series, time_zone: str) -> dask_expr.Series: returns_scalar=False, ) + def timestamp(self, time_unit: Literal["ns", "us", "ms"] = "us") -> DaskExpr: + def func( + s: dask_expr.Series, time_unit: Literal["ns", "us", "ms"] = "us" + ) -> dask_expr.Series: + import numpy as np # ignore-banned-import + + mask_na = s.isna() + time_ns = s.astype(np.int64) + time_ns[mask_na] = None + if time_unit == "ns": + result = time_ns + if time_unit == "us": + result = time_ns / 1_000 + if time_unit == "ms": + result = time_ns / 1_000_000 + return result + + return self._expr._from_call( + func, + "datetime", + time_unit, + returns_scalar=False, + ) + def total_minutes(self) -> DaskExpr: return self._expr._from_call( lambda _input: _input.dt.total_seconds() // 60, diff --git a/narwhals/_pandas_like/expr.py b/narwhals/_pandas_like/expr.py index 07ba3e56d..a4082235f 100644 --- a/narwhals/_pandas_like/expr.py +++ b/narwhals/_pandas_like/expr.py @@ -582,6 +582,11 @@ def convert_time_zone(self, time_zone: str) -> PandasLikeExpr: self._expr, "dt", "convert_time_zone", time_zone ) + def timestamp(self, time_unit: Literal["ns", "us", "ms"] = "us") -> PandasLikeExpr: + return reuse_series_namespace_implementation( + self._expr, "dt", "timestamp", time_unit + ) + class PandasLikeExprNameNamespace: def __init__(self: Self, expr: PandasLikeExpr) -> None: diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py index 2fe53b22a..5cc3639b1 100644 --- a/narwhals/_pandas_like/series.py +++ b/narwhals/_pandas_like/series.py @@ -946,3 +946,18 @@ def convert_time_zone(self, time_zone: str) -> PandasLikeSeries: else: result = self._pandas_series._native_series.dt.tz_convert(time_zone) return self._pandas_series._from_native_series(result) + + def timestamp(self, time_unit: Literal["ns", "us", "ms"] = "us") -> PandasLikeSeries: + import numpy as np # ignore-banned-import + + s = self._pandas_series._native_series + mask_na = s.isna() + time_ns = s.astype(np.int64) + time_ns[mask_na] = None + if time_unit == "ns": + result = time_ns + if time_unit == "us": + result = time_ns / 1_000 + if time_unit == "ms": + result = time_ns / 1_000_000 + return self._pandas_series._from_native_series(result) diff --git a/narwhals/expr.py b/narwhals/expr.py index 6eedbafa4..b8ebb40b7 100644 --- a/narwhals/expr.py +++ b/narwhals/expr.py @@ -3612,6 +3612,26 @@ def convert_time_zone(self, time_zone: str) -> Expr: lambda plx: self._expr._call(plx).dt.convert_time_zone(time_zone) ) + def timestamp(self, time_unit: Literal["ns", "us", "ms"] = "us") -> Expr: + """ + Return a timestamp in the given time unit. + + Arguments: + time_unit: {'ns', 'us', 'ms'} + Time unit. + + Examples: + """ + if time_unit not in {"ns", "us", "ms"}: + msg = ( + "invalid `time_unit`" + f"\n\nExpected one of {{'ns', 'us', 'ms'}}, got {time_unit!r}." + ) + raise ValueError(msg) + return self._expr.__class__( + lambda plx: self._expr._call(plx).dt.timestamp(time_unit) + ) + class ExprNameNamespace: def __init__(self: Self, expr: Expr) -> None: diff --git a/narwhals/series.py b/narwhals/series.py index 36ecf50ff..d9141cf3a 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -4001,3 +4001,23 @@ def convert_time_zone(self, time_zone: str) -> Series: return self._narwhals_series._from_compliant_series( self._narwhals_series._compliant_series.dt.convert_time_zone(time_zone) ) + + def timestamp(self, time_unit: Literal["ns", "us", "ms"] = "us") -> Series: + """ + Return a timestamp in the given time unit. + + Arguments: + time_unit: {'ns', 'us', 'ms'} + Time unit. + + Examples: + """ + if time_unit not in {"ns", "us", "ms"}: + msg = ( + "invalid `time_unit`" + f"\n\nExpected one of {{'ns', 'us', 'ms'}}, got {time_unit!r}." + ) + raise ValueError(msg) + return self._narwhals_series._from_compliant_series( + self._narwhals_series._compliant_series.dt.timestamp(time_unit) + ) From b1e568c8cb8b68b4ce01bb018c6e2507fc9c336e Mon Sep 17 00:00:00 2001 From: raisadz <34237447+raisadz@users.noreply.github.com> Date: Thu, 17 Oct 2024 10:21:35 +0100 Subject: [PATCH 02/25] add docstring example for series --- narwhals/series.py | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/narwhals/series.py b/narwhals/series.py index d9141cf3a..6e6381a85 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -4011,6 +4011,46 @@ def timestamp(self, time_unit: Literal["ns", "us", "ms"] = "us") -> Series: Time unit. Examples: + >>> from datetime import date + >>> import narwhals as nw + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> data = [date(2001, 1, 1), None, date(2001, 1, 3)] + >>> s_pd = pd.Series(data, dtype="datetime64[ns]") + >>> s_pl = pl.Series(data) + >>> s_pa = pa.chunked_array([data]) + + Let's define a dataframe-agnostic function: + + >>> @nw.narwhalify + ... def func(s): + ... return s.dt.timestamp() + + We can then pass pandas / PyArrow / Polars / any other supported library: + + >>> func(s_pd) + 0 9.783072e+14 + 1 NaN + 2 9.784800e+14 + dtype: float64 + >>> func(s_pl) # doctest: +NORMALIZE_WHITESPACE + shape: (3,) + Series: '' [i64] + [ + 978307200000000 + null + 978480000000000 + ] + >>> func(s_pa) + + [ + [ + 978307200000000, + null, + 978480000000000 + ] + ] """ if time_unit not in {"ns", "us", "ms"}: msg = ( From 5ecbed91652d4843a91599c68ea2ddfed64fe9b5 Mon Sep 17 00:00:00 2001 From: raisadz <34237447+raisadz@users.noreply.github.com> Date: Thu, 17 Oct 2024 11:13:35 +0100 Subject: [PATCH 03/25] add docstring example for expr --- narwhals/expr.py | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/narwhals/expr.py b/narwhals/expr.py index b8ebb40b7..b5c589f08 100644 --- a/narwhals/expr.py +++ b/narwhals/expr.py @@ -3621,6 +3621,45 @@ def timestamp(self, time_unit: Literal["ns", "us", "ms"] = "us") -> Expr: Time unit. Examples: + >>> from datetime import date + >>> import narwhals as nw + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> data = {"a": [date(2001, 1, 1), None, date(2001, 1, 3)]} + >>> df_pd = pd.DataFrame(data, dtype="datetime64[ns]") + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) + + Let's define a dataframe-agnostic function: + + >>> @nw.narwhalify + ... def func(df): + ... return df.select(nw.col("a").dt.timestamp()) + + We can then pass pandas / PyArrow / Polars / any other supported library: + + >>> func(df_pd) + a + 0 9.783072e+14 + 1 NaN + 2 9.784800e+14 + >>> func(df_pl) + shape: (3, 1) + ┌─────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════════════════╡ + │ 978307200000000 │ + │ null │ + │ 978480000000000 │ + └─────────────────┘ + >>> func(df_pa) + pyarrow.Table + a: int64 + ---- + a: [[978307200000000,null,978480000000000]] """ if time_unit not in {"ns", "us", "ms"}: msg = ( From 7ed770d3806843455744cd39389a6e4262dd87e5 Mon Sep 17 00:00:00 2001 From: raisadz <34237447+raisadz@users.noreply.github.com> Date: Thu, 17 Oct 2024 11:23:49 +0100 Subject: [PATCH 04/25] update example to use time unit --- narwhals/expr.py | 43 +++++++++++++++++++++++++------------------ narwhals/series.py | 14 +++++++------- 2 files changed, 32 insertions(+), 25 deletions(-) diff --git a/narwhals/expr.py b/narwhals/expr.py index b5c589f08..bf8a1b198 100644 --- a/narwhals/expr.py +++ b/narwhals/expr.py @@ -3626,7 +3626,7 @@ def timestamp(self, time_unit: Literal["ns", "us", "ms"] = "us") -> Expr: >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa - >>> data = {"a": [date(2001, 1, 1), None, date(2001, 1, 3)]} + >>> data = {"date": [date(2001, 1, 1), None, date(2001, 1, 3)]} >>> df_pd = pd.DataFrame(data, dtype="datetime64[ns]") >>> df_pl = pl.DataFrame(data) >>> df_pa = pa.table(data) @@ -3635,31 +3635,38 @@ def timestamp(self, time_unit: Literal["ns", "us", "ms"] = "us") -> Expr: >>> @nw.narwhalify ... def func(df): - ... return df.select(nw.col("a").dt.timestamp()) + ... return df.with_columns( + ... nw.col("date").dt.timestamp().alias("timestamp_us"), + ... nw.col("date").dt.timestamp("ms").alias("timestamp_ms"), + ... ) We can then pass pandas / PyArrow / Polars / any other supported library: >>> func(df_pd) - a - 0 9.783072e+14 - 1 NaN - 2 9.784800e+14 + date timestamp_us timestamp_ms + 0 2001-01-01 9.783072e+14 9.783072e+11 + 1 NaT NaN NaN + 2 2001-01-03 9.784800e+14 9.784800e+11 >>> func(df_pl) - shape: (3, 1) - ┌─────────────────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════════════════╡ - │ 978307200000000 │ - │ null │ - │ 978480000000000 │ - └─────────────────┘ + shape: (3, 3) + ┌────────────┬─────────────────┬──────────────┐ + │ date ┆ timestamp_us ┆ timestamp_ms │ + │ --- ┆ --- ┆ --- │ + │ date ┆ i64 ┆ i64 │ + ╞════════════╪═════════════════╪══════════════╡ + │ 2001-01-01 ┆ 978307200000000 ┆ 978307200000 │ + │ null ┆ null ┆ null │ + │ 2001-01-03 ┆ 978480000000000 ┆ 978480000000 │ + └────────────┴─────────────────┴──────────────┘ >>> func(df_pa) pyarrow.Table - a: int64 + date: date32[day] + timestamp_us: int64 + timestamp_ms: int64 ---- - a: [[978307200000000,null,978480000000000]] + date: [[2001-01-01,null,2001-01-03]] + timestamp_us: [[978307200000000,null,978480000000000]] + timestamp_ms: [[978307200000,null,978480000000]] """ if time_unit not in {"ns", "us", "ms"}: msg = ( diff --git a/narwhals/series.py b/narwhals/series.py index 6e6381a85..73bb61eb9 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -4025,30 +4025,30 @@ def timestamp(self, time_unit: Literal["ns", "us", "ms"] = "us") -> Series: >>> @nw.narwhalify ... def func(s): - ... return s.dt.timestamp() + ... return s.dt.timestamp("ms") We can then pass pandas / PyArrow / Polars / any other supported library: >>> func(s_pd) - 0 9.783072e+14 + 0 9.783072e+11 1 NaN - 2 9.784800e+14 + 2 9.784800e+11 dtype: float64 >>> func(s_pl) # doctest: +NORMALIZE_WHITESPACE shape: (3,) Series: '' [i64] [ - 978307200000000 + 978307200000 null - 978480000000000 + 978480000000 ] >>> func(s_pa) [ [ - 978307200000000, + 978307200000, null, - 978480000000000 + 978480000000 ] ] """ From d3f76cb3a407c56df2660e08b496d8138b1e8cce Mon Sep 17 00:00:00 2001 From: raisadz <34237447+raisadz@users.noreply.github.com> Date: Thu, 17 Oct 2024 13:44:18 +0100 Subject: [PATCH 05/25] add tests --- tests/expr_and_series/dt/datetime_attributes_test.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/expr_and_series/dt/datetime_attributes_test.py b/tests/expr_and_series/dt/datetime_attributes_test.py index 5b9519f57..faa5ac3d0 100644 --- a/tests/expr_and_series/dt/datetime_attributes_test.py +++ b/tests/expr_and_series/dt/datetime_attributes_test.py @@ -104,3 +104,13 @@ def test_datetime_chained_attributes( result = df.select(nw.col("a").dt.date().dt.year()) compare_dicts(result, {"a": [2021, 2020]}) + + +def test_timestamp(constructor_eager: Any) -> None: + dates = {"a": [datetime(2001, 1, 1), None, datetime(2001, 1, 3)]} + df = nw.from_native(constructor_eager(dates)) + result = df.select(nw.col("a").dt.timestamp()) + expected = {"a": [978307200000000, None, 978480000000000]} + compare_dicts(result, expected) + result = df.select(nw.col("a").cast(nw.Datetime("ms")).dt.timestamp()) + expected = {"a": [978307200, 9784800]} From 839891b3ed41f5890417c9b529f14cf7471fea43 Mon Sep 17 00:00:00 2001 From: raisadz <34237447+raisadz@users.noreply.github.com> Date: Thu, 17 Oct 2024 17:44:04 +0100 Subject: [PATCH 06/25] preserve pyarrow types, add tests --- narwhals/_dask/expr.py | 50 +++++++++++++++---- narwhals/_pandas_like/series.py | 46 +++++++++++++---- .../dt/datetime_attributes_test.py | 23 +++++++-- 3 files changed, 94 insertions(+), 25 deletions(-) diff --git a/narwhals/_dask/expr.py b/narwhals/_dask/expr.py index 0326657b3..0a92f8c7f 100644 --- a/narwhals/_dask/expr.py +++ b/narwhals/_dask/expr.py @@ -957,18 +957,46 @@ def timestamp(self, time_unit: Literal["ns", "us", "ms"] = "us") -> DaskExpr: def func( s: dask_expr.Series, time_unit: Literal["ns", "us", "ms"] = "us" ) -> dask_expr.Series: - import numpy as np # ignore-banned-import - + dtype = native_to_narwhals_dtype(s, self._expr._dtypes) + is_pyarrow_dtype = "pyarrow" in str(dtype) mask_na = s.isna() - time_ns = s.astype(np.int64) - time_ns[mask_na] = None - if time_unit == "ns": - result = time_ns - if time_unit == "us": - result = time_ns / 1_000 - if time_unit == "ms": - result = time_ns / 1_000_000 - return result + if dtype == self._expr._dtypes.Date: + result = s.astype("Int32[pyarrow]") + if dtype == self._expr._dtypes.Datetime: + original_time_unit = dtype.time_unit # type: ignore[attr-defined] + if is_pyarrow_dtype: + s_cast = s.astype("Int64[pyarrow]") + else: + s_cast = s.astype("int64") + if original_time_unit == "ns": + if time_unit == "ns": + result = s_cast + if time_unit == "us": + result = s_cast / 1_000 + if time_unit == "ms": + result = s_cast / 1_000_000 + if original_time_unit == "us": + if time_unit == "ns": + result = s_cast * 1_000 + if time_unit == "us": + result = s_cast + if time_unit == "ms": + result = s_cast / 1_000 + if original_time_unit == "ms": + if time_unit == "ns": + result = s_cast * 1_000_000 + if time_unit == "us": + result = s_cast * 1_000 + if time_unit == "ms": + result = s_cast + if original_time_unit == "s": + if time_unit == "ns": + result = s_cast * 1_000_000_000 + if time_unit == "us": + result = s_cast * 1_000_000 + if time_unit == "ms": + result = s_cast * 1_000 + return result.where(~mask_na) return self._expr._from_call( func, diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py index 4f4c6f241..95ea73b98 100644 --- a/narwhals/_pandas_like/series.py +++ b/narwhals/_pandas_like/series.py @@ -946,16 +946,42 @@ def convert_time_zone(self, time_zone: str) -> PandasLikeSeries: return self._pandas_series._from_native_series(result) def timestamp(self, time_unit: Literal["ns", "us", "ms"] = "us") -> PandasLikeSeries: - import numpy as np # ignore-banned-import - s = self._pandas_series._native_series + dtype = self._pandas_series.dtype + is_pyarrow_dtype = "pyarrow" in str(self._pandas_series._native_series.dtype) mask_na = s.isna() - time_ns = s.astype(np.int64) - time_ns[mask_na] = None - if time_unit == "ns": - result = time_ns - if time_unit == "us": - result = time_ns / 1_000 - if time_unit == "ms": - result = time_ns / 1_000_000 + if dtype == self._pandas_series._dtypes.Date: + result = s.astype("Int32[pyarrow]") + if dtype == self._pandas_series._dtypes.Datetime: + original_time_unit = dtype.time_unit # type: ignore[attr-defined] + s_cast = s.astype("Int64[pyarrow]") if is_pyarrow_dtype else s.astype("int64") + if original_time_unit == "ns": + if time_unit == "ns": + result = s_cast + if time_unit == "us": + result = s_cast / 1_000 + if time_unit == "ms": + result = s_cast / 1_000_000 + if original_time_unit == "us": + if time_unit == "ns": + result = s_cast * 1_000 + if time_unit == "us": + result = s_cast + if time_unit == "ms": + result = s_cast / 1_000 + if original_time_unit == "ms": + if time_unit == "ns": + result = s_cast * 1_000_000 + if time_unit == "us": + result = s_cast * 1_000 + if time_unit == "ms": + result = s_cast + if original_time_unit == "s": + if time_unit == "ns": + result = s_cast * 1_000_000_000 + if time_unit == "us": + result = s_cast * 1_000_000 + if time_unit == "ms": + result = s_cast * 1_000 + result[mask_na] = None return self._pandas_series._from_native_series(result) diff --git a/tests/expr_and_series/dt/datetime_attributes_test.py b/tests/expr_and_series/dt/datetime_attributes_test.py index 8f8dc2db7..a5432cae6 100644 --- a/tests/expr_and_series/dt/datetime_attributes_test.py +++ b/tests/expr_and_series/dt/datetime_attributes_test.py @@ -106,11 +106,26 @@ def test_datetime_chained_attributes( compare_dicts(result, {"a": [2021, 2020]}) -def test_timestamp(constructor_eager: Any) -> None: - dates = {"a": [datetime(2001, 1, 1), None, datetime(2001, 1, 3)]} - df = nw.from_native(constructor_eager(dates)) +def test_timestamp_datetimes(constructor: Constructor) -> None: + datetimes = {"a": [datetime(2001, 1, 1), None, datetime(2001, 1, 3)]} + df = nw.from_native(constructor(datetimes)) result = df.select(nw.col("a").dt.timestamp()) expected = {"a": [978307200000000, None, 978480000000000]} compare_dicts(result, expected) result = df.select(nw.col("a").cast(nw.Datetime("ms")).dt.timestamp()) - expected = {"a": [978307200, 9784800]} + compare_dicts(result, expected) + + +def test_timestamp_dates( + request: pytest.FixtureRequest, constructor: Constructor +) -> None: + if any( + x in str(constructor) + for x in ("pandas_constructor", "pandas_nullable_constructor", "cudf") + ): + request.applymarker(pytest.mark.xfail) + dates = {"a": [date(2001, 1, 1), None, date(2001, 1, 3)]} + df = nw.from_native(constructor(dates)) + result = df.select(nw.col("a").dt.timestamp()) + expected = {"a": [978307200000000, None, 978480000000000]} + compare_dicts(result, expected) From 485c1e1f8953cf4ffd5c1efb90d844aa2fc42f53 Mon Sep 17 00:00:00 2001 From: raisadz <34237447+raisadz@users.noreply.github.com> Date: Fri, 18 Oct 2024 08:54:40 +0100 Subject: [PATCH 07/25] fix dtype comparisons, add test for dates --- narwhals/_arrow/series.py | 8 ++++---- narwhals/_dask/expr.py | 13 +++++++++++-- narwhals/_pandas_like/series.py | 15 ++++++++++++--- .../dt/datetime_attributes_test.py | 6 +++--- 4 files changed, 30 insertions(+), 12 deletions(-) diff --git a/narwhals/_arrow/series.py b/narwhals/_arrow/series.py index a21acac1c..b62ec3dbc 100644 --- a/narwhals/_arrow/series.py +++ b/narwhals/_arrow/series.py @@ -785,9 +785,9 @@ def timestamp(self: Self, time_unit: Literal["ns", "us", "ms"] = "us") -> ArrowS import pyarrow.compute as pc # ignore-banned-import s = self._arrow_series._native_series - dtype = s.type - if isinstance(dtype, pa.TimestampType): - unit = dtype.unit + dtype = self._arrow_series.dtype + if dtype == self._arrow_series._dtypes.Datetime: + unit = dtype.time_unit # type: ignore[attr-defined] s_cast = s.cast(pa.int64()) if unit == "ns": if time_unit == "ns": @@ -810,7 +810,7 @@ def timestamp(self: Self, time_unit: Literal["ns", "us", "ms"] = "us") -> ArrowS result = pc.multiply(s_cast, 1_000) if time_unit == "ms": result = s_cast - elif dtype == pa.date32(): + elif dtype == self._arrow_series._dtypes.Date: time_s = pc.multiply(s.cast(pa.int32()), 86400) if time_unit == "ns": result = pc.multiply(time_s, 1_000_000_000) diff --git a/narwhals/_dask/expr.py b/narwhals/_dask/expr.py index 0a92f8c7f..8e01fb045 100644 --- a/narwhals/_dask/expr.py +++ b/narwhals/_dask/expr.py @@ -961,8 +961,14 @@ def func( is_pyarrow_dtype = "pyarrow" in str(dtype) mask_na = s.isna() if dtype == self._expr._dtypes.Date: - result = s.astype("Int32[pyarrow]") - if dtype == self._expr._dtypes.Datetime: + s_cast = s.astype("Int32[pyarrow]") * 86_400 + if time_unit == "ns": + result = s_cast * 1_000_000_000 + if time_unit == "us": + result = s_cast * 1_000_000 + if time_unit == "ms": + result = s_cast * 1_000 + elif dtype == self._expr._dtypes.Datetime: original_time_unit = dtype.time_unit # type: ignore[attr-defined] if is_pyarrow_dtype: s_cast = s.astype("Int64[pyarrow]") @@ -996,6 +1002,9 @@ def func( result = s_cast * 1_000_000 if time_unit == "ms": result = s_cast * 1_000 + else: + msg = "Input should be either of Date or Datetime type" + raise TypeError(msg) return result.where(~mask_na) return self._expr._from_call( diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py index 95ea73b98..654dd3862 100644 --- a/narwhals/_pandas_like/series.py +++ b/narwhals/_pandas_like/series.py @@ -951,8 +951,14 @@ def timestamp(self, time_unit: Literal["ns", "us", "ms"] = "us") -> PandasLikeSe is_pyarrow_dtype = "pyarrow" in str(self._pandas_series._native_series.dtype) mask_na = s.isna() if dtype == self._pandas_series._dtypes.Date: - result = s.astype("Int32[pyarrow]") - if dtype == self._pandas_series._dtypes.Datetime: + s_cast = s.astype("Int32[pyarrow]") * 86_400 + if time_unit == "ns": + result = s_cast * 1_000_000_000 + if time_unit == "us": + result = s_cast * 1_000_000 + if time_unit == "ms": + result = s_cast * 1_000 + elif dtype == self._pandas_series._dtypes.Datetime: original_time_unit = dtype.time_unit # type: ignore[attr-defined] s_cast = s.astype("Int64[pyarrow]") if is_pyarrow_dtype else s.astype("int64") if original_time_unit == "ns": @@ -983,5 +989,8 @@ def timestamp(self, time_unit: Literal["ns", "us", "ms"] = "us") -> PandasLikeSe result = s_cast * 1_000_000 if time_unit == "ms": result = s_cast * 1_000 - result[mask_na] = None + else: + msg = "Input should be either of Date or Datetime type" + raise TypeError(msg) + result[mask_na] = None return self._pandas_series._from_native_series(result) diff --git a/tests/expr_and_series/dt/datetime_attributes_test.py b/tests/expr_and_series/dt/datetime_attributes_test.py index a5432cae6..3fc870643 100644 --- a/tests/expr_and_series/dt/datetime_attributes_test.py +++ b/tests/expr_and_series/dt/datetime_attributes_test.py @@ -121,11 +121,11 @@ def test_timestamp_dates( ) -> None: if any( x in str(constructor) - for x in ("pandas_constructor", "pandas_nullable_constructor", "cudf") + for x in ("pandas_constructor", "pandas_nullable_constructor", "cudf", "dask") ): request.applymarker(pytest.mark.xfail) - dates = {"a": [date(2001, 1, 1), None, date(2001, 1, 3)]} + dates = {"a": [datetime(2001, 1, 1), None, datetime(2001, 1, 3)]} df = nw.from_native(constructor(dates)) - result = df.select(nw.col("a").dt.timestamp()) + result = df.select(nw.col("a").dt.date().dt.timestamp()) expected = {"a": [978307200000000, None, 978480000000000]} compare_dicts(result, expected) From bf909ad3e34e653e58dd7c9e44d43fed3a5adc3b Mon Sep 17 00:00:00 2001 From: raisadz <34237447+raisadz@users.noreply.github.com> Date: Fri, 18 Oct 2024 09:30:54 +0100 Subject: [PATCH 08/25] add parametrization --- .../dt/datetime_attributes_test.py | 24 ++++++++++++++----- 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/tests/expr_and_series/dt/datetime_attributes_test.py b/tests/expr_and_series/dt/datetime_attributes_test.py index 3fc870643..81c0fa18b 100644 --- a/tests/expr_and_series/dt/datetime_attributes_test.py +++ b/tests/expr_and_series/dt/datetime_attributes_test.py @@ -2,6 +2,7 @@ from datetime import date from datetime import datetime +from typing import Literal import pytest @@ -106,14 +107,25 @@ def test_datetime_chained_attributes( compare_dicts(result, {"a": [2021, 2020]}) -def test_timestamp_datetimes(constructor: Constructor) -> None: +@pytest.mark.parametrize( + ("original_time_unit", "time_unit", "expected"), + [ + ("ns", "ns", [978307200000000000, None, 978480000000000000]), + ("ns", "us", [978307200000000, None, 978480000000000]), + ], +) +def test_timestamp_datetimes( + constructor: Constructor, + original_time_unit: Literal["us", "ns", "ms", "s"], + time_unit: Literal["ns", "us", "ms"], + expected: list[int | None], +) -> None: datetimes = {"a": [datetime(2001, 1, 1), None, datetime(2001, 1, 3)]} df = nw.from_native(constructor(datetimes)) - result = df.select(nw.col("a").dt.timestamp()) - expected = {"a": [978307200000000, None, 978480000000000]} - compare_dicts(result, expected) - result = df.select(nw.col("a").cast(nw.Datetime("ms")).dt.timestamp()) - compare_dicts(result, expected) + result = df.select( + nw.col("a").cast(nw.Datetime(original_time_unit)).dt.timestamp(time_unit) + ) + compare_dicts(result, {"a": expected}) def test_timestamp_dates( From a37e5cd7c76043df81bae18b069898402da8112e Mon Sep 17 00:00:00 2001 From: raisadz <34237447+raisadz@users.noreply.github.com> Date: Fri, 18 Oct 2024 09:58:50 +0100 Subject: [PATCH 09/25] parametrize for other time units --- narwhals/_arrow/series.py | 7 +++++++ .../expr_and_series/dt/datetime_attributes_test.py | 13 +++++++++++++ 2 files changed, 20 insertions(+) diff --git a/narwhals/_arrow/series.py b/narwhals/_arrow/series.py index 8ff8ae22c..1cf7c7b82 100644 --- a/narwhals/_arrow/series.py +++ b/narwhals/_arrow/series.py @@ -810,6 +810,13 @@ def timestamp(self: Self, time_unit: Literal["ns", "us", "ms"] = "us") -> ArrowS result = pc.multiply(s_cast, 1_000) if time_unit == "ms": result = s_cast + if unit == "s": + if time_unit == "ns": + result = pc.multiply(s_cast, 1_000_000_000) + if time_unit == "us": + result = pc.multiply(s_cast, 1_000_000) + if time_unit == "ms": + result = pc.multiply(s_cast, 1_000) elif dtype == self._arrow_series._dtypes.Date: time_s = pc.multiply(s.cast(pa.int32()), 86400) if time_unit == "ns": diff --git a/tests/expr_and_series/dt/datetime_attributes_test.py b/tests/expr_and_series/dt/datetime_attributes_test.py index 95e65e1f2..ead47a5e5 100644 --- a/tests/expr_and_series/dt/datetime_attributes_test.py +++ b/tests/expr_and_series/dt/datetime_attributes_test.py @@ -112,14 +112,27 @@ def test_datetime_chained_attributes( [ ("ns", "ns", [978307200000000000, None, 978480000000000000]), ("ns", "us", [978307200000000, None, 978480000000000]), + ("ns", "ms", [978307200000, None, 978480000000]), + ("us", "ns", [978307200000000000, None, 978480000000000000]), + ("us", "us", [978307200000000, None, 978480000000000]), + ("us", "ms", [978307200000, None, 978480000000]), + ("ms", "ns", [978307200000000000, None, 978480000000000000]), + ("ms", "us", [978307200000000, None, 978480000000000]), + ("ms", "ms", [978307200000, None, 978480000000]), + ("s", "ns", [978307200000000000, None, 978480000000000000]), + ("s", "us", [978307200000000, None, 978480000000000]), + ("s", "ms", [978307200000, None, 978480000000]), ], ) def test_timestamp_datetimes( + request: pytest.FixtureRequest, constructor: Constructor, original_time_unit: Literal["us", "ns", "ms", "s"], time_unit: Literal["ns", "us", "ms"], expected: list[int | None], ) -> None: + if original_time_unit == "s" and "polars" in str(constructor): + request.applymarker(pytest.mark.xfail) datetimes = {"a": [datetime(2001, 1, 1), None, datetime(2001, 1, 3)]} df = nw.from_native(constructor(datetimes)) result = df.select( From fee242256b44996f912ca09f3c243861d7af4b7b Mon Sep 17 00:00:00 2001 From: raisadz <34237447+raisadz@users.noreply.github.com> Date: Fri, 18 Oct 2024 14:27:38 +0100 Subject: [PATCH 10/25] move common functions to utils, add tests for invalid inputs, add dask pyarrow types to to_date test --- narwhals/_arrow/series.py | 2 +- narwhals/_dask/expr.py | 47 +++--------- narwhals/_pandas_like/series.py | 38 +--------- narwhals/_pandas_like/utils.py | 44 +++++++++++ .../dt/datetime_attributes_test.py | 73 +++++++++++++++++-- 5 files changed, 123 insertions(+), 81 deletions(-) diff --git a/narwhals/_arrow/series.py b/narwhals/_arrow/series.py index 1cf7c7b82..1a971b21d 100644 --- a/narwhals/_arrow/series.py +++ b/narwhals/_arrow/series.py @@ -826,7 +826,7 @@ def timestamp(self: Self, time_unit: Literal["ns", "us", "ms"] = "us") -> ArrowS if time_unit == "ms": result = pc.multiply(time_s, 1_000) else: - msg = "Input should be either of TimeStamp or Date type" + msg = "Input should be either of Date or Datetime type" raise TypeError(msg) return self._arrow_series._from_native_series(result) diff --git a/narwhals/_dask/expr.py b/narwhals/_dask/expr.py index 8e01fb045..1f753f005 100644 --- a/narwhals/_dask/expr.py +++ b/narwhals/_dask/expr.py @@ -10,6 +10,8 @@ from narwhals._dask.utils import add_row_index from narwhals._dask.utils import maybe_evaluate from narwhals._dask.utils import narwhals_to_native_dtype +from narwhals._pandas_like.utils import calculate_timestamp_date +from narwhals._pandas_like.utils import calculate_timestamp_datetime from narwhals._pandas_like.utils import native_to_narwhals_dtype from narwhals.utils import generate_unique_token @@ -962,46 +964,15 @@ def func( mask_na = s.isna() if dtype == self._expr._dtypes.Date: s_cast = s.astype("Int32[pyarrow]") * 86_400 - if time_unit == "ns": - result = s_cast * 1_000_000_000 - if time_unit == "us": - result = s_cast * 1_000_000 - if time_unit == "ms": - result = s_cast * 1_000 + result = calculate_timestamp_date(s_cast, time_unit) elif dtype == self._expr._dtypes.Datetime: original_time_unit = dtype.time_unit # type: ignore[attr-defined] - if is_pyarrow_dtype: - s_cast = s.astype("Int64[pyarrow]") - else: - s_cast = s.astype("int64") - if original_time_unit == "ns": - if time_unit == "ns": - result = s_cast - if time_unit == "us": - result = s_cast / 1_000 - if time_unit == "ms": - result = s_cast / 1_000_000 - if original_time_unit == "us": - if time_unit == "ns": - result = s_cast * 1_000 - if time_unit == "us": - result = s_cast - if time_unit == "ms": - result = s_cast / 1_000 - if original_time_unit == "ms": - if time_unit == "ns": - result = s_cast * 1_000_000 - if time_unit == "us": - result = s_cast * 1_000 - if time_unit == "ms": - result = s_cast - if original_time_unit == "s": - if time_unit == "ns": - result = s_cast * 1_000_000_000 - if time_unit == "us": - result = s_cast * 1_000_000 - if time_unit == "ms": - result = s_cast * 1_000 + s_cast = ( + s.astype("Int64[pyarrow]") if is_pyarrow_dtype else s.astype("int64") + ) + result = calculate_timestamp_datetime( + s_cast, original_time_unit, time_unit + ) else: msg = "Input should be either of Date or Datetime type" raise TypeError(msg) diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py index 654dd3862..d7fa3afce 100644 --- a/narwhals/_pandas_like/series.py +++ b/narwhals/_pandas_like/series.py @@ -8,6 +8,8 @@ from typing import Sequence from typing import overload +from narwhals._pandas_like.utils import calculate_timestamp_date +from narwhals._pandas_like.utils import calculate_timestamp_datetime from narwhals._pandas_like.utils import int_dtype_mapper from narwhals._pandas_like.utils import narwhals_to_native_dtype from narwhals._pandas_like.utils import native_series_from_iterable @@ -952,43 +954,11 @@ def timestamp(self, time_unit: Literal["ns", "us", "ms"] = "us") -> PandasLikeSe mask_na = s.isna() if dtype == self._pandas_series._dtypes.Date: s_cast = s.astype("Int32[pyarrow]") * 86_400 - if time_unit == "ns": - result = s_cast * 1_000_000_000 - if time_unit == "us": - result = s_cast * 1_000_000 - if time_unit == "ms": - result = s_cast * 1_000 + result = calculate_timestamp_date(s_cast, time_unit) elif dtype == self._pandas_series._dtypes.Datetime: original_time_unit = dtype.time_unit # type: ignore[attr-defined] s_cast = s.astype("Int64[pyarrow]") if is_pyarrow_dtype else s.astype("int64") - if original_time_unit == "ns": - if time_unit == "ns": - result = s_cast - if time_unit == "us": - result = s_cast / 1_000 - if time_unit == "ms": - result = s_cast / 1_000_000 - if original_time_unit == "us": - if time_unit == "ns": - result = s_cast * 1_000 - if time_unit == "us": - result = s_cast - if time_unit == "ms": - result = s_cast / 1_000 - if original_time_unit == "ms": - if time_unit == "ns": - result = s_cast * 1_000_000 - if time_unit == "us": - result = s_cast * 1_000 - if time_unit == "ms": - result = s_cast - if original_time_unit == "s": - if time_unit == "ns": - result = s_cast * 1_000_000_000 - if time_unit == "us": - result = s_cast * 1_000_000 - if time_unit == "ms": - result = s_cast * 1_000 + result = calculate_timestamp_datetime(s_cast, original_time_unit, time_unit) else: msg = "Input should be either of Date or Datetime type" raise TypeError(msg) diff --git a/narwhals/_pandas_like/utils.py b/narwhals/_pandas_like/utils.py index 0c5ec4711..976f750a8 100644 --- a/narwhals/_pandas_like/utils.py +++ b/narwhals/_pandas_like/utils.py @@ -542,3 +542,47 @@ def convert_str_slice_to_int_slice( stop = columns.get_loc(str_slice.stop) + 1 if str_slice.stop is not None else None step = str_slice.step return (start, stop, step) + + +def calculate_timestamp_datetime( + s_cast: int, original_time_unit: str, time_unit: str +) -> Any: + if original_time_unit == "ns": + if time_unit == "ns": + result = s_cast + if time_unit == "us": + result = s_cast // 1_000 + if time_unit == "ms": + result = s_cast // 1_000_000 + if original_time_unit == "us": + if time_unit == "ns": + result = s_cast * 1_000 + if time_unit == "us": + result = s_cast + if time_unit == "ms": + result = s_cast // 1_000 + if original_time_unit == "ms": + if time_unit == "ns": + result = s_cast * 1_000_000 + if time_unit == "us": + result = s_cast * 1_000 + if time_unit == "ms": + result = s_cast + if original_time_unit == "s": + if time_unit == "ns": + result = s_cast * 1_000_000_000 + if time_unit == "us": + result = s_cast * 1_000_000 + if time_unit == "ms": + result = s_cast * 1_000 + return result + + +def calculate_timestamp_date(s_cast: int, time_unit: str) -> Any: + if time_unit == "ns": + result = s_cast * 1_000_000_000 + if time_unit == "us": + result = s_cast * 1_000_000 + if time_unit == "ms": + result = s_cast * 1_000 + return result diff --git a/tests/expr_and_series/dt/datetime_attributes_test.py b/tests/expr_and_series/dt/datetime_attributes_test.py index ead47a5e5..484d9bfca 100644 --- a/tests/expr_and_series/dt/datetime_attributes_test.py +++ b/tests/expr_and_series/dt/datetime_attributes_test.py @@ -141,28 +141,85 @@ def test_timestamp_datetimes( compare_dicts(result, {"a": expected}) +@pytest.mark.parametrize( + ("time_unit", "expected"), + [ + ("ns", [978307200000000000, None, 978480000000000000]), + ("us", [978307200000000, None, 978480000000000]), + ("ms", [978307200000, None, 978480000000]), + ], +) def test_timestamp_dates( - request: pytest.FixtureRequest, constructor: Constructor + request: pytest.FixtureRequest, + constructor: Constructor, + time_unit: Literal["ns", "us", "ms"], + expected: list[int | None], ) -> None: if any( x in str(constructor) - for x in ("pandas_constructor", "pandas_nullable_constructor", "cudf", "dask") + for x in ("pandas_constructor", "pandas_nullable_constructor", "cudf") ): request.applymarker(pytest.mark.xfail) + dates = {"a": [datetime(2001, 1, 1), None, datetime(2001, 1, 3)]} - df = nw.from_native(constructor(dates)) - result = df.select(nw.col("a").dt.date().dt.timestamp()) - expected = {"a": [978307200000000, None, 978480000000000]} - compare_dicts(result, expected) + if "dask" in str(constructor): + df = nw.from_native( + constructor(dates).astype({"a": "timestamp[ns][pyarrow]"}) # type: ignore[union-attr] + ) + else: + df = nw.from_native(constructor(dates)) + result = df.select(nw.col("a").dt.date().dt.timestamp(time_unit)) + compare_dicts(result, {"a": expected}) + + +def test_timestamp_invalid_date( + request: pytest.FixtureRequest, constructor: Constructor +) -> None: + if "polars" in str(constructor): + request.applymarker(pytest.mark.xfail) + data_str = {"a": ["x", "y", None]} + data_num = {"a": [1, 2, None]} + df_str = nw.from_native(constructor(data_str)) + df_num = nw.from_native(constructor(data_num)) + msg = "Input should be either of Date or Datetime type" + with pytest.raises(TypeError, match=msg): + df_str.select(nw.col("a").dt.timestamp()) + with pytest.raises(TypeError, match=msg): + df_num.select(nw.col("a").dt.timestamp()) + + +def test_timestamp_invalid_unit_expr(constructor: Constructor) -> None: + time_unit_invalid = "i" + msg = ( + "invalid `time_unit`" + f"\n\nExpected one of {{'ns', 'us', 'ms'}}, got {time_unit_invalid!r}." + ) + with pytest.raises(ValueError, match=msg): + nw.from_native(constructor(data)).select( + nw.col("a").dt.timestamp(time_unit_invalid) # type: ignore[arg-type] + ) + + +def test_timestamp_invalid_unit_series(constructor_eager: ConstructorEager) -> None: + time_unit_invalid = "i" + msg = ( + "invalid `time_unit`" + f"\n\nExpected one of {{'ns', 'us', 'ms'}}, got {time_unit_invalid!r}." + ) + with pytest.raises(ValueError, match=msg): + nw.from_native(constructor_eager(data))["a"].dt.timestamp(time_unit_invalid) # type: ignore[arg-type] def test_to_date(request: pytest.FixtureRequest, constructor: Constructor) -> None: if any( x in str(constructor) - for x in ("pandas_constructor", "pandas_nullable_constructor", "dask") + for x in ("pandas_constructor", "pandas_nullable_constructor") ): request.applymarker(pytest.mark.xfail) dates = {"a": [datetime(2001, 1, 1), None, datetime(2001, 1, 3)]} - df = nw.from_native(constructor(dates)) + if "dask" in str(constructor): + df = nw.from_native(constructor(dates).astype({"a": "timestamp[ns][pyarrow]"})) # type: ignore[union-attr] + else: + df = nw.from_native(constructor(dates)) result = df.select(nw.col("a").dt.date()) assert result.collect_schema() == {"a": nw.Date} From dc86689fb2cb5c10d1a999941d8b38b36b8382ec Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sun, 20 Oct 2024 08:32:13 +0100 Subject: [PATCH 11/25] use more elif/else statements --- narwhals/_arrow/series.py | 35 +++--- narwhals/_pandas_like/utils.py | 29 ++--- .../dt/datetime_attributes_test.py | 104 ------------------ 3 files changed, 35 insertions(+), 133 deletions(-) diff --git a/narwhals/_arrow/series.py b/narwhals/_arrow/series.py index 1a971b21d..e2d0ab126 100644 --- a/narwhals/_arrow/series.py +++ b/narwhals/_arrow/series.py @@ -792,38 +792,41 @@ def timestamp(self: Self, time_unit: Literal["ns", "us", "ms"] = "us") -> ArrowS if unit == "ns": if time_unit == "ns": result = s_cast - if time_unit == "us": - result = pc.divide(s_cast, 1_000) - if time_unit == "ms": - result = pc.divide(s_cast, 1_000_000) - if unit == "us": + elif time_unit == "us": + result = floordiv_compat(s_cast, 1_000) + else: + result = floordiv_compat(s_cast, 1_000_000) + elif unit == "us": if time_unit == "ns": result = pc.multiply(s_cast, 1_000) - if time_unit == "us": + elif time_unit == "us": result = s_cast - if time_unit == "ms": - result = pc.divide(s_cast, 1_000) - if unit == "ms": + else: + result = floordiv_compat(s_cast, 1_000) + elif unit == "ms": if time_unit == "ns": result = pc.multiply(s_cast, 1_000_000) - if time_unit == "us": + elif time_unit == "us": result = pc.multiply(s_cast, 1_000) - if time_unit == "ms": + else: result = s_cast - if unit == "s": + elif unit == "s": if time_unit == "ns": result = pc.multiply(s_cast, 1_000_000_000) - if time_unit == "us": + elif time_unit == "us": result = pc.multiply(s_cast, 1_000_000) - if time_unit == "ms": + else: result = pc.multiply(s_cast, 1_000) + else: + msg = f"unexpected time unit {unit}, please report an issue at https://github.com/narwhals-dev/narwhals" + raise AssertionError(msg) elif dtype == self._arrow_series._dtypes.Date: time_s = pc.multiply(s.cast(pa.int32()), 86400) if time_unit == "ns": result = pc.multiply(time_s, 1_000_000_000) - if time_unit == "us": + elif time_unit == "us": result = pc.multiply(time_s, 1_000_000) - if time_unit == "ms": + else: result = pc.multiply(time_s, 1_000) else: msg = "Input should be either of Date or Datetime type" diff --git a/narwhals/_pandas_like/utils.py b/narwhals/_pandas_like/utils.py index 976f750a8..1425044dd 100644 --- a/narwhals/_pandas_like/utils.py +++ b/narwhals/_pandas_like/utils.py @@ -550,39 +550,42 @@ def calculate_timestamp_datetime( if original_time_unit == "ns": if time_unit == "ns": result = s_cast - if time_unit == "us": + elif time_unit == "us": result = s_cast // 1_000 - if time_unit == "ms": + else: result = s_cast // 1_000_000 - if original_time_unit == "us": + elif original_time_unit == "us": if time_unit == "ns": result = s_cast * 1_000 - if time_unit == "us": + elif time_unit == "us": result = s_cast - if time_unit == "ms": + else: result = s_cast // 1_000 - if original_time_unit == "ms": + elif original_time_unit == "ms": if time_unit == "ns": result = s_cast * 1_000_000 - if time_unit == "us": + elif time_unit == "us": result = s_cast * 1_000 - if time_unit == "ms": + else: result = s_cast - if original_time_unit == "s": + elif original_time_unit == "s": if time_unit == "ns": result = s_cast * 1_000_000_000 - if time_unit == "us": + elif time_unit == "us": result = s_cast * 1_000_000 - if time_unit == "ms": + else: result = s_cast * 1_000 + else: + msg = f"unexpected time unit {original_time_unit}, please report a bug at https://github.com/narwhals-dev/narwhals" + raise AssertionError(msg) return result def calculate_timestamp_date(s_cast: int, time_unit: str) -> Any: if time_unit == "ns": result = s_cast * 1_000_000_000 - if time_unit == "us": + elif time_unit == "us": result = s_cast * 1_000_000 - if time_unit == "ms": + else: result = s_cast * 1_000 return result diff --git a/tests/expr_and_series/dt/datetime_attributes_test.py b/tests/expr_and_series/dt/datetime_attributes_test.py index 484d9bfca..cf025317d 100644 --- a/tests/expr_and_series/dt/datetime_attributes_test.py +++ b/tests/expr_and_series/dt/datetime_attributes_test.py @@ -2,7 +2,6 @@ from datetime import date from datetime import datetime -from typing import Literal import pytest @@ -107,109 +106,6 @@ def test_datetime_chained_attributes( compare_dicts(result, {"a": [2021, 2020]}) -@pytest.mark.parametrize( - ("original_time_unit", "time_unit", "expected"), - [ - ("ns", "ns", [978307200000000000, None, 978480000000000000]), - ("ns", "us", [978307200000000, None, 978480000000000]), - ("ns", "ms", [978307200000, None, 978480000000]), - ("us", "ns", [978307200000000000, None, 978480000000000000]), - ("us", "us", [978307200000000, None, 978480000000000]), - ("us", "ms", [978307200000, None, 978480000000]), - ("ms", "ns", [978307200000000000, None, 978480000000000000]), - ("ms", "us", [978307200000000, None, 978480000000000]), - ("ms", "ms", [978307200000, None, 978480000000]), - ("s", "ns", [978307200000000000, None, 978480000000000000]), - ("s", "us", [978307200000000, None, 978480000000000]), - ("s", "ms", [978307200000, None, 978480000000]), - ], -) -def test_timestamp_datetimes( - request: pytest.FixtureRequest, - constructor: Constructor, - original_time_unit: Literal["us", "ns", "ms", "s"], - time_unit: Literal["ns", "us", "ms"], - expected: list[int | None], -) -> None: - if original_time_unit == "s" and "polars" in str(constructor): - request.applymarker(pytest.mark.xfail) - datetimes = {"a": [datetime(2001, 1, 1), None, datetime(2001, 1, 3)]} - df = nw.from_native(constructor(datetimes)) - result = df.select( - nw.col("a").cast(nw.Datetime(original_time_unit)).dt.timestamp(time_unit) - ) - compare_dicts(result, {"a": expected}) - - -@pytest.mark.parametrize( - ("time_unit", "expected"), - [ - ("ns", [978307200000000000, None, 978480000000000000]), - ("us", [978307200000000, None, 978480000000000]), - ("ms", [978307200000, None, 978480000000]), - ], -) -def test_timestamp_dates( - request: pytest.FixtureRequest, - constructor: Constructor, - time_unit: Literal["ns", "us", "ms"], - expected: list[int | None], -) -> None: - if any( - x in str(constructor) - for x in ("pandas_constructor", "pandas_nullable_constructor", "cudf") - ): - request.applymarker(pytest.mark.xfail) - - dates = {"a": [datetime(2001, 1, 1), None, datetime(2001, 1, 3)]} - if "dask" in str(constructor): - df = nw.from_native( - constructor(dates).astype({"a": "timestamp[ns][pyarrow]"}) # type: ignore[union-attr] - ) - else: - df = nw.from_native(constructor(dates)) - result = df.select(nw.col("a").dt.date().dt.timestamp(time_unit)) - compare_dicts(result, {"a": expected}) - - -def test_timestamp_invalid_date( - request: pytest.FixtureRequest, constructor: Constructor -) -> None: - if "polars" in str(constructor): - request.applymarker(pytest.mark.xfail) - data_str = {"a": ["x", "y", None]} - data_num = {"a": [1, 2, None]} - df_str = nw.from_native(constructor(data_str)) - df_num = nw.from_native(constructor(data_num)) - msg = "Input should be either of Date or Datetime type" - with pytest.raises(TypeError, match=msg): - df_str.select(nw.col("a").dt.timestamp()) - with pytest.raises(TypeError, match=msg): - df_num.select(nw.col("a").dt.timestamp()) - - -def test_timestamp_invalid_unit_expr(constructor: Constructor) -> None: - time_unit_invalid = "i" - msg = ( - "invalid `time_unit`" - f"\n\nExpected one of {{'ns', 'us', 'ms'}}, got {time_unit_invalid!r}." - ) - with pytest.raises(ValueError, match=msg): - nw.from_native(constructor(data)).select( - nw.col("a").dt.timestamp(time_unit_invalid) # type: ignore[arg-type] - ) - - -def test_timestamp_invalid_unit_series(constructor_eager: ConstructorEager) -> None: - time_unit_invalid = "i" - msg = ( - "invalid `time_unit`" - f"\n\nExpected one of {{'ns', 'us', 'ms'}}, got {time_unit_invalid!r}." - ) - with pytest.raises(ValueError, match=msg): - nw.from_native(constructor_eager(data))["a"].dt.timestamp(time_unit_invalid) # type: ignore[arg-type] - - def test_to_date(request: pytest.FixtureRequest, constructor: Constructor) -> None: if any( x in str(constructor) From 70d6462134c3c17abbfb3caec279a7e6444641da Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sun, 20 Oct 2024 08:45:32 +0100 Subject: [PATCH 12/25] add timestamp_test --- tests/expr_and_series/dt/timestamp_test.py | 153 +++++++++++++++++++++ 1 file changed, 153 insertions(+) create mode 100644 tests/expr_and_series/dt/timestamp_test.py diff --git a/tests/expr_and_series/dt/timestamp_test.py b/tests/expr_and_series/dt/timestamp_test.py new file mode 100644 index 000000000..5900ce265 --- /dev/null +++ b/tests/expr_and_series/dt/timestamp_test.py @@ -0,0 +1,153 @@ +from __future__ import annotations + +from datetime import datetime +from typing import Literal + +import hypothesis.strategies as st +import pytest +from hypothesis import given + +import narwhals.stable.v1 as nw +from tests.utils import Constructor +from tests.utils import ConstructorEager +from tests.utils import compare_dicts + +data = { + "a": [ + datetime(2021, 3, 1, 12, 34, 56, 49000), + datetime(2020, 1, 2, 2, 4, 14, 715000), + ], +} + + +@pytest.mark.parametrize( + ("original_time_unit", "time_unit", "expected"), + [ + ("ns", "ns", [978307200000000000, None, 978480000000000000]), + ("ns", "us", [978307200000000, None, 978480000000000]), + ("ns", "ms", [978307200000, None, 978480000000]), + ("us", "ns", [978307200000000000, None, 978480000000000000]), + ("us", "us", [978307200000000, None, 978480000000000]), + ("us", "ms", [978307200000, None, 978480000000]), + ("ms", "ns", [978307200000000000, None, 978480000000000000]), + ("ms", "us", [978307200000000, None, 978480000000000]), + ("ms", "ms", [978307200000, None, 978480000000]), + ("s", "ns", [978307200000000000, None, 978480000000000000]), + ("s", "us", [978307200000000, None, 978480000000000]), + ("s", "ms", [978307200000, None, 978480000000]), + ], +) +def test_timestamp_datetimes( + request: pytest.FixtureRequest, + constructor: Constructor, + original_time_unit: Literal["us", "ns", "ms", "s"], + time_unit: Literal["ns", "us", "ms"], + expected: list[int | None], +) -> None: + if original_time_unit == "s" and "polars" in str(constructor): + request.applymarker(pytest.mark.xfail) + datetimes = {"a": [datetime(2001, 1, 1), None, datetime(2001, 1, 3)]} + df = nw.from_native(constructor(datetimes)) + result = df.select( + nw.col("a").cast(nw.Datetime(original_time_unit)).dt.timestamp(time_unit) + ) + compare_dicts(result, {"a": expected}) + + +@pytest.mark.parametrize( + ("time_unit", "expected"), + [ + ("ns", [978307200000000000, None, 978480000000000000]), + ("us", [978307200000000, None, 978480000000000]), + ("ms", [978307200000, None, 978480000000]), + ], +) +def test_timestamp_dates( + request: pytest.FixtureRequest, + constructor: Constructor, + time_unit: Literal["ns", "us", "ms"], + expected: list[int | None], +) -> None: + if any( + x in str(constructor) + for x in ("pandas_constructor", "pandas_nullable_constructor", "cudf") + ): + request.applymarker(pytest.mark.xfail) + + dates = {"a": [datetime(2001, 1, 1), None, datetime(2001, 1, 3)]} + if "dask" in str(constructor): + df = nw.from_native( + constructor(dates).astype({"a": "timestamp[ns][pyarrow]"}) # type: ignore[union-attr] + ) + else: + df = nw.from_native(constructor(dates)) + result = df.select(nw.col("a").dt.date().dt.timestamp(time_unit)) + compare_dicts(result, {"a": expected}) + + +def test_timestamp_invalid_date( + request: pytest.FixtureRequest, constructor: Constructor +) -> None: + if "polars" in str(constructor): + request.applymarker(pytest.mark.xfail) + data_str = {"a": ["x", "y", None]} + data_num = {"a": [1, 2, None]} + df_str = nw.from_native(constructor(data_str)) + df_num = nw.from_native(constructor(data_num)) + msg = "Input should be either of Date or Datetime type" + with pytest.raises(TypeError, match=msg): + df_str.select(nw.col("a").dt.timestamp()) + with pytest.raises(TypeError, match=msg): + df_num.select(nw.col("a").dt.timestamp()) + + +def test_timestamp_invalid_unit_expr(constructor: Constructor) -> None: + time_unit_invalid = "i" + msg = ( + "invalid `time_unit`" + f"\n\nExpected one of {{'ns', 'us', 'ms'}}, got {time_unit_invalid!r}." + ) + with pytest.raises(ValueError, match=msg): + nw.from_native(constructor(data)).select( + nw.col("a").dt.timestamp(time_unit_invalid) # type: ignore[arg-type] + ) + + +def test_timestamp_invalid_unit_series(constructor_eager: ConstructorEager) -> None: + time_unit_invalid = "i" + msg = ( + "invalid `time_unit`" + f"\n\nExpected one of {{'ns', 'us', 'ms'}}, got {time_unit_invalid!r}." + ) + with pytest.raises(ValueError, match=msg): + nw.from_native(constructor_eager(data))["a"].dt.timestamp(time_unit_invalid) # type: ignore[arg-type] + + +@given( # type: ignore[misc] + inputs=st.datetimes(min_value=datetime(1960, 1, 1), max_value=datetime(1980, 1, 1)), + time_unit=st.sampled_from(["ms", "us", "ns"]), + # We keep 'ms' out for now due to an upstream bug: https://github.com/pola-rs/polars/issues/19309 + starting_time_unit=st.sampled_from(["us", "ns"]), +) +def test_timestamp_hypothesis( + inputs: datetime, + time_unit: Literal["ms", "us", "ns"], + starting_time_unit: Literal["ms", "us", "ns"], +) -> None: + import pandas as pd + import polars as pl + import pyarrow as pa + + @nw.narwhalify + def func(s: nw.Series) -> nw.Series: + return s.dt.timestamp(time_unit) # type: ignore[return-value] + + result_pl = func(pl.Series([inputs], dtype=pl.Datetime(starting_time_unit))) + result_pd = func(pd.Series([inputs], dtype=f"datetime64[{starting_time_unit}]")) + result_pdpa = func( + pd.Series([inputs], dtype=f"timestamp[{starting_time_unit}][pyarrow]") + ) + result_pa = func(pa.chunked_array([[inputs]], type=pa.timestamp(starting_time_unit))) + assert result_pl[0] == result_pd[0] + assert result_pl[0] == result_pdpa[0] + assert result_pl[0] == result_pa[0].as_py() From b1e97c90c8e353736c629684427caae8685c18b9 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sun, 20 Oct 2024 08:57:34 +0100 Subject: [PATCH 13/25] version compat --- tests/expr_and_series/dt/timestamp_test.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/expr_and_series/dt/timestamp_test.py b/tests/expr_and_series/dt/timestamp_test.py index 5900ce265..ca9882423 100644 --- a/tests/expr_and_series/dt/timestamp_test.py +++ b/tests/expr_and_series/dt/timestamp_test.py @@ -4,10 +4,12 @@ from typing import Literal import hypothesis.strategies as st +import pandas as pd import pytest from hypothesis import given import narwhals.stable.v1 as nw +from narwhals.utils import parse_version from tests.utils import Constructor from tests.utils import ConstructorEager from tests.utils import compare_dicts @@ -129,6 +131,7 @@ def test_timestamp_invalid_unit_series(constructor_eager: ConstructorEager) -> N # We keep 'ms' out for now due to an upstream bug: https://github.com/pola-rs/polars/issues/19309 starting_time_unit=st.sampled_from(["us", "ns"]), ) +@pytest.mark.skipif(parse_version(pd.__version__) < (2, 2), reason="bug in old pandas") def test_timestamp_hypothesis( inputs: datetime, time_unit: Literal["ms", "us", "ns"], From 124d588503410e77b015db7a1c92280be7adfb5f Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sun, 20 Oct 2024 09:05:20 +0100 Subject: [PATCH 14/25] pandas versions compat --- narwhals/_pandas_like/series.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py index d7fa3afce..7312e11fe 100644 --- a/narwhals/_pandas_like/series.py +++ b/narwhals/_pandas_like/series.py @@ -957,7 +957,15 @@ def timestamp(self, time_unit: Literal["ns", "us", "ms"] = "us") -> PandasLikeSe result = calculate_timestamp_date(s_cast, time_unit) elif dtype == self._pandas_series._dtypes.Datetime: original_time_unit = dtype.time_unit # type: ignore[attr-defined] - s_cast = s.astype("Int64[pyarrow]") if is_pyarrow_dtype else s.astype("int64") + if ( + self._pandas_series._implementation is Implementation.PANDAS + and self._pandas_series._backend_version < (2,) + ): # pragma: no cover + s_cast = s.view("Int64[pyarrow]") if is_pyarrow_dtype else s.view("int64") + else: + s_cast = ( + s.astype("Int64[pyarrow]") if is_pyarrow_dtype else s.astype("int64") + ) result = calculate_timestamp_datetime(s_cast, original_time_unit, time_unit) else: msg = "Input should be either of Date or Datetime type" From 62770947899fdf2fa28a6ac41fb88ac6dff751cb Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sun, 20 Oct 2024 09:14:18 +0100 Subject: [PATCH 15/25] coverage --- narwhals/_arrow/series.py | 2 +- narwhals/_pandas_like/utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/narwhals/_arrow/series.py b/narwhals/_arrow/series.py index e2d0ab126..f029a4d5c 100644 --- a/narwhals/_arrow/series.py +++ b/narwhals/_arrow/series.py @@ -817,7 +817,7 @@ def timestamp(self: Self, time_unit: Literal["ns", "us", "ms"] = "us") -> ArrowS result = pc.multiply(s_cast, 1_000_000) else: result = pc.multiply(s_cast, 1_000) - else: + else: # pragma: no cover msg = f"unexpected time unit {unit}, please report an issue at https://github.com/narwhals-dev/narwhals" raise AssertionError(msg) elif dtype == self._arrow_series._dtypes.Date: diff --git a/narwhals/_pandas_like/utils.py b/narwhals/_pandas_like/utils.py index 1425044dd..ef7e58e1b 100644 --- a/narwhals/_pandas_like/utils.py +++ b/narwhals/_pandas_like/utils.py @@ -575,7 +575,7 @@ def calculate_timestamp_datetime( result = s_cast * 1_000_000 else: result = s_cast * 1_000 - else: + else: # pragma: no cover msg = f"unexpected time unit {original_time_unit}, please report a bug at https://github.com/narwhals-dev/narwhals" raise AssertionError(msg) return result From d158f2266428e19118caef5856960cdba082ca48 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sun, 20 Oct 2024 10:46:12 +0100 Subject: [PATCH 16/25] improve type hints --- narwhals/_pandas_like/series.py | 2 +- narwhals/_pandas_like/utils.py | 37 +++++++++++++++++---------------- 2 files changed, 20 insertions(+), 19 deletions(-) diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py index 7312e11fe..de63f5ff5 100644 --- a/narwhals/_pandas_like/series.py +++ b/narwhals/_pandas_like/series.py @@ -953,7 +953,7 @@ def timestamp(self, time_unit: Literal["ns", "us", "ms"] = "us") -> PandasLikeSe is_pyarrow_dtype = "pyarrow" in str(self._pandas_series._native_series.dtype) mask_na = s.isna() if dtype == self._pandas_series._dtypes.Date: - s_cast = s.astype("Int32[pyarrow]") * 86_400 + s_cast = s.astype("Int32[pyarrow]") result = calculate_timestamp_date(s_cast, time_unit) elif dtype == self._pandas_series._dtypes.Datetime: original_time_unit = dtype.time_unit # type: ignore[attr-defined] diff --git a/narwhals/_pandas_like/utils.py b/narwhals/_pandas_like/utils.py index ef7e58e1b..2c4612eb1 100644 --- a/narwhals/_pandas_like/utils.py +++ b/narwhals/_pandas_like/utils.py @@ -545,47 +545,48 @@ def convert_str_slice_to_int_slice( def calculate_timestamp_datetime( - s_cast: int, original_time_unit: str, time_unit: str -) -> Any: + s: pd.Series, original_time_unit: str, time_unit: str +) -> pd.Series: if original_time_unit == "ns": if time_unit == "ns": - result = s_cast + result = s elif time_unit == "us": - result = s_cast // 1_000 + result = s // 1_000 else: - result = s_cast // 1_000_000 + result = s // 1_000_000 elif original_time_unit == "us": if time_unit == "ns": - result = s_cast * 1_000 + result = s * 1_000 elif time_unit == "us": - result = s_cast + result = s else: - result = s_cast // 1_000 + result = s // 1_000 elif original_time_unit == "ms": if time_unit == "ns": - result = s_cast * 1_000_000 + result = s * 1_000_000 elif time_unit == "us": - result = s_cast * 1_000 + result = s * 1_000 else: - result = s_cast + result = s elif original_time_unit == "s": if time_unit == "ns": - result = s_cast * 1_000_000_000 + result = s * 1_000_000_000 elif time_unit == "us": - result = s_cast * 1_000_000 + result = s * 1_000_000 else: - result = s_cast * 1_000 + result = s * 1_000 else: # pragma: no cover msg = f"unexpected time unit {original_time_unit}, please report a bug at https://github.com/narwhals-dev/narwhals" raise AssertionError(msg) return result -def calculate_timestamp_date(s_cast: int, time_unit: str) -> Any: +def calculate_timestamp_date(s: pd.Series, time_unit: str) -> pd.Series: + s = s * 86_400 # number of seconds in a day if time_unit == "ns": - result = s_cast * 1_000_000_000 + result = s * 1_000_000_000 elif time_unit == "us": - result = s_cast * 1_000_000 + result = s * 1_000_000 else: - result = s_cast * 1_000 + result = s * 1_000 return result From 49d235cae2915b3a59aa3ffb355fa2cac834b0a2 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sun, 20 Oct 2024 10:53:19 +0100 Subject: [PATCH 17/25] insert a time zone for good measure --- narwhals/_dask/expr.py | 2 +- tests/expr_and_series/dt/timestamp_test.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/narwhals/_dask/expr.py b/narwhals/_dask/expr.py index 1f753f005..d34fcaeec 100644 --- a/narwhals/_dask/expr.py +++ b/narwhals/_dask/expr.py @@ -963,7 +963,7 @@ def func( is_pyarrow_dtype = "pyarrow" in str(dtype) mask_na = s.isna() if dtype == self._expr._dtypes.Date: - s_cast = s.astype("Int32[pyarrow]") * 86_400 + s_cast = s.astype("Int32[pyarrow]") result = calculate_timestamp_date(s_cast, time_unit) elif dtype == self._expr._dtypes.Datetime: original_time_unit = dtype.time_unit # type: ignore[attr-defined] diff --git a/tests/expr_and_series/dt/timestamp_test.py b/tests/expr_and_series/dt/timestamp_test.py index ca9882423..bf06e8651 100644 --- a/tests/expr_and_series/dt/timestamp_test.py +++ b/tests/expr_and_series/dt/timestamp_test.py @@ -51,7 +51,10 @@ def test_timestamp_datetimes( datetimes = {"a": [datetime(2001, 1, 1), None, datetime(2001, 1, 3)]} df = nw.from_native(constructor(datetimes)) result = df.select( - nw.col("a").cast(nw.Datetime(original_time_unit)).dt.timestamp(time_unit) + nw.col("a") + .cast(nw.Datetime(original_time_unit)) + .dt.convert_time_zone("Asia/Kathmandu") + .dt.timestamp(time_unit) ) compare_dicts(result, {"a": expected}) From 674b91232cba8136b56189ec9fdfb43e27de345f Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sun, 20 Oct 2024 10:55:11 +0100 Subject: [PATCH 18/25] set time zone to utc first --- tests/expr_and_series/dt/timestamp_test.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/expr_and_series/dt/timestamp_test.py b/tests/expr_and_series/dt/timestamp_test.py index bf06e8651..b98e0207c 100644 --- a/tests/expr_and_series/dt/timestamp_test.py +++ b/tests/expr_and_series/dt/timestamp_test.py @@ -53,6 +53,7 @@ def test_timestamp_datetimes( result = df.select( nw.col("a") .cast(nw.Datetime(original_time_unit)) + .dt.replace_time_zone("UTC") .dt.convert_time_zone("Asia/Kathmandu") .dt.timestamp(time_unit) ) From 263f093a642102bba3048c7b1b1f33c4c134ad0b Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sun, 20 Oct 2024 10:59:14 +0100 Subject: [PATCH 19/25] split time zone into separate test --- tests/expr_and_series/dt/timestamp_test.py | 43 ++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/tests/expr_and_series/dt/timestamp_test.py b/tests/expr_and_series/dt/timestamp_test.py index b98e0207c..b1ed39fd2 100644 --- a/tests/expr_and_series/dt/timestamp_test.py +++ b/tests/expr_and_series/dt/timestamp_test.py @@ -5,6 +5,7 @@ import hypothesis.strategies as st import pandas as pd +import pyarrow as pa import pytest from hypothesis import given @@ -13,6 +14,7 @@ from tests.utils import Constructor from tests.utils import ConstructorEager from tests.utils import compare_dicts +from tests.utils import is_windows data = { "a": [ @@ -50,6 +52,47 @@ def test_timestamp_datetimes( request.applymarker(pytest.mark.xfail) datetimes = {"a": [datetime(2001, 1, 1), None, datetime(2001, 1, 3)]} df = nw.from_native(constructor(datetimes)) + result = df.select( + nw.col("a").cast(nw.Datetime(original_time_unit)).dt.timestamp(time_unit) + ) + compare_dicts(result, {"a": expected}) + + +@pytest.mark.parametrize( + ("original_time_unit", "time_unit", "expected"), + [ + ("ns", "ns", [978307200000000000, None, 978480000000000000]), + ("ns", "us", [978307200000000, None, 978480000000000]), + ("ns", "ms", [978307200000, None, 978480000000]), + ("us", "ns", [978307200000000000, None, 978480000000000000]), + ("us", "us", [978307200000000, None, 978480000000000]), + ("us", "ms", [978307200000, None, 978480000000]), + ("ms", "ns", [978307200000000000, None, 978480000000000000]), + ("ms", "us", [978307200000000, None, 978480000000000]), + ("ms", "ms", [978307200000, None, 978480000000]), + ("s", "ns", [978307200000000000, None, 978480000000000000]), + ("s", "us", [978307200000000, None, 978480000000000]), + ("s", "ms", [978307200000, None, 978480000000]), + ], +) +def test_timestamp_datetimes_tz_aware( + request: pytest.FixtureRequest, + constructor: Constructor, + original_time_unit: Literal["us", "ns", "ms", "s"], + time_unit: Literal["ns", "us", "ms"], + expected: list[int | None], +) -> None: + if ( + (any(x in str(constructor) for x in ("pyarrow", "modin")) and is_windows()) + or ("pandas_pyarrow" in str(constructor) and parse_version(pd.__version__) < (2,)) + or ("pyarrow_table" in str(constructor) and parse_version(pa.__version__) < (12,)) + or ("cudf" in str(constructor)) + ): + request.applymarker(pytest.mark.xfail) + if original_time_unit == "s" and "polars" in str(constructor): + request.applymarker(pytest.mark.xfail) + datetimes = {"a": [datetime(2001, 1, 1), None, datetime(2001, 1, 3)]} + df = nw.from_native(constructor(datetimes)) result = df.select( nw.col("a") .cast(nw.Datetime(original_time_unit)) From 532b07f5d55bff4818ac4f1cecca540beabbf6d1 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sun, 20 Oct 2024 11:48:47 +0100 Subject: [PATCH 20/25] more version-dependent xfails --- tests/expr_and_series/dt/timestamp_test.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tests/expr_and_series/dt/timestamp_test.py b/tests/expr_and_series/dt/timestamp_test.py index b1ed39fd2..8d8399cc2 100644 --- a/tests/expr_and_series/dt/timestamp_test.py +++ b/tests/expr_and_series/dt/timestamp_test.py @@ -50,6 +50,12 @@ def test_timestamp_datetimes( ) -> None: if original_time_unit == "s" and "polars" in str(constructor): request.applymarker(pytest.mark.xfail) + if ( + "pandas_pyarrow" in str(constructor) + and parse_version(pd.__version__) < (2, 2) + and (original_time_unit == "ns" and time_unit != "ns") + ): + request.applymarker(pytest.mark.xfail) datetimes = {"a": [datetime(2001, 1, 1), None, datetime(2001, 1, 3)]} df = nw.from_native(constructor(datetimes)) result = df.select( @@ -89,6 +95,12 @@ def test_timestamp_datetimes_tz_aware( or ("cudf" in str(constructor)) ): request.applymarker(pytest.mark.xfail) + if ( + "pandas_pyarrow" in str(constructor) + and parse_version(pd.__version__) < (2, 2) + and (original_time_unit == "ns" and time_unit != "ns") + ): + request.applymarker(pytest.mark.xfail) if original_time_unit == "s" and "polars" in str(constructor): request.applymarker(pytest.mark.xfail) datetimes = {"a": [datetime(2001, 1, 1), None, datetime(2001, 1, 3)]} From 186f926805e8153399acf1330a708c460073cb7a Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sun, 20 Oct 2024 12:00:59 +0100 Subject: [PATCH 21/25] xfail strict=False for these --- tests/expr_and_series/dt/timestamp_test.py | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/tests/expr_and_series/dt/timestamp_test.py b/tests/expr_and_series/dt/timestamp_test.py index 8d8399cc2..9dea1c4e6 100644 --- a/tests/expr_and_series/dt/timestamp_test.py +++ b/tests/expr_and_series/dt/timestamp_test.py @@ -50,12 +50,9 @@ def test_timestamp_datetimes( ) -> None: if original_time_unit == "s" and "polars" in str(constructor): request.applymarker(pytest.mark.xfail) - if ( - "pandas_pyarrow" in str(constructor) - and parse_version(pd.__version__) < (2, 2) - and (original_time_unit == "ns" and time_unit != "ns") - ): - request.applymarker(pytest.mark.xfail) + if "pandas_pyarrow" in str(constructor) and parse_version(pd.__version__) < (2, 2): + # pyarrow-backed timestamps were too inconsistent and unreliable before 2.2 + request.applymarker(pytest.mark.xfail(strict=False)) datetimes = {"a": [datetime(2001, 1, 1), None, datetime(2001, 1, 3)]} df = nw.from_native(constructor(datetimes)) result = df.select( @@ -95,12 +92,9 @@ def test_timestamp_datetimes_tz_aware( or ("cudf" in str(constructor)) ): request.applymarker(pytest.mark.xfail) - if ( - "pandas_pyarrow" in str(constructor) - and parse_version(pd.__version__) < (2, 2) - and (original_time_unit == "ns" and time_unit != "ns") - ): - request.applymarker(pytest.mark.xfail) + if "pandas_pyarrow" in str(constructor) and parse_version(pd.__version__) < (2, 2): + # pyarrow-backed timestamps were too inconsistent and unreliable before 2.2 + request.applymarker(pytest.mark.xfail(strict=False)) if original_time_unit == "s" and "polars" in str(constructor): request.applymarker(pytest.mark.xfail) datetimes = {"a": [datetime(2001, 1, 1), None, datetime(2001, 1, 3)]} From a5ed5f6d37e7588723a0b990cb76d36a2f7d01ed Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sun, 20 Oct 2024 12:05:18 +0100 Subject: [PATCH 22/25] coverage --- tests/expr_and_series/dt/timestamp_test.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tests/expr_and_series/dt/timestamp_test.py b/tests/expr_and_series/dt/timestamp_test.py index 9dea1c4e6..da6092e4d 100644 --- a/tests/expr_and_series/dt/timestamp_test.py +++ b/tests/expr_and_series/dt/timestamp_test.py @@ -50,7 +50,10 @@ def test_timestamp_datetimes( ) -> None: if original_time_unit == "s" and "polars" in str(constructor): request.applymarker(pytest.mark.xfail) - if "pandas_pyarrow" in str(constructor) and parse_version(pd.__version__) < (2, 2): + if "pandas_pyarrow" in str(constructor) and parse_version(pd.__version__) < ( + 2, + 2, + ): # pragma: no cover # pyarrow-backed timestamps were too inconsistent and unreliable before 2.2 request.applymarker(pytest.mark.xfail(strict=False)) datetimes = {"a": [datetime(2001, 1, 1), None, datetime(2001, 1, 3)]} @@ -92,7 +95,10 @@ def test_timestamp_datetimes_tz_aware( or ("cudf" in str(constructor)) ): request.applymarker(pytest.mark.xfail) - if "pandas_pyarrow" in str(constructor) and parse_version(pd.__version__) < (2, 2): + if "pandas_pyarrow" in str(constructor) and parse_version(pd.__version__) < ( + 2, + 2, + ): # pragma: no cover # pyarrow-backed timestamps were too inconsistent and unreliable before 2.2 request.applymarker(pytest.mark.xfail(strict=False)) if original_time_unit == "s" and "polars" in str(constructor): From a022148860749c9c0ec44055c4c19457964db0b6 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sun, 20 Oct 2024 12:20:10 +0100 Subject: [PATCH 23/25] dask xfail --- tests/expr_and_series/dt/timestamp_test.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/expr_and_series/dt/timestamp_test.py b/tests/expr_and_series/dt/timestamp_test.py index da6092e4d..64538cc23 100644 --- a/tests/expr_and_series/dt/timestamp_test.py +++ b/tests/expr_and_series/dt/timestamp_test.py @@ -101,6 +101,12 @@ def test_timestamp_datetimes_tz_aware( ): # pragma: no cover # pyarrow-backed timestamps were too inconsistent and unreliable before 2.2 request.applymarker(pytest.mark.xfail(strict=False)) + if "dask" in str(constructor): + import dask + + if parse_version(dask.__version__) < (2024, 8): # pragma: no cover + request.applymarker(pytest.mark.xfail) + if original_time_unit == "s" and "polars" in str(constructor): request.applymarker(pytest.mark.xfail) datetimes = {"a": [datetime(2001, 1, 1), None, datetime(2001, 1, 3)]} From 3a572e6e85324dcc2cc0922a6853c7f3d21eea56 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sun, 20 Oct 2024 12:32:58 +0100 Subject: [PATCH 24/25] modin xfail --- tests/expr_and_series/dt/timestamp_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/expr_and_series/dt/timestamp_test.py b/tests/expr_and_series/dt/timestamp_test.py index 64538cc23..3f3e87654 100644 --- a/tests/expr_and_series/dt/timestamp_test.py +++ b/tests/expr_and_series/dt/timestamp_test.py @@ -89,7 +89,7 @@ def test_timestamp_datetimes_tz_aware( expected: list[int | None], ) -> None: if ( - (any(x in str(constructor) for x in ("pyarrow", "modin")) and is_windows()) + (any(x in str(constructor) for x in ("pyarrow",)) and is_windows()) or ("pandas_pyarrow" in str(constructor) and parse_version(pd.__version__) < (2,)) or ("pyarrow_table" in str(constructor) and parse_version(pa.__version__) < (12,)) or ("cudf" in str(constructor)) From 32bb502e63f1e7555d0c554d9d6d0550afa5e79d Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Mon, 21 Oct 2024 07:54:07 +0100 Subject: [PATCH 25/25] typing --- narwhals/_dask/expr.py | 1 + narwhals/_pandas_like/series.py | 1 + narwhals/expr.py | 2 +- narwhals/series.py | 2 +- tests/expr_and_series/dt/timestamp_test.py | 2 +- 5 files changed, 5 insertions(+), 3 deletions(-) diff --git a/narwhals/_dask/expr.py b/narwhals/_dask/expr.py index d34fcaeec..1347dad8c 100644 --- a/narwhals/_dask/expr.py +++ b/narwhals/_dask/expr.py @@ -963,6 +963,7 @@ def func( is_pyarrow_dtype = "pyarrow" in str(dtype) mask_na = s.isna() if dtype == self._expr._dtypes.Date: + # Date is only supported in pandas dtypes if pyarrow-backed s_cast = s.astype("Int32[pyarrow]") result = calculate_timestamp_date(s_cast, time_unit) elif dtype == self._expr._dtypes.Datetime: diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py index de63f5ff5..a8e59bc12 100644 --- a/narwhals/_pandas_like/series.py +++ b/narwhals/_pandas_like/series.py @@ -953,6 +953,7 @@ def timestamp(self, time_unit: Literal["ns", "us", "ms"] = "us") -> PandasLikeSe is_pyarrow_dtype = "pyarrow" in str(self._pandas_series._native_series.dtype) mask_na = s.isna() if dtype == self._pandas_series._dtypes.Date: + # Date is only supported in pandas dtypes if pyarrow-backed s_cast = s.astype("Int32[pyarrow]") result = calculate_timestamp_date(s_cast, time_unit) elif dtype == self._pandas_series._dtypes.Datetime: diff --git a/narwhals/expr.py b/narwhals/expr.py index 7d46d36be..3156c426e 100644 --- a/narwhals/expr.py +++ b/narwhals/expr.py @@ -3976,7 +3976,7 @@ def convert_time_zone(self: Self, time_zone: str) -> T: lambda plx: self._expr._call(plx).dt.convert_time_zone(time_zone) ) - def timestamp(self, time_unit: Literal["ns", "us", "ms"] = "us") -> Expr: + def timestamp(self: Self, time_unit: Literal["ns", "us", "ms"] = "us") -> T: """ Return a timestamp in the given time unit. diff --git a/narwhals/series.py b/narwhals/series.py index 01ab98ef6..ea0432231 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -4007,7 +4007,7 @@ def convert_time_zone(self: Self, time_zone: str) -> T: self._narwhals_series._compliant_series.dt.convert_time_zone(time_zone) ) - def timestamp(self, time_unit: Literal["ns", "us", "ms"] = "us") -> Series: + def timestamp(self: Self, time_unit: Literal["ns", "us", "ms"] = "us") -> T: """ Return a timestamp in the given time unit. diff --git a/tests/expr_and_series/dt/timestamp_test.py b/tests/expr_and_series/dt/timestamp_test.py index 3f3e87654..4e3dcdf3a 100644 --- a/tests/expr_and_series/dt/timestamp_test.py +++ b/tests/expr_and_series/dt/timestamp_test.py @@ -208,7 +208,7 @@ def test_timestamp_hypothesis( @nw.narwhalify def func(s: nw.Series) -> nw.Series: - return s.dt.timestamp(time_unit) # type: ignore[return-value] + return s.dt.timestamp(time_unit) result_pl = func(pl.Series([inputs], dtype=pl.Datetime(starting_time_unit))) result_pd = func(pd.Series([inputs], dtype=f"datetime64[{starting_time_unit}]"))