diff --git a/bigframes/core/compile/ibis_types.py b/bigframes/core/compile/ibis_types.py index a4c37b7c5d..5abf97a78d 100644 --- a/bigframes/core/compile/ibis_types.py +++ b/bigframes/core/compile/ibis_types.py @@ -89,7 +89,7 @@ def cast_ibis_value( - value: ibis_types.Value, to_type: ibis_dtypes.DataType + value: ibis_types.Value, to_type: ibis_dtypes.DataType, safe: bool = False ) -> ibis_types.Value: """Perform compatible type casts of ibis values @@ -176,7 +176,7 @@ def cast_ibis_value( value = ibis_value_to_canonical_type(value) if value.type() in good_casts: if to_type in good_casts[value.type()]: - return value.cast(to_type) + return value.try_cast(to_type) if safe else value.cast(to_type) else: # this should never happen raise TypeError( @@ -188,10 +188,16 @@ def cast_ibis_value( # BigQuery casts bools to lower case strings. Capitalize the result to match Pandas # TODO(bmil): remove this workaround after fixing Ibis if value.type() == ibis_dtypes.bool and to_type == ibis_dtypes.string: - return cast(ibis_types.StringValue, value.cast(to_type)).capitalize() + if safe: + return cast(ibis_types.StringValue, value.try_cast(to_type)).capitalize() + else: + return cast(ibis_types.StringValue, value.cast(to_type)).capitalize() if value.type() == ibis_dtypes.bool and to_type == ibis_dtypes.float64: - return value.cast(ibis_dtypes.int64).cast(ibis_dtypes.float64) + if safe: + return value.try_cast(ibis_dtypes.int64).try_cast(ibis_dtypes.float64) + else: + return value.cast(ibis_dtypes.int64).cast(ibis_dtypes.float64) if value.type() == ibis_dtypes.float64 and to_type == ibis_dtypes.bool: return value != ibis_types.literal(0) diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index 80e354aa8c..e7526ca48b 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -947,7 +947,9 @@ def struct_field_op_impl(x: ibis_types.Value, op: ops.StructFieldOp): return 
result.cast(result.type()(nullable=True)).name(name) -def numeric_to_datetime(x: ibis_types.Value, unit: str) -> ibis_types.TimestampValue: +def numeric_to_datetime( + x: ibis_types.Value, unit: str, safe: bool = False +) -> ibis_types.TimestampValue: if not isinstance(x, ibis_types.IntegerValue) and not isinstance( x, ibis_types.FloatingValue ): @@ -956,7 +958,11 @@ def numeric_to_datetime(x: ibis_types.Value, unit: str) -> ibis_types.TimestampV if unit not in UNIT_TO_US_CONVERSION_FACTORS: raise ValueError(f"Cannot convert input with unit '{unit}'.") x_converted = x * UNIT_TO_US_CONVERSION_FACTORS[unit] - x_converted = x_converted.cast(ibis_dtypes.int64) + x_converted = ( + x_converted.try_cast(ibis_dtypes.int64) + if safe + else x_converted.cast(ibis_dtypes.int64) + ) # Note: Due to an issue where casting directly to a timestamp # without a timezone does not work, we first cast to UTC. This @@ -978,8 +984,11 @@ def astype_op_impl(x: ibis_types.Value, op: ops.AsTypeOp): # When casting DATETIME column into INT column, we need to convert the column into TIMESTAMP first. if to_type == ibis_dtypes.int64 and x.type() == ibis_dtypes.timestamp: - x_converted = x.cast(ibis_dtypes.Timestamp(timezone="UTC")) - return bigframes.core.compile.ibis_types.cast_ibis_value(x_converted, to_type) + utc_time_type = ibis_dtypes.Timestamp(timezone="UTC") + x_converted = x.try_cast(utc_time_type) if op.safe else x.cast(utc_time_type) + return bigframes.core.compile.ibis_types.cast_ibis_value( + x_converted, to_type, safe=op.safe + ) if to_type == ibis_dtypes.int64 and x.type() == ibis_dtypes.time: # The conversion unit is set to "us" (microseconds) for consistency @@ -991,15 +1000,20 @@ def astype_op_impl(x: ibis_types.Value, op: ops.AsTypeOp): # with pandas converting int64[pyarrow] to timestamp[us][pyarrow], # timestamp[us, tz=UTC][pyarrow], and time64[us][pyarrow]. 
unit = "us" - x_converted = numeric_to_datetime(x, unit) + x_converted = numeric_to_datetime(x, unit, safe=op.safe) if to_type == ibis_dtypes.timestamp: - return x_converted.cast(ibis_dtypes.Timestamp()) + return ( + x_converted.try_cast(ibis_dtypes.Timestamp()) + if op.safe + else x_converted.cast(ibis_dtypes.Timestamp()) + ) elif to_type == ibis_dtypes.Timestamp(timezone="UTC"): return x_converted elif to_type == ibis_dtypes.time: return x_converted.time() - return bigframes.core.compile.ibis_types.cast_ibis_value(x, to_type) + # TODO: either inline this function, or push rest of this op into the function + return bigframes.core.compile.ibis_types.cast_ibis_value(x, to_type, safe=op.safe) @scalar_op_compiler.register_unary_op(ops.IsInOp, pass_op=True) diff --git a/bigframes/core/indexes/base.py b/bigframes/core/indexes/base.py index 0ba79bebee..f2a0e9dd29 100644 --- a/bigframes/core/indexes/base.py +++ b/bigframes/core/indexes/base.py @@ -17,7 +17,7 @@ from __future__ import annotations import typing -from typing import Hashable, Optional, Sequence, Union +from typing import Hashable, Literal, Optional, Sequence, Union import bigframes_vendored.constants as constants import bigframes_vendored.pandas.core.indexes.base as vendored_pandas_index @@ -324,11 +324,17 @@ def sort_values(self, *, ascending: bool = True, na_position: str = "last"): def astype( self, dtype: Union[bigframes.dtypes.DtypeString, bigframes.dtypes.Dtype], + *, + errors: Literal["raise", "null"] = "raise", ) -> Index: + if errors not in ["raise", "null"]: + raise ValueError("Arg 'errors' must be one of 'raise' or 'null'") if self.nlevels > 1: raise TypeError("Multiindex does not support 'astype'") return self._apply_unary_expr( - ops.AsTypeOp(to_type=dtype).as_expr(ex.free_var("arg")) + ops.AsTypeOp(to_type=dtype, safe=(errors == "null")).as_expr( + ex.free_var("arg") + ) ) def all(self) -> bool: diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index d5cac76e9b..0b639a5649 100644 ---
a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -365,8 +365,14 @@ def __iter__(self): def astype( self, dtype: Union[bigframes.dtypes.DtypeString, bigframes.dtypes.Dtype], + *, + errors: Literal["raise", "null"] = "raise", ) -> DataFrame: - return self._apply_unary_op(ops.AsTypeOp(to_type=dtype)) + if errors not in ["raise", "null"]: + raise ValueError("Arg 'errors' must be one of 'raise' or 'null'") + return self._apply_unary_op( + ops.AsTypeOp(to_type=dtype, safe=(errors == "null")) + ) def _to_sql_query( self, include_index: bool, enable_cache: bool = True diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index 2e2e4a0552..fa38be368f 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -494,6 +494,7 @@ class AsTypeOp(UnaryOp): name: typing.ClassVar[str] = "astype" # TODO: Convert strings to dtype earlier to_type: dtypes.DtypeString | dtypes.Dtype + safe: bool = False def output_type(self, *input_types): # TODO: We should do this conversion earlier diff --git a/bigframes/series.py b/bigframes/series.py index 1d44cdd963..5d4771c073 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -352,8 +352,14 @@ def __repr__(self) -> str: def astype( self, dtype: Union[bigframes.dtypes.DtypeString, bigframes.dtypes.Dtype], + *, + errors: Literal["raise", "null"] = "raise", ) -> Series: - return self._apply_unary_op(bigframes.operations.AsTypeOp(to_type=dtype)) + if errors not in ["raise", "null"]: + raise ValueError("Arg 'errors' must be one of 'raise' or 'null'") + return self._apply_unary_op( + bigframes.operations.AsTypeOp(to_type=dtype, safe=(errors == "null")) + ) def to_pandas( self, diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 02b771a1bd..f69eb2eb4a 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -3687,6 +3687,12 @@ def test_df_add_suffix(scalars_df_index, scalars_pandas_df_index, axis): )
+def test_df_astype_error_error(session): + input = pd.DataFrame(["hello", "world", "3.11", "4000"]) + with pytest.raises(ValueError): + session.read_pandas(input).astype("Float64", errors="bad_value") + + def test_df_columns_filter_items(scalars_df_index, scalars_pandas_df_index): if pd.__version__.startswith("2.0") or pd.__version__.startswith("1."): pytest.skip("pandas filter items behavior different pre-2.1") diff --git a/tests/system/small/test_index.py b/tests/system/small/test_index.py index d68cf6c3f3..cdf4fa6511 100644 --- a/tests/system/small/test_index.py +++ b/tests/system/small/test_index.py @@ -123,6 +123,12 @@ def test_index_astype(scalars_df_index, scalars_pandas_df_index): pd.testing.assert_index_equal(bf_result, pd_result) +def test_index_astype_error_error(session): + input = pd.Index(["hello", "world", "3.11", "4000"]) + with pytest.raises(ValueError): + session.read_pandas(input).astype("Float64", errors="bad_value") + + def test_index_any(scalars_df_index, scalars_pandas_df_index): bf_result = scalars_df_index.set_index("int64_col").index.any() pd_result = scalars_pandas_df_index.set_index("int64_col").index.any() diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index b906f452b7..d83dfe6de0 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -3087,6 +3087,7 @@ def foo(x): assert_series_equal(bf_result, pd_result, check_dtype=False) +@pytest.mark.parametrize("errors", ["raise", "null"]) @pytest.mark.parametrize( ("column", "to_type"), [ @@ -3102,6 +3103,7 @@ def foo(x): ("int64_col", "time64[us][pyarrow]"), ("bool_col", "Int64"), ("bool_col", "string[pyarrow]"), + ("bool_col", "Float64"), ("string_col", "binary[pyarrow]"), ("bytes_col", "string[pyarrow]"), # pandas actually doesn't let folks convert to/from naive timestamp and @@ -3137,12 +3139,29 @@ def foo(x): ], ) @skip_legacy_pandas -def test_astype(scalars_df_index, scalars_pandas_df_index, column, to_type): - 
bf_result = scalars_df_index[column].astype(to_type).to_pandas() +def test_astype(scalars_df_index, scalars_pandas_df_index, column, to_type, errors): + bf_result = scalars_df_index[column].astype(to_type, errors=errors).to_pandas() pd_result = scalars_pandas_df_index[column].astype(to_type) pd.testing.assert_series_equal(bf_result, pd_result) +def test_astype_safe(session): + input = pd.Series(["hello", "world", "3.11", "4000"]) + expected = pd.Series( + [None, None, 3.11, 4000], + dtype="Float64", + index=pd.Index([0, 1, 2, 3], dtype="Int64"), + ) + result = session.read_pandas(input).astype("Float64", errors="null").to_pandas() + pd.testing.assert_series_equal(result, expected) + + +def test_series_astype_error_error(session): + input = pd.Series(["hello", "world", "3.11", "4000"]) + with pytest.raises(ValueError): + session.read_pandas(input).astype("Float64", errors="bad_value") + + @skip_legacy_pandas def test_astype_numeric_to_int(scalars_df_index, scalars_pandas_df_index): column = "numeric_col" diff --git a/third_party/bigframes_vendored/pandas/core/generic.py b/third_party/bigframes_vendored/pandas/core/generic.py index 101cdc5bd9..83a24f7a9c 100644 --- a/third_party/bigframes_vendored/pandas/core/generic.py +++ b/third_party/bigframes_vendored/pandas/core/generic.py @@ -180,6 +180,10 @@ def astype(self, dtype): ``pd.ArrowDtype(pa.time64("us"))``, ``pd.ArrowDtype(pa.timestamp("us"))``, ``pd.ArrowDtype(pa.timestamp("us", tz="UTC"))``. + errors ({'raise', 'null'}, default 'raise'): + Control raising of exceptions on invalid data for provided dtype.
+ If 'raise', allow exceptions to be raised if any value fails cast + If 'null', will assign null value if value fails cast Returns: bigframes.pandas.DataFrame: diff --git a/third_party/bigframes_vendored/pandas/core/indexes/base.py b/third_party/bigframes_vendored/pandas/core/indexes/base.py index b0e1a09392..84a4aa9664 100644 --- a/third_party/bigframes_vendored/pandas/core/indexes/base.py +++ b/third_party/bigframes_vendored/pandas/core/indexes/base.py @@ -106,6 +106,11 @@ def astype(self, dtype): Args: dtype (numpy dtype or pandas type): + A dtype supported by BigQuery DataFrames + errors ({'raise', 'null'}, default 'raise'): + Control raising of exceptions on invalid data for provided dtype. + If 'raise', allow exceptions to be raised if any value fails cast + If 'null', will assign null value if value fails cast Returns: Index: Index with values cast to specified dtype.