Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add astype(type, errors='null') to cast safely #1122

Merged
merged 3 commits into from
Nov 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 10 additions & 4 deletions bigframes/core/compile/ibis_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@


def cast_ibis_value(
value: ibis_types.Value, to_type: ibis_dtypes.DataType
value: ibis_types.Value, to_type: ibis_dtypes.DataType, safe: bool = False
) -> ibis_types.Value:
"""Perform compatible type casts of ibis values

Expand Down Expand Up @@ -176,7 +176,7 @@ def cast_ibis_value(
value = ibis_value_to_canonical_type(value)
if value.type() in good_casts:
if to_type in good_casts[value.type()]:
return value.cast(to_type)
return value.try_cast(to_type) if safe else value.cast(to_type)
else:
# this should never happen
raise TypeError(
Expand All @@ -188,10 +188,16 @@ def cast_ibis_value(
# BigQuery casts bools to lower case strings. Capitalize the result to match Pandas
# TODO(bmil): remove this workaround after fixing Ibis
if value.type() == ibis_dtypes.bool and to_type == ibis_dtypes.string:
return cast(ibis_types.StringValue, value.cast(to_type)).capitalize()
if safe:
return cast(ibis_types.StringValue, value.try_cast(to_type)).capitalize()
else:
return cast(ibis_types.StringValue, value.cast(to_type)).capitalize()

if value.type() == ibis_dtypes.bool and to_type == ibis_dtypes.float64:
return value.cast(ibis_dtypes.int64).cast(ibis_dtypes.float64)
if safe:
return value.try_cast(ibis_dtypes.int64).try_cast(ibis_dtypes.float64)
else:
return value.cast(ibis_dtypes.int64).cast(ibis_dtypes.float64)

if value.type() == ibis_dtypes.float64 and to_type == ibis_dtypes.bool:
return value != ibis_types.literal(0)
Expand Down
28 changes: 21 additions & 7 deletions bigframes/core/compile/scalar_op_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -947,7 +947,9 @@ def struct_field_op_impl(x: ibis_types.Value, op: ops.StructFieldOp):
return result.cast(result.type()(nullable=True)).name(name)


def numeric_to_datetime(x: ibis_types.Value, unit: str) -> ibis_types.TimestampValue:
def numeric_to_datetime(
x: ibis_types.Value, unit: str, safe: bool = False
) -> ibis_types.TimestampValue:
if not isinstance(x, ibis_types.IntegerValue) and not isinstance(
x, ibis_types.FloatingValue
):
Expand All @@ -956,7 +958,11 @@ def numeric_to_datetime(x: ibis_types.Value, unit: str) -> ibis_types.TimestampV
if unit not in UNIT_TO_US_CONVERSION_FACTORS:
raise ValueError(f"Cannot convert input with unit '{unit}'.")
x_converted = x * UNIT_TO_US_CONVERSION_FACTORS[unit]
x_converted = x_converted.cast(ibis_dtypes.int64)
x_converted = (
x_converted.try_cast(ibis_dtypes.int64)
if safe
else x_converted.cast(ibis_dtypes.int64)
)

# Note: Due to an issue where casting directly to a timestamp
# without a timezone does not work, we first cast to UTC. This
Expand All @@ -978,8 +984,11 @@ def astype_op_impl(x: ibis_types.Value, op: ops.AsTypeOp):

# When casting DATETIME column into INT column, we need to convert the column into TIMESTAMP first.
if to_type == ibis_dtypes.int64 and x.type() == ibis_dtypes.timestamp:
x_converted = x.cast(ibis_dtypes.Timestamp(timezone="UTC"))
return bigframes.core.compile.ibis_types.cast_ibis_value(x_converted, to_type)
utc_time_type = ibis_dtypes.Timestamp(timezone="UTC")
x_converted = x.try_cast(utc_time_type) if op.safe else x.cast(utc_time_type)
return bigframes.core.compile.ibis_types.cast_ibis_value(
x_converted, to_type, safe=op.safe
)

if to_type == ibis_dtypes.int64 and x.type() == ibis_dtypes.time:
# The conversion unit is set to "us" (microseconds) for consistency
Expand All @@ -991,15 +1000,20 @@ def astype_op_impl(x: ibis_types.Value, op: ops.AsTypeOp):
# with pandas converting int64[pyarrow] to timestamp[us][pyarrow],
# timestamp[us, tz=UTC][pyarrow], and time64[us][pyarrow].
unit = "us"
x_converted = numeric_to_datetime(x, unit)
x_converted = numeric_to_datetime(x, unit, safe=op.safe)
if to_type == ibis_dtypes.timestamp:
return x_converted.cast(ibis_dtypes.Timestamp())
return (
x_converted.try_cast(ibis_dtypes.Timestamp())
if op.safe
else x_converted.cast(ibis_dtypes.Timestamp())
)
elif to_type == ibis_dtypes.Timestamp(timezone="UTC"):
return x_converted
elif to_type == ibis_dtypes.time:
return x_converted.time()

return bigframes.core.compile.ibis_types.cast_ibis_value(x, to_type)
# TODO: either inline this function, or push rest of this op into the function
return bigframes.core.compile.ibis_types.cast_ibis_value(x, to_type, safe=op.safe)


@scalar_op_compiler.register_unary_op(ops.IsInOp, pass_op=True)
Expand Down
10 changes: 8 additions & 2 deletions bigframes/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from __future__ import annotations

import typing
from typing import Hashable, Optional, Sequence, Union
from typing import Hashable, Literal, Optional, Sequence, Union

import bigframes_vendored.constants as constants
import bigframes_vendored.pandas.core.indexes.base as vendored_pandas_index
Expand Down Expand Up @@ -324,11 +324,17 @@ def sort_values(self, *, ascending: bool = True, na_position: str = "last"):
def astype(
    self,
    dtype: Union[bigframes.dtypes.DtypeString, bigframes.dtypes.Dtype],
    *,
    errors: Literal["raise", "null"] = "raise",
) -> Index:
    """Create an Index with values cast to ``dtype``.

    Args:
        dtype: A dtype supported by BigQuery DataFrames.
        errors: Control raising of exceptions on invalid data for the
            provided dtype. If 'raise' (default), invalid casts raise;
            if 'null', values that fail to cast become null instead.

    Returns:
        Index: Index with values cast to the specified dtype.

    Raises:
        ValueError: If ``errors`` is not one of 'raise' or 'null'.
        TypeError: If called on a MultiIndex (more than one level).
    """
    if errors not in ["raise", "null"]:
        # Message names the actual parameter ('errors', not 'error').
        raise ValueError("Argument 'errors' must be one of 'raise' or 'null'")
    if self.nlevels > 1:
        raise TypeError("Multiindex does not support 'astype'")
    # safe=True maps to SQL SAFE_CAST semantics (nulls instead of errors).
    return self._apply_unary_expr(
        ops.AsTypeOp(to_type=dtype, safe=(errors == "null")).as_expr(
            ex.free_var("arg")
        )
    )

def all(self) -> bool:
Expand Down
8 changes: 7 additions & 1 deletion bigframes/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -365,8 +365,14 @@ def __iter__(self):
def astype(
    self,
    dtype: Union[bigframes.dtypes.DtypeString, bigframes.dtypes.Dtype],
    *,
    errors: Literal["raise", "null"] = "raise",
) -> DataFrame:
    """Create a DataFrame with all columns cast to ``dtype``.

    Args:
        dtype: A dtype supported by BigQuery DataFrames.
        errors: Control raising of exceptions on invalid data for the
            provided dtype. If 'raise' (default), invalid casts raise;
            if 'null', values that fail to cast become null instead.

    Returns:
        DataFrame: DataFrame with values cast to the specified dtype.

    Raises:
        ValueError: If ``errors`` is not one of 'raise' or 'null'.
    """
    if errors not in ["raise", "null"]:
        # Message names the actual parameter ('errors', not 'error').
        raise ValueError("Argument 'errors' must be one of 'raise' or 'null'")
    # safe=True maps to SQL SAFE_CAST semantics (nulls instead of errors).
    return self._apply_unary_op(
        ops.AsTypeOp(to_type=dtype, safe=(errors == "null"))
    )

def _to_sql_query(
self, include_index: bool, enable_cache: bool = True
Expand Down
1 change: 1 addition & 0 deletions bigframes/operations/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -494,6 +494,7 @@ class AsTypeOp(UnaryOp):
name: typing.ClassVar[str] = "astype"
# TODO: Convert strings to dtype earlier
to_type: dtypes.DtypeString | dtypes.Dtype
safe: bool = False

def output_type(self, *input_types):
# TODO: We should do this conversion earlier
Expand Down
8 changes: 7 additions & 1 deletion bigframes/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -352,8 +352,14 @@ def __repr__(self) -> str:
def astype(
    self,
    dtype: Union[bigframes.dtypes.DtypeString, bigframes.dtypes.Dtype],
    *,
    errors: Literal["raise", "null"] = "raise",
) -> Series:
    """Create a Series with values cast to ``dtype``.

    Args:
        dtype: A dtype supported by BigQuery DataFrames.
        errors: Control raising of exceptions on invalid data for the
            provided dtype. If 'raise' (default), invalid casts raise;
            if 'null', values that fail to cast become null instead.

    Returns:
        Series: Series with values cast to the specified dtype.

    Raises:
        ValueError: If ``errors`` is not one of 'raise' or 'null'.
    """
    if errors not in ["raise", "null"]:
        # Message names the actual parameter ('errors', not 'error').
        raise ValueError("Argument 'errors' must be one of 'raise' or 'null'")
    # safe=True maps to SQL SAFE_CAST semantics (nulls instead of errors).
    return self._apply_unary_op(
        bigframes.operations.AsTypeOp(to_type=dtype, safe=(errors == "null"))
    )

def to_pandas(
self,
Expand Down
6 changes: 6 additions & 0 deletions tests/system/small/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -3687,6 +3687,12 @@ def test_df_add_suffix(scalars_df_index, scalars_pandas_df_index, axis):
)


def test_df_astype_error_error(session):
    """astype must reject an ``errors`` value other than 'raise' or 'null'."""
    # Renamed from `input` to avoid shadowing the builtin.
    strings_df = pd.DataFrame(["hello", "world", "3.11", "4000"])
    with pytest.raises(ValueError):
        session.read_pandas(strings_df).astype("Float64", errors="bad_value")


def test_df_columns_filter_items(scalars_df_index, scalars_pandas_df_index):
if pd.__version__.startswith("2.0") or pd.__version__.startswith("1."):
pytest.skip("pandas filter items behavior different pre-2.1")
Expand Down
6 changes: 6 additions & 0 deletions tests/system/small/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,12 @@ def test_index_astype(scalars_df_index, scalars_pandas_df_index):
pd.testing.assert_index_equal(bf_result, pd_result)


def test_index_astype_error_error(session):
    """Index.astype must reject an ``errors`` value other than 'raise' or 'null'."""
    # Renamed from `input` to avoid shadowing the builtin.
    strings_index = pd.Index(["hello", "world", "3.11", "4000"])
    with pytest.raises(ValueError):
        session.read_pandas(strings_index).astype("Float64", errors="bad_value")


def test_index_any(scalars_df_index, scalars_pandas_df_index):
bf_result = scalars_df_index.set_index("int64_col").index.any()
pd_result = scalars_pandas_df_index.set_index("int64_col").index.any()
Expand Down
23 changes: 21 additions & 2 deletions tests/system/small/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -3087,6 +3087,7 @@ def foo(x):
assert_series_equal(bf_result, pd_result, check_dtype=False)


@pytest.mark.parametrize("errors", ["raise", "null"])
@pytest.mark.parametrize(
("column", "to_type"),
[
Expand All @@ -3102,6 +3103,7 @@ def foo(x):
("int64_col", "time64[us][pyarrow]"),
("bool_col", "Int64"),
("bool_col", "string[pyarrow]"),
("bool_col", "Float64"),
("string_col", "binary[pyarrow]"),
("bytes_col", "string[pyarrow]"),
# pandas actually doesn't let folks convert to/from naive timestamp and
Expand Down Expand Up @@ -3137,12 +3139,29 @@ def foo(x):
],
)
@skip_legacy_pandas
def test_astype(scalars_df_index, scalars_pandas_df_index, column, to_type, errors):
    """Casting valid data matches pandas astype for both errors='raise' and 'null'."""
    bf_result = scalars_df_index[column].astype(to_type, errors=errors).to_pandas()
    pd_result = scalars_pandas_df_index[column].astype(to_type)
    pd.testing.assert_series_equal(bf_result, pd_result)


def test_astype_safe(session):
    """errors='null' yields nulls for uncastable values instead of raising."""
    # Renamed from `input` to avoid shadowing the builtin.
    strings = pd.Series(["hello", "world", "3.11", "4000"])
    # Fixed misspelled local name `exepcted` -> `expected`.
    expected = pd.Series(
        [None, None, 3.11, 4000],
        dtype="Float64",
        index=pd.Index([0, 1, 2, 3], dtype="Int64"),
    )
    result = session.read_pandas(strings).astype("Float64", errors="null").to_pandas()
    pd.testing.assert_series_equal(result, expected)


def test_series_astype_error_error(session):
    """Series.astype must reject an ``errors`` value other than 'raise' or 'null'."""
    # Renamed from `input` to avoid shadowing the builtin.
    strings = pd.Series(["hello", "world", "3.11", "4000"])
    with pytest.raises(ValueError):
        session.read_pandas(strings).astype("Float64", errors="bad_value")


@skip_legacy_pandas
def test_astype_numeric_to_int(scalars_df_index, scalars_pandas_df_index):
column = "numeric_col"
Expand Down
4 changes: 4 additions & 0 deletions third_party/bigframes_vendored/pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,10 @@ def astype(self, dtype):
``pd.ArrowDtype(pa.time64("us"))``,
``pd.ArrowDtype(pa.timestamp("us"))``,
``pd.ArrowDtype(pa.timestamp("us", tz="UTC"))``.
errors ({'raise', 'null'}, default 'raise'):
Control raising of exceptions on invalid data for the provided dtype.
If 'raise', allow exceptions to be raised if any value fails to cast.
If 'null', assign a null value if a value fails to cast.

Returns:
bigframes.pandas.DataFrame:
Expand Down
5 changes: 5 additions & 0 deletions third_party/bigframes_vendored/pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,11 @@ def astype(self, dtype):

Args:
dtype (numpy dtype or pandas type):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it str or pandas.ExtensionDtype as the list in the generic.py?

A dtype supported by BigQuery DataFrames
errors ({'raise', 'null'}, default 'raise'):
Control raising of exceptions on invalid data for the provided dtype.
If 'raise', allow exceptions to be raised if any value fails to cast.
If 'null', assign a null value if a value fails to cast.

Returns:
Index: Index with values cast to specified dtype.
Expand Down