From 64e913d73832f6363466cbea5ace2337c86fa58b Mon Sep 17 00:00:00 2001 From: chelsea-lin <124939984+chelsea-lin@users.noreply.github.com> Date: Tue, 18 Apr 2023 15:14:47 -0700 Subject: [PATCH] feat: add date, datetime, time, timestamp dtype to to_dataframe (#1547) --- google/cloud/bigquery/_pandas_helpers.py | 27 ++- google/cloud/bigquery/enums.py | 6 + google/cloud/bigquery/job/query.py | 64 ++++++- google/cloud/bigquery/table.py | 128 +++++++++++--- tests/system/test_pandas.py | 98 +++++++++++ tests/unit/test_table.py | 209 ++++++++++++++++++++++- 6 files changed, 494 insertions(+), 38 deletions(-) diff --git a/google/cloud/bigquery/_pandas_helpers.py b/google/cloud/bigquery/_pandas_helpers.py index 601aa13df..a14dbec9b 100644 --- a/google/cloud/bigquery/_pandas_helpers.py +++ b/google/cloud/bigquery/_pandas_helpers.py @@ -290,6 +290,10 @@ def default_types_mapper( int_dtype: Union[Any, None] = None, float_dtype: Union[Any, None] = None, string_dtype: Union[Any, None] = None, + date_dtype: Union[Any, None] = None, + datetime_dtype: Union[Any, None] = None, + time_dtype: Union[Any, None] = None, + timestamp_dtype: Union[Any, None] = None, ): """Create a mapping from pyarrow types to pandas types. @@ -321,13 +325,28 @@ def types_mapper(arrow_data_type): elif ( # If date_as_object is True, we know some DATE columns are # out-of-bounds of what is supported by pandas. - not date_as_object + date_dtype is not None + and not date_as_object and pyarrow.types.is_date(arrow_data_type) ): - return db_dtypes.DateDtype() + return date_dtype - elif pyarrow.types.is_time(arrow_data_type): - return db_dtypes.TimeDtype() + elif ( + datetime_dtype is not None + and pyarrow.types.is_timestamp(arrow_data_type) + and arrow_data_type.tz is None + ): + return datetime_dtype + + elif ( + timestamp_dtype is not None + and pyarrow.types.is_timestamp(arrow_data_type) + and arrow_data_type.tz is not None + ): + return timestamp_dtype + + elif time_dtype is not None and pyarrow.types.is_time(arrow_data_type): + return time_dtype return types_mapper diff --git a/google/cloud/bigquery/enums.py b/google/cloud/bigquery/enums.py index e4e3d22fc..553853630 100644 --- a/google/cloud/bigquery/enums.py +++ b/google/cloud/bigquery/enums.py @@ -90,6 +90,12 @@ class DefaultPandasDTypes(enum.Enum): INT_DTYPE = object() """Specifies default integer dtype""" + DATE_DTYPE = object() + """Specifies default date dtype""" + + TIME_DTYPE = object() + """Specifies default time dtype""" + class DestinationFormat(object): """The exported file format. The default value is :attr:`CSV`. diff --git a/google/cloud/bigquery/job/query.py b/google/cloud/bigquery/job/query.py index 62668c601..315d8201c 100644 --- a/google/cloud/bigquery/job/query.py +++ b/google/cloud/bigquery/job/query.py @@ -58,6 +58,11 @@ except ImportError: # pragma: NO COVER pandas = None +try: + import db_dtypes # type: ignore +except ImportError: # pragma: NO COVER + db_dtypes = None + if typing.TYPE_CHECKING: # pragma: NO COVER # Assumption: type checks are only used by library developers and CI environments # that have all optional dependencies installed, thus no conditional imports. 
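A note on the mapper above: BigQuery DATETIME and TIMESTAMP columns both arrive as pyarrow timestamp types, and the two new branches differ only on the type's tz attribute. A minimal sketch of that dispatch rule, using only public pyarrow APIs (variable names here are illustrative, not part of the patch):

    import pyarrow

    naive = pyarrow.timestamp("us")            # BigQuery DATETIME: no time zone
    aware = pyarrow.timestamp("us", tz="UTC")  # BigQuery TIMESTAMP: zone-aware

    # datetime_dtype applies to the naive form, timestamp_dtype to the aware one.
    assert pyarrow.types.is_timestamp(naive) and naive.tz is None
    assert pyarrow.types.is_timestamp(aware) and aware.tz == "UTC"

The elif ordering therefore routes each timestamp column to exactly one of the two user-supplied dtypes.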
@@ -1637,6 +1642,10 @@ def to_dataframe(
         int_dtype: Union[Any, None] = DefaultPandasDTypes.INT_DTYPE,
         float_dtype: Union[Any, None] = None,
         string_dtype: Union[Any, None] = None,
+        date_dtype: Union[Any, None] = DefaultPandasDTypes.DATE_DTYPE,
+        datetime_dtype: Union[Any, None] = None,
+        time_dtype: Union[Any, None] = DefaultPandasDTypes.TIME_DTYPE,
+        timestamp_dtype: Union[Any, None] = None,
     ) -> "pandas.DataFrame":
         """Return a pandas DataFrame from a QueryJob
@@ -1697,7 +1706,7 @@ def to_dataframe(
                 type can be found at:
                 https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#boolean_type

-                .. versionadded:: 3.7.1
+                .. versionadded:: 3.8.0

             int_dtype (Optional[pandas.Series.dtype, None]):
                 If set, indicate a pandas ExtensionDtype (e.g. ``pandas.Int64Dtype()``)
@@ -1707,7 +1716,7 @@ def to_dataframe(
                 Integer types can be found at:
                 https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#integer_types

-                .. versionadded:: 3.7.1
+                .. versionadded:: 3.8.0

             float_dtype (Optional[pandas.Series.dtype, None]):
                 If set, indicate a pandas ExtensionDtype (e.g. ``pandas.Float32Dtype()``)
@@ -1717,7 +1726,7 @@ def to_dataframe(
                 type can be found at:
                 https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#floating_point_types

-                .. versionadded:: 3.7.1
+                .. versionadded:: 3.8.0

             string_dtype (Optional[pandas.Series.dtype, None]):
                 If set, indicate a pandas ExtensionDtype (e.g. ``pandas.StringDtype()``) to
@@ -1727,7 +1736,50 @@ def to_dataframe(
                 type can be found at:
                 https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#string_type

-                .. versionadded:: 3.7.1
+                .. versionadded:: 3.8.0
+
+            date_dtype (Optional[pandas.Series.dtype, None]):
+                If set, indicate a pandas ExtensionDtype (e.g.
+                ``pandas.ArrowDtype(pyarrow.date32())``) to convert BigQuery Date
+                type, instead of relying on the default ``db_dtypes.DateDtype()``.
+                If you explicitly set the value to ``None``, then the data type will be
+                ``numpy.dtype("datetime64[ns]")`` or ``object`` if out of bounds. BigQuery
+                Date type can be found at:
+                https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#date_type
+
+                .. versionadded:: 3.10.0
+
+            datetime_dtype (Optional[pandas.Series.dtype, None]):
+                If set, indicate a pandas ExtensionDtype (e.g.
+                ``pandas.ArrowDtype(pyarrow.timestamp("us"))``) to convert BigQuery Datetime
+                type, instead of relying on the default ``numpy.dtype("datetime64[ns]")``.
+                If you explicitly set the value to ``None``, then the data type will be
+                ``numpy.dtype("datetime64[ns]")`` or ``object`` if out of bounds. BigQuery
+                Datetime type can be found at:
+                https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#datetime_type
+
+                .. versionadded:: 3.10.0
+
+            time_dtype (Optional[pandas.Series.dtype, None]):
+                If set, indicate a pandas ExtensionDtype (e.g.
+                ``pandas.ArrowDtype(pyarrow.time64("us"))``) to convert BigQuery Time
+                type, instead of relying on the default ``db_dtypes.TimeDtype()``.
+                If you explicitly set the value to ``None``, then the data type will be
+                ``numpy.dtype("object")``. BigQuery Time type can be found at:
+                https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#time_type
+
+                .. versionadded:: 3.10.0
+
+            timestamp_dtype (Optional[pandas.Series.dtype, None]):
+                If set, indicate a pandas ExtensionDtype (e.g.
+                ``pandas.ArrowDtype(pyarrow.timestamp("us", tz="UTC"))``) to convert BigQuery Timestamp
+                type, instead of relying on the default ``numpy.dtype("datetime64[ns, UTC]")``.
+                If you explicitly set the value to ``None``, then the data type will be
+                ``numpy.dtype("datetime64[ns, UTC]")`` or ``object`` if out of bounds. BigQuery
+                Timestamp type can be found at:
+                https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#timestamp_type
+
+                .. versionadded:: 3.10.0

         Returns:
             pandas.DataFrame:
@@ -1755,6 +1807,10 @@ def to_dataframe(
             int_dtype=int_dtype,
             float_dtype=float_dtype,
             string_dtype=string_dtype,
+            date_dtype=date_dtype,
+            datetime_dtype=datetime_dtype,
+            time_dtype=time_dtype,
+            timestamp_dtype=timestamp_dtype,
         )

     # If changing the signature of this method, make sure to apply the same
diff --git a/google/cloud/bigquery/table.py b/google/cloud/bigquery/table.py
index 93b0da67f..a34e5dc25 100644
--- a/google/cloud/bigquery/table.py
+++ b/google/cloud/bigquery/table.py
@@ -1935,6 +1935,10 @@ def to_dataframe(
         int_dtype: Union[Any, None] = DefaultPandasDTypes.INT_DTYPE,
         float_dtype: Union[Any, None] = None,
         string_dtype: Union[Any, None] = None,
+        date_dtype: Union[Any, None] = DefaultPandasDTypes.DATE_DTYPE,
+        datetime_dtype: Union[Any, None] = None,
+        time_dtype: Union[Any, None] = DefaultPandasDTypes.TIME_DTYPE,
+        timestamp_dtype: Union[Any, None] = None,
     ) -> "pandas.DataFrame":
         """Create a pandas DataFrame by loading all pages of a query.
@@ -1999,7 +2003,7 @@ def to_dataframe(
                 type can be found at:
                 https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#boolean_type

-                .. versionadded:: 3.7.1
+                .. versionadded:: 3.8.0

             int_dtype (Optional[pandas.Series.dtype, None]):
                 If set, indicate a pandas ExtensionDtype (e.g. ``pandas.Int64Dtype()``)
@@ -2009,7 +2013,7 @@ def to_dataframe(
                 Integer types can be found at:
                 https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#integer_types

-                .. versionadded:: 3.7.1
+                .. versionadded:: 3.8.0

             float_dtype (Optional[pandas.Series.dtype, None]):
                 If set, indicate a pandas ExtensionDtype (e.g. ``pandas.Float32Dtype()``)
@@ -2019,7 +2023,7 @@ def to_dataframe(
                 type can be found at:
                 https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#floating_point_types

-                .. versionadded:: 3.7.1
+                .. versionadded:: 3.8.0

             string_dtype (Optional[pandas.Series.dtype, None]):
                 If set, indicate a pandas ExtensionDtype (e.g. ``pandas.StringDtype()``) to
@@ -2029,7 +2033,50 @@ def to_dataframe(
                 type can be found at:
                 https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#string_type

-                .. versionadded:: 3.7.1
+                .. versionadded:: 3.8.0
+
+            date_dtype (Optional[pandas.Series.dtype, None]):
+                If set, indicate a pandas ExtensionDtype (e.g.
+                ``pandas.ArrowDtype(pyarrow.date32())``) to convert BigQuery Date
+                type, instead of relying on the default ``db_dtypes.DateDtype()``.
+                If you explicitly set the value to ``None``, then the data type will be
+                ``numpy.dtype("datetime64[ns]")`` or ``object`` if out of bounds. BigQuery
+                Date type can be found at:
+                https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#date_type
+
+                .. versionadded:: 3.10.0
+
+            datetime_dtype (Optional[pandas.Series.dtype, None]):
+                If set, indicate a pandas ExtensionDtype (e.g.
+                ``pandas.ArrowDtype(pyarrow.timestamp("us"))``) to convert BigQuery Datetime
+                type, instead of relying on the default ``numpy.dtype("datetime64[ns]")``.
+                If you explicitly set the value to ``None``, then the data type will be
+                ``numpy.dtype("datetime64[ns]")`` or ``object`` if out of bounds. BigQuery
+                Datetime type can be found at:
+                https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#datetime_type
+
+                .. versionadded:: 3.10.0
+
+            time_dtype (Optional[pandas.Series.dtype, None]):
+                If set, indicate a pandas ExtensionDtype (e.g.
+                ``pandas.ArrowDtype(pyarrow.time64("us"))``) to convert BigQuery Time
+                type, instead of relying on the default ``db_dtypes.TimeDtype()``.
+                If you explicitly set the value to ``None``, then the data type will be
+                ``numpy.dtype("object")``. BigQuery Time type can be found at:
+                https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#time_type
+
+                .. versionadded:: 3.10.0
+
+            timestamp_dtype (Optional[pandas.Series.dtype, None]):
+                If set, indicate a pandas ExtensionDtype (e.g.
+                ``pandas.ArrowDtype(pyarrow.timestamp("us", tz="UTC"))``) to convert BigQuery Timestamp
+                type, instead of relying on the default ``numpy.dtype("datetime64[ns, UTC]")``.
+                If you explicitly set the value to ``None``, then the data type will be
+                ``numpy.dtype("datetime64[ns, UTC]")`` or ``object`` if out of bounds. BigQuery
+                Timestamp type can be found at:
+                https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#timestamp_type
+
+                .. versionadded:: 3.10.0

         Returns:
             pandas.DataFrame:
@@ -2059,6 +2106,9 @@ def to_dataframe(
         if int_dtype is DefaultPandasDTypes.INT_DTYPE:
             int_dtype = pandas.Int64Dtype()

+        if time_dtype is DefaultPandasDTypes.TIME_DTYPE:
+            time_dtype = db_dtypes.TimeDtype()
+
         if bool_dtype is not None and not hasattr(bool_dtype, "__from_arrow__"):
             raise ValueError("bool_dtype", _NO_SUPPORTED_DTYPE)
@@ -2071,6 +2121,24 @@ def to_dataframe(
         if string_dtype is not None and not hasattr(string_dtype, "__from_arrow__"):
             raise ValueError("string_dtype", _NO_SUPPORTED_DTYPE)

+        if (
+            date_dtype is not None
+            and date_dtype is not DefaultPandasDTypes.DATE_DTYPE
+            and not hasattr(date_dtype, "__from_arrow__")
+        ):
+            raise ValueError("date_dtype", _NO_SUPPORTED_DTYPE)
+
+        if datetime_dtype is not None and not hasattr(datetime_dtype, "__from_arrow__"):
+            raise ValueError("datetime_dtype", _NO_SUPPORTED_DTYPE)
+
+        if time_dtype is not None and not hasattr(time_dtype, "__from_arrow__"):
+            raise ValueError("time_dtype", _NO_SUPPORTED_DTYPE)
+
+        if timestamp_dtype is not None and not hasattr(
+            timestamp_dtype, "__from_arrow__"
+        ):
+            raise ValueError("timestamp_dtype", _NO_SUPPORTED_DTYPE)
+
         if dtypes is None:
             dtypes = {}
@@ -2086,25 +2154,29 @@ def to_dataframe(
             create_bqstorage_client=create_bqstorage_client,
         )

-        # When converting date or timestamp values to nanosecond precision, the result
-        # can be out of pyarrow bounds. To avoid the error when converting to
-        # Pandas, we set the date_as_object or timestamp_as_object parameter to True,
-        # if necessary.
-        date_as_object = not all(
-            self.__can_cast_timestamp_ns(col)
-            for col in record_batch
-            # Type can be date32 or date64 (plus units).
-            # See: https://arrow.apache.org/docs/python/api/datatypes.html
-            if pyarrow.types.is_date(col.type)
-        )
+        # The default date dtype is `db_dtypes.DateDtype()`, which can cause an
+        # out-of-bounds error when pyarrow converts date values to nanosecond
+        # precision. To avoid the error, we set the date_as_object parameter to
+        # True, if necessary.
+        date_as_object = False
+        if date_dtype is DefaultPandasDTypes.DATE_DTYPE:
+            date_dtype = db_dtypes.DateDtype()
+            date_as_object = not all(
+                self.__can_cast_timestamp_ns(col)
+                for col in record_batch
+                # Type can be date32 or date64 (plus units).
+                # See: https://arrow.apache.org/docs/python/api/datatypes.html
+                if pyarrow.types.is_date(col.type)
+            )

-        timestamp_as_object = not all(
-            self.__can_cast_timestamp_ns(col)
-            for col in record_batch
-            # Type can be datetime and timestamp (plus units and time zone).
-            # See: https://arrow.apache.org/docs/python/api/datatypes.html
-            if pyarrow.types.is_timestamp(col.type)
-        )
+        timestamp_as_object = False
+        if datetime_dtype is None and timestamp_dtype is None:
+            timestamp_as_object = not all(
+                self.__can_cast_timestamp_ns(col)
+                for col in record_batch
+                # Type can be datetime and timestamp (plus units and time zone).
+                # See: https://arrow.apache.org/docs/python/api/datatypes.html
+                if pyarrow.types.is_timestamp(col.type)
+            )

         if len(record_batch) > 0:
             df = record_batch.to_pandas(
@@ -2117,6 +2189,10 @@ def to_dataframe(
                     int_dtype=int_dtype,
                     float_dtype=float_dtype,
                     string_dtype=string_dtype,
+                    date_dtype=date_dtype,
+                    datetime_dtype=datetime_dtype,
+                    time_dtype=time_dtype,
+                    timestamp_dtype=timestamp_dtype,
                 ),
             )
         else:
@@ -2317,6 +2393,10 @@ def to_dataframe(
         int_dtype=None,
         float_dtype=None,
         string_dtype=None,
+        date_dtype=None,
+        datetime_dtype=None,
+        time_dtype=None,
+        timestamp_dtype=None,
     ) -> "pandas.DataFrame":
         """Create an empty dataframe.
@@ -2330,6 +2410,10 @@ def to_dataframe(
             int_dtype (Any): Ignored. Added for compatibility with RowIterator.
             float_dtype (Any): Ignored. Added for compatibility with RowIterator.
             string_dtype (Any): Ignored. Added for compatibility with RowIterator.
+            date_dtype (Any): Ignored. Added for compatibility with RowIterator.
+            datetime_dtype (Any): Ignored. Added for compatibility with RowIterator.
+            time_dtype (Any): Ignored. Added for compatibility with RowIterator.
+            timestamp_dtype (Any): Ignored. Added for compatibility with RowIterator.

         Returns:
             pandas.DataFrame: An empty :class:`~pandas.DataFrame`.
diff --git a/tests/system/test_pandas.py b/tests/system/test_pandas.py
index 91305b450..ea8cc6d63 100644
--- a/tests/system/test_pandas.py
+++ b/tests/system/test_pandas.py
@@ -34,6 +34,7 @@

 pandas = pytest.importorskip("pandas", minversion="0.23.0")
+pyarrow = pytest.importorskip("pyarrow")
 numpy = pytest.importorskip("numpy")
 bigquery_storage = pytest.importorskip(
@@ -1109,6 +1110,103 @@ def test_list_rows_nullable_scalars_extreme_dtypes(
     assert df.dtypes["string_col"].name == "object"


+@pytest.mark.parametrize(
+    ("max_results",),
+    (
+        (None,),  # Use BQ Storage API.
+        (10,),  # Use REST API.
+    ),
+)
+def test_list_rows_nullable_scalars_extreme_dtypes_w_custom_dtype(
+    bigquery_client, scalars_extreme_table, max_results
+):
+    # TODO(GH#836): Avoid INTERVAL columns until they are supported by the
+    # BigQuery Storage API and pyarrow.
+    schema = [
+        bigquery.SchemaField("bool_col", enums.SqlTypeNames.BOOLEAN),
+        bigquery.SchemaField("bignumeric_col", enums.SqlTypeNames.BIGNUMERIC),
+        bigquery.SchemaField("bytes_col", enums.SqlTypeNames.BYTES),
+        bigquery.SchemaField("date_col", enums.SqlTypeNames.DATE),
+        bigquery.SchemaField("datetime_col", enums.SqlTypeNames.DATETIME),
+        bigquery.SchemaField("float64_col", enums.SqlTypeNames.FLOAT64),
+        bigquery.SchemaField("geography_col", enums.SqlTypeNames.GEOGRAPHY),
+        bigquery.SchemaField("int64_col", enums.SqlTypeNames.INT64),
+        bigquery.SchemaField("numeric_col", enums.SqlTypeNames.NUMERIC),
+        bigquery.SchemaField("string_col", enums.SqlTypeNames.STRING),
+        bigquery.SchemaField("time_col", enums.SqlTypeNames.TIME),
+        bigquery.SchemaField("timestamp_col", enums.SqlTypeNames.TIMESTAMP),
+    ]
+
+    df = bigquery_client.list_rows(
+        scalars_extreme_table,
+        max_results=max_results,
+        selected_fields=schema,
+    ).to_dataframe(
+        bool_dtype=pandas.BooleanDtype(),
+        int_dtype=pandas.Int64Dtype(),
+        float_dtype=(
+            pandas.Float64Dtype()
+            if hasattr(pandas, "Float64Dtype")
+            else pandas.StringDtype()
+        ),
+        string_dtype=pandas.StringDtype(),
+        date_dtype=(
+            pandas.ArrowDtype(pyarrow.date32())
+            if hasattr(pandas, "ArrowDtype")
+            else None
+        ),
+        datetime_dtype=(
+            pandas.ArrowDtype(pyarrow.timestamp("us"))
+            if hasattr(pandas, "ArrowDtype")
+            else None
+        ),
+        time_dtype=(
+            pandas.ArrowDtype(pyarrow.time64("us"))
+            if hasattr(pandas, "ArrowDtype")
+            else None
+        ),
+        timestamp_dtype=(
+            pandas.ArrowDtype(pyarrow.timestamp("us", tz="UTC"))
+            if hasattr(pandas, "ArrowDtype")
+            else None
+        ),
+    )
+
+    # These pandas dtypes are handled by the custom dtypes.
+    assert df.dtypes["bool_col"].name == "boolean"
+    assert df.dtypes["float64_col"].name == "Float64"
+    assert df.dtypes["int64_col"].name == "Int64"
+    assert df.dtypes["string_col"].name == "string"
+
+    assert df.dtypes["date_col"].name == (
+        "date32[day][pyarrow]" if hasattr(pandas, "ArrowDtype") else "datetime64[ns]"
+    )
+    assert df.dtypes["datetime_col"].name == (
+        "timestamp[us][pyarrow]" if hasattr(pandas, "ArrowDtype") else "object"
+    )
+    assert df.dtypes["timestamp_col"].name == (
+        "timestamp[us, tz=UTC][pyarrow]" if hasattr(pandas, "ArrowDtype") else "object"
+    )
+    assert df.dtypes["time_col"].name == (
+        "time64[us][pyarrow]" if hasattr(pandas, "ArrowDtype") else "object"
+    )
+
+    # decimal.Decimal is used to avoid loss of precision.
+    assert df.dtypes["numeric_col"].name == "object"
+    assert df.dtypes["bignumeric_col"].name == "object"
+
+    # pandas uses Python bytes objects.
+ assert df.dtypes["bytes_col"].name == "object" + + def test_upload_time_and_datetime_56(bigquery_client, dataset_id): df = pandas.DataFrame( dict( diff --git a/tests/unit/test_table.py b/tests/unit/test_table.py index 9bdd7b596..53db635fa 100644 --- a/tests/unit/test_table.py +++ b/tests/unit/test_table.py @@ -46,6 +46,7 @@ PYARROW_VERSION = pkg_resources.parse_version("0.0.1") if pyarrow: + import pyarrow import pyarrow.types PYARROW_VERSION = pkg_resources.parse_version(pyarrow.__version__) @@ -3471,11 +3472,45 @@ def test_to_dataframe_w_dtypes_mapper(self): SchemaField("age", "INTEGER"), SchemaField("seconds", "INT64"), SchemaField("miles", "FLOAT64"), + SchemaField("date", "DATE"), + SchemaField("datetime", "DATETIME"), + SchemaField("time", "TIME"), + SchemaField("timestamp", "TIMESTAMP"), ] row_data = [ - ["Phred Phlyntstone", "true", "32", "23000", "1.77"], - ["Bharney Rhubble", "false", "33", "454000", "6.66"], - ["Wylma Phlyntstone", "true", "29", "341000", "2.0"], + [ + "Phred Phlyntstone", + "true", + "32", + "23000", + "1.77", + "1999-12-01", + "1999-12-31T00:00:00.000000", + "00:00:00.000000", + "1433836800000000", + ], + [ + "Bharney Rhubble", + "false", + "33", + "454000", + "6.66", + "4567-06-14", + "4567-12-31T00:00:00.000000", + "12:00:00.232413", + "81953424000000000", + ], + [ + "Wylma Phlyntstone", + "true", + "29", + "341000", + "2.0", + "9999-12-31", + "9999-12-31T23:59:59.999999", + "23:59:59.999999", + "253402261199999999", + ], ] rows = [{"f": [{"v": field} for field in row]} for row in row_data] path = "/foo" @@ -3492,18 +3527,136 @@ def test_to_dataframe_w_dtypes_mapper(self): else pandas.StringDtype() ), string_dtype=pandas.StringDtype(), + date_dtype=( + pandas.ArrowDtype(pyarrow.date32()) + if hasattr(pandas, "ArrowDtype") + else None + ), + datetime_dtype=( + pandas.ArrowDtype(pyarrow.timestamp("us")) + if hasattr(pandas, "ArrowDtype") + else None + ), + time_dtype=( + pandas.ArrowDtype(pyarrow.time64("us")) + if hasattr(pandas, "ArrowDtype") + else None + ), + timestamp_dtype=( + pandas.ArrowDtype(pyarrow.timestamp("us", tz="UTC")) + if hasattr(pandas, "ArrowDtype") + else None + ), ) self.assertIsInstance(df, pandas.DataFrame) + + self.assertEqual(list(df.complete), [True, False, True]) self.assertEqual(df.complete.dtype.name, "boolean") + + self.assertEqual(list(df.age), [32, 33, 29]) self.assertEqual(df.age.dtype.name, "Int32") + + self.assertEqual(list(df.seconds), [23000, 454000, 341000]) self.assertEqual(df.seconds.dtype.name, "Int32") + self.assertEqual( - df.miles.dtype.name, - "Float64" if hasattr(pandas, "Float64Dtype") else "string", + list(df.name), ["Phred Phlyntstone", "Bharney Rhubble", "Wylma Phlyntstone"] ) self.assertEqual(df.name.dtype.name, "string") + if hasattr(pandas, "Float64Dtype"): + self.assertEqual(list(df.miles), [1.77, 6.66, 2.0]) + self.assertEqual(df.miles.dtype.name, "Float64") + else: + self.assertEqual(list(df.miles), ["1.77", "6.66", "2.0"]) + self.assertEqual(df.miles.dtype.name, "string") + + if hasattr(pandas, "ArrowDtype"): + self.assertEqual( + list(df.date), + [ + datetime.date(1999, 12, 1), + datetime.date(4567, 6, 14), + datetime.date(9999, 12, 31), + ], + ) + self.assertEqual(df.date.dtype.name, "date32[day][pyarrow]") + + self.assertEqual( + list(df.datetime), + [ + datetime.datetime(1999, 12, 31, 0, 0), + datetime.datetime(4567, 12, 31, 0, 0), + datetime.datetime(9999, 12, 31, 23, 59, 59, 999999), + ], + ) + self.assertEqual(df.datetime.dtype.name, "timestamp[us][pyarrow]") + + self.assertEqual( + 
list(df.time), + [ + datetime.time(0, 0), + datetime.time(12, 0, 0, 232413), + datetime.time(23, 59, 59, 999999), + ], + ) + self.assertEqual(df.time.dtype.name, "time64[us][pyarrow]") + + self.assertEqual( + list(df.timestamp), + [ + datetime.datetime(2015, 6, 9, 8, 0, tzinfo=datetime.timezone.utc), + datetime.datetime(4567, 1, 1, 0, 0, tzinfo=datetime.timezone.utc), + datetime.datetime( + 9999, 12, 31, 12, 59, 59, 999999, tzinfo=datetime.timezone.utc + ), + ], + ) + self.assertEqual(df.timestamp.dtype.name, "timestamp[us, tz=UTC][pyarrow]") + else: + self.assertEqual( + list(df.date), + [ + pandas.Timestamp("1999-12-01 00:00:00"), + pandas.Timestamp("2229-03-27 01:41:45.161793536"), + pandas.Timestamp("1816-03-29 05:56:08.066277376"), + ], + ) + self.assertEqual(df.date.dtype.name, "datetime64[ns]") + + self.assertEqual( + list(df.datetime), + [ + datetime.datetime(1999, 12, 31, 0, 0), + datetime.datetime(4567, 12, 31, 0, 0), + datetime.datetime(9999, 12, 31, 23, 59, 59, 999999), + ], + ) + self.assertEqual(df.datetime.dtype.name, "object") + + self.assertEqual( + list(df.time), + [ + datetime.time(0, 0), + datetime.time(12, 0, 0, 232413), + datetime.time(23, 59, 59, 999999), + ], + ) + self.assertEqual(df.time.dtype.name, "object") + + self.assertEqual( + list(df.timestamp), + [ + datetime.datetime(2015, 6, 9, 8, 0, tzinfo=datetime.timezone.utc), + datetime.datetime(4567, 1, 1, 0, 0, tzinfo=datetime.timezone.utc), + datetime.datetime( + 9999, 12, 31, 12, 59, 59, 999999, tzinfo=datetime.timezone.utc + ), + ], + ) + self.assertEqual(df.timestamp.dtype.name, "object") + @unittest.skipIf(pandas is None, "Requires `pandas`") def test_to_dataframe_w_none_dtypes_mapper(self): from google.cloud.bigquery.schema import SchemaField @@ -3514,11 +3667,23 @@ def test_to_dataframe_w_none_dtypes_mapper(self): SchemaField("age", "INTEGER"), SchemaField("seconds", "INT64"), SchemaField("miles", "FLOAT64"), + SchemaField("date", "DATE"), + SchemaField("datetime", "DATETIME"), + SchemaField("time", "TIME"), + SchemaField("timestamp", "TIMESTAMP"), ] row_data = [ - ["Phred Phlyntstone", "true", "32", "23000", "1.77"], - ["Bharney Rhubble", "false", "33", "454000", "6.66"], - ["Wylma Phlyntstone", "true", "29", "341000", "2.0"], + [ + "Phred Phlyntstone", + "true", + "32", + "23000", + "1.77", + "1999-12-01", + "1999-12-31T00:00:00.000000", + "23:59:59.999999", + "1433836800000000", + ], ] rows = [{"f": [{"v": field} for field in row]} for row in row_data] path = "/foo" @@ -3531,6 +3696,10 @@ def test_to_dataframe_w_none_dtypes_mapper(self): int_dtype=None, float_dtype=None, string_dtype=None, + date_dtype=None, + datetime_dtype=None, + time_dtype=None, + timestamp_dtype=None, ) self.assertIsInstance(df, pandas.DataFrame) self.assertEqual(df.complete.dtype.name, "bool") @@ -3538,6 +3707,10 @@ def test_to_dataframe_w_none_dtypes_mapper(self): self.assertEqual(df.seconds.dtype.name, "int64") self.assertEqual(df.miles.dtype.name, "float64") self.assertEqual(df.name.dtype.name, "object") + self.assertEqual(df.date.dtype.name, "datetime64[ns]") + self.assertEqual(df.datetime.dtype.name, "datetime64[ns]") + self.assertEqual(df.time.dtype.name, "object") + self.assertEqual(df.timestamp.dtype.name, "datetime64[ns, UTC]") @unittest.skipIf(pandas is None, "Requires `pandas`") def test_to_dataframe_w_unsupported_dtypes_mapper(self): @@ -3575,6 +3748,26 @@ def test_to_dataframe_w_unsupported_dtypes_mapper(self): create_bqstorage_client=False, string_dtype=numpy.dtype("object"), ) + with self.assertRaises(ValueError): + 
row_iterator.to_dataframe( + create_bqstorage_client=False, + date_dtype=numpy.dtype("object"), + ) + with self.assertRaises(ValueError): + row_iterator.to_dataframe( + create_bqstorage_client=False, + datetime_dtype=numpy.dtype("datetime64[us]"), + ) + with self.assertRaises(ValueError): + row_iterator.to_dataframe( + create_bqstorage_client=False, + time_dtype=numpy.dtype("datetime64[us]"), + ) + with self.assertRaises(ValueError): + row_iterator.to_dataframe( + create_bqstorage_client=False, + timestamp_dtype=numpy.dtype("datetime64[us]"), + ) @unittest.skipIf(pandas is None, "Requires `pandas`") def test_to_dataframe_column_dtypes(self):
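For readers who want to try the new parameters end to end, a short usage sketch follows. It assumes a pandas version with ArrowDtype support (1.5 or later) plus pyarrow installed; the query itself is only illustrative:

    import pandas
    import pyarrow
    from google.cloud import bigquery

    client = bigquery.Client()
    df = client.query(
        "SELECT DATE '2023-04-18' AS d, CURRENT_TIMESTAMP() AS ts"
    ).to_dataframe(
        # Opt in to pyarrow-backed dtypes instead of the db_dtypes/numpy defaults.
        date_dtype=pandas.ArrowDtype(pyarrow.date32()),
        timestamp_dtype=pandas.ArrowDtype(pyarrow.timestamp("us", tz="UTC")),
    )
    print(df.dtypes)  # d: date32[day][pyarrow], ts: timestamp[us, tz=UTC][pyarrow]

Passing date_dtype=None instead falls back to numpy's datetime64[ns] (or object when values are out of bounds for nanosecond precision), as documented in the docstrings above.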