Skip to content

Commit

Permalink
feat: add info and memory_usage methods to dataframe (#219)
Browse files Browse the repository at this point in the history
  • Loading branch information
TrevorBergeron authored Nov 28, 2023
1 parent ae03756 commit 9d6613d
Show file tree
Hide file tree
Showing 10 changed files with 235 additions and 2 deletions.
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -38,4 +38,4 @@ repos:
rev: v1.1.1
hooks:
- id: mypy
additional_dependencies: [types-requests]
additional_dependencies: [types-requests, types-tabulate]
4 changes: 4 additions & 0 deletions bigframes/_config/display_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,10 @@ class DisplayOptions:
progress_bar: Optional[str] = "auto"
repr_mode: Literal["head", "deferred"] = "head"

# DataFrame.info(): default column-count threshold below which the verbose
# per-column listing is shown (used when `max_cols`/`verbose` are not given).
max_info_columns: int = 100
# DataFrame.info(): row-count cutoff above which non-null counts are skipped
# by default (they require a query); None disables the cutoff.
max_info_rows: Optional[int] = 200000
# Whether DataFrame.info() appends the estimated memory footprint by default.
memory_usage: bool = True


@contextlib.contextmanager
def pandas_repr(display_options: DisplayOptions):
Expand Down
13 changes: 12 additions & 1 deletion bigframes/core/indexes/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,14 @@ def _block(self) -> blocks.Block:
def T(self) -> Index:
return self.transpose()

def _memory_usage(self) -> int:
    """Estimate this index's in-memory footprint in bytes.

    Each level contributes (bytes per value for its dtype) * row count.
    Dtypes without a known fixed size are counted as 8-byte pointers.
    """
    (row_count,) = self.shape
    per_level_bytes = (
        bigframes.dtypes.DTYPE_BYTE_SIZES.get(dtype, 8) * row_count
        for dtype in self.dtypes
    )
    return sum(per_level_bytes)

def transpose(self) -> Index:
return self

Expand Down Expand Up @@ -326,7 +334,10 @@ def _apply_aggregation(self, op: agg_ops.AggregateOp) -> typing.Any:

def __getitem__(self, key: int) -> typing.Any:
    """Return the single index value at integer position *key*.

    Raises:
        IndexError: if *key* is out of bounds for this index.
    """
    if isinstance(key, int):
        if key != -1:
            # Materialize only the requested row: the half-open slice [key, key + 1).
            result_pd_df, _ = self._block.slice(key, key + 1, 1).to_pandas()
        else:  # special case, want [-1:] instead of [-1:0]
            result_pd_df, _ = self._block.slice(key).to_pandas()
        if result_pd_df.empty:
            raise IndexError("single positional indexer is out-of-bounds")
        return result_pd_df.index[0]
Expand Down
84 changes: 84 additions & 0 deletions bigframes/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

import datetime
import re
import sys
import textwrap
import typing
from typing import (
Expand All @@ -36,6 +37,7 @@
import google.cloud.bigquery as bigquery
import numpy
import pandas
import tabulate

import bigframes
import bigframes._config.display_options as display_options
Expand Down Expand Up @@ -350,6 +352,88 @@ def query_job(self) -> Optional[bigquery.QueryJob]:
self._set_internal_query_job(self._compute_dry_run())
return self._query_job

def memory_usage(self, index: bool = True):
    """Estimate per-column memory usage in bytes.

    Mirrors pandas: variable-size values are counted as 8-byte pointers
    rather than measuring the actual underlying objects.

    Args:
        index: If True, prepend the index's usage under the label "Index".

    Returns:
        pandas Series of estimated byte counts keyed by column label.
    """
    row_count, _ = self.shape

    def bytes_for(dtype) -> int:
        # Unknown dtypes fall back to an 8-byte pointer per value.
        return bigframes.dtypes.DTYPE_BYTE_SIZES.get(dtype, 8) * row_count

    sizes = self.dtypes.map(bytes_for)
    if index:
        index_entry = pandas.Series([self.index._memory_usage()], index=["Index"])
        sizes = pandas.concat([index_entry, sizes])
    return sizes

def info(
    self,
    verbose: Optional[bool] = None,
    buf=None,
    max_cols: Optional[int] = None,
    memory_usage: Optional[bool] = None,
    show_counts: Optional[bool] = None,
):
    """Print a concise summary of the DataFrame.

    Writes the class name, the index range, either a per-column listing
    (dtype and optional non-null counts) or a truncated column summary,
    the dtype counts, and an estimated memory footprint.

    Args:
        verbose: Force (True) or suppress (False) the per-column listing.
            Defaults to showing it when the column count is below ``max_cols``.
        buf: Writable output buffer. Defaults to ``sys.stdout``.
        max_cols: Column-count threshold for switching to the truncated
            summary. Defaults to ``bigframes.options.display.max_info_columns``.
        memory_usage: Whether to print the memory estimate. Defaults to
            ``bigframes.options.display.memory_usage``.
        show_counts: Whether to compute non-null counts. Defaults to doing
            so only when the row count does not exceed
            ``bigframes.options.display.max_info_rows``.
    """
    # Explicit None check: `buf or sys.stdout` would silently discard a
    # caller-supplied buffer that happens to be falsy but writable.
    obuf = sys.stdout if buf is None else buf

    n_rows, n_columns = self.shape

    max_cols = (
        max_cols
        if max_cols is not None
        else bigframes.options.display.max_info_columns
    )

    show_all_columns = verbose if verbose is not None else (n_columns < max_cols)

    obuf.write(f"{type(self)}\n")

    index_type = "MultiIndex" if self.index.nlevels > 1 else "Index"

    # These accesses are kind of expensive, maybe should try to skip?
    first_index = self.index[0]
    last_index = self.index[-1]
    obuf.write(f"{index_type}: {n_rows} entries, {first_index} to {last_index}\n")

    dtype_strings = self.dtypes.astype("string")
    if show_all_columns:
        obuf.write(f"Data columns (total {n_columns} columns):\n")
        column_info = self.columns.to_frame(name="Column")

        max_rows = bigframes.options.display.max_info_rows
        too_many_rows = n_rows > max_rows if max_rows is not None else False

        # Non-null counts require executing a query, so they are skipped for
        # very large frames unless explicitly requested.
        if show_counts if show_counts is not None else (not too_many_rows):
            non_null_counts = self.count().to_pandas()
            column_info["Non-Null Count"] = non_null_counts.map(
                lambda x: f"{int(x)} non-null"
            )

        column_info["Dtype"] = dtype_strings

        column_info = column_info.reset_index(drop=True)
        column_info.index.name = "#"

        column_info_formatted = tabulate.tabulate(column_info, headers="keys")  # type: ignore
        obuf.write(column_info_formatted)
        obuf.write("\n")

    else:  # Just number of columns and first, last
        obuf.write(
            f"Columns: {n_columns} entries, {self.columns[0]} to {self.columns[-1]}\n"
        )
    dtype_counts = dtype_strings.value_counts().sort_index(ascending=True).items()
    dtype_counts_formatted = ", ".join(
        f"{dtype}({count})" for dtype, count in dtype_counts
    )
    obuf.write(f"dtypes: {dtype_counts_formatted}\n")

    show_memory = (
        memory_usage
        if memory_usage is not None
        else bigframes.options.display.memory_usage
    )
    if show_memory:
        # TODO: Convert to different units (kb, mb, etc.)
        obuf.write(f"memory usage: {self.memory_usage().sum()} bytes\n")

def _set_internal_query_job(self, query_job: bigquery.QueryJob):
    # Store the latest BigQuery query job backing this DataFrame.
    self._query_job = query_job

Expand Down
13 changes: 13 additions & 0 deletions bigframes/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,19 @@
# "string" and "string[pyarrow] are accepted"
BIGFRAMES_STRING_TO_BIGFRAMES["string[pyarrow]"] = pd.StringDtype(storage="pyarrow")

# For the purposes of dataframe.memory_usage
# https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#data_type_sizes
# Bytes per single non-null value; callers fall back to 8 (pointer size)
# for dtypes not listed here.
DTYPE_BYTE_SIZES = {
    pd.BooleanDtype(): 1,
    pd.Int64Dtype(): 8,
    # BigQuery's float type is FLOAT64; columns surface as Float64Dtype.
    # The Float32 entry is kept for backward compatibility.
    pd.Float32Dtype(): 8,
    pd.Float64Dtype(): 8,
    pd.StringDtype(): 8,
    pd.ArrowDtype(pa.time64("us")): 8,
    pd.ArrowDtype(pa.timestamp("us")): 8,
    pd.ArrowDtype(pa.timestamp("us", tz="UTC")): 8,
    pd.ArrowDtype(pa.date32()): 8,
}


def ibis_dtype_to_bigframes_dtype(
ibis_dtype: ibis_dtypes.DataType,
Expand Down
1 change: 1 addition & 0 deletions noxfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,7 @@ def mypy(session):
"types-python-dateutil",
"types-requests",
"types-setuptools",
"types-tabulate",
]
)
| set(SYSTEM_TEST_STANDARD_DEPENDENCIES)
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
"requests >=2.27.1",
"scikit-learn >=1.2.2",
"sqlalchemy >=1.4,<3.0dev",
"tabulate >= 0.9",
"ipywidgets >=7.7.1",
"humanize >= 4.6.0",
]
Expand Down
42 changes: 42 additions & 0 deletions tests/system/small/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import io
import operator
import tempfile
import typing
Expand Down Expand Up @@ -255,6 +256,47 @@ def test_drop_with_custom_column_labels(scalars_dfs):
assert_pandas_df_equal(bf_result, pd_result)


def test_df_memory_usage(scalars_dfs):
    """bigframes memory_usage should roughly agree with pandas' accounting."""
    scalars_df, scalars_pandas_df = scalars_dfs

    pd_result = scalars_pandas_df.memory_usage()
    bf_result = scalars_df.memory_usage()

    # bigframes sizes are dtype-based estimates, so allow a generous
    # relative tolerance instead of requiring exact equality.
    pd.testing.assert_series_equal(pd_result, bf_result, rtol=1.5)


def test_df_info(scalars_dfs):
    """info() output is pinned byte-for-byte against the known test table."""
    # NOTE: this expected string encodes the exact tabulate formatting and the
    # memory estimate; any change to DataFrame.info's layout must update it.
    expected = (
        "<class 'bigframes.dataframe.DataFrame'>\n"
        "Index: 9 entries, 0 to 8\n"
        "Data columns (total 13 columns):\n"
        " #  Column         Non-Null Count    Dtype\n"
        "---  -------------  ----------------  ------------------------------\n"
        " 0  bool_col       8 non-null        boolean\n"
        " 1  bytes_col      6 non-null        object\n"
        " 2  date_col       7 non-null        date32[day][pyarrow]\n"
        " 3  datetime_col   6 non-null        timestamp[us][pyarrow]\n"
        " 4  geography_col  4 non-null        geometry\n"
        " 5  int64_col      8 non-null        Int64\n"
        " 6  int64_too      9 non-null        Int64\n"
        " 7  numeric_col    6 non-null        object\n"
        " 8  float64_col    7 non-null        Float64\n"
        " 9  rowindex_2     9 non-null        Int64\n"
        " 10  string_col     8 non-null        string\n"
        " 11  time_col       6 non-null        time64[us][pyarrow]\n"
        " 12  timestamp_col  6 non-null        timestamp[us, tz=UTC][pyarrow]\n"
        "dtypes: Float64(1), Int64(3), boolean(1), date32[day][pyarrow](1), geometry(1), object(2), string(1), time64[us][pyarrow](1), timestamp[us, tz=UTC][pyarrow](1), timestamp[us][pyarrow](1)\n"
        "memory usage: 945 bytes\n"
    )

    scalars_df, _ = scalars_dfs
    bf_result = io.StringIO()

    scalars_df.info(buf=bf_result)

    assert expected == bf_result.getvalue()


def test_drop_index(scalars_dfs):
scalars_df, scalars_pandas_df = scalars_dfs

Expand Down
11 changes: 11 additions & 0 deletions third_party/bigframes_vendored/pandas/core/config_init.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,17 @@
Instead estimated bytes processed will be shown. Dataframe and Series
objects can still be computed with methods that explicitly execute and
download results.
max_info_columns (int):
max_info_columns is used in DataFrame.info method to decide if
per column information will be printed.
max_info_rows (int or None):
``df.info()`` will usually show null-counts for each column.
For large frames this can be quite slow. ``max_info_rows`` and
``max_info_columns`` limit this null check to frames with smaller
dimensions than specified.
memory_usage (bool):
This specifies whether the memory usage of a DataFrame should be
displayed when ``df.info()`` is called. Valid values are True and False.
"""

sampling_options_doc = """
Expand Down
66 changes: 66 additions & 0 deletions third_party/bigframes_vendored/pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,72 @@ def values(self) -> np.ndarray:
"""
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

def info(
    self,
    verbose: bool | None = None,
    buf=None,
    max_cols: int | None = None,
    memory_usage: bool | None = None,
    show_counts: bool | None = None,
) -> None:
    """
    Print a concise summary of a DataFrame.

    This method prints information about a DataFrame including
    the index dtype and columns, non-null values and memory usage.

    Args:
        verbose (bool, optional):
            Whether to print the full summary. By default, the setting in
            ``pandas.options.display.max_info_columns`` is followed.
        buf (writable buffer, defaults to sys.stdout):
            Where to send the output. By default, the output is printed to
            sys.stdout. Pass a writable buffer if you need to further process
            the output.
        max_cols (int, optional):
            When to switch from the verbose to the truncated output. If the
            DataFrame has more than `max_cols` columns, the truncated output
            is used. By default, the setting in
            ``pandas.options.display.max_info_columns`` is used.
        memory_usage (bool, optional):
            Specifies whether total memory usage of the DataFrame
            elements (including the index) should be displayed. By default,
            this follows the ``pandas.options.display.memory_usage`` setting.
            True always shows memory usage. False never shows memory usage.
            Memory estimation is made based in column dtype and number of rows
            assuming values consume the same memory amount for corresponding dtypes.
        show_counts (bool, optional):
            Whether to show the non-null counts. By default, this is shown
            only if the DataFrame is smaller than
            ``pandas.options.display.max_info_rows`` and
            ``pandas.options.display.max_info_columns``. A value of True always
            shows the counts, and False never shows the counts.

    Returns:
        None: This method prints a summary of a DataFrame and returns None.
    """
    raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

def memory_usage(self, index: bool = True):
    """
    Return the memory usage of each column in bytes.

    The memory usage can optionally include the contribution of
    the index and elements of `object` dtype.

    This value is displayed in `DataFrame.info` by default. This can be
    suppressed by setting ``pandas.options.display.memory_usage`` to False.

    Args:
        index (bool, default True):
            Specifies whether to include the memory usage of the DataFrame's
            index in returned Series. If ``index=True``, the memory usage of
            the index is the first item in the output.

    Returns:
        Series: A Series whose index is the original column names and whose
        values are the memory usage of each column in bytes.
    """
    raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

# ----------------------------------------------------------------------
# IO methods (to / from other formats)
def to_numpy(self, dtype=None, copy=False, na_value=None, **kwargs) -> np.ndarray:
Expand Down

0 comments on commit 9d6613d

Please sign in to comment.