Skip to content

Commit

Permalink
feat: add info and memory_usage methods to dataframe (#219)
Browse files Browse the repository at this point in the history
  • Loading branch information
TrevorBergeron authored Nov 28, 2023
1 parent ae03756 commit 9d6613d
Show file tree
Hide file tree
Showing 10 changed files with 235 additions and 2 deletions.
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -38,4 +38,4 @@ repos:
rev: v1.1.1
hooks:
- id: mypy
additional_dependencies: [types-requests]
additional_dependencies: [types-requests, types-tabulate]
4 changes: 4 additions & 0 deletions bigframes/_config/display_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,10 @@ class DisplayOptions:
progress_bar: Optional[str] = "auto"
repr_mode: Literal["head", "deferred"] = "head"

# DataFrame.info(): default column-count threshold below which the verbose
# per-column listing is shown (used when `max_cols`/`verbose` are not given).
max_info_columns: int = 100
# DataFrame.info(): row-count cutoff above which non-null counts are skipped
# by default (they require a query); None disables the cutoff.
max_info_rows: Optional[int] = 200000
# Whether DataFrame.info() appends the estimated memory footprint by default.
memory_usage: bool = True


@contextlib.contextmanager
def pandas_repr(display_options: DisplayOptions):
Expand Down
13 changes: 12 additions & 1 deletion bigframes/core/indexes/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,14 @@ def _block(self) -> blocks.Block:
def T(self) -> Index:
return self.transpose()

def _memory_usage(self) -> int:
    """Estimate this index's in-memory footprint in bytes.

    Each level contributes (bytes per value for its dtype) * row count.
    Dtypes without a known fixed size are counted as 8-byte pointers.
    """
    (row_count,) = self.shape
    per_level_bytes = (
        bigframes.dtypes.DTYPE_BYTE_SIZES.get(dtype, 8) * row_count
        for dtype in self.dtypes
    )
    return sum(per_level_bytes)

def transpose(self) -> Index:
return self

Expand Down Expand Up @@ -326,7 +334,10 @@ def _apply_aggregation(self, op: agg_ops.AggregateOp) -> typing.Any:

def __getitem__(self, key: int) -> typing.Any:
    """Return the single index value at integer position *key*.

    Raises:
        IndexError: if *key* is out of bounds for this index.
    """
    if isinstance(key, int):
        if key != -1:
            # Materialize only the requested row: the half-open slice [key, key + 1).
            result_pd_df, _ = self._block.slice(key, key + 1, 1).to_pandas()
        else:  # special case, want [-1:] instead of [-1:0]
            result_pd_df, _ = self._block.slice(key).to_pandas()
        if result_pd_df.empty:
            raise IndexError("single positional indexer is out-of-bounds")
        return result_pd_df.index[0]
Expand Down
84 changes: 84 additions & 0 deletions bigframes/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

import datetime
import re
import sys
import textwrap
import typing
from typing import (
Expand All @@ -36,6 +37,7 @@
import google.cloud.bigquery as bigquery
import numpy
import pandas
import tabulate

import bigframes
import bigframes._config.display_options as display_options
Expand Down Expand Up @@ -350,6 +352,88 @@ def query_job(self) -> Optional[bigquery.QueryJob]:
self._set_internal_query_job(self._compute_dry_run())
return self._query_job

def memory_usage(self, index: bool = True):
    """Estimate per-column memory usage in bytes.

    Mirrors pandas: variable-size values are counted as 8-byte pointers
    rather than measuring the actual underlying objects.

    Args:
        index: If True, prepend the index's usage under the label "Index".

    Returns:
        pandas Series of estimated byte counts keyed by column label.
    """
    row_count, _ = self.shape

    def bytes_for(dtype) -> int:
        # Unknown dtypes fall back to an 8-byte pointer per value.
        return bigframes.dtypes.DTYPE_BYTE_SIZES.get(dtype, 8) * row_count

    sizes = self.dtypes.map(bytes_for)
    if index:
        index_entry = pandas.Series([self.index._memory_usage()], index=["Index"])
        sizes = pandas.concat([index_entry, sizes])
    return sizes

def info(
    self,
    verbose: Optional[bool] = None,
    buf=None,
    max_cols: Optional[int] = None,
    memory_usage: Optional[bool] = None,
    show_counts: Optional[bool] = None,
):
    """Print a concise summary of the DataFrame.

    Writes the class name, the index range, either a per-column listing
    (dtype and optional non-null counts) or a truncated column summary,
    the dtype counts, and an estimated memory footprint.

    Args:
        verbose: Force (True) or suppress (False) the per-column listing.
            Defaults to showing it when the column count is below ``max_cols``.
        buf: Writable output buffer. Defaults to ``sys.stdout``.
        max_cols: Column-count threshold for switching to the truncated
            summary. Defaults to ``bigframes.options.display.max_info_columns``.
        memory_usage: Whether to print the memory estimate. Defaults to
            ``bigframes.options.display.memory_usage``.
        show_counts: Whether to compute non-null counts. Defaults to doing
            so only when the row count does not exceed
            ``bigframes.options.display.max_info_rows``.
    """
    # Explicit None check: `buf or sys.stdout` would silently discard a
    # caller-supplied buffer that happens to be falsy but writable.
    obuf = sys.stdout if buf is None else buf

    n_rows, n_columns = self.shape

    max_cols = (
        max_cols
        if max_cols is not None
        else bigframes.options.display.max_info_columns
    )

    show_all_columns = verbose if verbose is not None else (n_columns < max_cols)

    obuf.write(f"{type(self)}\n")

    index_type = "MultiIndex" if self.index.nlevels > 1 else "Index"

    # These accesses are kind of expensive, maybe should try to skip?
    first_index = self.index[0]
    last_index = self.index[-1]
    obuf.write(f"{index_type}: {n_rows} entries, {first_index} to {last_index}\n")

    dtype_strings = self.dtypes.astype("string")
    if show_all_columns:
        obuf.write(f"Data columns (total {n_columns} columns):\n")
        column_info = self.columns.to_frame(name="Column")

        max_rows = bigframes.options.display.max_info_rows
        too_many_rows = n_rows > max_rows if max_rows is not None else False

        # Non-null counts require executing a query, so they are skipped for
        # very large frames unless explicitly requested.
        if show_counts if show_counts is not None else (not too_many_rows):
            non_null_counts = self.count().to_pandas()
            column_info["Non-Null Count"] = non_null_counts.map(
                lambda x: f"{int(x)} non-null"
            )

        column_info["Dtype"] = dtype_strings

        column_info = column_info.reset_index(drop=True)
        column_info.index.name = "#"

        column_info_formatted = tabulate.tabulate(column_info, headers="keys")  # type: ignore
        obuf.write(column_info_formatted)
        obuf.write("\n")

    else:  # Just number of columns and first, last
        obuf.write(
            f"Columns: {n_columns} entries, {self.columns[0]} to {self.columns[-1]}\n"
        )
    dtype_counts = dtype_strings.value_counts().sort_index(ascending=True).items()
    dtype_counts_formatted = ", ".join(
        f"{dtype}({count})" for dtype, count in dtype_counts
    )
    obuf.write(f"dtypes: {dtype_counts_formatted}\n")

    show_memory = (
        memory_usage
        if memory_usage is not None
        else bigframes.options.display.memory_usage
    )
    if show_memory:
        # TODO: Convert to different units (kb, mb, etc.)
        obuf.write(f"memory usage: {self.memory_usage().sum()} bytes\n")

def _set_internal_query_job(self, query_job: bigquery.QueryJob):
    # Store the latest BigQuery query job backing this DataFrame.
    self._query_job = query_job

Expand Down
13 changes: 13 additions & 0 deletions bigframes/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,19 @@
# "string" and "string[pyarrow] are accepted"
BIGFRAMES_STRING_TO_BIGFRAMES["string[pyarrow]"] = pd.StringDtype(storage="pyarrow")

# For the purposes of dataframe.memory_usage
# https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#data_type_sizes
# Bytes per single non-null value; callers fall back to 8 (pointer size)
# for dtypes not listed here.
DTYPE_BYTE_SIZES = {
    pd.BooleanDtype(): 1,
    pd.Int64Dtype(): 8,
    # BigQuery's float type is FLOAT64; columns surface as Float64Dtype.
    # The Float32 entry is kept for backward compatibility.
    pd.Float32Dtype(): 8,
    pd.Float64Dtype(): 8,
    pd.StringDtype(): 8,
    pd.ArrowDtype(pa.time64("us")): 8,
    pd.ArrowDtype(pa.timestamp("us")): 8,
    pd.ArrowDtype(pa.timestamp("us", tz="UTC")): 8,
    pd.ArrowDtype(pa.date32()): 8,
}


def ibis_dtype_to_bigframes_dtype(
ibis_dtype: ibis_dtypes.DataType,
Expand Down
1 change: 1 addition & 0 deletions noxfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,7 @@ def mypy(session):
"types-python-dateutil",
"types-requests",
"types-setuptools",
"types-tabulate",
]
)
| set(SYSTEM_TEST_STANDARD_DEPENDENCIES)
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
"requests >=2.27.1",
"scikit-learn >=1.2.2",
"sqlalchemy >=1.4,<3.0dev",
"tabulate >= 0.9",
"ipywidgets >=7.7.1",
"humanize >= 4.6.0",
]
Expand Down
42 changes: 42 additions & 0 deletions tests/system/small/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import io
import operator
import tempfile
import typing
Expand Down Expand Up @@ -255,6 +256,47 @@ def test_drop_with_custom_column_labels(scalars_dfs):
assert_pandas_df_equal(bf_result, pd_result)


def test_df_memory_usage(scalars_dfs):
    """bigframes memory_usage should roughly agree with pandas' accounting."""
    scalars_df, scalars_pandas_df = scalars_dfs

    pd_result = scalars_pandas_df.memory_usage()
    bf_result = scalars_df.memory_usage()

    # bigframes sizes are dtype-based estimates, so allow a generous
    # relative tolerance instead of requiring exact equality.
    pd.testing.assert_series_equal(pd_result, bf_result, rtol=1.5)


def test_df_info(scalars_dfs):
    """info() output is pinned byte-for-byte against the known test table."""
    # NOTE: this expected string encodes the exact tabulate formatting and the
    # memory estimate; any change to DataFrame.info's layout must update it.
    expected = (
        "<class 'bigframes.dataframe.DataFrame'>\n"
        "Index: 9 entries, 0 to 8\n"
        "Data columns (total 13 columns):\n"
        " #  Column         Non-Null Count    Dtype\n"
        "---  -------------  ----------------  ------------------------------\n"
        " 0  bool_col       8 non-null        boolean\n"
        " 1  bytes_col      6 non-null        object\n"
        " 2  date_col       7 non-null        date32[day][pyarrow]\n"
        " 3  datetime_col   6 non-null        timestamp[us][pyarrow]\n"
        " 4  geography_col  4 non-null        geometry\n"
        " 5  int64_col      8 non-null        Int64\n"
        " 6  int64_too      9 non-null        Int64\n"
        " 7  numeric_col    6 non-null        object\n"
        " 8  float64_col    7 non-null        Float64\n"
        " 9  rowindex_2     9 non-null        Int64\n"
        " 10  string_col     8 non-null        string\n"
        " 11  time_col       6 non-null        time64[us][pyarrow]\n"
        " 12  timestamp_col  6 non-null        timestamp[us, tz=UTC][pyarrow]\n"
        "dtypes: Float64(1), Int64(3), boolean(1), date32[day][pyarrow](1), geometry(1), object(2), string(1), time64[us][pyarrow](1), timestamp[us, tz=UTC][pyarrow](1), timestamp[us][pyarrow](1)\n"
        "memory usage: 945 bytes\n"
    )

    scalars_df, _ = scalars_dfs
    bf_result = io.StringIO()

    scalars_df.info(buf=bf_result)

    assert expected == bf_result.getvalue()


def test_drop_index(scalars_dfs):
scalars_df, scalars_pandas_df = scalars_dfs

Expand Down
11 changes: 11 additions & 0 deletions third_party/bigframes_vendored/pandas/core/config_init.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,17 @@
Instead estimated bytes processed will be shown. Dataframe and Series
objects can still be computed with methods that explicitly execute and
download results.
max_info_columns (int):
max_info_columns is used in DataFrame.info method to decide if
per column information will be printed.
max_info_rows (int or None):
``df.info()`` will usually show null-counts for each column.
For large frames this can be quite slow. ``max_info_rows`` and
``max_info_columns`` limit this null check to frames with smaller
dimensions than specified.
memory_usage (bool):
This specifies whether the memory usage of a DataFrame should be
displayed when ``df.info()`` is called. Valid values are True and False.
"""

sampling_options_doc = """
Expand Down
66 changes: 66 additions & 0 deletions third_party/bigframes_vendored/pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,72 @@ def values(self) -> np.ndarray:
"""
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

def info(
    self,
    verbose: bool | None = None,
    buf=None,
    max_cols: int | None = None,
    memory_usage: bool | None = None,
    show_counts: bool | None = None,
) -> None:
    """
    Print a concise summary of a DataFrame.

    This method prints information about a DataFrame including
    the index dtype and columns, non-null values and memory usage.

    Args:
        verbose (bool, optional):
            Whether to print the full summary. By default, the setting in
            ``pandas.options.display.max_info_columns`` is followed.
        buf (writable buffer, defaults to sys.stdout):
            Where to send the output. By default, the output is printed to
            sys.stdout. Pass a writable buffer if you need to further process
            the output.
        max_cols (int, optional):
            When to switch from the verbose to the truncated output. If the
            DataFrame has more than `max_cols` columns, the truncated output
            is used. By default, the setting in
            ``pandas.options.display.max_info_columns`` is used.
        memory_usage (bool, optional):
            Specifies whether total memory usage of the DataFrame
            elements (including the index) should be displayed. By default,
            this follows the ``pandas.options.display.memory_usage`` setting.
            True always shows memory usage. False never shows memory usage.
            Memory estimation is made based in column dtype and number of rows
            assuming values consume the same memory amount for corresponding dtypes.
        show_counts (bool, optional):
            Whether to show the non-null counts. By default, this is shown
            only if the DataFrame is smaller than
            ``pandas.options.display.max_info_rows`` and
            ``pandas.options.display.max_info_columns``. A value of True always
            shows the counts, and False never shows the counts.

    Returns:
        None: This method prints a summary of a DataFrame and returns None.
    """
    raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

def memory_usage(self, index: bool = True):
    """
    Return the memory usage of each column in bytes.

    The memory usage can optionally include the contribution of
    the index and elements of `object` dtype.

    This value is displayed in `DataFrame.info` by default. This can be
    suppressed by setting ``pandas.options.display.memory_usage`` to False.

    Args:
        index (bool, default True):
            Specifies whether to include the memory usage of the DataFrame's
            index in returned Series. If ``index=True``, the memory usage of
            the index is the first item in the output.

    Returns:
        Series: A Series whose index is the original column names and whose
        values are the memory usage of each column in bytes.
    """
    raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

# ----------------------------------------------------------------------
# IO methods (to / from other formats)
def to_numpy(self, dtype=None, copy=False, na_value=None, **kwargs) -> np.ndarray:
Expand Down

0 comments on commit 9d6613d

Please sign in to comment.