Skip to content

Commit

Permalink
feat: define list accessor for bigframes Series
Browse files Browse the repository at this point in the history
  • Loading branch information
sycai committed Sep 4, 2024
1 parent 2b0f0fa commit a7f2e70
Show file tree
Hide file tree
Showing 6 changed files with 235 additions and 21 deletions.
37 changes: 37 additions & 0 deletions bigframes/operations/_op_converters.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import bigframes.operations as ops


def convert_index(key: int) -> ops.ArrayIndexOp:
    """Translate an integer subscript into an array index operation.

    Args:
        key: Position requested by the caller; must not be negative.

    Returns:
        An ``ops.ArrayIndexOp`` constructed with ``index=key``.

    Raises:
        NotImplementedError: If ``key`` is negative.
    """
    is_negative = key < 0
    if is_negative:
        raise NotImplementedError("Negative indexing is not supported.")

    index_op = ops.ArrayIndexOp(index=key)
    return index_op


def convert_slice(key: slice) -> ops.ArraySliceOp:
    """Translate a Python ``slice`` into an array slice operation.

    Args:
        key: Slice requested by the caller. Its step must be ``None`` or 1,
            and neither bound may be negative.

    Returns:
        An ``ops.ArraySliceOp`` whose ``start`` defaults to 0 when the slice
        has no start; ``stop`` and ``step`` are passed through unchanged.

    Raises:
        NotImplementedError: If the step is neither ``None`` nor 1, or if
            either bound is negative.
    """
    begin, end = key.start, key.stop

    if key.step is not None and key.step != 1:
        raise NotImplementedError(f"Only a step of 1 is allowed, got {key.step}")

    # Start is examined before stop, and only bounds that are present are
    # compared against zero.
    if any(bound is not None and bound < 0 for bound in (begin, end)):
        raise NotImplementedError("Slicing with negative numbers is not allowed.")

    effective_start = 0 if begin is None else begin
    return ops.ArraySliceOp(start=effective_start, stop=end, step=key.step)
43 changes: 43 additions & 0 deletions bigframes/operations/lists.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

from typing import Union

import bigframes_vendored.pandas.core.arrays.arrow.accessors as vendoracessors

from bigframes.core import log_adapter
import bigframes.operations as ops
from bigframes.operations._op_converters import convert_index, convert_slice
import bigframes.operations.base
import bigframes.series as series


@log_adapter.class_logger
class ListAccessor(
    bigframes.operations.base.SeriesMethods, vendoracessors.ListAccessor
):
    # The class docstring is copied from the vendored accessor. The methods
    # below are documented with comments rather than docstrings so that they
    # define no ``__doc__`` of their own.
    __doc__ = vendoracessors.ListAccessor.__doc__

    def len(self):
        # Apply the length operation to every value of the series.
        length_op = ops.len_op
        return self._apply_unary_op(length_op)

    def __getitem__(self, key: Union[int, slice]) -> series.Series:
        # Build the operation matching the key type first, then apply it once.
        # The converters raise NotImplementedError for keys they reject
        # (negative positions, steps other than 1).
        if isinstance(key, slice):
            op = convert_slice(key)
        elif isinstance(key, int):
            op = convert_index(key)
        else:
            raise ValueError(f"key must be an int or slice, got {type(key).__name__}")
        return self._apply_unary_op(op)
24 changes: 3 additions & 21 deletions bigframes/operations/strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from bigframes.core import log_adapter
import bigframes.dataframe as df
import bigframes.operations as ops
from bigframes.operations._op_converters import convert_index, convert_slice
import bigframes.operations.base
import bigframes.series as series

Expand All @@ -40,28 +41,9 @@ class StringMethods(bigframes.operations.base.SeriesMethods, vendorstr.StringMet

def __getitem__(self, key: Union[int, slice]) -> series.Series:
if isinstance(key, int):
if key < 0:
raise NotImplementedError("Negative indexing is not supported.")
return self._apply_unary_op(ops.ArrayIndexOp(index=key))
return self._apply_unary_op(convert_index(key))
elif isinstance(key, slice):
if key.step is not None and key.step != 1:
raise NotImplementedError(
f"Only a step of 1 is allowed, got {key.step}"
)
if (key.start is not None and key.start < 0) or (
key.stop is not None and key.stop < 0
):
raise NotImplementedError(
"Slicing with negative numbers is not allowed."
)

return self._apply_unary_op(
ops.ArraySliceOp(
start=key.start if key.start is not None else 0,
stop=key.stop,
step=key.step,
)
)
return self._apply_unary_op(convert_slice(key))
else:
raise ValueError(f"key must be an int or slice, got {type(key).__name__}")

Expand Down
5 changes: 5 additions & 0 deletions bigframes/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@
import bigframes.operations.aggregations as agg_ops
import bigframes.operations.base
import bigframes.operations.datetimes as dt
import bigframes.operations.lists as lists
import bigframes.operations.plotting as plotting
import bigframes.operations.strings as strings
import bigframes.operations.structs as structs
Expand Down Expand Up @@ -161,6 +162,10 @@ def query_job(self) -> Optional[bigquery.QueryJob]:
def struct(self) -> structs.StructAccessor:
return structs.StructAccessor(self._block)

@property
def list(self) -> lists.ListAccessor:
    # Wrap this series' block in the list accessor, which provides the
    # per-element list operations (``len`` and indexing/slicing).
    accessor = lists.ListAccessor(self._block)
    return accessor

@property
@validations.requires_ordering()
def T(self) -> Series:
Expand Down
74 changes: 74 additions & 0 deletions tests/system/small/operations/test_lists.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import pandas as pd
import pyarrow as pa
import pytest

import bigframes.pandas as bpd

from ...utils import assert_series_equal


@pytest.mark.parametrize(
    "key",
    [
        pytest.param(0, id="int"),
        pytest.param(slice(None, None, None), id="default_start_slice"),
        pytest.param(slice(0, None, 1), id="default_stop_slice"),
        pytest.param(slice(0, 2, None), id="default_step_slice"),
    ],
)
def test_getitem(key):
    """Check that ``Series.list[key]`` matches pandas for supported keys."""
    list_dtype = pd.ArrowDtype(pa.list_(pa.int64()))
    rows = [[1], [2, 3], [4, 5, 6]]
    bf_series = bpd.Series(rows, dtype=list_dtype)
    pd_series = pd.Series(rows, dtype=list_dtype)

    actual = bf_series.list[key].to_pandas()
    expected = pd_series.list[key]

    # Only values are compared; value dtype and index type may differ
    # between the two libraries.
    assert_series_equal(expected, actual, check_dtype=False, check_index_type=False)


@pytest.mark.parametrize(
    ("key", "expectation"),
    [
        # Negative index
        (-1, pytest.raises(NotImplementedError)),
        # Slice with negative start
        (slice(-1, None, None), pytest.raises(NotImplementedError)),
        # Slice with negative end
        (slice(0, -1, None), pytest.raises(NotImplementedError)),
        # Slice with step not equal to 1
        (slice(0, 2, 2), pytest.raises(NotImplementedError)),
    ],
)
def test_getitem_notsupported(key, expectation):
    """Check that unsupported keys are rejected by ``Series.list[key]``.

    Each case pairs a key with the ``pytest.raises`` context that the
    subscript is expected to trigger.
    """
    data = [[1], [2, 3], [4, 5, 6]]
    s = bpd.Series(data, dtype=pd.ArrowDtype(pa.list_(pa.int64())))

    # Only the subscript belongs inside the context manager: comparing its
    # result with the ExceptionInfo object asserted nothing, and would hide a
    # missing exception behind an unrelated comparison error.
    with expectation:
        s.list[key]


def test_len():
    """Check that ``Series.list.len()`` matches pandas, including empty lists."""
    list_dtype = pd.ArrowDtype(pa.list_(pa.int64()))
    rows = [[], [1], [1, 2], [1, 2, 3]]
    bf_series = bpd.Series(rows, dtype=list_dtype)
    pd_series = pd.Series(rows, dtype=list_dtype)

    actual = bf_series.list.len().to_pandas()
    expected = pd_series.list.len()

    # Only values are compared; value dtype and index type may differ
    # between the two libraries.
    assert_series_equal(expected, actual, check_dtype=False, check_index_type=False)
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,79 @@
from bigframes import constants


class ListAccessor:
    """
    Accessor object for list data properties of the Series values.
    """

    def len(self):
        """
        Return the length of each list in the Series.

        Returns
        -------
        Series
            The length of each list.

        See Also
        --------
        str.len : Python built-in function returning the length of an object.
        Series.size : Returns the length of the Series.
        StringMethods.len : Compute the length of each element in the Series/Index.

        Examples
        --------
        >>> import bigframes.pandas as bpd
        >>> import pandas as pd
        >>> import pyarrow as pa
        >>> bpd.options.display.progress_bar = None
        >>> s = bpd.Series(
        ...     [
        ...         [1, 2, 3],
        ...         [3],
        ...     ],
        ...     dtype=pd.ArrowDtype(pa.list_(pa.int64())),
        ... )
        >>> s.list.len()
        0    3
        1    1
        dtype: int32[pyarrow]
        """
        # Interface declaration only: this vendored method always raises.
        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

    def __getitem__(self, key: int | slice):
        """
        Index or slice lists in the Series.

        Parameters
        ----------
        key : int | slice
            Index or slice of indices to access from each list.

        Returns
        -------
        Series
            The element or slice taken from each list at the requested
            position.

        Examples
        --------
        >>> import bigframes.pandas as bpd
        >>> import pandas as pd
        >>> import pyarrow as pa
        >>> bpd.options.display.progress_bar = None
        >>> s = bpd.Series(
        ...     [
        ...         [1, 2, 3],
        ...         [3],
        ...     ],
        ...     dtype=pd.ArrowDtype(pa.list_(pa.int64())),
        ... )
        >>> s.list[0]
        0    1
        1    3
        dtype: int64[pyarrow]
        """
        # Interface declaration only: this vendored method always raises.
        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)


class StructAccessor:
"""
Accessor object for structured data properties of the Series values.
Expand Down

0 comments on commit a7f2e70

Please sign in to comment.