fix: Fix caching from generating row numbers in partial ordering mode (#872)
googleapis · Aug 5, 2024 · 52b7786 · 52b7786
1 parent 5317327
commit 52b7786
Show file tree

Hide file tree

Showing 3 changed files with 18 additions and 3 deletions.
diff --git a/bigframes/series.py b/bigframes/series.py
@@ -641,7 +641,7 @@ def head(self, n: int = 5) -> Series:
     def tail(self, n: int = 5) -> Series:
         return typing.cast(Series, self.iloc[-n:])
 
-    def peek(self, n: int = 5, *, force: bool = True) -> pandas.DataFrame:
+    def peek(self, n: int = 5, *, force: bool = True) -> pandas.Series:
         """
         Preview n arbitrary elements from the series without guarantees about row selection or ordering.
 

diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py
@@ -1997,8 +1997,10 @@ def _cache_with_session_awareness(self, array_value: core.ArrayValue) -> None:
         )
         if len(cluster_cols) > 0:
             self._cache_with_cluster_cols(core.ArrayValue(target), cluster_cols)
-        else:
+        elif self._strictly_ordered:
             self._cache_with_offsets(core.ArrayValue(target))
+        else:
+            self._cache_with_cluster_cols(core.ArrayValue(target), [])
 
     def _simplify_with_caching(self, array_value: core.ArrayValue):
         """Attempts to handle the complexity by caching duplicated subtrees and breaking the query into pieces."""

diff --git a/tests/system/small/test_unordered.py b/tests/system/small/test_unordered.py
@@ -19,7 +19,11 @@
 
 import bigframes.exceptions
 import bigframes.pandas as bpd
-from tests.system.utils import assert_pandas_df_equal, skip_legacy_pandas
+from tests.system.utils import (
+    assert_pandas_df_equal,
+    assert_series_equal,
+    skip_legacy_pandas,
+)
 
 
 def test_unordered_mode_sql_no_hash(unordered_session):
@@ -51,6 +55,15 @@ def test_unordered_mode_cache_aggregate(unordered_session):
     assert_pandas_df_equal(bf_result, pd_result, ignore_order=True)
 
 
+def test_unordered_mode_series_peek(unordered_session):
+    pd_series = pd.Series([1, 2, 3, 4, 5, 6], dtype=pd.Int64Dtype())
+    bf_series = bpd.Series(pd_series, session=unordered_session)
+    pd_result = pd_series.groupby(pd_series % 4).sum()
+    bf_peek = bf_series.groupby(bf_series % 4).sum().peek(2)
+
+    assert_series_equal(bf_peek, pd_result.reindex(bf_peek.index))
+
+
 def test_unordered_mode_single_aggregate(unordered_session):
     pd_df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, dtype=pd.Int64Dtype())
     bf_df = bpd.DataFrame(pd_df, session=unordered_session)