From adc4d9748218d8f86e5361decb445382bb5cb1d4 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Thu, 4 Apr 2024 13:03:01 -0700 Subject: [PATCH] perf: Add multi-query execution capability for complex dataframes (#427) --- bigframes/_config/compute_options.py | 6 +- bigframes/core/blocks.py | 4 + bigframes/core/expression.py | 9 ++ bigframes/core/nodes.py | 207 ++++++++++++++++++++++++++- bigframes/core/tree_properties.py | 51 ++++++- bigframes/dataframe.py | 12 +- bigframes/series.py | 10 ++ bigframes/session/__init__.py | 52 +++++++ tests/system/conftest.py | 8 ++ tests/system/small/test_dataframe.py | 50 +++++++ 10 files changed, 403 insertions(+), 6 deletions(-) diff --git a/bigframes/_config/compute_options.py b/bigframes/_config/compute_options.py index fb708b844c..2b849c558a 100644 --- a/bigframes/_config/compute_options.py +++ b/bigframes/_config/compute_options.py @@ -40,7 +40,11 @@ class ComputeOptions: bytes billed beyond this limit will fail (without incurring a charge). If unspecified, this will be set to your project default. See `maximum_bytes_billed `_. - + enable_multi_query_execution (bool, Options): + If enabled, large queries may be factored into multiple smaller queries + in order to avoid generating queries that are too complex for the query + engine to handle. However this comes at the cost of increase cost and latency. """ maximum_bytes_billed: Optional[int] = None + enable_multi_query_execution: bool = False diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 0b6e50cfa3..c7b41e93eb 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -1873,6 +1873,10 @@ def cached(self, *, optimize_offsets=False, force: bool = False) -> Block: expr = self.session._cache_with_cluster_cols( self.expr, cluster_cols=self.index_columns ) + return self.swap_array_expr(expr) + + def swap_array_expr(self, expr: core.ArrayValue) -> Block: + # TODO: Validate schema unchanged return Block( expr, index_columns=self.index_columns, diff --git a/bigframes/core/expression.py b/bigframes/core/expression.py index 8c3f52d22b..4980f5369d 100644 --- a/bigframes/core/expression.py +++ b/bigframes/core/expression.py @@ -108,6 +108,11 @@ def bind_all_variables(self, bindings: Mapping[str, Expression]) -> Expression: def is_bijective(self) -> bool: return False + @property + def is_identity(self) -> bool: + """True for identity operation that does not transform input.""" + return False + @dataclasses.dataclass(frozen=True) class ScalarConstantExpression(Expression): @@ -173,6 +178,10 @@ def bind_all_variables(self, bindings: Mapping[str, Expression]) -> Expression: def is_bijective(self) -> bool: return True + @property + def is_identity(self) -> bool: + return True + @dataclasses.dataclass(frozen=True) class OpExpression(Expression): diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py index d740605a56..a1072b0d68 100644 --- a/bigframes/core/nodes.py +++ b/bigframes/core/nodes.py @@ -15,11 +15,11 @@ from __future__ import annotations import abc -from dataclasses import dataclass, field, fields +from dataclasses import dataclass, field, fields, replace import functools import itertools import typing -from typing import Tuple +from typing import Callable, Tuple import pandas @@ -39,6 +39,10 @@ import bigframes.session +# A fixed number of variable to assume for overhead on some operations +OVERHEAD_VARIABLES = 5 + + @dataclass(frozen=True) class BigFrameNode: """ @@ -102,6 +106,60 @@ def roots(self) -> typing.Set[BigFrameNode]: def schema(self) -> 
schemata.ArraySchema: ... + @property + @abc.abstractmethod + def variables_introduced(self) -> int: + """ + Defines number of values created by the current node. Helps represent the "width" of a query + """ + ... + + @property + def relation_ops_created(self) -> int: + """ + Defines the number of relational ops generated by the current node. Used to estimate query planning complexity. + """ + return 1 + + @property + def joins(self) -> bool: + """ + Defines whether the node joins data. + """ + return False + + @functools.cached_property + def total_variables(self) -> int: + return self.variables_introduced + sum( + map(lambda x: x.total_variables, self.child_nodes) + ) + + @functools.cached_property + def total_relational_ops(self) -> int: + return self.relation_ops_created + sum( + map(lambda x: x.total_relational_ops, self.child_nodes) + ) + + @functools.cached_property + def total_joins(self) -> int: + return int(self.joins) + sum(map(lambda x: x.total_joins, self.child_nodes)) + + @property + def planning_complexity(self) -> int: + """ + Empirical heuristic measure of planning complexity. + + Used to determine when to decompose overly complex computations. May require tuning. + """ + return self.total_variables * self.total_relational_ops * (1 + self.total_joins) + + @abc.abstractmethod + def transform_children( + self, t: Callable[[BigFrameNode], BigFrameNode] + ) -> BigFrameNode: + """Apply a function to each child node.""" + ... + @dataclass(frozen=True) class UnaryNode(BigFrameNode): @@ -115,6 +173,11 @@ def child_nodes(self) -> typing.Sequence[BigFrameNode]: def schema(self) -> schemata.ArraySchema: return self.child.schema + def transform_children( + self, t: Callable[[BigFrameNode], BigFrameNode] + ) -> BigFrameNode: + return replace(self, child=t(self.child)) + @dataclass(frozen=True) class JoinNode(BigFrameNode): @@ -154,6 +217,22 @@ def join_mapping_to_schema_item(mapping: JoinColumnMapping): ) return schemata.ArraySchema(items) + @functools.cached_property + def variables_introduced(self) -> int: + """Defines the number of variables generated by the current node. Used to estimate query planning complexity.""" + return OVERHEAD_VARIABLES + + @property + def joins(self) -> bool: + return True + + def transform_children( + self, t: Callable[[BigFrameNode], BigFrameNode] + ) -> BigFrameNode: + return replace( + self, left_child=t(self.left_child), right_child=t(self.right_child) + ) + @dataclass(frozen=True) class ConcatNode(BigFrameNode): @@ -182,6 +261,16 @@ def schema(self) -> schemata.ArraySchema: ) return schemata.ArraySchema(items) + @functools.cached_property + def variables_introduced(self) -> int: + """Defines the number of variables generated by the current node. Used to estimate query planning complexity.""" + return len(self.schema.items) + OVERHEAD_VARIABLES + + def transform_children( + self, t: Callable[[BigFrameNode], BigFrameNode] + ) -> BigFrameNode: + return replace(self, children=tuple(t(child) for child in self.children)) + # Input Nodex @dataclass(frozen=True) @@ -201,6 +290,16 @@ def roots(self) -> typing.Set[BigFrameNode]: def schema(self) -> schemata.ArraySchema: return self.data_schema + @functools.cached_property + def variables_introduced(self) -> int: + """Defines the number of variables generated by the current node. 
Used to estimate query planning complexity.""" + return len(self.schema.items) + 1 + + def transform_children( + self, t: Callable[[BigFrameNode], BigFrameNode] + ) -> BigFrameNode: + return self + # TODO: Refactor to take raw gbq object reference @dataclass(frozen=True) @@ -233,6 +332,20 @@ def schema(self) -> schemata.ArraySchema: ) return schemata.ArraySchema(items) + @functools.cached_property + def variables_introduced(self) -> int: + return len(self.columns) + len(self.hidden_ordering_columns) + + @property + def relation_ops_created(self) -> int: + # Assume worst case, where readgbq actually has baked in analytic operation to generate index + return 2 + + def transform_children( + self, t: Callable[[BigFrameNode], BigFrameNode] + ) -> BigFrameNode: + return self + # Unary nodes @dataclass(frozen=True) @@ -252,6 +365,14 @@ def schema(self) -> schemata.ArraySchema: schemata.SchemaItem(self.col_id, bigframes.dtypes.INT_DTYPE) ) + @property + def relation_ops_created(self) -> int: + return 2 + + @functools.cached_property + def variables_introduced(self) -> int: + return 1 + @dataclass(frozen=True) class FilterNode(UnaryNode): @@ -264,6 +385,10 @@ def row_preserving(self) -> bool: def __hash__(self): return self._node_hash + @property + def variables_introduced(self) -> int: + return 1 + @dataclass(frozen=True) class OrderByNode(UnaryNode): @@ -281,6 +406,15 @@ def __post_init__(self): def __hash__(self): return self._node_hash + @property + def variables_introduced(self) -> int: + return 0 + + @property + def relation_ops_created(self) -> int: + # Doesnt directly create any relational operations + return 0 + @dataclass(frozen=True) class ReversedNode(UnaryNode): @@ -290,6 +424,15 @@ class ReversedNode(UnaryNode): def __hash__(self): return self._node_hash + @property + def variables_introduced(self) -> int: + return 0 + + @property + def relation_ops_created(self) -> int: + # Doesnt directly create any relational operations + return 0 + @dataclass(frozen=True) class ProjectionNode(UnaryNode): @@ -315,6 +458,12 @@ def schema(self) -> schemata.ArraySchema: ) return schemata.ArraySchema(items) + @property + def variables_introduced(self) -> int: + # ignore passthrough expressions + new_vars = sum(1 for i in self.assignments if not i[0].is_identity) + return new_vars + # TODO: Merge RowCount into Aggregate Node? # Row count can be compute from table metadata sometimes, so it is a bit special. 
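Editorial note: the complexity properties introduced above compose recursively, with each node contributing its own variable and relational-op counts and `planning_complexity` multiplying the tree-wide totals. A minimal sketch of how the heuristic rolls up for a join of two table scans; the per-scan numbers below are made-up illustrations, only `OVERHEAD_VARIABLES = 5`, the read node's worst-case 2 relational ops, and the `1e7` session limit come from this patch:

```python
# planning_complexity = total_variables * total_relational_ops * (1 + total_joins),
# where each total is the node's own contribution plus the sum over its children.
def planning_complexity(variables: int, rel_ops: int, joins: int) -> int:
    return variables * rel_ops * (1 + joins)

# Join of two scans. Assume each scan introduces 10 variables (columns plus
# hidden ordering columns) and 2 relational ops (ReadGbqNode's worst case);
# the join adds the fixed OVERHEAD_VARIABLES = 5 and one relational op.
total_variables = 10 + 10 + 5
total_relational_ops = 2 + 2 + 1
total_joins = 1

print(planning_complexity(total_variables, total_relational_ops, total_joins))
# -> 25 * 5 * 2 = 250, far below the QUERY_COMPLEXITY_LIMIT of 1e7 used by the session
```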
@@ -334,6 +483,10 @@ def schema(self) -> schemata.ArraySchema: (schemata.SchemaItem("count", bigframes.dtypes.INT_DTYPE),) ) + @property + def variables_introduced(self) -> int: + return 1 + @dataclass(frozen=True) class AggregateNode(UnaryNode): @@ -367,6 +520,10 @@ def schema(self) -> schemata.ArraySchema: ) return schemata.ArraySchema(tuple([*by_items, *agg_items])) + @property + def variables_introduced(self) -> int: + return len(self.aggregations) + len(self.by_column_ids) + @dataclass(frozen=True) class WindowOpNode(UnaryNode): @@ -396,12 +553,31 @@ def schema(self) -> schemata.ArraySchema: schemata.SchemaItem(self.output_name, new_item_dtype) ) + @property + def variables_introduced(self) -> int: + return 1 + + @property + def relation_ops_created(self) -> int: + # Assume that if not reprojecting, that there is a sequence of window operations sharing the same window + return 0 if self.skip_reproject_unsafe else 4 + +# TODO: Remove this op @dataclass(frozen=True) class ReprojectOpNode(UnaryNode): def __hash__(self): return self._node_hash + @property + def variables_introduced(self) -> int: + return 0 + + @property + def relation_ops_created(self) -> int: + # This op is not a real transformation, just a hint to the sql generator + return 0 + @dataclass(frozen=True) class UnpivotNode(UnaryNode): @@ -428,6 +604,10 @@ def row_preserving(self) -> bool: def non_local(self) -> bool: return True + @property + def joins(self) -> bool: + return True + @functools.cached_property def schema(self) -> schemata.ArraySchema: def infer_dtype( @@ -469,6 +649,17 @@ def infer_dtype( ] return schemata.ArraySchema((*index_items, *value_items, *passthrough_items)) + @property + def variables_introduced(self) -> int: + return ( + len(self.schema.items) - len(self.passthrough_columns) + OVERHEAD_VARIABLES + ) + + @property + def relation_ops_created(self) -> int: + # Unpivot is essentially a cross join and a projection. + return 2 + @dataclass(frozen=True) class RandomSampleNode(UnaryNode): @@ -485,6 +676,10 @@ def row_preserving(self) -> bool: def __hash__(self): return self._node_hash + @property + def variables_introduced(self) -> int: + return 1 + @dataclass(frozen=True) class ExplodeNode(UnaryNode): @@ -511,3 +706,11 @@ def schema(self) -> schemata.ArraySchema: for name in self.child.schema.names ) return schemata.ArraySchema(items) + + @property + def relation_ops_created(self) -> int: + return 3 + + @functools.cached_property + def variables_introduced(self) -> int: + return len(self.column_ids) + 1 diff --git a/bigframes/core/tree_properties.py b/bigframes/core/tree_properties.py index bc29f115f6..125a7e6bff 100644 --- a/bigframes/core/tree_properties.py +++ b/bigframes/core/tree_properties.py @@ -11,12 +11,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import annotations +import functools +import itertools +from typing import Dict import bigframes.core.nodes as nodes -# TODO: Convert these functions to iterative or enforce hard limit on tree depth. The below algorithms can cause stack to exceed limit. 
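Editorial note: `transform_children` gives every node a uniform way to rebuild itself with rewritten children, which is what lets the tree utilities below swap a cached table in for a repeated subtree. A rough toy illustration of that shape (the node names here are invented for the example, not part of the patch):

```python
from __future__ import annotations

from dataclasses import dataclass, replace
from typing import Callable, Optional


@dataclass(frozen=True)
class ToyNode:
    name: str
    child: Optional[ToyNode] = None

    def transform_children(self, t: Callable[[ToyNode], ToyNode]) -> ToyNode:
        # Mirror UnaryNode: rebuild this node with its child run through t.
        return self if self.child is None else replace(self, child=t(self.child))


expensive = ToyNode("expensive_subtree")
root = ToyNode("filter", child=ToyNode("project", child=expensive))
cached_stub = ToyNode("cached_table")


def substitute(n: ToyNode) -> ToyNode:
    # Swap the expensive subtree for its materialized stand-in, else recurse --
    # the same shape as tree_properties.replace_nodes further down in this diff.
    return cached_stub if n == expensive else n.transform_children(substitute)


print(substitute(root))
# ToyNode(name='filter', child=ToyNode(name='project', child=ToyNode(name='cached_table', child=None)))
```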
- def is_trivially_executable(node: nodes.BigFrameNode) -> bool: if local_only(node): @@ -36,3 +38,48 @@ def peekable(node: nodes.BigFrameNode) -> bool: children_peekable = all(peekable(child) for child in node.child_nodes) self_peekable = not node.non_local return children_peekable and self_peekable + + +def count_complex_nodes( + root: nodes.BigFrameNode, min_complexity: float, max_complexity: float +) -> Dict[nodes.BigFrameNode, int]: + @functools.cache + def _node_counts_inner( + subtree: nodes.BigFrameNode, + ) -> Dict[nodes.BigFrameNode, int]: + """Helper function to count occurences of duplicate nodes in a subtree. Considers only nodes in a complexity range""" + empty_counts: Dict[nodes.BigFrameNode, int] = {} + if subtree.planning_complexity >= min_complexity: + child_counts = [_node_counts_inner(child) for child in subtree.child_nodes] + node_counts = functools.reduce(_combine_counts, child_counts, empty_counts) + if subtree.planning_complexity <= max_complexity: + return _combine_counts(node_counts, {subtree: 1}) + else: + return node_counts + return empty_counts + + return _node_counts_inner(root) + + +def replace_nodes( + root: nodes.BigFrameNode, + to_replace: nodes.BigFrameNode, + replacemenet: nodes.BigFrameNode, +): + @functools.cache + def apply_substition(n: nodes.BigFrameNode) -> nodes.BigFrameNode: + if n == to_replace: + return replacemenet + else: + return n.transform_children(apply_substition) + + return root.transform_children(apply_substition) + + +def _combine_counts( + left: Dict[nodes.BigFrameNode, int], right: Dict[nodes.BigFrameNode, int] +) -> Dict[nodes.BigFrameNode, int]: + return { + key: left.get(key, 0) + right.get(key, 0) + for key in itertools.chain(left.keys(), right.keys()) + } diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 9687bc7ff1..0ac9beb9ad 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1123,7 +1123,7 @@ def to_pandas( downsampled rows and all columns of this DataFrame. """ # TODO(orrbradford): Optimize this in future. Potentially some cases where we can return the stored query job - + self._optimize_query_complexity() df, query_job = self._block.to_pandas( max_download_size=max_download_size, sampling_method=sampling_method, @@ -1135,6 +1135,7 @@ def to_pandas( def to_pandas_batches(self) -> Iterable[pandas.DataFrame]: """Stream DataFrame results to an iterable of pandas DataFrame""" + self._optimize_query_complexity() return self._block.to_pandas_batches() def _compute_dry_run(self) -> bigquery.QueryJob: @@ -3133,6 +3134,7 @@ def _run_io_query( """Executes a query job presenting this dataframe and returns the destination table.""" session = self._block.expr.session + self._optimize_query_complexity() export_array, id_overrides = self._prepare_export( index=index, ordering_id=ordering_id ) @@ -3269,6 +3271,14 @@ def _cached(self, *, force: bool = False) -> DataFrame: self._set_block(self._block.cached(force=force)) return self + def _optimize_query_complexity(self): + """Reduce query complexity by caching repeated subtrees and recursively materializing maximum-complexity subtrees. + May generate many queries and take substantial time to execute. 
+ """ + # TODO: Move all this to session + new_expr = self._session._simplify_with_caching(self._block.expr) + self._set_block(self._block.swap_array_expr(new_expr)) + _DataFrameOrSeries = typing.TypeVar("_DataFrameOrSeries") def dot(self, other: _DataFrameOrSeries) -> _DataFrameOrSeries: diff --git a/bigframes/series.py b/bigframes/series.py index e4d48904b0..185891bc01 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -176,6 +176,7 @@ def __len__(self): return self.shape[0] def __iter__(self) -> typing.Iterator: + self._optimize_query_complexity() return itertools.chain.from_iterable( map(lambda x: x.squeeze(axis=1), self._block.to_pandas_batches()) ) @@ -328,6 +329,7 @@ def to_pandas( pandas.Series: A pandas Series with all rows of this Series if the data_sampling_threshold_mb is not exceeded; otherwise, a pandas Series with downsampled rows of the DataFrame. """ + self._optimize_query_complexity() df, query_job = self._block.to_pandas( max_download_size=max_download_size, sampling_method=sampling_method, @@ -1603,6 +1605,14 @@ def _cached(self, *, force: bool = True) -> Series: self._set_block(self._block.cached(force=force)) return self + def _optimize_query_complexity(self): + """Reduce query complexity by caching repeated subtrees and recursively materializing maximum-complexity subtrees. + May generate many queries and take substantial time to execute. + """ + # TODO: Move all this to session + new_expr = self._block.session._simplify_with_caching(self._block.expr) + self._set_block(self._block.swap_array_expr(new_expr)) + def _is_list_like(obj: typing.Any) -> typing_extensions.TypeGuard[typing.Sequence]: return pandas.api.types.is_list_like(obj) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 671a3d65e7..354352f1c9 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -80,6 +80,7 @@ import bigframes.core.blocks as blocks import bigframes.core.compile import bigframes.core.guid as guid +import bigframes.core.nodes as nodes from bigframes.core.ordering import IntegerEncoding import bigframes.core.ordering as order import bigframes.core.tree_properties as traversals @@ -120,6 +121,11 @@ # Also must assume that text encoding as literals is much less efficient than in-memory representation. MAX_INLINE_DF_BYTES = 5000 +# Max complexity that should be executed as a single query +QUERY_COMPLEXITY_LIMIT = 1e7 +# Number of times to factor out subqueries before giving up. 
+MAX_SUBTREE_FACTORINGS = 5 + logger = logging.getLogger(__name__) # Excludes geography, bytes, and nested (array, struct) datatypes @@ -1851,6 +1857,52 @@ def _cache_with_offsets(self, array_value: core.ArrayValue) -> core.ArrayValue: ordering=order.ExpressionOrdering.from_offset_col("bigframes_offsets"), ) + def _simplify_with_caching(self, array_value: core.ArrayValue) -> core.ArrayValue: + """Attempts to handle the complexity by caching duplicated subtrees and breaking the query into pieces.""" + if not bigframes.options.compute.enable_multi_query_execution: + return array_value + node = array_value.node + if node.planning_complexity < QUERY_COMPLEXITY_LIMIT: + return array_value + + for _ in range(MAX_SUBTREE_FACTORINGS): + updated = self._cache_most_complex_subtree(node) + if updated is None: + return core.ArrayValue(node) + else: + node = updated + + return core.ArrayValue(node) + + def _cache_most_complex_subtree( + self, node: nodes.BigFrameNode + ) -> Optional[nodes.BigFrameNode]: + # TODO: If query fails, retry with lower complexity limit + valid_candidates = traversals.count_complex_nodes( + node, + min_complexity=(QUERY_COMPLEXITY_LIMIT / 500), + max_complexity=QUERY_COMPLEXITY_LIMIT, + ).items() + # Heuristic: subtree_compleixty * (copies of subtree)^2 + best_candidate = max( + valid_candidates, + key=lambda i: i[0].planning_complexity + (i[1] ** 2), + default=None, + ) + + if best_candidate is None: + # No good subtrees to cache, just return original tree + return None + + # TODO: Add clustering columns based on access patterns + materialized = self._cache_with_cluster_cols( + core.ArrayValue(best_candidate[0]), [] + ).node + + return traversals.replace_nodes( + node, to_replace=best_candidate[0], replacemenet=materialized + ) + def _is_trivially_executable(self, array_value: core.ArrayValue): """ Can the block be evaluated very cheaply? 
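Editorial note: the session-level subtree factoring above is gated by the new compute option, so end users opt in explicitly. A rough sketch of how it could be enabled and exercised; the project, table, and column identifiers are placeholders, not taken from the patch:

```python
import bigframes
import bigframes.pandas as bpd

# Opt in: overly complex query plans may now be split into several smaller
# queries, trading extra bytes scanned and latency for executability.
bigframes.options.compute.enable_multi_query_execution = True

# Hypothetical workload that would otherwise exceed QUERY_COMPLEXITY_LIMIT.
df = bpd.read_gbq("my-project.my_dataset.my_table")  # placeholder table id
for _ in range(6):
    df = df.merge(df, on="id_col").head(30)  # "id_col" is a made-up column name

result = df.to_pandas()  # may invoke _simplify_with_caching behind the scenes
```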
diff --git a/tests/system/conftest.py b/tests/system/conftest.py index a108ff4a8e..70ff6eee39 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -953,6 +953,14 @@ def restore_sampling_settings(): bigframes.options.sampling.max_download_size = max_download_size +@pytest.fixture() +def with_multiquery_execution(): + original_setting = bigframes.options.compute.enable_multi_query_execution + bigframes.options.compute.enable_multi_query_execution = True + yield + bigframes.options.compute.enable_multi_query_execution = original_setting + + @pytest.fixture() def weird_strings_pd(): df = pd.DataFrame( diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index b15447e450..9ce46878a9 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -4173,6 +4173,56 @@ def test_recursion_limit(scalars_df_index): scalars_df_index.to_pandas() +def test_query_complexity_repeated_joins( + scalars_df_index, scalars_pandas_df_index, with_multiquery_execution +): + pd_df = scalars_pandas_df_index + bf_df = scalars_df_index + for _ in range(6): + # recursively join, resuling in 2^6 - 1 = 63 joins + pd_df = pd_df.merge(pd_df, on="int64_col").head(30) + pd_df = pd_df[pd_df.columns[:20]] + bf_df = bf_df.merge(bf_df, on="int64_col").head(30) + bf_df = bf_df[bf_df.columns[:20]] + + bf_result = bf_df.to_pandas() + pd_result = pd_df + assert_pandas_df_equal(bf_result, pd_result, check_index_type=False) + + +def test_query_complexity_repeated_subtrees( + scalars_df_index, scalars_pandas_df_index, with_multiquery_execution +): + # Recursively union the data, if fully inlined has 10^5 identical root tables. + pd_df = scalars_pandas_df_index + bf_df = scalars_df_index + for _ in range(5): + pd_df = pd.concat(10 * [pd_df]).head(5) + bf_df = bigframes.pandas.concat(10 * [bf_df]).head(5) + bf_result = bf_df.to_pandas() + pd_result = pd_df + assert_pandas_df_equal(bf_result, pd_result) + + +@pytest.mark.skipif( + sys.version_info >= (3, 12), + # See: https://github.com/python/cpython/issues/112282 + reason="setrecursionlimit has no effect on the Python C stack since Python 3.12.", +) +def test_query_complexity_repeated_analytic( + scalars_df_index, scalars_pandas_df_index, with_multiquery_execution +): + bf_df = scalars_df_index[["int64_col", "int64_too"]] + pd_df = scalars_pandas_df_index[["int64_col", "int64_too"]] + # Uses LAG analytic operator, each in a new SELECT + for _ in range(50): + bf_df = bf_df.diff() + pd_df = pd_df.diff() + bf_result = bf_df.to_pandas() + pd_result = pd_df + assert_pandas_df_equal(bf_result, pd_result) + + def test_to_pandas_downsampling_option_override(session): df = session.read_gbq("bigframes-dev.bigframes_tests_sys.batting") download_size = 1