Skip to content

Commit

Permalink
fix: warn when read_gbq / read_gbq_table uses the snapshot time c…
Browse files Browse the repository at this point in the history
…ache (#441)
  • Loading branch information
tswast authored Mar 15, 2024
1 parent fde339b commit e16a8c0
Show file tree
Hide file tree
Showing 3 changed files with 49 additions and 0 deletions.
18 changes: 18 additions & 0 deletions bigframes/session/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -629,6 +629,24 @@ def _get_snapshot_sql_and_primary_key(
job_config.labels["bigframes-api"] = api_name
if use_cache and table_ref in self._df_snapshot.keys():
snapshot_timestamp = self._df_snapshot[table_ref]

# Cache hit could be unexpected. See internal issue 329545805.
# Raise a warning with more information about how to avoid the
# problems with the cache.
warnings.warn(
f"Reading cached table from {snapshot_timestamp} to avoid "
"incompatibilies with previous reads of this table. To read "
"the latest version, set `use_cache=False` or close the "
"current session with Session.close() or "
"bigframes.pandas.close_session().",
# There are many layers before we get to (possibly) the user's code:
# pandas.read_gbq_table
# -> with_default_session
# -> Session.read_gbq_table
# -> _read_gbq_table
# -> _get_snapshot_sql_and_primary_key
stacklevel=6,
)
else:
snapshot_timestamp = list(
self.bqclient.query(
Expand Down
12 changes: 12 additions & 0 deletions tests/unit/resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@
"""Utilities for creating test resources."""


TEST_SCHEMA = (google.cloud.bigquery.SchemaField("col", "INTEGER"),)


def create_bigquery_session(
bqclient: Optional[mock.Mock] = None,
session_id: str = "abcxyz",
Expand All @@ -44,6 +47,13 @@ def create_bigquery_session(
bqclient = mock.create_autospec(google.cloud.bigquery.Client, instance=True)
bqclient.project = "test-project"

# Mock the location.
table = mock.create_autospec(google.cloud.bigquery.Table, instance=True)
table._properties = {}
type(table).location = mock.PropertyMock(return_value="test-region")
type(table).schema = mock.PropertyMock(return_value=TEST_SCHEMA)
bqclient.get_table.return_value = table

if anonymous_dataset is None:
anonymous_dataset = google.cloud.bigquery.DatasetReference(
"test-project",
Expand All @@ -61,6 +71,8 @@ def query_mock(query, *args, **kwargs):

if query.startswith("SELECT CURRENT_TIMESTAMP()"):
query_job.result = mock.MagicMock(return_value=[[datetime.datetime.now()]])
else:
type(query_job).schema = mock.PropertyMock(return_value=TEST_SCHEMA)

return query_job

Expand Down
19 changes: 19 additions & 0 deletions tests/unit/session/test_session.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import datetime
import os
import re
from unittest import mock

import google.api_core.exceptions
import google.cloud.bigquery
import pytest

import bigframes
Expand All @@ -31,6 +34,22 @@ def test_read_gbq_missing_parts(missing_parts_table_id):
session.read_gbq(missing_parts_table_id)


def test_read_gbq_cached_table():
session = resources.create_bigquery_session()
table_ref = google.cloud.bigquery.TableReference(
google.cloud.bigquery.DatasetReference("my-project", "my_dataset"),
"my_table",
)
session._df_snapshot[table_ref] = datetime.datetime(
1999, 1, 2, 3, 4, 5, 678901, tzinfo=datetime.timezone.utc
)

with pytest.warns(UserWarning, match=re.escape("use_cache=False")):
df = session.read_gbq("my-project.my_dataset.my_table")

assert "1999-01-02T03:04:05.678901" in df.sql


@pytest.mark.parametrize(
"not_found_table_id",
[("unknown.dataset.table"), ("project.unknown.table"), ("project.dataset.unknown")],
Expand Down

0 comments on commit e16a8c0

Please sign in to comment.