Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: warn when read_gbq / read_gbq_table uses the snapshot time cache #441

Merged
merged 2 commits into from
Mar 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions bigframes/session/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -629,6 +629,24 @@ def _get_snapshot_sql_and_primary_key(
job_config.labels["bigframes-api"] = api_name
if use_cache and table_ref in self._df_snapshot.keys():
snapshot_timestamp = self._df_snapshot[table_ref]

# Cache hit could be unexpected. See internal issue 329545805.
# Raise a warning with more information about how to avoid the
# problems with the cache.
warnings.warn(
f"Reading cached table from {snapshot_timestamp} to avoid "
"incompatibilies with previous reads of this table. To read "
"the latest version, set `use_cache=False` or close the "
"current session with Session.close() or "
"bigframes.pandas.close_session().",
# There are many layers before we get to (possibly) the user's code:
# pandas.read_gbq_table
# -> with_default_session
# -> Session.read_gbq_table
# -> _read_gbq_table
# -> _get_snapshot_sql_and_primary_key
stacklevel=6,
)
else:
snapshot_timestamp = list(
self.bqclient.query(
Expand Down
12 changes: 12 additions & 0 deletions tests/unit/resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@
"""Utilities for creating test resources."""


# Minimal single-column schema shared by the mocked Table and QueryJob below,
# so Session code that inspects schemas sees a consistent, non-empty result.
TEST_SCHEMA = (google.cloud.bigquery.SchemaField("col", "INTEGER"),)


def create_bigquery_session(
bqclient: Optional[mock.Mock] = None,
session_id: str = "abcxyz",
Expand All @@ -44,6 +47,13 @@ def create_bigquery_session(
bqclient = mock.create_autospec(google.cloud.bigquery.Client, instance=True)
bqclient.project = "test-project"

# Mock the location.
table = mock.create_autospec(google.cloud.bigquery.Table, instance=True)
table._properties = {}
type(table).location = mock.PropertyMock(return_value="test-region")
type(table).schema = mock.PropertyMock(return_value=TEST_SCHEMA)
bqclient.get_table.return_value = table

if anonymous_dataset is None:
anonymous_dataset = google.cloud.bigquery.DatasetReference(
"test-project",
Expand All @@ -61,6 +71,8 @@ def query_mock(query, *args, **kwargs):

if query.startswith("SELECT CURRENT_TIMESTAMP()"):
query_job.result = mock.MagicMock(return_value=[[datetime.datetime.now()]])
else:
type(query_job).schema = mock.PropertyMock(return_value=TEST_SCHEMA)

return query_job

Expand Down
19 changes: 19 additions & 0 deletions tests/unit/session/test_session.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import datetime
import os
import re
from unittest import mock

import google.api_core.exceptions
import google.cloud.bigquery
import pytest

import bigframes
Expand All @@ -31,6 +34,22 @@ def test_read_gbq_missing_parts(missing_parts_table_id):
session.read_gbq(missing_parts_table_id)


def test_read_gbq_cached_table():
    """Reading a table already present in the snapshot cache warns the user.

    Seeds the session's ``_df_snapshot`` cache with a fixed timestamp, then
    verifies that ``read_gbq`` (a) emits a ``UserWarning`` mentioning the
    ``use_cache=False`` escape hatch and (b) pins the generated SQL to the
    cached snapshot time rather than the current time.
    """
    session = resources.create_bigquery_session()
    cached_ref = google.cloud.bigquery.TableReference(
        google.cloud.bigquery.DatasetReference("my-project", "my_dataset"),
        "my_table",
    )
    snapshot_time = datetime.datetime(
        1999, 1, 2, 3, 4, 5, 678901, tzinfo=datetime.timezone.utc
    )
    session._df_snapshot[cached_ref] = snapshot_time

    # A cache hit must surface a warning that points at use_cache=False.
    with pytest.warns(UserWarning, match=re.escape("use_cache=False")):
        result = session.read_gbq("my-project.my_dataset.my_table")

    # The emitted SQL should reference the cached snapshot timestamp.
    assert "1999-01-02T03:04:05.678901" in result.sql


@pytest.mark.parametrize(
"not_found_table_id",
[("unknown.dataset.table"), ("project.unknown.table"), ("project.dataset.unknown")],
Expand Down