Skip to content

Commit

Permalink
fix: correctly handle null values when initializing fingerprint order…
Browse files Browse the repository at this point in the history
…ing (#210)

Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly:
- [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea.
- [ ] Ensure the tests and linter pass
- [ ] Code coverage does not decrease (if any source code was changed)
- [ ] Appropriate docs were updated (if necessary)

Fixes #<issue_number_goes_here> 🦕
  • Loading branch information
TrevorBergeron authored Nov 16, 2023
1 parent f957b27 commit 8324f13
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 3 deletions.
9 changes: 6 additions & 3 deletions bigframes/session/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -1120,8 +1120,9 @@ def _create_total_ordering(
ordering_hash_part = guid.generate_guid("bigframes_ordering_")
ordering_rand_part = guid.generate_guid("bigframes_ordering_")

# All inputs into hash must be non-null or resulting hash will be null
str_values = list(
map(lambda col: _convert_to_string(table[col]), table.columns)
map(lambda col: _convert_to_nonnull_string(table[col]), table.columns)
)
full_row_str = (
str_values[0].concat(*str_values[1:])
Expand Down Expand Up @@ -1419,7 +1420,7 @@ def _can_cluster_bq(field: bigquery.SchemaField):
)


def _convert_to_string(column: ibis_types.Column) -> ibis_types.StringColumn:
def _convert_to_nonnull_string(column: ibis_types.Column) -> ibis_types.StringValue:
col_type = column.type()
if (
col_type.is_numeric()
Expand All @@ -1436,4 +1437,6 @@ def _convert_to_string(column: ibis_types.Column) -> ibis_types.StringColumn:
# TO_JSON_STRING works with all data types, but isn't the most efficient
# Needed for JSON, STRUCT and ARRAY datatypes
result = vendored_ibis_ops.ToJsonString(column).to_expr() # type: ignore
return typing.cast(ibis_types.StringColumn, result)
# Escape backslashes and use a backslash as the delimiter
escaped = typing.cast(ibis_types.StringColumn, result.fillna("")).replace("\\", "\\\\") # type: ignore
return typing.cast(ibis_types.StringColumn, ibis.literal("\\")).concat(escaped)
8 changes: 8 additions & 0 deletions tests/system/small/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -2703,6 +2703,14 @@ def test_sample(scalars_dfs, frac, n, random_state):
assert bf_result.shape[1] == scalars_df.shape[1]


def test_sample_determinism(penguins_df_default_index):
    """Check that a seeded sample yields the same rows on repeated downloads.

    A 100-row sample is drawn with a fixed ``random_state``, trimmed to its
    first 15 rows, and then converted to pandas twice; both conversions must
    produce equal frames.
    """
    sampled = penguins_df_default_index.sample(n=100, random_state=12345)
    head_rows = sampled.head(15)

    # Convert the very same frame object twice and compare the two results.
    first_pull = head_rows.to_pandas()
    second_pull = head_rows.to_pandas()

    pandas.testing.assert_frame_equal(first_pull, second_pull)


def test_sample_raises_value_error(scalars_dfs):
scalars_df, _ = scalars_dfs
with pytest.raises(
Expand Down

0 comments on commit 8324f13

Please sign in to comment.