Skip to content

Commit

Permalink
feat: bigframes.bigquery.json_extract (#868)
Browse files Browse the repository at this point in the history
* feat: bigframes.bigquery.json_extract

* fixing tests
  • Loading branch information
chelsea-lin authored Aug 7, 2024
1 parent 8c352ce commit 3dbf84b
Show file tree
Hide file tree
Showing 4 changed files with 90 additions and 0 deletions.
35 changes: 35 additions & 0 deletions bigframes/bigquery/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,41 @@ def json_set(
return series


def json_extract(
    series: series.Series,
    json_path: str,
) -> series.Series:
    """Pull the value located at ``json_path`` out of each JSON element.

    The extracted value is returned as a SQL JSON-formatted `STRING` or `JSON`
    value. Single quotes and brackets are used to escape invalid JSONPath
    characters in JSON keys.

    **Examples:**

        >>> import bigframes.pandas as bpd
        >>> import bigframes.bigquery as bbq
        >>> bpd.options.display.progress_bar = None

        >>> s = bpd.Series(['{"class": {"students": [{"id": 5}, {"id": 12}]}}'])
        >>> bbq.json_extract(s, json_path="$.class")
        0    "{\\\"students\\\":[{\\\"id\\\":5},{\\\"id\\\":12}]}"
        dtype: string

    Args:
        series (bigframes.series.Series):
            The Series containing JSON data (as native JSON objects or
            JSON-formatted strings).
        json_path (str):
            The JSON path identifying the data that you want to obtain from
            the input.

    Returns:
        bigframes.series.Series: A new Series with the JSON or JSON-formatted
        STRING.
    """
    # Build the unary op first, then let the Series apply it element-wise.
    extract_op = ops.JSONExtract(json_path=json_path)
    return series._apply_unary_op(extract_op)


# Search functions defined from
# https://cloud.google.com/bigquery/docs/reference/standard-sql/search_functions


def vector_search(
base_table: str,
column_to_search: str,
Expand Down
12 changes: 12 additions & 0 deletions bigframes/core/compile/scalar_op_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -922,6 +922,11 @@ def json_set_op_impl(x: ibis_types.Value, y: ibis_types.Value, op: ops.JSONSet):
).to_expr()


@scalar_op_compiler.register_unary_op(ops.JSONExtract, pass_op=True)
def json_extract_op_impl(x: ibis_types.Value, op: ops.JSONExtract):
    """Compile a ``JSONExtract`` op by delegating to the ``json_extract`` builtin.

    Args:
        x: The ibis value holding the JSON input.
        op: The op instance; its ``json_path`` is forwarded to the builtin.

    Returns:
        The expression produced by calling ``json_extract`` on ``x``.
    """
    path = op.json_path
    return json_extract(json_obj=x, json_path=path)


### Binary Ops
def short_circuit_nulls(type_override: typing.Optional[ibis_dtypes.DataType] = None):
"""Wraps a binary operator to generate nulls of the expected type if either input is a null scalar."""
Expand Down Expand Up @@ -1549,6 +1554,13 @@ def json_set(
"""Produces a new SQL JSON value with the specified JSON data inserted or replaced."""


@ibis.udf.scalar.builtin(name="json_extract")
def json_extract(
    json_obj: ibis_dtypes.JSON,
    json_path: ibis_dtypes.str,
) -> ibis_dtypes.JSON:
    """Extracts a JSON value and converts it to a SQL JSON-formatted STRING or JSON value.

    Declaration only: the function has no Python body and is registered as the
    builtin named ``json_extract`` through the decorator.
    """


@ibis.udf.scalar.builtin(name="ML.DISTANCE")
def vector_distance(
    vector1,
    vector2,
    type: str,
) -> ibis_dtypes.Float64:
    """Computes the distance between two vectors using specified type ("EUCLIDEAN", "MANHATTAN", or "COSINE").

    Declaration only: the function has no Python body and is registered as the
    builtin named ``ML.DISTANCE`` through the decorator.
    """
16 changes: 16 additions & 0 deletions bigframes/operations/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -602,6 +602,22 @@ def output_type(self, *input_types):
return dtypes.STRING_DTYPE


## JSON Ops
@dataclasses.dataclass(frozen=True)
class JSONExtract(UnaryOp):
    """Unary op carrying the JSON path to extract from a JSON-like input."""

    name: typing.ClassVar[str] = "json_extract"
    # JSONPath expression selecting the value to extract.
    json_path: str

    def output_type(self, *input_types):
        """Return the input dtype unchanged after checking it is JSON-like.

        Raises:
            TypeError: If the first input type is not JSON-like according to
                ``dtypes.is_json_like``.
        """
        operand_type = input_types[0]
        # Happy path first: JSON-like inputs keep their own dtype.
        if dtypes.is_json_like(operand_type):
            return operand_type
        raise TypeError(
            "Input type must be an valid JSON object or JSON-formatted string type."
            + f" Received type: {operand_type}"
        )


# Binary Ops
fillna_op = create_binary_op(name="fillna", type_signature=op_typing.COERCE)
maximum_op = create_binary_op(name="maximum", type_signature=op_typing.COERCE)
Expand Down
27 changes: 27 additions & 0 deletions tests/system/small/bigquery/test_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,3 +110,30 @@ def test_json_set_w_invalid_value_type():
def test_json_set_w_invalid_series_type():
    # json_set is expected to raise TypeError when handed an integer Series.
    pairs = [("$.a", 1)]
    with pytest.raises(TypeError):
        bbq.json_set(bpd.Series([1, 2]), json_path_value_pairs=pairs)


def test_json_extract_from_json():
    # Rows cover a nested array, a missing key, and a scalar at "$.a.b".
    source = _get_series_from_json(
        [{"a": {"b": [1, 2]}}, {"a": {"c": 1}}, {"a": {"b": 0}}]
    )
    got = bbq.json_extract(source, "$.a.b")
    # After the introduction of the JSON type, the output should be a JSON-formatted series.
    want = _get_series_from_json(["[1,2]", None, "0"])

    got_pd = got.to_pandas()
    want_pd = want.to_pandas()
    pd.testing.assert_series_equal(got_pd, want_pd)


def test_json_extract_from_string():
    # Same data as the JSON-typed test, supplied as JSON-formatted strings.
    source = bpd.Series(
        ['{"a": {"b": [1, 2]}}', '{"a": {"c": 1}}', '{"a": {"b": 0}}']
    )
    got = bbq.json_extract(source, "$.a.b")
    want = _get_series_from_json(["[1,2]", None, "0"])

    got_pd = got.to_pandas()
    want_pd = want.to_pandas()
    # Series names are deliberately excluded from the comparison.
    pd.testing.assert_series_equal(got_pd, want_pd, check_names=False)


def test_json_extract_w_invalid_series_type():
    # json_extract is expected to raise TypeError when handed an integer Series.
    json_path = "$.a"
    with pytest.raises(TypeError):
        bbq.json_extract(bpd.Series([1, 2]), json_path)

0 comments on commit 3dbf84b

Please sign in to comment.