From 3e9b0a32df7b591cc9fe7e78a6f5472f27de2fd5 Mon Sep 17 00:00:00 2001
From: mkaruza
Date: Thu, 7 Nov 2024 14:32:12 +0100
Subject: [PATCH] Read JSON with `read_json` (#405)

Added a `read_json` stub function, added a regression test, and updated
the documentation.
---
 README.md                                   |   5 +-
 docs/functions.md                           |  29 ++++++
 sql/pg_duckdb--0.1.0--0.2.0.sql             |  40 ++++++++
 src/pgduckdb_metadata_cache.cpp             |   2 +-
 test/regression/data/table.json             | 100 ++++++++++++++++++++
 test/regression/expected/read_functions.out |  19 ++++
 test/regression/sql/read_functions.sql      |   6 ++
 7 files changed, 198 insertions(+), 3 deletions(-)
 create mode 100644 test/regression/data/table.json

diff --git a/README.md b/README.md
index d47438d5..82d1ccf2 100644
--- a/README.md
+++ b/README.md
@@ -18,9 +18,10 @@ See our [official documentation][docs] for further details.
 - Able to read [data types](https://www.postgresql.org/docs/current/datatype.html) that exist in both Postgres and DuckDB. The following data types are supported: numeric, character, binary, date/time, boolean, uuid, json, and arrays.
 - If DuckDB cannot support the query for any reason, execution falls back to Postgres.
 - Read and Write support for object storage (AWS S3, Cloudflare R2, or Google GCS):
-  - Read parquet and CSV files:
+  - Read parquet, CSV, and JSON files:
     - `SELECT n FROM read_parquet('s3://bucket/file.parquet') AS (n int)`
     - `SELECT n FROM read_csv('s3://bucket/file.csv') AS (n int)`
+    - `SELECT n FROM read_json('s3://bucket/file.json') AS (n int)`
   - You can pass globs and arrays to these functions, just like in DuckDB
 - Enable the DuckDB Iceberg extension using `SELECT duckdb.install_extension('iceberg')` and read Iceberg files with `iceberg_scan`.
 - Enable the DuckDB Delta extension using `SELECT duckdb.install_extension('delta')` and read Delta files with `delta_scan`.
@@ -121,7 +122,7 @@ pg_duckdb relies on DuckDB's vectorized execution engine to read and write data

 ### Object storage bucket (AWS S3, Cloudflare R2, or Google GCS)

-Querying data stored in Parquet, CSV, and Iceberg format can be done with `read_parquet`, `read_csv`, `iceberg_scan` and `delta_scan` respectively.
+Querying data stored in Parquet, CSV, JSON, Iceberg, and Delta formats can be done with `read_parquet`, `read_csv`, `read_json`, `iceberg_scan`, and `delta_scan`, respectively.

 1. Add a credential to enable DuckDB's httpfs support.

diff --git a/docs/functions.md b/docs/functions.md
index 185c1614..8e9e65b0 100644
--- a/docs/functions.md
+++ b/docs/functions.md
@@ -10,6 +10,7 @@ Note: `ALTER EXTENSION pg_duckdb WITH SCHEMA schema` is not currently supported.
 | :--- | :---------- |
 | [`read_parquet`](#read_parquet) | Read a parquet file |
 | [`read_csv`](#read_csv) | Read a CSV file |
+| [`read_json`](#read_json) | Read a JSON file |
 | [`iceberg_scan`](#iceberg_scan) | Read an Iceberg dataset |
 | [`iceberg_metadata`](#iceberg_metadata) | Read Iceberg metadata |
 | [`iceberg_snapshots`](#iceberg_snapshots) | Read Iceberg snapshot information |
@@ -87,6 +88,34 @@ Compatibility notes:
 * `columns` is not currently supported.
 * `nullstr` must be an array (`TEXT[]`).

+#### `read_json(path TEXT or TEXT[], /* optional parameters */) -> SETOF record`
+
+Reads a JSON file, either from a remote location (via httpfs) or a local file.
+
+Returns a record set (`SETOF record`). Functions that return record sets need their column names and types specified with `AS`. You must specify at least one column, and every column referenced in your query must appear in the `AS` clause.
+For example:
+
+```sql
+SELECT COUNT(i) FROM read_json('file.json') AS (i INT);
+```
+
+Further information:
+
+* [DuckDB JSON documentation](https://duckdb.org/docs/data/json/overview)
+
+##### Required Arguments
+
+| Name | Type | Description |
+| :--- | :--- | :---------- |
+| path | text or text[] | The path, either to a remote httpfs file or a local file (if enabled), of the JSON file(s) to read. The path can be a glob or an array of files. |
+
+##### Optional Parameters
+
+Optional parameters mirror [DuckDB's `read_json` function](https://duckdb.org/docs/data/json/loading_json#json-read-functions). To specify an optional parameter, use `parameter := 'value'`.
+
+Compatibility notes:
+
+* `columns` is not currently supported.
+
 #### `iceberg_scan(path TEXT, /* optional parameters */) -> SETOF record`

 Reads an Iceberg table, either from a remote location (via httpfs) or a local directory.
diff --git a/sql/pg_duckdb--0.1.0--0.2.0.sql b/sql/pg_duckdb--0.1.0--0.2.0.sql
index 3866413a..6c98fc22 100644
--- a/sql/pg_duckdb--0.1.0--0.2.0.sql
+++ b/sql/pg_duckdb--0.1.0--0.2.0.sql
@@ -7,3 +7,43 @@ BEGIN
   RAISE EXCEPTION 'Function `delta_scan(TEXT)` only works with Duckdb execution.';
 END;
 $func$;
+
+CREATE FUNCTION @extschema@.read_json(path text, auto_detect BOOLEAN DEFAULT FALSE,
+                                      compression VARCHAR DEFAULT 'auto',
+                                      dateformat VARCHAR DEFAULT 'iso',
+                                      format VARCHAR DEFAULT 'array',
+                                      ignore_errors BOOLEAN DEFAULT FALSE,
+                                      maximum_depth BIGINT DEFAULT -1,
+                                      maximum_object_size INT DEFAULT 16777216,
+                                      records VARCHAR DEFAULT 'records',
+                                      sample_size BIGINT DEFAULT 20480,
+                                      timestampformat VARCHAR DEFAULT 'iso',
+                                      union_by_name BOOLEAN DEFAULT FALSE)
+RETURNS SETOF record LANGUAGE 'plpgsql'
+SET search_path = pg_catalog, pg_temp
+AS
+$func$
+BEGIN
+    RAISE EXCEPTION 'Function `read_json(TEXT)` only works with Duckdb execution.';
+END;
+$func$;
+
+CREATE FUNCTION @extschema@.read_json(path text[], auto_detect BOOLEAN DEFAULT FALSE,
+                                      compression VARCHAR DEFAULT 'auto',
+                                      dateformat VARCHAR DEFAULT 'iso',
+                                      format VARCHAR DEFAULT 'array',
+                                      ignore_errors BOOLEAN DEFAULT FALSE,
+                                      maximum_depth BIGINT DEFAULT -1,
+                                      maximum_object_size INT DEFAULT 16777216,
+                                      records VARCHAR DEFAULT 'records',
+                                      sample_size BIGINT DEFAULT 20480,
+                                      timestampformat VARCHAR DEFAULT 'iso',
+                                      union_by_name BOOLEAN DEFAULT FALSE)
+RETURNS SETOF record LANGUAGE 'plpgsql'
+SET search_path = pg_catalog, pg_temp
+AS
+$func$
+BEGIN
+    RAISE EXCEPTION 'Function `read_json(TEXT[])` only works with Duckdb execution.';
+END;
+$func$;
diff --git a/src/pgduckdb_metadata_cache.cpp b/src/pgduckdb_metadata_cache.cpp
index 9f00ade5..2b8af1e4 100644
--- a/src/pgduckdb_metadata_cache.cpp
+++ b/src/pgduckdb_metadata_cache.cpp
@@ -110,7 +110,7 @@ BuildDuckdbOnlyFunctions() {
	 * caching its OID as a DuckDB-only function.
*/ const char *function_names[] = {"read_parquet", "read_csv", "iceberg_scan", "iceberg_metadata", - "iceberg_snapshots", "delta_scan"}; + "iceberg_snapshots", "delta_scan", "read_json"}; for (int i = 0; i < lengthof(function_names); i++) { CatCList *catlist = SearchSysCacheList1(PROCNAMEARGSNSP, CStringGetDatum(function_names[i])); diff --git a/test/regression/data/table.json b/test/regression/data/table.json new file mode 100644 index 00000000..4ae41d2c --- /dev/null +++ b/test/regression/data/table.json @@ -0,0 +1,100 @@ +{"a":1,"b":"json_1","c":1.5} +{"a":2,"b":"json_2","c":2.5} +{"a":3,"b":"json_3","c":3.5} +{"a":4,"b":"json_4","c":4.5} +{"a":5,"b":"json_5","c":5.5} +{"a":6,"b":"json_6","c":6.5} +{"a":7,"b":"json_7","c":7.5} +{"a":8,"b":"json_8","c":8.5} +{"a":9,"b":"json_9","c":9.5} +{"a":10,"b":"json_10","c":10.5} +{"a":11,"b":"json_11","c":11.5} +{"a":12,"b":"json_12","c":12.5} +{"a":13,"b":"json_13","c":13.5} +{"a":14,"b":"json_14","c":14.5} +{"a":15,"b":"json_15","c":15.5} +{"a":16,"b":"json_16","c":16.5} +{"a":17,"b":"json_17","c":17.5} +{"a":18,"b":"json_18","c":18.5} +{"a":19,"b":"json_19","c":19.5} +{"a":20,"b":"json_20","c":20.5} +{"a":21,"b":"json_21","c":21.5} +{"a":22,"b":"json_22","c":22.5} +{"a":23,"b":"json_23","c":23.5} +{"a":24,"b":"json_24","c":24.5} +{"a":25,"b":"json_25","c":25.5} +{"a":26,"b":"json_26","c":26.5} +{"a":27,"b":"json_27","c":27.5} +{"a":28,"b":"json_28","c":28.5} +{"a":29,"b":"json_29","c":29.5} +{"a":30,"b":"json_30","c":30.5} +{"a":31,"b":"json_31","c":31.5} +{"a":32,"b":"json_32","c":32.5} +{"a":33,"b":"json_33","c":33.5} +{"a":34,"b":"json_34","c":34.5} +{"a":35,"b":"json_35","c":35.5} +{"a":36,"b":"json_36","c":36.5} +{"a":37,"b":"json_37","c":37.5} +{"a":38,"b":"json_38","c":38.5} +{"a":39,"b":"json_39","c":39.5} +{"a":40,"b":"json_40","c":40.5} +{"a":41,"b":"json_41","c":41.5} +{"a":42,"b":"json_42","c":42.5} +{"a":43,"b":"json_43","c":43.5} +{"a":44,"b":"json_44","c":44.5} +{"a":45,"b":"json_45","c":45.5} +{"a":46,"b":"json_46","c":46.5} +{"a":47,"b":"json_47","c":47.5} +{"a":48,"b":"json_48","c":48.5} +{"a":49,"b":"json_49","c":49.5} +{"a":50,"b":"json_50","c":50.5} +{"a":51,"b":"json_51","c":51.5} +{"a":52,"b":"json_52","c":52.5} +{"a":53,"b":"json_53","c":53.5} +{"a":54,"b":"json_54","c":54.5} +{"a":55,"b":"json_55","c":55.5} +{"a":56,"b":"json_56","c":56.5} +{"a":57,"b":"json_57","c":57.5} +{"a":58,"b":"json_58","c":58.5} +{"a":59,"b":"json_59","c":59.5} +{"a":60,"b":"json_60","c":60.5} +{"a":61,"b":"json_61","c":61.5} +{"a":62,"b":"json_62","c":62.5} +{"a":63,"b":"json_63","c":63.5} +{"a":64,"b":"json_64","c":64.5} +{"a":65,"b":"json_65","c":65.5} +{"a":66,"b":"json_66","c":66.5} +{"a":67,"b":"json_67","c":67.5} +{"a":68,"b":"json_68","c":68.5} +{"a":69,"b":"json_69","c":69.5} +{"a":70,"b":"json_70","c":70.5} +{"a":71,"b":"json_71","c":71.5} +{"a":72,"b":"json_72","c":72.5} +{"a":73,"b":"json_73","c":73.5} +{"a":74,"b":"json_74","c":74.5} +{"a":75,"b":"json_75","c":75.5} +{"a":76,"b":"json_76","c":76.5} +{"a":77,"b":"json_77","c":77.5} +{"a":78,"b":"json_78","c":78.5} +{"a":79,"b":"json_79","c":79.5} +{"a":80,"b":"json_80","c":80.5} +{"a":81,"b":"json_81","c":81.5} +{"a":82,"b":"json_82","c":82.5} +{"a":83,"b":"json_83","c":83.5} +{"a":84,"b":"json_84","c":84.5} +{"a":85,"b":"json_85","c":85.5} +{"a":86,"b":"json_86","c":86.5} +{"a":87,"b":"json_87","c":87.5} +{"a":88,"b":"json_88","c":88.5} +{"a":89,"b":"json_89","c":89.5} +{"a":90,"b":"json_90","c":90.5} +{"a":91,"b":"json_91","c":91.5} +{"a":92,"b":"json_92","c":92.5} 
+{"a":93,"b":"json_93","c":93.5} +{"a":94,"b":"json_94","c":94.5} +{"a":95,"b":"json_95","c":95.5} +{"a":96,"b":"json_96","c":96.5} +{"a":97,"b":"json_97","c":97.5} +{"a":98,"b":"json_98","c":98.5} +{"a":99,"b":"json_99","c":99.5} +{"a":100,"b":"json_100","c":100.5} diff --git a/test/regression/expected/read_functions.out b/test/regression/expected/read_functions.out index 15bb5bed..85ebae38 100644 --- a/test/regression/expected/read_functions.out +++ b/test/regression/expected/read_functions.out @@ -127,3 +127,22 @@ SELECT * FROM iceberg_metadata('../../data/lineitem_iceberg', allow_moved_paths lineitem_iceberg/metadata/10eaca8a-1e1c-421e-ad6d-b232e5ee23d3-m0.avro | 2 | DATA | DELETED | EXISTING | lineitem_iceberg/data/00000-411-0792dcfe-4e25-4ca3-8ada-175286069a47-00001.parquet (2 rows) +-- read_json +SELECT COUNT(a) FROM read_json('../../data/table.json') AS (a INT); + count +------- + 100 +(1 row) + +SELECT COUNT(a) FROM read_json('../../data/table.json') AS (a INT, c FLOAT) WHERE c > 50.4; + count +------- + 51 +(1 row) + +SELECT a, b, c FROM read_json('../../data/table.json') AS (a INT, b VARCHAR, c FLOAT) WHERE c > 50.4 AND c < 51.2; + a | b | c +----+---------+------ + 50 | json_50 | 50.5 +(1 row) + diff --git a/test/regression/sql/read_functions.sql b/test/regression/sql/read_functions.sql index 00516417..b3af1915 100644 --- a/test/regression/sql/read_functions.sql +++ b/test/regression/sql/read_functions.sql @@ -56,3 +56,9 @@ LIMIT 1; SELECT * FROM iceberg_snapshots('../../data/lineitem_iceberg'); SELECT * FROM iceberg_metadata('../../data/lineitem_iceberg', allow_moved_paths => true); + +-- read_json + +SELECT COUNT(a) FROM read_json('../../data/table.json') AS (a INT); +SELECT COUNT(a) FROM read_json('../../data/table.json') AS (a INT, c FLOAT) WHERE c > 50.4; +SELECT a, b, c FROM read_json('../../data/table.json') AS (a INT, b VARCHAR, c FLOAT) WHERE c > 50.4 AND c < 51.2;