From 3e9b0a32df7b591cc9fe7e78a6f5472f27de2fd5 Mon Sep 17 00:00:00 2001
From: mkaruza
Date: Thu, 7 Nov 2024 14:32:12 +0100
Subject: [PATCH] Read JSON with `read_json` (#405)

Added a `read_json` stub function, added a regression test, and updated
the documentation.
---
 README.md                                   |   5 +-
 docs/functions.md                           |  29 ++++++
 sql/pg_duckdb--0.1.0--0.2.0.sql             |  40 ++++++++
 src/pgduckdb_metadata_cache.cpp             |   2 +-
 test/regression/data/table.json             | 100 ++++++++++++++++++++
 test/regression/expected/read_functions.out |  19 ++++
 test/regression/sql/read_functions.sql      |   6 ++
 7 files changed, 198 insertions(+), 3 deletions(-)
 create mode 100644 test/regression/data/table.json

diff --git a/README.md b/README.md
index d47438d5..82d1ccf2 100644
--- a/README.md
+++ b/README.md
@@ -18,9 +18,10 @@ See our [official documentation][docs] for further details.
 - Able to read [data types](https://www.postgresql.org/docs/current/datatype.html) that exist in both Postgres and DuckDB. The following data types are supported: numeric, character, binary, date/time, boolean, uuid, json, and arrays.
 - If DuckDB cannot support the query for any reason, execution falls back to Postgres.
 - Read and Write support for object storage (AWS S3, Cloudflare R2, or Google GCS):
-  - Read parquet and CSV files:
+  - Read parquet, CSV, and JSON files:
     - `SELECT n FROM read_parquet('s3://bucket/file.parquet') AS (n int)`
     - `SELECT n FROM read_csv('s3://bucket/file.csv') AS (n int)`
+    - `SELECT n FROM read_json('s3://bucket/file.json') AS (n int)`
   - You can pass globs and arrays to these functions, just like in DuckDB
 - Enable the DuckDB Iceberg extension using `SELECT duckdb.install_extension('iceberg')` and read Iceberg files with `iceberg_scan`.
 - Enable the DuckDB Delta extension using `SELECT duckdb.install_extension('delta')` and read Delta files with `delta_scan`.
@@ -121,7 +122,7 @@ pg_duckdb relies on DuckDB's vectorized execution engine to read and write data

 ### Object storage bucket (AWS S3, Cloudflare R2, or Google GCS)

-Querying data stored in Parquet, CSV, and Iceberg format can be done with `read_parquet`, `read_csv`, `iceberg_scan` and `delta_scan` respectively.
+Querying data stored in Parquet, CSV, JSON, Iceberg, and Delta formats can be done with `read_parquet`, `read_csv`, `read_json`, `iceberg_scan`, and `delta_scan`, respectively.

 1. Add a credential to enable DuckDB's httpfs support.

diff --git a/docs/functions.md b/docs/functions.md
index 185c1614..8e9e65b0 100644
--- a/docs/functions.md
+++ b/docs/functions.md
@@ -10,6 +10,7 @@ Note: `ALTER EXTENSION pg_duckdb WITH SCHEMA schema` is not currently supported.
 | :--- | :---------- |
 | [`read_parquet`](#read_parquet) | Read a parquet file |
 | [`read_csv`](#read_csv) | Read a CSV file |
+| [`read_json`](#read_json) | Read a JSON file |
 | [`iceberg_scan`](#iceberg_scan) | Read an Iceberg dataset |
 | [`iceberg_metadata`](#iceberg_metadata) | Read Iceberg metadata |
 | [`iceberg_snapshots`](#iceberg_snapshots) | Read Iceberg snapshot information |
@@ -87,6 +88,34 @@ Compatibility notes:
 * `columns` is not currently supported.
 * `nullstr` must be an array (`TEXT[]`).

+#### `read_json(path TEXT or TEXT[], /* optional parameters */) -> SETOF record`
+
+Reads a JSON file, either from a remote location (via httpfs) or a local file.
+
+Returns a record set (`SETOF record`). Functions that return record sets need their column names and types specified with `AS`. You must specify at least one column, and every column referenced in your query must appear in the `AS` clause.
+For example:
+
+```sql
+SELECT COUNT(i) FROM read_json('file.json') AS (i INT);
+```
+
+Further information:
+
+* [DuckDB JSON documentation](https://duckdb.org/docs/data/json/overview)
+
+##### Required Arguments
+
+| Name | Type | Description |
+| :--- | :--- | :---------- |
+| path | text or text[] | The path, either to a remote httpfs file or a local file (if enabled), of the JSON file(s) to read. The path can be a glob or an array of files. |
+
+##### Optional Parameters
+
+Optional parameters mirror [DuckDB's `read_json` function](https://duckdb.org/docs/data/json/loading_json#json-read-functions). To specify an optional parameter, use `parameter := 'value'`.
+
+Compatibility notes:
+
+* `columns` is not currently supported.
+
 #### `iceberg_scan(path TEXT, /* optional parameters */) -> SETOF record`

 Reads an Iceberg table, either from a remote location (via httpfs) or a local directory.
diff --git a/sql/pg_duckdb--0.1.0--0.2.0.sql b/sql/pg_duckdb--0.1.0--0.2.0.sql
index 3866413a..6c98fc22 100644
--- a/sql/pg_duckdb--0.1.0--0.2.0.sql
+++ b/sql/pg_duckdb--0.1.0--0.2.0.sql
@@ -7,3 +7,43 @@ BEGIN
   RAISE EXCEPTION 'Function `delta_scan(TEXT)` only works with Duckdb execution.';
 END;
 $func$;
+
+CREATE FUNCTION @extschema@.read_json(path text, auto_detect BOOLEAN DEFAULT FALSE,
+                                      compression VARCHAR DEFAULT 'auto',
+                                      dateformat VARCHAR DEFAULT 'iso',
+                                      format VARCHAR DEFAULT 'array',
+                                      ignore_errors BOOLEAN DEFAULT FALSE,
+                                      maximum_depth BIGINT DEFAULT -1,
+                                      maximum_object_size INT DEFAULT 16777216,
+                                      records VARCHAR DEFAULT 'records',
+                                      sample_size BIGINT DEFAULT 20480,
+                                      timestampformat VARCHAR DEFAULT 'iso',
+                                      union_by_name BOOLEAN DEFAULT FALSE)
+RETURNS SETOF record LANGUAGE 'plpgsql'
+SET search_path = pg_catalog, pg_temp
+AS
+$func$
+BEGIN
+    RAISE EXCEPTION 'Function `read_json(TEXT)` only works with Duckdb execution.';
+END;
+$func$;
+
+CREATE FUNCTION @extschema@.read_json(path text[], auto_detect BOOLEAN DEFAULT FALSE,
+                                      compression VARCHAR DEFAULT 'auto',
+                                      dateformat VARCHAR DEFAULT 'iso',
+                                      format VARCHAR DEFAULT 'array',
+                                      ignore_errors BOOLEAN DEFAULT FALSE,
+                                      maximum_depth BIGINT DEFAULT -1,
+                                      maximum_object_size INT DEFAULT 16777216,
+                                      records VARCHAR DEFAULT 'records',
+                                      sample_size BIGINT DEFAULT 20480,
+                                      timestampformat VARCHAR DEFAULT 'iso',
+                                      union_by_name BOOLEAN DEFAULT FALSE)
+RETURNS SETOF record LANGUAGE 'plpgsql'
+SET search_path = pg_catalog, pg_temp
+AS
+$func$
+BEGIN
+    RAISE EXCEPTION 'Function `read_json(TEXT[])` only works with Duckdb execution.';
+END;
+$func$;
diff --git a/src/pgduckdb_metadata_cache.cpp b/src/pgduckdb_metadata_cache.cpp
index 9f00ade5..2b8af1e4 100644
--- a/src/pgduckdb_metadata_cache.cpp
+++ b/src/pgduckdb_metadata_cache.cpp
@@ -110,7 +110,7 @@ BuildDuckdbOnlyFunctions() {
	 * caching its OID as a DuckDB-only function.
*/ const char *function_names[] = {"read_parquet", "read_csv", "iceberg_scan", "iceberg_metadata", - "iceberg_snapshots", "delta_scan"}; + "iceberg_snapshots", "delta_scan", "read_json"}; for (int i = 0; i < lengthof(function_names); i++) { CatCList *catlist = SearchSysCacheList1(PROCNAMEARGSNSP, CStringGetDatum(function_names[i])); diff --git a/test/regression/data/table.json b/test/regression/data/table.json new file mode 100644 index 00000000..4ae41d2c --- /dev/null +++ b/test/regression/data/table.json @@ -0,0 +1,100 @@ +{"a":1,"b":"json_1","c":1.5} +{"a":2,"b":"json_2","c":2.5} +{"a":3,"b":"json_3","c":3.5} +{"a":4,"b":"json_4","c":4.5} +{"a":5,"b":"json_5","c":5.5} +{"a":6,"b":"json_6","c":6.5} +{"a":7,"b":"json_7","c":7.5} +{"a":8,"b":"json_8","c":8.5} +{"a":9,"b":"json_9","c":9.5} +{"a":10,"b":"json_10","c":10.5} +{"a":11,"b":"json_11","c":11.5} +{"a":12,"b":"json_12","c":12.5} +{"a":13,"b":"json_13","c":13.5} +{"a":14,"b":"json_14","c":14.5} +{"a":15,"b":"json_15","c":15.5} +{"a":16,"b":"json_16","c":16.5} +{"a":17,"b":"json_17","c":17.5} +{"a":18,"b":"json_18","c":18.5} +{"a":19,"b":"json_19","c":19.5} +{"a":20,"b":"json_20","c":20.5} +{"a":21,"b":"json_21","c":21.5} +{"a":22,"b":"json_22","c":22.5} +{"a":23,"b":"json_23","c":23.5} +{"a":24,"b":"json_24","c":24.5} +{"a":25,"b":"json_25","c":25.5} +{"a":26,"b":"json_26","c":26.5} +{"a":27,"b":"json_27","c":27.5} +{"a":28,"b":"json_28","c":28.5} +{"a":29,"b":"json_29","c":29.5} +{"a":30,"b":"json_30","c":30.5} +{"a":31,"b":"json_31","c":31.5} +{"a":32,"b":"json_32","c":32.5} +{"a":33,"b":"json_33","c":33.5} +{"a":34,"b":"json_34","c":34.5} +{"a":35,"b":"json_35","c":35.5} +{"a":36,"b":"json_36","c":36.5} +{"a":37,"b":"json_37","c":37.5} +{"a":38,"b":"json_38","c":38.5} +{"a":39,"b":"json_39","c":39.5} +{"a":40,"b":"json_40","c":40.5} +{"a":41,"b":"json_41","c":41.5} +{"a":42,"b":"json_42","c":42.5} +{"a":43,"b":"json_43","c":43.5} +{"a":44,"b":"json_44","c":44.5} +{"a":45,"b":"json_45","c":45.5} +{"a":46,"b":"json_46","c":46.5} +{"a":47,"b":"json_47","c":47.5} +{"a":48,"b":"json_48","c":48.5} +{"a":49,"b":"json_49","c":49.5} +{"a":50,"b":"json_50","c":50.5} +{"a":51,"b":"json_51","c":51.5} +{"a":52,"b":"json_52","c":52.5} +{"a":53,"b":"json_53","c":53.5} +{"a":54,"b":"json_54","c":54.5} +{"a":55,"b":"json_55","c":55.5} +{"a":56,"b":"json_56","c":56.5} +{"a":57,"b":"json_57","c":57.5} +{"a":58,"b":"json_58","c":58.5} +{"a":59,"b":"json_59","c":59.5} +{"a":60,"b":"json_60","c":60.5} +{"a":61,"b":"json_61","c":61.5} +{"a":62,"b":"json_62","c":62.5} +{"a":63,"b":"json_63","c":63.5} +{"a":64,"b":"json_64","c":64.5} +{"a":65,"b":"json_65","c":65.5} +{"a":66,"b":"json_66","c":66.5} +{"a":67,"b":"json_67","c":67.5} +{"a":68,"b":"json_68","c":68.5} +{"a":69,"b":"json_69","c":69.5} +{"a":70,"b":"json_70","c":70.5} +{"a":71,"b":"json_71","c":71.5} +{"a":72,"b":"json_72","c":72.5} +{"a":73,"b":"json_73","c":73.5} +{"a":74,"b":"json_74","c":74.5} +{"a":75,"b":"json_75","c":75.5} +{"a":76,"b":"json_76","c":76.5} +{"a":77,"b":"json_77","c":77.5} +{"a":78,"b":"json_78","c":78.5} +{"a":79,"b":"json_79","c":79.5} +{"a":80,"b":"json_80","c":80.5} +{"a":81,"b":"json_81","c":81.5} +{"a":82,"b":"json_82","c":82.5} +{"a":83,"b":"json_83","c":83.5} +{"a":84,"b":"json_84","c":84.5} +{"a":85,"b":"json_85","c":85.5} +{"a":86,"b":"json_86","c":86.5} +{"a":87,"b":"json_87","c":87.5} +{"a":88,"b":"json_88","c":88.5} +{"a":89,"b":"json_89","c":89.5} +{"a":90,"b":"json_90","c":90.5} +{"a":91,"b":"json_91","c":91.5} +{"a":92,"b":"json_92","c":92.5} 
+{"a":93,"b":"json_93","c":93.5} +{"a":94,"b":"json_94","c":94.5} +{"a":95,"b":"json_95","c":95.5} +{"a":96,"b":"json_96","c":96.5} +{"a":97,"b":"json_97","c":97.5} +{"a":98,"b":"json_98","c":98.5} +{"a":99,"b":"json_99","c":99.5} +{"a":100,"b":"json_100","c":100.5} diff --git a/test/regression/expected/read_functions.out b/test/regression/expected/read_functions.out index 15bb5bed..85ebae38 100644 --- a/test/regression/expected/read_functions.out +++ b/test/regression/expected/read_functions.out @@ -127,3 +127,22 @@ SELECT * FROM iceberg_metadata('../../data/lineitem_iceberg', allow_moved_paths lineitem_iceberg/metadata/10eaca8a-1e1c-421e-ad6d-b232e5ee23d3-m0.avro | 2 | DATA | DELETED | EXISTING | lineitem_iceberg/data/00000-411-0792dcfe-4e25-4ca3-8ada-175286069a47-00001.parquet (2 rows) +-- read_json +SELECT COUNT(a) FROM read_json('../../data/table.json') AS (a INT); + count +------- + 100 +(1 row) + +SELECT COUNT(a) FROM read_json('../../data/table.json') AS (a INT, c FLOAT) WHERE c > 50.4; + count +------- + 51 +(1 row) + +SELECT a, b, c FROM read_json('../../data/table.json') AS (a INT, b VARCHAR, c FLOAT) WHERE c > 50.4 AND c < 51.2; + a | b | c +----+---------+------ + 50 | json_50 | 50.5 +(1 row) + diff --git a/test/regression/sql/read_functions.sql b/test/regression/sql/read_functions.sql index 00516417..b3af1915 100644 --- a/test/regression/sql/read_functions.sql +++ b/test/regression/sql/read_functions.sql @@ -56,3 +56,9 @@ LIMIT 1; SELECT * FROM iceberg_snapshots('../../data/lineitem_iceberg'); SELECT * FROM iceberg_metadata('../../data/lineitem_iceberg', allow_moved_paths => true); + +-- read_json + +SELECT COUNT(a) FROM read_json('../../data/table.json') AS (a INT); +SELECT COUNT(a) FROM read_json('../../data/table.json') AS (a INT, c FLOAT) WHERE c > 50.4; +SELECT a, b, c FROM read_json('../../data/table.json') AS (a INT, b VARCHAR, c FLOAT) WHERE c > 50.4 AND c < 51.2;