From bb82c937936b97824bb766893745e1456eb1aa08 Mon Sep 17 00:00:00 2001 From: "Y." Date: Thu, 5 Dec 2024 07:29:37 -0500 Subject: [PATCH] feat: Add support for Azure secrets (#478) Add support for Azure secrets: 1. Install Azure extension ```sql SELECT duckdb.install_extension('azure'); ``` 2. Add a secret: ```sql INSERT INTO duckdb.secrets (type, connection_string) VALUES ('Azure', ''); ``` 3. Run a query: ```sql SELECT count(*) FROM read_parquet('azure://my_container/orders.parquet') AS (order_id int); ``` --- README.md | 20 +++++++-- include/pgduckdb/pgduckdb_options.hpp | 30 +++++++------ sql/pg_duckdb--0.1.0--0.2.0.sql | 11 +++++ src/pgduckdb_duckdb.cpp | 62 ++++++++++++++++----------- src/pgduckdb_options.cpp | 58 +++++++++++++++++++++++-- 5 files changed, 138 insertions(+), 43 deletions(-) diff --git a/README.md b/README.md index 0fb6789b..7125f86f 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ See our [official documentation][docs] for further details. - `SELECT` queries executed by the DuckDB engine can directly read Postgres tables. (If you only query Postgres tables you need to run `SET duckdb.force_execution TO true`, see the **IMPORTANT** section above for details) - Able to read [data types](https://www.postgresql.org/docs/current/datatype.html) that exist in both Postgres and DuckDB. The following data types are supported: numeric, character, binary, date/time, boolean, uuid, json, and arrays. - If DuckDB cannot support the query for any reason, execution falls back to Postgres. -- Read and Write support for object storage (AWS S3, Cloudflare R2, or Google GCS): +- Read and Write support for object storage (AWS S3, Azure, Cloudflare R2, or Google GCS): - Read parquet, CSV and JSON files: - `SELECT n FROM read_parquet('s3://bucket/file.parquet') AS (n int)` - `SELECT n FROM read_csv('s3://bucket/file.csv') AS (n int)` @@ -124,9 +124,9 @@ CREATE EXTENSION pg_duckdb; See our [official documentation][docs] for more usage information. -pg_duckdb relies on DuckDB's vectorized execution engine to read and write data to object storage bucket (AWS S3, Cloudflare R2, or Google GCS) and/or MotherDuck. The follow two sections describe how to get started with these destinations. +pg_duckdb relies on DuckDB's vectorized execution engine to read and write data to object storage bucket (AWS S3, Azure, Cloudflare R2, or Google GCS) and/or MotherDuck. The follow two sections describe how to get started with these destinations. -### Object storage bucket (AWS S3, Cloudflare R2, or Google GCS) +### Object storage bucket (AWS S3, Azure, Cloudflare R2, or Google GCS) Querying data stored in Parquet, CSV, JSON, Iceberg and Delta format can be done with `read_parquet`, `read_csv`, `read_json`, `iceberg_scan` and `delta_scan` respectively. @@ -157,6 +157,20 @@ Querying data stored in Parquet, CSV, JSON, Iceberg and Delta format can be done LIMIT 100; ``` +Note, for Azure, you will need to first install the Azure extension: +```sql +SELECT duckdb.install_extension('azure'); +``` + +You may then store a secret using the `connection_string` parameter as such: +```sql +INSERT INTO duckdb.secrets +(type, connection_string) +VALUES ('Azure', ''); +``` + +Note: writes to Azure are not yet supported, [here][duckdb/duckdb_azure#44] is the current discussion for more information. + ### Connect with MotherDuck pg_duckdb also integrates with [MotherDuck][md]. To enable this support you first need to [generate an access token][md-access-token] and then add the following line to your `postgresql.conf` file: diff --git a/include/pgduckdb/pgduckdb_options.hpp b/include/pgduckdb/pgduckdb_options.hpp index e58eee01..8be56a73 100644 --- a/include/pgduckdb/pgduckdb_options.hpp +++ b/include/pgduckdb/pgduckdb_options.hpp @@ -6,21 +6,24 @@ namespace pgduckdb { /* constants for duckdb.secrets */ -#define Natts_duckdb_secret 10 -#define Anum_duckdb_secret_name 1 -#define Anum_duckdb_secret_type 2 -#define Anum_duckdb_secret_key_id 3 -#define Anum_duckdb_secret_secret 4 -#define Anum_duckdb_secret_region 5 -#define Anum_duckdb_secret_session_token 6 -#define Anum_duckdb_secret_endpoint 7 -#define Anum_duckdb_secret_r2_account_id 8 -#define Anum_duckdb_secret_use_ssl 9 -#define Anum_duckdb_secret_scope 10 +#define Natts_duckdb_secret 11 +#define Anum_duckdb_secret_name 1 +#define Anum_duckdb_secret_type 2 +#define Anum_duckdb_secret_key_id 3 +#define Anum_duckdb_secret_secret 4 +#define Anum_duckdb_secret_region 5 +#define Anum_duckdb_secret_session_token 6 +#define Anum_duckdb_secret_endpoint 7 +#define Anum_duckdb_secret_r2_account_id 8 +#define Anum_duckdb_secret_use_ssl 9 +#define Anum_duckdb_secret_scope 10 +#define Anum_duckdb_secret_connection_string 11 + +enum SecretType { S3, R2, GCS, AZURE }; typedef struct DuckdbSecret { std::string name; - std::string type; + SecretType type; std::string key_id; std::string secret; std::string region; @@ -29,8 +32,11 @@ typedef struct DuckdbSecret { std::string r2_account_id; bool use_ssl; std::string scope; + std::string connection_string; // Used for Azure } DuckdbSecret; +std::string SecretTypeToString(SecretType type); + extern std::vector ReadDuckdbSecrets(); /* constants for duckdb.extensions */ diff --git a/sql/pg_duckdb--0.1.0--0.2.0.sql b/sql/pg_duckdb--0.1.0--0.2.0.sql index 613f8b09..daaec385 100644 --- a/sql/pg_duckdb--0.1.0--0.2.0.sql +++ b/sql/pg_duckdb--0.1.0--0.2.0.sql @@ -77,3 +77,14 @@ ALTER TABLE duckdb.secrets ADD COLUMN scope TEXT; ALTER TABLE duckdb.tables ADD COLUMN default_database TEXT NOT NULL DEFAULT 'my_db'; ALTER TABLE duckdb.tables ALTER COLUMN default_database DROP DEFAULT; + +-- Alter duckdb.secrets to allow column "key_id" & "secret" to be NULL +ALTER TABLE duckdb.secrets ALTER COLUMN key_id DROP NOT NULL; +ALTER TABLE duckdb.secrets ALTER COLUMN secret DROP NOT NULL; + +-- Update "type_constraint" CHECK on "type" to allow "Azure" +ALTER TABLE duckdb.secrets DROP CONSTRAINT type_constraint; +ALTER TABLE duckdb.secrets ADD CONSTRAINT type_constraint CHECK (upper(type) IN ('S3', 'GCS', 'R2', 'AZURE')); + +-- Add "azure_connection_string" column +ALTER TABLE duckdb.secrets ADD COLUMN connection_string TEXT; diff --git a/src/pgduckdb_duckdb.cpp b/src/pgduckdb_duckdb.cpp index 68e7550f..d66be5b5 100644 --- a/src/pgduckdb_duckdb.cpp +++ b/src/pgduckdb_duckdb.cpp @@ -224,6 +224,30 @@ GetSeqLastValue(const char *seq_name) { return PostgresFunctionGuard(DirectFunctionCall1Coll, pg_sequence_last_value, InvalidOid, table_seq_oid); } +void +WriteSecretQueryForS3R2OrGCP(const DuckdbSecret &secret, std::ostringstream &query) { + bool is_r2_cloud_secret = secret.type == SecretType::R2; + query << "KEY_ID '" << secret.key_id << "', SECRET '" << secret.secret << "'"; + if (secret.region.length() && !is_r2_cloud_secret) { + query << ", REGION '" << secret.region << "'"; + } + if (secret.session_token.length() && !is_r2_cloud_secret) { + query << ", SESSION_TOKEN '" << secret.session_token << "'"; + } + if (secret.endpoint.length() && !is_r2_cloud_secret) { + query << ", ENDPOINT '" << secret.endpoint << "'"; + } + if (is_r2_cloud_secret) { + query << ", ACCOUNT_ID '" << secret.endpoint << "'"; + } + if (!secret.use_ssl) { + query << ", USE_SSL 'FALSE'"; + } + if (secret.scope.length()) { + query << ", SCOPE '" << secret.scope << "'"; + } +} + void DuckDBManager::LoadSecrets(duckdb::ClientContext &context) { auto duckdb_secrets = ReadDuckdbSecrets(); @@ -231,27 +255,15 @@ DuckDBManager::LoadSecrets(duckdb::ClientContext &context) { int secret_id = 0; for (auto &secret : duckdb_secrets) { std::ostringstream query; - bool is_r2_cloud_secret = (secret.type.rfind("R2", 0) == 0); query << "CREATE SECRET pgduckb_secret_" << secret_id << " "; - query << "(TYPE " << secret.type << ", KEY_ID '" << secret.key_id << "', SECRET '" << secret.secret << "'"; - if (secret.region.length() && !is_r2_cloud_secret) { - query << ", REGION '" << secret.region << "'"; - } - if (secret.session_token.length() && !is_r2_cloud_secret) { - query << ", SESSION_TOKEN '" << secret.session_token << "'"; - } - if (secret.endpoint.length() && !is_r2_cloud_secret) { - query << ", ENDPOINT '" << secret.endpoint << "'"; - } - if (is_r2_cloud_secret) { - query << ", ACCOUNT_ID '" << secret.endpoint << "'"; - } - if (!secret.use_ssl) { - query << ", USE_SSL 'FALSE'"; - } - if (secret.scope.length()) { - query << ", SCOPE '" << secret.scope << "'"; + query << "(TYPE " << SecretTypeToString(secret.type) << ", "; + + if (secret.type == SecretType::AZURE) { + query << "CONNECTION_STRING '" << secret.connection_string << "'"; + } else { + WriteSecretQueryForS3R2OrGCP(secret, query); } + query << ");"; DuckDBQueryOrThrow(context, query.str()); @@ -298,6 +310,12 @@ DuckDBManager::LoadExtensions(duckdb::ClientContext &context) { void DuckDBManager::RefreshConnectionState(duckdb::ClientContext &context) { + const auto extensions_table_last_seq = GetSeqLastValue("extensions_table_seq"); + if (IsExtensionsSeqLessThan(extensions_table_last_seq)) { + LoadExtensions(context); + UpdateExtensionsSeq(extensions_table_last_seq); + } + const auto secret_table_last_seq = GetSeqLastValue("secrets_table_seq"); if (IsSecretSeqLessThan(secret_table_last_seq)) { DropSecrets(context); @@ -305,12 +323,6 @@ DuckDBManager::RefreshConnectionState(duckdb::ClientContext &context) { UpdateSecretSeq(secret_table_last_seq); } - const auto extensions_table_last_seq = GetSeqLastValue("extensions_table_seq"); - if (IsExtensionsSeqLessThan(extensions_table_last_seq)) { - LoadExtensions(context); - UpdateExtensionsSeq(extensions_table_last_seq); - } - auto http_file_cache_set_dir_query = duckdb::StringUtil::Format("SET http_file_cache_dir TO '%s';", CreateOrGetDirectoryPath("duckdb_cache")); DuckDBQueryOrThrow(context, http_file_cache_set_dir_query); diff --git a/src/pgduckdb_options.cpp b/src/pgduckdb_options.cpp index 238cd75a..8d56e64b 100644 --- a/src/pgduckdb_options.cpp +++ b/src/pgduckdb_options.cpp @@ -55,6 +55,43 @@ DatumToString(Datum datum) { return column_value; } +bool +DoesSecretRequiresKeyIdOrSecret(const SecretType type) { + return type == SecretType::S3 || type == SecretType::GCS || type == SecretType::R2; +} + +SecretType +StringToSecretType(const std::string &type) { + auto uc_type = duckdb::StringUtil::Upper(type); + if (uc_type == "S3") { + return SecretType::S3; + } else if (uc_type == "R2") { + return SecretType::R2; + } else if (uc_type == "GCS") { + return SecretType::GCS; + } else if (uc_type == "AZURE") { + return SecretType::AZURE; + } else { + throw std::runtime_error("Invalid secret type: '" + type + "'"); + } +} + +std::string +SecretTypeToString(SecretType type) { + switch (type) { + case SecretType::S3: + return "S3"; + case SecretType::R2: + return "R2"; + case SecretType::GCS: + return "GCS"; + case SecretType::AZURE: + return "AZURE"; + default: + throw std::runtime_error("Invalid secret type: '" + std::to_string(type) + "'"); + } +} + std::vector ReadDuckdbSecrets() { HeapTuple tuple = NULL; @@ -70,9 +107,21 @@ ReadDuckdbSecrets() { heap_deform_tuple(tuple, RelationGetDescr(duckdb_secret_relation), datum_array, is_null_array); DuckdbSecret secret; - secret.type = DatumToString(datum_array[Anum_duckdb_secret_type - 1]); - secret.key_id = DatumToString(datum_array[Anum_duckdb_secret_key_id - 1]); - secret.secret = DatumToString(datum_array[Anum_duckdb_secret_secret - 1]); + auto type_str = DatumToString(datum_array[Anum_duckdb_secret_type - 1]); + secret.type = StringToSecretType(type_str); + if (!is_null_array[Anum_duckdb_secret_key_id - 1]) + secret.key_id = DatumToString(datum_array[Anum_duckdb_secret_key_id - 1]); + else if (DoesSecretRequiresKeyIdOrSecret(secret.type)) { + elog(WARNING, "Invalid '%s' secret: key id is required.", type_str.c_str()); + continue; + } + + if (!is_null_array[Anum_duckdb_secret_secret - 1]) + secret.secret = DatumToString(datum_array[Anum_duckdb_secret_secret - 1]); + else if (DoesSecretRequiresKeyIdOrSecret(secret.type)) { + elog(WARNING, "Invalid '%s' secret: secret is required.", type_str.c_str()); + continue; + } if (!is_null_array[Anum_duckdb_secret_region - 1]) secret.region = DatumToString(datum_array[Anum_duckdb_secret_region - 1]); @@ -94,6 +143,9 @@ ReadDuckdbSecrets() { if (!is_null_array[Anum_duckdb_secret_scope - 1]) secret.scope = DatumToString(datum_array[Anum_duckdb_secret_scope - 1]); + if (!is_null_array[Anum_duckdb_secret_connection_string - 1]) + secret.connection_string = DatumToString(datum_array[Anum_duckdb_secret_connection_string - 1]); + duckdb_secrets.push_back(secret); }