From 1c64ca0e736ecda972a372ee8bb87d3b9c12e2cf Mon Sep 17 00:00:00 2001 From: Gareth Western Date: Mon, 2 Sep 2024 10:55:49 +0200 Subject: [PATCH 1/4] feat: add support for abfs:// --- src/azure_dfs_filesystem.cpp | 2 ++ src/azure_parsed_url.cpp | 17 +++++++++++------ src/include/azure_dfs_filesystem.hpp | 2 ++ 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/src/azure_dfs_filesystem.cpp b/src/azure_dfs_filesystem.cpp index 739078c..0f95626 100644 --- a/src/azure_dfs_filesystem.cpp +++ b/src/azure_dfs_filesystem.cpp @@ -20,6 +20,8 @@ namespace duckdb { const string AzureDfsStorageFileSystem::SCHEME = "abfss"; const string AzureDfsStorageFileSystem::PATH_PREFIX = "abfss://"; +const string AzureDfsStorageFileSystem::UNSECURE_SCHEME = "abfs"; +const string AzureDfsStorageFileSystem::UNSECURE_PATH_PREFIX = "abfs://"; inline static bool IsDfsScheme(const string &fpath) { return fpath.rfind("abfss://", 0) == 0; diff --git a/src/azure_parsed_url.cpp b/src/azure_parsed_url.cpp index 2eb2be0..b5ad702 100644 --- a/src/azure_parsed_url.cpp +++ b/src/azure_parsed_url.cpp @@ -7,14 +7,16 @@ namespace duckdb { AzureParsedUrl ParseUrl(const std::string &url) { constexpr auto invalid_url_format = "The URL %s does not match the expected formats: (azure|az):///[] or the fully qualified one: " - "(abfss|azure|az)://.//[] " - "or abfss://@./[]"; + "(abfs[s]|azure|az)://.//[] " + "or abfs[s]://@./[]"; bool is_fully_qualified; std::string container, storage_account_name, endpoint, prefix, path; if (url.rfind("azure://", 0) != 0 && url.rfind("az://", 0) != 0 && url.rfind(AzureDfsStorageFileSystem::PATH_PREFIX, 0) != 0) { - throw IOException("URL needs to start with azure:// or az:// or %s", AzureDfsStorageFileSystem::PATH_PREFIX); + throw IOException("URL needs to start with azure:// or az:// or %s or %s", + AzureDfsStorageFileSystem::PATH_PREFIX, + AzureDfsStorageFileSystem::UNSECURE_PATH_PREFIX); } const auto prefix_end_pos = url.find("//") + 2; @@ -31,9 +33,12 @@ AzureParsedUrl ParseUrl(const std::string &url) { if (dot_pos != std::string::npos && dot_pos < slash_pos) { is_fully_qualified = true; - if (url.rfind(AzureDfsStorageFileSystem::PATH_PREFIX, 0) == 0 && + if (( + url.rfind(AzureDfsStorageFileSystem::PATH_PREFIX, 0) == 0 || + url.rfind(AzureDfsStorageFileSystem::UNSECURE_PATH_PREFIX, 0) == 0 + ) && at_pos != std::string::npos) { - // syntax is abfss://@./[] + // syntax is abfs[s]://@./[] const auto path_slash_pos = url.find('/', prefix_end_pos + 1); if (path_slash_pos == string::npos) { throw IOException(invalid_url_format, url); @@ -44,7 +49,7 @@ AzureParsedUrl ParseUrl(const std::string &url) { endpoint = url.substr(dot_pos + 1, path_slash_pos - dot_pos - 1); path = url.substr(path_slash_pos + 1); } else { - // syntax is (abfss|azure|az)://.//[] + // syntax is (abfs[s]|azure|az)://.//[] const auto container_slash_pos = url.find('/', dot_pos); if (container_slash_pos == string::npos) { throw IOException(invalid_url_format, url); diff --git a/src/include/azure_dfs_filesystem.hpp b/src/include/azure_dfs_filesystem.hpp index 8f35dc7..9105b4e 100644 --- a/src/include/azure_dfs_filesystem.hpp +++ b/src/include/azure_dfs_filesystem.hpp @@ -51,6 +51,8 @@ class AzureDfsStorageFileSystem : public AzureStorageFileSystem { public: static const string SCHEME; static const string PATH_PREFIX; + static const string UNSECURE_SCHEME; + static const string UNSECURE_PATH_PREFIX; protected: // From AzureFilesystem From 79b79848330e98ff34b9722e833bf6505ecae3dd Mon Sep 17 00:00:00 2001 From: Gareth Western Date: Mon, 2 Sep 2024 11:16:44 +0200 Subject: [PATCH 2/4] fix: add some tests --- test/sql/cloud/hierarchical_namespace.test | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/test/sql/cloud/hierarchical_namespace.test b/test/sql/cloud/hierarchical_namespace.test index 8a4608e..71f25eb 100644 --- a/test/sql/cloud/hierarchical_namespace.test +++ b/test/sql/cloud/hierarchical_namespace.test @@ -63,18 +63,36 @@ SELECT count(*) FROM 'abfss://testing-private/partitioned/l_receipmonth=1997/l_s ---- 1291 +# Check with absolute path using unsecure abfs +query I +SELECT count(*) FROM 'abfs://testing-private/partitioned/l_receipmonth=1997/l_shipmode=TRUCK/data_0.csv'; +---- +1291 + # Check fully qualified name query I SELECT count(*) FROM 'abfss://${AZURE_STORAGE_ACCOUNT}.dfs.core.windows.net/testing-private/partitioned/l_receipmonth=*/l_shipmode=TRUCK/*.csv'; ---- 2317 +# Check fully qualified name using unsecure abfs +query I +SELECT count(*) FROM 'abfs://${AZURE_STORAGE_ACCOUNT}.dfs.core.windows.net/testing-private/partitioned/l_receipmonth=*/l_shipmode=TRUCK/*.csv'; +---- +2317 + # Check fully qualified name abfss alternative syntax query I SELECT count(*) FROM 'abfss://testing-private@${AZURE_STORAGE_ACCOUNT}.dfs.core.windows.net/partitioned/l_receipmonth=*/l_shipmode=TRUCK/*.csv'; ---- 2317 +# Check fully qualified name abfs alternative syntax +query I +SELECT count(*) FROM 'abfs://testing-private@${AZURE_STORAGE_ACCOUNT}.dfs.core.windows.net/partitioned/l_receipmonth=*/l_shipmode=TRUCK/*.csv'; +---- +2317 + # Enable http info for the explain analyze statement statement ok SET azure_http_stats = true; From 538e6917e1cb73b5d83844186e515ba81f742039 Mon Sep 17 00:00:00 2001 From: Sam Ansmink Date: Mon, 2 Sep 2024 12:47:00 +0200 Subject: [PATCH 3/4] fix and test abfs prefix --- src/azure_dfs_filesystem.cpp | 2 +- src/azure_parsed_url.cpp | 2 +- src/azure_secret.cpp | 4 +++ src/azure_storage_account_client.cpp | 4 +-- test/sql/cloud/hierarchical_namespace.test | 42 ++++++++++++++-------- 5 files changed, 36 insertions(+), 18 deletions(-) diff --git a/src/azure_dfs_filesystem.cpp b/src/azure_dfs_filesystem.cpp index 0f95626..27966e3 100644 --- a/src/azure_dfs_filesystem.cpp +++ b/src/azure_dfs_filesystem.cpp @@ -24,7 +24,7 @@ const string AzureDfsStorageFileSystem::UNSECURE_SCHEME = "abfs"; const string AzureDfsStorageFileSystem::UNSECURE_PATH_PREFIX = "abfs://"; inline static bool IsDfsScheme(const string &fpath) { - return fpath.rfind("abfss://", 0) == 0; + return fpath.rfind(AzureDfsStorageFileSystem::PATH_PREFIX, 0) == 0 || fpath.rfind(AzureDfsStorageFileSystem::UNSECURE_PATH_PREFIX, 0) == 0; } static void Walk(const Azure::Storage::Files::DataLake::DataLakeFileSystemClient &fs, const std::string &path, diff --git a/src/azure_parsed_url.cpp b/src/azure_parsed_url.cpp index b5ad702..ccc013a 100644 --- a/src/azure_parsed_url.cpp +++ b/src/azure_parsed_url.cpp @@ -13,7 +13,7 @@ AzureParsedUrl ParseUrl(const std::string &url) { std::string container, storage_account_name, endpoint, prefix, path; if (url.rfind("azure://", 0) != 0 && url.rfind("az://", 0) != 0 && - url.rfind(AzureDfsStorageFileSystem::PATH_PREFIX, 0) != 0) { + url.rfind(AzureDfsStorageFileSystem::PATH_PREFIX, 0) != 0 && url.rfind(AzureDfsStorageFileSystem::UNSECURE_PATH_PREFIX, 0) != 0) { throw IOException("URL needs to start with azure:// or az:// or %s or %s", AzureDfsStorageFileSystem::PATH_PREFIX, AzureDfsStorageFileSystem::UNSECURE_PATH_PREFIX); diff --git a/src/azure_secret.cpp b/src/azure_secret.cpp index ba24951..e9f08f0 100644 --- a/src/azure_secret.cpp +++ b/src/azure_secret.cpp @@ -36,6 +36,7 @@ static unique_ptr CreateAzureSecretFromConfig(ClientContext &context scope.push_back("azure://"); scope.push_back("az://"); scope.push_back(AzureDfsStorageFileSystem::PATH_PREFIX); + scope.push_back(AzureDfsStorageFileSystem::UNSECURE_PATH_PREFIX); } auto result = make_uniq(scope, input.type, input.provider, input.name); @@ -61,6 +62,7 @@ static unique_ptr CreateAzureSecretFromCredentialChain(ClientContext scope.push_back("azure://"); scope.push_back("az://"); scope.push_back(AzureDfsStorageFileSystem::PATH_PREFIX); + scope.push_back(AzureDfsStorageFileSystem::UNSECURE_PATH_PREFIX); } auto result = make_uniq(scope, input.type, input.provider, input.name); @@ -85,6 +87,7 @@ static unique_ptr CreateAzureSecretFromServicePrincipal(ClientContex scope.push_back("azure://"); scope.push_back("az://"); scope.push_back(AzureDfsStorageFileSystem::PATH_PREFIX); + scope.push_back(AzureDfsStorageFileSystem::UNSECURE_PATH_PREFIX); } auto result = make_uniq(scope, input.type, input.provider, input.name); @@ -114,6 +117,7 @@ static unique_ptr CreateAzureSecretFromAccessToken(ClientContext &co scope.push_back("azure://"); scope.push_back("az://"); scope.push_back(AzureDfsStorageFileSystem::PATH_PREFIX); + scope.push_back(AzureDfsStorageFileSystem::UNSECURE_PATH_PREFIX); } auto result = make_uniq(scope, input.type, input.provider, input.name); diff --git a/src/azure_storage_account_client.cpp b/src/azure_storage_account_client.cpp index a1825f0..e0e2293 100644 --- a/src/azure_storage_account_client.cpp +++ b/src/azure_storage_account_client.cpp @@ -591,8 +591,8 @@ ConnectToDfsStorageAccount(optional_ptr opener, const std::string &p if (!azure_parsed_url.is_fully_qualified) { throw InvalidInputException( - "Cannot identified the storage account from path '%s'. To connect anonymously to a " - "storage account easier a fully qualified path has to be provided or secret must be create.", + "Cannot identify the storage account from path '%s'. To connect anonymously to a " + "storage account easier a fully qualified path has to be provided or secret must be created.", path); } diff --git a/test/sql/cloud/hierarchical_namespace.test b/test/sql/cloud/hierarchical_namespace.test index 71f25eb..f9e59a3 100644 --- a/test/sql/cloud/hierarchical_namespace.test +++ b/test/sql/cloud/hierarchical_namespace.test @@ -5,26 +5,35 @@ # Require statement will ensure this test is run with this extension loaded require azure -require-env AZURE_TENANT_ID - -require-env AZURE_CLIENT_ID - -require-env AZURE_CLIENT_SECRET - +#require-env AZURE_TENANT_ID +# +#require-env AZURE_CLIENT_ID +# +#require-env AZURE_CLIENT_SECRET +# require-env AZURE_STORAGE_ACCOUNT +# +#statement ok +#set allow_persistent_secrets=false +# +#statement ok +#CREATE SECRET spn ( +# TYPE AZURE, +# PROVIDER SERVICE_PRINCIPAL, +# TENANT_ID '${AZURE_TENANT_ID}', +# CLIENT_ID '${AZURE_CLIENT_ID}', +# CLIENT_SECRET '${AZURE_CLIENT_SECRET}', +# ACCOUNT_NAME '${AZURE_STORAGE_ACCOUNT}' +#); -statement ok -set allow_persistent_secrets=false statement ok -CREATE SECRET spn ( +CREATE SECRET az1 ( TYPE AZURE, - PROVIDER SERVICE_PRINCIPAL, - TENANT_ID '${AZURE_TENANT_ID}', - CLIENT_ID '${AZURE_CLIENT_ID}', - CLIENT_SECRET '${AZURE_CLIENT_SECRET}', + PROVIDER CREDENTIAL_CHAIN, + CHAIN 'cli', ACCOUNT_NAME '${AZURE_STORAGE_ACCOUNT}' -); +) # Check that with Azure ADLS GEN2 directories are not show in globs query I @@ -102,6 +111,11 @@ EXPLAIN ANALYZE SELECT count(*) FROM 'abfss://testing-private/partitioned/l_rece ---- analyzed_plan :.*HTTP Stats.*in\: 322\.0 KiB.*\#HEAD\: 1.*GET\: 4.*PUT\: 0.*\#POST\: 0.* +query II +EXPLAIN ANALYZE SELECT count(*) FROM 'abfs://testing-private/partitioned/l_receipmonth=*7/l_shipmode=TRUCK/*.csv'; +---- +analyzed_plan :.*HTTP Stats.*in\: 322\.0 KiB.*\#HEAD\: 1.*GET\: 4.*PUT\: 0.*\#POST\: 0.* + query II EXPLAIN ANALYZE SELECT count(*) FROM 'azure://testing-private/partitioned/l_receipmonth=*7/l_shipmode=TRUCK/*.csv'; From 2d21afdad06436a0486928c7f3fea2702aa9cbd3 Mon Sep 17 00:00:00 2001 From: Sam Ansmink Date: Mon, 2 Sep 2024 17:00:04 +0200 Subject: [PATCH 4/4] revert accidentally commited change --- test/sql/cloud/hierarchical_namespace.test | 37 ++++++++-------------- 1 file changed, 14 insertions(+), 23 deletions(-) diff --git a/test/sql/cloud/hierarchical_namespace.test b/test/sql/cloud/hierarchical_namespace.test index f9e59a3..8e72955 100644 --- a/test/sql/cloud/hierarchical_namespace.test +++ b/test/sql/cloud/hierarchical_namespace.test @@ -5,35 +5,26 @@ # Require statement will ensure this test is run with this extension loaded require azure -#require-env AZURE_TENANT_ID -# -#require-env AZURE_CLIENT_ID -# -#require-env AZURE_CLIENT_SECRET -# +require-env AZURE_TENANT_ID + +require-env AZURE_CLIENT_ID + +require-env AZURE_CLIENT_SECRET + require-env AZURE_STORAGE_ACCOUNT -# -#statement ok -#set allow_persistent_secrets=false -# -#statement ok -#CREATE SECRET spn ( -# TYPE AZURE, -# PROVIDER SERVICE_PRINCIPAL, -# TENANT_ID '${AZURE_TENANT_ID}', -# CLIENT_ID '${AZURE_CLIENT_ID}', -# CLIENT_SECRET '${AZURE_CLIENT_SECRET}', -# ACCOUNT_NAME '${AZURE_STORAGE_ACCOUNT}' -#); +statement ok +set allow_persistent_secrets=false statement ok -CREATE SECRET az1 ( +CREATE SECRET spn ( TYPE AZURE, - PROVIDER CREDENTIAL_CHAIN, - CHAIN 'cli', + PROVIDER SERVICE_PRINCIPAL, + TENANT_ID '${AZURE_TENANT_ID}', + CLIENT_ID '${AZURE_CLIENT_ID}', + CLIENT_SECRET '${AZURE_CLIENT_SECRET}', ACCOUNT_NAME '${AZURE_STORAGE_ACCOUNT}' -) +); # Check that with Azure ADLS GEN2 directories are not show in globs query I