diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml
index 1ba9f769f3f0..114c0b6a526a 100644
--- a/datafusion/core/Cargo.toml
+++ b/datafusion/core/Cargo.toml
@@ -80,6 +80,7 @@ ordered-float = "3.0"
 parking_lot = "0.12"
 parquet = { version = "24.0.0", features = ["arrow", "async"] }
 paste = "^1.0"
+percent-encoding = "2.2.0"
 pin-project-lite = "^0.2.7"
 pyo3 = { version = "0.17.1", optional = true }
 rand = "0.8"
diff --git a/datafusion/core/src/datasource/listing/url.rs b/datafusion/core/src/datasource/listing/url.rs
index 8676f2118728..d1a527f23ac7 100644
--- a/datafusion/core/src/datasource/listing/url.rs
+++ b/datafusion/core/src/datasource/listing/url.rs
@@ -23,6 +23,7 @@ use glob::Pattern;
 use itertools::Itertools;
 use object_store::path::Path;
 use object_store::{ObjectMeta, ObjectStore};
+use percent_encoding;
 use url::Url;
 
 /// A parsed URL identifying files for a listing table, see [`ListingTableUrl::parse`]
@@ -108,7 +109,9 @@ impl ListingTableUrl {
 
     /// Creates a new [`ListingTableUrl`] from a url and optional glob expression
     fn new(url: Url, glob: Option<Pattern>) -> Self {
-        let prefix = Path::parse(url.path()).expect("should be URL safe");
+        let decoded_path =
+            percent_encoding::percent_decode_str(url.path()).decode_utf8_lossy();
+        let prefix = Path::from(decoded_path.as_ref());
         Self { url, prefix, glob }
     }
 
@@ -246,6 +249,15 @@ mod tests {
         let url = ListingTableUrl::parse("file:///foo").unwrap();
         let child = Path::parse("/foob/bar").unwrap();
         assert!(url.strip_prefix(&child).is_none());
+
+        let url = ListingTableUrl::parse("file:///foo/ bar").unwrap();
+        assert_eq!(url.prefix.as_ref(), "foo/ bar");
+
+        let url = ListingTableUrl::parse("file:///foo/bar?").unwrap();
+        assert_eq!(url.prefix.as_ref(), "foo/bar");
+
+        let url = ListingTableUrl::parse("file:///foo/😺").unwrap();
+        assert_eq!(url.prefix.as_ref(), "foo/%F0%9F%98%BA");
     }
 
     #[test]