diff --git a/crates/polars-io/src/cloud/glob.rs b/crates/polars-io/src/cloud/glob.rs index fac221b47402..b61ac58baa0a 100644 --- a/crates/polars-io/src/cloud/glob.rs +++ b/crates/polars-io/src/cloud/glob.rs @@ -108,13 +108,9 @@ impl CloudLocation { (bucket, key) }; - let key = if parsed.scheme().starts_with("http") { - percent_encoding::percent_decode_str(key) - .decode_utf8() - .map_err(to_compute_err)? - } else { - std::borrow::Cow::Borrowed(key) - }; + let key = percent_encoding::percent_decode_str(key) + .decode_utf8() + .map_err(to_compute_err)?; let (mut prefix, expansion) = extract_prefix_expansion(&key)?; if is_local && key.starts_with(DELIMITER) { prefix.insert(0, DELIMITER); diff --git a/crates/polars-io/src/cloud/object_store_setup.rs b/crates/polars-io/src/cloud/object_store_setup.rs index ef8618f7ea0d..4d43d8184d62 100644 --- a/crates/polars-io/src/cloud/object_store_setup.rs +++ b/crates/polars-io/src/cloud/object_store_setup.rs @@ -40,6 +40,19 @@ fn url_and_creds_to_key(url: &Url, options: Option<&CloudOptions>) -> String { ) } +/// Simply construct an object_store `Path` struct from a string. +pub fn object_path_from_string(path: String) -> object_store::path::Path { + // We transmute because they don't expose a way to just create it from a string + // without encoding or decoding it. If one day we can't use this transmute hack + // anymore then we'll just have to `Path::from_url_path(percent_encode(path))` + { + const _: [(); std::mem::align_of::<String>()] = + [(); std::mem::align_of::<object_store::path::Path>()]; + }; + + unsafe { std::mem::transmute::<String, object_store::path::Path>(path) } +} + /// Build an [`ObjectStore`] based on the URL and passed in url. Return the cloud location and an implementation of the object store. 
pub async fn build_object_store( url: &str, @@ -127,3 +140,15 @@ pub async fn build_object_store( } Ok((cloud_location, store)) } + +mod test { + #[test] + fn test_object_path_from_string() { + use super::object_path_from_string; + + let path = "%25"; + let out = object_path_from_string(path.to_string()); + + assert_eq!(out.as_ref(), path); + } +} diff --git a/crates/polars-io/src/cloud/options.rs b/crates/polars-io/src/cloud/options.rs index 197d7467a628..59266282113c 100644 --- a/crates/polars-io/src/cloud/options.rs +++ b/crates/polars-io/src/cloud/options.rs @@ -131,16 +131,6 @@ impl CloudType { #[cfg(feature = "cloud")] pub(crate) fn parse_url(input: &str) -> std::result::Result<url::Url, url::ParseError> { Ok(if input.contains("://") { - let input = if input.starts_with("https://") { - std::borrow::Cow::Borrowed(input) - } else { - // Some paths may contain '%', we need to double-encode as it doesn't seem - // possible to construct `Url` without having it decode the path. - // TODO: Maybe we can avoid using `Url`. - const PERC: percent_encoding::AsciiSet = percent_encoding::CONTROLS.add(b'%'); - std::borrow::Cow::<str>::from(percent_encoding::percent_encode(input.as_bytes(), &PERC)) - }; - let input = input.as_ref(); url::Url::parse(input)? 
} else { let path = std::path::Path::new(input); diff --git a/crates/polars-io/src/file_cache/utils.rs b/crates/polars-io/src/file_cache/utils.rs index 7bdb45a2689d..6262cfe772aa 100644 --- a/crates/polars-io/src/file_cache/utils.rs +++ b/crates/polars-io/src/file_cache/utils.rs @@ -3,12 +3,14 @@ use std::sync::Arc; use std::time::UNIX_EPOCH; use once_cell::sync::Lazy; -use polars_error::{to_compute_err, PolarsError, PolarsResult}; +use polars_error::{PolarsError, PolarsResult}; use super::cache::{get_env_file_cache_ttl, FILE_CACHE}; use super::entry::FileCacheEntry; use super::file_fetcher::{CloudFileFetcher, LocalFileFetcher}; -use crate::cloud::{build_object_store, CloudLocation, CloudOptions, PolarsObjectStore}; +use crate::cloud::{ + build_object_store, object_path_from_string, CloudLocation, CloudOptions, PolarsObjectStore, +}; use crate::pl_async; use crate::prelude::{is_cloud_url, POLARS_TEMP_DIR_BASE_PATH}; use crate::utils::ensure_directory_init; @@ -83,8 +85,7 @@ pub fn init_entries_from_uri_list<A: AsRef<[Arc<str>]>>( let cloud_path = { assert!(expansion.is_none(), "path should not contain wildcards"); - object_store::path::Path::from_url_path(prefix) - .map_err(to_compute_err)? 
+ object_path_from_string(prefix) }; let object_store = object_store.clone(); diff --git a/crates/polars-io/src/ipc/ipc_reader_async.rs b/crates/polars-io/src/ipc/ipc_reader_async.rs index f18046f550d5..2a78f4bfe3b6 100644 --- a/crates/polars-io/src/ipc/ipc_reader_async.rs +++ b/crates/polars-io/src/ipc/ipc_reader_async.rs @@ -8,7 +8,9 @@ use polars_core::frame::DataFrame; use polars_core::schema::Schema; use polars_error::{polars_bail, polars_err, to_compute_err, PolarsResult}; -use crate::cloud::{build_object_store, CloudLocation, CloudOptions, PolarsObjectStore}; +use crate::cloud::{ + build_object_store, object_path_from_string, CloudLocation, CloudOptions, PolarsObjectStore, +}; use crate::file_cache::{init_entries_from_uri_list, FileCacheEntry}; use crate::predicates::PhysicalIoExpr; use crate::prelude::{materialize_projection, IpcReader}; @@ -76,7 +78,7 @@ impl IpcReaderAsync { // Any wildcards should already have been resolved here. Without this assertion they would // be ignored. debug_assert!(expansion.is_none(), "path should not contain wildcards"); - Path::from_url_path(prefix).map_err(to_compute_err)? 
+ object_path_from_string(prefix) }; Ok(Self { diff --git a/crates/polars-io/src/parquet/read/async_impl.rs b/crates/polars-io/src/parquet/read/async_impl.rs index 67b8be1820dc..e53cd8922d71 100644 --- a/crates/polars-io/src/parquet/read/async_impl.rs +++ b/crates/polars-io/src/parquet/read/async_impl.rs @@ -5,7 +5,6 @@ use arrow::datatypes::ArrowSchemaRef; use bytes::Bytes; use object_store::path::Path as ObjectPath; use polars_core::config::{get_rg_prefetch_size, verbose}; -use polars_core::error::to_compute_err; use polars_core::prelude::*; use polars_parquet::read::RowGroupMetaData; use polars_parquet::write::FileMetaData; @@ -16,7 +15,9 @@ use tokio::sync::Mutex; use super::mmap::ColumnStore; use super::predicates::read_this_row_group; use super::read_impl::compute_row_group_range; -use crate::cloud::{build_object_store, CloudLocation, CloudOptions, PolarsObjectStore}; +use crate::cloud::{ + build_object_store, object_path_from_string, CloudLocation, CloudOptions, PolarsObjectStore, +}; use crate::parquet::metadata::FileMetaDataRef; use crate::pl_async::get_runtime; use crate::predicates::PhysicalIoExpr; @@ -48,7 +49,7 @@ impl ParquetObjectStore { // Any wildcards should already have been resolved here. Without this assertion they would // be ignored. 
debug_assert!(expansion.is_none(), "path should not contain wildcards"); - let path = ObjectPath::from_url_path(prefix).map_err(to_compute_err)?; + let path = object_path_from_string(prefix); Ok(ParquetObjectStore { store: PolarsObjectStore::new(store), diff --git a/crates/polars-lazy/src/scan/file_list_reader.rs b/crates/polars-lazy/src/scan/file_list_reader.rs index 904716a371b4..eaec83618732 100644 --- a/crates/polars-lazy/src/scan/file_list_reader.rs +++ b/crates/polars-lazy/src/scan/file_list_reader.rs @@ -55,6 +55,8 @@ fn expand_paths( if is_cloud || { cfg!(not(target_family = "windows")) && config::force_async() } { #[cfg(feature = "async")] { + use polars_io::cloud::object_path_from_string; + let format_path = |scheme: &str, bucket: &str, location: &str| { if is_cloud { format!("{}://{}/{}", scheme, bucket, location) @@ -70,7 +72,7 @@ fn expand_paths( let (cloud_location, store) = polars_io::cloud::build_object_store(path, cloud_options).await?; - let prefix = cloud_location.prefix.clone().into(); + let prefix = object_path_from_string(cloud_location.prefix.clone()); let out = if !path.ends_with("/") && cloud_location.expansion.is_none() diff --git a/py-polars/tests/unit/io/test_scan.py b/py-polars/tests/unit/io/test_scan.py index 0560985a1609..4819f730df34 100644 --- a/py-polars/tests/unit/io/test_scan.py +++ b/py-polars/tests/unit/io/test_scan.py @@ -489,3 +489,20 @@ def test_scan_glob_excludes_directories(tmp_path: Path) -> None: pl.scan_parquet(tmp_path / "**/*").collect(), pl.concat(3 * [df]) ) assert_frame_equal(pl.scan_parquet(tmp_path / "*").collect(), df) + + +@pytest.mark.parametrize("file_name", ["a b", "a %25 b"]) +def test_scan_async_whitespace_in_path( + tmp_path: Path, monkeypatch: Any, file_name: str +) -> None: + monkeypatch.setenv("POLARS_FORCE_ASYNC", "1") + tmp_path.mkdir(exist_ok=True) + + path = tmp_path / f"{file_name}.parquet" + df = pl.DataFrame({"x": 1}) + df.write_parquet(path) + assert_frame_equal(pl.scan_parquet(path).collect(), 
df) + assert_frame_equal(pl.scan_parquet(tmp_path).collect(), df) + assert_frame_equal(pl.scan_parquet(tmp_path / "*").collect(), df) + assert_frame_equal(pl.scan_parquet(tmp_path / "*.parquet").collect(), df) + path.unlink()