From 44338c103fe2aabceddb1bd9d79c54febbd9b2f6 Mon Sep 17 00:00:00 2001 From: QP Hou Date: Sat, 4 Feb 2023 20:40:40 -0800 Subject: [PATCH] minor: optimize partition lookup for vacuum loop (#1120) # Description Avoid resolving table partition inside the file iteration loop --- rust/src/operations/vacuum.rs | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/rust/src/operations/vacuum.rs b/rust/src/operations/vacuum.rs index ea0fe43fe8..a4eab1b317 100644 --- a/rust/src/operations/vacuum.rs +++ b/rust/src/operations/vacuum.rs @@ -155,12 +155,18 @@ impl VacuumBuilder { let mut files_to_delete = vec![]; let mut all_files = self.store.list(None).await.map_err(DeltaTableError::from)?; + let partition_columns = &self + .snapshot + .current_metadata() + .ok_or(DeltaTableError::NoMetadata)? + .partition_columns; + while let Some(obj_meta) = all_files.next().await { // TODO should we allow NotFound here in case we have a temporary commit file in the list let obj_meta = obj_meta.map_err(DeltaTableError::from)?; if valid_files.contains(&obj_meta.location) // file is still being tracked in table || !expired_tombstones.contains(obj_meta.location.as_ref()) // file is not an expired tombstone - || is_hidden_directory(&self.snapshot, &obj_meta.location)? + || is_hidden_directory(partition_columns, &obj_meta.location)? { continue; } @@ -236,15 +242,12 @@ impl VacuumPlan { /// Names of the form partitionCol=[value] are partition directories, and should be /// deleted even if they'd normally be hidden. The _db_index directory contains (bloom filter) /// indexes and these must be deleted when the data they are tied to is deleted. 
-fn is_hidden_directory(snapshot: &DeltaTableState, path: &Path) -> Result<bool, DeltaTableError> { +fn is_hidden_directory(partition_columns: &[String], path: &Path) -> Result<bool, DeltaTableError> { let path_name = path.to_string(); Ok((path_name.starts_with('.') || path_name.starts_with('_')) && !path_name.starts_with("_delta_index") && !path_name.starts_with("_change_data") - && !snapshot - .current_metadata() - .ok_or(DeltaTableError::NoMetadata)? - .partition_columns + && !partition_columns .iter() .any(|partition_column| path_name.starts_with(partition_column))) }