Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor: optimize get/upsert copied file info #8282

Merged
merged 4 commits into from
Oct 19, 2022
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
115 changes: 99 additions & 16 deletions src/query/service/src/interpreters/interpreter_copy_v2.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ use crate::sql::plans::CopyPlanV2;
use crate::sql::plans::Plan;
use crate::storages::stage::StageTable;

const MAX_QUERY_COPIED_FILES_NUM: usize = 50;

pub struct CopyInterpreterV2 {
ctx: Arc<QueryContext>,
plan: CopyPlanV2,
Expand All @@ -51,6 +53,50 @@ impl CopyInterpreterV2 {
Ok(CopyInterpreterV2 { ctx, plan })
}

/// Fetches the copied-file records for one batch of file names and merges them
/// into `file_info`.
///
/// Resolves the catalog named `catalog_name` from the query context, sends a single
/// `get_table_copied_file_info` request for `table_id` that carries every name
/// currently held in `query_copied_files`, and extends `file_info` with the
/// `file_info` entries of the response.
///
/// On success `query_copied_files` is left empty so the caller can keep reusing it
/// as the accumulator for the next batch.
///
/// # Errors
///
/// Propagates the error from `get_catalog` (the batch is left untouched in that
/// case) or from `get_table_copied_file_info` (the batch has already been moved
/// into the request and is empty).
async fn do_query_copied_files_info(
    &self,
    catalog_name: String,
    database_name: String,
    table_id: u64,
    query_copied_files: &mut Vec<String>,
    file_info: &mut BTreeMap<String, TableCopiedFileInfo>,
) -> Result<()> {
    let catalog = self.ctx.get_catalog(&catalog_name)?;
    let tenant = self.ctx.get_tenant();

    // Move the batch into the request rather than cloning it and clearing the
    // original afterwards: `take` hands over the names and leaves an empty Vec.
    let req = GetTableCopiedFileReq {
        table_id,
        files: std::mem::take(query_copied_files),
    };
    let resp = catalog
        .get_table_copied_file_info(&tenant, &database_name, req)
        .await?;

    file_info.extend(resp.file_info);
    Ok(())
}

/// Upserts one batch of copied-file records for `table_id` through the catalog.
///
/// Resolves the catalog named `catalog_name` from the query context and sends a
/// single `upsert_table_copied_file_info` request containing every entry currently
/// held in `copy_stage_files`, with `expire_at` set to `None`.
///
/// On success `copy_stage_files` is left empty so the caller can keep reusing it
/// as the accumulator for the next batch.
///
/// # Errors
///
/// Propagates the error from `get_catalog` (the batch is left untouched in that
/// case) or from `upsert_table_copied_file_info` (the batch has already been moved
/// into the request and is empty).
async fn do_upsert_copied_files_info(
    &self,
    catalog_name: String,
    database_name: String,
    table_id: u64,
    copy_stage_files: &mut BTreeMap<String, TableCopiedFileInfo>,
) -> Result<()> {
    // Resolve the catalog first so a lookup failure does not consume the batch.
    let catalog = self.ctx.get_catalog(&catalog_name)?;

    // Move the batch into the request rather than cloning the whole map and
    // clearing the original afterwards: `take` leaves an empty map behind.
    let req = UpsertTableCopiedFileReq {
        table_id,
        file_info: std::mem::take(copy_stage_files),
        expire_at: None,
    };
    catalog
        .upsert_table_copied_file_info(&self.ctx.get_tenant(), &database_name, req)
        .await?;
    Ok(())
}

async fn filter_duplicate_files(
&self,
force: bool,
Expand All @@ -66,21 +112,42 @@ impl CopyInterpreterV2 {
.get_table(&tenant, database_name, table_name)
.await?;
let table_id = table.get_id();
let req = GetTableCopiedFileReq {
table_id,
files: files.to_owned(),
};

let mut file_map = BTreeMap::new();

if !force {
// If force is false, copy only the files that do not match the copied-file info recorded in meta.
let resp = catalog
.get_table_copied_file_info(&tenant, database_name, req)
let mut file_info = BTreeMap::new();
let mut query_copied_files = vec![];

for file in files.iter() {
lichuang marked this conversation as resolved.
Show resolved Hide resolved
query_copied_files.push(file.clone());
if query_copied_files.len() > MAX_QUERY_COPIED_FILES_NUM {
self.do_query_copied_files_info(
catalog_name.to_string(),
database_name.to_string(),
table_id,
&mut query_copied_files,
&mut file_info,
)
.await?;
}
}
if !query_copied_files.is_empty() {
self.do_query_copied_files_info(
catalog_name.to_string(),
database_name.to_string(),
table_id,
&mut query_copied_files,
&mut file_info,
)
.await?;
}

for file in files.iter() {
let stage_file = stat_file(&self.ctx, &table_info.stage_info, file).await?;

if let Some(file_info) = resp.file_info.get(file) {
if let Some(file_info) = file_info.get(file) {
match &file_info.etag {
Some(_etag) => {
// No need to copy the file again if etag is_some and match.
Expand Down Expand Up @@ -135,17 +202,33 @@ impl CopyInterpreterV2 {
) -> Result<()> {
tracing::info!("upsert_copied_files_info: {:?}", copy_stage_files);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please change this log level from info to debug :)


if !copy_stage_files.is_empty() {
let req = UpsertTableCopiedFileReq {
table_id,
file_info: copy_stage_files.clone(),
expire_at: None,
};
let catalog = self.ctx.get_catalog(catalog_name)?;
catalog
.upsert_table_copied_file_info(&self.ctx.get_tenant(), database_name, req)
if copy_stage_files.is_empty() {
return Ok(());
}

let mut do_copy_stage_files = BTreeMap::new();
for (file_name, file_info) in copy_stage_files {
do_copy_stage_files.insert(file_name.clone(), file_info);
if do_copy_stage_files.len() > MAX_QUERY_COPIED_FILES_NUM {
BohuTANG marked this conversation as resolved.
Show resolved Hide resolved
self.do_upsert_copied_files_info(
catalog_name.to_string(),
database_name.to_string(),
table_id,
&mut do_copy_stage_files,
)
.await?;
}
}
if !do_copy_stage_files.is_empty() {
self.do_upsert_copied_files_info(
catalog_name.to_string(),
database_name.to_string(),
table_id,
&mut do_copy_stage_files,
)
.await?;
}

Ok(())
}

Expand Down