Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: Reduce sort memory usage v2 by @richox #2134

Closed
@richox wants to merge 5 commits (target and source branch names are missing from this capture)
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions ballista/rust/core/src/serde/physical_plan/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ use datafusion::physical_plan::hash_join::{HashJoinExec, PartitionMode};
use datafusion::physical_plan::limit::{GlobalLimitExec, LocalLimitExec};
use datafusion::physical_plan::projection::ProjectionExec;
use datafusion::physical_plan::repartition::RepartitionExec;
use datafusion::physical_plan::sorts::sort::SortExec;
use datafusion::physical_plan::sorts::sort2::SortExec2;
use datafusion::physical_plan::union::UnionExec;
use datafusion::physical_plan::windows::{create_window_expr, WindowAggExec};
use datafusion::physical_plan::{
Expand Down Expand Up @@ -522,7 +522,7 @@ impl AsExecutionPlan for PhysicalPlanNode {
}
})
.collect::<Result<Vec<_>, _>>()?;
Ok(Arc::new(SortExec::try_new(exprs, input)?))
Ok(Arc::new(SortExec2::try_new(exprs, input)?))
}
PhysicalPlanType::Unresolved(unresolved_shuffle) => {
let schema = Arc::new(convert_required!(unresolved_shuffle.schema)?);
Expand Down Expand Up @@ -849,7 +849,7 @@ impl AsExecutionPlan for PhysicalPlanNode {
},
))),
})
} else if let Some(exec) = plan.downcast_ref::<SortExec>() {
} else if let Some(exec) = plan.downcast_ref::<SortExec2>() {
let input = protobuf::PhysicalPlanNode::try_from_physical_plan(
exec.input().to_owned(),
extension_codec,
Expand Down Expand Up @@ -1032,7 +1032,7 @@ mod roundtrip_tests {
hash_aggregate::{AggregateMode, HashAggregateExec},
hash_join::{HashJoinExec, PartitionMode},
limit::{GlobalLimitExec, LocalLimitExec},
sorts::sort::SortExec,
sorts::sort2::SortExec2,
AggregateExpr, ExecutionPlan, Partitioning, PhysicalExpr, Statistics,
},
prelude::SessionContext,
Expand Down Expand Up @@ -1193,7 +1193,7 @@ mod roundtrip_tests {
},
},
];
roundtrip_test(Arc::new(SortExec::try_new(
roundtrip_test(Arc::new(SortExec2::try_new(
sort_exprs,
Arc::new(EmptyExec::new(false, schema)),
)?))
Expand Down
6 changes: 3 additions & 3 deletions ballista/rust/core/src/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ use datafusion::physical_plan::filter::FilterExec;
use datafusion::physical_plan::hash_aggregate::HashAggregateExec;
use datafusion::physical_plan::hash_join::HashJoinExec;
use datafusion::physical_plan::projection::ProjectionExec;
use datafusion::physical_plan::sorts::sort::SortExec;
use datafusion::physical_plan::sorts::sort2::SortExec2;
use datafusion::physical_plan::{metrics, ExecutionPlan, RecordBatchStream};
use futures::{Stream, StreamExt};

Expand Down Expand Up @@ -153,8 +153,8 @@ fn build_exec_plan_diagram(
) -> Result<usize> {
let operator_str = if plan.as_any().downcast_ref::<HashAggregateExec>().is_some() {
"HashAggregateExec"
} else if plan.as_any().downcast_ref::<SortExec>().is_some() {
"SortExec"
} else if plan.as_any().downcast_ref::<SortExec2>().is_some() {
"SortExec2"
} else if plan.as_any().downcast_ref::<ProjectionExec>().is_some() {
"ProjectionExec"
} else if plan.as_any().downcast_ref::<HashJoinExec>().is_some() {
Expand Down
4 changes: 2 additions & 2 deletions ballista/rust/scheduler/src/planner.rs
Original file line number Diff line number Diff line change
Expand Up @@ -274,7 +274,7 @@ mod test {
use datafusion::physical_plan::coalesce_batches::CoalesceBatchesExec;
use datafusion::physical_plan::hash_aggregate::{AggregateMode, HashAggregateExec};
use datafusion::physical_plan::hash_join::HashJoinExec;
use datafusion::physical_plan::sorts::sort::SortExec;
use datafusion::physical_plan::sorts::sort2::SortExec2;
use datafusion::physical_plan::{
coalesce_partitions::CoalescePartitionsExec, projection::ProjectionExec,
};
Expand Down Expand Up @@ -361,7 +361,7 @@ mod test {

// verify stage 2
let stage2 = stages[2].children()[0].clone();
let sort = downcast_exec!(stage2, SortExec);
let sort = downcast_exec!(stage2, SortExec2);
let coalesce_partitions = sort.children()[0].clone();
let coalesce_partitions =
downcast_exec!(coalesce_partitions, CoalescePartitionsExec);
Expand Down
21 changes: 6 additions & 15 deletions benchmarks/queries/q1.sql
Original file line number Diff line number Diff line change
@@ -1,21 +1,12 @@
select
l_returnflag,
l_linestatus,
sum(l_quantity) as sum_qty,
sum(l_extendedprice) as sum_base_price,
sum(l_extendedprice * (1 - l_discount)) as sum_disc_price,
sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge,
avg(l_quantity) as avg_qty,
avg(l_extendedprice) as avg_price,
avg(l_discount) as avg_disc,
count(*) as count_order
l_quantity,
l_extendedprice,
l_discount,
l_tax
from
lineitem
where
l_shipdate <= date '1998-09-02'
group by
l_returnflag,
l_linestatus
order by
l_returnflag,
l_linestatus;
l_extendedprice,
l_discount;
14 changes: 7 additions & 7 deletions datafusion/core/src/physical_optimizer/repartition.rs
Original file line number Diff line number Diff line change
Expand Up @@ -245,7 +245,7 @@ mod tests {
use crate::physical_plan::hash_aggregate::{AggregateMode, HashAggregateExec};
use crate::physical_plan::limit::{GlobalLimitExec, LocalLimitExec};
use crate::physical_plan::projection::ProjectionExec;
use crate::physical_plan::sorts::sort::SortExec;
use crate::physical_plan::sorts::sort2::SortExec2;
use crate::physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec;
use crate::physical_plan::union::UnionExec;
use crate::physical_plan::{displayable, Statistics};
Expand Down Expand Up @@ -290,7 +290,7 @@ mod tests {
expr: col("c1", &schema()).unwrap(),
options: SortOptions::default(),
}];
Arc::new(SortExec::try_new(sort_exprs, input).unwrap())
Arc::new(SortExec2::try_new(sort_exprs, input).unwrap())
}

fn projection_exec(input: Arc<dyn ExecutionPlan>) -> Arc<dyn ExecutionPlan> {
Expand Down Expand Up @@ -413,7 +413,7 @@ mod tests {
"GlobalLimitExec: limit=100",
"LocalLimitExec: limit=100",
// data is sorted so can't repartition here
"SortExec: [c1@0 ASC]",
"SortExec2: [c1@0 ASC]",
"ParquetExec: limit=None, partitions=[x], projection=[c1]",
];

Expand All @@ -431,7 +431,7 @@ mod tests {
"FilterExec: c1@0",
// data is sorted so can't repartition here even though
// filter would benefit from parallelism, the answers might be wrong
"SortExec: [c1@0 ASC]",
"SortExec2: [c1@0 ASC]",
"ParquetExec: limit=None, partitions=[x], projection=[c1]",
];

Expand Down Expand Up @@ -519,7 +519,7 @@ mod tests {
let expected = &[
"SortPreservingMergeExec: [c1@0 ASC]",
// Expect repartition on the input to the sort (as it can benefit from additional parallelism)
"SortExec: [c1@0 ASC]",
"SortExec2: [c1@0 ASC]",
"ProjectionExec: expr=[c1@0 as c1]",
"RepartitionExec: partitioning=RoundRobinBatch(10)",
"ParquetExec: limit=None, partitions=[x], projection=[c1]",
Expand All @@ -536,7 +536,7 @@ mod tests {
let expected = &[
"SortPreservingMergeExec: [c1@0 ASC]",
// Expect repartition on the input to the sort (as it can benefit from additional parallelism)
"SortExec: [c1@0 ASC]",
"SortExec2: [c1@0 ASC]",
"FilterExec: c1@0",
"RepartitionExec: partitioning=RoundRobinBatch(10)",
"ParquetExec: limit=None, partitions=[x], projection=[c1]",
Expand All @@ -555,7 +555,7 @@ mod tests {
let expected = &[
"SortPreservingMergeExec: [c1@0 ASC]",
// Expect repartition on the input to the sort (as it can benefit from additional parallelism)
"SortExec: [c1@0 ASC]",
"SortExec2: [c1@0 ASC]",
"ProjectionExec: expr=[c1@0 as c1]",
"FilterExec: c1@0",
// repartition is lowest down
Expand Down
8 changes: 4 additions & 4 deletions datafusion/core/src/physical_plan/planner.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ use crate::physical_plan::hash_join::HashJoinExec;
use crate::physical_plan::limit::{GlobalLimitExec, LocalLimitExec};
use crate::physical_plan::projection::ProjectionExec;
use crate::physical_plan::repartition::RepartitionExec;
use crate::physical_plan::sorts::sort::SortExec;
use crate::physical_plan::sorts::sort2::SortExec2;
use crate::physical_plan::udf;
use crate::physical_plan::windows::WindowAggExec;
use crate::physical_plan::{join_utils, Partitioning};
Expand Down Expand Up @@ -457,9 +457,9 @@ impl DefaultPhysicalPlanner {
})
.collect::<Result<Vec<_>>>()?;
Arc::new(if can_repartition {
SortExec::new_with_partitioning(sort_keys, input_exec, true)
SortExec2::new_with_partitioning(sort_keys, input_exec, true)
} else {
SortExec::try_new(sort_keys, input_exec)?
SortExec2::try_new(sort_keys, input_exec)?
})
};

Expand Down Expand Up @@ -704,7 +704,7 @@ impl DefaultPhysicalPlanner {
)),
})
.collect::<Result<Vec<_>>>()?;
Ok(Arc::new(SortExec::try_new(sort_expr, physical_input)?) )
Ok(Arc::new(SortExec2::try_new(sort_expr, physical_input)?) )
}
LogicalPlan::Join(Join {
left,
Expand Down
9 changes: 9 additions & 0 deletions datafusion/core/src/physical_plan/sorts/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ use std::sync::Arc;
use std::task::{Context, Poll};

pub mod sort;
pub mod sort2;
pub mod sort_preserving_merge;

/// A `SortKeyCursor` is created from a `RecordBatch`, and a set of
Expand Down Expand Up @@ -128,6 +129,14 @@ impl SortKeyCursor {
)));
}

if self.is_finished() && other.is_finished() {
return Ok(Ordering::Equal);
} else if self.is_finished() {
return Ok(Ordering::Greater);
} else if other.is_finished() {
return Ok(Ordering::Less);
}

let zipped: Vec<((&ArrayRef, &ArrayRef), &SortOptions)> = self
.sort_columns
.iter()
Expand Down
Loading