Skip to content

Commit

Permalink
Initial projection pushdown optimization (#113)
Browse files Browse the repository at this point in the history
* Add column usage enum
* Add support for detecting column usage in expressions
* Extract GetColumnUsage trait
* Add logic to detect column usage in mark encodings
* Add column usage detection for marks and scales
* Add column usage detection for signals
* Add column usage detection for group marks and full charts
* Add support for project transform
* Add projection pushdown optimization
  • Loading branch information
jonmmease authored May 26, 2022
1 parent 05be5c7 commit e0d24a4
Show file tree
Hide file tree
Showing 25 changed files with 1,535 additions and 23 deletions.
6 changes: 3 additions & 3 deletions javascript/vegafusion-embed/package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

20 changes: 10 additions & 10 deletions python/vegafusion-jupyter/package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

29 changes: 27 additions & 2 deletions vegafusion-core/src/expression/ast/expression.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,13 @@
* this program the details of the active license.
*/
use crate::error::{Result, VegaFusionError};
use crate::expression::column_usage::{
DatasetsColumnUsage, GetDatasetsColumnUsage, VlSelectionFields,
};
use crate::expression::visitors::{
CheckSupportedExprVisitor, ClearSpansVisitor, ExpressionVisitor, GetInputVariablesVisitor,
ImplicitVariablesExprVisitor, MutExpressionVisitor, UpdateVariablesExprVisitor,
CheckSupportedExprVisitor, ClearSpansVisitor, DatasetsColumnUsageVisitor, ExpressionVisitor,
GetInputVariablesVisitor, ImplicitVariablesExprVisitor, MutExpressionVisitor,
UpdateVariablesExprVisitor,
};
use crate::proto::gen::expression::expression::Expr;
use crate::proto::gen::expression::{
Expand All @@ -18,6 +22,8 @@ use crate::proto::gen::expression::{
UnaryExpression,
};
use crate::proto::gen::tasks::Variable;
use crate::task_graph::graph::ScopedVariable;
use crate::task_graph::scope::TaskScope;
use crate::task_graph::task::InputVariable;
use itertools::sorted;
use std::fmt::{Display, Formatter};
Expand Down Expand Up @@ -326,3 +332,22 @@ impl<V: Into<literal::Value>> From<V> for Expr {
Self::Literal(Literal::new(v, &repr))
}
}

impl GetDatasetsColumnUsage for Expression {
    /// Report which dataset columns this expression references.
    ///
    /// A `DatasetsColumnUsageVisitor` is constructed from the supplied arguments and handed to
    /// `self.walk`; the usage the visitor accumulated during the walk is then returned.
    fn datasets_column_usage(
        &self,
        datum_var: &Option<ScopedVariable>,
        usage_scope: &[u32],
        task_scope: &TaskScope,
        vl_selection_fields: &VlSelectionFields,
    ) -> DatasetsColumnUsage {
        let mut usage_visitor =
            DatasetsColumnUsageVisitor::new(datum_var, usage_scope, task_scope, vl_selection_fields);

        // Walking the expression lets the visitor record usage as it goes
        self.walk(&mut usage_visitor);

        usage_visitor.dataset_column_usage
    }
}
196 changes: 196 additions & 0 deletions vegafusion-core/src/expression/column_usage.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,196 @@
/*
* VegaFusion
* Copyright (C) 2022 VegaFusion Technologies LLC
*
* This program is distributed under multiple licenses.
* Please consult the license documentation provided alongside
* this program the details of the active license.
*/
use crate::task_graph::graph::ScopedVariable;
use crate::task_graph::scope::TaskScope;
use std::collections::{HashMap, HashSet};

pub type VlSelectionFields = HashMap<ScopedVariable, Vec<String>>;

/// Describes which columns of a dataset are referenced in a given context.
///
/// Vega specifications are dynamic, so the set of columns used at runtime cannot always be
/// determined statically. `ColumnUsage::Unknown` represents that situation. For projection
/// pushdown, `ColumnUsage::Unknown` means that every column of the original dataset must be
/// retained.
#[derive(Clone, Debug, PartialEq)]
pub enum ColumnUsage {
    Unknown,
    Known(HashSet<String>),
}

impl ColumnUsage {
    /// Usage that is known to reference no columns at all.
    pub fn empty() -> ColumnUsage {
        ColumnUsage::Known(HashSet::new())
    }

    /// Return a new usage that additionally references `column`.
    /// An `Unknown` usage stays `Unknown`.
    pub fn with_column(&self, column: &str) -> ColumnUsage {
        let extra = ColumnUsage::from(column);
        self.union(&extra)
    }

    /// Combine two usages into a new one.
    ///
    /// When both sides are `Known`, the result is `Known` with the columns of both.
    /// When at least one side is `Unknown`, the result is `Unknown`.
    pub fn union(&self, other: &ColumnUsage) -> ColumnUsage {
        if let (ColumnUsage::Known(lhs), ColumnUsage::Known(rhs)) = (self, other) {
            // Both sides are known, so the column sets can simply be merged
            let mut merged = lhs.clone();
            merged.extend(rhs.iter().cloned());
            ColumnUsage::Known(merged)
        } else {
            // At least one side is Unknown, which absorbs everything
            ColumnUsage::Unknown
        }
    }
}

/// A single column name becomes a `Known` usage containing just that column.
impl From<&str> for ColumnUsage {
    fn from(column: &str) -> Self {
        let mut names = HashSet::with_capacity(1);
        names.insert(column.to_string());
        Self::Known(names)
    }
}

/// A slice of borrowed column names becomes a `Known` usage (duplicates collapse).
impl From<&[&str]> for ColumnUsage {
    fn from(columns: &[&str]) -> Self {
        let mut names = HashSet::with_capacity(columns.len());
        for name in columns {
            names.insert(String::from(*name));
        }
        Self::Known(names)
    }
}

/// A slice of owned column names becomes a `Known` usage (duplicates collapse).
impl From<&[String]> for ColumnUsage {
    fn from(columns: &[String]) -> Self {
        let mut names = HashSet::with_capacity(columns.len());
        names.extend(columns.iter().cloned());
        Self::Known(names)
    }
}

/// Struct that tracks the usage of all columns across a collection of datasets
#[derive(Clone, Debug, PartialEq)]
pub struct DatasetsColumnUsage {
pub usages: HashMap<ScopedVariable, ColumnUsage>,
pub aliases: HashMap<ScopedVariable, ScopedVariable>,
}

impl DatasetsColumnUsage {
pub fn empty() -> Self {
Self {
usages: Default::default(),
aliases: Default::default(),
}
}

pub fn with_column_usage(&self, datum_var: &ScopedVariable, usage: ColumnUsage) -> Self {
let other_column_usage = Self {
usages: vec![(datum_var.clone(), usage)].into_iter().collect(),
aliases: Default::default(),
};
self.union(&other_column_usage)
}

pub fn with_unknown_usage(&self, datum_var: &ScopedVariable) -> Self {
self.with_column_usage(datum_var, ColumnUsage::Unknown)
}

pub fn with_alias(&self, from: ScopedVariable, to: ScopedVariable) -> Self {
let mut aliases = self.aliases.clone();
aliases.insert(from, to);
Self {
usages: self.usages.clone(),
aliases,
}
}

/// Take the union of two DatasetColumnUsage instances.
pub fn union(&self, other: &DatasetsColumnUsage) -> DatasetsColumnUsage {
let self_vars: HashSet<_> = self.usages.keys().cloned().collect();
let other_vars: HashSet<_> = other.usages.keys().cloned().collect();
let union_vars: HashSet<_> = self_vars.union(&other_vars).cloned().collect();

// Union aliases
let mut aliases = self.aliases.clone();
for (key, val) in &other.aliases {
aliases.insert(key.clone(), val.clone());
}

let mut usages: HashMap<ScopedVariable, ColumnUsage> = HashMap::new();
for var in union_vars {
// Check if var is an alias
let var = aliases.get(&var).unwrap_or(&var).clone();

let self_usage = self
.usages
.get(&var)
.cloned()
.unwrap_or_else(ColumnUsage::empty);
let other_usage = other
.usages
.get(&var)
.cloned()
.unwrap_or_else(ColumnUsage::empty);
let combined_usage = self_usage.union(&other_usage);
usages.insert(var, combined_usage);
}

Self { usages, aliases }
}
}

/// Implemented by items that can report which dataset columns they use.
pub trait GetDatasetsColumnUsage {
/// Compute the column usage of `self` and return it as a `DatasetsColumnUsage`.
///
/// NOTE(review): the meaning of the parameters is not established by this trait definition.
/// Presumably `datum_var` is the dataset that `datum` refers to in the current context
/// (if any), `usage_scope` and `task_scope` are used to resolve dataset references, and
/// `vl_selection_fields` lists the fields of Vega-Lite selection stores -- confirm against
/// the implementations.
fn datasets_column_usage(
&self,
datum_var: &Option<ScopedVariable>,
usage_scope: &[u32],
task_scope: &TaskScope,
vl_selection_fields: &VlSelectionFields,
) -> DatasetsColumnUsage;
}

#[cfg(test)]
mod tests {
    use crate::expression::column_usage::ColumnUsage;

    /// Build a `ColumnUsage::Known` from a list of column names.
    fn known(columns: &[&str]) -> ColumnUsage {
        ColumnUsage::from(columns)
    }

    #[test]
    fn test_with_column() {
        // Columns added one at a time end up in the same set as columns given up front
        let base = known(&["one", "two"]);
        let extended = base.with_column("three").with_column("four");
        assert_eq!(extended, known(&["one", "two", "three", "four"]))
    }

    #[test]
    fn test_union_known_known() {
        // Overlapping columns ("two") appear only once in the result
        let lhs = known(&["one", "two"]);
        let rhs = known(&["two", "three", "four"]);
        assert_eq!(lhs.union(&rhs), known(&["one", "two", "three", "four"]))
    }

    #[test]
    fn test_union_known_unknown() {
        let lhs = known(&["one", "two"]);
        assert_eq!(lhs.union(&ColumnUsage::Unknown), ColumnUsage::Unknown)
    }

    #[test]
    fn test_union_unknown_known() {
        let rhs = known(&["two", "three", "four"]);
        assert_eq!(ColumnUsage::Unknown.union(&rhs), ColumnUsage::Unknown)
    }

    #[test]
    fn test_union_unknown_unknown() {
        let both_unknown = ColumnUsage::Unknown.union(&ColumnUsage::Unknown);
        assert_eq!(both_unknown, ColumnUsage::Unknown)
    }
}
1 change: 1 addition & 0 deletions vegafusion-core/src/expression/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
* this program the details of the active license.
*/
pub mod ast;
pub mod column_usage;
pub mod lexer;
pub mod ops;
pub mod parser;
Expand Down
Loading

0 comments on commit e0d24a4

Please sign in to comment.