Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Initial projection pushdown optimization #113

Merged
merged 19 commits into from
May 26, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions javascript/vegafusion-embed/package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

20 changes: 10 additions & 10 deletions python/vegafusion-jupyter/package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

29 changes: 27 additions & 2 deletions vegafusion-core/src/expression/ast/expression.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,13 @@
* this program the details of the active license.
*/
use crate::error::{Result, VegaFusionError};
use crate::expression::column_usage::{
DatasetsColumnUsage, GetDatasetsColumnUsage, VlSelectionFields,
};
use crate::expression::visitors::{
CheckSupportedExprVisitor, ClearSpansVisitor, ExpressionVisitor, GetInputVariablesVisitor,
ImplicitVariablesExprVisitor, MutExpressionVisitor, UpdateVariablesExprVisitor,
CheckSupportedExprVisitor, ClearSpansVisitor, DatasetsColumnUsageVisitor, ExpressionVisitor,
GetInputVariablesVisitor, ImplicitVariablesExprVisitor, MutExpressionVisitor,
UpdateVariablesExprVisitor,
};
use crate::proto::gen::expression::expression::Expr;
use crate::proto::gen::expression::{
Expand All @@ -18,6 +22,8 @@ use crate::proto::gen::expression::{
UnaryExpression,
};
use crate::proto::gen::tasks::Variable;
use crate::task_graph::graph::ScopedVariable;
use crate::task_graph::scope::TaskScope;
use crate::task_graph::task::InputVariable;
use itertools::sorted;
use std::fmt::{Display, Formatter};
Expand Down Expand Up @@ -326,3 +332,22 @@ impl<V: Into<literal::Value>> From<V> for Expr {
Self::Literal(Literal::new(v, &repr))
}
}

impl GetDatasetsColumnUsage for Expression {
    /// Compute the dataset column usage of this expression.
    ///
    /// A `DatasetsColumnUsageVisitor` is built from the supplied arguments, the
    /// expression tree is walked with it, and the usage accumulated by the
    /// visitor is returned.
    fn datasets_column_usage(
        &self,
        datum_var: &Option<ScopedVariable>,
        usage_scope: &[u32],
        task_scope: &TaskScope,
        vl_selection_fields: &VlSelectionFields,
    ) -> DatasetsColumnUsage {
        let mut usage_visitor =
            DatasetsColumnUsageVisitor::new(datum_var, usage_scope, task_scope, vl_selection_fields);

        // The visitor records usage as a side effect of the traversal.
        self.walk(&mut usage_visitor);

        usage_visitor.dataset_column_usage
    }
}
196 changes: 196 additions & 0 deletions vegafusion-core/src/expression/column_usage.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,196 @@
/*
* VegaFusion
* Copyright (C) 2022 VegaFusion Technologies LLC
*
* This program is distributed under multiple licenses.
* Please consult the license documentation provided alongside
* this program the details of the active license.
*/
use crate::task_graph::graph::ScopedVariable;
use crate::task_graph::scope::TaskScope;
use std::collections::{HashMap, HashSet};

pub type VlSelectionFields = HashMap<ScopedVariable, Vec<String>>;

/// Describes which columns of a dataset are used in a given context.
///
/// Vega specifications are dynamic, so it is not always possible to determine
/// statically which columns of a dataset will be used at runtime. That case is
/// represented by `ColumnUsage::Unknown`. In the context of projection
/// pushdown, `ColumnUsage::Unknown` means that all of the original dataset's
/// columns must be maintained.
#[derive(Clone, Debug, PartialEq)]
pub enum ColumnUsage {
    /// The set of used columns could not be determined.
    Unknown,
    /// The set of used columns is known to be exactly these column names.
    Known(HashSet<String>),
}

impl ColumnUsage {
    /// Create a usage that is `Known` and references no columns.
    pub fn empty() -> ColumnUsage {
        ColumnUsage::Known(HashSet::new())
    }

    /// Return a copy of this usage that additionally includes `column`.
    ///
    /// An `Unknown` usage stays `Unknown`.
    pub fn with_column(&self, column: &str) -> ColumnUsage {
        match self {
            ColumnUsage::Unknown => ColumnUsage::Unknown,
            ColumnUsage::Known(existing) => {
                let mut columns = existing.clone();
                columns.insert(column.to_string());
                ColumnUsage::Known(columns)
            }
        }
    }

    /// Take the union of two `ColumnUsage` instances.
    ///
    /// When both sides are `Known`, the result is `Known` with the union of
    /// their columns. When either side is `Unknown`, the result is `Unknown`.
    pub fn union(&self, other: &ColumnUsage) -> ColumnUsage {
        if let (ColumnUsage::Known(lhs), ColumnUsage::Known(rhs)) = (self, other) {
            let mut merged = lhs.clone();
            merged.extend(rhs.iter().cloned());
            ColumnUsage::Known(merged)
        } else {
            // At least one side is Unknown, so nothing can be said about the union.
            ColumnUsage::Unknown
        }
    }
}

/// A single column name becomes a `Known` usage containing just that column.
impl From<&str> for ColumnUsage {
    fn from(column: &str) -> Self {
        let mut columns = HashSet::with_capacity(1);
        columns.insert(column.to_string());
        Self::Known(columns)
    }
}

/// A slice of borrowed column names becomes a `Known` usage of those columns.
impl From<&[&str]> for ColumnUsage {
    fn from(columns: &[&str]) -> Self {
        let mut names = HashSet::with_capacity(columns.len());
        for name in columns {
            names.insert((*name).to_string());
        }
        Self::Known(names)
    }
}

/// A slice of owned column names becomes a `Known` usage of those columns.
impl From<&[String]> for ColumnUsage {
    fn from(columns: &[String]) -> Self {
        let mut names = HashSet::with_capacity(columns.len());
        names.extend(columns.iter().cloned());
        Self::Known(names)
    }
}

/// Struct that tracks the usage of all columns across a collection of datasets
#[derive(Clone, Debug, PartialEq)]
pub struct DatasetsColumnUsage {
pub usages: HashMap<ScopedVariable, ColumnUsage>,
pub aliases: HashMap<ScopedVariable, ScopedVariable>,
}

impl DatasetsColumnUsage {
pub fn empty() -> Self {
Self {
usages: Default::default(),
aliases: Default::default(),
}
}

pub fn with_column_usage(&self, datum_var: &ScopedVariable, usage: ColumnUsage) -> Self {
let other_column_usage = Self {
usages: vec![(datum_var.clone(), usage)].into_iter().collect(),
aliases: Default::default(),
};
self.union(&other_column_usage)
}

pub fn with_unknown_usage(&self, datum_var: &ScopedVariable) -> Self {
self.with_column_usage(datum_var, ColumnUsage::Unknown)
}

pub fn with_alias(&self, from: ScopedVariable, to: ScopedVariable) -> Self {
let mut aliases = self.aliases.clone();
aliases.insert(from, to);
Self {
usages: self.usages.clone(),
aliases,
}
}

/// Take the union of two DatasetColumnUsage instances.
pub fn union(&self, other: &DatasetsColumnUsage) -> DatasetsColumnUsage {
let self_vars: HashSet<_> = self.usages.keys().cloned().collect();
let other_vars: HashSet<_> = other.usages.keys().cloned().collect();
let union_vars: HashSet<_> = self_vars.union(&other_vars).cloned().collect();

// Union aliases
let mut aliases = self.aliases.clone();
for (key, val) in &other.aliases {
aliases.insert(key.clone(), val.clone());
}

let mut usages: HashMap<ScopedVariable, ColumnUsage> = HashMap::new();
for var in union_vars {
// Check if var is an alias
let var = aliases.get(&var).unwrap_or(&var).clone();

let self_usage = self
.usages
.get(&var)
.cloned()
.unwrap_or_else(ColumnUsage::empty);
let other_usage = other
.usages
.get(&var)
.cloned()
.unwrap_or_else(ColumnUsage::empty);
let combined_usage = self_usage.union(&other_usage);
usages.insert(var, combined_usage);
}

Self { usages, aliases }
}
}

/// Implemented by types that can report their dataset column usage as a
/// `DatasetsColumnUsage`.
pub trait GetDatasetsColumnUsage {
/// Compute the column usage of `self` across datasets.
///
/// NOTE(review): the meaning of the arguments is not established in this
/// file. Presumably `datum_var` is the dataset that `datum` refers to (if
/// any), `usage_scope` is the scope in which the usage occurs, `task_scope`
/// is used to resolve variable references, and `vl_selection_fields` lists
/// the fields of selection datasets — confirm against the implementors.
fn datasets_column_usage(
&self,
datum_var: &Option<ScopedVariable>,
usage_scope: &[u32],
task_scope: &TaskScope,
vl_selection_fields: &VlSelectionFields,
) -> DatasetsColumnUsage;
}

#[cfg(test)]
mod tests {
    use crate::expression::column_usage::ColumnUsage;

    /// Build a `ColumnUsage::Known` from a list of column names.
    fn known(columns: &[&str]) -> ColumnUsage {
        ColumnUsage::from(columns)
    }

    /// Adding columns one at a time accumulates them in a `Known` usage.
    #[test]
    fn test_with_column() {
        let start = known(&["one", "two"]);
        let extended = start.with_column("three").with_column("four");
        assert_eq!(extended, known(&["one", "two", "three", "four"]))
    }

    /// Known ∪ Known merges the column sets, de-duplicating shared names.
    #[test]
    fn test_union_known_known() {
        let lhs = known(&["one", "two"]);
        let rhs = known(&["two", "three", "four"]);
        assert_eq!(lhs.union(&rhs), known(&["one", "two", "three", "four"]))
    }

    /// Known ∪ Unknown is Unknown.
    #[test]
    fn test_union_known_unknown() {
        let lhs = known(&["one", "two"]);
        assert_eq!(lhs.union(&ColumnUsage::Unknown), ColumnUsage::Unknown)
    }

    /// Unknown ∪ Known is Unknown.
    #[test]
    fn test_union_unknown_known() {
        let rhs = known(&["two", "three", "four"]);
        assert_eq!(ColumnUsage::Unknown.union(&rhs), ColumnUsage::Unknown)
    }

    /// Unknown ∪ Unknown is Unknown.
    #[test]
    fn test_union_unknown_unknown() {
        let combined = ColumnUsage::Unknown.union(&ColumnUsage::Unknown);
        assert_eq!(combined, ColumnUsage::Unknown)
    }
}
1 change: 1 addition & 0 deletions vegafusion-core/src/expression/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
* this program the details of the active license.
*/
pub mod ast;
pub mod column_usage;
pub mod lexer;
pub mod ops;
pub mod parser;
Expand Down
Loading