Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add VegaFusion data transformer with mime renderer, save, and to_dict/to_json integration #3094

Merged
merged 16 commits into from
Jul 8, 2023
Merged
189 changes: 189 additions & 0 deletions altair/utils/_vegafusion_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
from typing import TypedDict, Union, Dict, Set, MutableMapping, Final

from toolz import curried
import uuid
from weakref import WeakValueDictionary

from altair.utils.core import _DataFrameLike
from altair.utils.data import _DataType, _ToValuesReturnType, MaxRowsError
from altair.vegalite.data import default_data_transformer

# Temporary storage for dataframes that have been extracted
# from charts by the vegafusion data transformer. Use a WeakValueDictionary
# rather than a dict so that the Python interpreter is free to garbage
# collect the stored DataFrames.
# Keys are the generated table names (without the URL prefix); entries are
# removed again when get_inline_tables() pops them.
extracted_inline_tables: MutableMapping[str, _DataFrameLike] = WeakValueDictionary()

# Special URL prefix that VegaFusion uses to denote that a
# dataset in a Vega spec corresponds to an entry in the `inline_datasets`
# kwarg of vf.runtime.pre_transform_spec().
VEGAFUSION_PREFIX: Final = "table://"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe rename this to vf_source://? You also prefix a DataFrame as table_<x> here, so a more general prefix would leave room for supporting arrays/tensors/matrices once they are introduced.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This prefix is built into VegaFusion. But there is an alternative supported prefix that I can switch to: vegafusion+dataset://.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

renamed in 3a5517b



class _ToVegaFusionReturnUrlDict(TypedDict):
    """Shape of the dict returned when a dataframe is extracted by name."""

    # URL string identifying the extracted dataset
    url: str


@curried.curry
def vegafusion_data_transformer(
    data: _DataType, max_rows: int = 100000
) -> Union[_ToVegaFusionReturnUrlDict, _ToValuesReturnType]:
    """Data transformer that extracts dataframes for VegaFusion.

    Objects exposing ``__dataframe__`` (and not ``__geo_interface__``) are
    stored in ``extracted_inline_tables`` under a freshly generated name and
    replaced by a URL reference to that name. Everything else is handed to
    the default data transformer.

    Parameters
    ----------
    data
        The chart data to transform.
    max_rows : int
        Accepted as part of the transformer signature; it is not used
        inside this function body.

    Returns
    -------
    dict
        Either ``{"url": VEGAFUSION_PREFIX + <table name>}`` for extracted
        dataframes, or whatever the default data transformer returns.
    """
    # Geo interface objects (e.g. a geopandas GeoDataFrame) and any data type
    # we don't recognize are both delegated to the default transformer.
    if hasattr(data, "__geo_interface__") or not hasattr(data, "__dataframe__"):
        return default_data_transformer(data)

    # Unique name for the extracted table, with dashes swapped for underscores
    unique_suffix = str(uuid.uuid4()).replace("-", "_")
    name = "table_" + unique_suffix
    extracted_inline_tables[name] = data
    return {"url": VEGAFUSION_PREFIX + name}


def get_inline_table_names(vega_spec: dict) -> Set[str]:
    """Get a set of the inline datasets names in the provided Vega spec

    Inline datasets are encoded as URLs that start with the table://
    prefix. Dataset entries whose ``url`` is not a string (for example a
    signal reference object) are ignored rather than raising.

    Parameters
    ----------
    vega_spec: dict
        A Vega specification dict

    Returns
    -------
    set of str
        Set of the names of the inline datasets that are referenced
        in the specification.

    Examples
    --------
    >>> spec = {
    ...     "data": [
    ...         {
    ...             "name": "foo",
    ...             "url": "https://path/to/file.csv"
    ...         },
    ...         {
    ...             "name": "bar",
    ...             "url": "table://inline_dataset_123"
    ...         }
    ...     ]
    ... }
    >>> get_inline_table_names(spec)
    {'inline_dataset_123'}
    """
    table_names = set()

    # Process datasets
    for data in vega_spec.get("data", []):
        url = data.get("url", "")
        # Guard against non-string urls, which have no ``startswith`` method
        if isinstance(url, str) and url.startswith(VEGAFUSION_PREFIX):
            table_names.add(url[len(VEGAFUSION_PREFIX) :])

    # Recursively process child marks, which may have their own datasets
    for mark in vega_spec.get("marks", []):
        table_names.update(get_inline_table_names(mark))

    return table_names


def get_inline_tables(vega_spec: dict) -> Dict[str, _DataFrameLike]:
    """Collect the extracted dataframes that a Vega spec refers to.

    Note: Only call this on a Vega spec that corresponds to a chart
    processed by the vegafusion_data_transformer. Each matching dataframe
    is removed from ``extracted_inline_tables`` as it is returned, so a
    given spec can only be resolved once.

    Parameters
    ----------
    vega_spec: dict
        A Vega specification dict

    Returns
    -------
    dict from str to dataframe
        dict from inline dataset name to dataframe object
    """
    # Sentinel distinguishing "no stored table" from any stored value
    not_stored = object()

    found = {}
    for name in get_inline_table_names(vega_spec):
        table = extracted_inline_tables.pop(name, not_stored)
        if table is not_stored:
            # named dataset that was provided by the user
            continue
        found[name] = table
    return found


def compile_with_vegafusion(vegalite_spec: dict) -> dict:
"""Compile a Vega-Lite spec to Vega and pre-transform with VegaFusion

Note: This function should only be called on a Vega-Lite spec
that was generated with the "vegafusion" data transformer enabled.
In particular, this spec may contain references to extracted datasets
using table:// prefixed URLs.

Parameters
----------
vegalite_spec: dict
A Vega-Lite spec that was generated from an Altair chart with
the "vegafusion" data transformer enabled

Returns
-------
dict
A Vega spec that has been pre-transformed by VegaFusion

Raises
------
ImportError
If the vegafusion package cannot be imported
MaxRowsError
If VegaFusion reports a "RowLimitExceeded" warning
"""
from altair import vegalite_compilers, data_transformers

try:
import vegafusion as vf
except ImportError as e:
raise ImportError(
'The "vegafusion" data transformer requires the vegafusion-python-embed\n'
"and vegafusion packages. These can be installed with pip using:\n"
' pip install "vegafusion[embed]"\n'
"Or with conda using:\n"
" conda install -c conda-forge vegafusion-python-embed vegafusion"
) from e

# Compile Vega-Lite spec to Vega
vega_spec = vegalite_compilers.get()(vegalite_spec)

# Retrieve dict of inline tables referenced by the spec
inline_tables = get_inline_tables(vega_spec)

# Pre-evaluate transforms in vega spec with vegafusion
row_limit = data_transformers.options.get("max_rows", None)
transformed_vega_spec, warnings = vf.runtime.pre_transform_spec(
vega_spec,
vf.get_local_tz(),
inline_datasets=inline_tables,
row_limit=row_limit,
)

# Check for row limit warning and convert to MaxRowsError
for warning in warnings:
if warning.get("type") == "RowLimitExceeded":
raise MaxRowsError(
Copy link
Contributor

@binste binste Jul 4, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I also never experienced any issues with larger datasets for single charts. However, if you do some exploratory data analysis with many charts in a Jupyter notebook, that notebook can get rather slow after a while if you plot many larger charts. I still like the idea though of increasing it and 100k sounds as good as any as it would be difficult to benchmark and figure out a good compromise. Just wanted to mention this.

"The number of dataset rows after filtering and aggregation exceeds\n"
f"the current limit of {row_limit}. Try adding an aggregation to reduce\n"
"the size of the dataset that must be loaded into the browser. Or, disable\n"
"the limit by calling alt.data_transformers.disable_max_rows(). Note that\n"
"disabling this limit may cause the browser to freeze or crash."
)

return transformed_vega_spec


def using_vegafusion() -> bool:
    """Return True when the active data transformer is "vegafusion"."""
    # Imported locally, as in the rest of this module's helper functions
    from altair.vegalite.v5.data import data_transformers

    active_name = data_transformers.active
    return active_name == "vegafusion"
10 changes: 10 additions & 0 deletions altair/utils/display.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from typing import Callable, Dict, Optional, Tuple, Any, Union
import uuid

from ._vegafusion_data import compile_with_vegafusion, using_vegafusion
from .plugin_registry import PluginRegistry, PluginEnabler
from .mimebundle import spec_to_mimebundle
from .schemapi import validate_jsonschema
Expand Down Expand Up @@ -161,10 +162,19 @@ def default_renderer_base(
This renderer works with modern frontends (JupyterLab, nteract) that know
how to render the custom VegaLite MIME type listed above.
"""
from altair.vegalite.v5.display import VEGA_MIME_TYPE, VEGALITE_MIME_TYPE
assert isinstance(spec, dict)
bundle: Dict[str, Union[str, dict]] = {}
metadata: Dict[str, Dict[str, Any]] = {}

if using_vegafusion():
spec = compile_with_vegafusion(spec)

# Swap mimetype from Vega-Lite to Vega.
# If mimetype was JSON, leave it alone
if mime_type == VEGALITE_MIME_TYPE:
mime_type = VEGA_MIME_TYPE

bundle[mime_type] = spec
bundle["text/plain"] = str_repr
if options:
Expand Down
33 changes: 25 additions & 8 deletions altair/utils/mimebundle.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,14 @@ def spec_to_mimebundle(
----
The png, svg, pdf, and vega outputs require the altair_saver package
"""
from altair.utils.display import compile_with_vegafusion, using_vegafusion
if mode != "vega-lite":
raise ValueError("mode must be 'vega-lite'")

if using_vegafusion():
spec = compile_with_vegafusion(spec)
mode = "vega"

if format in ["png", "svg", "pdf", "vega"]:
return _spec_to_mimebundle_with_engine(
spec, format, mode, engine=engine, **kwargs
Expand Down Expand Up @@ -82,7 +87,7 @@ def _spec_to_mimebundle_with_engine(spec, format, mode, **kwargs):
a dictionary representing a vega-lite plot spec
format : string {'png', 'svg', 'pdf', 'vega'}
the format of the mimebundle to be returned
mode : string {'vega-lite'}
mode : string {'vega-lite', 'vega'}
The rendering mode.
engine: string {'vl-convert', 'altair_saver'}
the conversion engine to use
Expand All @@ -102,17 +107,29 @@ def _spec_to_mimebundle_with_engine(spec, format, mode, **kwargs):
# from SCHEMA_VERSION (of the form 'v5.2.0')
vl_version = "_".join(SCHEMA_VERSION.split(".")[:2])
if format == "vega":
vg = vlc.vegalite_to_vega(spec, vl_version=vl_version)
if mode == "vega":
vg = spec
else:
vg = vlc.vegalite_to_vega(spec, vl_version=vl_version)
return {"application/vnd.vega.v5+json": vg}
elif format == "svg":
svg = vlc.vegalite_to_svg(spec, vl_version=vl_version)
if mode == "vega":
svg = vlc.vega_to_svg(spec)
else:
svg = vlc.vegalite_to_svg(spec, vl_version=vl_version)
return {"image/svg+xml": svg}
elif format == "png":
png = vlc.vegalite_to_png(
spec,
vl_version=vl_version,
scale=kwargs.get("scale_factor", 1.0),
)
if mode == "vega":
png = vlc.vega_to_png(
spec,
scale=kwargs.get("scale_factor", 1.0),
)
else:
png = vlc.vegalite_to_png(
spec,
vl_version=vl_version,
scale=kwargs.get("scale_factor", 1.0),
)
return {"image/png": png}
else:
# This should be validated above
Expand Down
27 changes: 19 additions & 8 deletions altair/utils/save.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@ def save(
**kwargs :
additional kwargs passed to spec_to_mimebundle.
"""
from altair.utils._vegafusion_data import using_vegafusion
if json_kwds is None:
json_kwds = {}

Expand All @@ -122,15 +123,10 @@ def save(

format = set_inspect_format_argument(format, fp, inline)

# Temporarily turn off any data transformers so that all data is inlined
# when calling chart.to_dict. This is relevant for vl-convert which cannot access
# local json files which could be created by a json data transformer. Furthermore,
# we don't exit the with statement until this function completed due to the issue
# described at https://github.com/vega/vl-convert/issues/31
with data_transformers.enable("default"), data_transformers.disable_max_rows():
def perform_save():
spec = chart.to_dict()

mode = set_inspect_mode_argument(mode, embed_options, spec, vegalite_version)
inner_mode = set_inspect_mode_argument(mode, embed_options, spec, vegalite_version)

if format == "json":
json_spec = json.dumps(spec, **json_kwds)
Expand All @@ -141,7 +137,7 @@ def save(
mimebundle = spec_to_mimebundle(
spec=spec,
format=format,
mode=mode,
mode=inner_mode,
vega_version=vega_version,
vegalite_version=vegalite_version,
vegaembed_version=vegaembed_version,
Expand Down Expand Up @@ -174,3 +170,18 @@ def save(
)
else:
raise ValueError("Unsupported format: '{}'".format(format))

if using_vegafusion():
# When the vegafusion data transformer is enabled, transforms will be
# evaluated during save and the resulting data will be included in the
# vega specification that is saved.
with data_transformers.disable_max_rows():
perform_save()
else:
# Temporarily turn off any data transformers so that all data is inlined
# when calling chart.to_dict. This is relevant for vl-convert which cannot access
# local json files which could be created by a json data transformer. Furthermore,
# we don't exit the with statement until this function completed due to the issue
# described at https://github.com/vega/vl-convert/issues/31
with data_transformers.enable("default"), data_transformers.disable_max_rows():
perform_save()
2 changes: 1 addition & 1 deletion altair/vegalite/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ class DataTransformerRegistry(_DataTransformerRegistry):
def disable_max_rows(self) -> PluginEnabler:
"""Disable the MaxRowsError."""
options = self.options
if self.active == "default":
if self.active in ("default", "vegafusion"):
options = options.copy()
options["max_rows"] = None
return self.enable(**options)
Expand Down
4 changes: 4 additions & 0 deletions altair/vegalite/v5/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
DataTransformerRegistry,
)

from ...utils._vegafusion_data import vegafusion_data_transformer

if sys.version_info >= (3, 8):
from typing import Final
else:
Expand All @@ -31,6 +33,7 @@
data_transformers.register("default", default_data_transformer)
data_transformers.register("json", to_json)
data_transformers.register("csv", to_csv)
data_transformers.register("vegafusion", vegafusion_data_transformer)
data_transformers.enable("default")


Expand All @@ -45,4 +48,5 @@
"to_json",
"to_values",
"data_transformers",
"vegafusion_data_transformer",
)
3 changes: 3 additions & 0 deletions altair/vegalite/v5/display.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,9 @@
# The MIME type for Vega-Lite 5.x releases.
VEGALITE_MIME_TYPE: Final = "application/vnd.vegalite.v5+json"

# The MIME type for Vega 5.x releases.
VEGA_MIME_TYPE: Final = "application/vnd.vega.v5+json"

# The entry point group that can be used by other packages to declare other
# renderers that will be auto-detected. Explicit registration is also
# allowed by the PluginRegistery API.
Expand Down