Notebook updates #131

Merged · 16 commits · Mar 31, 2016

Changes from all commits
9 changes: 7 additions & 2 deletions datashader/pipeline.py
@@ -32,14 +32,18 @@ class Pipeline(object):
color_fn : callable, optional
A callable that takes the output of ``transform_fn``, and returns an
``Image`` object. Default is ``interpolate``.
spread_fn : callable, optional
A callable that takes the output of ``color_fn``, and returns another
``Image`` object. Default is ``dynspread``.
"""
def __init__(self, df, glyph, agg=reductions.count(),
transform_fn=identity, color_fn=tf.interpolate):
transform_fn=identity, color_fn=tf.interpolate, spread_fn=tf.dynspread):
self.df = df
self.glyph = glyph
self.agg = agg
self.transform_fn = transform_fn
self.color_fn = color_fn
self.spread_fn = spread_fn

def __call__(self, x_range=None, y_range=None, width=600, height=600):
"""Compute an image from the specified pipeline.
@@ -55,4 +59,5 @@ def __call__(self, x_range=None, y_range=None, width=600, height=600):
canvas = core.Canvas(plot_width=width, plot_height=height,
x_range=x_range, y_range=y_range)
bins = core.bypixel(self.df, canvas, self.glyph, self.agg)
return self.color_fn(self.transform_fn(bins))
img = self.color_fn(self.transform_fn(bins))
return self.spread_fn(img)
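
For context, a minimal sketch of how the new `spread_fn` stage slots in when invoking a pipeline. The dataframe and ranges here are hypothetical, and passing `tf.dynspread` explicitly is redundant since it is now the default; it is shown only to make the stage visible:

import pandas as pd
import datashader.transfer_functions as tf
from datashader.glyphs import Point
from datashader.pipeline import Pipeline

# Hypothetical points to aggregate.
df = pd.DataFrame({'x': [0.1, 0.4, 0.8], 'y': [0.2, 0.5, 0.9]})

# Stages run in order: aggregate -> transform_fn -> color_fn -> spread_fn.
pipeline = Pipeline(df, Point('x', 'y'), spread_fn=tf.dynspread)
img = pipeline(x_range=(0, 1), y_range=(0, 1), width=300, height=300)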
20 changes: 10 additions & 10 deletions datashader/tests/test_transfer_functions.py
@@ -176,9 +176,9 @@ def test_masks():
np.testing.assert_equal(tf._square_mask(0), np.ones((1, 1), dtype='bool'))
# Circle
np.testing.assert_equal(tf._circle_mask(0), np.ones((1, 1), dtype='bool'))
out = np.array([[0, 1, 0],
out = np.array([[1, 1, 1],
[1, 1, 1],
[0, 1, 0]], dtype='bool')
[1, 1, 1]], dtype='bool')
np.testing.assert_equal(tf._circle_mask(1), out)
out = np.array([[0, 0, 1, 1, 1, 0, 0],
[0, 1, 1, 1, 1, 1, 0],
@@ -203,11 +203,11 @@ def test_spread():
img = tf.Image(data, coords=coords, dims=dims)

s = tf.spread(img)
o = np.array([[0xdc00007d, 0xdc009036, 0x7d00007d, 0x00000000, 0x00000000],
[0xdc009036, 0xdc009036, 0x7d00ff00, 0x00000000, 0x00000000],
[0x7d00007d, 0x7d00ff00, 0x00000000, 0x7dff0000, 0x00000000],
o = np.array([[0xed00863b, 0xed00863b, 0xbc00a82a, 0x00000000, 0x00000000],
[0xed00863b, 0xed00863b, 0xbc00a82a, 0x00000000, 0x00000000],
[0xbc00a82a, 0xbc00a82a, 0xbca85600, 0x7dff0000, 0x7dff0000],
[0x00000000, 0x00000000, 0x7dff0000, 0x7dff0000, 0x7dff0000],
[0x00000000, 0x00000000, 0x00000000, 0x7dff0000, 0x00000000]])
[0x00000000, 0x00000000, 0x7dff0000, 0x7dff0000, 0x7dff0000]])
np.testing.assert_equal(s.data, o)
assert (s.x_axis == img.x_axis).all()
assert (s.y_axis == img.y_axis).all()
@@ -230,11 +230,11 @@ def test_spread():
np.testing.assert_equal(s.data, o)

s = tf.spread(img, how='add')
o = np.array([[0xff0000b7, 0xff007d7a, 0x7d00007d, 0x00000000, 0x00000000],
[0xff007d7a, 0xff007d7a, 0x7d00ff00, 0x00000000, 0x00000000],
[0x7d00007d, 0x7d00ff00, 0x00000000, 0x7dff0000, 0x00000000],
o = np.array([[0xff007db7, 0xff007db7, 0xfa007f3e, 0x00000000, 0x00000000],
[0xff007db7, 0xff007db7, 0xfa007f3e, 0x00000000, 0x00000000],
[0xfa007f3e, 0xfa007f3e, 0xfa7f7f00, 0x7dff0000, 0x7dff0000],
[0x00000000, 0x00000000, 0x7dff0000, 0x7dff0000, 0x7dff0000],
[0x00000000, 0x00000000, 0x00000000, 0x7dff0000, 0x00000000]])
[0x00000000, 0x00000000, 0x7dff0000, 0x7dff0000, 0x7dff0000]])
np.testing.assert_equal(s.data, o)

mask = np.array([[1, 0, 1],
18 changes: 10 additions & 8 deletions datashader/transfer_functions.py
@@ -2,7 +2,8 @@

from io import BytesIO
import warnings

import collections

import numpy as np
import numba as nb
import toolz as tz
@@ -11,7 +12,7 @@


from .colors import rgb
from .composite import composite_op_lookup, source
from .composite import composite_op_lookup, over
from .utils import ngjit


@@ -61,7 +62,7 @@ def stack(*imgs, **kwargs):
return Image(out, coords=imgs[0].coords, dims=imgs[0].dims)


def eq_hist(data, mask=None, nbins=256):
def eq_hist(data, mask=None, nbins=256*256):
"""Return a numpy array after histogram equalization.

For use in `interpolate`.
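
As an aside, the core of histogram equalization can be sketched in a few lines of numpy; the library's own `eq_hist` additionally handles masks and discrete integer data, and the larger `nbins=256*256` default simply reduces quantization error for data with a large dynamic range. A rough sketch of the idea:

import numpy as np

def eq_hist_sketch(data, nbins=256 * 256):
    """Rescale data so equal pixel counts map to equal output ranges."""
    hist, edges = np.histogram(data.ravel(), bins=nbins)
    cdf = hist.cumsum().astype('float64')
    cdf /= cdf[-1]                              # normalize to [0, 1]
    centers = (edges[:-1] + edges[1:]) / 2      # bin midpoints
    return np.interp(data.ravel(), centers, cdf).reshape(data.shape)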
@@ -114,7 +115,7 @@ def _normalize_interpolate_how(how):
raise ValueError("Unknown interpolation method: {0}".format(how))


def interpolate(agg, low=None, high=None, cmap=None, how='cbrt'):
def interpolate(agg, low=None, high=None, cmap=None, how='eq_hist'):
"""Convert a 2D DataArray to an image.

Data is converted to an image either by interpolating between a `low` and
@@ -161,6 +162,8 @@ def interpolate(agg, low=None, high=None, cmap=None, how='cbrt'):
offset = agg.data[agg.data > 0].min()
data = how(agg.data - offset, mask.data)
span = [np.nanmin(data), np.nanmax(data)]
if isinstance(cmap, collections.Iterator):
cmap = list(cmap)
Collaborator:

if isinstance(cmap, collections.Iterator)

Member Author:

That's better, thanks. Done.

if isinstance(cmap, list):
rspan, gspan, bspan = np.array(list(zip(*map(rgb, cmap))))
span = np.linspace(span[0], span[1], len(cmap))
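
A quick usage sketch of what the `Iterator` check enables; the aggregate here is hypothetical:

import numpy as np
import xarray as xr
import datashader.transfer_functions as tf

# Hypothetical 2D aggregate of counts.
agg = xr.DataArray(np.random.randint(0, 100, (5, 5)), dims=['y', 'x'])

# A color iterator (e.g. from a palette generator) is now accepted;
# interpolate() materializes it into a list before use.
img = tf.interpolate(agg, cmap=iter(['lightblue', 'darkblue']))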
@@ -180,7 +183,7 @@ def interpolate(agg, low=None, high=None, cmap=None, how='cbrt'):
return Image(img, coords=agg.coords, dims=agg.dims)


def colorize(agg, color_key, how='cbrt', min_alpha=20):
def colorize(agg, color_key, how='eq_hist', min_alpha=20):
"""Color a CategoricalAggregate by field.

Parameters
@@ -245,7 +248,7 @@ def set_background(img, color=None):
if color is None:
return img
background = np.uint8(rgb(color) + (255,)).view('uint32')[0]
data = source(img.data, background)
data = over(img.data, background)
Collaborator:

Why did you change this? This doesn't set the background, it overlays an image over a background of that color. For images that use alpha to indicate magnitude (output of colorize), this will make all set pixels have full alpha. I'm unsure if this is desired here.

Member Author:

That's correct, and intended. The problem was that previously, if one tried to set the background to black, it only changed the fully transparent pixels, which had very strange results -- if you take such an image and view it in e.g. Preview on a Mac, it's all garbled: the fully transparent pixels are black, but Preview's default gray background shines through (to a greater or lesser extent, depending on alpha) all the others. So I don't think that set_background was doing something useful before; changing only the fully transparent pixels while removing their transparency yet leaving other pixels transparent doesn't result in a usable image in any scenario I can think of, and certainly not in the use cases I had in mind for set_background.

Member Author:

On second look, I think I see what you're saying here, but I don't think it's true. I.e., you're worried that the alpha channel will simply be discarded from the top image? The code isn't discarding the src alpha, as far as I can see; it's using it to control how the src gets mixed with the background, which is what we want here. Try comparing census.ipynb with set_background using over and source for comparison, and you should see what I mean...

return Image(data, coords=img.coords, dims=img.dims)
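
For reference on the discussion above, `over` is the standard Porter-Duff compositing operator. A rough numpy sketch of the math on straight-alpha floats (datashader's actual implementation works on packed 32-bit RGBA integers and is numba-compiled):

import numpy as np

def over_sketch(src, dst):
    """Porter-Duff 'over': src composited on top of dst.
    Both arrays are (..., 4) RGBA with straight alpha in [0, 1]."""
    src_rgb, src_a = src[..., :3], src[..., 3:4]
    dst_rgb, dst_a = dst[..., :3], dst[..., 3:4]
    out_a = src_a + dst_a * (1 - src_a)
    safe = np.where(out_a == 0, 1, out_a)   # avoid 0/0 for empty pixels
    out_rgb = (src_rgb * src_a + dst_rgb * dst_a * (1 - src_a)) / safe
    return np.concatenate([out_rgb, out_a], axis=-1)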


@@ -326,8 +329,7 @@ def _square_mask(px):
def _circle_mask(r):
"""Produce a circular mask with a diameter of ``2 * r + 1``"""
x = np.arange(-r, r + 1, dtype='i4')
bound = r + 0.5 if r > 1 else r
return np.where(np.sqrt(x**2 + x[:, None]**2) <= bound, True, False)
return np.where(np.sqrt(x**2 + x[:, None]**2) <= r+0.5, True, False)
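
A quick check of the new behavior, matching the updated `test_masks` expectation above: with the `r + 0.5` bound applied uniformly, the radius-1 mask is now a full 3x3 block, since the corner distance sqrt(2) ≈ 1.414 is within 1.5:

import numpy as np

x = np.arange(-1, 2, dtype='i4')             # offsets -1, 0, 1
mask = np.sqrt(x**2 + x[:, None]**2) <= 1.5  # r + 0.5 with r = 1
print(mask.astype(int))
# [[1 1 1]
#  [1 1 1]
#  [1 1 1]]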


_mask_lookup = {'square': _square_mask,
140 changes: 123 additions & 17 deletions examples/census.ipynb
@@ -59,7 +59,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"The output of `.tail()` shows that there are more than 300 million datapoints (one per person), each with a location in Web Mercator format, and that the race for each datapoint has been encoded as a single character (where 'w' is white, 'b' is black, 'a' is Asian, 'h' is Hispanic, and 'o' is other (typically Native American).\n",
"The output of `.tail()` shows that there are more than 300 million datapoints (one per person), each with a location in Web Mercator format, and that the race for each datapoint has been encoded as a single character (where 'w' is white, 'b' is black, 'a' is Asian, 'h' is Hispanic, and 'o' is other (typically Native American)).\n",
"\n",
"Let's define some geographic ranges to look at later, and also a default plot size. Feel free to increase `plot_width` to 2000 or more if you have a very large monitor or want to save files to disk, which shouldn't *greatly* affect the processing time or memory requirements. "
]
@@ -72,10 +72,17 @@
},
"outputs": [],
"source": [
"USA = ((-13884029, -7453304), (2698291, 6455972))\n",
"LakeMichigan = ((-10206131, -9348029), (4975642, 5477059))\n",
"Chicago = (( -9828281, -9717659), (5096658, 5161298))\n",
"Chinatown = (( -9759210, -9754583), (5137122, 5139825))\n",
"USA = ((-13884029, -7453304), (2698291, 6455972))\n",
"LakeMichigan = ((-10206131, -9348029), (4975642, 5477059))\n",
"Chicago = (( -9828281, -9717659), (5096658, 5161298))\n",
"Chinatown = (( -9759210, -9754583), (5137122, 5139825))\n",
"\n",
"NewYorkCity = (( -8280656, -8175066), (4940514, 4998954))\n",
"LosAngeles = ((-13195052, -13114944), (3979242, 4023720))\n",
"Houston = ((-10692703, -10539441), (3432521, 3517616))\n",
"Austin = ((-10898752, -10855820), (3525750, 3550837))\n",
"NewOrleans = ((-10059963, -10006348), (3480787, 3510555))\n",
"Atlanta = (( -9448349, -9354773), (3955797, 4007753))\n",
"\n",
"x_range,y_range = USA\n",
"\n",
@@ -101,9 +108,6 @@
"black_background = True\n",
"\n",
"from IPython.core.display import HTML, display\n",
"if black_background:\n",
" display(HTML(\"<style>.output_result { background-color:black !important; color:white }</style>\"))\n",
"\n",
"display(HTML(\"<style>.container { width:100% !important; }</style>\"))"
]
},
@@ -124,6 +128,8 @@
"source": [
"def export(img,filename,fmt=\".png\",_return=True):\n",
" \"\"\"Given a datashader Image object, saves it to a disk file in the requested format\"\"\"\n",
" if black_background: \n",
" img=tf.set_background(img,\"black\")\n",
" img.to_pil().save(filename+fmt)\n",
" return img if _return else None\n",
"\n",
@@ -236,9 +242,9 @@
"source": [
"Suddenly, we can see an amazing amount of structure! There are clearly meaningful patterns at nearly every location, ranging from the geographic variations in the mountainous West, to the densely spaced urban centers in New England, and the many towns stretched out along roadsides in the midwest (especially those leading to Denver, the hot spot towards the right of the Rocky Mountains). \n",
"\n",
"Clearly, we can now see much more of what's going on in this dataset, thanks to the logarithmic mapping. Yet the choice of `'log'` was purely arbitrary, and one could easily imagine that other nonlinear functions would show other interesting patterns. Instead of blindly searching through the space of all such functions, we can step back and notice that the main effect of the log transform has been to reveal *local* patterns at all population densities -- urban areas show up clearly even if they are just slightly more dense than their immediate, rural neighbors, yet they still show up as denser areas in more populated regions.\n",
"Clearly, we can now see much more of what's going on in this dataset, thanks to the logarithmic mapping. Yet the choice of `'log'` was purely arbitrary, and one could easily imagine that other nonlinear functions would show other interesting patterns. Instead of blindly searching through the space of all such functions, we can step back and notice that the main effect of the log transform has been to reveal *local* patterns at all population densities -- small towns show up clearly even if they are just slightly more dense than their immediate, rural neighbors, yet large cities with high population density also show up well against the surrounding suburban regions, even if those regions are more dense than the small towns on an absolute scale.\n",
"\n",
"With this in mind, let's try the image-processing technique called histogram equalization. I.e., given a set of raw counts, map these into a range for display such that every available color on the screen represents about the same number of samples in the original dataset. The result is similar to that from the log transform, but is now non-parametric -- it will equalize any linearly or nonlinearly distributed integer data, regardless of the distribution:"
"With this idea of showing relative differences across a large range of data values in mind, let's try the image-processing technique called histogram equalization. I.e., given a set of raw counts, map these into a range for display such that every available color on the screen represents about the same number of samples in the original dataset. The result is similar to that from the log transform, but is now non-parametric -- it will equalize any linearly or nonlinearly distributed integer data, regardless of the distribution:"
]
},
{
Expand Down Expand Up @@ -271,7 +277,7 @@
},
"outputs": [],
"source": [
"print(Hot)\n",
"print(cm(Hot,0.2))\n",
"export(tf.interpolate(agg, cmap = cm(Hot,0.2), how='eq_hist'),\"census_ds_hot_eq_hist\")"
]
},
@@ -309,7 +315,7 @@
},
"outputs": [],
"source": [
"export(tf.interpolate(agg, cmap=cm(viridis), how='eq_hist'),\"census_viridis_eq_hist.png\")"
"export(tf.interpolate(agg, cmap=cm(viridis), how='eq_hist'),\"census_viridis_eq_hist\")"
]
},
{
@@ -443,7 +449,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"Eventually, we can zoom in far enough to see individual datapoints, which we make more visible here using the `tf.spread` function to enlarge each point to cover multiple pixels. Here we can see that the Chinatown region of Chicago has, as expected, very high numbers of Asian residents, and that other nearby regions (separated by features like roads and highways) have other races, varying in how uniformly segregated they are:"
"Eventually, we can zoom in far enough to see individual datapoints. Here we can see that the Chinatown region of Chicago has, as expected, very high numbers of Asian residents, and that other nearby regions (separated by features like roads and highways) have other races, varying in how uniformly segregated they are:"
]
},
{
@@ -457,13 +463,112 @@
"export(tf.spread(create_image(*Chinatown),px=plot_width/400),\"Zoom 3 - Chinatown\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Note that we've used the `tf.spread` function to enlarge each point to cover multiple pixels so that each point is clearly visible. Instead of the default circular spreading, you could choose `shape='square'` if you prefer, or any mask shape, e.g.:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"mask = np.array([[1, 1, 1, 1, 1],\n",
" [1, 0, 0, 0, 1],\n",
" [1, 0, 0, 0, 1],\n",
" [1, 0, 0, 0, 1],\n",
" [1, 1, 1, 1, 1]])\n",
"\n",
"export(tf.spread(create_image(*Chinatown), mask=mask),\"Chinatown outlines\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Other cities, for comparison\n",
"\n",
"Different cities have very different racial makeup, but they all appear highly segregated:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"export(create_image(*NewYorkCity),\"NYC\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"export(create_image(*LosAngeles),\"LosAngeles\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"export(create_image(*Houston),\"Houston\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"export(create_image(*Atlanta),\"Atlanta\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"export(create_image(*NewOrleans),\"NewOrleans\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"export(create_image(*Austin),\"Austin\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Analyzing racial data through visualization\n",
"\n",
"Now that we have categorical data, we can break it down and ask specific questions. For instance, if we switch back to the full USA and then select only the black population, we can see that blacks predominantly reside in urban areas except in the South and the East Coast:"
"In addition to simply visualizing categorical data, we can break it down and ask specific questions. For instance, if we switch back to the full USA and then select only the black population, we can see that blacks predominantly reside in urban areas except in the South and the East Coast:"
]
},
{
@@ -477,7 +582,7 @@
"cvs = ds.Canvas(plot_width=plot_width, plot_height=plot_height)\n",
"agg = cvs.points(df, 'meterswest', 'metersnorth', ds.count_cat('race'))\n",
"\n",
"tf.interpolate(agg.sel(race='b'), cmap=cm(Greys9,0.25), how='eq_hist')"
"export(tf.interpolate(agg.sel(race='b'), cmap=cm(Greys9,0.25), how='eq_hist'),\"USA blacks\")"
]
},
{
@@ -497,7 +602,8 @@
},
"outputs": [],
"source": [
"tf.colorize(agg.where((agg.sel(race=['w', 'b', 'a', 'h']) > 0).all(dim='race')).fillna(0), color_key, how='eq_hist')"
"agg2 = agg.where((agg.sel(race=['w', 'b', 'a', 'h']) > 0).all(dim='race')).fillna(0)\n",
"export(tf.colorize(agg2, color_key, how='eq_hist'),\"USA all\")"
]
},
{
@@ -517,7 +623,7 @@
},
"outputs": [],
"source": [
"tf.colorize(agg.where(agg.sel(race='w') < agg.sel(race='b')).fillna(0), color_key, how='eq_hist')"
"export(tf.colorize(agg.where(agg.sel(race='w') < agg.sel(race='b')).fillna(0), color_key, how='eq_hist'),\"more_blacks\")"
]
},
{