From 920fd424eb1a8f94a587f2b1112bd20e5066ef53 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Fri, 31 Jan 2014 13:13:02 -0800 Subject: [PATCH 01/45] Tweaks to Variable and Dataset repr --- src/scidata/data.py | 26 ++++++++++++++++++-------- src/scidata/variable.py | 5 +++-- test/test_data.py | 9 +++++++++ test/test_variable.py | 7 ++++++- 4 files changed, 36 insertions(+), 11 deletions(-) diff --git a/src/scidata/data.py b/src/scidata/data.py index 7347ee321cb..70504c9c60c 100644 --- a/src/scidata/data.py +++ b/src/scidata/data.py @@ -211,6 +211,9 @@ def __delitem__(self, key): # does deleting variables make sense for all backends? raise NotImplementedError + # mutable objects should not be hashable + __hash__ = None + def __eq__(self, other): try: # some stores (e.g., scipy) do not seem to preserve order, so don't @@ -302,13 +305,14 @@ def __str__(self): conventions.pretty_print(val, 30)) for att, val in self.attributes.iteritems()]) # create the actual summary - return '\n'.join(summary) + return '\n'.join(summary).replace('\t', ' ' * 4) def __repr__(self): - dim_summary = ', '.join('%s: %s' % (k, v) for k, v - in self.dimensions.iteritems()) - vars_summary = ' '.join(map(str, self.noncoordinates)) - return '' % (dim_summary, vars_summary) + dim_summary = ', '.join('%s%s: %s' % ('@' if k in self else '', k, v) + for k, v in self.dimensions.iteritems()) + return '' % (type(self).__name__, + dim_summary, + ' '.join(self.noncoordinates)) def create_dimension(self, name, length): """Adds a dimension with name dim and length to the object @@ -365,6 +369,9 @@ def create_coordinate(self, name, data, attributes=None): for the common case when the variable is a 1-dimensional coordinate variable with the same name as the dimension. + If the dimension already exists, this function proceeds unless there is + already a corresponding variable or if the lengths disagree. + Parameters ---------- name : string @@ -386,12 +393,15 @@ def create_coordinate(self, name, data, attributes=None): # We need to be cleanly roll back the effects of # create_dimension if create_variable fails, otherwise we will # end up in a partial state. 
- if name in self.dimensions: - raise ValueError("dimension named '%s' already exists" % name) + if name in self.coordinates: + raise ValueError("coordinate named '%s' already exists" % name) var = variable.Variable((name,), np.asarray(data), attributes) if var.ndim != 1: raise ValueError("coordinate data must be 1-dimensional (vector)") - self._unchecked_create_dimension(name, var.size) + if name not in self.dimensions: + self._unchecked_create_dimension(name, var.size) + elif self.dimensions[name] != var.size: + raise ValueError('dimension already exists with different length') return self._unchecked_add_variable(name, var) def add_variable(self, name, var): diff --git a/src/scidata/variable.py b/src/scidata/variable.py index 40bb36fa263..26cb5dfa1d9 100644 --- a/src/scidata/variable.py +++ b/src/scidata/variable.py @@ -251,12 +251,13 @@ def __str__(self): conventions.pretty_print(val, 30)) for att, val in self.attributes.iteritems()]) # create the actual summary - return '\n'.join(summary) + return '\n'.join(summary).replace('\t', ' ' * 4) def __repr__(self): dim_summary = ', '.join('%s: %s' % (k, v) for k, v in zip(self.dimensions, self.shape)) - return '' % (dim_summary, self.dtype) + return '' % (type(self).__name__, + dim_summary, self.dtype) def views(self, slicers): """Return a new Variable object whose contents are a view of the object diff --git a/test/test_data.py b/test/test_data.py index 22b43fde163..282299e49fd 100644 --- a/test/test_data.py +++ b/test/test_data.py @@ -38,6 +38,11 @@ class DataTest(TestCase): def get_store(self): return None + def test_repr(self): + data = create_test_data(self.get_store()) + self.assertEqual('', repr(data)) + def test_iterator(self): data = create_test_data(self.get_store()) # iterate over the first dim @@ -362,6 +367,10 @@ def get_store(self): fobj = StringIO() return backends.ScipyDataStore(fobj, 'w') + def test_repr(self): + # scipy.io.netcdf does not keep track of dimension order :( + pass + class StoreTest(TestCase): def test_stored_to_consistency(self): diff --git a/test/test_variable.py b/test/test_variable.py index 3282ddc3679..11f48c0ffb4 100644 --- a/test/test_variable.py +++ b/test/test_variable.py @@ -8,7 +8,7 @@ class TestVariable(TestCase): def setUp(self): - self.d = np.random.random((10, 3)) + self.d = np.random.random((10, 3)).astype(np.float64) def test_data(self): v = Variable(['time', 'x'], self.d) @@ -39,6 +39,11 @@ def test_properties(self): self.assertEqual(len(v), 10) self.assertEqual(v.attributes, {'foo': u'bar'}) + def test_repr(self): + v = Variable(['time', 'x'], self.d) + self.assertEqual('', + repr(v)) + def test_items(self): v = Variable(['time', 'x'], self.d) self.assertVarEqual(v, v[:]) From e3304389f7f0bbbbee8d56ccac4e0dc787b6573e Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Sat, 1 Feb 2014 17:43:42 -0800 Subject: [PATCH 02/45] Add DataView and indexing --- src/scidata/__init__.py | 1 + src/scidata/backends.py | 17 +-- src/scidata/common.py | 55 ++++++++++ src/scidata/data.py | 230 ++++++++++++++++++++++++++++++++-------- src/scidata/dataview.py | 152 ++++++++++++++++++++++++++ src/scidata/ops.py | 46 ++++++++ src/scidata/utils.py | 95 ++++++++++++++--- src/scidata/variable.py | 163 ++++++++++++++-------------- test/test_data.py | 33 ++++-- test/test_dataview.py | 73 +++++++++++++ test/test_utils.py | 8 +- test/test_variable.py | 29 +++-- 12 files changed, 733 insertions(+), 169 deletions(-) create mode 100644 src/scidata/common.py create mode 100644 src/scidata/dataview.py create mode 100644 
src/scidata/ops.py create mode 100644 test/test_dataview.py diff --git a/src/scidata/__init__.py b/src/scidata/__init__.py index 628427b3273..05cea9eb79a 100644 --- a/src/scidata/__init__.py +++ b/src/scidata/__init__.py @@ -1,4 +1,5 @@ from data import Dataset, open_dataset +from dataview import DataView from variable import Variable import backends diff --git a/src/scidata/backends.py b/src/scidata/backends.py index b548a4a46a1..4ed0def9c6e 100644 --- a/src/scidata/backends.py +++ b/src/scidata/backends.py @@ -1,8 +1,12 @@ +#TODO: refactor this module so all the stores just expose dimension, variables +# and attributes with the OrderedDict API that handle all the storage logic + import netCDF4 as nc4 from scipy.io import netcdf from collections import OrderedDict +from utils import FrozenOrderedDict import variable @@ -61,8 +65,8 @@ def __init__(self, fobj, *args, **kwdargs): @property def variables(self): - return OrderedDict((k, ScipyVariable(v)) - for k, v in self.ds.variables.iteritems()) + return FrozenOrderedDict((k, ScipyVariable(v)) + for k, v in self.ds.variables.iteritems()) @property def attributes(self): @@ -98,7 +102,8 @@ def unchecked_add_variable(self, name, variable): self.ds.variables[name][:] = variable.data[:] for k, v in variable.attributes.iteritems(): setattr(self.ds.variables[name], k, v) - return variable #self.ds.variables[name] + return variable + #TODO: return the variable instead? # return self.ds.variables[name] def sync(self): @@ -140,8 +145,8 @@ def __init__(self, filename, *args, **kwdargs): @property def variables(self): - return OrderedDict((k, NetCDF4Variable(v)) - for k, v in self.ds.variables.iteritems()) + return FrozenOrderedDict((k, NetCDF4Variable(v)) + for k, v in self.ds.variables.iteritems()) @property def attributes(self): @@ -150,7 +155,7 @@ def attributes(self): @property def dimensions(self): - return OrderedDict((k, len(v)) for k, v in self.ds.dimensions.iteritems()) + return FrozenOrderedDict((k, len(v)) for k, v in self.ds.dimensions.iteritems()) def unchecked_set_dimensions(self, dimensions): """Set the dimensions without checking validity""" diff --git a/src/scidata/common.py b/src/scidata/common.py new file mode 100644 index 00000000000..700ccfec396 --- /dev/null +++ b/src/scidata/common.py @@ -0,0 +1,55 @@ +import numpy as np + + +class _DataWrapperMixin(object): + @property + def data(self): + """ + The variable's data as a numpy.ndarray + """ + if not isinstance(self._data, np.ndarray): + self._data = np.asarray(self._data[...]) + return self._data + + @data.setter + def data(self, value): + value = np.asarray(value) + if value.shape != self.shape: + raise ValueError("replacement data must match the Variable's " + "shape") + self._data = value + + @property + def dtype(self): + return self._data.dtype + + @property + def shape(self): + return self._data.shape + + @property + def size(self): + return self._data.size + + @property + def ndim(self): + return self._data.ndim + + def __len__(self): + return len(self._data) + + def __nonzero__(self): + return bool(self._data) + + def __float__(self): + return float(self._data) + + def __int__(self): + return int(self._data) + + def __complex__(self): + return complex(self._data) + + def __long__(self): + return long(self._data) + diff --git a/src/scidata/data.py b/src/scidata/data.py index 70504c9c60c..c80c84d6bb9 100644 --- a/src/scidata/data.py +++ b/src/scidata/data.py @@ -1,14 +1,17 @@ # TODO Use various backend data stores. 
pytable, ncdf4, scipy.io, iris, memory - import os import copy import numpy as np import netCDF4 as nc4 +import pandas as pd from cStringIO import StringIO from collections import OrderedDict -import conventions, backends, variable, utils +from dataview import DataView +from utils import FrozenOrderedDict +from variable import Variable +import backends, conventions, utils date2num = nc4.date2num num2date = nc4.num2date @@ -115,7 +118,7 @@ class Dataset(object): store : baackends.*DataStore """ def __init__(self, variables=None, dimensions=None, attributes=None, - store=None, check_consistency=True): + store=None, indices=None, check_consistency=True): """ If dimensions are not provided, they are inferred from the variables. @@ -125,7 +128,7 @@ def __init__(self, variables=None, dimensions=None, attributes=None, # TODO: fill out this docstring if store is None: store = backends.InMemoryDataStore() - object.__setattr__(self, 'store', store) + self.store = store if attributes is not None: self._unchecked_set_attributes(attributes) @@ -140,8 +143,42 @@ def __init__(self, variables=None, dimensions=None, attributes=None, check_dims_and_vars_consistency(dimensions, variables) self._unchecked_set_variables(variables) - def _unchecked_set_dimensions(self, *args, **kwdargs): - self.store.unchecked_set_dimensions(*args, **kwdargs) + if indices is None: + indices = {} + else: + for k, v in indices.iteritems(): + if k not in self.dimensions or v.size != self.dimensions[k]: + raise ValueError('inconsisent index %r' % k) + self._indices = indices + + def _create_index(self, dim): + if dim in self.variables: + var = self.variables[dim] + data = var.data + attr = var.attributes + if 'units' in attr and 'since' in attr['units']: + print 'time units!' + data = num2date(data, attr['units']) + else: + print var.attributes + elif dim in self.dimensions: + data = np.arange(self.dimensions[dim]) + else: + raise ValueError('cannot find index %r in dataset' % dim) + return pd.Index(data) + + def _lookup_index(self, dim): + if dim not in self._indices: + self._indices[dim] = self._create_index(dim) + return self._indices[dim] + + @property + def indices(self): + return FrozenOrderedDict((dim, self._lookup_index(dim)) + for dim in self.dimensions) + + def _unchecked_set_dimensions(self, dims, *args, **kwdargs): + self.store.unchecked_set_dimensions(dims, *args, **kwdargs) def _unchecked_set_attributes(self, *args, **kwdargs): self.store.unchecked_set_attributes(*args, **kwdargs) @@ -149,8 +186,8 @@ def _unchecked_set_attributes(self, *args, **kwdargs): def _unchecked_set_variables(self, *args, **kwdargs): self.store.unchecked_set_variables(*args, **kwdargs) - def _unchecked_create_dimension(self, *args, **kwdargs): - self.store.unchecked_create_dimension(*args, **kwdargs) + def _unchecked_create_dimension(self, dim, *args, **kwdargs): + self.store.unchecked_create_dimension(dim, *args, **kwdargs) def _unchecked_add_variable(self, *args, **kwdargs): return self.store.unchecked_add_variable(*args, **kwdargs) @@ -183,14 +220,6 @@ def __copy__(self): return type(self)(self.variables, self.dimensions, self.attributes, check_consistency=False) - def __setattr__(self, attr, value): - """"__setattr__ is overloaded to prevent operations that could - cause loss of data consistency. If you really intend to update - dir(self), use the self.__dict__.update method or the - super(type(a), self).__setattr__ method to bypass.""" - #TODO: remove this hack? 
- raise AttributeError("__setattr__ is disabled") - def __contains__(self, key): """ The 'in' operator will return true or false depending on @@ -209,7 +238,10 @@ def __setitem__(self, key, value): def __delitem__(self, key): # does deleting variables make sense for all backends? - raise NotImplementedError + del self.variables[key] + + #TODO: add keys, items, and values methods (and the iter versions) to + # complete the dict analogy? # mutable objects should not be hashable __hash__ = None @@ -233,7 +265,7 @@ def __ne__(self, other): @property def coordinates(self): """Coordinates are variables with names that match dimensions""" - return OrderedDict([(dim, self.variables[dim]) + return FrozenOrderedDict([(dim, self.variables[dim]) for dim in self.dimensions if dim in self.variables and self.variables[dim].data.ndim == 1 and @@ -244,7 +276,7 @@ def noncoordinates(self): """Non-coordinates are variables with names that do not match dimensions """ - return OrderedDict([(name, v) + return FrozenOrderedDict([(name, v) for (name, v) in self.variables.iteritems() if name not in self.coordinates]) @@ -359,7 +391,7 @@ def create_variable(self, name, dims, data, attributes=None): Reference to the newly created variable. """ # any error checking should be taken care of by add_variable - v = variable.Variable(dims, np.asarray(data), attributes) + v = Variable(dims, np.asarray(data), attributes) return self.add_variable(name, v) def create_coordinate(self, name, data, attributes=None): @@ -395,7 +427,7 @@ def create_coordinate(self, name, data, attributes=None): # end up in a partial state. if name in self.coordinates: raise ValueError("coordinate named '%s' already exists" % name) - var = variable.Variable((name,), np.asarray(data), attributes) + var = Variable((name,), np.asarray(data), attributes) if var.ndim != 1: raise ValueError("coordinate data must be 1-dimensional (vector)") if name not in self.dimensions: @@ -411,7 +443,7 @@ def add_variable(self, name, var): ---------- name : string The name under which the variable will be added. - variable : variable.Variable + variable : Variable The variable to be added. If the desired action is to add a copy of the variable be sure to do so before passing it to this function. 
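For orientation, a minimal usage sketch of the construction methods above, assuming the Dataset and Variable API introduced in this patch series; the names, shapes, and attribute values are hypothetical:

    import numpy as np
    from scidata import Dataset, Variable

    ds = Dataset()  # defaults to an in-memory store
    # create_coordinate adds the dimension and the matching 1D variable together
    ds.create_coordinate('x', np.arange(10))
    ds.create_dimension('y', 3)
    # create_variable wraps the data in a Variable and checks dimension consistency
    ds.create_variable('foo', ('x', 'y'), np.random.randn(10, 3),
                       attributes={'units': 'meters'})
    # add_variable takes an already-constructed Variable instead
    ds.add_variable('bar', Variable(('x',), np.zeros(10)))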
@@ -422,6 +454,9 @@ def add_variable(self, name, var): """ if name in self.variables: raise ValueError("Variable named %r already exists" % name) + if name in self.dimensions and name in self._indices: + # remove existing index + del self._indices[name] check_dims_and_vars_consistency(self.dimensions, {name: var}) return self._unchecked_add_variable(name, var) @@ -455,7 +490,7 @@ def views(self, slicers): """ if not all(k in self.dimensions for k in slicers): invalid = [k for k in slicers if not k in self.dimensions] - raise KeyError("dimensions %r don't exist" % invalid) + raise ValueError("dimensions %r don't exist" % invalid) # slice all variables variables = OrderedDict() @@ -495,9 +530,32 @@ def search_dim_len(dim, variables): # integer (dimension 0) dimensions[dim] = new_len + # TODO: slice indices and pass them on so we don't need to create them + # from scratch in the new object return type(self)(variables, dimensions, self.attributes, check_consistency=False) + def loc_views(self, slicers): + islicers = {} + for k, v in slicers.iteritems(): + index = self.indices[k] + if isinstance(v, slice): + ind_slice = index.slice_indexer(v.start, v.stop) + print v, ind_slice + # assume step-size is valid unchanged + islicers[k] = slice(ind_slice.start, ind_slice.stop, v.step) + elif not np.iterable(v): + islicers[k] = index.get_loc(v) + else: + new_index, indexer = index.reindex(v) + if np.any(indexer < 0): + raise ValueError('not all values found in index %s' % k) + islicers[k] = indexer + # FIXME: don't throw away new_index (we'll need to recreate it + # later) + return self.views(islicers) + + def view(self, s, dim): """Return a new object whose contents are a view of a slice from the current object along a specified dimension @@ -530,7 +588,7 @@ def view(self, s, dim): """ return self.views({dim: s}) - def take(self, indices, dim=None): + def take(self, indices, dim): """Return a new object whose contents are taken from the current object along a specified dimension @@ -543,9 +601,6 @@ def take(self, indices, dim=None): The dimension to slice along. If multiple dimensions of a variable equal dim (e.g. a correlation matrix), then that variable is sliced only along its first matching dimension. - If None (default), then the object is sliced along its - unlimited dimension; an exception is raised if the object - does not have an unlimited dimension. Returns ------- @@ -562,8 +617,6 @@ def take(self, indices, dim=None): numpy.take Variable.take """ - if dim is None: - raise ValueError("dim cannot be None") # Create a new object obj = type(self)() # Create fancy-indexed variables and infer the new dimension length @@ -607,7 +660,7 @@ def renamed(self, name_dict): dims = tuple(name_dict.get(dim, dim) for dim in v.dimensions) #TODO: public interface for renaming a variable without loading # data - variables[name] = variable.Variable(dims, v._data, v.attributes) + variables[name] = Variable(dims, v._data, v.attributes) dimensions = OrderedDict((name_dict.get(k, k), v) for k, v in self.dimensions.iteritems()) @@ -615,11 +668,27 @@ def renamed(self, name_dict): return type(self)(variables, dimensions, self.attributes, check_consistency=False) - def join(self, other): - """ - Join two datasets into a single new dataset + def merge(self, other): + """Merge two datasets into a single new dataset - Raises ValueError if any variables or dimensions do not match. + Parameters + ---------- + other : Dataset + Dataset to merge with this dataset. 
+ + Returns + ------- + Dataset + New dataset with the merged contents of both datasets + + Raises + ------ + ValueError + If any variables, dimensions or attributes conflict. + + See Also + -------- + Dataset.update : update a dataset in place """ new_vars = utils.safe_merge(self.variables, other.variables, compat=utils.variable_equal) @@ -627,10 +696,37 @@ def join(self, other): new_attr = utils.safe_merge(self.attributes, other.attributes) return type(self)(new_vars, new_dims, new_attr) + def update(self, other): + """Update this dataset in place with the contents of another dataset + + Parameters + ---------- + other : Dataset + Dataset with which to update this dataset. + + Raises + ------ + ValueError + If any variables, dimensions or attributes conflict. + + See Also + -------- + Dataset.merge : merge two datasets into a new dataset + """ + # check for conflicts + utils.update_safety_check(self.variables, other.variables, + compat=utils.variable_equal) + utils.update_safety_check(self.dimensions, other.dimensions) + utils.update_safety_check(self.attributes, other.attributes) + # update contents + self.variables.update(other.variables) + self.dimensions.update(other.dimensions) + self.attributes.update(other.attributes) + def select(self, *names): - """Return a new object that contains the specified namesiables, - along with the dimensions on which those variables are defined - and corresponding coordinate variables. + """Returns a new dataset that contains the named variables, along with + the dimensions on which those variables are defined and corresponding + coordinate variables. Parameters ---------- @@ -639,17 +735,16 @@ def select(self, *names): Returns ------- - obj : Data object - The returned object has the same attributes as the - original. A dimension is included if at least one of the - specified variables is defined along that dimension. - Coordinate variables (1-dimensional variables with the same - name as a dimension) that correspond to an included - dimension are also included. All other variables are + Dataset + The returned object has the same attributes as the original. A + dimension is included if at least one of the specified variables is + defined along that dimension. Coordinate variables (1-dimensional + variables with the same name as a dimension) that correspond to an + included dimension are also included. All other variables are dropped. """ if not all(k in self.variables for k in names): - raise KeyError( + raise ValueError( "One or more of the specified variables does not exist") dim_names = (set(self.variables[k].dimensions) for k in names) @@ -662,6 +757,51 @@ def select(self, *names): return type(self)(variables, dimensions, self.attributes, check_consistency=False) + def unselect(self, *names): + """Returns a new dataset without the named variables + + Parameters + ---------- + *names : str + Names of the variables to omit from the returned object. + + Returns + ------- + Dataset + New dataset based on this dataset. Only the named variables are + removed. Dimensions are unchanged. 
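As a rough sketch of how merge, update, select, and unselect are meant to compose (hypothetical datasets, assuming the API documented above):

    import numpy as np
    from scidata import Dataset, Variable

    first = Dataset({'a': Variable(('x',), np.arange(5))})
    second = Dataset({'b': Variable(('x',), np.ones(5))})

    merged = first.merge(second)  # new Dataset holding both 'a' and 'b'
    only_a = merged.select('a')   # 'a' plus its dimensions and coordinates
    no_a = merged.unselect('a')   # everything except 'a'
    first.update(second)          # modifies `first` in place; conflicts raise ValueError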
+ """ + if not all(k in self.variables for k in names): + raise ValueError( + "One or more of the specified variables does not exist") + variables = OrderedDict((k, v) for k, v in self.variables.iteritems() + if k not in names) + return type(self)(variables, self.dimensions, self.attributes, + check_consistency=False) + + def to_dataview(self, name, extra_variables=None): + """Return a dataview selected from this dataset + + Parameters + ---------- + name : str + Name of variable on which to orient (and name) the new dataview. + extra_variables : sequence of str, optional + Additional variables from this dataset to include in the dataview's + dataset. These variables's coordinates (if any) are also included. + By default, the dataview's dataset only includes `name` and all of + its coordinate variables. + + Returns + ------- + dataview : DataView + Dataview with a selection of variables from this dataset and the + name `name`. + """ + if extra_variables is None: + extra_variables = [] + return DataView(self.select(*([name] + list(extra_variables))), name) + def iterator(self, dim=None, views=False): """Iterator along a data dimension diff --git a/src/scidata/dataview.py b/src/scidata/dataview.py new file mode 100644 index 00000000000..d8dbd7c6b3a --- /dev/null +++ b/src/scidata/dataview.py @@ -0,0 +1,152 @@ +import re + +import variable +from common import _DataWrapperMixin +from ops import inject_special_operations +from utils import expanded_indexer + + +class DataView(_DataWrapperMixin): + """ + A Dataset wrapper oriented around a single Variable + + Dataviews are the primary way to do computations with Dataset variables. + They are designed to make it easy to manipulate variables in the context of + an intact Dataset object. Getting items from or doing mathematical + operations with a dataview returns another dataview. + + The design of dataviews is strongly inspired by the Iris Cube. However, + dataviews are much lighter weight than cubes. They are simply aligned, + labeled datasets and do not explicitly guarantee or rely on the CF model. + """ + def __init__(self, dataset, name): + """ + Parameters + ---------- + dataset : scidata.Dataset + The dataset on which to build this data view. + name : str + The name of the "focus variable" in dataset on which this view is + oriented. 
+ """ + if not name in dataset: + raise ValueError('name %r is not a variable in dataset %r' + % (name, dataset)) + self.dataset = dataset + self.name = name + + @property + def variable(self): + return self.dataset[self.name] + + @variable.setter + def variable(self, value): + # TODO: remove this line, so we change DataView's dataset in-place + # (if supported by the underlying store) + self.dataset = self.unselected() + self.dataset[self.name] = value + + @property + def _data(self): + # necessary for _DataWrapperMixin + return self.variable._data + + @property + def dimensions(self): + return self.variable.dimensions + + def __getitem__(self, key): + key = expanded_indexer(key, self.ndim) + slicers = dict(zip(self.dimensions, key)) + return type(self)(self.dataset.views(slicers), self.name) + + def __setitem__(self, key, value): + self.variable[key] = value + + def __iter__(self): + for n in range(len(self)): + yield self[n] + + @property + def attributes(self): + return self.variable.attributes + + def copy(self): + return self.__copy__() + + def __copy__(self): + # shallow copy the underlying dataset + return DataView(self.dataset.copy(), self.name) + + # mutable objects should not be hashable + __hash__ = None + + def __str__(self): + #TODO: make this less hacky + return re.sub(' {4}(%s\s+%s)' % (self.dtype, self.name), + r'--> \1', str(self.dataset)) + + def __repr__(self): + if self.ndim > 0: + dim_summary = ', '.join('%s%s: %s' % + ('@' if k in self.dataset else '', k, v) + for k, v in zip(self.dimensions, + self.shape)) + contents = ' (%s): %s' % (dim_summary, self.dtype) + else: + contents = ': %s' % self.data + return '' % (type(self).__name__, self.name, contents) + + def renamed(self, new_name): + """Returns a new DataView with this DataView's focus variable renamed + """ + renamed_dataset = self.dataset.renamed({self.name: new_name}) + return type(self)(renamed_dataset, new_name) + + def unselected(self): + """Returns a copy of this DataView's dataset with this DataView's + focus variable removed + """ + return self.dataset.unselect(self.name) + + def transpose(self, *dimensions): + ds = self.unselected() + ds[self.name] = self.variable.transpose(*dimensions) + return DataView(ds, self.name) + + +def unary_op(f): + def func(self): + ds = self.unselected() + ds[self.name] = f(self.variable) + return DataView(ds, self.name) + return func + + +def binary_op(f, reflexive=False): + def func(self, other): + ds = self.unselected() + other_variable = getattr(other, 'variable', other) + ds[self.name] = (f(self.variable, other_variable) + if not reflexive + else f(other_variable, self.variable)) + if hasattr(other, 'unselected'): + ds.update(other.unselected()) + return DataView(ds, self.name) + return func + + +def inplace_binary_op(f): + def func(self, other): + other_variable = getattr(other, 'variable', other) + self.variable = f(self.variable, other_variable) + if hasattr(other, 'unselected'): + self.dataset.update(other.unselected()) + return self + return func + + +inject_special_operations(DataView, unary_op, binary_op, inplace_binary_op) + +DataView.tranpose = unary_op(variable.Variable.transpose) + diff --git a/src/scidata/ops.py b/src/scidata/ops.py new file mode 100644 index 00000000000..4677fc6a21e --- /dev/null +++ b/src/scidata/ops.py @@ -0,0 +1,46 @@ +import operator + + +# NUMPY_COLLAPSE_METHODS = ['argmax', 'min', 'argmin', 'ptp', 'sum', 'cumsum', +# 'mean', 'var', 'std', 'prod', 'cumprod', 'all', +# 'any'] + +# def wrap_numpy_collapse_method(f): +# def func(self, 
dimension=None, axis=None): +# if dimension is not None: +# if axis is None: +# axis = self.dimensions.index(dim) +# else: +# raise ValueError("cannot specify both 'axis' and 'dimension'") +# # dims = tuple(dim for dim in self.dimension is dim != dimension) +# if axis is not None: +# dims = tuple(dim for i, dim in enumerate(self.dimension) +# if i not in [axis, axis + self.ndim]) +# else: +# dims = (), +# data = f(self.data, axis=axis) +# return Variable(dims, data, self.attributes) + + +UNARY_OPS = ['neg', 'pos', 'abs', 'invert'] +CMP_BINARY_OPS = ['lt', 'le', 'eq', 'ne', 'ge', 'gt'] +NUM_BINARY_OPS = ['add', 'sub', 'mul', 'div', 'truediv', 'floordiv', 'mod', + 'pow', 'and', 'xor', 'or'] + + +def inject_special_operations(cls, unary_op, binary_op, inplace_binary_op, + priority=50): + # priortize our operations over those of numpy.ndarray (priority=1) + # and numpy.matrix (priority=10) + cls.__array_priority__ = priority + op_str = lambda name: '__%s__' % name + op = lambda name: getattr(operator, op_str(name)) + # patch in standard special operations + for op_names, op_wrap in [(UNARY_OPS, unary_op), + (CMP_BINARY_OPS + NUM_BINARY_OPS, binary_op)]: + for name in op_names: + setattr(cls, op_str(name), op_wrap(op(name))) + # only numeric operations have in-place and reflexive variants + for name in NUM_BINARY_OPS: + setattr(cls, op_str('r' + name), binary_op(op(name), reflexive=True)) + setattr(cls, op_str('i' + name), inplace_binary_op(op('i' + name))) diff --git a/src/scidata/utils.py b/src/scidata/utils.py index ae591fab9bc..99813958aa6 100644 --- a/src/scidata/utils.py +++ b/src/scidata/utils.py @@ -9,21 +9,73 @@ def expanded_indexer(key, ndim): key which is a tuple with length equal to the number of dimensions """ if not isinstance(key, tuple): + # numpy treats non-tuple keys equivalent to tuples of length 1 key = (key,) - new_key = [slice(None)] * ndim - new_key[:len(key)] = key + new_key = [] + # handling Ellipsis right is a little tricky, see: + # http://docs.scipy.org/doc/numpy/reference/arrays.indexing.html#advanced-indexing + found_ellipsis = False + for k in key: + if k is Ellipsis: + if not found_ellipsis: + new_key.extend((ndim + 1 - len(key)) * [slice(None)]) + found_ellipsis = True + else: + new_key.append(slice(None)) + else: + new_key.append(k) + new_key.extend((ndim - len(new_key)) * [slice(None)]) return tuple(new_key) -def safe_merge(*dicts, **kwargs): - """Merge any number of dictionaries into a new OrderedDict +def update_safety_check(first_dict, second_dict, compat=operator.eq): + """Check the safety of updating one dictionary with another Raises ValueError if dictionaries have non-compatible values for any key, where compatibility is determined by the `compat` function. Parameters ---------- - *dicts : dict-like + first_dict, second_dict : dict-like + All items in the second dictionary are checked against for conflicts + against items in the first dictionary. + compat : function, optional + Binary operator to determine if two values are compatible. By default, + checks for equality. + """ + for k, v in second_dict.iteritems(): + if k in first_dict and not compat(v, first_dict[k]): + raise ValueError('unsafe to merge dictionaries without ' + 'overriding values') + + +def safe_update(first_dict, second_dict, compat=operator.eq): + """Safely update a dictionary with another dictionary + + Raises ValueError if dictionaries have non-compatible values for any key, + where compatibility is determined by the `compat` function. 
+ + Parameters + ---------- + first_dict, second_dict : dict-like + Mappings to merge. The first dictionary is modified in place. + compat : function, optional + Binary operator to determine if two values are compatible. By default, + checks for equality. + """ + update_safety_check(first_dict, second_dict, compat=compat) + first_dict.update(second_dict) + + +def safe_merge(first_dict, second_dict, compat=operator.eq): + """Safely merge two dictionaries into a new OrderedDict + + Raises ValueError if dictionaries have non-compatible values for any key, + where compatibility is determined by the `compat` function. + + Parameters + ---------- + first_dict, second_dict : dict-like Mappings to merge. compat : function, optional Binary operator to determine if two values are compatible. By default, @@ -34,14 +86,10 @@ def safe_merge(*dicts, **kwargs): merged : OrderedDict Merged contents. """ - compat = kwargs.pop('compat', operator.eq) - merged = OrderedDict() - for d in dicts: - for k, v in d.iteritems(): - if k in merged and not compat(v, merged[k]): - raise ValueError('cannot override values with safe_merge') - merged[k] = v - return merged + update_safety_check(first_dict, second_dict, compat=compat) + new_dict = OrderedDict(first_dict) + new_dict.update(second_dict) + return new_dict def variable_equal(v1, v2): @@ -64,3 +112,24 @@ def variable_equal(v1, v2): return np.array_equal(v1.data, v2.data) else: return False + + +class FrozenOrderedDict(OrderedDict): + """A subclass of OrderedDict whose contents are frozen after initialization + to prevent tampering + """ + def __init__(self, *args, **kwds): + # bypass the disabled __setitem__ method + # initialize as an empty OrderedDict + super(FrozenOrderedDict, self).__init__() + # Capture arguments in an OrderedDict + args_dict = OrderedDict(*args, **kwds) + # Call __setitem__ of the superclass + for (key, value) in args_dict.iteritems(): + super(FrozenOrderedDict, self).__setitem__(key, value) + + def _not_implemented(self, *args, **kwargs): + raise TypeError('%s is immutable' % type(self).__name__) + + __setitem__ = __delitem__ = setdefault = update = pop = popitem = clear = \ + _not_implemented diff --git a/src/scidata/variable.py b/src/scidata/variable.py index 26cb5dfa1d9..290fc658e31 100644 --- a/src/scidata/variable.py +++ b/src/scidata/variable.py @@ -1,13 +1,15 @@ import copy import numpy as np -from collections import OrderedDict -from functools import wraps -import operator import warnings +from collections import OrderedDict import conventions -from utils import expanded_indexer, safe_merge +import data +import dataview +from common import _DataWrapperMixin +from utils import expanded_indexer, safe_merge, safe_update +from ops import inject_special_operations class AttributesDict(OrderedDict): @@ -110,7 +112,7 @@ def _as_compatible_data(data): return data -class Variable(object): +class Variable(_DataWrapperMixin): """ A netcdf-like variable consisting of dimensions, data and attributes which describe a single varRiable. 
A single variable object is not @@ -131,49 +133,6 @@ def __init__(self, dims, data, attributes=None): def dimensions(self): return self._dimensions - @property - def data(self): - """ - The variable's data as a numpy.ndarray - """ - if not isinstance(self._data, np.ndarray): - self._data = np.asarray(self._data[...]) - return self._data - - @data.setter - def data(self, value): - value = np.asarray(value) - if value.shape != self.shape: - raise ValueError("replacement data must match the Variable's " - "shape") - self._data = value - - @property - def dtype(self): - return self._data.dtype - - @property - def shape(self): - return self._data.shape - - @property - def size(self): - return self._data.size - - @property - def ndim(self): - return self._data.ndim - - def __len__(self): - return len(self._data) - - def __nonzero__(self): - if self.size == 1: - return bool(self.data) - else: - raise ValueError('ValueError: The truth value of variable with ' - 'more than one element is ambiguous.') - def __getitem__(self, key): """ Return a new Variable object whose contents are consistent with getting @@ -182,10 +141,20 @@ def __getitem__(self, key): key = expanded_indexer(key, self.ndim) dimensions = [dim for k, dim in zip(key, self.dimensions) if not isinstance(k, int)] + #TODO: wrap _data in a biggus array or use np.ix_ so fancy indexing + # always slices axes independently (as in the python-netcdf4 package) + new_data = self._data[key] + if new_data.ndim != len(dimensions): + raise ValueError('indexing results in an array of shape %s, ' + 'which has inconsistent length with the ' + 'expected dimensions %s (if you really want to ' + 'do this sort of indexing, index the `data` ' + 'attribute directly)' + % (new_data.shape, dimensions)) # always return a Variable, because Variable subtypes may have # different constructors and may not make sense without an attached # datastore - return Variable(dimensions, self._data[key], self.attributes) + return Variable(dimensions, new_data, self.attributes) def __setitem__(self, key, value): """__setitem__ is overloaded to access the underlying numpy data""" @@ -254,10 +223,13 @@ def __str__(self): return '\n'.join(summary).replace('\t', ' ' * 4) def __repr__(self): - dim_summary = ', '.join('%s: %s' % (k, v) for k, v - in zip(self.dimensions, self.shape)) - return '' % (type(self).__name__, - dim_summary, self.dtype) + if self.ndim > 0: + dim_summary = ', '.join('%s: %s' % (k, v) for k, v + in zip(self.dimensions, self.shape)) + contents = ' (%s): %s' % (dim_summary, self.dtype) + else: + contents = ': %s' % self.data + return '' % (type(self).__name__, contents) def views(self, slicers): """Return a new Variable object whose contents are a view of the object @@ -351,10 +323,53 @@ def take(self, indices, dim): data = self.data.take(indices, axis=axis) return Variable(self.dimensions, data, self.attributes) + def transpose(self, *dimensions): + """Return a new Variable object with transposed dimensions + + Note: Although this operation returns a view of this variable's data, + it is not lazy -- the data will be fully loaded. + + Parameters + ---------- + *dimensions : str, optional + By default, reverse the dimensions. Otherwise, reorder the + dimensions to this order. + + Returns + ------- + obj : Variable object + The returned object has transposed data and dimensions with the + same attributes as the original. 
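The tests later in this patch exercise these semantics; a condensed sketch (made-up shapes) of what transpose and dimension-name broadcasting are intended to do:

    import numpy as np
    from scidata import Variable

    a = Variable(('x',), np.arange(3))
    b = Variable(('y',), np.arange(4.0))
    product = a * b                        # operands align by dimension name -> dims ('x', 'y')
    flipped = product.transpose('y', 'x')  # reorder dimensions by name; shape becomes (4, 3)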
+ + See Also + -------- + numpy.transpose + """ + if len(dimensions) == 0: + dimensions = self.dimensions[::-1] + axes = [dimensions.index(dim) for dim in self.dimensions] + data = self.data.transpose(*axes) + return Variable(dimensions, data, self.attributes) + def broadcast_var_data(self, other): self_data = self.data - if all(hasattr(other, attr) for attr in ['dimensions', 'data']): + if isinstance(other, data.Dataset): + raise TypeError('datasets do not support mathematical operations') + elif all(hasattr(other, attr) for attr in ['dimensions', 'data', 'shape']): + # validate dimensions + dim_lengths = dict(zip(self.dimensions, self.shape)) + for k, v in zip(other.dimensions, other.shape): + if k in dim_lengths and dim_lengths[k] != v: + raise ValueError('operands could not be broadcast together ' + 'with mismatched lengths for dimension %r: %s' + % (k, (dim_lengths[k], v))) + for dimensions in [self.dimensions, other.dimensions]: + if len(set(dimensions)) < len(dimensions): + raise ValueError('broadcasting requires that neither operand ' + 'has duplicate dimensions: %r' + % list(dimensions)) + # build dimensions for new Variable other_only_dims = [dim for dim in other.dimensions if dim not in self.dimensions] @@ -393,20 +408,16 @@ def _math_safe_attributes(v): return OrderedDict((k, v) for k, v in attr.items() if k != 'units') -def unary_op_wrapper(name): - f = getattr(operator, '__%s__' % name) - @wraps(f) +def unary_op(f): def func(self): - new_data = f(self.data) - new_attr = _math_safe_attributes(self) - return Variable(self.dimensions, new_data, new_attr) + return Variable(self.dimensions, f(self.data), self.attributes) return func -def binary_op(name, reflexive=False): - f = getattr(operator, '__%s__' % name) - @wraps(f) +def binary_op(f, reflexive=False): def func(self, other): + if isinstance(other, dataview.DataView): + return NotImplemented self_data, other_data, new_dims = broadcast_var_data(self, other) new_data = (f(self_data, other_data) if not reflexive @@ -417,35 +428,15 @@ def func(self, other): return func -def inplace_binary_op(name): - f = getattr(operator, '__i%s__' % name) - @wraps(f) +def inplace_binary_op(f): def func(self, other): self_data, other_data, dimensions = broadcast_var_data(self, other) if dimensions != self.dimensions: raise ValueError('dimensions cannot change for in-place operations') self.data = f(self_data, other_data) + safe_update(self.attributes, _math_safe_attributes(other)) return self return func -UNARY_OPS = ['neg', 'pos', 'abs', 'invert'] -CMP_BINARY_OPS = ['lt', 'le', 'eq', 'ne', 'ge', 'gt'] -NUM_BINARY_OPS = ['add', 'sub', 'mul', 'div', 'truediv', 'floordiv', 'mod', - 'pow', 'and', 'xor', 'or'] - - -def inject_special_operations(cls, priority=10): - # priortize our operations over numpy.ndarray's (priority=1.0) - cls.__array_priority__ = priority - for name in UNARY_OPS: - setattr(cls, '__%s__' % name, unary_op_wrapper(name)) - for name in CMP_BINARY_OPS: - setattr(cls, '__%s__' % name, binary_op(name)) - for name in NUM_BINARY_OPS: - setattr(cls, '__%s__' % name, binary_op(name)) - setattr(cls, '__r%s__' % name, binary_op(name, reflexive=True)) - setattr(cls, '__i%s__' % name, inplace_binary_op(name)) - - -inject_special_operations(Variable) +inject_special_operations(Variable, unary_op, binary_op, inplace_binary_op) diff --git a/test/test_data.py b/test/test_data.py index 282299e49fd..c59d4ca8271 100644 --- a/test/test_data.py +++ b/test/test_data.py @@ -20,7 +20,7 @@ def create_test_data(store=None): obj = Dataset(store=store) - 
obj.create_dimension('time', 10) + obj.create_dimension('time', 1000) for d, l in sorted(_dims.items()): obj.create_dimension(d, l) var = obj.create_variable(name=d, dims=(d,), @@ -40,8 +40,8 @@ def get_store(self): def test_repr(self): data = create_test_data(self.get_store()) - self.assertEqual('', repr(data)) + self.assertEqual('', repr(data)) def test_iterator(self): data = create_test_data(self.get_store()) @@ -233,11 +233,11 @@ def test_views(self): # actual.fill(np.pi) # np.testing.assert_array_equal(expected, actual) - self.assertRaises(KeyError, data.views, + self.assertRaises(ValueError, data.views, {'not_a_dim': slice(0, 2)}) ret = data.views({'dim1': 0}) - self.assertEqual({'time': 10, 'dim2': 50, 'dim3': 10}, ret.dimensions) + self.assertEqual({'time': 1000, 'dim2': 50, 'dim3': 10}, ret.dimensions) ret = data.views({'time': slice(2), 'dim1': 0, 'dim2': slice(5)}) self.assertEqual({'time': 2, 'dim2': 5, 'dim3': 10}, ret.dimensions) @@ -245,6 +245,19 @@ def test_views(self): ret = data.views({'time': 0, 'dim1': 0, 'dim2': slice(5)}) self.assertItemsEqual({'dim2': 5, 'dim3': 10}, ret.dimensions) + def test_loc_views(self): + data = create_test_data(self.get_store()) + int_slicers = {'dim1': slice(None, None, 2), 'dim2': slice(0, 2)} + loc_slicers = {'dim1': slice(None, None, 2), 'dim2': slice(0, 1)} + self.assertEqual(data.views(int_slicers), data.loc_views(loc_slicers)) + data.create_variable('time', ['time'], np.arange(1000, dtype=np.int32), + {'units': 'days since 2000-01-01'}) + data.create_variable('foobar', ['time', 'dim3'], + np.random.randn(1000, 10)) + self.assertEqual(data.views({'time': slice(10)}), + data.loc_views({'time': + slice('2000-01-01', '2000-01-10')})) + self.assertEqual(data, data.loc_views({'time': slice('1999', '2005')})) def test_take(self): data = create_test_data(self.get_store()) @@ -298,7 +311,7 @@ def test_select(self): ret = data.select(_testvar) self.assertVarEqual(data[_testvar], ret[_testvar]) self.assertTrue(_vars.keys()[1] not in ret.variables) - self.assertRaises(KeyError, data.select, (_testvar, 'not_a_var')) + self.assertRaises(ValueError, data.select, (_testvar, 'not_a_var')) def test_copy(self): data = create_test_data(self.get_store()) @@ -341,17 +354,17 @@ def test_rename(self): self.assertTrue('dim2' not in renamed.variables) self.assertTrue('dim2' not in renamed.dimensions) - def test_join(self): + def test_merge(self): data = create_test_data(self.get_store()) ds1 = data.select('var1') ds2 = data.select('var3') expected = data.select('var1', 'var3') - actual = ds1.join(ds2) + actual = ds1.merge(ds2) self.assertEqual(expected, actual) with self.assertRaises(ValueError): - ds1.join(ds2.view(0, 'dim1')) + ds1.merge(ds2.view(0, 'dim1')) with self.assertRaises(ValueError): - ds1.join(ds2.renamed({'var3': 'var1'})) + ds1.merge(ds2.renamed({'var3': 'var1'})) class NetCDF4DataTest(DataTest): diff --git a/test/test_dataview.py b/test/test_dataview.py new file mode 100644 index 00000000000..7a569b8e33e --- /dev/null +++ b/test/test_dataview.py @@ -0,0 +1,73 @@ +import numpy as np + +from scidata import Dataset, DataView, Variable +from . 
import TestCase + + +class TestDataView(TestCase): + def assertViewEqual(self, dv1, dv2): + self.assertEqual(dv1.dataset, dv2.dataset) + self.assertEqual(dv1.name, dv2.name) + + def setUp(self): + self.x = np.random.random((10, 20)) + self.v = Variable(['x', 'y'], self.x) + self.ds = Dataset({'foo': self.v}) + self.ds.create_coordinate('x', np.arange(10)) + self.ds.create_coordinate('y', np.arange(20)) + self.dv = DataView(self.ds, 'foo') + + def test_properties(self): + self.assertIs(self.dv.dataset, self.ds) + self.assertEqual(self.dv.name, 'foo') + self.assertVarEqual(self.dv.variable, self.v) + self.assertArrayEqual(self.dv.data, self.v.data) + for attr in ['dimensions', 'dtype', 'shape', 'size', 'ndim', + 'attributes']: + self.assertEqual(getattr(self.dv, attr), getattr(self.v, attr)) + self.assertEqual(len(self.dv), len(self.v)) + self.assertVarEqual(self.dv, self.v) + + def test_items(self): + self.assertVarEqual(self.dv[0], self.v[0]) + self.assertEqual(self.dv[0].dataset, self.ds.views({'x': 0})) + self.assertVarEqual(self.dv[:3, :5], self.v[:3, :5]) + self.assertEqual(self.dv[:3, :5].dataset, + self.ds.views({'x': slice(3), 'y': slice(5)})) + + def test_renamed(self): + renamed = self.dv.renamed('bar') + self.assertEqual(renamed.dataset, self.ds.renamed({'foo': 'bar'})) + self.assertEqual(renamed.name, 'bar') + + def test_to_dataview(self): + dv = self.ds.to_dataview('foo') + self.assertViewEqual(dv, self.dv) + + def test_math(self): + x = self.x + v = self.v + a = self.dv + # variable math was already tested extensively, so let's just make sure + # that all types are properly converted here + self.assertViewEqual(a, +a) + self.assertViewEqual(a, a + 0) + self.assertViewEqual(a, 0 + a) + self.assertViewEqual(a, a + 0 * v) + self.assertViewEqual(a, 0 * v + a) + self.assertViewEqual(a, a + 0 * x) + self.assertViewEqual(a, 0 * x + a) + self.assertViewEqual(a, a + 0 * a) + self.assertViewEqual(a, 0 * a + a) + + def test_inplace_math(self): + x = self.x + v = self.v + a = self.dv + b = a + b += 1 + self.assertIs(b, a) + self.assertIs(b.variable, v) + self.assertIs(b.data, x) + #FIXME: this test currently fails (see DataView.variable.setter) + # self.assertIs(b.dataset, self.ds) diff --git a/test/test_utils.py b/test/test_utils.py index 2783485bde2..c4777f91e2d 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -11,10 +11,12 @@ def __getitem__(self, key): class TestExpandedIndexer(TestCase): def test(self): - x = np.random.randn(10, 20, 30) + x = np.random.randn(10, 11, 12, 13, 14) + y = np.arange(5) i = ReturnItem() - for i in [i[:], i[...], i[0, :, 10], i[:5, ...], i[np.arange(5)]]: - j = utils.expanded_indexer(i, 3) + for i in [i[:], i[...], i[0, :, 10], i[..., 10], i[:5, ..., 0], + i[y], i[y, y], i[..., y, y], i[..., 0, 1, 2, 3, 4]]: + j = utils.expanded_indexer(i, x.ndim) self.assertArrayEqual(x[i], x[j]) diff --git a/test/test_variable.py b/test/test_variable.py index 11f48c0ffb4..6547168da91 100644 --- a/test/test_variable.py +++ b/test/test_variable.py @@ -41,7 +41,7 @@ def test_properties(self): def test_repr(self): v = Variable(['time', 'x'], self.d) - self.assertEqual('', + self.assertEqual('', repr(v)) def test_items(self): @@ -65,6 +65,15 @@ def test_views(self): self.assertVarEqual(v.views({'time': slice(0, 3)}), v[:3]) self.assertVarEqual(v.views({'x': 0}), v[:, 0]) + def test_transpose(self): + v = Variable(['time', 'x'], self.d) + v2 = Variable(['x', 'time'], self.d.T) + self.assertVarEqual(v, v2.transpose()) + x = np.random.randn(2, 3, 4, 5) + w = 
Variable(['a', 'b', 'c', 'd'], x) + w2 = Variable(['d', 'b', 'c', 'a'], np.einsum('abcd->dbca', x)) + self.assertVarEqual(w2, w.transpose('d', 'b', 'c', 'a')) + def test_1d_math(self): x = np.arange(5) y = np.ones(5) @@ -73,11 +82,6 @@ def test_1d_math(self): self.assertVarEqual(v, +v) self.assertVarEqual(v, abs(v)) self.assertArrayEqual((-v).data, -x) - # verify attributes - v2 = Variable(['x'], x, {'units': 'meters'}) - self.assertVarEqual(v, +v2) - v3 = Variable(['x'], x, {'some': 'attribute'}) - self.assertVarEqual(v3, +v3) # bianry ops with numbers self.assertVarEqual(v, v + 0) self.assertVarEqual(v, 0 + v) @@ -91,6 +95,10 @@ def test_1d_math(self): self.assertArrayEqual((x * v).data, x ** 2) self.assertArrayEqual(v - y, v - 1) self.assertArrayEqual(y - v, 1 - v) + # verify attributes + v2 = Variable(['x'], x, {'units': 'meters'}) + self.assertVarEqual(v2, +v2) + self.assertVarEqual(v, 0 + v2) # binary ops with all variables self.assertArrayEqual(v + v, 2 * v) w = Variable(['x'], y, {'foo': 'bar'}) @@ -128,6 +136,15 @@ def test_broadcasting_math(self): v * w[0], Variable(['a', 'b', 'c', 'd'], np.einsum('ab,cd->abcd', x, y[0]))) + def test_broadcasting_failures(self): + a = Variable(['x'], np.arange(10)) + b = Variable(['x'], np.arange(5)) + c = Variable(['x', 'x'], np.arange(100).reshape(10, 10)) + with self.assertRaisesRegexp(ValueError, 'mismatched lengths'): + a + b + with self.assertRaisesRegexp(ValueError, 'duplicate dimensions'): + a + c + def test_inplace_math(self): x = np.arange(5) v = Variable(['x'], x) From 63e2b155d6ac02146a84f23e8549f96d08710faf Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Sun, 2 Feb 2014 18:41:58 -0800 Subject: [PATCH 03/45] Orthogonal indexing and cached indices --- src/scidata/backends.py | 167 ++++++++++++++++++------------- src/scidata/data.py | 216 ++++++++++++++++------------------------ src/scidata/dataview.py | 71 ++++++++----- src/scidata/utils.py | 38 ++++++- src/scidata/variable.py | 191 +++++++++-------------------------- test/test_data.py | 20 ++-- test/test_utils.py | 51 +++++++++- test/test_variable.py | 2 + 8 files changed, 374 insertions(+), 382 deletions(-) diff --git a/src/scidata/backends.py b/src/scidata/backends.py index 4ed0def9c6e..8ae24dc3336 100644 --- a/src/scidata/backends.py +++ b/src/scidata/backends.py @@ -2,43 +2,55 @@ # and attributes with the OrderedDict API that handle all the storage logic import netCDF4 as nc4 +import numpy as np from scipy.io import netcdf from collections import OrderedDict from utils import FrozenOrderedDict +import conventions +import utils import variable -class InMemoryDataStore(object): - """ - Stores dimensions, variables and attributes - in ordered dictionaries, making this store - fast compared to stores which store to disk. 
- """ - def __init__(self): - self.dimensions = OrderedDict() - self.variables = OrderedDict() - self.attributes = variable.AttributesDict() - +class AbstractDataStore(object): def unchecked_set_dimensions(self, dimensions): """Set the dimensions without checking validity""" - self.dimensions.update(dimensions) + for d, l in dimensions.iteritems(): + self.unchecked_set_dimension(d, l) def unchecked_set_attributes(self, attributes): """Set the attributes without checking validity""" - self.attributes.update(attributes) + for k, v in attributes.iteritems(): + self.unchecked_set_attribute(k, v) def unchecked_set_variables(self, variables): """Set the variables without checking validity""" - self.variables.update(variables) + for vn, v in variables.iteritems(): + self.unchecked_set_variable(vn, v) + - def unchecked_create_dimension(self, name, length): +class InMemoryDataStore(AbstractDataStore): + """ + Stores dimensions, variables and attributes + in ordered dictionaries, making this store + fast compared to stores which store to disk. + """ + def __init__(self): + self.dimensions = OrderedDict() + self.variables = OrderedDict() + self.attributes = OrderedDict() + + def unchecked_set_dimension(self, name, length): """Set a dimension length""" self.dimensions[name] = length - def unchecked_add_variable(self, name, variable): - """Add a variable without checks""" + def unchecked_set_attribute(self, key, value): + """Set the attributes without checking validity""" + self.attributes[key] = value + + def unchecked_set_variable(self, name, variable): + """Set a variable without checks""" self.variables[name] = variable return self.variables[name] @@ -53,7 +65,7 @@ def __init__(self, scipy_var): self._attributes = scipy_var._attributes -class ScipyDataStore(object): +class ScipyDataStore(AbstractDataStore): """ Stores data using the scipy.io.netcdf package. This store has the advantage of being able to @@ -76,35 +88,54 @@ def attributes(self): def dimensions(self): return self.ds.dimensions - def unchecked_set_dimensions(self, dimensions): - """Set the dimensions without checking validity""" - for d, l in dimensions.iteritems(): - self.unchecked_create_dimension(d, l) - - def unchecked_set_attributes(self, attributes): - """Set the attributes without checking validity""" - for k, v in attributes.iteritems(): - setattr(self.ds, k, v) - - def unchecked_set_variables(self, variables): - """Set the variables without checking validity""" - for vn, v in variables.iteritems(): - self.unchecked_add_variable(vn, v) - - def unchecked_create_dimension(self, name, length): + def unchecked_set_dimension(self, name, length): """Set a dimension length""" + if name in self.ds.dimensions: + raise ValueError('%s does not support modifying dimensions' + % type(self).__name__) self.ds.createDimension(name, length) - def unchecked_add_variable(self, name, variable): + def _validate_attr_key(self, key): + if not conventions.is_valid_name(key): + raise ValueError("Not a valid attribute name") + + def _cast_attr_value(self, value): + # Strings get special handling because netCDF treats them as + # character arrays. Everything else gets coerced to a numpy + # vector. netCDF treats scalars as 1-element vectors. Arrays of + # non-numeric type are not allowed. 
+ if isinstance(value, basestring): + # netcdf attributes should be unicode + value = unicode(value) + else: + try: + value = conventions.coerce_type(np.atleast_1d(np.asarray(value))) + except: + raise ValueError("Not a valid value for a netCDF attribute") + if value.ndim > 1: + raise ValueError("netCDF attributes must be vectors " + + "(1-dimensional)") + value = conventions.coerce_type(value) + if str(value.dtype) not in conventions.TYPEMAP: + # A plain string attribute is okay, but an array of + # string objects is not okay! + raise ValueError("Can not convert to a valid netCDF type") + return value + + def unchecked_set_attribute(self, key, value): + self._validate_attr_key(key) + setattr(self.ds, key, self._cast_attr_value(value)) + + def unchecked_set_variable(self, name, variable): """Add a variable without checks""" - self.ds.createVariable(name, variable.dtype, - variable.dimensions) - self.ds.variables[name][:] = variable.data[:] + if name not in self.ds.variables: + self.ds.createVariable(name, variable.dtype, variable.dimensions) + scipy_var = self.ds.variables[name] + scipy_var[:] = variable.data[:] for k, v in variable.attributes.iteritems(): - setattr(self.ds.variables[name], k, v) - return variable - #TODO: return the variable instead? - # return self.ds.variables[name] + self._validate_attr_key(k) + setattr(scipy_var, k, self._cast_attr_value(v)) + return ScipyVariable(scipy_var) def sync(self): self.ds.flush() @@ -117,6 +148,11 @@ def __init__(self, nc4_variable): self._data = nc4_variable self._attributes = None + def _remap_indexer(self, key): + # netCDF4-python already does orthogonal indexing, so just expand + # the indexer + return utils.expanded_indexer(key, self.ndim) + @property def attributes(self): if self._attributes is None: @@ -133,13 +169,13 @@ def attributes(self): packing_attributes = ['scale_factor', 'add_offset'] keys = [k for k in self._nc4_variable.ncattrs() if not k in packing_attributes] - attr_dict = variable.AttributesDict( + attr_dict = OrderedDict( (k, self._nc4_variable.getncattr(k)) for k in keys) self._attributes = attr_dict return self._attributes -class NetCDF4DataStore(object): +class NetCDF4DataStore(AbstractDataStore): def __init__(self, filename, *args, **kwdargs): self.ds = nc4.Dataset(filename, *args, **kwdargs) @@ -150,45 +186,36 @@ def variables(self): @property def attributes(self): - return variable.AttributesDict((k, self.ds.getncattr(k)) - for k in self.ds.ncattrs()) + return FrozenOrderedDict((k, self.ds.getncattr(k)) + for k in self.ds.ncattrs()) @property def dimensions(self): return FrozenOrderedDict((k, len(v)) for k, v in self.ds.dimensions.iteritems()) - def unchecked_set_dimensions(self, dimensions): - """Set the dimensions without checking validity""" - for d, l in dimensions.iteritems(): - self.unchecked_create_dimension(d, l) - - def unchecked_set_attributes(self, attributes): - """Set the attributes without checking validity""" - self.ds.setncatts(attributes) - - def unchecked_set_variables(self, variables): - """Set the variables without checking validity""" - for vn, v in variables.iteritems(): - self.unchecked_add_variable(vn, v) - - def unchecked_create_dimension(self, name, length): + def unchecked_set_dimension(self, name, length): """Set a dimension length""" self.ds.createDimension(name, size=length) - def unchecked_add_variable(self, name, variable): - """Add a variable without checks""" + def unchecked_set_attribute(self, key, value): + self.ds.setncatts({key: value}) + + def unchecked_set_variable(self, 
name, variable): + """Set a variable without checks""" # netCDF4 will automatically assign a fill value # depending on the datatype of the variable. Here # we let the package handle the _FillValue attribute # instead of setting it ourselves. fill_value = variable.attributes.pop('_FillValue', None) - self.ds.createVariable(varname=name, - datatype=variable.dtype, - dimensions=variable.dimensions, - fill_value=fill_value) - self.ds.variables[name][:] = variable.data[:] - self.ds.variables[name].setncatts(variable.attributes) - return variable #self.ds.variables[name] + if name not in self.ds.variables: + self.ds.createVariable(varname=name, + datatype=variable.dtype, + dimensions=variable.dimensions, + fill_value=fill_value) + nc4_var = self.ds.variables[name] + nc4_var[:] = variable.data[:] + nc4_var.setncatts(variable.attributes) + return NetCDF4Variable(nc4_var) def sync(self): self.ds.sync() diff --git a/src/scidata/data.py b/src/scidata/data.py index c80c84d6bb9..2f29c583571 100644 --- a/src/scidata/data.py +++ b/src/scidata/data.py @@ -1,12 +1,11 @@ # TODO Use various backend data stores. pytable, ncdf4, scipy.io, iris, memory import os -import copy import numpy as np import netCDF4 as nc4 import pandas as pd from cStringIO import StringIO -from collections import OrderedDict +from collections import OrderedDict, MutableMapping from dataview import DataView from utils import FrozenOrderedDict @@ -93,17 +92,38 @@ def open_dataset(nc, *args, **kwargs): return Dataset(store=store) +class _IndicesCache(MutableMapping): + # MutableMapping subclasses should implement: + # __getitem__, __setitem__, __delitem__, __iter__, __len__ + def __init__(self, dataset, cache=None): + self.dataset = dataset + self.cache = {} if cache is None else dict(cache) + + def __getitem__(self, key): + if not key in self.cache: + self.cache[key] = self.dataset._create_index(key) + return self.cache[key] + + def __setitem__(self, key, value): + self.cache[key] = value + + def __delitem__(self, key): + del self.cache[key] + + def __iter__(self): + return iter(self.dataset.dimensions) + + def __len__(self): + return len(self.dataset.dimensions) + + class Dataset(object): """ A netcdf-like data object consisting of dimensions, variables and attributes which together form a self describing data set - Dataset objects can also be treated as a mapping from variable names to - Variable objects. - - They should be modified by using methods, not by directly changing any of - the attributes listed below: - TODO: change this! + Datasets are containers of variable name. Getting an item from a Dataset + returns a DataView focused on that variable. Attributes ---------- @@ -114,7 +134,9 @@ class Dataset(object): all have dimension 1. noncoordinates : {name: variable, ...} Variables that are not coordinates. - attributes : dict-like + attributes : {key: value, ...} + indices : {dimension: index, ...} + Mapping from dimensions to pandas.Index objects. 
store : baackends.*DataStore """ def __init__(self, variables=None, dimensions=None, attributes=None, @@ -131,25 +153,23 @@ def __init__(self, variables=None, dimensions=None, attributes=None, self.store = store if attributes is not None: - self._unchecked_set_attributes(attributes) + store.unchecked_set_attributes(attributes) if dimensions is not None: - self._unchecked_set_dimensions(dimensions) + store.unchecked_set_dimensions(dimensions) if variables is not None: if dimensions is None: - self._unchecked_set_dimensions(construct_dimensions(variables)) + store.unchecked_set_dimensions(construct_dimensions(variables)) elif check_consistency: check_dims_and_vars_consistency(dimensions, variables) - self._unchecked_set_variables(variables) + store.unchecked_set_variables(variables) - if indices is None: - indices = {} - else: + if indices is not None: for k, v in indices.iteritems(): if k not in self.dimensions or v.size != self.dimensions[k]: raise ValueError('inconsisent index %r' % k) - self._indices = indices + self._indices = _IndicesCache(self, indices) def _create_index(self, dim): if dim in self.variables: @@ -157,40 +177,16 @@ def _create_index(self, dim): data = var.data attr = var.attributes if 'units' in attr and 'since' in attr['units']: - print 'time units!' data = num2date(data, attr['units']) - else: - print var.attributes elif dim in self.dimensions: data = np.arange(self.dimensions[dim]) else: raise ValueError('cannot find index %r in dataset' % dim) return pd.Index(data) - def _lookup_index(self, dim): - if dim not in self._indices: - self._indices[dim] = self._create_index(dim) - return self._indices[dim] - @property def indices(self): - return FrozenOrderedDict((dim, self._lookup_index(dim)) - for dim in self.dimensions) - - def _unchecked_set_dimensions(self, dims, *args, **kwdargs): - self.store.unchecked_set_dimensions(dims, *args, **kwdargs) - - def _unchecked_set_attributes(self, *args, **kwdargs): - self.store.unchecked_set_attributes(*args, **kwdargs) - - def _unchecked_set_variables(self, *args, **kwdargs): - self.store.unchecked_set_variables(*args, **kwdargs) - - def _unchecked_create_dimension(self, dim, *args, **kwdargs): - self.store.unchecked_create_dimension(dim, *args, **kwdargs) - - def _unchecked_add_variable(self, *args, **kwdargs): - return self.store.unchecked_add_variable(*args, **kwdargs) + return self._indices def sync(self): return self.store.sync() @@ -231,14 +227,7 @@ def __iter__(self): return iter(self.variables) def __getitem__(self, key): - return self.variables[key] - - def __setitem__(self, key, value): - return self.add_variable(key, value) - - def __delitem__(self, key): - # does deleting variables make sense for all backends? - del self.variables[key] + return DataView(self.select(key), key) #TODO: add keys, items, and values methods (and the iter versions) to # complete the dict analogy? 
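[Editor's note — a minimal usage sketch of the DataView indexing introduced in
this patch, not part of the patch itself. The names ('foo', 'x', 'y') are
illustrative; it assumes a Dataset `ds` with a 2d variable 'foo' on
dimensions ('x', 'y'):

    dv = ds['foo']            # __getitem__ now returns a DataView, not a Variable
    sub = dv[:3, :5]          # indexing a DataView indexes every variable in its
                              # dataset along the matching dimensions, so
                              # sub.dataset == dv.dataset.views({'x': slice(3),
                              #                                  'y': slice(5)})
    v = ds.variables['foo']   # the raw Variable remains reachable here
    # Variables use orthogonal indexing, so v[range(3), range(2)] selects the
    # same block as v[:3, :2] instead of triggering numpy fancy indexing.

Label-based lookup goes through Dataset.loc_views (and DataView.loc), which
translate labels into integer indexers using the new per-dimension indices.]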
@@ -347,7 +336,7 @@ def __repr__(self): ' '.join(self.noncoordinates)) def create_dimension(self, name, length): - """Adds a dimension with name dim and length to the object + """Add a dimension to this datasets Parameters ---------- @@ -359,14 +348,13 @@ def create_dimension(self, name, length): """ if name in self.dimensions: raise ValueError('dimension named %r already exists' % name) - elif not isinstance(length, int): - raise TypeError('length must be an integer') - elif length < 0: + length = int(length) + if length < 0: raise ValueError('length must be non-negative') - self._unchecked_create_dimension(name, int(length)) + self.store.unchecked_set_dimension(name, int(length)) def create_variable(self, name, dims, data, attributes=None): - """Create a new variable. + """Create a new variable and add it to this dataset Parameters ---------- @@ -395,7 +383,7 @@ def create_variable(self, name, dims, data, attributes=None): return self.add_variable(name, v) def create_coordinate(self, name, data, attributes=None): - """Create a new dimension and a corresponding coordinate variable. + """Create a new dimension and a corresponding coordinate variable This method combines the create_dimension and create_variable methods for the common case when the variable is a 1-dimensional coordinate @@ -431,10 +419,10 @@ def create_coordinate(self, name, data, attributes=None): if var.ndim != 1: raise ValueError("coordinate data must be 1-dimensional (vector)") if name not in self.dimensions: - self._unchecked_create_dimension(name, var.size) + self.store.unchecked_set_dimension(name, var.size) elif self.dimensions[name] != var.size: raise ValueError('dimension already exists with different length') - return self._unchecked_add_variable(name, var) + return self.store.unchecked_set_variable(name, var) def add_variable(self, name, var): """Add a variable to the dataset @@ -454,11 +442,14 @@ def add_variable(self, name, var): """ if name in self.variables: raise ValueError("Variable named %r already exists" % name) + return self.set_variable(name, var) + + def set_variable(self, name, var): if name in self.dimensions and name in self._indices: # remove existing index del self._indices[name] check_dims_and_vars_consistency(self.dimensions, {name: var}) - return self._unchecked_add_variable(name, var) + return self.store.unchecked_set_variable(name, var) def views(self, slicers): """Return a new object whose contents are a view of a slice from the @@ -535,26 +526,27 @@ def search_dim_len(dim, variables): return type(self)(variables, dimensions, self.attributes, check_consistency=False) - def loc_views(self, slicers): - islicers = {} - for k, v in slicers.iteritems(): - index = self.indices[k] - if isinstance(v, slice): - ind_slice = index.slice_indexer(v.start, v.stop) - print v, ind_slice - # assume step-size is valid unchanged - islicers[k] = slice(ind_slice.start, ind_slice.stop, v.step) - elif not np.iterable(v): - islicers[k] = index.get_loc(v) - else: - new_index, indexer = index.reindex(v) + def _loc_to_int_indexer(self, dim, locations): + index = self.indices[dim] + if isinstance(locations, slice): + tmp_slice = index.slice_indexer(locations.start, locations.stop) + # assume step-size is valid unchanged + indexer = slice(tmp_slice.start, tmp_slice.stop, locations.step) + else: + try: + indexer = index.get_loc(locations) + except TypeError: + # value is an list or array + new_index, indexer = index.reindex(locations) if np.any(indexer < 0): - raise ValueError('not all values found in index %s' % k) - 
islicers[k] = indexer + raise ValueError('not all values found in index %r' % dim) # FIXME: don't throw away new_index (we'll need to recreate it # later) - return self.views(islicers) + return indexer + def loc_views(self, slicers): + return self.views({k: self._loc_to_int_indexer(k, v) + for k, v in slicers.iteritems()}) def view(self, s, dim): """Return a new object whose contents are a view of a slice from the @@ -588,58 +580,6 @@ def view(self, s, dim): """ return self.views({dim: s}) - def take(self, indices, dim): - """Return a new object whose contents are taken from the - current object along a specified dimension - - Parameters - ---------- - indices : array_like - The indices of the values to extract. indices must be compatible - with the ndarray.take() method. - dim : string, optional - The dimension to slice along. If multiple dimensions of a - variable equal dim (e.g. a correlation matrix), then that - variable is sliced only along its first matching dimension. - - Returns - ------- - obj : Data object - The returned object has the same attributes, dimensions, - variable names and variable attributes as the original. - Variables that are not defined along the specified - dimensions are copied in their entirety. Variables that are - defined along the specified dimension have their data - contents taken along the specified dimension. - - See Also - -------- - numpy.take - Variable.take - """ - # Create a new object - obj = type(self)() - # Create fancy-indexed variables and infer the new dimension length - new_length = self.dimensions[dim] - for (name, var) in self.variables.iteritems(): - if dim in var.dimensions: - obj.store.unchecked_add_variable(name, var.take(indices, dim)) - new_length = obj.variables[name].data.shape[ - list(var.dimensions).index(dim)] - else: - obj.store.unchecked_add_variable(name, copy.deepcopy(var)) - # Hard write the dimensions, skipping validation - for d, l in self.dimensions.iteritems(): - if d == dim: - l = new_length - obj.store.unchecked_create_dimension(d, l) - if obj.dimensions[dim] == 0: - raise IndexError( - "take would result in a dimension of length zero") - # Copy attributes - self._unchecked_set_attributes(self.attributes.copy()) - return obj - def renamed(self, name_dict): """ Returns a new object with renamed variables and dimensions @@ -779,6 +719,26 @@ def unselect(self, *names): return type(self)(variables, self.dimensions, self.attributes, check_consistency=False) + def replace(self, name, variable): + """Returns a new dataset with the variable 'name' replaced with + 'variable' + + Parameters + ---------- + name : str + Name of the variable to replace in this object. + variable : Variable + Replacement variable. + + Returns + ------- + Dataset + New dataset based on this dataset. Dimensions are unchanged. 
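+
+        Example (editor's illustration; assumes 'x' is an existing
+        dimension of length 10 on this dataset):
+        ``ds.replace('x', Variable(['x'], np.arange(10)))`` swaps in a new
+        coordinate variable for 'x' while leaving all other variables and
+        all dimensions untouched.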
+ """ + ds = self.unselect(name) + ds.add_variable(name, variable) + return ds + def to_dataview(self, name, extra_variables=None): """Return a dataview selected from this dataset diff --git a/src/scidata/dataview.py b/src/scidata/dataview.py index d8dbd7c6b3a..69b305c7c94 100644 --- a/src/scidata/dataview.py +++ b/src/scidata/dataview.py @@ -1,11 +1,25 @@ import re -import variable from common import _DataWrapperMixin from ops import inject_special_operations from utils import expanded_indexer +class _LocIndexer(object): + def __init__(self, dataview): + self.dataview = dataview + + def _remap_key(self, key): + return tuple(self.dataview.dataset._loc_to_int_indexer(k, v) + for k, v in self.dataview._key_to_slicers(key)) + + def __getitem__(self, key): + return self.dataview[self._remap_key(key)] + + def __setitem__(self, key, value): + self.dataview[self._remap_key(key)] = value + + class DataView(_DataWrapperMixin): """ A Dataset wrapper oriented around a single Variable @@ -37,32 +51,40 @@ def __init__(self, dataset, name): @property def variable(self): - return self.dataset[self.name] - + return self.dataset.variables[self.name] @variable.setter def variable(self, value): - # TODO: remove this line, so we change DataView's dataset in-place - # (if supported by the underlying store) - self.dataset = self.unselected() - self.dataset[self.name] = value + self.dataset.set_variable(self.name, value) + # _data and _data.setter are necessary for _DataWrapperMixin @property def _data(self): - # necessary for _DataWrapperMixin return self.variable._data + @_data.setter + def _data(self, value): + self.variable._data = value @property def dimensions(self): return self.variable.dimensions - def __getitem__(self, key): + def _key_to_slicers(self, key): key = expanded_indexer(key, self.ndim) - slicers = dict(zip(self.dimensions, key)) + return zip(self.dimensions, key) + + def __getitem__(self, key): + slicers = dict(self._key_to_slicers(key)) return type(self)(self.dataset.views(slicers), self.name) def __setitem__(self, key, value): self.variable[key] = value + @property + def loc(self): + """Attribute for location based indexing with pandas + """ + return _LocIndexer(self) + def __iter__(self): for n in range(len(self)): yield self[n] @@ -109,30 +131,32 @@ def unselected(self): """ return self.dataset.unselect(self.name) + def replace_focus(self, new_var): + """Returns a copy of this DataView's dataset with this DataView's + focus variable replaced by 'new_var' + """ + ds = self.dataset.replace(self.name, new_var) + return type(self)(ds, self.name) + def transpose(self, *dimensions): - ds = self.unselected() - ds[self.name] = self.variable.transpose(*dimensions) - return DataView(ds, self.name) + return self.replace_focus(self.variable.transpose(*dimensions)) def unary_op(f): def func(self): - ds = self.unselected() - ds[self.name] = f(self.variable) - return DataView(ds, self.name) + return self.replace_focus(f(self.variable)) return func def binary_op(f, reflexive=False): def func(self, other): - ds = self.unselected() other_variable = getattr(other, 'variable', other) - ds[self.name] = (f(self.variable, other_variable) - if not reflexive - else f(other_variable, self.variable)) + dv = self.replace_focus(f(self.variable, other_variable) + if not reflexive + else f(other_variable, self.variable)) if hasattr(other, 'unselected'): - ds.update(other.unselected()) - return DataView(ds, self.name) + dv.dataset.update(other.unselected()) + return dv return func @@ -147,6 +171,3 @@ def func(self, 
other): inject_special_operations(DataView, unary_op, binary_op, inplace_binary_op) - -DataView.tranpose = unary_op(variable.Variable.transpose) - diff --git a/src/scidata/utils.py b/src/scidata/utils.py index 99813958aa6..3a5b4dbcc62 100644 --- a/src/scidata/utils.py +++ b/src/scidata/utils.py @@ -5,8 +5,12 @@ def expanded_indexer(key, ndim): - """Given a key for indexing an ndarray, return an equivalent - key which is a tuple with length equal to the number of dimensions + """Given a key for indexing an ndarray, return an equivalent key which is a + tuple with length equal to the number of dimensions + + The expansion is done by replacing all `Ellipsis` items with the right + number of full slices and then padding the key with full slices so that it + reaches the appropriate dimensionality. """ if not isinstance(key, tuple): # numpy treats non-tuple keys equivalent to tuples of length 1 @@ -28,6 +32,33 @@ def expanded_indexer(key, ndim): return tuple(new_key) +def orthogonal_indexer(key, shape): + """Given a key for orthogonal array indexing, returns an equivalent key + suitable for indexing a numpy.ndarray with fancy indexing + """ + def expand_array(k, length): + if isinstance(k, slice): + return np.arange(k.start or 0, k.stop or length, k.step or 1) + else: + k = np.asarray(k) + if k.ndim != 1: + raise ValueError('orthogonal array indexing only supports ' + '1d arrays') + return k + # replace Ellipsis objects with slices + key = list(expanded_indexer(key, len(shape))) + # replace 1d arrays and slices with broadcast compatible arrays + # note: we treat integers separately (instead of turning them into 1d + # arrays) because integers (and only integers) collapse axes when used with + # __getitem__ + non_int_keys = [n for n, k in enumerate(key) if not isinstance(k, int)] + array_indexers = np.ix_(*(expand_array(key[n], shape[n]) + for n in non_int_keys)) + for i, n in enumerate(non_int_keys): + key[n] = array_indexers[i] + return tuple(key) + + def update_safety_check(first_dict, second_dict, compat=operator.eq): """Check the safety of updating one dictionary with another @@ -114,6 +145,9 @@ def variable_equal(v1, v2): return False +# class DisabledMixin(object): + + class FrozenOrderedDict(OrderedDict): """A subclass of OrderedDict whose contents are frozen after initialization to prevent tampering diff --git a/src/scidata/variable.py b/src/scidata/variable.py index 290fc658e31..dc5eab51b18 100644 --- a/src/scidata/variable.py +++ b/src/scidata/variable.py @@ -7,96 +7,11 @@ import conventions import data import dataview +import utils from common import _DataWrapperMixin -from utils import expanded_indexer, safe_merge, safe_update from ops import inject_special_operations -class AttributesDict(OrderedDict): - """A subclass of OrderedDict whose __setitem__ method automatically - checks and converts values to be valid netCDF attributes - """ - def __init__(self, *args, **kwds): - OrderedDict.__init__(self, *args, **kwds) - - def __setitem__(self, key, value): - if not conventions.is_valid_name(key): - raise ValueError("Not a valid attribute name") - # Strings get special handling because netCDF treats them as - # character arrays. Everything else gets coerced to a numpy - # vector. netCDF treats scalars as 1-element vectors. Arrays of - # non-numeric type are not allowed. 
- if isinstance(value, basestring): - # netcdf attributes should be unicode - value = unicode(value) - else: - try: - value = conventions.coerce_type(np.atleast_1d(np.asarray(value))) - except: - raise ValueError("Not a valid value for a netCDF attribute") - if value.ndim > 1: - raise ValueError("netCDF attributes must be vectors " + - "(1-dimensional)") - value = conventions.coerce_type(value) - if str(value.dtype) not in conventions.TYPEMAP: - # A plain string attribute is okay, but an array of - # string objects is not okay! - raise ValueError("Can not convert to a valid netCDF type") - OrderedDict.__setitem__(self, key, value) - - def copy(self): - """The copy method of the superclass simply calls the constructor, - which in turn calls the update method, which in turns calls - __setitem__. This subclass implementation bypasses the expensive - validation in __setitem__ for a substantial speedup.""" - obj = self.__class__() - for (attr, value) in self.iteritems(): - OrderedDict.__setitem__(obj, attr, copy.copy(value)) - return obj - - def __deepcopy__(self, memo=None): - """ - Returns a deep copy of the current object. - - memo does nothing but is required for compatability with copy.deepcopy - """ - return self.copy() - - def update(self, *other, **kwargs): - """Set multiple attributes with a mapping object or an iterable of - key/value pairs""" - # Capture arguments in an OrderedDict - args_dict = OrderedDict(*other, **kwargs) - try: - # Attempt __setitem__ - for (attr, value) in args_dict.iteritems(): - self.__setitem__(attr, value) - except: - # A plain string attribute is okay, but an array of - # string objects is not okay! - raise ValueError("Can not convert to a valid netCDF type") - # Clean up so that we don't end up in a partial state - for (attr, value) in args_dict.iteritems(): - if self.__contains__(attr): - self.__delitem__(attr) - # Re-raise - raise - - def __eq__(self, other): - if not set(self.keys()) == set(other.keys()): - return False - for (key, value) in self.iteritems(): - if value.__class__ != other[key].__class__: - return False - if isinstance(value, basestring): - if value != other[key]: - return False - else: - if value.tostring() != other[key].tostring(): - return False - return True - - def _as_compatible_data(data): """If data does not have the necessary attributes to be the private _data attribute, convert it to a np.ndarray and raise an warning @@ -105,9 +20,9 @@ def _as_compatible_data(data): # numeric type like np.float32 required = ['dtype', 'shape', 'size', 'ndim'] if not all(hasattr(data, attr) for attr in required): - warnings.warn('converting data to np.ndarray because it lacks some of ' - 'the necesssary attributes for lazy use', RuntimeWarning, - stacklevel=3) + warnings.warn('converting data to np.ndarray because %s lacks some of ' + 'the necesssary attributes for lazy use' + % type(data).__name__, RuntimeWarning, stacklevel=3) data = np.asarray(data) return data @@ -127,43 +42,62 @@ def __init__(self, dims, data, attributes=None): self._data = data if attributes is None: attributes = {} - self._attributes = AttributesDict(attributes) + self._attributes = OrderedDict(attributes) @property def dimensions(self): return self._dimensions - def __getitem__(self, key): + def _remap_indexer(self, key): + """Converts an orthogonal indexer into a fully expanded key (of the + same length as dimensions) suitable for indexing `_data` + + See Also + -------- + utils.expanded_indexer + utils.orthogonal_indexer """ - Return a new Variable object whose contents 
are consistent with getting - the provided key from the underlying data + key = utils.expanded_indexer(key, self.ndim) + if any(not isinstance(k, (int, slice)) for k in key): + # key would trigger fancy indexing + key = utils.orthogonal_indexer(key, self.shape) + return key + + def __getitem__(self, key): + """Return a new Variable object whose contents are consistent with + getting the provided key from the underlying data + + NB. __getitem__ and __setitem__ implement "orthogonal indexing" like + netCDF4-python, where the key can only include integers, slices + (including `Ellipsis`) and 1d arrays, each of which are applied + orthogonally along their respective dimensions. + + The difference not matter in most cases unless you are using numpy's + "fancy indexing," which can otherwise result in data arrays + with shapes is inconsistent (or just uninterpretable with) with the + variable's dimensions. + + If you really want to do indexing like `x[x > 0]`, manipulate the numpy + array `x.data` directly. """ - key = expanded_indexer(key, self.ndim) + key = self._remap_indexer(key) dimensions = [dim for k, dim in zip(key, self.dimensions) if not isinstance(k, int)] - #TODO: wrap _data in a biggus array or use np.ix_ so fancy indexing - # always slices axes independently (as in the python-netcdf4 package) new_data = self._data[key] - if new_data.ndim != len(dimensions): - raise ValueError('indexing results in an array of shape %s, ' - 'which has inconsistent length with the ' - 'expected dimensions %s (if you really want to ' - 'do this sort of indexing, index the `data` ' - 'attribute directly)' - % (new_data.shape, dimensions)) + # orthogonal indexing should ensure the dimensionality is consistent + assert new_data.ndim == len(dimensions) # always return a Variable, because Variable subtypes may have # different constructors and may not make sense without an attached # datastore return Variable(dimensions, new_data, self.attributes) def __setitem__(self, key, value): - """__setitem__ is overloaded to access the underlying numpy data""" - self.data[key] = value + """__setitem__ is overloaded to access the underlying numpy data with + orthogonal indexing (see __getitem__ for more details) + """ + self.data[self._remap_indexer(key)] = value def __iter__(self): - """ - Iterate over the contents of this Variable - """ for n in range(len(self)): yield self[n] @@ -288,41 +222,6 @@ def view(self, s, dim): """ return self.views({dim: s}) - def take(self, indices, dim): - """Return a new Variable object whose contents are sliced from - the current object along a specified dimension - - Parameters - ---------- - indices : array_like - The indices of the values to extract. indices must be compatible - with the ndarray.take() method. - dim : string - The dimension to slice along. If multiple dimensions equal - dim (e.g. a correlation matrix), then the slicing is done - only along the first matching dimension. - - Returns - ------- - obj : Variable object - The returned object has the same attributes and dimensions - as the original. Data contents are taken along the - specified dimension. 
- - See Also - -------- - numpy.take - """ - indices = np.asarray(indices) - if indices.ndim != 1: - raise ValueError('indices should have a single dimension') - # When dim appears repeatedly in self.dimensions, using the index() - # method gives us only the first one, which is the desired behavior - axis = self.dimensions.index(dim) - # take only works on actual numpy arrays - data = self.data.take(indices, axis=axis) - return Variable(self.dimensions, data, self.attributes) - def transpose(self, *dimensions): """Return a new Variable object with transposed dimensions @@ -422,8 +321,8 @@ def func(self, other): new_data = (f(self_data, other_data) if not reflexive else f(other_data, self_data)) - new_attr = safe_merge(_math_safe_attributes(self), - _math_safe_attributes(other)) + new_attr = utils.safe_merge(_math_safe_attributes(self), + _math_safe_attributes(other)) return Variable(new_dims, new_data, new_attr) return func @@ -434,7 +333,7 @@ def func(self, other): if dimensions != self.dimensions: raise ValueError('dimensions cannot change for in-place operations') self.data = f(self_data, other_data) - safe_update(self.attributes, _math_safe_attributes(other)) + utils.safe_update(self.attributes, _math_safe_attributes(other)) return self return func diff --git a/test/test_data.py b/test/test_data.py index c59d4ca8271..aec3b07ceec 100644 --- a/test/test_data.py +++ b/test/test_data.py @@ -5,6 +5,7 @@ import unittest import numpy as np +import pandas as pd from scidata import Dataset, Variable, backends from . import TestCase @@ -43,6 +44,7 @@ def test_repr(self): self.assertEqual('', repr(data)) + @unittest.skip('method needs rewrite and/or removal') def test_iterator(self): data = create_test_data(self.get_store()) # iterate over the first dim @@ -70,7 +72,7 @@ def test_iterarray(self): ind = int(np.where(data.variables[iterdim].data == t)[0]) # make sure all the slices match dim_axis = list(data[_testvar].dimensions).index(iterdim) - expected = data[_testvar].data.take([ind], axis=dim_axis) + expected = data.variables[_testvar].data.take([ind], axis=dim_axis) np.testing.assert_array_equal(d, expected) # test that the yielded objects are views of the original # This test doesn't make sense for the netCDF4 backend @@ -85,7 +87,7 @@ def test_dimension(self): # prevent duplicate creation self.assertRaises(ValueError, a.create_dimension, 'time', 0) # length must be integer - self.assertRaises(TypeError, a.create_dimension, 'foo', 'a') + self.assertRaises(ValueError, a.create_dimension, 'foo', 'a') self.assertRaises(TypeError, a.create_dimension, 'foo', [1,]) self.assertRaises(ValueError, a.create_dimension, 'foo', -1) self.assertTrue('foo' not in a.dimensions) @@ -101,8 +103,8 @@ def test_variable(self): a.create_variable(name='bar', dims=('time', 'x',), data=d) # order of creation is preserved self.assertTrue(a.variables.keys() == ['foo', 'bar']) - self.assertTrue(all([a['foo'][i].data == d[i] - for i in np.ndindex(*d.shape)])) + self.assertTrue(all([a.variables['foo'][i].data == d[i] + for i in np.ndindex(*d.shape)])) # prevent duplicate creation self.assertRaises(ValueError, a.create_variable, name='foo', dims=('time', 'x',), data=d) @@ -151,6 +153,7 @@ def test_coordinate(self): name='y', data=scal) self.assertTrue('y' not in a.dimensions) + @unittest.skip('attribute checks are not yet backend specific') def test_attributes(self): a = Dataset() a.attributes['foo'] = 'abc' @@ -252,13 +255,17 @@ def test_loc_views(self): self.assertEqual(data.views(int_slicers), 
data.loc_views(loc_slicers)) data.create_variable('time', ['time'], np.arange(1000, dtype=np.int32), {'units': 'days since 2000-01-01'}) - data.create_variable('foobar', ['time', 'dim3'], - np.random.randn(1000, 10)) + self.assertEqual(data.views({'time': 0}), + data.loc_views({'time': '2000-01-01'})) self.assertEqual(data.views({'time': slice(10)}), data.loc_views({'time': slice('2000-01-01', '2000-01-10')})) self.assertEqual(data, data.loc_views({'time': slice('1999', '2005')})) + self.assertEqual(data.views({'time': slice(3)}), + data.loc_views({'time': + pd.date_range('2000-01-01', periods=3)})) + @unittest.skip('obsolete method should be removed') def test_take(self): data = create_test_data(self.get_store()) slicedim = _testdim @@ -297,6 +304,7 @@ def test_take(self): indices=[data.dimensions[slicedim] + 10], dim=slicedim) + @unittest.skip('method needs rewrite and/or removal') def test_squeeze(self): data = create_test_data(self.get_store()) singleton = data.take([1], 'dim2') diff --git a/test/test_utils.py b/test/test_utils.py index c4777f91e2d..2ac025c5a84 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -9,15 +9,56 @@ def __getitem__(self, key): return key -class TestExpandedIndexer(TestCase): - def test(self): +class TestIndexers(TestCase): + def set_to_zero(self, x, i): + x = x.copy() + x[i] = 0 + return x + + def test_expanded_indexer(self): x = np.random.randn(10, 11, 12, 13, 14) y = np.arange(5) - i = ReturnItem() - for i in [i[:], i[...], i[0, :, 10], i[..., 10], i[:5, ..., 0], - i[y], i[y, y], i[..., y, y], i[..., 0, 1, 2, 3, 4]]: + I = ReturnItem() + for i in [I[:], I[...], I[0, :, 10], I[..., 10], I[:5, ..., 0], + I[y], I[y, y], I[..., y, y], I[..., 0, 1, 2, 3, 4]]: j = utils.expanded_indexer(i, x.ndim) self.assertArrayEqual(x[i], x[j]) + self.assertArrayEqual(self.set_to_zero(x, i), + self.set_to_zero(x, j)) + + def test_orthogonal_indexer(self): + x = np.random.randn(10, 11, 12, 13, 14) + y = np.arange(5) + I = ReturnItem() + # orthogonal and numpy indexing should be equivalent, because we only + # use at most one array and it never in between two slice objects + # (i.e., we try to avoid numpy's mind-boggling "partial indexing" + # http://docs.scipy.org/doc/numpy/reference/arrays.indexing.html) + for i in [I[:], I[0], I[0, 0], I[:5], I[2:5], I[2:5:-1], I[:3, :4], + I[:3, 0, :4], I[:3, 0, :4, 0], I[y], I[:, y], I[0, y], + I[:2, :3, y], I[0, y, :, :4, 0]]: + j = utils.orthogonal_indexer(i, x.shape) + self.assertArrayEqual(x[i], x[j]) + self.assertArrayEqual(self.set_to_zero(x, i), + self.set_to_zero(x, j)) + # for more complicated cases, check orthogonal indexing is still + # equivalent to slicing + z = np.arange(2, 8, 2) + for i, j, shape in [ + (I[y, y], I[:5, :5], (5, 5, 12, 13, 14)), + (I[y, z], I[:5, 2:8:2], (5, 3, 12, 13, 14)), + (I[0, y, y], I[0, :5, :5], (5, 5, 13, 14)), + (I[y, 0, z], I[:5, 0, 2:8:2], (5, 3, 13, 14)), + (I[0, :2, y, y, 0], I[0, :2, :5, :5, 0], (2, 5, 5)), + (I[0, :, y, :, 0], I[0, :, :5, :, 0], (11, 5, 13))]: + k = utils.orthogonal_indexer(i, x.shape) + self.assertEqual(shape, x[k].shape) + self.assertArrayEqual(x[j], x[k]) + self.assertArrayEqual(self.set_to_zero(x, j), + self.set_to_zero(x, k)) + # standard numpy (non-orthogonal) indexing doesn't work anymore + with self.assertRaisesRegexp(ValueError, 'only supports 1d'): + utils.orthogonal_indexer(x > 0, x.shape) class TestSafeMerge(TestCase): diff --git a/test/test_variable.py b/test/test_variable.py index 6547168da91..9346e2bce49 100644 --- a/test/test_variable.py +++ 
b/test/test_variable.py @@ -53,6 +53,8 @@ def test_items(self): Variable(['time'], self.d[:, 0]), v[:, 0]) self.assertVarEqual( Variable(['time', 'x'], self.d[:3, :2]), v[:3, :2]) + # variables should do orthogonal indexing + self.assertVarEqual(v[:3, :2], v[range(3), range(2)]) for n, item in enumerate(v): self.assertVarEqual(Variable(['x'], self.d[n]), item) v.data[:] = 0 From 0192e25e03a7c8f2d8b1660ad9a33ed154806a0e Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Mon, 3 Feb 2014 16:09:31 -0800 Subject: [PATCH 04/45] Added utils.num2datetimeindex and collapse operators --- src/scidata/common.py | 40 ++++++++++ src/scidata/data.py | 150 ++++++++++++++++++++++++++++--------- src/scidata/dataview.py | 144 +++++++++++++++++++++++++++-------- src/scidata/ops.py | 41 ++++------ src/scidata/utils.py | 74 +++++++++++------- src/scidata/variable.py | 161 ++++++++++++++++++++++++++++++---------- test/test_data.py | 12 ++- test/test_dataview.py | 19 ++++- test/test_utils.py | 13 ++++ test/test_variable.py | 16 ++++ 10 files changed, 509 insertions(+), 161 deletions(-) diff --git a/src/scidata/common.py b/src/scidata/common.py index 700ccfec396..fd2a5e9fff4 100644 --- a/src/scidata/common.py +++ b/src/scidata/common.py @@ -53,3 +53,43 @@ def __complex__(self): def __long__(self): return long(self._data) + _collapse_method_docstring = \ + """Collapse this {cls}'s data' by applying `{name}` along some + dimension(s) + + Parameters + ---------- + dimension : str or sequence of str, optional + Dimension(s) over which to repeatedly apply `{name}`. + axis : int or sequence of int, optional + Axis(es) over which to repeatedly apply `{name}`. Only one of the + 'dimension' and 'axis' arguments can be supplied. If neither are + supplied, then `{name}` is calculated over the flattened array + (by calling `{name}(x)` without an axis argument). + **kwargs : dict + Additional keyword arguments passed on to `{name}`. + + Note + ---- + If this method is called with multiple dimensions (or axes, which are + converted into dimensions), then `{name}` is performed repeatedly along + each dimension in turn from left to right. + + Returns + ------- + collapsed : {cls} + New {cls} object with `{name}` applied to its data and the + indicated dimension(s) removed. 
+ """ + + @classmethod + def _collapse_method(cls, f, name=None, module=None): + def func(self, dimension=None, axis=None, **kwargs): + return self.collapsed(f, dimension, axis, **kwargs) + if name is None: + name = f.__name__ + func.__name__ = name + func.__doc__ = cls._collapse_method_docstring.format( + name=('' if module is None else module + '.') + name, + cls=cls.__name__) + return func diff --git a/src/scidata/data.py b/src/scidata/data.py index 2f29c583571..7b8e541afa0 100644 --- a/src/scidata/data.py +++ b/src/scidata/data.py @@ -93,15 +93,28 @@ def open_dataset(nc, *args, **kwargs): class _IndicesCache(MutableMapping): + """Cache for Dataset indices""" # MutableMapping subclasses should implement: # __getitem__, __setitem__, __delitem__, __iter__, __len__ def __init__(self, dataset, cache=None): self.dataset = dataset self.cache = {} if cache is None else dict(cache) + # for performance reasons, we could remove this: + self.sync() + + def build_index(self, key): + """Cache the index for the dimension 'key'""" + self.cache[key] = self.dataset._create_index(key) + + def sync(self): + """Cache indices for all dimensions in this dataset""" + for key in self.dataset.dimensions: + self.build_index(key) def __getitem__(self, key): if not key in self.cache: - self.cache[key] = self.dataset._create_index(key) + assert key in self.dataset.dimensions + self.build_index(key) return self.cache[key] def __setitem__(self, key, value): @@ -116,6 +129,17 @@ def __iter__(self): def __len__(self): return len(self.dataset.dimensions) + def __contains__(self, key): + return key in self.cache + + def __repr__(self): + contents = '\n'.join("'%s': %s" % + (k, str(v).replace( + '\n', '\n' + ' ' * (len(k) + 4))) + for k, v in self.items()) + return ("\n%s" + % (type(self).__name__, contents)) + class Dataset(object): """ @@ -140,7 +164,7 @@ class Dataset(object): store : baackends.*DataStore """ def __init__(self, variables=None, dimensions=None, attributes=None, - store=None, indices=None, check_consistency=True): + store=None, indices=None): """ If dimensions are not provided, they are inferred from the variables. @@ -161,14 +185,16 @@ def __init__(self, variables=None, dimensions=None, attributes=None, if variables is not None: if dimensions is None: store.unchecked_set_dimensions(construct_dimensions(variables)) - elif check_consistency: + else: check_dims_and_vars_consistency(dimensions, variables) store.unchecked_set_variables(variables) - if indices is not None: + if indices is None: + indices = {} + else: for k, v in indices.iteritems(): if k not in self.dimensions or v.size != self.dimensions[k]: - raise ValueError('inconsisent index %r' % k) + raise ValueError('inconsistent index %r' % k) self._indices = _IndicesCache(self, indices) def _create_index(self, dim): @@ -177,12 +203,15 @@ def _create_index(self, dim): data = var.data attr = var.attributes if 'units' in attr and 'since' in attr['units']: - data = num2date(data, attr['units']) + index = utils.num2datetimeindex(data, attr['units'], + attr.get('calendar')) + else: + index = pd.Index(data) elif dim in self.dimensions: - data = np.arange(self.dimensions[dim]) + index = pd.Index(np.arange(self.dimensions[dim])) else: raise ValueError('cannot find index %r in dataset' % dim) - return pd.Index(data) + return index @property def indices(self): @@ -214,7 +243,7 @@ def __copy__(self): Returns a shallow copy of the current object. 
""" return type(self)(self.variables, self.dimensions, self.attributes, - check_consistency=False) + indices=self.indices.cache) def __contains__(self, key): """ @@ -275,7 +304,7 @@ def stored_to(self, store): dataset with the contents of the store """ target = type(self)(self.variables, self.dimensions, self.attributes, - store=store, check_consistency=False) + store=store, indices=self.indices.cache) target.store.sync() return target @@ -438,18 +467,36 @@ def add_variable(self, name, var): Returns ------- variable - The variable object in the underlying datastore + The variable object in the underlying datastore. """ if name in self.variables: raise ValueError("Variable named %r already exists" % name) return self.set_variable(name, var) def set_variable(self, name, var): - if name in self.dimensions and name in self._indices: - # remove existing index - del self._indices[name] + """Set a variable in the dataset + + Unlike `add_variable`, this function allows for overriding existing + variables. + + Parameters + ---------- + name : string + The name under which the variable will be added. + variable : Variable + The variable to be added. If the desired action is to add a copy of + the variable be sure to do so before passing it to this function. + + Returns + ------- + variable + The variable object in the underlying datastore. + """ check_dims_and_vars_consistency(self.dimensions, {name: var}) - return self.store.unchecked_set_variable(name, var) + new_var = self.store.unchecked_set_variable(name, var) + if name in self.indices: + self.indices.build_index(name) + return new_var def views(self, slicers): """Return a new object whose contents are a view of a slice from the @@ -521,10 +568,12 @@ def search_dim_len(dim, variables): # integer (dimension 0) dimensions[dim] = new_len - # TODO: slice indices and pass them on so we don't need to create them - # from scratch in the new object + # slice index cache + indices = {k: v[slicers[k]] + if k in slicers and not isinstance(slicers[k], int) else v + for k, v in self.indices.cache.items() if k in dimensions} return type(self)(variables, dimensions, self.attributes, - check_consistency=False) + indices=indices) def _loc_to_int_indexer(self, dim, locations): index = self.indices[dim] @@ -604,9 +653,10 @@ def renamed(self, name_dict): dimensions = OrderedDict((name_dict.get(k, k), v) for k, v in self.dimensions.iteritems()) - + indices = {name_dict.get(k, k): v + for k, v in self.indices.cache.items()} return type(self)(variables, dimensions, self.attributes, - check_consistency=False) + indices=indices) def merge(self, other): """Merge two datasets into a single new dataset @@ -634,7 +684,9 @@ def merge(self, other): compat=utils.variable_equal) new_dims = utils.safe_merge(self.dimensions, other.dimensions) new_attr = utils.safe_merge(self.attributes, other.attributes) - return type(self)(new_vars, new_dims, new_attr) + new_indices = utils.safe_merge(self.indices.cache, other.indices.cache, + compat=np.array_equal) + return type(self)(new_vars, new_dims, new_attr, indices=new_indices) def update(self, other): """Update this dataset in place with the contents of another dataset @@ -658,15 +710,20 @@ def update(self, other): compat=utils.variable_equal) utils.update_safety_check(self.dimensions, other.dimensions) utils.update_safety_check(self.attributes, other.attributes) + utils.update_safety_check(self.indices.cache, other.indices.cache, + compat=np.array_equal) # update contents self.variables.update(other.variables) 
self.dimensions.update(other.dimensions) self.attributes.update(other.attributes) + self.indices.update(other.indices.cache) def select(self, *names): - """Returns a new dataset that contains the named variables, along with - the dimensions on which those variables are defined and corresponding - coordinate variables. + """Returns a new dataset that contains the named variables + + Dimensions on which those variables are defined are also included, as + well as the corresponding coordinate variables, and any variables + listed under the 'coordinates' attribute of the named variables. Parameters ---------- @@ -687,37 +744,58 @@ def select(self, *names): raise ValueError( "One or more of the specified variables does not exist") - dim_names = (set(self.variables[k].dimensions) for k in names) - names = set(names).union(*dim_names) + def get_aux_names(var): + names = set(var.dimensions) + if 'coordinates' in var.attributes: + coords = var.attributes['coordinates'] + if coords != '': + names |= set(coords.split(' ')) + return names + + aux_names = [get_aux_names(self.variables[k]) for k in names] + names = set(names).union(*aux_names) variables = OrderedDict((k, v) for k, v in self.variables.iteritems() if k in names) dimensions = OrderedDict((k, v) for k, v in self.dimensions.iteritems() if k in names) + indices = {k: v for k, v in self.indices.cache.items() if k in names} return type(self)(variables, dimensions, self.attributes, - check_consistency=False) + indices=indices) - def unselect(self, *names): + def unselect(self, *names, **kwargs): """Returns a new dataset without the named variables Parameters ---------- *names : str Names of the variables to omit from the returned object. + omit_dimensions : bool, optional (default True) + Whether or not to also omit dimensions with the given names. Returns ------- Dataset - New dataset based on this dataset. Only the named variables are - removed. Dimensions are unchanged. + New dataset based on this dataset. Only the named variables + /dimensions are removed. """ - if not all(k in self.variables for k in names): - raise ValueError( - "One or more of the specified variables does not exist") + if any(k not in self.variables and k not in self.dimensions + for k in names): + raise ValueError('One or more of the specified variable/dimension ' + 'names does not exist on this dataset') variables = OrderedDict((k, v) for k, v in self.variables.iteritems() if k not in names) - return type(self)(variables, self.dimensions, self.attributes, - check_consistency=False) + if kwargs.get('omit_dimensions', True): + dimensions = OrderedDict((k, v) for k, v + in self.dimensions.iteritems() + if k not in names) + indices = {k: v for k, v in self.indices.cache.items() + if k not in names} + else: + dimensions = self.dimensions + indices = self.indices + return type(self)(variables, dimensions, self.attributes, + indices=indices) def replace(self, name, variable): """Returns a new dataset with the variable 'name' replaced with @@ -735,7 +813,7 @@ def replace(self, name, variable): Dataset New dataset based on this dataset. Dimensions are unchanged. 
""" - ds = self.unselect(name) + ds = self.unselect(name, omit_dimensions=False) ds.add_variable(name, variable) return ds diff --git a/src/scidata/dataview.py b/src/scidata/dataview.py index 69b305c7c94..7b8d5ce9928 100644 --- a/src/scidata/dataview.py +++ b/src/scidata/dataview.py @@ -1,8 +1,11 @@ +import functools import re +import numpy as np + +import ops from common import _DataWrapperMixin -from ops import inject_special_operations -from utils import expanded_indexer +from utils import expanded_indexer, FrozenOrderedDict class _LocIndexer(object): @@ -93,6 +96,12 @@ def __iter__(self): def attributes(self): return self.variable.attributes + @property + def indices(self): + return FrozenOrderedDict((k, v) for k, v + in self.dataset.indices.iteritems() + if k in self.dimensions) + def copy(self): return self.__copy__() @@ -139,35 +148,112 @@ def replace_focus(self, new_var): return type(self)(ds, self.name) def transpose(self, *dimensions): - return self.replace_focus(self.variable.transpose(*dimensions)) + """Return a new DataView object with transposed dimensions + Note: Although this operation returns a view of this dataview's + variable's data, it is not lazy -- the data will be fully loaded. -def unary_op(f): - def func(self): - return self.replace_focus(f(self.variable)) - return func - - -def binary_op(f, reflexive=False): - def func(self, other): - other_variable = getattr(other, 'variable', other) - dv = self.replace_focus(f(self.variable, other_variable) - if not reflexive - else f(other_variable, self.variable)) - if hasattr(other, 'unselected'): - dv.dataset.update(other.unselected()) - return dv - return func - + Parameters + ---------- + *dimensions : str, optional + By default, reverse the dimensions. Otherwise, reorder the + dimensions to this order. + + Returns + ------- + transposed : DataView + The returned DataView's variable is transposed. + + See Also + -------- + numpy.transpose + Variable.tranpose + """ + return self.replace_focus(self.variable.transpose(*dimensions)) -def inplace_binary_op(f): - def func(self, other): - other_variable = getattr(other, 'variable', other) - self.variable = f(self.variable, other_variable) - if hasattr(other, 'unselected'): - self.dataset.update(other.unselected()) - return self - return func + def collapsed(self, f, dimension=None, axis=None, **kwargs): + """Collapse this variable by applying `f` along some dimension(s) + Parameters + ---------- + f : function + Function which can be called in the form + `f(x, axis=axis, **kwargs)` to return the result of collapsing an + np.ndarray over an integer valued axis. + dimension : str or sequence of str, optional + Dimension(s) over which to repeatedly apply `f`. + axis : int or sequence of int, optional + Axis(es) over which to repeatedly apply `f`. Only one of the + 'dimension' and 'axis' arguments can be supplied. If neither are + supplied, then the collapse is calculated over the flattened array + (by calling `f(x)` without an axis argument). + **kwargs : dict + Additional keyword arguments passed on to `f`. + + Note + ---- + If `collapsed` is called with multiple dimensions (or axes, which + are converted into dimensions), then the collapse operation is + performed repeatedly along each dimension in turn from left to right. + + Returns + ------- + collapsed : DataView + DataView with this dataview's variable replaced with a variable + with summarized data and the indicated dimension(s) removed. 
+ """ + var = self.variable.collapsed(f, dimension, axis, **kwargs) + dropped_dims = set(self.dimensions) - set(var.dimensions) + # For now, take an aggressive strategy of removing all variables + # associated with any dropped dimensions + # TODO: save some summary (mean? bounds?) of dropped variables + drop = ({self.name} | dropped_dims | + {k for k, v in self.dataset.variables.iteritems() + if any(dim in dropped_dims for dim in v.dimensions)}) + ds = self.dataset.unselect(*drop) + ds.add_variable(self.name, var) + return type(self)(ds, self.name) -inject_special_operations(DataView, unary_op, binary_op, inplace_binary_op) + @staticmethod + def _unary_op(f): + @functools.wraps(f) + def func(self): + return self.replace_focus(f(self.variable)) + return func + + def _check_indices_compat(self, other): + # TODO: possibly automatically select index intersection instead? + if hasattr(other, 'indices'): + for k, v in self.indices.iteritems(): + if (k in other.indices + and not np.array_equal(v, other.indices[k])): + raise ValueError('index %r is not aligned' % k) + + @staticmethod + def _binary_op(f, reflexive=False): + @functools.wraps(f) + def func(self, other): + self._check_indices_compat(other) + other_variable = getattr(other, 'variable', other) + dv = self.replace_focus(f(self.variable, other_variable) + if not reflexive + else f(other_variable, self.variable)) + if hasattr(other, 'unselected'): + dv.dataset.update(other.unselected()) + return dv + return func + + @staticmethod + def _inplace_binary_op(f): + @functools.wraps(f) + def func(self, other): + self._check_indices_compat(other) + other_variable = getattr(other, 'variable', other) + self.variable = f(self.variable, other_variable) + if hasattr(other, 'unselected'): + self.dataset.update(other.unselected()) + return self + return func + + +ops.inject_special_operations(DataView) diff --git a/src/scidata/ops.py b/src/scidata/ops.py index 4677fc6a21e..ef1f7f3c469 100644 --- a/src/scidata/ops.py +++ b/src/scidata/ops.py @@ -1,46 +1,35 @@ import operator - -# NUMPY_COLLAPSE_METHODS = ['argmax', 'min', 'argmin', 'ptp', 'sum', 'cumsum', -# 'mean', 'var', 'std', 'prod', 'cumprod', 'all', -# 'any'] - -# def wrap_numpy_collapse_method(f): -# def func(self, dimension=None, axis=None): -# if dimension is not None: -# if axis is None: -# axis = self.dimensions.index(dim) -# else: -# raise ValueError("cannot specify both 'axis' and 'dimension'") -# # dims = tuple(dim for dim in self.dimension is dim != dimension) -# if axis is not None: -# dims = tuple(dim for i, dim in enumerate(self.dimension) -# if i not in [axis, axis + self.ndim]) -# else: -# dims = (), -# data = f(self.data, axis=axis) -# return Variable(dims, data, self.attributes) +import numpy as np UNARY_OPS = ['neg', 'pos', 'abs', 'invert'] CMP_BINARY_OPS = ['lt', 'le', 'eq', 'ne', 'ge', 'gt'] NUM_BINARY_OPS = ['add', 'sub', 'mul', 'div', 'truediv', 'floordiv', 'mod', 'pow', 'and', 'xor', 'or'] +NUMPY_COLLAPSE_METHODS = ['all', 'any', 'argmax', 'argmin', 'cumprod', + 'cumsum', 'max', 'mean', 'min', 'prod', 'ptp', 'std', + 'sum', 'var'] -def inject_special_operations(cls, unary_op, binary_op, inplace_binary_op, - priority=50): +def inject_special_operations(cls, priority=50): # priortize our operations over those of numpy.ndarray (priority=1) # and numpy.matrix (priority=10) cls.__array_priority__ = priority op_str = lambda name: '__%s__' % name op = lambda name: getattr(operator, op_str(name)) # patch in standard special operations - for op_names, op_wrap in [(UNARY_OPS, unary_op), 
- (CMP_BINARY_OPS + NUM_BINARY_OPS, binary_op)]: + for op_names, op_wrap in [(UNARY_OPS, cls._unary_op), + (CMP_BINARY_OPS + NUM_BINARY_OPS, + cls._binary_op)]: for name in op_names: setattr(cls, op_str(name), op_wrap(op(name))) # only numeric operations have in-place and reflexive variants for name in NUM_BINARY_OPS: - setattr(cls, op_str('r' + name), binary_op(op(name), reflexive=True)) - setattr(cls, op_str('i' + name), inplace_binary_op(op('i' + name))) + setattr(cls, op_str('r' + name), + cls._binary_op(op(name), reflexive=True)) + setattr(cls, op_str('i' + name), + cls._inplace_binary_op(op('i' + name))) + for name in NUMPY_COLLAPSE_METHODS: + setattr(cls, name, cls._collapse_method(getattr(np, name), + name, 'numpy')) diff --git a/src/scidata/utils.py b/src/scidata/utils.py index 3a5b4dbcc62..d1760e03b25 100644 --- a/src/scidata/utils.py +++ b/src/scidata/utils.py @@ -1,7 +1,10 @@ +import netCDF4 as nc4 import operator from collections import OrderedDict +from datetime import datetime import numpy as np +import pandas as pd def expanded_indexer(key, ndim): @@ -59,6 +62,52 @@ def expand_array(k, length): return tuple(key) +def num2datetimeindex(num_dates, units, calendar=None): + """Convert an array of numeric dates in netCDF format into a + pandas.DatetimeIndex + + For standard (Gregorian) calendars, this function uses vectorized + operations, which makes it much faster than netCDF4.num2date. + """ + num_dates = np.asarray(num_dates) + if calendar is None: + calendar = 'standard' + start_date = nc4.num2date(num_dates[0], units, calendar) + if (num_dates.size < 2 + or calendar not in ['standard', 'gregorian', 'proleptic_gregorian'] + or (start_date < datetime(1582, 10, 15) + and calendar != 'proleptic_gregorian')): + dates = nc4.num2date(num_dates, units, calendar) + else: + first_dates = nc4.num2date(num_dates[:2], units, calendar) + first_time_delta = np.timedelta64(first_dates[1] - first_dates[0]) + num_delta = (num_dates - num_dates[0]) / (num_dates[1] - num_dates[0]) + dates = first_time_delta * num_delta + np.datetime64(first_dates[0]) + return pd.Index(dates) + + +def variable_equal(v1, v2): + """True if two objects have the same dimensions, attributes and data; + otherwise False + + This function is necessary because `v1 == v2` for variables and dataviews + does element-wise comparisos (like numpy.ndarrays). + """ + if (v1.dimensions == v2.dimensions + and v1.attributes == v2.attributes): + try: + # if _data is identical, skip checking arrays by value + if v1._data is v2._data: + return True + except AttributeError: + # _data is not part of the public interface, so it's okay if its + # missing + pass + return np.array_equal(v1.data, v2.data) + else: + return False + + def update_safety_check(first_dict, second_dict, compat=operator.eq): """Check the safety of updating one dictionary with another @@ -123,31 +172,6 @@ def safe_merge(first_dict, second_dict, compat=operator.eq): return new_dict -def variable_equal(v1, v2): - """True if two objects have the same dimensions, attributes and data; - otherwise False - - This function is necessary because `v1 == v2` does element-wise comparison - (like numpy.ndarrays). 
- """ - if (v1.dimensions == v2.dimensions - and v1.attributes == v2.attributes): - try: - # if _data is identical, skip checking arrays by value - if v1._data is v2._data: - return True - except AttributeError: - # _data is not part of the public interface, so it's okay if its - # missing - pass - return np.array_equal(v1.data, v2.data) - else: - return False - - -# class DisabledMixin(object): - - class FrozenOrderedDict(OrderedDict): """A subclass of OrderedDict whose contents are frozen after initialization to prevent tampering diff --git a/src/scidata/variable.py b/src/scidata/variable.py index dc5eab51b18..aec60abd097 100644 --- a/src/scidata/variable.py +++ b/src/scidata/variable.py @@ -1,15 +1,16 @@ import copy -import numpy as np - +import functools import warnings from collections import OrderedDict +import numpy as np + import conventions import data import dataview +import ops import utils from common import _DataWrapperMixin -from ops import inject_special_operations def _as_compatible_data(data): @@ -236,7 +237,7 @@ def transpose(self, *dimensions): Returns ------- - obj : Variable object + transposed : Variable The returned object has transposed data and dimensions with the same attributes as the original. @@ -250,8 +251,124 @@ def transpose(self, *dimensions): data = self.data.transpose(*axes) return Variable(dimensions, data, self.attributes) + def collapsed(self, f, dimension=None, axis=None, **kwargs): + """Collapse this variable by applying `f` along some dimension(s) -def broadcast_var_data(self, other): + Parameters + ---------- + f : function + Function which can be called in the form + `f(x, axis=axis, **kwargs)` to return the result of collapsing an + np.ndarray over an integer valued axis. + dimension : str or sequence of str, optional + Dimension(s) over which to repeatedly apply `f`. + axis : int or sequence of int, optional + Axis(es) over which to repeatedly apply `f`. Only one of the + 'dimension' and 'axis' arguments can be supplied. If neither are + supplied, then the collapse is calculated over the flattened array + (by calling `f(x)` without an axis argument). + **kwargs : dict + Additional keyword arguments passed on to `f`. + + Note + ---- + If `collapsed` is called with multiple dimensions (or axes, which + are converted into dimensions), then the collapse operation is + performed repeatedly along each dimension in turn from left to right. + + Returns + ------- + collapsed : Variable + Variable with summarized data and the indicated dimension(s) + removed. 
+ """ + if dimension is not None and axis is not None: + raise ValueError("cannot supply both 'axis' and 'dimension' " + "arguments") + + if axis is not None: + # determine dimensions + if isinstance(axis, int): + axis = [axis] + dimension = [self.dimensions[i] for i in axis] + + if dimension is not None: + if isinstance(dimension, basestring): + dimension = [dimension] + var = self + for dim in dimension: + var = var._collapsed(f, dim, **kwargs) + else: + attr = self._attributes_with_added_cell_method( + ': '.join(self.dimensions) + ': ' + f.__name__) + var = Variable([], f(self.data, **kwargs), attr) + return var + + def _attributes_with_added_cell_method(self, string): + attr = OrderedDict(self.attributes) + if 'cell_methods' in attr: + base = attr['cell_methods'] + ' ' + else: + base = '' + attr['cell_methods'] = base + string + return attr + + def _collapsed(self, f, dim, **kwargs): + """Collapse a single dimension""" + axis = self.dimensions.index(dim) + dims = tuple(dim for i, dim in enumerate(self.dimensions) + if axis not in [i, i - self.ndim]) + data = f(self.data, axis=axis, **kwargs) + attr = self._attributes_with_added_cell_method( + self.dimensions[axis] + ': ' + f.__name__) + return Variable(dims, data, attr) + + @staticmethod + def _unary_op(f): + @functools.wraps(f) + def func(self): + return Variable(self.dimensions, f(self.data), self.attributes) + return func + + @staticmethod + def _binary_op(f, reflexive=False): + @functools.wraps(f) + def func(self, other): + if isinstance(other, dataview.DataView): + return NotImplemented + self_data, other_data, new_dims = _broadcast_var_data(self, other) + new_data = (f(self_data, other_data) + if not reflexive + else f(other_data, self_data)) + new_attr = utils.safe_merge(_math_safe_attributes(self), + _math_safe_attributes(other)) + return Variable(new_dims, new_data, new_attr) + return func + + @staticmethod + def _inplace_binary_op(f): + @functools.wraps(f) + def func(self, other): + self_data, other_data, dims = _broadcast_var_data(self, other) + if dims != self.dimensions: + raise ValueError('dimensions cannot change for in-place ' + 'operations') + self.data = f(self_data, other_data) + utils.safe_update(self.attributes, _math_safe_attributes(other)) + return self + return func + + # @staticmethod + # def collapse_method(f): + # @functools.wraps(f) + # def func(self, dimension=None, axis=None): + # return self.collapsed(f, dimension, axis) + # return func + +ops.inject_special_operations(Variable) + + +def _broadcast_var_data(self, other): self_data = self.data if isinstance(other, data.Dataset): raise TypeError('datasets do not support mathematical operations') @@ -305,37 +422,3 @@ def _math_safe_attributes(v): return {} else: return OrderedDict((k, v) for k, v in attr.items() if k != 'units') - - -def unary_op(f): - def func(self): - return Variable(self.dimensions, f(self.data), self.attributes) - return func - - -def binary_op(f, reflexive=False): - def func(self, other): - if isinstance(other, dataview.DataView): - return NotImplemented - self_data, other_data, new_dims = broadcast_var_data(self, other) - new_data = (f(self_data, other_data) - if not reflexive - else f(other_data, self_data)) - new_attr = utils.safe_merge(_math_safe_attributes(self), - _math_safe_attributes(other)) - return Variable(new_dims, new_data, new_attr) - return func - - -def inplace_binary_op(f): - def func(self, other): - self_data, other_data, dimensions = broadcast_var_data(self, other) - if dimensions != self.dimensions: - raise 
ValueError('dimensions cannot change for in-place operations') - self.data = f(self_data, other_data) - utils.safe_update(self.attributes, _math_safe_attributes(other)) - return self - return func - - -inject_special_operations(Variable, unary_op, binary_op, inplace_binary_op) diff --git a/test/test_data.py b/test/test_data.py index aec3b07ceec..77c381c34c1 100644 --- a/test/test_data.py +++ b/test/test_data.py @@ -3,6 +3,7 @@ from cStringIO import StringIO import os.path import unittest +import tempfile import numpy as np import pandas as pd @@ -377,10 +378,13 @@ def test_merge(self): class NetCDF4DataTest(DataTest): def get_store(self): - tmp_file = './delete_me.nc' - if os.path.exists(tmp_file): - os.remove(tmp_file) - return backends.NetCDF4DataStore(tmp_file, mode='w') + f, self.tmp_file = tempfile.mkstemp(suffix='.nc') + os.close(f) + return backends.NetCDF4DataStore(self.tmp_file, mode='w') + + def tearDown(self): + if hasattr(self, 'tmp_file') and os.path.exists(self.tmp_file): + os.remove(self.tmp_file) class ScipyDataTest(DataTest): diff --git a/test/test_dataview.py b/test/test_dataview.py index 7a569b8e33e..48c590fcef0 100644 --- a/test/test_dataview.py +++ b/test/test_dataview.py @@ -27,6 +27,9 @@ def test_properties(self): self.assertEqual(getattr(self.dv, attr), getattr(self.v, attr)) self.assertEqual(len(self.dv), len(self.v)) self.assertVarEqual(self.dv, self.v) + self.assertEqual(list(self.dv.indices), list(self.ds.indices)) + for k, v in self.dv.indices.iteritems(): + self.assertArrayEqual(v, self.ds.indices[k]) def test_items(self): self.assertVarEqual(self.dv[0], self.v[0]) @@ -34,6 +37,7 @@ def test_items(self): self.assertVarEqual(self.dv[:3, :5], self.v[:3, :5]) self.assertEqual(self.dv[:3, :5].dataset, self.ds.views({'x': slice(3), 'y': slice(5)})) + self.assertEqual(list(self.dv[0].indices), ['y']) def test_renamed(self): renamed = self.dv.renamed('bar') @@ -59,6 +63,13 @@ def test_math(self): self.assertViewEqual(a, 0 * x + a) self.assertViewEqual(a, a + 0 * a) self.assertViewEqual(a, 0 * a + a) + # test different indices + ds2 = self.ds.replace('x', Variable(['x'], 3 + np.arange(10))) + b = DataView(ds2, 'foo') + with self.assertRaisesRegexp(ValueError, 'not aligned'): + a + b + with self.assertRaisesRegexp(ValueError, 'not aligned'): + b + a def test_inplace_math(self): x = self.x @@ -69,5 +80,9 @@ def test_inplace_math(self): self.assertIs(b, a) self.assertIs(b.variable, v) self.assertIs(b.data, x) - #FIXME: this test currently fails (see DataView.variable.setter) - # self.assertIs(b.dataset, self.ds) + self.assertIs(b.dataset, self.ds) + + def test_collapsed(self): + self.assertVarEqual(self.dv.collapsed(np.mean, 'x'), + self.v.collapsed(np.mean, 'x')) + # needs more... diff --git a/test/test_utils.py b/test/test_utils.py index 2ac025c5a84..0b004b5b1c7 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -1,4 +1,6 @@ +import netCDF4 as nc4 import numpy as np +import pandas as pd from scidata import utils from . 
import TestCase @@ -61,6 +63,17 @@ def test_orthogonal_indexer(self): utils.orthogonal_indexer(x > 0, x.shape) +class TestNum2DatetimeIndex(TestCase): + def test(self): + for num_dates, units in [ + (np.arange(1000), 'days since 2000-01-01'), + (12300 + np.arange(500), 'hours since 1680-01-01 00:00:00')]: + for calendar in ['standard', 'gregorian', 'proleptic_gregorian']: + expected = pd.Index(nc4.num2date(num_dates, units, calendar)) + actual = utils.num2datetimeindex(num_dates, units, calendar) + self.assertArrayEqual(expected, actual) + + class TestSafeMerge(TestCase): def setUp(self): self.x = {'a': 'A', 'b': 'B'} diff --git a/test/test_variable.py b/test/test_variable.py index 9346e2bce49..7b85149814e 100644 --- a/test/test_variable.py +++ b/test/test_variable.py @@ -156,3 +156,19 @@ def test_inplace_math(self): # since we provided an ndarray for data, it is also modified in-place self.assertIs(v.data, x) self.assertArrayEqual(v.data, np.arange(5) + 1) + + def test_collapsed(self): + v = Variable(['time', 'x'], self.d) + # intentionally test with an operation for which order matters + self.assertVarEqual(v.collapsed(np.std, 'time'), + Variable(['x'], self.d.std(axis=0), + {'cell_methods': 'time: std'})) + self.assertVarEqual(v.collapsed(np.std, axis=0), + v.collapsed(np.std, dimension='time')) + self.assertVarEqual(v.collapsed(np.std, ['x', 'time']), + Variable([], self.d.std(axis=1).std(axis=0), + {'cell_methods': 'x: std time: std'})) + self.assertVarEqual(v.collapsed(np.std), + Variable([], self.d.std(), + {'cell_methods': 'time: x: std'})) + self.assertVarEqual(v.mean('time'), v.collapsed(np.mean, 'time')) From 900b61e93b9d354c797d13d6f8f74d0d6fddd00f Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Tue, 4 Feb 2014 19:30:52 -0800 Subject: [PATCH 05/45] Simpler Dataset.views + working non-trivial indices + aggregated_by --- src/scidata/common.py | 23 +++++ src/scidata/data.py | 210 +++++++++++++++++----------------------- src/scidata/dataview.py | 66 +++++++++++-- src/scidata/utils.py | 45 ++++++++- src/scidata/variable.py | 193 +++++++++++++++++++++++++----------- test/__init__.py | 5 + test/test_data.py | 63 ++++++------ test/test_dataview.py | 45 +++++++-- test/test_utils.py | 7 +- test/test_variable.py | 88 ++++++++++++++--- 10 files changed, 495 insertions(+), 250 deletions(-) diff --git a/src/scidata/common.py b/src/scidata/common.py index fd2a5e9fff4..1d0cd40da59 100644 --- a/src/scidata/common.py +++ b/src/scidata/common.py @@ -53,6 +53,18 @@ def __complex__(self): def __long__(self): return long(self._data) + # adapted from pandas.NDFrame + # https://github.com/pydata/pandas/blob/master/pandas/core/generic.py#L699 + + def __array__(self, dtype=None): + return self.data + + # @property + # def __array_interface__(self): + # data = self.data + # return dict(typestr=data.dtype.str, shape=data.shape, data=data) + + _collapse_method_docstring = \ """Collapse this {cls}'s data' by applying `{name}` along some dimension(s) @@ -93,3 +105,14 @@ def func(self, dimension=None, axis=None, **kwargs): name=('' if module is None else module + '.') + name, cls=cls.__name__) return func + + # we want something like this, right? 
+ # def apply(self, func, dimension=None, axis=None, **kwargs): + # if dimension is not None and axis is not None: + # raise ValueError("cannot supply both 'axis' and 'dimension' " + # "arguments") + # if axis is None: + # axis = self.dimensions.index(dimension) + # f = self._unary_op(partial(func, axis=axis, **kwargs)) + # return f(self) + diff --git a/src/scidata/data.py b/src/scidata/data.py index 7b8e541afa0..c4316869ed7 100644 --- a/src/scidata/data.py +++ b/src/scidata/data.py @@ -268,12 +268,14 @@ def __eq__(self, other): try: # some stores (e.g., scipy) do not seem to preserve order, so don't # require matching dimension or variable order for equality - return (dict(self.dimensions) == dict(other.dimensions) - and self.attributes == other.attributes + return (sorted(self.dimensions.items()) + == sorted(other.dimensions.items()) + and sorted(self.attributes.items()) + == sorted(other.attributes.items()) and all(k1 == k2 and utils.variable_equal(v1, v2) for (k1, v1), (k2, v2) - in zip(dict(self.variables).items(), - dict(other.variables).items()))) + in zip(sorted(self.variables.items()), + sorted(other.variables.items())))) except AttributeError: return False @@ -364,24 +366,6 @@ def __repr__(self): dim_summary, ' '.join(self.noncoordinates)) - def create_dimension(self, name, length): - """Add a dimension to this datasets - - Parameters - ---------- - name : string - The name of the new dimension. An exception will be raised if the - object already has a dimension with this name. - length : int - The length of the new dimension; must a be non-negative integer. - """ - if name in self.dimensions: - raise ValueError('dimension named %r already exists' % name) - length = int(length) - if length < 0: - raise ValueError('length must be non-negative') - self.store.unchecked_set_dimension(name, int(length)) - def create_variable(self, name, dims, data, attributes=None): """Create a new variable and add it to this dataset @@ -439,19 +423,27 @@ def create_coordinate(self, name, data, attributes=None): var : Variable Reference to the newly created coordinate variable. """ - # We need to be cleanly roll back the effects of - # create_dimension if create_variable fails, otherwise we will - # end up in a partial state. - if name in self.coordinates: - raise ValueError("coordinate named '%s' already exists" % name) - var = Variable((name,), np.asarray(data), attributes) - if var.ndim != 1: - raise ValueError("coordinate data must be 1-dimensional (vector)") - if name not in self.dimensions: - self.store.unchecked_set_dimension(name, var.size) - elif self.dimensions[name] != var.size: - raise ValueError('dimension already exists with different length') - return self.store.unchecked_set_variable(name, var) + # any error checking should be taken care of by add_coordinate + v = Variable((name,), np.asarray(data), attributes) + return self.add_coordinate(v) + + def add_dimension(self, name, length): + """Add a dimension to this dataset + + Parameters + ---------- + name : string + The name of the new dimension. An exception will be raised if the + object already has a dimension with this name. + length : int + The length of the new dimension; must a be non-negative integer. 
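
Since several construction methods are renamed in this patch, a compact sketch of the resulting workflow may help; it follows the updated test_data.py tests and the new keyword form of views introduced further down, and uses only those calls:

    import numpy as np
    from scidata import Dataset

    ds = Dataset()
    ds.add_dimension('time', 10)
    ds.add_dimension('x', 3)
    ds.create_coordinate('x', np.arange(3), {'units': 'meters'})
    ds.create_variable('foo', ('time', 'x'), np.random.random((10, 3)))
    # slicing now takes dimensions as keyword arguments
    sub = ds.views(time=slice(2), x=0)   # 'time' shortened, 'x' dropped
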
+ """ + if name in self.dimensions: + raise ValueError('dimension named %r already exists' % name) + length = int(length) + if length < 0: + raise ValueError('length must be non-negative') + self.store.unchecked_set_dimension(name, int(length)) def add_variable(self, name, var): """Add a variable to the dataset @@ -473,6 +465,34 @@ def add_variable(self, name, var): raise ValueError("Variable named %r already exists" % name) return self.set_variable(name, var) + def add_coordinate(self, var): + """Add a coordinate variable to the dataset + + Parameters + ---------- + variable : Variable + The coordinate variable to be added. Coordinate variables must be + 1D, and will be added under the same name as their sole dimension. + + Returns + ------- + variable + The variable object in the underlying datastore. + """ + # We need to be cleanly roll back the effects of + # create_dimension if create_variable fails, otherwise we will + # end up in a partial state. + name = var.dimensions[0] + if name in self.coordinates: + raise ValueError("coordinate named '%s' already exists" % name) + if var.ndim != 1: + raise ValueError("coordinate data must be 1-dimensional (vector)") + if name not in self.dimensions: + self.store.unchecked_set_dimension(name, var.size) + elif self.dimensions[name] != var.size: + raise ValueError('dimension already exists with different length') + return self.store.unchecked_set_variable(name, var) + def set_variable(self, name, var): """Set a variable in the dataset @@ -498,7 +518,7 @@ def set_variable(self, name, var): self.indices.build_index(name) return new_var - def views(self, slicers): + def views(self, **slicers): """Return a new object whose contents are a view of a slice from the current object along a specified dimension @@ -526,52 +546,27 @@ def views(self, slicers): numpy.take Variable.take """ - if not all(k in self.dimensions for k in slicers): - invalid = [k for k in slicers if not k in self.dimensions] - raise ValueError("dimensions %r don't exist" % invalid) + invalid = [k for k in slicers if not k in self.dimensions] + if invalid: + raise ValueError("dimensions %r do not exist" % invalid) + + # all slicers should be int, slice or np.ndarrays + slicers = {k: np.asarray(v) if not isinstance(v, (int, slice)) else v + for k, v in slicers.iteritems()} - # slice all variables variables = OrderedDict() - for (name, var) in self.variables.iteritems(): - var_slicers = dict((k, v) for k, v in slicers.iteritems() - if k in var.dimensions) - variables[name] = var.views(var_slicers) - - def search_dim_len(dim, variables): - # loop through the variables to find the dimension length, or if - # the dimension is not found, return None - for var in variables.values(): - if dim in var.dimensions: - return int(var.shape[var.dimensions.index(dim)]) - return None - - # update dimensions - dimensions = OrderedDict() - for dim in self.dimensions: - new_len = search_dim_len(dim, variables) - if new_len is not None: - # dimension length is defined by a new dataset variable - dimensions[dim] = new_len - elif search_dim_len(dim, self.variables) is None: - # dimension length is also not defined by old dataset variables - # note: dimensions only defined in old dataset variables are be - # dropped - if dim not in slicers: - dimensions[dim] = self.dimensions[dim] - else: - # figure it by slicing temporary coordinate data - temp_data = np.arange(self.dimensions[dim]) - temp_data_sliced = temp_data[slicers[dim]] - new_len = temp_data_sliced.size - if new_len > 0 and temp_data_sliced.ndim > 0: 
- # drop the dimension if the result of getitem is an - # integer (dimension 0) - dimensions[dim] = new_len - - # slice index cache - indices = {k: v[slicers[k]] - if k in slicers and not isinstance(slicers[k], int) else v - for k, v in self.indices.cache.items() if k in dimensions} + for name, var in self.variables.iteritems(): + var_slicers = {k: v for k, v in slicers.iteritems() + if k in var.dimensions} + variables[name] = var.views(**var_slicers) + + indices = {k: (v[slicers[k]] if k in slicers else v) + for k, v in self.indices.iteritems()} + # filter out non-indices (indices for which one value was selected) + indices = {k: v for k, v in indices.iteritems() + if isinstance(v, pd.Index)} + dimensions = OrderedDict((k, indices[k].size) for k in self.dimensions + if k in indices) return type(self)(variables, dimensions, self.attributes, indices=indices) @@ -586,48 +581,16 @@ def _loc_to_int_indexer(self, dim, locations): indexer = index.get_loc(locations) except TypeError: # value is an list or array - new_index, indexer = index.reindex(locations) + new_index, indexer = index.reindex(np.asarray(locations)) if np.any(indexer < 0): raise ValueError('not all values found in index %r' % dim) # FIXME: don't throw away new_index (we'll need to recreate it # later) return indexer - def loc_views(self, slicers): - return self.views({k: self._loc_to_int_indexer(k, v) - for k, v in slicers.iteritems()}) - - def view(self, s, dim): - """Return a new object whose contents are a view of a slice from the - current object along a specified dimension - - Parameters - ---------- - s : slice - The slice representing the range of the values to extract. - dim : string, optional - The dimension to slice along. - - Returns - ------- - obj : Data object - The returned object has the same attributes, dimensions, - variable names and variable attributes as the original. - Variables that are not defined along the specified - dimensions are viewed in their entirety. Variables that are - defined along the specified dimension have their data - contents taken along the specified dimension. - - Care must be taken since modifying (most) values in the returned - object will result in modification to the parent object. - - See Also - -------- - views - numpy.take - Variable.take - """ - return self.views({dim: s}) + def loc_views(self, **slicers): + return self.views(**{k: self._loc_to_int_indexer(k, v) + for k, v in slicers.iteritems()}) def renamed(self, name_dict): """ @@ -661,6 +624,10 @@ def renamed(self, name_dict): def merge(self, other): """Merge two datasets into a single new dataset + This method generally not allow for overriding data. Variables, + dimensions and indices are checked for conflicts. However, conflicting + attributes are removed. + Parameters ---------- other : Dataset @@ -683,7 +650,8 @@ def merge(self, other): new_vars = utils.safe_merge(self.variables, other.variables, compat=utils.variable_equal) new_dims = utils.safe_merge(self.dimensions, other.dimensions) - new_attr = utils.safe_merge(self.attributes, other.attributes) + new_attr = utils.ordered_dict_intersection(self.attributes, + other.attributes) new_indices = utils.safe_merge(self.indices.cache, other.indices.cache, compat=np.array_equal) return type(self)(new_vars, new_dims, new_attr, indices=new_indices) @@ -691,6 +659,10 @@ def merge(self, other): def update(self, other): """Update this dataset in place with the contents of another dataset + Unlike `dict.update`, this method generally not allow for overriding + data. 
Variables, dimensions and indices are checked for conflicts. + However, conflicting attributes are removed. + Parameters ---------- other : Dataset @@ -709,14 +681,14 @@ def update(self, other): utils.update_safety_check(self.variables, other.variables, compat=utils.variable_equal) utils.update_safety_check(self.dimensions, other.dimensions) - utils.update_safety_check(self.attributes, other.attributes) utils.update_safety_check(self.indices.cache, other.indices.cache, compat=np.array_equal) # update contents self.variables.update(other.variables) self.dimensions.update(other.dimensions) - self.attributes.update(other.attributes) self.indices.update(other.indices.cache) + # override attributes without checking for compatibility + utils.remove_incompatible_items(self.attributes, other.attributes) def select(self, *names): """Returns a new dataset that contains the named variables diff --git a/src/scidata/dataview.py b/src/scidata/dataview.py index 7b8d5ce9928..bfc0b5b5028 100644 --- a/src/scidata/dataview.py +++ b/src/scidata/dataview.py @@ -77,7 +77,7 @@ def _key_to_slicers(self, key): def __getitem__(self, key): slicers = dict(self._key_to_slicers(key)) - return type(self)(self.dataset.views(slicers), self.name) + return type(self)(self.dataset.views(**slicers), self.name) def __setitem__(self, key, value): self.variable[key] = value @@ -128,6 +128,17 @@ def __repr__(self): contents = ': %s' % self.data return '' % (type(self).__name__, self.name, contents) + def views(self, **slicers): + """Return a new Dataset whose contents are a view of a slice from the + current dataset along specified dimensions + + See Also + -------- + Dataset.views + """ + ds = self.dataset.views(**slicers) + return type(self)(ds, self.name) + def renamed(self, new_name): """Returns a new DataView with this DataView's focus variable renamed """ @@ -171,24 +182,24 @@ def transpose(self, *dimensions): """ return self.replace_focus(self.variable.transpose(*dimensions)) - def collapsed(self, f, dimension=None, axis=None, **kwargs): - """Collapse this variable by applying `f` along some dimension(s) + def collapsed(self, func, dimension=None, axis=None, **kwargs): + """Collapse this variable by applying `func` along some dimension(s) Parameters ---------- - f : function + func : function Function which can be called in the form `f(x, axis=axis, **kwargs)` to return the result of collapsing an np.ndarray over an integer valued axis. dimension : str or sequence of str, optional - Dimension(s) over which to repeatedly apply `f`. + Dimension(s) over which to repeatedly apply `func`. axis : int or sequence of int, optional - Axis(es) over which to repeatedly apply `f`. Only one of the + Axis(es) over which to repeatedly apply `func`. Only one of the 'dimension' and 'axis' arguments can be supplied. If neither are supplied, then the collapse is calculated over the flattened array (by calling `f(x)` without an axis argument). **kwargs : dict - Additional keyword arguments passed on to `f`. + Additional keyword arguments passed on to `func`. Note ---- @@ -202,7 +213,7 @@ def collapsed(self, f, dimension=None, axis=None, **kwargs): DataView with this dataview's variable replaced with a variable with summarized data and the indicated dimension(s) removed. 
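
A condensed version of the new DataView tests gives a feel for collapsed here and for the aggregated_by method defined just below; import paths follow test_dataview.py and nothing beyond the tested calls is assumed:

    import numpy as np
    from scidata import Dataset, DataView, Variable

    ds = Dataset({'foo': Variable(['x', 'y'], np.random.random((10, 20)))})
    foo = ds['foo']                      # a DataView into ds
    # collapsing drops 'x' from the view and from the wrapped dataset
    foo.collapsed(np.mean, 'x')
    # aggregated_by groups 'y' by a label variable and replaces it with the
    # new 'abc' dimension
    ds.add_variable('abc', Variable(['y'], np.array(['a'] * 9 + ['b'] * 11)))
    foo.aggregated_by(np.mean, 'abc')
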
""" - var = self.variable.collapsed(f, dimension, axis, **kwargs) + var = self.variable.collapsed(func, dimension, axis, **kwargs) dropped_dims = set(self.dimensions) - set(var.dimensions) # For now, take an aggressive strategy of removing all variables # associated with any dropped dimensions @@ -214,6 +225,43 @@ def collapsed(self, f, dimension=None, axis=None, **kwargs): ds.add_variable(self.name, var) return type(self)(ds, self.name) + def aggregated_by(self, func, new_dim_name, **kwargs): + """Aggregate this dataview by applying `func` to grouped elements + + Parameters + ---------- + func : function + Function which can be called in the form + `func(x, axis=axis, **kwargs)` to reduce an np.ndarray over an + integer valued axis. + new_dim_name : str or sequence of str, optional + Name of the variable in this dataview's dataset by which to group + variable elements. The dimension along which this variable exists + will be replaced by this name. + **kwargs : dict + Additional keyword arguments passed on to `func`. + + Returns + ------- + aggregated : DataView + DataView with aggregated data and the new dimension `new_dim_name`. + """ + agg_var = self.dataset[new_dim_name] + unique, aggregated = self.variable.aggregated_by( + func, new_dim_name, agg_var, **kwargs) + # TODO: add options for how to summarize variables along aggregated + # dimensions instead of just dropping them + drop = ({self.name, new_dim_name} | + {k for k, v in self.dataset.variables.iteritems() + if any(dim in agg_var.dimensions for dim in v.dimensions)}) + ds = self.dataset.unselect(*drop) + ds.add_coordinate(unique) + ds.add_variable(self.name, aggregated) + return type(self)(ds, self.name) + + def __array_wrap__(self, result): + return self.replace_focus(self.variable.__array_wrap__(result)) + @staticmethod def _unary_op(f): @functools.wraps(f) @@ -256,4 +304,4 @@ def func(self, other): return func -ops.inject_special_operations(DataView) +ops.inject_special_operations(DataView, priority=60) diff --git a/src/scidata/utils.py b/src/scidata/utils.py index d1760e03b25..901cba81953 100644 --- a/src/scidata/utils.py +++ b/src/scidata/utils.py @@ -91,7 +91,7 @@ def variable_equal(v1, v2): otherwise False This function is necessary because `v1 == v2` for variables and dataviews - does element-wise comparisos (like numpy.ndarrays). + does element-wise comparisions (like numpy.ndarrays). """ if (v1.dimensions == v2.dimensions and v1.attributes == v2.attributes): @@ -172,6 +172,49 @@ def safe_merge(first_dict, second_dict, compat=operator.eq): return new_dict +def remove_incompatible_items(first_dict, second_dict, compat=operator.eq): + """Remove incompatible items from the first dictionary in-place + + Items are retained if their keys are found in both dictionaries and the + values are compatible. + + Parameters + ---------- + first_dict, second_dict : dict-like + Mappings to merge. + compat : function, optional + Binary operator to determine if two values are compatible. By default, + checks for equality. + """ + for k, v in second_dict.iteritems(): + if k in first_dict and not compat(v, first_dict[k]): + del k + + +def ordered_dict_intersection(first_dict, second_dict, compat=operator.eq): + """Return the intersection of two dictionaries as a new OrderedDict + + Items are retained if their keys are found in both dictionaries and the + values are compatible. + + Parameters + ---------- + first_dict, second_dict : dict-like + Mappings to merge. 
+ compat : function, optional + Binary operator to determine if two values are compatible. By default, + checks for equality. + + Returns + ------- + intersection : OrderedDict + Intersection of the contents. + """ + new_dict = OrderedDict(first_dict) + remove_incompatible_items(new_dict, second_dict, compat) + return new_dict + + class FrozenOrderedDict(OrderedDict): """A subclass of OrderedDict whose contents are frozen after initialization to prevent tampering diff --git a/src/scidata/variable.py b/src/scidata/variable.py index aec60abd097..d1b1d52645e 100644 --- a/src/scidata/variable.py +++ b/src/scidata/variable.py @@ -166,7 +166,7 @@ def __repr__(self): contents = ': %s' % self.data return '' % (type(self).__name__, contents) - def views(self, slicers): + def views(self, **slicers): """Return a new Variable object whose contents are a view of the object sliced along a specified dimension. @@ -191,38 +191,16 @@ def views(self, slicers): view take """ + invalid = [k for k in slicers if not k in self.dimensions] + if invalid: + raise ValueError("dimensions %r do not exist" % invalid) + slices = [slice(None)] * self.data.ndim for i, dim in enumerate(self.dimensions): if dim in slicers: slices[i] = slicers[dim] return self[tuple(slices)] - def view(self, s, dim): - """Return a new Variable object whose contents are a view of the object - sliced along a specified dimension. - - Parameters - ---------- - s : slice - The slice representing the range of the values to extract. - dim : string - The dimension to slice along. - - Returns - ------- - obj : Variable object - The returned object has the same attributes and dimensions - as the original. Data contents are taken along the - specified dimension. Care must be taken since modifying (most) - values in the returned object will result in modification to the - parent object. - - See Also - -------- - take - """ - return self.views({dim: s}) - def transpose(self, *dimensions): """Return a new Variable object with transposed dimensions @@ -251,24 +229,24 @@ def transpose(self, *dimensions): data = self.data.transpose(*axes) return Variable(dimensions, data, self.attributes) - def collapsed(self, f, dimension=None, axis=None, **kwargs): - """Collapse this variable by applying `f` along some dimension(s) + def collapsed(self, func, dimension=None, axis=None, **kwargs): + """Collapse this variable by applying `func` along some dimension(s) Parameters ---------- - f : function + func : function Function which can be called in the form - `f(x, axis=axis, **kwargs)` to return the result of collapsing an + `func(x, axis=axis, **kwargs)` to return the result of collapsing an np.ndarray over an integer valued axis. dimension : str or sequence of str, optional - Dimension(s) over which to repeatedly apply `f`. + Dimension(s) over which to repeatedly apply `func`. axis : int or sequence of int, optional - Axis(es) over which to repeatedly apply `f`. Only one of the + Axis(es) over which to repeatedly apply `func`. Only one of the 'dimension' and 'axis' arguments can be supplied. If neither are supplied, then the collapse is calculated over the flattened array - (by calling `f(x)` without an axis argument). + (by calling `func(x)` without an axis argument). **kwargs : dict - Additional keyword arguments passed on to `f`. + Additional keyword arguments passed on to `func`. 
Note ---- @@ -297,21 +275,19 @@ def collapsed(self, f, dimension=None, axis=None, **kwargs): dimension = [dimension] var = self for dim in dimension: - var = var._collapsed(f, dim, **kwargs) + var = var._collapsed(func, dim, **kwargs) else: - attr = self._attributes_with_added_cell_method( - ': '.join(self.dimensions) + ': ' + f.__name__) - var = Variable([], f(self.data, **kwargs), attr) + var = Variable([], func(self.data, **kwargs), self.attributes) + var._append_to_cell_methods(': '.join(self.dimensions) + + ': ' + func.__name__) return var - def _attributes_with_added_cell_method(self, string): - attr = OrderedDict(self.attributes) - if 'cell_methods' in attr: - base = attr['cell_methods'] + ' ' + def _append_to_cell_methods(self, string): + if 'cell_methods' in self.attributes: + base = self.attributes['cell_methods'] + ' ' else: base = '' - attr['cell_methods'] = base + string - return attr + self.attributes['cell_methods'] = base + string def _collapsed(self, f, dim, **kwargs): """Collapse a single dimension""" @@ -319,9 +295,56 @@ def _collapsed(self, f, dim, **kwargs): dims = tuple(dim for i, dim in enumerate(self.dimensions) if axis not in [i, i - self.ndim]) data = f(self.data, axis=axis, **kwargs) - attr = self._attributes_with_added_cell_method( - self.dimensions[axis] + ': ' + f.__name__) - return Variable(dims, data, attr) + new_var = Variable(dims, data, self.attributes) + new_var._append_to_cell_methods(self.dimensions[axis] + + ': ' + f.__name__) + return new_var + + def aggregated_by(self, func, new_dim_name, groups, **kwargs): + """Aggregate this variable by applying `func` to grouped elements + + Parameters + ---------- + func : function + Function which can be called in the form + `func(x, axis=axis, **kwargs)` to reduce an np.ndarray over an + integer valued axis. + new_dim_name : str or sequence of str, optional + Name of the new dimension to create. + groups : Variable + 1D variable which contains the values by which to group. + **kwargs : dict + Additional keyword arguments passed on to `func`. + + Returns + ------- + unique : Variable + 1D variable of unique values in group, along the dimension given by + `new_dim_name`. + aggregated : Variable + Variable with aggregated data and the original dimension from + `groups` replaced by `new_dim_name`. + """ + if groups.ndim != 1: + # TODO: remove this limitation? 
+ raise ValueError('aggregation variables must be 1 dimensional') + dim = groups.dimensions[0] + axis = self.dimensions.index(dim) + if groups.size != self.shape[axis]: + raise ValueError('the aggregation variable\'s length does not ' + 'match the length of this variable along its ' + 'dimension') + unique_values = np.unique(groups.data) + aggregated = [self.views(**{dim: groups == u}).collapsed( + func, dim, axis=None, **kwargs) + for u in unique_values] + stacked = stack_variables(aggregated, new_dim_name, unique_values.size) + ordered_dims = [new_dim_name if d == dim else d for d in self.dimensions] + unique = Variable([new_dim_name], unique_values) + return unique, stacked.transpose(*ordered_dims) + + def __array_wrap__(self, result): + return Variable(self.dimensions, result, self.attributes) @staticmethod def _unary_op(f): @@ -340,8 +363,11 @@ def func(self, other): new_data = (f(self_data, other_data) if not reflexive else f(other_data, self_data)) - new_attr = utils.safe_merge(_math_safe_attributes(self), - _math_safe_attributes(other)) + if hasattr(other, 'attributes'): + new_attr = utils.ordered_dict_intersection(self.attributes, + other.attributes) + else: + new_attr = self.attributes return Variable(new_dims, new_data, new_attr) return func @@ -354,20 +380,71 @@ def func(self, other): raise ValueError('dimensions cannot change for in-place ' 'operations') self.data = f(self_data, other_data) - utils.safe_update(self.attributes, _math_safe_attributes(other)) + if hasattr(other, 'attributes'): + utils.remove_incompatible_items(self.attributes, other) return self return func - # @staticmethod - # def collapse_method(f): - # @functools.wraps(f) - # def func(self, dimension=None, axis=None): - # return self.collapsed(f, dimension, axis) - # return func - ops.inject_special_operations(Variable) +def stack_variables(variables, dim, length=None, + allow_conflicting_attributes=False): + """Stack variables along a new dimension + + Parameters + ---------- + variables : iterable of Variable + Variables to stack. + dim : str + Name of the new dimension + length : int, optional + Length of the new dimension. This is used to allocate the new data + array for the stacked variable data before iterating over all items, + which can be more memory efficient. + allow_conflicting_attributes : bool, optional + Whether or not to enforce safely checks to require identical attribute + data. If `True`, each variable be applied in turn to determien the new + attributes. + + Returns + ------- + stacked : Variable + Stacked variable formed by stacking all the supplied variables along + the new dimension. The new dimension will be the first dimension in the + stacked variable. + """ + if length is None: + # so much for lazy evaluation! 
we need to figure out how many variables + # there are + variables = list(variables) + length = len(variables) + + i = -1 + for i, var in enumerate(variables): + if i == 0: + new_data = np.empty((length,) + var.shape, dtype=var.dtype) + old_dimensions = var.dimensions + attributes = OrderedDict(var.attributes) + else: + if i == length: + raise ValueError('too many stack variables; supplied length ' + 'was %s' % length) + if var.dimensions != old_dimensions: + raise ValueError('inconsistent dimensions between merge ' + 'variables') + if not allow_conflicting_attributes: + utils.update_safety_check(attributes, var.attributes) + attributes.update(var.attributes) + new_data[i] = var.data + + if i + 1 != length: + raise ValueError('only %s stack variables; supplied length ' + 'was %s' % (i + 1, length)) + + return Variable((dim,) + old_dimensions, new_data, attributes) + + def _broadcast_var_data(self, other): self_data = self.data if isinstance(other, data.Dataset): diff --git a/test/__init__.py b/test/__init__.py index c8cd6a5ebd8..517306bf909 100644 --- a/test/__init__.py +++ b/test/__init__.py @@ -14,3 +14,8 @@ def assertVarNotEqual(self, v1, v2): def assertArrayEqual(self, a1, a2): assert_array_equal(a1, a2) + + +class ReturnItem(object): + def __getitem__(self, key): + return key diff --git a/test/test_data.py b/test/test_data.py index 77c381c34c1..5d5f42885e5 100644 --- a/test/test_data.py +++ b/test/test_data.py @@ -22,9 +22,9 @@ def create_test_data(store=None): obj = Dataset(store=store) - obj.create_dimension('time', 1000) + obj.add_dimension('time', 1000) for d, l in sorted(_dims.items()): - obj.create_dimension(d, l) + obj.add_dimension(d, l) var = obj.create_variable(name=d, dims=(d,), data=np.arange(l, dtype=np.int32), attributes={'units':'integers'}) @@ -83,20 +83,20 @@ def test_iterarray(self): def test_dimension(self): a = Dataset() - a.create_dimension('time', 10) - a.create_dimension('x', 5) + a.add_dimension('time', 10) + a.add_dimension('x', 5) # prevent duplicate creation - self.assertRaises(ValueError, a.create_dimension, 'time', 0) + self.assertRaises(ValueError, a.add_dimension, 'time', 0) # length must be integer - self.assertRaises(ValueError, a.create_dimension, 'foo', 'a') - self.assertRaises(TypeError, a.create_dimension, 'foo', [1,]) - self.assertRaises(ValueError, a.create_dimension, 'foo', -1) + self.assertRaises(ValueError, a.add_dimension, 'foo', 'a') + self.assertRaises(TypeError, a.add_dimension, 'foo', [1,]) + self.assertRaises(ValueError, a.add_dimension, 'foo', -1) self.assertTrue('foo' not in a.dimensions) def test_variable(self): a = Dataset() - a.create_dimension('time', 10) - a.create_dimension('x', 3) + a.add_dimension('time', 10) + a.add_dimension('x', 3) d = np.random.random((10, 3)) a.create_variable(name='foo', dims=('time', 'x',), data=d) self.assertTrue('foo' in a.variables) @@ -142,7 +142,7 @@ def test_coordinate(self): self.assertTrue('x' in a.coordinates) self.assertVarEqual(a.coordinates['x'], a.variables['x']) b = Dataset() - b.create_dimension('x', vec.size) + b.add_dimension('x', vec.size) b.create_variable('x', dims=('x',), data=vec, attributes=attributes) self.assertVarEqual(a['x'], b['x']) self.assertEquals(a.dimensions, b.dimensions) @@ -200,16 +200,10 @@ def test_attributes(self): self.assertRaises(ValueError, b.attributes.__setitem__, 'foo', np.zeros((2, 2))) self.assertRaises(ValueError, b.attributes.__setitem__, 'foo', dict()) - def test_view(self): - data = create_test_data(self.get_store()) - slicedim = _testdim - 
self.assertEqual(data.view(slice(10), slicedim), - data.views({slicedim: slice(10)})) - def test_views(self): data = create_test_data(self.get_store()) slicers = {'dim1': slice(None, None, 2), 'dim2': slice(0, 2)} - ret = data.views(slicers) + ret = data.views(**slicers) # Verify that only the specified dimension was altered self.assertItemsEqual(data.dimensions, ret.dimensions) @@ -237,34 +231,35 @@ def test_views(self): # actual.fill(np.pi) # np.testing.assert_array_equal(expected, actual) - self.assertRaises(ValueError, data.views, - {'not_a_dim': slice(0, 2)}) + with self.assertRaises(ValueError): + data.views(not_a_dim=slice(0, 2)) - ret = data.views({'dim1': 0}) + ret = data.views(dim1=0) self.assertEqual({'time': 1000, 'dim2': 50, 'dim3': 10}, ret.dimensions) - ret = data.views({'time': slice(2), 'dim1': 0, 'dim2': slice(5)}) + ret = data.views(time=slice(2), dim1=0, dim2=slice(5)) self.assertEqual({'time': 2, 'dim2': 5, 'dim3': 10}, ret.dimensions) - ret = data.views({'time': 0, 'dim1': 0, 'dim2': slice(5)}) + ret = data.views(time=0, dim1=0, dim2=slice(5)) self.assertItemsEqual({'dim2': 5, 'dim3': 10}, ret.dimensions) def test_loc_views(self): data = create_test_data(self.get_store()) int_slicers = {'dim1': slice(None, None, 2), 'dim2': slice(0, 2)} loc_slicers = {'dim1': slice(None, None, 2), 'dim2': slice(0, 1)} - self.assertEqual(data.views(int_slicers), data.loc_views(loc_slicers)) + self.assertEqual(data.views(**int_slicers), + data.loc_views(**loc_slicers)) data.create_variable('time', ['time'], np.arange(1000, dtype=np.int32), {'units': 'days since 2000-01-01'}) - self.assertEqual(data.views({'time': 0}), - data.loc_views({'time': '2000-01-01'})) - self.assertEqual(data.views({'time': slice(10)}), - data.loc_views({'time': - slice('2000-01-01', '2000-01-10')})) - self.assertEqual(data, data.loc_views({'time': slice('1999', '2005')})) - self.assertEqual(data.views({'time': slice(3)}), - data.loc_views({'time': - pd.date_range('2000-01-01', periods=3)})) + self.assertEqual(data.views(time=0), + data.loc_views(time='2000-01-01')) + self.assertEqual(data.views(time=slice(10)), + data.loc_views(time=slice('2000-01-01', + '2000-01-10'))) + self.assertEqual(data, data.loc_views(time=slice('1999', '2005'))) + self.assertEqual(data.views(time=slice(3)), + data.loc_views( + time=pd.date_range('2000-01-01', periods=3))) @unittest.skip('obsolete method should be removed') def test_take(self): @@ -371,7 +366,7 @@ def test_merge(self): actual = ds1.merge(ds2) self.assertEqual(expected, actual) with self.assertRaises(ValueError): - ds1.merge(ds2.view(0, 'dim1')) + ds1.merge(ds2.views(dim1=0)) with self.assertRaises(ValueError): ds1.merge(ds2.renamed({'var3': 'var1'})) diff --git a/test/test_dataview.py b/test/test_dataview.py index 48c590fcef0..6adf6ce5e4f 100644 --- a/test/test_dataview.py +++ b/test/test_dataview.py @@ -1,7 +1,7 @@ import numpy as np from scidata import Dataset, DataView, Variable -from . import TestCase +from . 
import TestCase, ReturnItem class TestDataView(TestCase): @@ -32,20 +32,36 @@ def test_properties(self): self.assertArrayEqual(v, self.ds.indices[k]) def test_items(self): - self.assertVarEqual(self.dv[0], self.v[0]) - self.assertEqual(self.dv[0].dataset, self.ds.views({'x': 0})) - self.assertVarEqual(self.dv[:3, :5], self.v[:3, :5]) - self.assertEqual(self.dv[:3, :5].dataset, - self.ds.views({'x': slice(3), 'y': slice(5)})) + # test indexing + x = self.ds['x'] + y = self.ds['y'] + I = ReturnItem() + for i in [I[:], I[...], I[x.data], I[x.variable], I[x], I[x, y], + I[x.data > -1], I[x.variable > -1], I[x > -1], + I[x > -1, y > -1]]: + self.assertVarEqual(self.dv, self.dv[i]) + for i in [I[0], I[:, 0], I[:3, :2], + I[x.data[:3]], I[x.variable[:3]], I[x[:3]], I[x[:3], y[:4]], + I[x.data > 3], I[x.variable > 3], I[x > 3], I[x > 3, y > 3]]: + self.assertVarEqual(self.v[i], self.dv[i]) + # test index self.assertEqual(list(self.dv[0].indices), ['y']) + # test matches views + self.assertEqual(self.dv[0].dataset, self.ds.views(x=0)) + self.assertEqual(self.dv[:3, :5].dataset, + self.ds.views(x=slice(3), y=slice(5))) + + def test_views(self): + self.assertViewEqual(self.dv, self.dv.views(x=slice(None))) + self.assertViewEqual(self.dv[:3], self.dv.views(x=slice(3))) def test_renamed(self): renamed = self.dv.renamed('bar') self.assertEqual(renamed.dataset, self.ds.renamed({'foo': 'bar'})) self.assertEqual(renamed.name, 'bar') - def test_to_dataview(self): - dv = self.ds.to_dataview('foo') + def test_dataset_getitem(self): + dv = self.ds['foo'] self.assertViewEqual(dv, self.dv) def test_math(self): @@ -86,3 +102,16 @@ def test_collapsed(self): self.assertVarEqual(self.dv.collapsed(np.mean, 'x'), self.v.collapsed(np.mean, 'x')) # needs more... + # should check which extra dimensions are dropped + + def test_aggregated_by(self): + agg_var = Variable(['y'], np.array(['a'] * 9 + ['c'] + ['b'] * 7 + + ['c'] * 3)) + self.ds.add_variable('abc', agg_var) + expected_unique, expected_var = \ + self.dv.variable.aggregated_by(np.mean, 'abc', agg_var) + expected = DataView(Dataset( + {'foo': expected_var, 'x': self.ds.variables['x'], + 'abc': expected_unique}), 'foo') + actual = self.dv.aggregated_by(np.mean, 'abc') + self.assertViewEqual(expected, actual) diff --git a/test/test_utils.py b/test/test_utils.py index 0b004b5b1c7..fb8feaea1bd 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -3,12 +3,7 @@ import pandas as pd from scidata import utils -from . import TestCase - - -class ReturnItem(object): - def __getitem__(self, key): - return key +from . import TestCase, ReturnItem class TestIndexers(TestCase): diff --git a/test/test_variable.py b/test/test_variable.py index 7b85149814e..f8ff8e023ca 100644 --- a/test/test_variable.py +++ b/test/test_variable.py @@ -2,7 +2,8 @@ import numpy as np -from scidata import Variable +from scidata import Variable, Dataset +from scidata.variable import stack_variables from . 
import TestCase @@ -45,27 +46,39 @@ def test_repr(self): repr(v)) def test_items(self): - v = Variable(['time', 'x'], self.d) + data = np.random.random((10, 11)) + v = Variable(['x', 'y'], data) + # test slicing self.assertVarEqual(v, v[:]) self.assertVarEqual(v, v[...]) - self.assertVarEqual(Variable(['x'], self.d[0]), v[0]) - self.assertVarEqual( - Variable(['time'], self.d[:, 0]), v[:, 0]) - self.assertVarEqual( - Variable(['time', 'x'], self.d[:3, :2]), v[:3, :2]) - # variables should do orthogonal indexing + self.assertVarEqual(Variable(['y'], data[0]), v[0]) + self.assertVarEqual(Variable(['x'], data[:, 0]), v[:, 0]) + self.assertVarEqual(Variable(['x', 'y'], data[:3, :2]), v[:3, :2]) + # test array indexing + x = Variable(['x'], np.arange(10)) + y = Variable(['y'], np.arange(11)) + self.assertVarEqual(v, v[x.data]) + self.assertVarEqual(v, v[x]) + self.assertVarEqual(v[:3], v[x < 3]) + self.assertVarEqual(v[:, 3:], v[:, y >= 3]) + self.assertVarEqual(v[:3, 3:], v[x < 3, y >= 3]) + self.assertVarEqual(v[:3, :2], v[x[:3], y[:2]]) self.assertVarEqual(v[:3, :2], v[range(3), range(2)]) + # test iteration for n, item in enumerate(v): - self.assertVarEqual(Variable(['x'], self.d[n]), item) + self.assertVarEqual(Variable(['y'], data[n]), item) + # test setting v.data[:] = 0 self.assertTrue(np.all(v.data == 0)) def test_views(self): v = Variable(['time', 'x'], self.d) - self.assertVarEqual(v.views({'time': slice(None)}), v) - self.assertVarEqual(v.views({'time': 0}), v[0]) - self.assertVarEqual(v.views({'time': slice(0, 3)}), v[:3]) - self.assertVarEqual(v.views({'x': 0}), v[:, 0]) + self.assertVarEqual(v.views(time=slice(None)), v) + self.assertVarEqual(v.views(time=0), v[0]) + self.assertVarEqual(v.views(time=slice(0, 3)), v[:3]) + self.assertVarEqual(v.views(x=0), v[:, 0]) + with self.assertRaisesRegexp(ValueError, 'do not exist'): + v.views(not_a_dim=0) def test_transpose(self): v = Variable(['time', 'x'], self.d) @@ -100,11 +113,11 @@ def test_1d_math(self): # verify attributes v2 = Variable(['x'], x, {'units': 'meters'}) self.assertVarEqual(v2, +v2) - self.assertVarEqual(v, 0 + v2) + self.assertVarEqual(v2, 0 + v2) # binary ops with all variables self.assertArrayEqual(v + v, 2 * v) w = Variable(['x'], y, {'foo': 'bar'}) - self.assertVarEqual(v + w, Variable(['x'], x + y, {'foo': 'bar'})) + self.assertVarEqual(v + w, Variable(['x'], x + y)) self.assertArrayEqual((v * w).data, x * y) # something complicated self.assertArrayEqual((v ** 2 * w - 1 + x).data, x ** 2 * y - 1 + x) @@ -157,6 +170,13 @@ def test_inplace_math(self): self.assertIs(v.data, x) self.assertArrayEqual(v.data, np.arange(5) + 1) + def test_array_interface(self): + x = np.arange(5) + v = Variable(['x'], x) + self.assertArrayEqual(np.asarray(v), x) + # test ufuncs + self.assertVarEqual(np.sin(v), Variable(['x'], np.sin(x))) + def test_collapsed(self): v = Variable(['time', 'x'], self.d) # intentionally test with an operation for which order matters @@ -172,3 +192,41 @@ def test_collapsed(self): Variable([], self.d.std(), {'cell_methods': 'time: x: std'})) self.assertVarEqual(v.mean('time'), v.collapsed(np.mean, 'time')) + + def test_aggregated_by(self): + agg_var = Variable(['y'], np.array(['a', 'a', 'b'])) + v = Variable(['x', 'y'], self.d) + expected_unique = Variable(['abc'], np.array(['a', 'b'])) + expected_aggregated = Variable(['x', 'abc'], + np.array([self.d[:, :2].sum(axis=1), + self.d[:, 2:].sum(axis=1)]).T, + {'cell_methods': 'y: sum'}) + actual_unique, actual_aggregated = v.aggregated_by(np.sum, 'abc', agg_var) 
+ self.assertVarEqual(expected_unique, actual_unique) + self.assertVarEqual(expected_aggregated, actual_aggregated) + # should be equivalent to aggregate by a dataview, too + alt_agg_var = Dataset({'abc': agg_var})['abc'] + actual_unique, actual_aggregated = v.aggregated_by(np.sum, 'abc', + alt_agg_var) + self.assertVarEqual(expected_unique, actual_unique) + self.assertVarEqual(expected_aggregated, actual_aggregated) + + def test_stack_variables(self): + x = np.arange(5) + y = np.ones(5) + v = Variable(['a'], x) + w = Variable(['a'], y) + self.assertVarEqual(Variable(['b', 'a'], np.array([x, y])), + stack_variables([v, w], 'b')) + self.assertVarEqual(Variable(['b', 'a'], np.array([x, y])), + stack_variables((v, w), 'b')) + self.assertVarEqual(Variable(['b', 'a'], np.array([x, y])), + stack_variables((v, w), 'b', length=2)) + with self.assertRaisesRegexp(ValueError, 'too many'): + stack_variables([v, w], 'b', length=1) + with self.assertRaisesRegexp(ValueError, r'only \d+ stack'): + stack_variables([v, w, w], 'b', length=4) + with self.assertRaisesRegexp(ValueError, 'inconsistent dimensions'): + stack_variables([v, Variable(['c'], y)], 'b') + + From 0fbea90e0b6886e41a559e78fb7c9a4bcdb15408 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Wed, 5 Feb 2014 14:49:59 -0800 Subject: [PATCH 06/45] Refactored backends for more consistency/flexibility --- src/scidata/backends.py | 109 ++++++++++++-------------- src/scidata/common.py | 18 ----- src/scidata/data.py | 167 ++++++++++------------------------------ src/scidata/dataview.py | 22 ++++-- src/scidata/utils.py | 45 +---------- src/scidata/variable.py | 86 +++++++++++---------- test/test_data.py | 47 ++++++++--- test/test_utils.py | 20 +++-- 8 files changed, 197 insertions(+), 317 deletions(-) diff --git a/src/scidata/backends.py b/src/scidata/backends.py index 8ae24dc3336..cec3a9581fb 100644 --- a/src/scidata/backends.py +++ b/src/scidata/backends.py @@ -1,6 +1,8 @@ -#TODO: refactor this module so all the stores just expose dimension, variables -# and attributes with the OrderedDict API that handle all the storage logic +"""Backend objects for saving and loading data +DataStores provide a uniform interface for saving and loading data in different +formats. They should not be used directly, but rather through Dataset objects. 
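
To make the intended workflow concrete, a short sketch of going through a store from the Dataset side; it uses only calls visible in this patch (load_store, dump_to_store, and the in-memory and scipy stores), with the in-memory store passed explicitly rather than relying on any default:

    from cStringIO import StringIO
    import numpy as np
    from scidata import Dataset, backends

    ds = Dataset(store=backends.InMemoryDataStore())
    ds.add_dimension('x', 3)
    ds.create_variable('foo', ('x',), np.arange(3.0))
    # write the same contents into a scipy.io.netcdf backed store ...
    target = backends.ScipyDataStore(StringIO(), mode='w')
    ds.dump_to_store(target)
    # ... and rebuild a Dataset directly from a store's contents
    restored = Dataset.load_store(target)
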
+""" import netCDF4 as nc4 import numpy as np @@ -8,26 +10,25 @@ from collections import OrderedDict from utils import FrozenOrderedDict +from variable import Variable import conventions -import utils -import variable class AbstractDataStore(object): - def unchecked_set_dimensions(self, dimensions): + def set_dimensions(self, dimensions): """Set the dimensions without checking validity""" for d, l in dimensions.iteritems(): - self.unchecked_set_dimension(d, l) + self.set_dimension(d, l) - def unchecked_set_attributes(self, attributes): + def set_attributes(self, attributes): """Set the attributes without checking validity""" for k, v in attributes.iteritems(): - self.unchecked_set_attribute(k, v) + self.set_attribute(k, v) - def unchecked_set_variables(self, variables): + def set_variables(self, variables): """Set the variables without checking validity""" for vn, v in variables.iteritems(): - self.unchecked_set_variable(vn, v) + self.set_variable(vn, v) class InMemoryDataStore(AbstractDataStore): @@ -41,28 +42,28 @@ def __init__(self): self.variables = OrderedDict() self.attributes = OrderedDict() - def unchecked_set_dimension(self, name, length): + def set_dimension(self, name, length): """Set a dimension length""" self.dimensions[name] = length - def unchecked_set_attribute(self, key, value): + def set_attribute(self, key, value): """Set the attributes without checking validity""" self.attributes[key] = value - def unchecked_set_variable(self, name, variable): + def set_variable(self, name, variable): """Set a variable without checks""" self.variables[name] = variable return self.variables[name] + def del_attribute(self, key): + del self.attributes[key] + def sync(self): pass -class ScipyVariable(variable.Variable): - def __init__(self, scipy_var): - self._dimensions = scipy_var.dimensions - self._data = scipy_var.data - self._attributes = scipy_var._attributes +def convert_scipy_variable(var): + return Variable(var.dimensions, var.data, var._attributes) class ScipyDataStore(AbstractDataStore): @@ -77,7 +78,7 @@ def __init__(self, fobj, *args, **kwdargs): @property def variables(self): - return FrozenOrderedDict((k, ScipyVariable(v)) + return FrozenOrderedDict((k, convert_scipy_variable(v)) for k, v in self.ds.variables.iteritems()) @property @@ -88,7 +89,7 @@ def attributes(self): def dimensions(self): return self.ds.dimensions - def unchecked_set_dimension(self, name, length): + def set_dimension(self, name, length): """Set a dimension length""" if name in self.ds.dimensions: raise ValueError('%s does not support modifying dimensions' @@ -122,11 +123,11 @@ def _cast_attr_value(self, value): raise ValueError("Can not convert to a valid netCDF type") return value - def unchecked_set_attribute(self, key, value): + def set_attribute(self, key, value): self._validate_attr_key(key) setattr(self.ds, key, self._cast_attr_value(value)) - def unchecked_set_variable(self, name, variable): + def set_variable(self, name, variable): """Add a variable without checks""" if name not in self.ds.variables: self.ds.createVariable(name, variable.dtype, variable.dimensions) @@ -135,44 +136,29 @@ def unchecked_set_variable(self, name, variable): for k, v in variable.attributes.iteritems(): self._validate_attr_key(k) setattr(scipy_var, k, self._cast_attr_value(v)) - return ScipyVariable(scipy_var) + return convert_scipy_variable(scipy_var) + + def del_attribute(self, key): + delattr(self.ds, key) def sync(self): self.ds.flush() -class NetCDF4Variable(variable.Variable): - def __init__(self, nc4_variable): - 
self._nc4_variable = nc4_variable - self._dimensions = nc4_variable.dimensions - self._data = nc4_variable - self._attributes = None - - def _remap_indexer(self, key): - # netCDF4-python already does orthogonal indexing, so just expand - # the indexer - return utils.expanded_indexer(key, self.ndim) - - @property - def attributes(self): - if self._attributes is None: - # we don't want to see scale_factor and add_offset in the attributes - # since the netCDF4 package automatically scales the data on read. - # If we kept scale_factor and add_offset around and did this: - # - # foo = ncdf4.Dataset('foo.nc') - # ncdf4.dump(foo, 'bar.nc') - # bar = ncdf4.Dataset('bar.nc') - # - # you would find that any packed variables in the original - # netcdf file would now have been scaled twice! - packing_attributes = ['scale_factor', 'add_offset'] - keys = [k for k in self._nc4_variable.ncattrs() - if not k in packing_attributes] - attr_dict = OrderedDict( - (k, self._nc4_variable.getncattr(k)) for k in keys) - self._attributes = attr_dict - return self._attributes +def convert_nc4_variable(var): + # we don't want to see scale_factor and add_offset in the attributes + # since the netCDF4 package automatically scales the data on read. + # If we kept scale_factor and add_offset around and did this: + # + # foo = ncdf4.Dataset('foo.nc') + # ncdf4.dump(foo, 'bar.nc') + # bar = ncdf4.Dataset('bar.nc') + # + # you would find that any packed variables in the original + # netcdf file would now have been scaled twice! + attr = OrderedDict((k, var.getncattr(k)) for k in var.ncattrs() + if k not in ['scale_factor', 'add_offset']) + return Variable(var.dimensions, var, attr, indexing_mode='orthogonal') class NetCDF4DataStore(AbstractDataStore): @@ -181,7 +167,7 @@ def __init__(self, filename, *args, **kwdargs): @property def variables(self): - return FrozenOrderedDict((k, NetCDF4Variable(v)) + return FrozenOrderedDict((k, convert_nc4_variable(v)) for k, v in self.ds.variables.iteritems()) @property @@ -193,14 +179,14 @@ def attributes(self): def dimensions(self): return FrozenOrderedDict((k, len(v)) for k, v in self.ds.dimensions.iteritems()) - def unchecked_set_dimension(self, name, length): + def set_dimension(self, name, length): """Set a dimension length""" self.ds.createDimension(name, size=length) - def unchecked_set_attribute(self, key, value): + def set_attribute(self, key, value): self.ds.setncatts({key: value}) - def unchecked_set_variable(self, name, variable): + def set_variable(self, name, variable): """Set a variable without checks""" # netCDF4 will automatically assign a fill value # depending on the datatype of the variable. 
Here @@ -215,7 +201,10 @@ def unchecked_set_variable(self, name, variable): nc4_var = self.ds.variables[name] nc4_var[:] = variable.data[:] nc4_var.setncatts(variable.attributes) - return NetCDF4Variable(nc4_var) + return convert_nc4_variable(nc4_var) + + def del_attribute(self, key): + self.ds.delncattr(key) def sync(self): self.ds.sync() diff --git a/src/scidata/common.py b/src/scidata/common.py index 1d0cd40da59..98524452f5f 100644 --- a/src/scidata/common.py +++ b/src/scidata/common.py @@ -2,23 +2,6 @@ class _DataWrapperMixin(object): - @property - def data(self): - """ - The variable's data as a numpy.ndarray - """ - if not isinstance(self._data, np.ndarray): - self._data = np.asarray(self._data[...]) - return self._data - - @data.setter - def data(self, value): - value = np.asarray(value) - if value.shape != self.shape: - raise ValueError("replacement data must match the Variable's " - "shape") - self._data = value - @property def dtype(self): return self._data.dtype @@ -64,7 +47,6 @@ def __array__(self, dtype=None): # data = self.data # return dict(typestr=data.dtype.str, shape=data.shape, data=data) - _collapse_method_docstring = \ """Collapse this {cls}'s data' by applying `{name}` along some dimension(s) diff --git a/src/scidata/data.py b/src/scidata/data.py index c4316869ed7..ae0336a7bb9 100644 --- a/src/scidata/data.py +++ b/src/scidata/data.py @@ -89,7 +89,7 @@ def open_dataset(nc, *args, **kwargs): # If nc is a file-like object we read it using # the scipy.io.netcdf package store = backends.ScipyDataStore(nc, *args, **kwargs) - return Dataset(store=store) + return Dataset.load_store(store) class _IndicesCache(MutableMapping): @@ -161,15 +161,16 @@ class Dataset(object): attributes : {key: value, ...} indices : {dimension: index, ...} Mapping from dimensions to pandas.Index objects. - store : baackends.*DataStore + store : backends.*DataStore """ def __init__(self, variables=None, dimensions=None, attributes=None, - store=None, indices=None): + indices=None, store=None): """ If dimensions are not provided, they are inferred from the variables. - Otherwise, variables and dimensions are only checked for consistency - if check_dimensions=True. + Only set a store if you want to Dataset operations to modify stored + data in-place. Otherwise, load data from a store using the + `open_dataset` function or the `from_store` class method. 
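To make the two construction styles concrete, here is a minimal sketch (it assumes Dataset and the backends module are importable as laid out in this patch; the 'time' dimension is only illustrative):

    from scidata import backends
    from scidata.data import Dataset

    store = backends.InMemoryDataStore()
    store.set_dimension('time', 10)

    # copy the store's contents into a fresh, independent in-memory dataset
    ds_copy = Dataset.load_store(store)

    # keep a live handle on the store: later add_* / set_* calls on ds_live
    # write straight through to `store`
    ds_live = Dataset(store=store)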
""" # TODO: fill out this docstring if store is None: @@ -177,17 +178,17 @@ def __init__(self, variables=None, dimensions=None, attributes=None, self.store = store if attributes is not None: - store.unchecked_set_attributes(attributes) + store.set_attributes(attributes) if dimensions is not None: - store.unchecked_set_dimensions(dimensions) + store.set_dimensions(dimensions) if variables is not None: if dimensions is None: - store.unchecked_set_dimensions(construct_dimensions(variables)) + store.set_dimensions(construct_dimensions(variables)) else: check_dims_and_vars_consistency(dimensions, variables) - store.unchecked_set_variables(variables) + store.set_variables(variables) if indices is None: indices = {} @@ -197,6 +198,10 @@ def __init__(self, variables=None, dimensions=None, attributes=None, raise ValueError('inconsistent index %r' % k) self._indices = _IndicesCache(self, indices) + @classmethod + def load_store(cls, store): + return cls(store.variables, store.dimensions, store.attributes) + def _create_index(self, dim): if dim in self.variables: var = self.variables[dim] @@ -217,9 +222,6 @@ def _create_index(self, dim): def indices(self): return self._indices - def sync(self): - return self.store.sync() - @property def variables(self): return self.store.variables @@ -300,32 +302,28 @@ def noncoordinates(self): for (name, v) in self.variables.iteritems() if name not in self.coordinates]) - def stored_to(self, store): - """ - Store dataset contents to a backends.*DataStore object and return a new - dataset with the contents of the store - """ + def dump_to_store(self, store): + """Store dataset contents to a backends.*DataStore object""" target = type(self)(self.variables, self.dimensions, self.attributes, store=store, indices=self.indices.cache) target.store.sync() return target def dump(self, filepath, *args, **kwdargs): - """ - Dump dataset contents to a location on disk using the netCDF4 package + """Dump dataset contents to a location on disk using the netCDF4 + package """ nc4_store = backends.NetCDF4DataStore(filepath, mode='w', *args, **kwdargs) - self.stored_to(nc4_store) + self.dump_to_store(nc4_store) def dumps(self): - """ - Serialize dataset contents to a string. The serialization creates an + """Serialize dataset contents to a string. The serialization creates an in memory netcdf version 3 string using the scipy.io.netcdf package. """ fobj = StringIO() scipy_store = backends.ScipyDataStore(fobj, mode='w') - self.stored_to(scipy_store) + self.dump_to_store(scipy_store) return fobj.getvalue() def __str__(self): @@ -443,7 +441,7 @@ def add_dimension(self, name, length): length = int(length) if length < 0: raise ValueError('length must be non-negative') - self.store.unchecked_set_dimension(name, int(length)) + self.store.set_dimension(name, length) def add_variable(self, name, var): """Add a variable to the dataset @@ -488,10 +486,10 @@ def add_coordinate(self, var): if var.ndim != 1: raise ValueError("coordinate data must be 1-dimensional (vector)") if name not in self.dimensions: - self.store.unchecked_set_dimension(name, var.size) + self.store.set_dimension(name, var.size) elif self.dimensions[name] != var.size: raise ValueError('dimension already exists with different length') - return self.store.unchecked_set_variable(name, var) + return self.store.set_variable(name, var) def set_variable(self, name, var): """Set a variable in the dataset @@ -513,7 +511,7 @@ def set_variable(self, name, var): The variable object in the underlying datastore. 
""" check_dims_and_vars_consistency(self.dimensions, {name: var}) - new_var = self.store.unchecked_set_variable(name, var) + new_var = self.store.set_variable(name, var) if name in self.indices: self.indices.build_index(name) return new_var @@ -621,7 +619,7 @@ def renamed(self, name_dict): return type(self)(variables, dimensions, self.attributes, indices=indices) - def merge(self, other): + def merge(self, other, inplace=False): """Merge two datasets into a single new dataset This method generally not allow for overriding data. Variables, @@ -632,50 +630,19 @@ def merge(self, other): ---------- other : Dataset Dataset to merge with this dataset. + inplace : bool, optional + If True, merge the other dataset into this dataset in-place. Returns ------- - Dataset - New dataset with the merged contents of both datasets + merged : Dataset + Merged dataset. Raises ------ ValueError - If any variables, dimensions or attributes conflict. - - See Also - -------- - Dataset.update : update a dataset in place - """ - new_vars = utils.safe_merge(self.variables, other.variables, - compat=utils.variable_equal) - new_dims = utils.safe_merge(self.dimensions, other.dimensions) - new_attr = utils.ordered_dict_intersection(self.attributes, - other.attributes) - new_indices = utils.safe_merge(self.indices.cache, other.indices.cache, - compat=np.array_equal) - return type(self)(new_vars, new_dims, new_attr, indices=new_indices) - - def update(self, other): - """Update this dataset in place with the contents of another dataset - - Unlike `dict.update`, this method generally not allow for overriding - data. Variables, dimensions and indices are checked for conflicts. - However, conflicting attributes are removed. - - Parameters - ---------- - other : Dataset - Dataset with which to update this dataset. - - Raises - ------ - ValueError - If any variables, dimensions or attributes conflict. - - See Also - -------- - Dataset.merge : merge two datasets into a new dataset + If any variables or dimensions conflict. Conflicting attributes + are silently dropped. """ # check for conflicts utils.update_safety_check(self.variables, other.variables, @@ -684,11 +651,15 @@ def update(self, other): utils.update_safety_check(self.indices.cache, other.indices.cache, compat=np.array_equal) # update contents - self.variables.update(other.variables) - self.dimensions.update(other.dimensions) - self.indices.update(other.indices.cache) - # override attributes without checking for compatibility - utils.remove_incompatible_items(self.attributes, other.attributes) + obj = self if inplace else self.copy() + obj.store.set_variables(other.variables) + obj.store.set_dimensions(other.dimensions) + obj._indices.update(other.indices.cache) + # remove conflicting attributes + for k, v in other.attributes.iteritems(): + if k in self.attributes and not v != self.attributes[k]: + obj.store.del_attribute(k) + return obj def select(self, *names): """Returns a new dataset that contains the named variables @@ -789,29 +760,6 @@ def replace(self, name, variable): ds.add_variable(name, variable) return ds - def to_dataview(self, name, extra_variables=None): - """Return a dataview selected from this dataset - - Parameters - ---------- - name : str - Name of variable on which to orient (and name) the new dataview. - extra_variables : sequence of str, optional - Additional variables from this dataset to include in the dataview's - dataset. These variables's coordinates (if any) are also included. 
- By default, the dataview's dataset only includes `name` and all of - its coordinate variables. - - Returns - ------- - dataview : DataView - Dataview with a selection of variables from this dataset and the - name `name`. - """ - if extra_variables is None: - extra_variables = [] - return DataView(self.select(*([name] + list(extra_variables))), name) - def iterator(self, dim=None, views=False): """Iterator along a data dimension @@ -984,39 +932,6 @@ def iterarray(self, var, dim=None): slicer[axis] = slice(i, i + 1) yield (None, data[slicer]) - def squeeze(self, dimension): - """ - Squeezes a dimension of length 1, returning a copy of the object - with that dimension removed. - """ - if self.dimensions[dimension] != 1: - raise ValueError(("Can only squeeze along dimensions with" + - "length one, %s has length %d") % - (dimension, self.dimensions[dimension])) - # Create a new Data instance - obj = type(self)() - # Copy dimensions - for (name, length) in self.dimensions.iteritems(): - if not name == dimension: - obj.create_dimension(name, length) - # Copy variables - for (name, var) in self.variables.iteritems(): - if not name == dimension: - dims = list(var.dimensions) - data = var.data - if dimension in dims: - shape = list(var.data.shape) - index = dims.index(dimension) - shape.pop(index) - dims.pop(index) - data = data.reshape(shape) - obj.create_variable(name=name, - dims=tuple(dims), - data=data, - attributes=var.attributes.copy()) - obj.store.unchecked_set_attributes(self.attributes.copy()) - return obj - if __name__ == "__main__": """ diff --git a/src/scidata/dataview.py b/src/scidata/dataview.py index bfc0b5b5028..710458a7729 100644 --- a/src/scidata/dataview.py +++ b/src/scidata/dataview.py @@ -59,13 +59,18 @@ def variable(self): def variable(self, value): self.dataset.set_variable(self.name, value) - # _data and _data.setter are necessary for _DataWrapperMixin + # _data is necessary for _DataWrapperMixin @property def _data(self): return self.variable._data - @_data.setter - def _data(self, value): - self.variable._data = value + + @property + def data(self): + """The dataview's data as a numpy.ndarray""" + return self.variable.data + @data.setter + def data(self, value): + self.variable.data = value @property def dimensions(self): @@ -251,9 +256,10 @@ def aggregated_by(self, func, new_dim_name, **kwargs): func, new_dim_name, agg_var, **kwargs) # TODO: add options for how to summarize variables along aggregated # dimensions instead of just dropping them - drop = ({self.name, new_dim_name} | + drop = ({self.name} | + ({new_dim_name} if new_dim_name in self.dataset else set()) | {k for k, v in self.dataset.variables.iteritems() - if any(dim in agg_var.dimensions for dim in v.dimensions)}) + if any(dim in agg_var.dimensions for dim in v.dimensions)}) ds = self.dataset.unselect(*drop) ds.add_coordinate(unique) ds.add_variable(self.name, aggregated) @@ -287,7 +293,7 @@ def func(self, other): if not reflexive else f(other_variable, self.variable)) if hasattr(other, 'unselected'): - dv.dataset.update(other.unselected()) + dv.dataset.merge(other.unselected(), inplace=True) return dv return func @@ -299,7 +305,7 @@ def func(self, other): other_variable = getattr(other, 'variable', other) self.variable = f(self.variable, other_variable) if hasattr(other, 'unselected'): - self.dataset.update(other.unselected()) + self.dataset.merge(other.unselected(), inplace=True) return self return func diff --git a/src/scidata/utils.py b/src/scidata/utils.py index 901cba81953..c70ee4363f6 100644 --- 
a/src/scidata/utils.py +++ b/src/scidata/utils.py @@ -129,49 +129,6 @@ def update_safety_check(first_dict, second_dict, compat=operator.eq): 'overriding values') -def safe_update(first_dict, second_dict, compat=operator.eq): - """Safely update a dictionary with another dictionary - - Raises ValueError if dictionaries have non-compatible values for any key, - where compatibility is determined by the `compat` function. - - Parameters - ---------- - first_dict, second_dict : dict-like - Mappings to merge. The first dictionary is modified in place. - compat : function, optional - Binary operator to determine if two values are compatible. By default, - checks for equality. - """ - update_safety_check(first_dict, second_dict, compat=compat) - first_dict.update(second_dict) - - -def safe_merge(first_dict, second_dict, compat=operator.eq): - """Safely merge two dictionaries into a new OrderedDict - - Raises ValueError if dictionaries have non-compatible values for any key, - where compatibility is determined by the `compat` function. - - Parameters - ---------- - first_dict, second_dict : dict-like - Mappings to merge. - compat : function, optional - Binary operator to determine if two values are compatible. By default, - checks for equality. - - Returns - ------- - merged : OrderedDict - Merged contents. - """ - update_safety_check(first_dict, second_dict, compat=compat) - new_dict = OrderedDict(first_dict) - new_dict.update(second_dict) - return new_dict - - def remove_incompatible_items(first_dict, second_dict, compat=operator.eq): """Remove incompatible items from the first dictionary in-place @@ -188,7 +145,7 @@ def remove_incompatible_items(first_dict, second_dict, compat=operator.eq): """ for k, v in second_dict.iteritems(): if k in first_dict and not compat(v, first_dict[k]): - del k + del first_dict[k] def ordered_dict_intersection(first_dict, second_dict, compat=operator.eq): diff --git a/src/scidata/variable.py b/src/scidata/variable.py index d1b1d52645e..845334758d0 100644 --- a/src/scidata/variable.py +++ b/src/scidata/variable.py @@ -1,4 +1,3 @@ -import copy import functools import warnings from collections import OrderedDict @@ -22,7 +21,7 @@ def _as_compatible_data(data): required = ['dtype', 'shape', 'size', 'ndim'] if not all(hasattr(data, attr) for attr in required): warnings.warn('converting data to np.ndarray because %s lacks some of ' - 'the necesssary attributes for lazy use' + 'the necesssary attributes for direct use' % type(data).__name__, RuntimeWarning, stacklevel=3) data = np.asarray(data) return data @@ -34,7 +33,7 @@ class Variable(_DataWrapperMixin): which describe a single varRiable. A single variable object is not fully described outside the context of its parent Dataset. 
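A short, hedged sketch of how such a variable is built and indexed (assuming Variable is importable as shown here; dimension names are arbitrary). Array indexers are remapped so that indexing stays orthogonal rather than numpy-style "fancy":

    import numpy as np
    from scidata.variable import Variable

    v = Variable(('x', 'y'), np.arange(12).reshape(3, 4), {'units': 'K'})
    block = v[[0, 2], [1, 3]]
    # orthogonal indexing selects a 2x2 block, not two scattered points:
    # block.dimensions == ('x', 'y') and block.shape == (2, 2)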
""" - def __init__(self, dims, data, attributes=None): + def __init__(self, dims, data, attributes=None, indexing_mode='numpy'): data = _as_compatible_data(data) if len(dims) != data.ndim: raise ValueError('data must have same shape as the number of ' @@ -44,6 +43,23 @@ def __init__(self, dims, data, attributes=None): if attributes is None: attributes = {} self._attributes = OrderedDict(attributes) + self._indexing_mode = indexing_mode + + @property + def data(self): + """The variable's data as a numpy.ndarray""" + if not isinstance(self._data, np.ndarray): + self._data = np.asarray(self._data[...]) + self._indexing_mode = 'numpy' + return self._data + + @data.setter + def data(self, value): + value = np.asarray(value) + if value.shape != self.shape: + raise ValueError("replacement data must match the Variable's " + "shape") + self._data = value @property def dimensions(self): @@ -59,7 +75,8 @@ def _remap_indexer(self, key): utils.orthogonal_indexer """ key = utils.expanded_indexer(key, self.ndim) - if any(not isinstance(k, (int, slice)) for k in key): + if (self._indexing_mode == 'numpy' + and any(not isinstance(k, (int, slice)) for k in key)): # key would trigger fancy indexing key = utils.orthogonal_indexer(key, self.shape) return key @@ -87,16 +104,16 @@ def __getitem__(self, key): new_data = self._data[key] # orthogonal indexing should ensure the dimensionality is consistent assert new_data.ndim == len(dimensions) - # always return a Variable, because Variable subtypes may have - # different constructors and may not make sense without an attached - # datastore - return Variable(dimensions, new_data, self.attributes) + # return a variable with the same indexing_mode, because data should + # still be the same type as _data + return type(self)(dimensions, new_data, self.attributes, + indexing_mode=self._indexing_mode) def __setitem__(self, key, value): """__setitem__ is overloaded to access the underlying numpy data with orthogonal indexing (see __getitem__ for more details) """ - self.data[self._remap_indexer(key)] = value + self._data[self._remap_indexer(key)] = value def __iter__(self): for n in range(len(self)): @@ -107,33 +124,25 @@ def attributes(self): return self._attributes def copy(self): - """ - Returns a shallow copy of the current object. + """Returns a shallow copy of the current object. The data array is + always loaded into memory. """ return self.__copy__() def _copy(self, deepcopy=False): - # deepcopies should always be of a numpy view of the data, not the data - # itself, because non-memory backends don't necessarily have deepcopy - # defined sensibly (this is a problem for netCDF4 variables) - data = copy.deepcopy(self.data) if deepcopy else self._data + # np.array always makes a copy + data = np.array(self._data) if deepcopy else self.data # note: # dimensions is already an immutable tuple # attributes will be copied when the new Variable is created - return Variable(self.dimensions, data, self.attributes) + return type(self)(self.dimensions, data, self.attributes) def __copy__(self): - """ - Returns a shallow copy of the current object. - """ return self._copy(deepcopy=False) def __deepcopy__(self, memo=None): - """ - Returns a deep copy of the current object. 
- - memo does nothing but is required for compatability with copy.deepcopy - """ + # memo does nothing but is required for compatability with + # copy.deepcopy return self._copy(deepcopy=True) # mutable objects should not be hashable @@ -227,7 +236,7 @@ def transpose(self, *dimensions): dimensions = self.dimensions[::-1] axes = [dimensions.index(dim) for dim in self.dimensions] data = self.data.transpose(*axes) - return Variable(dimensions, data, self.attributes) + return type(self)(dimensions, data, self.attributes) def collapsed(self, func, dimension=None, axis=None, **kwargs): """Collapse this variable by applying `func` along some dimension(s) @@ -277,7 +286,7 @@ def collapsed(self, func, dimension=None, axis=None, **kwargs): for dim in dimension: var = var._collapsed(func, dim, **kwargs) else: - var = Variable([], func(self.data, **kwargs), self.attributes) + var = type(self)([], func(self.data, **kwargs), self.attributes) var._append_to_cell_methods(': '.join(self.dimensions) + ': ' + func.__name__) return var @@ -295,7 +304,7 @@ def _collapsed(self, f, dim, **kwargs): dims = tuple(dim for i, dim in enumerate(self.dimensions) if axis not in [i, i - self.ndim]) data = f(self.data, axis=axis, **kwargs) - new_var = Variable(dims, data, self.attributes) + new_var = type(self)(dims, data, self.attributes) new_var._append_to_cell_methods(self.dimensions[axis] + ': ' + f.__name__) return new_var @@ -327,11 +336,11 @@ def aggregated_by(self, func, new_dim_name, groups, **kwargs): """ if groups.ndim != 1: # TODO: remove this limitation? - raise ValueError('aggregation variables must be 1 dimensional') + raise ValueError('group variables must be 1 dimensional') dim = groups.dimensions[0] axis = self.dimensions.index(dim) if groups.size != self.shape[axis]: - raise ValueError('the aggregation variable\'s length does not ' + raise ValueError('the group variable\'s length does not ' 'match the length of this variable along its ' 'dimension') unique_values = np.unique(groups.data) @@ -340,17 +349,17 @@ def aggregated_by(self, func, new_dim_name, groups, **kwargs): for u in unique_values] stacked = stack_variables(aggregated, new_dim_name, unique_values.size) ordered_dims = [new_dim_name if d == dim else d for d in self.dimensions] - unique = Variable([new_dim_name], unique_values) + unique = type(self)([new_dim_name], unique_values) return unique, stacked.transpose(*ordered_dims) def __array_wrap__(self, result): - return Variable(self.dimensions, result, self.attributes) + return type(self)(self.dimensions, result, self.attributes) @staticmethod def _unary_op(f): @functools.wraps(f) def func(self): - return Variable(self.dimensions, f(self.data), self.attributes) + return type(self)(self.dimensions, f(self.data), self.attributes) return func @staticmethod @@ -368,7 +377,7 @@ def func(self, other): other.attributes) else: new_attr = self.attributes - return Variable(new_dims, new_data, new_attr) + return type(self)(new_dims, new_data, new_attr) return func @staticmethod @@ -388,8 +397,7 @@ def func(self, other): ops.inject_special_operations(Variable) -def stack_variables(variables, dim, length=None, - allow_conflicting_attributes=False): +def stack_variables(variables, dim, length=None): """Stack variables along a new dimension Parameters @@ -402,10 +410,6 @@ def stack_variables(variables, dim, length=None, Length of the new dimension. This is used to allocate the new data array for the stacked variable data before iterating over all items, which can be more memory efficient. 
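For orientation, a minimal sketch of the grouping behaviour that aggregated_by builds on top of stack_variables (semantics as described in the docstrings above; the variable names and values are invented):

    import numpy as np
    from scidata.variable import Variable

    temp = Variable(('time',), np.array([10.0, 20.0, 30.0, 40.0]))
    month = Variable(('time',), np.array([1, 1, 2, 2]))

    # group `temp` by the values of `month` and average within each group
    unique, mean_by_month = temp.aggregated_by(np.mean, 'month', month)
    # unique should be a Variable over the new 'month' dimension ([1, 2]);
    # mean_by_month should hold the per-group means ([15.0, 35.0])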
- allow_conflicting_attributes : bool, optional - Whether or not to enforce safely checks to require identical attribute - data. If `True`, each variable be applied in turn to determien the new - attributes. Returns ------- @@ -433,9 +437,7 @@ def stack_variables(variables, dim, length=None, if var.dimensions != old_dimensions: raise ValueError('inconsistent dimensions between merge ' 'variables') - if not allow_conflicting_attributes: - utils.update_safety_check(attributes, var.attributes) - attributes.update(var.attributes) + utils.remove_incompatible_items(attributes, var.attributes) new_data[i] = var.data if i + 1 != length: diff --git a/test/test_data.py b/test/test_data.py index 5d5f42885e5..b6bf876ad2a 100644 --- a/test/test_data.py +++ b/test/test_data.py @@ -21,7 +21,7 @@ _testdim = sorted(_dims.keys())[0] def create_test_data(store=None): - obj = Dataset(store=store) + obj = Dataset() if store is None else Dataset.load_store(store) obj.add_dimension('time', 1000) for d, l in sorted(_dims.items()): obj.add_dimension(d, l) @@ -38,7 +38,7 @@ class DataTest(TestCase): #TODO: test constructor def get_store(self): - return None + return backends.InMemoryDataStore() def test_repr(self): data = create_test_data(self.get_store()) @@ -261,6 +261,19 @@ def test_loc_views(self): data.loc_views( time=pd.date_range('2000-01-01', periods=3))) + def test_variable_indexing(self): + data = create_test_data(self.get_store()) + v = data['var1'] + d1 = data['dim1'] + d2 = data['dim2'] + self.assertVarEqual(v, v[d1.data]) + self.assertVarEqual(v, v[d1]) + self.assertVarEqual(v[:3], v[d1 < 3]) + self.assertVarEqual(v[:, 3:], v[:, d2 >= 3]) + self.assertVarEqual(v[:3, 3:], v[d1 < 3, d2 >= 3]) + self.assertVarEqual(v[:3, :2], v[d1[:3], d2[:2]]) + self.assertVarEqual(v[:3, :2], v[range(3), range(2)]) + @unittest.skip('obsolete method should be removed') def test_take(self): data = create_test_data(self.get_store()) @@ -370,6 +383,17 @@ def test_merge(self): with self.assertRaises(ValueError): ds1.merge(ds2.renamed({'var3': 'var1'})) + def test_virtual_variables(self): + # need to fill this out + pass + + def test_write_store(self): + expected = create_test_data() + store = self.get_store() + expected.dump_to_store(store) + actual = Dataset.load_store(store) + self.assertEquals(expected, actual) + class NetCDF4DataTest(DataTest): def get_store(self): @@ -392,18 +416,17 @@ def test_repr(self): pass -class StoreTest(TestCase): - def test_stored_to_consistency(self): - store = backends.InMemoryDataStore() - expected = create_test_data(store) +# class StoreTest(TestCase): +# def test_store_consistency(self): +# mem_ds = create_test_data() - mem_nc = deepcopy(expected) - self.assertTrue(isinstance(mem_nc.store, backends.InMemoryDataStore)) +# fobj = StringIO() +# store = backends.ScipyDataStore(fobj, 'w') +# store = self.get_store() +# mem_ds.dump_to_store() - fobj = StringIO() - store = backends.ScipyDataStore(fobj, 'w') - actual = mem_nc.stored_to(store) - self.assertTrue(actual == expected) +# stored_ds = Dataset.load_store(store) +# self.assertEquals(mem_ds, stored_ds) if __name__ == "__main__": diff --git a/test/test_utils.py b/test/test_utils.py index fb8feaea1bd..497b16d2a3d 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -69,16 +69,22 @@ def test(self): self.assertArrayEqual(expected, actual) -class TestSafeMerge(TestCase): +class TestDictionaryChecks(TestCase): def setUp(self): self.x = {'a': 'A', 'b': 'B'} self.y = {'c': 'C', 'b': 'B'} + self.z = {'a': 'Z'} - def test_good_merge(self): - 
actual = utils.safe_merge(self.x, self.y) - self.x.update(self.y) - self.assertEqual(self.x, actual) + def test_safe(self): + # should not raise exception: + utils.update_safety_check(self.x, self.y) - def test_bad_merge(self): + def test_unsafe(self): with self.assertRaises(ValueError): - utils.safe_merge(self.x, {'a': 'Z'}) + utils.update_safety_check(self.x, self.z) + + def test_ordered_dict_intersection(self): + self.assertEquals({'a': 'A', 'b': 'B'}, + utils.ordered_dict_intersection(self.x, self.y)) + self.assertEquals({'b': 'B'}, + utils.ordered_dict_intersection(self.x, self.z)) From 8c5ba8fc12df0f2e9349e15d55f5ccbb11e2d339 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Wed, 5 Feb 2014 14:50:50 -0800 Subject: [PATCH 07/45] Added virtual variables --- src/scidata/data.py | 61 ++- src/scidata/dataset.py | 903 ++++++++++++++++++++++++++++++++++++++++ src/scidata/dataview.py | 9 +- 3 files changed, 967 insertions(+), 6 deletions(-) create mode 100644 src/scidata/dataset.py diff --git a/src/scidata/data.py b/src/scidata/data.py index ae0336a7bb9..5bb69221453 100644 --- a/src/scidata/data.py +++ b/src/scidata/data.py @@ -141,6 +141,13 @@ def __repr__(self): % (type(self).__name__, contents)) +# list of attributes of pd.DatetimeIndex that are ndarrays of time info +_DATETIMEINDEX_COMPONENTS = ['year', 'month', 'day', 'hour', 'minute', + 'second', 'microsecond', 'nanosecond', 'date', + 'time', 'dayofyear', 'weekofyear', 'dayofweek', + 'quarter'] + + class Dataset(object): """ A netcdf-like data object consisting of dimensions, variables and @@ -257,8 +264,55 @@ def __contains__(self, key): def __iter__(self): return iter(self.variables) + @property + def _unique_datetimeindex(self): + time_indices = [k for k, v in self.indices.iteritems() + if isinstance(v, pd.DatetimeIndex)] + if len(time_indices) == 1: + return time_indices[0] + else: + return None + + def _get_virtual_variable(self, key): + if key in self.indices: + return Variable([key], self.indices[key].values) + time = self._unique_datetimeindex + if time is not None: + if key in _DATETIMEINDEX_COMPONENTS: + return Variable([time], getattr(self.indices[time], key)) + elif key == 'season': + seasons = np.array(['DJF', 'MAM', 'JJA', 'SON']) + month = self.indices[time].month + return Variable([time], seasons[(month // 3) % 4]) + raise ValueError('virtual variable %r not found' % key) + + def _get_virtual_dataview(self, key): + virtual_var = self._get_virtual_variable(key) + new_vars = OrderedDict(self.variables.items() + [(key, virtual_var)]) + ds = type(self)(new_vars, self.dimensions, self.attributes, + indices=self.indices.cache) + return DataView(ds, key) + + @property + def virtual_variables(self): + """Variables that don't exist in this dataset but for which dataviews + could be created on demand (because they can be calculated from other + dataset variables or dimensions) + """ + possible_vars = list(self.dimensions) + if self._unique_datetimeindex is not None: + possible_vars += _DATETIMEINDEX_COMPONENTS + ['season'] + return tuple(k for k in possible_vars if k not in self) + def __getitem__(self, key): - return DataView(self.select(key), key) + if key not in self.variables: + try: + return self._get_virtual_dataview(key) + except ValueError: + raise KeyError('dataset contains no variable with name %r ' + % key) + else: + return DataView(self.select(key), key) #TODO: add keys, items, and values methods (and the iter versions) to # complete the dict analogy? 
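A hedged sketch of how these virtual variables surface in practice (module paths assumed from this patch; it relies on a 'time' coordinate whose units let _create_index build a pandas DatetimeIndex):

    import numpy as np
    from scidata.data import Dataset
    from scidata.variable import Variable

    ds = Dataset()
    ds.add_coordinate(Variable(('time',), np.arange(365),
                               {'units': 'days since 2000-01-01'}))

    ds.virtual_variables   # includes 'year', 'month', ..., 'season'
    ds['season']           # DataView of labels 'DJF' / 'MAM' / 'JJA' / 'SON'
    ds['dayofyear']        # DataView of integers along 'time'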
@@ -571,9 +625,8 @@ def views(self, **slicers): def _loc_to_int_indexer(self, dim, locations): index = self.indices[dim] if isinstance(locations, slice): - tmp_slice = index.slice_indexer(locations.start, locations.stop) - # assume step-size is valid unchanged - indexer = slice(tmp_slice.start, tmp_slice.stop, locations.step) + indexer = index.slice_indexer(locations.start, locations.stop, + locations.step) else: try: indexer = index.get_loc(locations) diff --git a/src/scidata/dataset.py b/src/scidata/dataset.py new file mode 100644 index 00000000000..71f094c427a --- /dev/null +++ b/src/scidata/dataset.py @@ -0,0 +1,903 @@ +# TODO Use various backend data stores. pytable, ncdf4, scipy.io, iris, memory +import os +import numpy as np +import netCDF4 as nc4 +import pandas as pd + +from cStringIO import StringIO +from collections import OrderedDict, MutableMapping + +from dataview import DataView +from utils import FrozenOrderedDict, Frozen +from variable import Variable +import backends, conventions, utils + +date2num = nc4.date2num +num2date = nc4.num2date + + +def construct_dimensions(variables): + """ + Given a dictionary of variables, construct a dimensions mapping + + Parameters + ---------- + variables : mapping + Mapping from variable names to Variable objects. + + Returns + ------- + dimensions : mapping + Mapping from dimension names to lengths. + + Raises + ------ + ValueError if variable dimensions are inconsistent. + """ + dimensions = OrderedDict() + for k, var in variables.iteritems(): + for dim, length in zip(var.dimensions, var.shape): + if dim not in dimensions: + dimensions[dim] = length + elif dimensions[dim] != length: + raise ValueError('dimension %r on variable %r has length %s ' + 'but already is saved with length %s' % + (dim, k, length, dimensions[dim])) + return dimensions + + +def check_dims_and_vars_consistency(dimensions, variables): + """ + Validate dimensions and variables are consistent + + Parameters + ---------- + dimensions : mapping + Mapping from dimension names to lengths. + variables : mapping + Mapping from variable names to Variable objects. + + Raises + ------ + ValueError if variable dimensions are inconsistent with the provided + dimensions. + """ + for k, var in variables.iteritems(): + if k in dimensions and var.ndim != 1: + raise ValueError('a coordinate variable must be defined with ' + '1-dimensional data') + for dim, length in zip(var.dimensions, var.shape): + if dim not in dimensions: + raise ValueError('dimension %r on variable %r is not one ' + 'of the dataset dimensions %r' % + (dim, k, list(dimensions))) + elif dimensions[dim] != length: + raise ValueError('dimension %r on variable %r has length ' + '%s but in on the dataset has length %s' % + (dim, k, length, dimensions[dim])) + + +def open_dataset(nc, *args, **kwargs): + #TODO: add tests for this function + # move this to a classmethod Dataset.open? 
+ if isinstance(nc, basestring) and not nc.startswith('CDF'): + # If the initialization nc is a string and it doesn't + # appear to be the contents of a netcdf file we load + # it using the netCDF4 package + store = backends.NetCDF4DataStore(nc, *args, **kwargs) + else: + # If nc is a file-like object we read it using + # the scipy.io.netcdf package + store = backends.ScipyDataStore(nc, *args, **kwargs) + return Dataset.load_store(store) + + +class _IndicesCache(MutableMapping): + """Cache for Dataset indices""" + def __init__(self, dataset, cache=None): + self.dataset = dataset + self.cache = {} if cache is None else dict(cache) + # for performance reasons, we could remove this: + self.sync() + + def build_index(self, key): + """Cache the index for the dimension 'key'""" + self.cache[key] = self.dataset._create_index(key) + + def sync(self): + """Cache indices for all dimensions in this dataset""" + for key in self.dataset.dimensions: + self.build_index(key) + + def __getitem__(self, key): + if not key in self.cache: + assert key in self.dataset.dimensions + self.build_index(key) + return self.cache[key] + + def __setitem__(self, key, value): + self.cache[key] = value + + def __delitem__(self, key): + del self.cache[key] + + def __iter__(self): + return iter(self.dataset.dimensions) + + def __len__(self): + return len(self.dataset.dimensions) + + def __contains__(self, key): + return key in self.dataset.dimensions + + def __repr__(self): + contents = '\n'.join("'%s': %s" % + (k, str(v).replace( + '\n', '\n' + ' ' * (len(k) + 4))) + for k, v in self.items()) + return ("\n%s" + % (type(self).__name__, contents)) + + +# list of attributes of pd.DatetimeIndex that are ndarrays of time info +_DATETIMEINDEX_COMPONENTS = ['year', 'month', 'day', 'hour', 'minute', + 'second', 'microsecond', 'nanosecond', 'date', + 'time', 'dayofyear', 'weekofyear', 'dayofweek', + 'quarter'] + + +class Dataset(object): + """ + A netcdf-like data object consisting of dimensions, variables and + attributes which together form a self describing data set + + Datasets are containers of variable name. Getting an item from a Dataset + returns a DataView focused on that variable. + + Attributes + ---------- + dimensions : {name: length, ...} + variables : {name: variable, ...} + coordinates : {name: variable, ...} + Coordinates are simply variables that are also dimensions. They must + all have dimension 1. + noncoordinates : {name: variable, ...} + Variables that are not coordinates. + attributes : {key: value, ...} + indices : {dimension: index, ...} + Mapping from dimensions to pandas.Index objects. + store : backends.*DataStore + """ + def __init__(self, variables=None, dimensions=None, attributes=None, + indices=None, store=None): + """ + If dimensions are not provided, they are inferred from the variables. + + Only set a store if you want to Dataset operations to modify stored + data in-place. Otherwise, load data from a store using the + `open_dataset` function or the `from_store` class method. 
+ """ + # TODO: fill out this docstring + if store is None: + store = backends.InMemoryDataStore() + self.store = store + + if attributes is not None: + store.set_attributes(attributes) + + if dimensions is not None: + store.set_dimensions(dimensions) + + if variables is not None: + if dimensions is None: + store.set_dimensions(construct_dimensions(variables)) + else: + check_dims_and_vars_consistency(dimensions, variables) + store.set_variables(variables) + + if indices is None: + indices = {} + else: + for k, v in indices.iteritems(): + if k not in self.dimensions or v.size != self.dimensions[k]: + raise ValueError('inconsistent index %r' % k) + self._indices = _IndicesCache(self, indices) + + @classmethod + def load_store(cls, store): + return cls(store.variables, store.dimensions, store.attributes) + + def _create_index(self, dim): + if dim in self.variables: + var = self.variables[dim] + data = var.data + attr = var.attributes + if 'units' in attr and 'since' in attr['units']: + index = utils.num2datetimeindex(data, attr['units'], + attr.get('calendar')) + else: + index = pd.Index(data) + elif dim in self.dimensions: + index = pd.Index(np.arange(self.dimensions[dim])) + else: + raise ValueError('cannot find index %r in dataset' % dim) + return index + + @property + def indices(self): + return self._indices + + @property + def variables(self): + return Frozen(self.store.variables) + + @property + def attributes(self): + return Frozen(self.store.attributes) + + @property + def dimensions(self): + return Frozen(self.store.dimensions) + + def copy(self): + """ + Returns a shallow copy of the current object. + """ + return self.__copy__() + + def __copy__(self): + """ + Returns a shallow copy of the current object. + """ + return type(self)(self.variables, self.dimensions, self.attributes, + indices=self.indices.cache) + + def __contains__(self, key): + """ + The 'in' operator will return true or false depending on + whether 'key' is a varibale in the data object or not. 
+ """ + return key in self.variables + + def __iter__(self): + return iter(self.variables) + + @property + def _datetimeindices(self): + return [k for k, v in self.indices.iteritems() + if isinstance(v, pd.DatetimeIndex)] + + def _get_virtual_variable(self, key): + if key in self.indices: + return Variable([key], self.indices[key].values) + split_key = key.split('.') + if len(split_key) == 2: + var, suffix = split_key + if var in self._datetimeindices: + if suffix in _DATETIMEINDEX_COMPONENTS: + return Variable([var], getattr(self.indices[var], suffix)) + elif suffix == 'season': + seasons = np.array(['DJF', 'MAM', 'JJA', 'SON']) + month = self.indices[var].month + return Variable([var], seasons[(month // 3) % 4]) + raise ValueError('virtual variable %r not found' % key) + + def _get_virtual_dataview(self, key): + virtual_var = self._get_virtual_variable(key) + new_vars = OrderedDict(self.variables.items() + [(key, virtual_var)]) + ds = type(self)(new_vars, self.dimensions, self.attributes, + indices=self.indices.cache) + return DataView(ds, key) + + @property + def virtual_variables(self): + """Variables that don't exist in this dataset but for which dataviews + could be created on demand (because they can be calculated from other + dataset variables or dimensions) + """ + possible_vars = list(self.dimensions) + for k in self._datetimeindices: + for suffix in _DATETIMEINDEX_COMPONENTS + ['season']: + possible_vars.append('%s.%s' % (k, suffix)) + return tuple(k for k in possible_vars if k not in self) + + def __getitem__(self, key): + if key not in self.variables: + try: + return self._get_virtual_dataview(key) + except ValueError: + raise KeyError('dataset contains no variable with name %r ' + % key) + else: + return DataView(self.select(key), key) + + #TODO: add keys, items, and values methods (and the iter versions) to + # complete the dict analogy? 
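In this module the virtual names are namespaced by the index they derive from, so the equivalent sketch (same assumptions as before, but importing Dataset from the new scidata.dataset module) reads:

    import numpy as np
    from scidata.dataset import Dataset
    from scidata.variable import Variable

    ds = Dataset()
    ds.add_coordinate(Variable(('time',), np.arange(365),
                               {'units': 'days since 2000-01-01'}))

    ds['time.month']      # DataView of month numbers along 'time'
    ds['time.season']     # DataView of 'DJF' / 'MAM' / 'JJA' / 'SON' labels
    ds.virtual_variables  # ('time.year', 'time.month', ..., 'time.season')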
+ + # mutable objects should not be hashable + __hash__ = None + + def __eq__(self, other): + try: + # some stores (e.g., scipy) do not seem to preserve order, so don't + # require matching dimension or variable order for equality + return (sorted(self.dimensions.items()) + == sorted(other.dimensions.items()) + and sorted(self.attributes.items()) + == sorted(other.attributes.items()) + and all(k1 == k2 and utils.variable_equal(v1, v2) + for (k1, v1), (k2, v2) + in zip(sorted(self.variables.items()), + sorted(other.variables.items())))) + except AttributeError: + return False + + def __ne__(self, other): + return not self == other + + @property + def coordinates(self): + """Coordinates are variables with names that match dimensions""" + return FrozenOrderedDict([(dim, self.variables[dim]) + for dim in self.dimensions + if dim in self.variables and + self.variables[dim].data.ndim == 1 and + self.variables[dim].dimensions == (dim,)]) + + @property + def noncoordinates(self): + """Non-coordinates are variables with names that do not match + dimensions + """ + return FrozenOrderedDict([(name, v) + for (name, v) in self.variables.iteritems() + if name not in self.coordinates]) + + def dump_to_store(self, store): + """Store dataset contents to a backends.*DataStore object""" + target = type(self)(self.variables, self.dimensions, self.attributes, + store=store, indices=self.indices.cache) + target.store.sync() + return target + + def dump(self, filepath, *args, **kwdargs): + """Dump dataset contents to a location on disk using the netCDF4 + package + """ + nc4_store = backends.NetCDF4DataStore(filepath, mode='w', + *args, **kwdargs) + self.dump_to_store(nc4_store) + + def dumps(self): + """Serialize dataset contents to a string. The serialization creates an + in memory netcdf version 3 string using the scipy.io.netcdf package. 
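For example (a sketch only, mirroring the round trip exercised by test_write_store in test_data.py; module paths are assumed):

    import numpy as np
    from scidata import backends
    from scidata.dataset import Dataset
    from scidata.variable import Variable

    ds = Dataset()
    ds.add_dimension('x', 3)
    ds.add_variable('foo', Variable(('x',), np.arange(3.0)))

    # write into any DataStore and read it back as an equal dataset
    store = backends.InMemoryDataStore()
    ds.dump_to_store(store)
    assert Dataset.load_store(store) == ds

    text = ds.dumps()   # netCDF-3 bytes via scipy.io.netcdf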
+ """ + fobj = StringIO() + scipy_store = backends.ScipyDataStore(fobj, mode='w') + self.dump_to_store(scipy_store) + return fobj.getvalue() + + def __str__(self): + """Create a ncdump-like summary of the object""" + summary = ["dimensions:"] + # prints dims that look like: + # dimension = length + dim_print = lambda d, l : "\t%s = %s" % (conventions.pretty_print(d, 30), + conventions.pretty_print(l, 10)) + # add each dimension to the summary + summary.extend([dim_print(d, l) for d, l in self.dimensions.iteritems()]) + + # Print variables + summary.append("variables:") + for vname, var in self.variables.iteritems(): + # this looks like: + # dtype name(dim1, dim2) + summary.append("\t%s %s(%s)" % (conventions.pretty_print(var.dtype, 8), + conventions.pretty_print(vname, 20), + conventions.pretty_print(', '.join(var.dimensions), 45))) + # attribute:value + summary.extend(["\t\t%s:%s" % (conventions.pretty_print(att, 30), + conventions.pretty_print(val, 30)) + for att, val in var.attributes.iteritems()]) + + summary.append("attributes:") + # attribute:value + summary.extend(["\t%s:%s" % (conventions.pretty_print(att, 30), + conventions.pretty_print(val, 30)) + for att, val in self.attributes.iteritems()]) + # create the actual summary + return '\n'.join(summary).replace('\t', ' ' * 4) + + def __repr__(self): + dim_summary = ', '.join('%s%s: %s' % ('@' if k in self else '', k, v) + for k, v in self.dimensions.iteritems()) + return '' % (type(self).__name__, + dim_summary, + ' '.join(self.noncoordinates)) + + def create_variable(self, name, dims, data, attributes=None): + """Create a new variable and add it to this dataset + + Parameters + ---------- + name : string + The name of the new variable. An exception will be raised + if the object already has a variable with this name. name + must satisfy netCDF-3 naming rules. If name equals the name + of a dimension, then the new variable is treated as a + coordinate variable and must be 1-dimensional. + dims : tuple + The dimensions of the new variable. Elements must be dimensions of + the object. + data : numpy.ndarray + Data to populate the new variable. + attributes : dict_like or None, optional + Attributes to assign to the new variable. If None (default), an + empty attribute dictionary is initialized. + + Returns + ------- + var : Variable + Reference to the newly created variable. + """ + # any error checking should be taken care of by add_variable + v = Variable(dims, np.asarray(data), attributes) + return self.add_variable(name, v) + + def create_coordinate(self, name, data, attributes=None): + """Create a new dimension and a corresponding coordinate variable + + This method combines the create_dimension and create_variable methods + for the common case when the variable is a 1-dimensional coordinate + variable with the same name as the dimension. + + If the dimension already exists, this function proceeds unless there is + already a corresponding variable or if the lengths disagree. + + Parameters + ---------- + name : string + The name of the new dimension and variable. An exception will be + raised if the object already has a dimension or variable with this + name. + data : array_like + The coordinate values along this dimension; must be 1-dimensional. + The size of data is the length of the new dimension. + attributes : dict_like or None, optional + Attributes to assign to the new variable. If None (default), an + empty attribute dictionary is initialized. 
+ + Returns + ------- + var : Variable + Reference to the newly created coordinate variable. + """ + # any error checking should be taken care of by add_coordinate + v = Variable((name,), np.asarray(data), attributes) + return self.add_coordinate(v) + + def add_dimension(self, name, length): + """Add a dimension to this dataset + + Parameters + ---------- + name : string + The name of the new dimension. An exception will be raised if the + object already has a dimension with this name. + length : int + The length of the new dimension; must a be non-negative integer. + """ + if name in self.dimensions: + raise ValueError('dimension named %r already exists' % name) + length = int(length) + if length < 0: + raise ValueError('length must be non-negative') + self.store.set_dimension(name, length) + + def add_variable(self, name, var): + """Add a variable to the dataset + + Parameters + ---------- + name : string + The name under which the variable will be added. + variable : Variable + The variable to be added. If the desired action is to add a copy of + the variable be sure to do so before passing it to this function. + + Returns + ------- + variable + The variable object in the underlying datastore. + """ + if name in self.variables: + raise ValueError("Variable named %r already exists" % name) + return self.set_variable(name, var) + + def add_coordinate(self, var): + """Add a coordinate variable to the dataset + + Parameters + ---------- + variable : Variable + The coordinate variable to be added. Coordinate variables must be + 1D, and will be added under the same name as their sole dimension. + + Returns + ------- + variable + The variable object in the underlying datastore. + """ + # We need to be cleanly roll back the effects of + # create_dimension if create_variable fails, otherwise we will + # end up in a partial state. + name = var.dimensions[0] + if name in self.coordinates: + raise ValueError("coordinate named '%s' already exists" % name) + if var.ndim != 1: + raise ValueError("coordinate data must be 1-dimensional (vector)") + if name not in self.dimensions: + self.store.set_dimension(name, var.size) + elif self.dimensions[name] != var.size: + raise ValueError('dimension already exists with different length') + return self.store.set_variable(name, var) + + def set_variable(self, name, var): + """Set a variable in the dataset + + Unlike `add_variable`, this function allows for overriding existing + variables. + + Parameters + ---------- + name : string + The name under which the variable will be added. + variable : Variable + The variable to be added. If the desired action is to add a copy of + the variable be sure to do so before passing it to this function. + + Returns + ------- + variable + The variable object in the underlying datastore. + """ + check_dims_and_vars_consistency(self.dimensions, {name: var}) + new_var = self.store.set_variable(name, var) + if name in self.indices: + self.indices.build_index(name) + return new_var + + def indexed_by(self, **indexers): + """Return a new dataset with each variable indexed along the specified + dimension(s) + + This method selects values from each variable using its `__getitem__` + method, except this method does not require knowing the order of + each variable's dimensions. + + Parameters + ---------- + **indexers : {dim: indexer, ...} + Keyword arguments with names matching dimensions and values given + by integers, slice objects or arrays. 
+ + Returns + ------- + obj : Dataset + A new Dataset with the same contents as this dataset, except each + variable and dimension is indexed by the appropriate indexers. In + general, each variable's data will be a view of the variable's data + in this dataset, unless numpy fancy indexing was triggered by using + an array indexer, in which case the data will be a copy. + + See Also + -------- + Dataset.labeled_by + Dataset.indexed_by + Variable.indexed_by + """ + invalid = [k for k in indexers if not k in self.dimensions] + if invalid: + raise ValueError("dimensions %r do not exist" % invalid) + + # all indexers should be int, slice or np.ndarrays + indexers = {k: np.asarray(v) if not isinstance(v, (int, slice)) else v + for k, v in indexers.iteritems()} + + variables = OrderedDict() + for name, var in self.variables.iteritems(): + var_indexers = {k: v for k, v in indexers.iteritems() + if k in var.dimensions} + variables[name] = var.indexed_by(**var_indexers) + + indices = {k: (v[indexers[k]] if k in indexers else v) + for k, v in self.indices.iteritems()} + # filter out non-indices (indices for which one value was selected) + indices = {k: v for k, v in indices.iteritems() + if isinstance(v, pd.Index)} + dimensions = OrderedDict((k, indices[k].size) for k in self.dimensions + if k in indices) + return type(self)(variables, dimensions, self.attributes, + indices=indices) + + def _loc_to_int_indexer(self, dim, locations): + index = self.indices[dim] + if isinstance(locations, slice): + indexer = index.slice_indexer(locations.start, locations.stop, + locations.step) + else: + try: + indexer = index.get_loc(locations) + except TypeError: + # value is an list or array + new_index, indexer = index.reindex(np.asarray(locations)) + if np.any(indexer < 0): + raise ValueError('not all values found in index %r' % dim) + # FIXME: don't throw away new_index (we'll need to recreate it + # later) + return indexer + + def labeled_by(self, **indexers): + """Return a new dataset with each variable indexed by coordinate labels + along the specified dimension(s) + + In contrast to `Dataset.indexed_by`, indexers for this method should + use coordinate values instead of integers. + + Under the hood, this method is powered by using Panda's powerful Index + objects. This makes label based indexing essentially just as fast as + using integer indexing. + + It also means this method uses pandas's (well documented) logic for + indexing. This means you can use string shortcuts for datetime indexes + (e.g., '2000-01' to select all values in January 2000). It also means + that slices are treated as inclusive of both the start and stop values, + unlike normal Python indexing. + + Parameters + ---------- + **indexers : {dim: indexer, ...} + Keyword arguments with names matching dimensions and values given + by individual, slices or arrays of coordinate values. + + Returns + ------- + obj : Dataset + A new Dataset with the same contents as this dataset, except each + variable and dimension is indexed by the appropriate indexers. In + general, each variable's data will be a view of the variable's data + in this dataset, unless numpy fancy indexing was triggered by using + an array indexer, in which case the data will be a copy. 
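A brief sketch of the two indexing styles side by side (`ds` here is any dataset with a 'time' dimension backed by a DatetimeIndex, for example the one built in the earlier sketches; the argument values are only illustrative):

    ds.indexed_by(time=slice(0, 10))  # positions 0..9 along 'time'
    ds.indexed_by(time=0)             # scalar selection drops the dimension
    ds.labeled_by(time='2000-01')     # every value in January 2000
    ds.labeled_by(time=slice('2000-01-01', '2000-03-31'))  # inclusive slice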
+ + See Also + -------- + Dataset.labeled_by + Dataset.indexed_by + Variable.indexed_by + """ + return self.indexed_by(**{k: self._loc_to_int_indexer(k, v) + for k, v in indexers.iteritems()}) + + def renamed(self, name_dict): + """ + Returns a new object with renamed variables and dimensions + + Parameters + ---------- + name_dict : dict-like + Dictionary-like object whose keys are current variable or dimension + names and whose values are new names. + """ + for k in name_dict: + if k not in self.dimensions and k not in self.variables: + raise ValueError("Cannot rename %r because it is not a " + "variable or dimension in this dataset" % k) + variables = OrderedDict() + for k, v in self.variables.iteritems(): + name = name_dict.get(k, k) + dims = tuple(name_dict.get(dim, dim) for dim in v.dimensions) + #TODO: public interface for renaming a variable without loading + # data + variables[name] = Variable(dims, v._data, v.attributes) + + dimensions = OrderedDict((name_dict.get(k, k), v) + for k, v in self.dimensions.iteritems()) + indices = {name_dict.get(k, k): v + for k, v in self.indices.cache.items()} + return type(self)(variables, dimensions, self.attributes, + indices=indices) + + def merge(self, other, inplace=False): + """Merge two datasets into a single new dataset + + This method generally not allow for overriding data. Variables, + dimensions and indices are checked for conflicts. However, conflicting + attributes are removed. + + Parameters + ---------- + other : Dataset + Dataset to merge with this dataset. + inplace : bool, optional + If True, merge the other dataset into this dataset in-place. + + Returns + ------- + merged : Dataset + Merged dataset. + + Raises + ------ + ValueError + If any variables or dimensions conflict. Conflicting attributes + are silently dropped. + """ + # check for conflicts + utils.update_safety_check(self.variables, other.variables, + compat=utils.variable_equal) + utils.update_safety_check(self.dimensions, other.dimensions) + utils.update_safety_check(self.indices.cache, other.indices.cache, + compat=np.array_equal) + # update contents + obj = self if inplace else self.copy() + obj.store.set_variables(other.variables) + obj.store.set_dimensions(other.dimensions) + obj._indices.update(other.indices.cache) + # remove conflicting attributes + for k, v in other.attributes.iteritems(): + if k in self.attributes and not v != self.attributes[k]: + obj.store.del_attribute(k) + return obj + + def select(self, *names): + """Returns a new dataset that contains the named variables + + Dimensions on which those variables are defined are also included, as + well as the corresponding coordinate variables, and any variables + listed under the 'coordinates' attribute of the named variables. + + Parameters + ---------- + *names : str + Names of the variables to include in the returned object. + + Returns + ------- + Dataset + The returned object has the same attributes as the original. A + dimension is included if at least one of the specified variables is + defined along that dimension. Coordinate variables (1-dimensional + variables with the same name as a dimension) that correspond to an + included dimension are also included. All other variables are + dropped. 
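For instance (hypothetical variable names, continuing the `ds` from the sketches above):

    ds.select('temperature')              # keeps 'temperature', its dimensions
                                          # and their coordinate variables
    ds.select('temperature', 'pressure')  # keep several variables at once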
+ """ + if not all(k in self.variables for k in names): + raise ValueError( + "One or more of the specified variables does not exist") + + def get_aux_names(var): + names = set(var.dimensions) + if 'coordinates' in var.attributes: + coords = var.attributes['coordinates'] + if coords != '': + names |= set(coords.split(' ')) + return names + + aux_names = [get_aux_names(self.variables[k]) for k in names] + names = set(names).union(*aux_names) + + variables = OrderedDict((k, v) for k, v in self.variables.iteritems() + if k in names) + dimensions = OrderedDict((k, v) for k, v in self.dimensions.iteritems() + if k in names) + indices = {k: v for k, v in self.indices.cache.items() if k in names} + return type(self)(variables, dimensions, self.attributes, + indices=indices) + + def unselect(self, *names, **kwargs): + """Returns a new dataset without the named variables + + Parameters + ---------- + *names : str + Names of the variables to omit from the returned object. + omit_dimensions : bool, optional (default True) + Whether or not to also omit dimensions with the given names. + + Returns + ------- + Dataset + New dataset based on this dataset. Only the named variables + /dimensions are removed. + """ + if any(k not in self.variables and k not in self.dimensions + for k in names): + raise ValueError('One or more of the specified variable/dimension ' + 'names does not exist on this dataset') + variables = OrderedDict((k, v) for k, v in self.variables.iteritems() + if k not in names) + if kwargs.get('omit_dimensions', True): + dimensions = OrderedDict((k, v) for k, v + in self.dimensions.iteritems() + if k not in names) + indices = {k: v for k, v in self.indices.cache.items() + if k not in names} + else: + dimensions = self.dimensions + indices = self.indices + return type(self)(variables, dimensions, self.attributes, + indices=indices) + + def replace(self, name, variable): + """Returns a new dataset with the variable 'name' replaced with + 'variable' + + Parameters + ---------- + name : str + Name of the variable to replace in this object. + variable : Variable + Replacement variable. + + Returns + ------- + Dataset + New dataset based on this dataset. Dimensions are unchanged. + """ + ds = self.unselect(name, omit_dimensions=False) + ds.add_variable(name, variable) + return ds + + def iterator(self, dimension): + """Iterate along a data dimension + + Returns an iterator yielding (coordinate, dataset) pairs for each + coordinate value along the specified dimension. + + Parameters + ---------- + dimension : string + The dimension along which to iterate. + + Returns + ------- + it : iterator + The returned iterator yields pairs of scalar-valued coordinate + variables and Dataset objects. + """ + coord = self.variables[dimension] + for i in xrange(self.dimensions[dimension]): + yield (coord[i], self.indexed_by(**{dimension: i})) + + +if __name__ == "__main__": + """ + A bunch of regression tests. 
+ """ + base_dir = os.path.dirname(__file__) + test_dir = os.path.join(base_dir, '..', '..', 'test', ) + write_test_path = os.path.join(test_dir, 'test_output.nc') + ecmwf_netcdf = os.path.join(test_dir, 'ECMWF_ERA-40_subset.nc') + + import time + st = time.time() + nc = Dataset(ecmwf_netcdf) + print "Seconds to read from filepath : ", time.time() - st + + st = time.time() + nc.dump(write_test_path) + print "Seconds to write : ", time.time() - st + + st = time.time() + nc_string = nc.dumps() + print "Seconds to serialize : ", time.time() - st + + st = time.time() + nc = Dataset(nc_string) + print "Seconds to deserialize : ", time.time() - st + + st = time.time() + with open(ecmwf_netcdf, 'r') as f: + nc = Dataset(f) + print "Seconds to read from fobj : ", time.time() - st + diff --git a/src/scidata/dataview.py b/src/scidata/dataview.py index 710458a7729..2e594f1b328 100644 --- a/src/scidata/dataview.py +++ b/src/scidata/dataview.py @@ -81,8 +81,13 @@ def _key_to_slicers(self, key): return zip(self.dimensions, key) def __getitem__(self, key): - slicers = dict(self._key_to_slicers(key)) - return type(self)(self.dataset.views(**slicers), self.name) + if isinstance(key, basestring): + # grab another dataview from the dataset + return self.dataset[key] + else: + # orthogonal array indexing + slicers = dict(self._key_to_slicers(key)) + return type(self)(self.dataset.views(**slicers), self.name) def __setitem__(self, key, value): self.variable[key] = value From fdfd4ced2b1171b1b277b9cf98b1cacec3492942 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Wed, 5 Feb 2014 15:25:50 -0800 Subject: [PATCH 08/45] Added utils.Frozen --- src/scidata/backends.py | 6 +++--- src/scidata/data.py | 12 +++++------ src/scidata/utils.py | 45 ++++++++++++++++++++++++----------------- test/test_utils.py | 13 +++++++++++- 4 files changed, 46 insertions(+), 30 deletions(-) diff --git a/src/scidata/backends.py b/src/scidata/backends.py index cec3a9581fb..726f16817fb 100644 --- a/src/scidata/backends.py +++ b/src/scidata/backends.py @@ -9,7 +9,7 @@ from scipy.io import netcdf from collections import OrderedDict -from utils import FrozenOrderedDict +from utils import FrozenOrderedDict, Frozen from variable import Variable import conventions @@ -83,11 +83,11 @@ def variables(self): @property def attributes(self): - return self.ds._attributes + return Frozen(self.ds._attributes) @property def dimensions(self): - return self.ds.dimensions + return Frozen(self.ds.dimensions) def set_dimension(self, name, length): """Set a dimension length""" diff --git a/src/scidata/data.py b/src/scidata/data.py index 5bb69221453..17b0a1c5ecd 100644 --- a/src/scidata/data.py +++ b/src/scidata/data.py @@ -8,7 +8,7 @@ from collections import OrderedDict, MutableMapping from dataview import DataView -from utils import FrozenOrderedDict +from utils import FrozenOrderedDict, Frozen from variable import Variable import backends, conventions, utils @@ -94,8 +94,6 @@ def open_dataset(nc, *args, **kwargs): class _IndicesCache(MutableMapping): """Cache for Dataset indices""" - # MutableMapping subclasses should implement: - # __getitem__, __setitem__, __delitem__, __iter__, __len__ def __init__(self, dataset, cache=None): self.dataset = dataset self.cache = {} if cache is None else dict(cache) @@ -130,7 +128,7 @@ def __len__(self): return len(self.dataset.dimensions) def __contains__(self, key): - return key in self.cache + return key in self.dataset.dimensions def __repr__(self): contents = '\n'.join("'%s': %s" % @@ -231,15 +229,15 @@ def 
indices(self): @property def variables(self): - return self.store.variables + return Frozen(self.store.variables) @property def attributes(self): - return self.store.attributes + return Frozen(self.store.attributes) @property def dimensions(self): - return self.store.dimensions + return Frozen(self.store.dimensions) def copy(self): """ diff --git a/src/scidata/utils.py b/src/scidata/utils.py index c70ee4363f6..517e5869077 100644 --- a/src/scidata/utils.py +++ b/src/scidata/utils.py @@ -1,6 +1,6 @@ import netCDF4 as nc4 import operator -from collections import OrderedDict +from collections import OrderedDict, Mapping from datetime import datetime import numpy as np @@ -172,22 +172,29 @@ def ordered_dict_intersection(first_dict, second_dict, compat=operator.eq): return new_dict -class FrozenOrderedDict(OrderedDict): - """A subclass of OrderedDict whose contents are frozen after initialization - to prevent tampering +class Frozen(Mapping): + """Wrapper around an object implementing the mapping interface to make it + immutable. If you really want to modify the mapping, the mutable version is + saved under the `mapping` attribute. """ - def __init__(self, *args, **kwds): - # bypass the disabled __setitem__ method - # initialize as an empty OrderedDict - super(FrozenOrderedDict, self).__init__() - # Capture arguments in an OrderedDict - args_dict = OrderedDict(*args, **kwds) - # Call __setitem__ of the superclass - for (key, value) in args_dict.iteritems(): - super(FrozenOrderedDict, self).__setitem__(key, value) - - def _not_implemented(self, *args, **kwargs): - raise TypeError('%s is immutable' % type(self).__name__) - - __setitem__ = __delitem__ = setdefault = update = pop = popitem = clear = \ - _not_implemented + def __init__(self, mapping): + self.mapping = mapping + + def __getitem__(self, key): + return self.mapping[key] + + def __iter__(self): + return iter(self.mapping) + + def __len__(self): + return len(self.mapping) + + def __contains__(self, key): + return key in self.mapping + + def __repr__(self): + return '%s(%r)' % (type(self).__name__, self.mapping) + + +def FrozenOrderedDict(*args, **kwargs): + return Frozen(OrderedDict(*args, **kwargs)) diff --git a/test/test_utils.py b/test/test_utils.py index 497b16d2a3d..429a6347f46 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -69,7 +69,7 @@ def test(self): self.assertArrayEqual(expected, actual) -class TestDictionaryChecks(TestCase): +class TestDictionaries(TestCase): def setUp(self): self.x = {'a': 'A', 'b': 'B'} self.y = {'c': 'C', 'b': 'B'} @@ -88,3 +88,14 @@ def test_ordered_dict_intersection(self): utils.ordered_dict_intersection(self.x, self.y)) self.assertEquals({'b': 'B'}, utils.ordered_dict_intersection(self.x, self.z)) + + def test_frozen(self): + x = utils.Frozen(self.x) + with self.assertRaises(TypeError): + x['foo'] = 'bar' + with self.assertRaises(TypeError): + del x['a'] + with self.assertRaises(AttributeError): + x.update(self.y) + self.assertEquals(x.mapping, self.x) + From 0e873566a920f1ff9fc3758b2084d7be4c3b9506 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Wed, 5 Feb 2014 15:35:07 -0800 Subject: [PATCH 09/45] Moved data.py to dataset.py --- src/scidata/__init__.py | 2 +- src/scidata/common.py | 2 - src/scidata/data.py | 1017 ------------------------ src/scidata/dataset.py | 300 ++++--- src/scidata/variable.py | 4 +- test/{test_data.py => test_dataset.py} | 2 - 6 files changed, 210 insertions(+), 1117 deletions(-) delete mode 100644 src/scidata/data.py rename test/{test_data.py => test_dataset.py} 
(99%) diff --git a/src/scidata/__init__.py b/src/scidata/__init__.py index 05cea9eb79a..0f80dede807 100644 --- a/src/scidata/__init__.py +++ b/src/scidata/__init__.py @@ -1,4 +1,4 @@ -from data import Dataset, open_dataset +from dataset import Dataset, open_dataset from dataview import DataView from variable import Variable diff --git a/src/scidata/common.py b/src/scidata/common.py index 98524452f5f..f6ae3bca3f7 100644 --- a/src/scidata/common.py +++ b/src/scidata/common.py @@ -1,5 +1,3 @@ -import numpy as np - class _DataWrapperMixin(object): @property diff --git a/src/scidata/data.py b/src/scidata/data.py deleted file mode 100644 index 17b0a1c5ecd..00000000000 --- a/src/scidata/data.py +++ /dev/null @@ -1,1017 +0,0 @@ -# TODO Use various backend data stores. pytable, ncdf4, scipy.io, iris, memory -import os -import numpy as np -import netCDF4 as nc4 -import pandas as pd - -from cStringIO import StringIO -from collections import OrderedDict, MutableMapping - -from dataview import DataView -from utils import FrozenOrderedDict, Frozen -from variable import Variable -import backends, conventions, utils - -date2num = nc4.date2num -num2date = nc4.num2date - - -def construct_dimensions(variables): - """ - Given a dictionary of variables, construct a dimensions mapping - - Parameters - ---------- - variables : mapping - Mapping from variable names to Variable objects. - - Returns - ------- - dimensions : mapping - Mapping from dimension names to lengths. - - Raises - ------ - ValueError if variable dimensions are inconsistent. - """ - dimensions = OrderedDict() - for k, var in variables.iteritems(): - for dim, length in zip(var.dimensions, var.shape): - if dim not in dimensions: - dimensions[dim] = length - elif dimensions[dim] != length: - raise ValueError('dimension %r on variable %r has length %s ' - 'but already is saved with length %s' % - (dim, k, length, dimensions[dim])) - return dimensions - - -def check_dims_and_vars_consistency(dimensions, variables): - """ - Validate dimensions and variables are consistent - - Parameters - ---------- - dimensions : mapping - Mapping from dimension names to lengths. - variables : mapping - Mapping from variable names to Variable objects. - - Raises - ------ - ValueError if variable dimensions are inconsistent with the provided - dimensions. - """ - for k, var in variables.iteritems(): - if k in dimensions and var.ndim != 1: - raise ValueError('a coordinate variable must be defined with ' - '1-dimensional data') - for dim, length in zip(var.dimensions, var.shape): - if dim not in dimensions: - raise ValueError('dimension %r on variable %r is not one ' - 'of the dataset dimensions %r' % - (dim, k, list(dimensions))) - elif dimensions[dim] != length: - raise ValueError('dimension %r on variable %r has length ' - '%s but in on the dataset has length %s' % - (dim, k, length, dimensions[dim])) - - -def open_dataset(nc, *args, **kwargs): - #TODO: add tests for this function - # move this to a classmethod Dataset.open? 
- if isinstance(nc, basestring) and not nc.startswith('CDF'): - # If the initialization nc is a string and it doesn't - # appear to be the contents of a netcdf file we load - # it using the netCDF4 package - store = backends.NetCDF4DataStore(nc, *args, **kwargs) - else: - # If nc is a file-like object we read it using - # the scipy.io.netcdf package - store = backends.ScipyDataStore(nc, *args, **kwargs) - return Dataset.load_store(store) - - -class _IndicesCache(MutableMapping): - """Cache for Dataset indices""" - def __init__(self, dataset, cache=None): - self.dataset = dataset - self.cache = {} if cache is None else dict(cache) - # for performance reasons, we could remove this: - self.sync() - - def build_index(self, key): - """Cache the index for the dimension 'key'""" - self.cache[key] = self.dataset._create_index(key) - - def sync(self): - """Cache indices for all dimensions in this dataset""" - for key in self.dataset.dimensions: - self.build_index(key) - - def __getitem__(self, key): - if not key in self.cache: - assert key in self.dataset.dimensions - self.build_index(key) - return self.cache[key] - - def __setitem__(self, key, value): - self.cache[key] = value - - def __delitem__(self, key): - del self.cache[key] - - def __iter__(self): - return iter(self.dataset.dimensions) - - def __len__(self): - return len(self.dataset.dimensions) - - def __contains__(self, key): - return key in self.dataset.dimensions - - def __repr__(self): - contents = '\n'.join("'%s': %s" % - (k, str(v).replace( - '\n', '\n' + ' ' * (len(k) + 4))) - for k, v in self.items()) - return ("\n%s" - % (type(self).__name__, contents)) - - -# list of attributes of pd.DatetimeIndex that are ndarrays of time info -_DATETIMEINDEX_COMPONENTS = ['year', 'month', 'day', 'hour', 'minute', - 'second', 'microsecond', 'nanosecond', 'date', - 'time', 'dayofyear', 'weekofyear', 'dayofweek', - 'quarter'] - - -class Dataset(object): - """ - A netcdf-like data object consisting of dimensions, variables and - attributes which together form a self describing data set - - Datasets are containers of variable name. Getting an item from a Dataset - returns a DataView focused on that variable. - - Attributes - ---------- - dimensions : {name: length, ...} - variables : {name: variable, ...} - coordinates : {name: variable, ...} - Coordinates are simply variables that are also dimensions. They must - all have dimension 1. - noncoordinates : {name: variable, ...} - Variables that are not coordinates. - attributes : {key: value, ...} - indices : {dimension: index, ...} - Mapping from dimensions to pandas.Index objects. - store : backends.*DataStore - """ - def __init__(self, variables=None, dimensions=None, attributes=None, - indices=None, store=None): - """ - If dimensions are not provided, they are inferred from the variables. - - Only set a store if you want to Dataset operations to modify stored - data in-place. Otherwise, load data from a store using the - `open_dataset` function or the `from_store` class method. 
- """ - # TODO: fill out this docstring - if store is None: - store = backends.InMemoryDataStore() - self.store = store - - if attributes is not None: - store.set_attributes(attributes) - - if dimensions is not None: - store.set_dimensions(dimensions) - - if variables is not None: - if dimensions is None: - store.set_dimensions(construct_dimensions(variables)) - else: - check_dims_and_vars_consistency(dimensions, variables) - store.set_variables(variables) - - if indices is None: - indices = {} - else: - for k, v in indices.iteritems(): - if k not in self.dimensions or v.size != self.dimensions[k]: - raise ValueError('inconsistent index %r' % k) - self._indices = _IndicesCache(self, indices) - - @classmethod - def load_store(cls, store): - return cls(store.variables, store.dimensions, store.attributes) - - def _create_index(self, dim): - if dim in self.variables: - var = self.variables[dim] - data = var.data - attr = var.attributes - if 'units' in attr and 'since' in attr['units']: - index = utils.num2datetimeindex(data, attr['units'], - attr.get('calendar')) - else: - index = pd.Index(data) - elif dim in self.dimensions: - index = pd.Index(np.arange(self.dimensions[dim])) - else: - raise ValueError('cannot find index %r in dataset' % dim) - return index - - @property - def indices(self): - return self._indices - - @property - def variables(self): - return Frozen(self.store.variables) - - @property - def attributes(self): - return Frozen(self.store.attributes) - - @property - def dimensions(self): - return Frozen(self.store.dimensions) - - def copy(self): - """ - Returns a shallow copy of the current object. - """ - return self.__copy__() - - def __copy__(self): - """ - Returns a shallow copy of the current object. - """ - return type(self)(self.variables, self.dimensions, self.attributes, - indices=self.indices.cache) - - def __contains__(self, key): - """ - The 'in' operator will return true or false depending on - whether 'key' is a varibale in the data object or not. 
- """ - return key in self.variables - - def __iter__(self): - return iter(self.variables) - - @property - def _unique_datetimeindex(self): - time_indices = [k for k, v in self.indices.iteritems() - if isinstance(v, pd.DatetimeIndex)] - if len(time_indices) == 1: - return time_indices[0] - else: - return None - - def _get_virtual_variable(self, key): - if key in self.indices: - return Variable([key], self.indices[key].values) - time = self._unique_datetimeindex - if time is not None: - if key in _DATETIMEINDEX_COMPONENTS: - return Variable([time], getattr(self.indices[time], key)) - elif key == 'season': - seasons = np.array(['DJF', 'MAM', 'JJA', 'SON']) - month = self.indices[time].month - return Variable([time], seasons[(month // 3) % 4]) - raise ValueError('virtual variable %r not found' % key) - - def _get_virtual_dataview(self, key): - virtual_var = self._get_virtual_variable(key) - new_vars = OrderedDict(self.variables.items() + [(key, virtual_var)]) - ds = type(self)(new_vars, self.dimensions, self.attributes, - indices=self.indices.cache) - return DataView(ds, key) - - @property - def virtual_variables(self): - """Variables that don't exist in this dataset but for which dataviews - could be created on demand (because they can be calculated from other - dataset variables or dimensions) - """ - possible_vars = list(self.dimensions) - if self._unique_datetimeindex is not None: - possible_vars += _DATETIMEINDEX_COMPONENTS + ['season'] - return tuple(k for k in possible_vars if k not in self) - - def __getitem__(self, key): - if key not in self.variables: - try: - return self._get_virtual_dataview(key) - except ValueError: - raise KeyError('dataset contains no variable with name %r ' - % key) - else: - return DataView(self.select(key), key) - - #TODO: add keys, items, and values methods (and the iter versions) to - # complete the dict analogy? 
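For reference, the 'season' rule used by _get_virtual_variable above (and carried over
unchanged into dataset.py below) reduces to integer arithmetic on the month number. A
minimal standalone sketch, assuming only numpy and pandas; the dates are made up for
illustration:

    import numpy as np
    import pandas as pd

    # same mapping as _get_virtual_variable: months 12, 1, 2 -> 'DJF',
    # 3-5 -> 'MAM', 6-8 -> 'JJA', 9-11 -> 'SON'
    seasons = np.array(['DJF', 'MAM', 'JJA', 'SON'])
    time = pd.DatetimeIndex(['2000-01-15', '2000-04-15',
                             '2000-07-15', '2000-12-15'])
    print seasons[(time.month // 3) % 4]  # -> ['DJF' 'MAM' 'JJA' 'DJF']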
- - # mutable objects should not be hashable - __hash__ = None - - def __eq__(self, other): - try: - # some stores (e.g., scipy) do not seem to preserve order, so don't - # require matching dimension or variable order for equality - return (sorted(self.dimensions.items()) - == sorted(other.dimensions.items()) - and sorted(self.attributes.items()) - == sorted(other.attributes.items()) - and all(k1 == k2 and utils.variable_equal(v1, v2) - for (k1, v1), (k2, v2) - in zip(sorted(self.variables.items()), - sorted(other.variables.items())))) - except AttributeError: - return False - - def __ne__(self, other): - return not self == other - - @property - def coordinates(self): - """Coordinates are variables with names that match dimensions""" - return FrozenOrderedDict([(dim, self.variables[dim]) - for dim in self.dimensions - if dim in self.variables and - self.variables[dim].data.ndim == 1 and - self.variables[dim].dimensions == (dim,)]) - - @property - def noncoordinates(self): - """Non-coordinates are variables with names that do not match - dimensions - """ - return FrozenOrderedDict([(name, v) - for (name, v) in self.variables.iteritems() - if name not in self.coordinates]) - - def dump_to_store(self, store): - """Store dataset contents to a backends.*DataStore object""" - target = type(self)(self.variables, self.dimensions, self.attributes, - store=store, indices=self.indices.cache) - target.store.sync() - return target - - def dump(self, filepath, *args, **kwdargs): - """Dump dataset contents to a location on disk using the netCDF4 - package - """ - nc4_store = backends.NetCDF4DataStore(filepath, mode='w', - *args, **kwdargs) - self.dump_to_store(nc4_store) - - def dumps(self): - """Serialize dataset contents to a string. The serialization creates an - in memory netcdf version 3 string using the scipy.io.netcdf package. 
- """ - fobj = StringIO() - scipy_store = backends.ScipyDataStore(fobj, mode='w') - self.dump_to_store(scipy_store) - return fobj.getvalue() - - def __str__(self): - """Create a ncdump-like summary of the object""" - summary = ["dimensions:"] - # prints dims that look like: - # dimension = length - dim_print = lambda d, l : "\t%s = %s" % (conventions.pretty_print(d, 30), - conventions.pretty_print(l, 10)) - # add each dimension to the summary - summary.extend([dim_print(d, l) for d, l in self.dimensions.iteritems()]) - - # Print variables - summary.append("variables:") - for vname, var in self.variables.iteritems(): - # this looks like: - # dtype name(dim1, dim2) - summary.append("\t%s %s(%s)" % (conventions.pretty_print(var.dtype, 8), - conventions.pretty_print(vname, 20), - conventions.pretty_print(', '.join(var.dimensions), 45))) - # attribute:value - summary.extend(["\t\t%s:%s" % (conventions.pretty_print(att, 30), - conventions.pretty_print(val, 30)) - for att, val in var.attributes.iteritems()]) - - summary.append("attributes:") - # attribute:value - summary.extend(["\t%s:%s" % (conventions.pretty_print(att, 30), - conventions.pretty_print(val, 30)) - for att, val in self.attributes.iteritems()]) - # create the actual summary - return '\n'.join(summary).replace('\t', ' ' * 4) - - def __repr__(self): - dim_summary = ', '.join('%s%s: %s' % ('@' if k in self else '', k, v) - for k, v in self.dimensions.iteritems()) - return '' % (type(self).__name__, - dim_summary, - ' '.join(self.noncoordinates)) - - def create_variable(self, name, dims, data, attributes=None): - """Create a new variable and add it to this dataset - - Parameters - ---------- - name : string - The name of the new variable. An exception will be raised - if the object already has a variable with this name. name - must satisfy netCDF-3 naming rules. If name equals the name - of a dimension, then the new variable is treated as a - coordinate variable and must be 1-dimensional. - dims : tuple - The dimensions of the new variable. Elements must be dimensions of - the object. - data : numpy.ndarray - Data to populate the new variable. - attributes : dict_like or None, optional - Attributes to assign to the new variable. If None (default), an - empty attribute dictionary is initialized. - - Returns - ------- - var : Variable - Reference to the newly created variable. - """ - # any error checking should be taken care of by add_variable - v = Variable(dims, np.asarray(data), attributes) - return self.add_variable(name, v) - - def create_coordinate(self, name, data, attributes=None): - """Create a new dimension and a corresponding coordinate variable - - This method combines the create_dimension and create_variable methods - for the common case when the variable is a 1-dimensional coordinate - variable with the same name as the dimension. - - If the dimension already exists, this function proceeds unless there is - already a corresponding variable or if the lengths disagree. - - Parameters - ---------- - name : string - The name of the new dimension and variable. An exception will be - raised if the object already has a dimension or variable with this - name. - data : array_like - The coordinate values along this dimension; must be 1-dimensional. - The size of data is the length of the new dimension. - attributes : dict_like or None, optional - Attributes to assign to the new variable. If None (default), an - empty attribute dictionary is initialized. 
- - Returns - ------- - var : Variable - Reference to the newly created coordinate variable. - """ - # any error checking should be taken care of by add_coordinate - v = Variable((name,), np.asarray(data), attributes) - return self.add_coordinate(v) - - def add_dimension(self, name, length): - """Add a dimension to this dataset - - Parameters - ---------- - name : string - The name of the new dimension. An exception will be raised if the - object already has a dimension with this name. - length : int - The length of the new dimension; must a be non-negative integer. - """ - if name in self.dimensions: - raise ValueError('dimension named %r already exists' % name) - length = int(length) - if length < 0: - raise ValueError('length must be non-negative') - self.store.set_dimension(name, length) - - def add_variable(self, name, var): - """Add a variable to the dataset - - Parameters - ---------- - name : string - The name under which the variable will be added. - variable : Variable - The variable to be added. If the desired action is to add a copy of - the variable be sure to do so before passing it to this function. - - Returns - ------- - variable - The variable object in the underlying datastore. - """ - if name in self.variables: - raise ValueError("Variable named %r already exists" % name) - return self.set_variable(name, var) - - def add_coordinate(self, var): - """Add a coordinate variable to the dataset - - Parameters - ---------- - variable : Variable - The coordinate variable to be added. Coordinate variables must be - 1D, and will be added under the same name as their sole dimension. - - Returns - ------- - variable - The variable object in the underlying datastore. - """ - # We need to be cleanly roll back the effects of - # create_dimension if create_variable fails, otherwise we will - # end up in a partial state. - name = var.dimensions[0] - if name in self.coordinates: - raise ValueError("coordinate named '%s' already exists" % name) - if var.ndim != 1: - raise ValueError("coordinate data must be 1-dimensional (vector)") - if name not in self.dimensions: - self.store.set_dimension(name, var.size) - elif self.dimensions[name] != var.size: - raise ValueError('dimension already exists with different length') - return self.store.set_variable(name, var) - - def set_variable(self, name, var): - """Set a variable in the dataset - - Unlike `add_variable`, this function allows for overriding existing - variables. - - Parameters - ---------- - name : string - The name under which the variable will be added. - variable : Variable - The variable to be added. If the desired action is to add a copy of - the variable be sure to do so before passing it to this function. - - Returns - ------- - variable - The variable object in the underlying datastore. - """ - check_dims_and_vars_consistency(self.dimensions, {name: var}) - new_var = self.store.set_variable(name, var) - if name in self.indices: - self.indices.build_index(name) - return new_var - - def views(self, **slicers): - """Return a new object whose contents are a view of a slice from the - current object along a specified dimension - - Parameters - ---------- - slicers : {dim: slice, ...} - A dictionary mapping from dimensions to integers or slice objects. - - Returns - ------- - obj : Data object - The returned object has the same attributes, dimensions, - variable names and variable attributes as the original. - Variables that are not defined along the specified - dimensions are viewed in their entirety. 
Variables that are - defined along the specified dimension have their data - contents taken along the specified dimension. - - Care must be taken since modifying (most) values in the returned - object will result in modification to the parent object. - - See Also - -------- - view - numpy.take - Variable.take - """ - invalid = [k for k in slicers if not k in self.dimensions] - if invalid: - raise ValueError("dimensions %r do not exist" % invalid) - - # all slicers should be int, slice or np.ndarrays - slicers = {k: np.asarray(v) if not isinstance(v, (int, slice)) else v - for k, v in slicers.iteritems()} - - variables = OrderedDict() - for name, var in self.variables.iteritems(): - var_slicers = {k: v for k, v in slicers.iteritems() - if k in var.dimensions} - variables[name] = var.views(**var_slicers) - - indices = {k: (v[slicers[k]] if k in slicers else v) - for k, v in self.indices.iteritems()} - # filter out non-indices (indices for which one value was selected) - indices = {k: v for k, v in indices.iteritems() - if isinstance(v, pd.Index)} - dimensions = OrderedDict((k, indices[k].size) for k in self.dimensions - if k in indices) - return type(self)(variables, dimensions, self.attributes, - indices=indices) - - def _loc_to_int_indexer(self, dim, locations): - index = self.indices[dim] - if isinstance(locations, slice): - indexer = index.slice_indexer(locations.start, locations.stop, - locations.step) - else: - try: - indexer = index.get_loc(locations) - except TypeError: - # value is an list or array - new_index, indexer = index.reindex(np.asarray(locations)) - if np.any(indexer < 0): - raise ValueError('not all values found in index %r' % dim) - # FIXME: don't throw away new_index (we'll need to recreate it - # later) - return indexer - - def loc_views(self, **slicers): - return self.views(**{k: self._loc_to_int_indexer(k, v) - for k, v in slicers.iteritems()}) - - def renamed(self, name_dict): - """ - Returns a new object with renamed variables and dimensions - - Parameters - ---------- - name_dict : dict-like - Dictionary-like object whose keys are current variable or dimension - names and whose values are new names. - """ - for k in name_dict: - if k not in self.dimensions and k not in self.variables: - raise ValueError("Cannot rename %r because it is not a " - "variable or dimension in this dataset" % k) - variables = OrderedDict() - for k, v in self.variables.iteritems(): - name = name_dict.get(k, k) - dims = tuple(name_dict.get(dim, dim) for dim in v.dimensions) - #TODO: public interface for renaming a variable without loading - # data - variables[name] = Variable(dims, v._data, v.attributes) - - dimensions = OrderedDict((name_dict.get(k, k), v) - for k, v in self.dimensions.iteritems()) - indices = {name_dict.get(k, k): v - for k, v in self.indices.cache.items()} - return type(self)(variables, dimensions, self.attributes, - indices=indices) - - def merge(self, other, inplace=False): - """Merge two datasets into a single new dataset - - This method generally not allow for overriding data. Variables, - dimensions and indices are checked for conflicts. However, conflicting - attributes are removed. - - Parameters - ---------- - other : Dataset - Dataset to merge with this dataset. - inplace : bool, optional - If True, merge the other dataset into this dataset in-place. - - Returns - ------- - merged : Dataset - Merged dataset. - - Raises - ------ - ValueError - If any variables or dimensions conflict. Conflicting attributes - are silently dropped. 
- """ - # check for conflicts - utils.update_safety_check(self.variables, other.variables, - compat=utils.variable_equal) - utils.update_safety_check(self.dimensions, other.dimensions) - utils.update_safety_check(self.indices.cache, other.indices.cache, - compat=np.array_equal) - # update contents - obj = self if inplace else self.copy() - obj.store.set_variables(other.variables) - obj.store.set_dimensions(other.dimensions) - obj._indices.update(other.indices.cache) - # remove conflicting attributes - for k, v in other.attributes.iteritems(): - if k in self.attributes and not v != self.attributes[k]: - obj.store.del_attribute(k) - return obj - - def select(self, *names): - """Returns a new dataset that contains the named variables - - Dimensions on which those variables are defined are also included, as - well as the corresponding coordinate variables, and any variables - listed under the 'coordinates' attribute of the named variables. - - Parameters - ---------- - *names : str - Names of the variables to include in the returned object. - - Returns - ------- - Dataset - The returned object has the same attributes as the original. A - dimension is included if at least one of the specified variables is - defined along that dimension. Coordinate variables (1-dimensional - variables with the same name as a dimension) that correspond to an - included dimension are also included. All other variables are - dropped. - """ - if not all(k in self.variables for k in names): - raise ValueError( - "One or more of the specified variables does not exist") - - def get_aux_names(var): - names = set(var.dimensions) - if 'coordinates' in var.attributes: - coords = var.attributes['coordinates'] - if coords != '': - names |= set(coords.split(' ')) - return names - - aux_names = [get_aux_names(self.variables[k]) for k in names] - names = set(names).union(*aux_names) - - variables = OrderedDict((k, v) for k, v in self.variables.iteritems() - if k in names) - dimensions = OrderedDict((k, v) for k, v in self.dimensions.iteritems() - if k in names) - indices = {k: v for k, v in self.indices.cache.items() if k in names} - return type(self)(variables, dimensions, self.attributes, - indices=indices) - - def unselect(self, *names, **kwargs): - """Returns a new dataset without the named variables - - Parameters - ---------- - *names : str - Names of the variables to omit from the returned object. - omit_dimensions : bool, optional (default True) - Whether or not to also omit dimensions with the given names. - - Returns - ------- - Dataset - New dataset based on this dataset. Only the named variables - /dimensions are removed. - """ - if any(k not in self.variables and k not in self.dimensions - for k in names): - raise ValueError('One or more of the specified variable/dimension ' - 'names does not exist on this dataset') - variables = OrderedDict((k, v) for k, v in self.variables.iteritems() - if k not in names) - if kwargs.get('omit_dimensions', True): - dimensions = OrderedDict((k, v) for k, v - in self.dimensions.iteritems() - if k not in names) - indices = {k: v for k, v in self.indices.cache.items() - if k not in names} - else: - dimensions = self.dimensions - indices = self.indices - return type(self)(variables, dimensions, self.attributes, - indices=indices) - - def replace(self, name, variable): - """Returns a new dataset with the variable 'name' replaced with - 'variable' - - Parameters - ---------- - name : str - Name of the variable to replace in this object. - variable : Variable - Replacement variable. 
- - Returns - ------- - Dataset - New dataset based on this dataset. Dimensions are unchanged. - """ - ds = self.unselect(name, omit_dimensions=False) - ds.add_variable(name, variable) - return ds - - def iterator(self, dim=None, views=False): - """Iterator along a data dimension - - Return an iterator yielding (coordinate, data_object) pairs - that are singleton along the specified dimension - - Parameters - ---------- - dim : string, optional - The dimension along which you want to iterate. If None - (default), then the iterator operates along the record - dimension; if there is no record dimension, an exception - will be raised. - views : boolean, optional - If True, the iterator will give views of the data along - the dimension, otherwise copies. - - Returns - ------- - it : iterator - The returned iterator yields pairs of scalar-valued - coordinate variables and data objects. The yielded data - objects contain *copies* onto the underlying numpy arrays of - the original data object. If the data object does not have - a coordinate variable with the same name as the specified - dimension, then the returned coordinate value is None. If - multiple dimensions of a variable equal dim (e.g. a - correlation matrix), then that variable is iterated along - the first matching dimension. - - Examples - -------- - >>> d = Data() - >>> d.create_coordinate(name='x', data=numpy.arange(10)) - >>> d.create_coordinate(name='y', data=numpy.arange(20)) - >>> print d - - dimensions: - name | length - =========================== - x | 10 - y | 20 - - variables: - name | dtype | shape | dimensions - ===================================================================== - x | int32 | (10,) | ('x',) - y | int32 | (20,) | ('y',) - - attributes: - None - - >>> i = d.iterator(dim='x') - >>> (a, b) = i.next() - >>> print a - - dtype: - int32 - - dimensions: - name | length - =========================== - x | 1 - - attributes: - None - - >>> print b - - dimensions: - name | length - =========================== - x | 1 - y | 20 - - variables: - name | dtype | shape | dimensions - ===================================================================== - x | int32 | (1,) | ('x',) - y | int32 | (20,) | ('y',) - - attributes: - None - - """ - # Determine the size of the dim we're about to iterate over - n = self.dimensions[dim] - # Iterate over the object - if dim in self.coordinates: - coord = self.variables[dim] - if views: - for i in xrange(n): - s = slice(i, i + 1) - yield (coord.view(s, dim=dim), - self.view(s, dim=dim)) - else: - for i in xrange(n): - indices = np.array([i]) - yield (coord.take(indices, dim=dim), - self.take(indices, dim=dim)) - else: - if views: - for i in xrange(n): - yield (None, self.view(slice(i, i + 1), dim=dim)) - else: - for i in xrange(n): - yield (None, self.take(np.array([i]), dim=dim)) - - def iterarray(self, var, dim=None): - """Iterator along a data dimension returning the corresponding slices - of the underlying data of a variable. - - Return an iterator yielding (scalar, ndarray) pairs that are singleton - along the specified dimension. While iterator is more general, this - method has less overhead and in turn should be considerably faster. - - Parameters - ---------- - var : string - The variable over which you want to iterate. - - dim : string, optional - The dimension along which you want to iterate. If None - (default), then the iterator operates along the record - dimension; if there is no record dimension, an exception - will be raised. 
- - Returns - ------- - it : iterator - The returned iterator yields pairs of scalar-valued - and ndarray objects. The yielded data objects contain *views* - onto the underlying numpy arrays of the original data object. - - Examples - -------- - >>> d = Data() - >>> d.create_coordinate(name='t', data=numpy.arange(5)) - >>> d.create_dimension(name='h', length=3) - >>> d.create_variable(name='x', dim=('t', 'h'),\ - ... data=numpy.random.random((10, 3,))) - >>> print d['x'].data - [[ 0.33499995 0.47606901 0.41334325] - [ 0.20229308 0.73693437 0.97451746] - [ 0.40020704 0.29763575 0.85588908] - [ 0.44114434 0.79233816 0.59115313] - [ 0.18583972 0.55084889 0.95478946]] - >>> i = d.iterarray(var='x', dim='t') - >>> (a, b) = i.next() - >>> print a - 0 - >>> print b - [[ 0.33499995 0.47606901 0.41334325]] - """ - # Get a reference to the underlying ndarray for the desired variable - # and build a list of slice objects - data = self.variables[var].data - axis = list(self.variables[var].dimensions).index(dim) - slicer = [slice(None)] * data.ndim - # Determine the size of the dim we're about to iterate over - n = self.dimensions[dim] - # Iterate over dim returning views of the variable. - if dim in self.coordinates: - coord = self.variables[dim].data - for i in xrange(n): - slicer[axis] = slice(i, i + 1) - yield (coord[i], data[slicer]) - else: - for i in xrange(n): - slicer[axis] = slice(i, i + 1) - yield (None, data[slicer]) - - -if __name__ == "__main__": - """ - A bunch of regression tests. - """ - base_dir = os.path.dirname(__file__) - test_dir = os.path.join(base_dir, '..', '..', 'test', ) - write_test_path = os.path.join(test_dir, 'test_output.nc') - ecmwf_netcdf = os.path.join(test_dir, 'ECMWF_ERA-40_subset.nc') - - import time - st = time.time() - nc = Dataset(ecmwf_netcdf) - print "Seconds to read from filepath : ", time.time() - st - - st = time.time() - nc.dump(write_test_path) - print "Seconds to write : ", time.time() - st - - st = time.time() - nc_string = nc.dumps() - print "Seconds to serialize : ", time.time() - st - - st = time.time() - nc = Dataset(nc_string) - print "Seconds to deserialize : ", time.time() - st - - st = time.time() - with open(ecmwf_netcdf, 'r') as f: - nc = Dataset(f) - print "Seconds to read from fobj : ", time.time() - st - diff --git a/src/scidata/dataset.py b/src/scidata/dataset.py index 71f094c427a..17b0a1c5ecd 100644 --- a/src/scidata/dataset.py +++ b/src/scidata/dataset.py @@ -263,23 +263,25 @@ def __iter__(self): return iter(self.variables) @property - def _datetimeindices(self): - return [k for k, v in self.indices.iteritems() - if isinstance(v, pd.DatetimeIndex)] + def _unique_datetimeindex(self): + time_indices = [k for k, v in self.indices.iteritems() + if isinstance(v, pd.DatetimeIndex)] + if len(time_indices) == 1: + return time_indices[0] + else: + return None def _get_virtual_variable(self, key): if key in self.indices: return Variable([key], self.indices[key].values) - split_key = key.split('.') - if len(split_key) == 2: - var, suffix = split_key - if var in self._datetimeindices: - if suffix in _DATETIMEINDEX_COMPONENTS: - return Variable([var], getattr(self.indices[var], suffix)) - elif suffix == 'season': - seasons = np.array(['DJF', 'MAM', 'JJA', 'SON']) - month = self.indices[var].month - return Variable([var], seasons[(month // 3) % 4]) + time = self._unique_datetimeindex + if time is not None: + if key in _DATETIMEINDEX_COMPONENTS: + return Variable([time], getattr(self.indices[time], key)) + elif key == 'season': + seasons = 
np.array(['DJF', 'MAM', 'JJA', 'SON']) + month = self.indices[time].month + return Variable([time], seasons[(month // 3) % 4]) raise ValueError('virtual variable %r not found' % key) def _get_virtual_dataview(self, key): @@ -296,9 +298,8 @@ def virtual_variables(self): dataset variables or dimensions) """ possible_vars = list(self.dimensions) - for k in self._datetimeindices: - for suffix in _DATETIMEINDEX_COMPONENTS + ['season']: - possible_vars.append('%s.%s' % (k, suffix)) + if self._unique_datetimeindex is not None: + possible_vars += _DATETIMEINDEX_COMPONENTS + ['season'] return tuple(k for k in possible_vars if k not in self) def __getitem__(self, key): @@ -567,50 +568,49 @@ def set_variable(self, name, var): self.indices.build_index(name) return new_var - def indexed_by(self, **indexers): - """Return a new dataset with each variable indexed along the specified - dimension(s) - - This method selects values from each variable using its `__getitem__` - method, except this method does not require knowing the order of - each variable's dimensions. + def views(self, **slicers): + """Return a new object whose contents are a view of a slice from the + current object along a specified dimension Parameters ---------- - **indexers : {dim: indexer, ...} - Keyword arguments with names matching dimensions and values given - by integers, slice objects or arrays. + slicers : {dim: slice, ...} + A dictionary mapping from dimensions to integers or slice objects. Returns ------- - obj : Dataset - A new Dataset with the same contents as this dataset, except each - variable and dimension is indexed by the appropriate indexers. In - general, each variable's data will be a view of the variable's data - in this dataset, unless numpy fancy indexing was triggered by using - an array indexer, in which case the data will be a copy. + obj : Data object + The returned object has the same attributes, dimensions, + variable names and variable attributes as the original. + Variables that are not defined along the specified + dimensions are viewed in their entirety. Variables that are + defined along the specified dimension have their data + contents taken along the specified dimension. + + Care must be taken since modifying (most) values in the returned + object will result in modification to the parent object. 
See Also -------- - Dataset.labeled_by - Dataset.indexed_by - Variable.indexed_by + view + numpy.take + Variable.take """ - invalid = [k for k in indexers if not k in self.dimensions] + invalid = [k for k in slicers if not k in self.dimensions] if invalid: raise ValueError("dimensions %r do not exist" % invalid) - # all indexers should be int, slice or np.ndarrays - indexers = {k: np.asarray(v) if not isinstance(v, (int, slice)) else v - for k, v in indexers.iteritems()} + # all slicers should be int, slice or np.ndarrays + slicers = {k: np.asarray(v) if not isinstance(v, (int, slice)) else v + for k, v in slicers.iteritems()} variables = OrderedDict() for name, var in self.variables.iteritems(): - var_indexers = {k: v for k, v in indexers.iteritems() + var_slicers = {k: v for k, v in slicers.iteritems() if k in var.dimensions} - variables[name] = var.indexed_by(**var_indexers) + variables[name] = var.views(**var_slicers) - indices = {k: (v[indexers[k]] if k in indexers else v) + indices = {k: (v[slicers[k]] if k in slicers else v) for k, v in self.indices.iteritems()} # filter out non-indices (indices for which one value was selected) indices = {k: v for k, v in indices.iteritems() @@ -637,46 +637,9 @@ def _loc_to_int_indexer(self, dim, locations): # later) return indexer - def labeled_by(self, **indexers): - """Return a new dataset with each variable indexed by coordinate labels - along the specified dimension(s) - - In contrast to `Dataset.indexed_by`, indexers for this method should - use coordinate values instead of integers. - - Under the hood, this method is powered by using Panda's powerful Index - objects. This makes label based indexing essentially just as fast as - using integer indexing. - - It also means this method uses pandas's (well documented) logic for - indexing. This means you can use string shortcuts for datetime indexes - (e.g., '2000-01' to select all values in January 2000). It also means - that slices are treated as inclusive of both the start and stop values, - unlike normal Python indexing. - - Parameters - ---------- - **indexers : {dim: indexer, ...} - Keyword arguments with names matching dimensions and values given - by individual, slices or arrays of coordinate values. - - Returns - ------- - obj : Dataset - A new Dataset with the same contents as this dataset, except each - variable and dimension is indexed by the appropriate indexers. In - general, each variable's data will be a view of the variable's data - in this dataset, unless numpy fancy indexing was triggered by using - an array indexer, in which case the data will be a copy. - - See Also - -------- - Dataset.labeled_by - Dataset.indexed_by - Variable.indexed_by - """ - return self.indexed_by(**{k: self._loc_to_int_indexer(k, v) - for k, v in indexers.iteritems()}) + def loc_views(self, **slicers): + return self.views(**{k: self._loc_to_int_indexer(k, v) + for k, v in slicers.iteritems()}) def renamed(self, name_dict): """ @@ -848,26 +811,177 @@ def replace(self, name, variable): ds.add_variable(name, variable) return ds - def iterator(self, dimension): - """Iterate along a data dimension + def iterator(self, dim=None, views=False): + """Iterator along a data dimension - Returns an iterator yielding (coordinate, dataset) pairs for each - coordinate value along the specified dimension. + Return an iterator yielding (coordinate, data_object) pairs + that are singleton along the specified dimension Parameters ---------- - dimension : string - The dimension along which to iterate. 
+ dim : string, optional + The dimension along which you want to iterate. If None + (default), then the iterator operates along the record + dimension; if there is no record dimension, an exception + will be raised. + views : boolean, optional + If True, the iterator will give views of the data along + the dimension, otherwise copies. Returns ------- it : iterator - The returned iterator yields pairs of scalar-valued coordinate - variables and Dataset objects. + The returned iterator yields pairs of scalar-valued + coordinate variables and data objects. The yielded data + objects contain *copies* onto the underlying numpy arrays of + the original data object. If the data object does not have + a coordinate variable with the same name as the specified + dimension, then the returned coordinate value is None. If + multiple dimensions of a variable equal dim (e.g. a + correlation matrix), then that variable is iterated along + the first matching dimension. + + Examples + -------- + >>> d = Data() + >>> d.create_coordinate(name='x', data=numpy.arange(10)) + >>> d.create_coordinate(name='y', data=numpy.arange(20)) + >>> print d + + dimensions: + name | length + =========================== + x | 10 + y | 20 + + variables: + name | dtype | shape | dimensions + ===================================================================== + x | int32 | (10,) | ('x',) + y | int32 | (20,) | ('y',) + + attributes: + None + + >>> i = d.iterator(dim='x') + >>> (a, b) = i.next() + >>> print a + + dtype: + int32 + + dimensions: + name | length + =========================== + x | 1 + + attributes: + None + + >>> print b + + dimensions: + name | length + =========================== + x | 1 + y | 20 + + variables: + name | dtype | shape | dimensions + ===================================================================== + x | int32 | (1,) | ('x',) + y | int32 | (20,) | ('y',) + + attributes: + None + """ - coord = self.variables[dimension] - for i in xrange(self.dimensions[dimension]): - yield (coord[i], self.indexed_by(**{dimension: i})) + # Determine the size of the dim we're about to iterate over + n = self.dimensions[dim] + # Iterate over the object + if dim in self.coordinates: + coord = self.variables[dim] + if views: + for i in xrange(n): + s = slice(i, i + 1) + yield (coord.view(s, dim=dim), + self.view(s, dim=dim)) + else: + for i in xrange(n): + indices = np.array([i]) + yield (coord.take(indices, dim=dim), + self.take(indices, dim=dim)) + else: + if views: + for i in xrange(n): + yield (None, self.view(slice(i, i + 1), dim=dim)) + else: + for i in xrange(n): + yield (None, self.take(np.array([i]), dim=dim)) + + def iterarray(self, var, dim=None): + """Iterator along a data dimension returning the corresponding slices + of the underlying data of a variable. + + Return an iterator yielding (scalar, ndarray) pairs that are singleton + along the specified dimension. While iterator is more general, this + method has less overhead and in turn should be considerably faster. + + Parameters + ---------- + var : string + The variable over which you want to iterate. + + dim : string, optional + The dimension along which you want to iterate. If None + (default), then the iterator operates along the record + dimension; if there is no record dimension, an exception + will be raised. + + Returns + ------- + it : iterator + The returned iterator yields pairs of scalar-valued + and ndarray objects. The yielded data objects contain *views* + onto the underlying numpy arrays of the original data object. 
+ + Examples + -------- + >>> d = Data() + >>> d.create_coordinate(name='t', data=numpy.arange(5)) + >>> d.create_dimension(name='h', length=3) + >>> d.create_variable(name='x', dim=('t', 'h'),\ + ... data=numpy.random.random((10, 3,))) + >>> print d['x'].data + [[ 0.33499995 0.47606901 0.41334325] + [ 0.20229308 0.73693437 0.97451746] + [ 0.40020704 0.29763575 0.85588908] + [ 0.44114434 0.79233816 0.59115313] + [ 0.18583972 0.55084889 0.95478946]] + >>> i = d.iterarray(var='x', dim='t') + >>> (a, b) = i.next() + >>> print a + 0 + >>> print b + [[ 0.33499995 0.47606901 0.41334325]] + """ + # Get a reference to the underlying ndarray for the desired variable + # and build a list of slice objects + data = self.variables[var].data + axis = list(self.variables[var].dimensions).index(dim) + slicer = [slice(None)] * data.ndim + # Determine the size of the dim we're about to iterate over + n = self.dimensions[dim] + # Iterate over dim returning views of the variable. + if dim in self.coordinates: + coord = self.variables[dim].data + for i in xrange(n): + slicer[axis] = slice(i, i + 1) + yield (coord[i], data[slicer]) + else: + for i in xrange(n): + slicer[axis] = slice(i, i + 1) + yield (None, data[slicer]) if __name__ == "__main__": diff --git a/src/scidata/variable.py b/src/scidata/variable.py index 845334758d0..420967da959 100644 --- a/src/scidata/variable.py +++ b/src/scidata/variable.py @@ -5,7 +5,7 @@ import numpy as np import conventions -import data +import dataset import dataview import ops import utils @@ -449,7 +449,7 @@ def stack_variables(variables, dim, length=None): def _broadcast_var_data(self, other): self_data = self.data - if isinstance(other, data.Dataset): + if isinstance(other, dataset.Dataset): raise TypeError('datasets do not support mathematical operations') elif all(hasattr(other, attr) for attr in ['dimensions', 'data', 'shape']): # validate dimensions diff --git a/test/test_data.py b/test/test_dataset.py similarity index 99% rename from test/test_data.py rename to test/test_dataset.py index b6bf876ad2a..28e0203a838 100644 --- a/test/test_data.py +++ b/test/test_dataset.py @@ -35,8 +35,6 @@ def create_test_data(store=None): return obj class DataTest(TestCase): - #TODO: test constructor - def get_store(self): return backends.InMemoryDataStore() From 0354aa05f24ada52b5cfc35fc2a8772b0598b641 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Wed, 5 Feb 2014 15:55:22 -0800 Subject: [PATCH 10/45] Fixed iterator and removed broken methods --- src/scidata/dataset.py | 173 +++------------------------------------- src/scidata/dataview.py | 20 +++++ test/test_dataset.py | 89 ++------------------- test/test_dataview.py | 8 ++ 4 files changed, 46 insertions(+), 244 deletions(-) diff --git a/src/scidata/dataset.py b/src/scidata/dataset.py index 17b0a1c5ecd..4d72f54dfd0 100644 --- a/src/scidata/dataset.py +++ b/src/scidata/dataset.py @@ -811,177 +811,26 @@ def replace(self, name, variable): ds.add_variable(name, variable) return ds - def iterator(self, dim=None, views=False): - """Iterator along a data dimension + def iterator(self, dimension): + """Iterate along a data dimension - Return an iterator yielding (coordinate, data_object) pairs - that are singleton along the specified dimension + Returns an iterator yielding (coordinate, dataset) pairs for each + coordinate value along the specified dimension. Parameters ---------- - dim : string, optional - The dimension along which you want to iterate. 
If None - (default), then the iterator operates along the record - dimension; if there is no record dimension, an exception - will be raised. - views : boolean, optional - If True, the iterator will give views of the data along - the dimension, otherwise copies. + dimension : string + The dimension along which to iterate. Returns ------- it : iterator - The returned iterator yields pairs of scalar-valued - coordinate variables and data objects. The yielded data - objects contain *copies* onto the underlying numpy arrays of - the original data object. If the data object does not have - a coordinate variable with the same name as the specified - dimension, then the returned coordinate value is None. If - multiple dimensions of a variable equal dim (e.g. a - correlation matrix), then that variable is iterated along - the first matching dimension. - - Examples - -------- - >>> d = Data() - >>> d.create_coordinate(name='x', data=numpy.arange(10)) - >>> d.create_coordinate(name='y', data=numpy.arange(20)) - >>> print d - - dimensions: - name | length - =========================== - x | 10 - y | 20 - - variables: - name | dtype | shape | dimensions - ===================================================================== - x | int32 | (10,) | ('x',) - y | int32 | (20,) | ('y',) - - attributes: - None - - >>> i = d.iterator(dim='x') - >>> (a, b) = i.next() - >>> print a - - dtype: - int32 - - dimensions: - name | length - =========================== - x | 1 - - attributes: - None - - >>> print b - - dimensions: - name | length - =========================== - x | 1 - y | 20 - - variables: - name | dtype | shape | dimensions - ===================================================================== - x | int32 | (1,) | ('x',) - y | int32 | (20,) | ('y',) - - attributes: - None - + The returned iterator yields pairs of scalar-valued coordinate + variables and Dataset objects. """ - # Determine the size of the dim we're about to iterate over - n = self.dimensions[dim] - # Iterate over the object - if dim in self.coordinates: - coord = self.variables[dim] - if views: - for i in xrange(n): - s = slice(i, i + 1) - yield (coord.view(s, dim=dim), - self.view(s, dim=dim)) - else: - for i in xrange(n): - indices = np.array([i]) - yield (coord.take(indices, dim=dim), - self.take(indices, dim=dim)) - else: - if views: - for i in xrange(n): - yield (None, self.view(slice(i, i + 1), dim=dim)) - else: - for i in xrange(n): - yield (None, self.take(np.array([i]), dim=dim)) - - def iterarray(self, var, dim=None): - """Iterator along a data dimension returning the corresponding slices - of the underlying data of a variable. - - Return an iterator yielding (scalar, ndarray) pairs that are singleton - along the specified dimension. While iterator is more general, this - method has less overhead and in turn should be considerably faster. - - Parameters - ---------- - var : string - The variable over which you want to iterate. - - dim : string, optional - The dimension along which you want to iterate. If None - (default), then the iterator operates along the record - dimension; if there is no record dimension, an exception - will be raised. - - Returns - ------- - it : iterator - The returned iterator yields pairs of scalar-valued - and ndarray objects. The yielded data objects contain *views* - onto the underlying numpy arrays of the original data object. 
- - Examples - -------- - >>> d = Data() - >>> d.create_coordinate(name='t', data=numpy.arange(5)) - >>> d.create_dimension(name='h', length=3) - >>> d.create_variable(name='x', dim=('t', 'h'),\ - ... data=numpy.random.random((10, 3,))) - >>> print d['x'].data - [[ 0.33499995 0.47606901 0.41334325] - [ 0.20229308 0.73693437 0.97451746] - [ 0.40020704 0.29763575 0.85588908] - [ 0.44114434 0.79233816 0.59115313] - [ 0.18583972 0.55084889 0.95478946]] - >>> i = d.iterarray(var='x', dim='t') - >>> (a, b) = i.next() - >>> print a - 0 - >>> print b - [[ 0.33499995 0.47606901 0.41334325]] - """ - # Get a reference to the underlying ndarray for the desired variable - # and build a list of slice objects - data = self.variables[var].data - axis = list(self.variables[var].dimensions).index(dim) - slicer = [slice(None)] * data.ndim - # Determine the size of the dim we're about to iterate over - n = self.dimensions[dim] - # Iterate over dim returning views of the variable. - if dim in self.coordinates: - coord = self.variables[dim].data - for i in xrange(n): - slicer[axis] = slice(i, i + 1) - yield (coord[i], data[slicer]) - else: - for i in xrange(n): - slicer[axis] = slice(i, i + 1) - yield (None, data[slicer]) + coord = self.variables[dimension] + for i in xrange(self.dimensions[dimension]): + yield (coord[i], self.views(**{dimension: i})) if __name__ == "__main__": diff --git a/src/scidata/dataview.py b/src/scidata/dataview.py index 2e594f1b328..fb8911ee72a 100644 --- a/src/scidata/dataview.py +++ b/src/scidata/dataview.py @@ -168,6 +168,26 @@ def replace_focus(self, new_var): ds = self.dataset.replace(self.name, new_var) return type(self)(ds, self.name) + def iterator(self, dimension): + """Iterate along a data dimension + + Returns an iterator yielding (coordinate, dataview) pairs for each + coordinate value along the specified dimension. + + Parameters + ---------- + dimension : string + The dimension along which to iterate. + + Returns + ------- + it : iterator + The returned iterator yields pairs of scalar-valued coordinate + variables and DataView objects. 
+ """ + for (x, dataset) in self.dataset.iterator(dimension): + yield (x, type(self)(dataset, self.name)) + def transpose(self, *dimensions): """Return a new DataView object with transposed dimensions diff --git a/test/test_dataset.py b/test/test_dataset.py index 28e0203a838..4872da0a19c 100644 --- a/test/test_dataset.py +++ b/test/test_dataset.py @@ -20,6 +20,7 @@ _testvar = sorted(_vars.keys())[0] _testdim = sorted(_dims.keys())[0] + def create_test_data(store=None): obj = Dataset() if store is None else Dataset.load_store(store) obj.add_dimension('time', 1000) @@ -34,6 +35,7 @@ def create_test_data(store=None): var.attributes['foo'] = 'variable' return obj + class DataTest(TestCase): def get_store(self): return backends.InMemoryDataStore() @@ -43,41 +45,13 @@ def test_repr(self): self.assertEqual('', repr(data)) - @unittest.skip('method needs rewrite and/or removal') def test_iterator(self): data = create_test_data(self.get_store()) - # iterate over the first dim - iterdim = _testdim - for t, sub in data.iterator(dim=iterdim): - ind = int(np.where(data.variables[iterdim].data == t.data)[0]) - # make sure all the slices match - for v in _vars.keys(): - if iterdim in data[v].dimensions: - dim_axis = list(data[v].dimensions).index(iterdim) - expected = data[v].data.take( - [ind], axis=dim_axis).reshape(sub[v].data.shape) - np.testing.assert_array_equal(sub[v].data, expected) - self.assertEquals(sub.dimensions[iterdim], 1) - # test that the yielded objects are copies of the original - for (t, sub) in data.iterator(dim=iterdim): - sub[_testvar][:] = -71 - self.assertTrue((data[_testvar].data != -71).all()) - - def test_iterarray(self): - data = create_test_data(self.get_store()) - # iterate over the first dim - iterdim = _testdim - for t, d in data.iterarray(dim=iterdim, var=_testvar): - ind = int(np.where(data.variables[iterdim].data == t)[0]) - # make sure all the slices match - dim_axis = list(data[_testvar].dimensions).index(iterdim) - expected = data.variables[_testvar].data.take([ind], axis=dim_axis) - np.testing.assert_array_equal(d, expected) - # test that the yielded objects are views of the original - # This test doesn't make sense for the netCDF4 backend - # for (t, d) in data.iterarray(dim=iterdim, var=_testvar): - # d[:] = -71 - # self.assertTrue((data[_testvar].data == -71).all()) + for n, (t, sub) in enumerate(list(data.iterator('dim1'))[:3]): + self.assertEqual(data['dim1'][n], t) + self.assertVarEqual(data['var1'][n], sub['var1']) + self.assertVarEqual(data['var2'][n], sub['var2']) + self.assertVarEqual(data['var3'][:, n], sub['var3']) def test_dimension(self): a = Dataset() @@ -272,55 +246,6 @@ def test_variable_indexing(self): self.assertVarEqual(v[:3, :2], v[d1[:3], d2[:2]]) self.assertVarEqual(v[:3, :2], v[range(3), range(2)]) - @unittest.skip('obsolete method should be removed') - def test_take(self): - data = create_test_data(self.get_store()) - slicedim = _testdim - # using a list - ret = data.take(indices=range(2, 5), dim=slicedim) - self.assertEquals(len(ret[slicedim].data), 3) - # using a numpy vector - ret = data.take(indices=np.array([2, 3, 4,]), dim=slicedim) - self.assertEquals(len(ret[slicedim].data), 3) - # With a random index - indices = np.random.randint(data.dimensions[slicedim], size=10) - ret = data.take(indices=indices, dim=slicedim) - # Verify that only the specified dimension was altered - for d in data.dimensions: - if d == slicedim: - self.assertEqual(ret.dimensions[d], indices.size) - else: - self.assertEqual(data.dimensions[d], ret.dimensions[d]) 
- # Verify that the data is what we expect - for v in data.variables: - self.assertEqual(data[v].dimensions, ret[v].dimensions) - self.assertEqual(data[v].attributes, ret[v].attributes) - if slicedim in data[v].dimensions: - expected = data[v].data.take( - indices, axis=data[v].dimensions.index(slicedim)) - else: - expected = data[v].data[:] - actual = ret[v].data - np.testing.assert_array_equal(expected, actual) - # Test that our take is a copy - ret[v].data.fill(np.pi) - self.assertTrue(not (data[v].data == np.pi).any()) - self.assertRaises(KeyError, data.take, - indices=indices, dim='not_a_dim') - self.assertRaises(IndexError, data.take, - indices=[data.dimensions[slicedim] + 10], - dim=slicedim) - - @unittest.skip('method needs rewrite and/or removal') - def test_squeeze(self): - data = create_test_data(self.get_store()) - singleton = data.take([1], 'dim2') - squeezed = singleton.squeeze('dim2') - assert not 'dim2' in squeezed.dimensions - for x in [v for v, d in _vars.iteritems() if 'dim2' in d]: - np.testing.assert_array_equal(singleton[x].data.flatten(), - squeezed[x].data) - def test_select(self): data = create_test_data(self.get_store()) ret = data.select(_testvar) diff --git a/test/test_dataview.py b/test/test_dataview.py index 6adf6ce5e4f..746cc9ad1dd 100644 --- a/test/test_dataview.py +++ b/test/test_dataview.py @@ -51,6 +51,14 @@ def test_items(self): self.assertEqual(self.dv[:3, :5].dataset, self.ds.views(x=slice(3), y=slice(5))) + def test_iteration(self): + for ((act_x, act_dv), (exp_x, exp_ds)) in \ + zip(self.dv.iterator('y'), self.ds.iterator('y')): + self.assertVarEqual(exp_x, act_x) + self.assertViewEqual(DataView(exp_ds, 'foo'), act_dv) + for ((_, exp_dv), act_dv) in zip(self.dv.iterator('x'), self.dv): + self.assertViewEqual(exp_dv, act_dv) + def test_views(self): self.assertViewEqual(self.dv, self.dv.views(x=slice(None))) self.assertViewEqual(self.dv[:3], self.dv.views(x=slice(3))) From f626652f8634ca5abb290281e38f2affa17b82c7 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Wed, 5 Feb 2014 17:16:04 -0800 Subject: [PATCH 11/45] Renamed views & loc_views to indexed_by and labeled_by --- src/scidata/dataset.py | 96 ++++++++++++++++++++++++++++------------- src/scidata/dataview.py | 29 ++++++++----- src/scidata/variable.py | 40 +++++++---------- test/test_dataset.py | 51 ++++++++-------------- test/test_dataview.py | 41 +++++++++++++----- test/test_variable.py | 12 +++--- 6 files changed, 155 insertions(+), 114 deletions(-) diff --git a/src/scidata/dataset.py b/src/scidata/dataset.py index 4d72f54dfd0..e560475d855 100644 --- a/src/scidata/dataset.py +++ b/src/scidata/dataset.py @@ -568,49 +568,50 @@ def set_variable(self, name, var): self.indices.build_index(name) return new_var - def views(self, **slicers): - """Return a new object whose contents are a view of a slice from the - current object along a specified dimension + def indexed_by(self, **indexers): + """Return a new dataset with each variable indexed along the specified + dimension(s) + + This method selects values from each variable using its `__getitem__` + method, except this method does not require knowing the order of + each variable's dimensions. Parameters ---------- - slicers : {dim: slice, ...} - A dictionary mapping from dimensions to integers or slice objects. + **indexers : {dim: indexer, ...} + Keyword arguments with names matching dimensions and values given + by integers, slice objects or arrays. 
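
An illustrative sketch of this calling convention; the dataset and variable names below are invented, and only the keyword-style indexers described here come from the patch:

    import numpy as np
    from scidata import Dataset

    ds = Dataset()
    ds.create_coordinate('time', np.arange(10))
    ds.create_coordinate('x', np.arange(5))
    ds.create_variable('var', ('time', 'x'), np.random.randn(10, 5))

    ds.indexed_by(time=0)                      # an integer drops the 'time' dimension
    ds.indexed_by(time=slice(None, None, 2))   # a slice keeps a view of the data
    ds.indexed_by(x=np.array([0, 3]))          # an array triggers fancy indexing (copy)
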
Returns ------- - obj : Data object - The returned object has the same attributes, dimensions, - variable names and variable attributes as the original. - Variables that are not defined along the specified - dimensions are viewed in their entirety. Variables that are - defined along the specified dimension have their data - contents taken along the specified dimension. - - Care must be taken since modifying (most) values in the returned - object will result in modification to the parent object. + obj : Dataset + A new Dataset with the same contents as this dataset, except each + variable and dimension is indexed by the appropriate indexers. In + general, each variable's data will be a view of the variable's data + in this dataset, unless numpy fancy indexing was triggered by using + an array indexer, in which case the data will be a copy. See Also -------- - view - numpy.take - Variable.take + Dataset.labeled_by + Dataset.indexed_by + Variable.indexed_by """ - invalid = [k for k in slicers if not k in self.dimensions] + invalid = [k for k in indexers if not k in self.dimensions] if invalid: raise ValueError("dimensions %r do not exist" % invalid) - # all slicers should be int, slice or np.ndarrays - slicers = {k: np.asarray(v) if not isinstance(v, (int, slice)) else v - for k, v in slicers.iteritems()} + # all indexers should be int, slice or np.ndarrays + indexers = {k: np.asarray(v) if not isinstance(v, (int, slice)) else v + for k, v in indexers.iteritems()} variables = OrderedDict() for name, var in self.variables.iteritems(): - var_slicers = {k: v for k, v in slicers.iteritems() + var_indexers = {k: v for k, v in indexers.iteritems() if k in var.dimensions} - variables[name] = var.views(**var_slicers) + variables[name] = var.indexed_by(**var_indexers) - indices = {k: (v[slicers[k]] if k in slicers else v) + indices = {k: (v[indexers[k]] if k in indexers else v) for k, v in self.indices.iteritems()} # filter out non-indices (indices for which one value was selected) indices = {k: v for k, v in indices.iteritems() @@ -637,9 +638,46 @@ def _loc_to_int_indexer(self, dim, locations): # later) return indexer - def loc_views(self, **slicers): - return self.views(**{k: self._loc_to_int_indexer(k, v) - for k, v in slicers.iteritems()}) + def labeled_by(self, **indexers): + """Return a new dataset with each variable indexed by coordinate labels + along the specified dimension(s) + + In contrast to `Dataset.indexed_by`, indexers for this method should + use coordinate values instead of integers. + + Under the hood, this method is powered by using Panda's powerful Index + objects. This makes label based indexing essentially just as fast as + using integer indexing. + + It also means this method uses pandas's (well documented) logic for + indexing. This means you can use string shortcuts for datetime indexes + (e.g., '2000-01' to select all values in January 2000). It also means + that slices are treated as inclusive of both the start and stop values, + unlike normal Python indexing. + + Parameters + ---------- + **indexers : {dim: indexer, ...} + Keyword arguments with names matching dimensions and values given + by individual, slices or arrays of coordinate values. + + Returns + ------- + obj : Dataset + A new Dataset with the same contents as this dataset, except each + variable and dimension is indexed by the appropriate indexers. 
In + general, each variable's data will be a view of the variable's data + in this dataset, unless numpy fancy indexing was triggered by using + an array indexer, in which case the data will be a copy. + + See Also + -------- + Dataset.labeled_by + Dataset.indexed_by + Variable.indexed_by + """ + return self.indexed_by(**{k: self._loc_to_int_indexer(k, v) + for k, v in indexers.iteritems()}) def renamed(self, name_dict): """ @@ -830,7 +868,7 @@ def iterator(self, dimension): """ coord = self.variables[dimension] for i in xrange(self.dimensions[dimension]): - yield (coord[i], self.views(**{dimension: i})) + yield (coord[i], self.indexed_by(**{dimension: i})) if __name__ == "__main__": diff --git a/src/scidata/dataview.py b/src/scidata/dataview.py index fb8911ee72a..beb9b2e15d7 100644 --- a/src/scidata/dataview.py +++ b/src/scidata/dataview.py @@ -14,7 +14,7 @@ def __init__(self, dataview): def _remap_key(self, key): return tuple(self.dataview.dataset._loc_to_int_indexer(k, v) - for k, v in self.dataview._key_to_slicers(key)) + for k, v in self.dataview._key_to_indexers(key)) def __getitem__(self, key): return self.dataview[self._remap_key(key)] @@ -76,7 +76,7 @@ def data(self, value): def dimensions(self): return self.variable.dimensions - def _key_to_slicers(self, key): + def _key_to_indexers(self, key): key = expanded_indexer(key, self.ndim) return zip(self.dimensions, key) @@ -86,8 +86,8 @@ def __getitem__(self, key): return self.dataset[key] else: # orthogonal array indexing - slicers = dict(self._key_to_slicers(key)) - return type(self)(self.dataset.views(**slicers), self.name) + indexers = dict(self._key_to_indexers(key)) + return type(self)(self.dataset.indexed_by(**indexers), self.name) def __setitem__(self, key, value): self.variable[key] = value @@ -138,16 +138,25 @@ def __repr__(self): contents = ': %s' % self.data return '' % (type(self).__name__, self.name, contents) - def views(self, **slicers): - """Return a new Dataset whose contents are a view of a slice from the - current dataset along specified dimensions + def indexed_by(self, **indexers): + """Return a new dataview whose dataset is given by indexing along the + specified dimension(s) See Also -------- - Dataset.views + Dataset.indexed_by """ - ds = self.dataset.views(**slicers) - return type(self)(ds, self.name) + return type(self)(self.dataset.indexed_by(**indexers), self.name) + + def labeled_by(self, **indexers): + """Return a new dataview whose dataset is given by selecting coordinate + labels along the specified dimension(s) + + See Also + -------- + Dataset.labeled_by + """ + return type(self)(self.dataset.labeled_by(**indexers), self.name) def renamed(self, new_name): """Returns a new DataView with this DataView's focus variable renamed diff --git a/src/scidata/variable.py b/src/scidata/variable.py index 420967da959..668546ba1b5 100644 --- a/src/scidata/variable.py +++ b/src/scidata/variable.py @@ -48,7 +48,7 @@ def __init__(self, dims, data, attributes=None, indexing_mode='numpy'): @property def data(self): """The variable's data as a numpy.ndarray""" - if not isinstance(self._data, np.ndarray): + if not isinstance(self._data, (np.ndarray, np.string_)): self._data = np.asarray(self._data[...]) self._indexing_mode = 'numpy' return self._data @@ -175,40 +175,32 @@ def __repr__(self): contents = ': %s' % self.data return '' % (type(self).__name__, contents) - def views(self, **slicers): - """Return a new Variable object whose contents are a view of the object - sliced along a specified dimension. 
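
Before the matching rename lands in variable.py below, a sketch of how the two Dataset methods compare; the time axis here is invented, and only the inclusive-slice behaviour asserted in the tests is relied upon:

    import numpy as np
    from scidata import Dataset

    ds = Dataset()
    ds.create_variable('time', ['time'], np.arange(10, dtype=np.int32),
                       {'units': 'days since 2000-01-01'})
    ds.indexed_by(time=slice(3))                           # positional: first three days
    ds.labeled_by(time=slice('2000-01-01', '2000-01-03'))  # the same three days, by label
    ds.labeled_by(time='2000-01-02')                       # a single coordinate value
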
+ def indexed_by(self, **indexers): + """Return a new variable indexed along the specified dimension(s) Parameters ---------- - slicers : {dim: slice, ...} - A dictionary mapping from dim to slice, dim represents - the dimension to slice along slice represents the range of the - values to extract. + **indexers : {dim: indexer, ...} + Keyword arguments with names matching dimensions and values given + by integers, slice objects or arrays. Returns ------- obj : Variable object - The returned object has the same attributes and dimensions - as the original. Data contents are taken along the - specified dimension. Care must be taken since modifying (most) - values in the returned object will result in modification to the - parent object. - - See Also - -------- - view - take + A new Variable with the selected data and dimensions. In general, + the new variable's data will be a view of this variable's data, + unless numpy fancy indexing was triggered by using an array + indexer, in which case the data will be a copy. """ - invalid = [k for k in slicers if not k in self.dimensions] + invalid = [k for k in indexers if not k in self.dimensions] if invalid: raise ValueError("dimensions %r do not exist" % invalid) - slices = [slice(None)] * self.data.ndim + key = [slice(None)] * self.data.ndim for i, dim in enumerate(self.dimensions): - if dim in slicers: - slices[i] = slicers[dim] - return self[tuple(slices)] + if dim in indexers: + key[i] = indexers[dim] + return self[tuple(key)] def transpose(self, *dimensions): """Return a new Variable object with transposed dimensions @@ -344,7 +336,7 @@ def aggregated_by(self, func, new_dim_name, groups, **kwargs): 'match the length of this variable along its ' 'dimension') unique_values = np.unique(groups.data) - aggregated = [self.views(**{dim: groups == u}).collapsed( + aggregated = [self.indexed_by(**{dim: groups == u}).collapsed( func, dim, axis=None, **kwargs) for u in unique_values] stacked = stack_variables(aggregated, new_dim_name, unique_values.size) diff --git a/test/test_dataset.py b/test/test_dataset.py index 4872da0a19c..bf0002cc0b3 100644 --- a/test/test_dataset.py +++ b/test/test_dataset.py @@ -172,10 +172,10 @@ def test_attributes(self): self.assertRaises(ValueError, b.attributes.__setitem__, 'foo', np.zeros((2, 2))) self.assertRaises(ValueError, b.attributes.__setitem__, 'foo', dict()) - def test_views(self): + def test_indexed_by(self): data = create_test_data(self.get_store()) slicers = {'dim1': slice(None, None, 2), 'dim2': slice(0, 2)} - ret = data.views(**slicers) + ret = data.indexed_by(**slicers) # Verify that only the specified dimension was altered self.assertItemsEqual(data.dimensions, ret.dimensions) @@ -204,33 +204,33 @@ def test_views(self): # np.testing.assert_array_equal(expected, actual) with self.assertRaises(ValueError): - data.views(not_a_dim=slice(0, 2)) + data.indexed_by(not_a_dim=slice(0, 2)) - ret = data.views(dim1=0) + ret = data.indexed_by(dim1=0) self.assertEqual({'time': 1000, 'dim2': 50, 'dim3': 10}, ret.dimensions) - ret = data.views(time=slice(2), dim1=0, dim2=slice(5)) + ret = data.indexed_by(time=slice(2), dim1=0, dim2=slice(5)) self.assertEqual({'time': 2, 'dim2': 5, 'dim3': 10}, ret.dimensions) - ret = data.views(time=0, dim1=0, dim2=slice(5)) + ret = data.indexed_by(time=0, dim1=0, dim2=slice(5)) self.assertItemsEqual({'dim2': 5, 'dim3': 10}, ret.dimensions) - def test_loc_views(self): + def test_labeled_by(self): data = create_test_data(self.get_store()) int_slicers = {'dim1': slice(None, None, 2), 'dim2': 
slice(0, 2)} loc_slicers = {'dim1': slice(None, None, 2), 'dim2': slice(0, 1)} - self.assertEqual(data.views(**int_slicers), - data.loc_views(**loc_slicers)) + self.assertEqual(data.indexed_by(**int_slicers), + data.labeled_by(**loc_slicers)) data.create_variable('time', ['time'], np.arange(1000, dtype=np.int32), {'units': 'days since 2000-01-01'}) - self.assertEqual(data.views(time=0), - data.loc_views(time='2000-01-01')) - self.assertEqual(data.views(time=slice(10)), - data.loc_views(time=slice('2000-01-01', + self.assertEqual(data.indexed_by(time=0), + data.labeled_by(time='2000-01-01')) + self.assertEqual(data.indexed_by(time=slice(10)), + data.labeled_by(time=slice('2000-01-01', '2000-01-10'))) - self.assertEqual(data, data.loc_views(time=slice('1999', '2005'))) - self.assertEqual(data.views(time=slice(3)), - data.loc_views( + self.assertEqual(data, data.labeled_by(time=slice('1999', '2005'))) + self.assertEqual(data.indexed_by(time=slice(3)), + data.labeled_by( time=pd.date_range('2000-01-01', periods=3))) def test_variable_indexing(self): @@ -302,7 +302,7 @@ def test_merge(self): actual = ds1.merge(ds2) self.assertEqual(expected, actual) with self.assertRaises(ValueError): - ds1.merge(ds2.views(dim1=0)) + ds1.merge(ds2.indexed_by(dim1=0)) with self.assertRaises(ValueError): ds1.merge(ds2.renamed({'var3': 'var1'})) @@ -337,20 +337,3 @@ def get_store(self): def test_repr(self): # scipy.io.netcdf does not keep track of dimension order :( pass - - -# class StoreTest(TestCase): -# def test_store_consistency(self): -# mem_ds = create_test_data() - -# fobj = StringIO() -# store = backends.ScipyDataStore(fobj, 'w') -# store = self.get_store() -# mem_ds.dump_to_store() - -# stored_ds = Dataset.load_store(store) -# self.assertEquals(mem_ds, stored_ds) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_dataview.py b/test/test_dataview.py index 746cc9ad1dd..399ec4b96d9 100644 --- a/test/test_dataview.py +++ b/test/test_dataview.py @@ -32,9 +32,13 @@ def test_properties(self): self.assertArrayEqual(v, self.ds.indices[k]) def test_items(self): - # test indexing - x = self.ds['x'] - y = self.ds['y'] + # strings pull out dataviews + self.assertViewEqual(self.dv, self.ds['foo']) + x = self.dv['x'] + y = self.dv['y'] + self.assertViewEqual(DataView(self.ds.select('x'), 'x'), x) + self.assertViewEqual(DataView(self.ds.select('y'), 'y'), y) + # integer indexing I = ReturnItem() for i in [I[:], I[...], I[x.data], I[x.variable], I[x], I[x, y], I[x.data > -1], I[x.variable > -1], I[x > -1], @@ -44,12 +48,8 @@ def test_items(self): I[x.data[:3]], I[x.variable[:3]], I[x[:3]], I[x[:3], y[:4]], I[x.data > 3], I[x.variable > 3], I[x > 3], I[x > 3, y > 3]]: self.assertVarEqual(self.v[i], self.dv[i]) - # test index + # check that the new index is consistent self.assertEqual(list(self.dv[0].indices), ['y']) - # test matches views - self.assertEqual(self.dv[0].dataset, self.ds.views(x=0)) - self.assertEqual(self.dv[:3, :5].dataset, - self.ds.views(x=slice(3), y=slice(5))) def test_iteration(self): for ((act_x, act_dv), (exp_x, exp_ds)) in \ @@ -59,9 +59,28 @@ def test_iteration(self): for ((_, exp_dv), act_dv) in zip(self.dv.iterator('x'), self.dv): self.assertViewEqual(exp_dv, act_dv) - def test_views(self): - self.assertViewEqual(self.dv, self.dv.views(x=slice(None))) - self.assertViewEqual(self.dv[:3], self.dv.views(x=slice(3))) + def test_indexed_by(self): + self.assertEqual(self.dv[0].dataset, self.ds.indexed_by(x=0)) + self.assertEqual(self.dv[:3, :5].dataset, + 
self.ds.indexed_by(x=slice(3), y=slice(5))) + self.assertViewEqual(self.dv, self.dv.indexed_by(x=slice(None))) + self.assertViewEqual(self.dv[:3], self.dv.indexed_by(x=slice(3))) + + def test_labeled_by(self): + self.ds.set_variable('x', Variable(['x'], np.array(list('abcdefghij')))) + self.assertViewEqual(self.dv, self.dv.labeled_by(x=slice(None))) + self.assertViewEqual(self.dv[1], self.dv.labeled_by(x='b')) + self.assertViewEqual(self.dv[:3], self.dv.labeled_by(x=slice('c'))) + + def test_loc(self): + self.ds.set_variable('x', Variable(['x'], np.array(list('abcdefghij')))) + self.assertViewEqual(self.dv[:3], self.dv.loc[:'c']) + self.assertViewEqual(self.dv[1], self.dv.loc['b']) + self.assertViewEqual(self.dv[:3], self.dv.loc[['a', 'b', 'c']]) + self.assertViewEqual(self.dv[:3, :4], + self.dv.loc[['a', 'b', 'c'], np.arange(4)]) + self.dv.loc['a':'j'] = 0 + self.assertTrue(np.all(self.dv.data == 0)) def test_renamed(self): renamed = self.dv.renamed('bar') diff --git a/test/test_variable.py b/test/test_variable.py index f8ff8e023ca..a01b25a57af 100644 --- a/test/test_variable.py +++ b/test/test_variable.py @@ -71,14 +71,14 @@ def test_items(self): v.data[:] = 0 self.assertTrue(np.all(v.data == 0)) - def test_views(self): + def test_indexed_by(self): v = Variable(['time', 'x'], self.d) - self.assertVarEqual(v.views(time=slice(None)), v) - self.assertVarEqual(v.views(time=0), v[0]) - self.assertVarEqual(v.views(time=slice(0, 3)), v[:3]) - self.assertVarEqual(v.views(x=0), v[:, 0]) + self.assertVarEqual(v.indexed_by(time=slice(None)), v) + self.assertVarEqual(v.indexed_by(time=0), v[0]) + self.assertVarEqual(v.indexed_by(time=slice(0, 3)), v[:3]) + self.assertVarEqual(v.indexed_by(x=0), v[:, 0]) with self.assertRaisesRegexp(ValueError, 'do not exist'): - v.views(not_a_dim=0) + v.indexed_by(not_a_dim=0) def test_transpose(self): v = Variable(['time', 'x'], self.d) From 883f0fe457c078e8ed741c25387a5e13fc936064 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Wed, 5 Feb 2014 20:40:16 -0800 Subject: [PATCH 12/45] More flexible __getitem__ and __setitem__ --- src/scidata/__init__.py | 2 +- src/scidata/backends.py | 4 +- src/scidata/common.py | 2 +- src/scidata/dataset.py | 112 ++++++++++++++++++++----------- src/scidata/dataview.py | 124 ++++++++++++++++++++++++---------- src/scidata/variable.py | 144 +++++++++++++++++++++++----------------- test/test_dataset.py | 69 +++++++++++++++++-- test/test_dataview.py | 27 +++++--- test/test_variable.py | 37 +++++------ 9 files changed, 345 insertions(+), 176 deletions(-) diff --git a/src/scidata/__init__.py b/src/scidata/__init__.py index 0f80dede807..27f673cbeba 100644 --- a/src/scidata/__init__.py +++ b/src/scidata/__init__.py @@ -1,5 +1,5 @@ from dataset import Dataset, open_dataset -from dataview import DataView +from dataview import DataView, intersection from variable import Variable import backends diff --git a/src/scidata/backends.py b/src/scidata/backends.py index 726f16817fb..5941271944d 100644 --- a/src/scidata/backends.py +++ b/src/scidata/backends.py @@ -3,6 +3,8 @@ DataStores provide a uniform interface for saving and loading data in different formats. They should not be used directly, but rather through Dataset objects. """ +# TODO: implement backend logic directly in OrderedDict subclasses, to allow +# for directly manipulating Dataset.variables and the like? 
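
Before the backend housekeeping below, a sketch of the label-based .loc accessor exercised by test_loc above; the labels and values are invented, and the semantics follow the tests:

    import numpy as np
    from scidata import Dataset, Variable

    ds = Dataset({'x': Variable(['x'], np.array(list('abcde'))),
                  'foo': Variable(['x'], np.arange(5.0))})
    foo = ds['foo']
    foo.loc['b']           # like foo[1]
    foo.loc['a':'c']       # label slices include both endpoints, like foo[:3]
    foo.loc[['a', 'b']]    # lists of labels also work
    foo.loc['a':'c'] = 0   # as does assignment through .loc
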
import netCDF4 as nc4 import numpy as np @@ -91,7 +93,7 @@ def dimensions(self): def set_dimension(self, name, length): """Set a dimension length""" - if name in self.ds.dimensions: + if name in self.dimensions: raise ValueError('%s does not support modifying dimensions' % type(self).__name__) self.ds.createDimension(name, length) diff --git a/src/scidata/common.py b/src/scidata/common.py index f6ae3bca3f7..d0ae0a2e639 100644 --- a/src/scidata/common.py +++ b/src/scidata/common.py @@ -77,7 +77,7 @@ def __array__(self, dtype=None): @classmethod def _collapse_method(cls, f, name=None, module=None): def func(self, dimension=None, axis=None, **kwargs): - return self.collapsed(f, dimension, axis, **kwargs) + return self.collapse(f, dimension, axis, **kwargs) if name is None: name = f.__name__ func.__name__ = name diff --git a/src/scidata/dataset.py b/src/scidata/dataset.py index e560475d855..4cf1bb793a7 100644 --- a/src/scidata/dataset.py +++ b/src/scidata/dataset.py @@ -5,7 +5,7 @@ import pandas as pd from cStringIO import StringIO -from collections import OrderedDict, MutableMapping +from collections import OrderedDict, Mapping, MutableMapping from dataview import DataView from utils import FrozenOrderedDict, Frozen @@ -36,6 +36,9 @@ def construct_dimensions(variables): """ dimensions = OrderedDict() for k, var in variables.iteritems(): + if k in var.dimensions and var.ndim != 1: + raise ValueError('a coordinate variable must be defined with ' + '1-dimensional data') for dim, length in zip(var.dimensions, var.shape): if dim not in dimensions: dimensions[dim] = length @@ -73,12 +76,11 @@ def check_dims_and_vars_consistency(dimensions, variables): (dim, k, list(dimensions))) elif dimensions[dim] != length: raise ValueError('dimension %r on variable %r has length ' - '%s but in on the dataset has length %s' % + '%s but on the dataset has length %s' % (dim, k, length, dimensions[dim])) def open_dataset(nc, *args, **kwargs): - #TODO: add tests for this function # move this to a classmethod Dataset.open? if isinstance(nc, basestring) and not nc.startswith('CDF'): # If the initialization nc is a string and it doesn't @@ -146,13 +148,12 @@ def __repr__(self): 'quarter'] -class Dataset(object): - """ - A netcdf-like data object consisting of dimensions, variables and +class Dataset(Mapping): + """A netcdf-like data object consisting of dimensions, variables and attributes which together form a self describing data set - Datasets are containers of variable name. Getting an item from a Dataset - returns a DataView focused on that variable. + Datasets are mappings from variable names to dataviews focused on those + variable. Attributes ---------- @@ -167,15 +168,19 @@ class Dataset(object): indices : {dimension: index, ...} Mapping from dimensions to pandas.Index objects. store : backends.*DataStore + Don't modify the store directly unless you want to avoid all validation + checks. """ def __init__(self, variables=None, dimensions=None, attributes=None, indices=None, store=None): """ If dimensions are not provided, they are inferred from the variables. - Only set a store if you want to Dataset operations to modify stored - data in-place. Otherwise, load data from a store using the - `open_dataset` function or the `from_store` class method. + In general, load data from a store using the `open_dataset` function or + the `from_store` class method. The `store` argument should only be used + if you want to Dataset operations to modify stored data in-place. 
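
The inferred-dimensions path that this constructor relies on can be sketched as follows; the names are invented, and the one-dimensional rule is the check added to construct_dimensions above:

    import numpy as np
    from scidata import Dataset, Variable

    temp = Variable(['t', 'x'], np.zeros((3, 4)))
    t = Variable(['t'], np.arange(3))
    ds = Dataset({'temp': temp, 't': t})    # dimensions t=3 and x=4 are inferred
    # a variable named after one of its dimensions is a coordinate and must be
    # one-dimensional, so this raises ValueError:
    # Dataset({'t': Variable(['t', 'x'], np.zeros((3, 4)))})
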
+ Note, however, that modifying datasets in-place is not entirely + implemented and thus may lead to unexpected behavior. """ # TODO: fill out this docstring if store is None: @@ -259,29 +264,30 @@ def __contains__(self, key): """ return key in self.variables + def __len__(self): + return len(self.variable) + def __iter__(self): return iter(self.variables) @property - def _unique_datetimeindex(self): - time_indices = [k for k, v in self.indices.iteritems() - if isinstance(v, pd.DatetimeIndex)] - if len(time_indices) == 1: - return time_indices[0] - else: - return None + def _datetimeindices(self): + return [k for k, v in self.indices.iteritems() + if isinstance(v, pd.DatetimeIndex)] def _get_virtual_variable(self, key): if key in self.indices: return Variable([key], self.indices[key].values) - time = self._unique_datetimeindex - if time is not None: - if key in _DATETIMEINDEX_COMPONENTS: - return Variable([time], getattr(self.indices[time], key)) - elif key == 'season': - seasons = np.array(['DJF', 'MAM', 'JJA', 'SON']) - month = self.indices[time].month - return Variable([time], seasons[(month // 3) % 4]) + split_key = key.split('.') + if len(split_key) == 2: + var, suffix = split_key + if var in self._datetimeindices: + if suffix in _DATETIMEINDEX_COMPONENTS: + return Variable([var], getattr(self.indices[var], suffix)) + elif suffix == 'season': + # seasons = np.array(['DJF', 'MAM', 'JJA', 'SON']) + month = self.indices[var].month + return Variable([var], (month // 3) % 4 + 1) raise ValueError('virtual variable %r not found' % key) def _get_virtual_dataview(self, key): @@ -298,8 +304,9 @@ def virtual_variables(self): dataset variables or dimensions) """ possible_vars = list(self.dimensions) - if self._unique_datetimeindex is not None: - possible_vars += _DATETIMEINDEX_COMPONENTS + ['season'] + for k in self._datetimeindices: + for suffix in _DATETIMEINDEX_COMPONENTS + ['season']: + possible_vars.append('%s.%s' % (k, suffix)) return tuple(k for k in possible_vars if k not in self) def __getitem__(self, key): @@ -312,8 +319,18 @@ def __getitem__(self, key): else: return DataView(self.select(key), key) - #TODO: add keys, items, and values methods (and the iter versions) to - # complete the dict analogy? + def __setitem__(self, key, value): + # TODO: allow this operation to be destructive, overriding existing + # variables? If so, we may want to implement __delitem__, too. + # (We would need to change DataView.__setitem__ in that case, because + # we definitely don't want to override focus variables.) + if isinstance(value, DataView): + self.merge(value.renamed(key).dataset, inplace=True) + elif isinstance(value, Variable): + self.set_variable(key, value) + else: + raise TypeError('only DataViews and Variables can be added to ' + 'datasets via `__setitem__`') # mutable objects should not be hashable __hash__ = None @@ -422,10 +439,9 @@ def create_variable(self, name, dims, data, attributes=None): Parameters ---------- name : string - The name of the new variable. An exception will be raised - if the object already has a variable with this name. name - must satisfy netCDF-3 naming rules. If name equals the name - of a dimension, then the new variable is treated as a + The name of the new variable. An exception will be raised if the + object already has a variable with this name. If name equals the + name of a dimension, then the new variable is treated as a coordinate variable and must be 1-dimensional. dims : tuple The dimensions of the new variable. 
Elements must be dimensions of @@ -562,9 +578,18 @@ def set_variable(self, name, var): variable The variable object in the underlying datastore. """ - check_dims_and_vars_consistency(self.dimensions, {name: var}) + # check old + new dimensions for consistency checks + new_dims = OrderedDict() + for dim, length in zip(var.dimensions, var.shape): + if dim not in self.dimensions: + new_dims[dim] = length + check_dims_and_vars_consistency( + dict(self.dimensions.items() + new_dims.items()), + {name: var}) + # now set the new dimensions and variables, and rebuild the indices + self.store.set_dimensions(new_dims) new_var = self.store.set_variable(name, var) - if name in self.indices: + if name in list(self.indices) + list(new_dims): self.indices.build_index(name) return new_var @@ -734,15 +759,22 @@ def merge(self, other, inplace=False): are silently dropped. """ # check for conflicts - utils.update_safety_check(self.variables, other.variables, + utils.update_safety_check(self.noncoordinates, other.noncoordinates, compat=utils.variable_equal) utils.update_safety_check(self.dimensions, other.dimensions) - utils.update_safety_check(self.indices.cache, other.indices.cache, + # note: coordinates are checked by comparing indices instead of + # variables, which lets us merge two datasets even if they have + # different time units + utils.update_safety_check(self.indices, other.indices, compat=np.array_equal) # update contents obj = self if inplace else self.copy() - obj.store.set_variables(other.variables) - obj.store.set_dimensions(other.dimensions) + obj.store.set_variables(OrderedDict((k, v) for k, v + in other.variables.iteritems() + if k not in obj.variables)) + obj.store.set_dimensions(OrderedDict((k, v) for k, v + in other.dimensions.iteritems() + if k not in obj.dimensions)) obj._indices.update(other.indices.cache) # remove conflicting attributes for k, v in other.attributes.iteritems(): diff --git a/src/scidata/dataview.py b/src/scidata/dataview.py index beb9b2e15d7..75375a33fea 100644 --- a/src/scidata/dataview.py +++ b/src/scidata/dataview.py @@ -1,3 +1,5 @@ +# TODO: replace aggregate and iterator methods by a 'groupby' method/object +# like pandas import functools import re @@ -36,28 +38,28 @@ class DataView(_DataWrapperMixin): dataviews are much lighter weight than cubes. They are simply aligned, labeled datasets and do not explicitly guarantee or rely on the CF model. """ - def __init__(self, dataset, name): + def __init__(self, dataset, focus): """ Parameters ---------- dataset : scidata.Dataset The dataset on which to build this data view. - name : str + focus : str The name of the "focus variable" in dataset on which this view is oriented. 
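
A sketch of the dictionary-style access and assignment that the methods above implement; the dataset is invented, and only behaviour asserted in the tests further down is assumed:

    import numpy as np
    from scidata import Dataset, Variable

    ds = Dataset({'x': Variable(['x'], np.arange(5)),
                  'v': Variable(['x'], np.random.randn(5))})
    view = ds['v']                           # a DataView focused on 'v'
    view.focus                               # 'v'
    ds['w'] = Variable(['x'], np.ones(5))    # Variables are added via set_variable
    ds['v2'] = 2 * view                      # DataViews are renamed and merged in
    # plain arrays are rejected with a TypeError:
    # ds['junk'] = np.ones(5)
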
""" - if not name in dataset: - raise ValueError('name %r is not a variable in dataset %r' - % (name, dataset)) + if not focus in dataset: + raise ValueError('focus %r is not a variable in dataset %r' + % (focus, dataset)) self.dataset = dataset - self.name = name + self.focus = focus @property def variable(self): - return self.dataset.variables[self.name] + return self.dataset.variables[self.focus] @variable.setter def variable(self, value): - self.dataset.set_variable(self.name, value) + self.dataset.set_variable(self.focus, value) # _data is necessary for _DataWrapperMixin @property @@ -77,8 +79,7 @@ def dimensions(self): return self.variable.dimensions def _key_to_indexers(self, key): - key = expanded_indexer(key, self.ndim) - return zip(self.dimensions, key) + return zip(self.dimensions, expanded_indexer(key, self.ndim)) def __getitem__(self, key): if isinstance(key, basestring): @@ -86,11 +87,18 @@ def __getitem__(self, key): return self.dataset[key] else: # orthogonal array indexing - indexers = dict(self._key_to_indexers(key)) - return type(self)(self.dataset.indexed_by(**indexers), self.name) + return self.indexed_by(**dict(self._key_to_indexers(key))) def __setitem__(self, key, value): - self.variable[key] = value + if isinstance(key, basestring): + # add a variable or dataview to the dataset + self.dataset[key] = value + else: + # orthogonal array indexing + self.variable[key] = value + + def __contains__(self, key): + return key in self.dataset @property def loc(self): @@ -117,14 +125,14 @@ def copy(self): def __copy__(self): # shallow copy the underlying dataset - return DataView(self.dataset.copy(), self.name) + return DataView(self.dataset.copy(), self.focus) # mutable objects should not be hashable __hash__ = None def __str__(self): #TODO: make this less hacky - return re.sub(' {4}(%s\s+%s)' % (self.dtype, self.name), + return re.sub(' {4}(%s\s+%s)' % (self.dtype, self.focus), r'--> \1', str(self.dataset)) def __repr__(self): @@ -136,7 +144,7 @@ def __repr__(self): contents = ' (%s): %s' % (dim_summary, self.dtype) else: contents = ': %s' % self.data - return '' % (type(self).__name__, self.name, contents) + return '' % (type(self).__name__, self.focus, contents) def indexed_by(self, **indexers): """Return a new dataview whose dataset is given by indexing along the @@ -146,7 +154,7 @@ def indexed_by(self, **indexers): -------- Dataset.indexed_by """ - return type(self)(self.dataset.indexed_by(**indexers), self.name) + return type(self)(self.dataset.indexed_by(**indexers), self.focus) def labeled_by(self, **indexers): """Return a new dataview whose dataset is given by selecting coordinate @@ -156,26 +164,26 @@ def labeled_by(self, **indexers): -------- Dataset.labeled_by """ - return type(self)(self.dataset.labeled_by(**indexers), self.name) + return type(self)(self.dataset.labeled_by(**indexers), self.focus) def renamed(self, new_name): """Returns a new DataView with this DataView's focus variable renamed """ - renamed_dataset = self.dataset.renamed({self.name: new_name}) + renamed_dataset = self.dataset.renamed({self.focus: new_name}) return type(self)(renamed_dataset, new_name) def unselected(self): """Returns a copy of this DataView's dataset with this DataView's focus variable removed """ - return self.dataset.unselect(self.name) + return self.dataset.unselect(self.focus) def replace_focus(self, new_var): """Returns a copy of this DataView's dataset with this DataView's focus variable replaced by 'new_var' """ - ds = self.dataset.replace(self.name, new_var) - return 
type(self)(ds, self.name) + ds = self.dataset.replace(self.focus, new_var) + return type(self)(ds, self.focus) def iterator(self, dimension): """Iterate along a data dimension @@ -195,7 +203,7 @@ def iterator(self, dimension): variables and DataView objects. """ for (x, dataset) in self.dataset.iterator(dimension): - yield (x, type(self)(dataset, self.name)) + yield (x, type(self)(dataset, self.focus)) def transpose(self, *dimensions): """Return a new DataView object with transposed dimensions @@ -221,7 +229,7 @@ def transpose(self, *dimensions): """ return self.replace_focus(self.variable.transpose(*dimensions)) - def collapsed(self, func, dimension=None, axis=None, **kwargs): + def collapse(self, func, dimension=None, axis=None, **kwargs): """Collapse this variable by applying `func` along some dimension(s) Parameters @@ -242,7 +250,7 @@ def collapsed(self, func, dimension=None, axis=None, **kwargs): Note ---- - If `collapsed` is called with multiple dimensions (or axes, which + If `collapse` is called with multiple dimensions (or axes, which are converted into dimensions), then the collapse operation is performed repeatedly along each dimension in turn from left to right. @@ -252,19 +260,19 @@ def collapsed(self, func, dimension=None, axis=None, **kwargs): DataView with this dataview's variable replaced with a variable with summarized data and the indicated dimension(s) removed. """ - var = self.variable.collapsed(func, dimension, axis, **kwargs) + var = self.variable.collapse(func, dimension, axis, **kwargs) dropped_dims = set(self.dimensions) - set(var.dimensions) # For now, take an aggressive strategy of removing all variables # associated with any dropped dimensions # TODO: save some summary (mean? bounds?) of dropped variables - drop = ({self.name} | dropped_dims | + drop = ({self.focus} | dropped_dims | {k for k, v in self.dataset.variables.iteritems() if any(dim in dropped_dims for dim in v.dimensions)}) ds = self.dataset.unselect(*drop) - ds.add_variable(self.name, var) - return type(self)(ds, self.name) + ds.add_variable(self.focus, var) + return type(self)(ds, self.focus) - def aggregated_by(self, func, new_dim_name, **kwargs): + def aggregate(self, func, new_dim_name, **kwargs): """Aggregate this dataview by applying `func` to grouped elements Parameters @@ -286,18 +294,53 @@ def aggregated_by(self, func, new_dim_name, **kwargs): DataView with aggregated data and the new dimension `new_dim_name`. """ agg_var = self.dataset[new_dim_name] - unique, aggregated = self.variable.aggregated_by( + unique, aggregated = self.variable.aggregate( func, new_dim_name, agg_var, **kwargs) # TODO: add options for how to summarize variables along aggregated # dimensions instead of just dropping them - drop = ({self.name} | + drop = ({self.focus} | ({new_dim_name} if new_dim_name in self.dataset else set()) | {k for k, v in self.dataset.variables.iteritems() if any(dim in agg_var.dimensions for dim in v.dimensions)}) ds = self.dataset.unselect(*drop) ds.add_coordinate(unique) - ds.add_variable(self.name, aggregated) - return type(self)(ds, self.name) + ds.add_variable(self.focus, aggregated) + return type(self)(ds, self.focus) + + @classmethod + def from_stack(cls, dataviews, new_dim_name='stacked_dimension'): + """Stack dataviews along a new dimension to form a new dataview + + Parameters + ---------- + dataviews : iterable of Variable and/or DataView + Variables and/or DataView objects to stack together. + dim : str, optional + Name of the new dimension. 
+ + Returns + ------- + stacked : DataView + Stacked dataview formed by stacking all the supplied variables + along the new dimension. The new dimension will be the first + dimension in the stacked dataview. + """ + views = list(dataviews) + if not views: + raise ValueError('DataView.from_stack was supplied with an ' + 'empty argument') + dataset = Dataset() + focus = default_focus = 'stacked_variable' + for view in views: + if isinstance(view, cls): + dataset.merge(view.unselected(), inplace=True) + if focus == default_focus: + focus = view.focus + elif focus != view.focus: + raise ValueError('DataView.from_stack requires that all ' + 'stacked views have the same focus') + dataset[focus] = Variable.from_stack(dataviews, new_dim_name) + return cls(dataset, focus) def __array_wrap__(self, result): return self.replace_focus(self.variable.__array_wrap__(result)) @@ -343,5 +386,16 @@ def func(self, other): return self return func - ops.inject_special_operations(DataView, priority=60) + + +def intersection(dataview1, dataview2): + """Given two dataview objects, returns two new dataviews where all indices + found on both dataviews are replaced by their intersection + """ + overlapping_indices = {k: dataview1.indices[k] & dataview2.indices[k] + for k in dataview1.indices + if k in dataview2.indices} + return tuple(dv.indexed_by(**overlapping_indices) + for dv in [dataview1, dataview2]) + diff --git a/src/scidata/variable.py b/src/scidata/variable.py index 668546ba1b5..9a36c1ad770 100644 --- a/src/scidata/variable.py +++ b/src/scidata/variable.py @@ -30,14 +30,34 @@ def _as_compatible_data(data): class Variable(_DataWrapperMixin): """ A netcdf-like variable consisting of dimensions, data and attributes - which describe a single varRiable. A single variable object is not + which describe a single Variable. A single variable object is not fully described outside the context of its parent Dataset. """ def __init__(self, dims, data, attributes=None, indexing_mode='numpy'): + """ + Parameters + ---------- + dims : str or sequence of str + Name(s) of the the data dimension(s). Must be either a string (only + for 1D data) or a sequence of strings with length equal to the + number of dimensions. + data : array_like + Data array which supports numpy-like data access. + attributes : dict_like or None, optional + Attributes to assign to the new variable. If None (default), an + empty attribute dictionary is initialized. + indexing_mode : {'numpy', 'orthogonal'} + String indicating how data handles to fancy indexing (with + arrays). Two modes are supported: 'numpy' (fancy indexing like + numpy.ndarray objects) and 'orthogonal' (array indexing accesses + different dimensions independently, like netCDF4 variables). 
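
The two indexing modes named here differ in how array indexers combine, and plain numpy is enough to illustrate the distinction; this snippet does not use scidata at all:

    import numpy as np

    data = np.arange(12).reshape(3, 4)
    rows, cols = [0, 2], [1, 3]
    data[rows, cols]           # 'numpy' mode pairs the indexers: shape (2,)
    data[np.ix_(rows, cols)]   # 'orthogonal' mode takes their outer product: shape (2, 2)
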
+ """ + if isinstance(dims, basestring): + dims = [dims] data = _as_compatible_data(data) if len(dims) != data.ndim: - raise ValueError('data must have same shape as the number of ' - 'dimensions') + raise ValueError('data and dimensions must have the same ' + 'dimensionality') self._dimensions = tuple(dims) self._data = data if attributes is None: @@ -230,7 +250,7 @@ def transpose(self, *dimensions): data = self.data.transpose(*axes) return type(self)(dimensions, data, self.attributes) - def collapsed(self, func, dimension=None, axis=None, **kwargs): + def collapse(self, func, dimension=None, axis=None, **kwargs): """Collapse this variable by applying `func` along some dimension(s) Parameters @@ -251,7 +271,7 @@ def collapsed(self, func, dimension=None, axis=None, **kwargs): Note ---- - If `collapsed` is called with multiple dimensions (or axes, which + If `collapse` is called with multiple dimensions (or axes, which are converted into dimensions), then the collapse operation is performed repeatedly along each dimension in turn from left to right. @@ -276,7 +296,7 @@ def collapsed(self, func, dimension=None, axis=None, **kwargs): dimension = [dimension] var = self for dim in dimension: - var = var._collapsed(func, dim, **kwargs) + var = var._collapse(func, dim, **kwargs) else: var = type(self)([], func(self.data, **kwargs), self.attributes) var._append_to_cell_methods(': '.join(self.dimensions) @@ -290,7 +310,7 @@ def _append_to_cell_methods(self, string): base = '' self.attributes['cell_methods'] = base + string - def _collapsed(self, f, dim, **kwargs): + def _collapse(self, f, dim, **kwargs): """Collapse a single dimension""" axis = self.dimensions.index(dim) dims = tuple(dim for i, dim in enumerate(self.dimensions) @@ -301,7 +321,7 @@ def _collapsed(self, f, dim, **kwargs): + ': ' + f.__name__) return new_var - def aggregated_by(self, func, new_dim_name, groups, **kwargs): + def aggregate(self, func, new_dim_name, groups, **kwargs): """Aggregate this variable by applying `func` to grouped elements Parameters @@ -336,14 +356,66 @@ def aggregated_by(self, func, new_dim_name, groups, **kwargs): 'match the length of this variable along its ' 'dimension') unique_values = np.unique(groups.data) - aggregated = [self.indexed_by(**{dim: groups == u}).collapsed( + aggregated = (self.indexed_by(**{dim: groups.data == u}).collapse( func, dim, axis=None, **kwargs) - for u in unique_values] - stacked = stack_variables(aggregated, new_dim_name, unique_values.size) + for u in unique_values) + stacked = type(self).from_stack(aggregated, new_dim_name, + length=unique_values.size) ordered_dims = [new_dim_name if d == dim else d for d in self.dimensions] unique = type(self)([new_dim_name], unique_values) return unique, stacked.transpose(*ordered_dims) + @classmethod + def from_stack(cls, variables, new_dim_name='stacked_dimension', + length=None): + """Stack variables along a new dimension to form a new variable + + Parameters + ---------- + variables : iterable of Variable + Variables to stack together. + new_dim_name : str, optional + Name of the new dimension. + length : int, optional + Length of the new dimension. This is used to allocate the new data + array for the stacked variable data before iterating over all + items, which can be more memory efficient. + + Returns + ------- + stacked : Variable + Stacked variable formed by stacking all the supplied variables + along the new dimension. The new dimension will be the first + dimension in the stacked variable. 
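
A sketch of the classmethod in use, mirroring the cases covered by test_from_stack later in this series; the arrays are invented:

    import numpy as np
    from scidata import Variable

    v = Variable(['a'], np.arange(5))
    w = Variable(['a'], np.ones(5))
    Variable.from_stack([v, w], 'b')             # dimensions ('b', 'a'), shape (2, 5)
    Variable.from_stack((v, w), 'b', length=2)   # length preallocates the stacked array
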
+ """ + if length is None: + # so much for lazy evaluation! we need to figure out how many + # variables there are + variables = list(variables) + length = len(variables) + + i = -1 + for i, var in enumerate(variables): + if i == 0: + new_data = np.empty((length,) + var.shape, dtype=var.dtype) + old_dimensions = var.dimensions + attributes = OrderedDict(var.attributes) + else: + if i == length: + raise ValueError('too many stack variables; supplied ' + 'length was %s' % length) + if var.dimensions != old_dimensions: + raise ValueError('inconsistent dimensions between merge ' + 'variables') + utils.remove_incompatible_items(attributes, var.attributes) + new_data[i] = var.data + + if i + 1 != length: + raise ValueError('only %s stack variables; supplied length ' + 'was %s' % (i + 1, length)) + + return cls((new_dim_name,) + old_dimensions, new_data, attributes) + def __array_wrap__(self, result): return type(self)(self.dimensions, result, self.attributes) @@ -389,56 +461,6 @@ def func(self, other): ops.inject_special_operations(Variable) -def stack_variables(variables, dim, length=None): - """Stack variables along a new dimension - - Parameters - ---------- - variables : iterable of Variable - Variables to stack. - dim : str - Name of the new dimension - length : int, optional - Length of the new dimension. This is used to allocate the new data - array for the stacked variable data before iterating over all items, - which can be more memory efficient. - - Returns - ------- - stacked : Variable - Stacked variable formed by stacking all the supplied variables along - the new dimension. The new dimension will be the first dimension in the - stacked variable. - """ - if length is None: - # so much for lazy evaluation! we need to figure out how many variables - # there are - variables = list(variables) - length = len(variables) - - i = -1 - for i, var in enumerate(variables): - if i == 0: - new_data = np.empty((length,) + var.shape, dtype=var.dtype) - old_dimensions = var.dimensions - attributes = OrderedDict(var.attributes) - else: - if i == length: - raise ValueError('too many stack variables; supplied length ' - 'was %s' % length) - if var.dimensions != old_dimensions: - raise ValueError('inconsistent dimensions between merge ' - 'variables') - utils.remove_incompatible_items(attributes, var.attributes) - new_data[i] = var.data - - if i + 1 != length: - raise ValueError('only %s stack variables; supplied length ' - 'was %s' % (i + 1, length)) - - return Variable((dim,) + old_dimensions, new_data, attributes) - - def _broadcast_var_data(self, other): self_data = self.data if isinstance(other, dataset.Dataset): diff --git a/test/test_dataset.py b/test/test_dataset.py index bf0002cc0b3..ce415d6a61b 100644 --- a/test/test_dataset.py +++ b/test/test_dataset.py @@ -8,7 +8,7 @@ import numpy as np import pandas as pd -from scidata import Dataset, Variable, backends +from scidata import Dataset, DataView, Variable, backends, open_dataset from . 
import TestCase @@ -45,6 +45,15 @@ def test_repr(self): self.assertEqual('', repr(data)) + def test_init(self): + var1 = Variable('x', np.arange(100)) + var2 = Variable('x', np.arange(1000)) + var3 = Variable(['x', 'y'], np.arange(1000).reshape(100, 10)) + with self.assertRaisesRegexp(ValueError, 'already is saved with len'): + Dataset({'a': var1, 'b': var2}) + with self.assertRaisesRegexp(ValueError, 'must be defined with 1-d'): + Dataset({'a': var1, 'x': var3}) + def test_iterator(self): data = create_test_data(self.get_store()) for n, (t, sub) in enumerate(list(data.iterator('dim1'))[:3]): @@ -82,8 +91,8 @@ def test_variable(self): self.assertRaises(ValueError, a.create_variable, name='foo', dims=('time', 'x',), data=d) # dimension must be defined - self.assertRaises(ValueError, a.create_variable, - name='qux', dims=('time', 'missing_dim',), data=d) + # self.assertRaises(ValueError, a.create_variable, + # name='qux', dims=('time', 'missing_dim',), data=d) # try to add variable with dim (10,3) with data that's (3,10) self.assertRaises(ValueError, a.create_variable, name='qux', dims=('time', 'x'), data=d.T) @@ -302,13 +311,40 @@ def test_merge(self): actual = ds1.merge(ds2) self.assertEqual(expected, actual) with self.assertRaises(ValueError): - ds1.merge(ds2.indexed_by(dim1=0)) + ds1.merge(ds2.indexed_by(dim1=slice(2))) with self.assertRaises(ValueError): ds1.merge(ds2.renamed({'var3': 'var1'})) - def test_virtual_variables(self): - # need to fill this out - pass + def test_getitem(self): + data = create_test_data(self.get_store()) + data.create_variable('time', ['time'], np.arange(1000, dtype=np.int32), + {'units': 'days since 2000-01-01'}) + self.assertIsInstance(data['var1'], DataView) + self.assertVarEqual(data['var1'], data.variables['var1']) + self.assertItemsEqual(data['var1'].dataset.variables, + {'var1', 'dim1', 'dim2'}) + # access virtual variables + self.assertVarEqual(data['time.dayofyear'][:300], + Variable('time', 1 + np.arange(300))) + self.assertArrayEqual(data['time.month'].data, + data.indices['time'].month) + + def test_setitem(self): + # assign a variable + var = Variable(['dim1'], np.random.randn(100)) + data1 = create_test_data(self.get_store()) + data1.set_variable('A', var) + data2 = data1.copy() + data2['A'] = var + self.assertEqual(data1, data2) + # assign a dataview + dv = 2 * data2['A'] + data1.set_variable('B', dv.variable) + data2['B'] = dv + self.assertEqual(data1, data2) + # assign an array + with self.assertRaisesRegexp(TypeError, 'DataViews and Variables'): + data2['C'] = var.data def test_write_store(self): expected = create_test_data() @@ -324,6 +360,17 @@ def get_store(self): os.close(f) return backends.NetCDF4DataStore(self.tmp_file, mode='w') + def test_dump_and_open_dataset(self): + data = create_test_data(self.get_store()) + f, tmp_file = tempfile.mkstemp(suffix='.nc') + os.close(f) + data.dump(tmp_file) + + expected = data.copy() + actual = open_dataset(tmp_file) + self.assertEquals(expected, actual) + os.remove(tmp_file) + def tearDown(self): if hasattr(self, 'tmp_file') and os.path.exists(self.tmp_file): os.remove(self.tmp_file) @@ -334,6 +381,14 @@ def get_store(self): fobj = StringIO() return backends.ScipyDataStore(fobj, 'w') + def test_dump_and_open_dataset(self): + data = create_test_data(self.get_store()) + serialized = data.dumps() + + expected = data.copy() + actual = open_dataset(StringIO(serialized)) + self.assertEquals(expected, actual) + def test_repr(self): # scipy.io.netcdf does not keep track of dimension order :( pass diff --git 
a/test/test_dataview.py b/test/test_dataview.py index 399ec4b96d9..8b4d909a7cd 100644 --- a/test/test_dataview.py +++ b/test/test_dataview.py @@ -1,13 +1,13 @@ import numpy as np -from scidata import Dataset, DataView, Variable +from scidata import Dataset, DataView, Variable, intersection from . import TestCase, ReturnItem class TestDataView(TestCase): def assertViewEqual(self, dv1, dv2): self.assertEqual(dv1.dataset, dv2.dataset) - self.assertEqual(dv1.name, dv2.name) + self.assertEqual(dv1.focus, dv2.focus) def setUp(self): self.x = np.random.random((10, 20)) @@ -19,7 +19,7 @@ def setUp(self): def test_properties(self): self.assertIs(self.dv.dataset, self.ds) - self.assertEqual(self.dv.name, 'foo') + self.assertEqual(self.dv.focus, 'foo') self.assertVarEqual(self.dv.variable, self.v) self.assertArrayEqual(self.dv.data, self.v.data) for attr in ['dimensions', 'dtype', 'shape', 'size', 'ndim', @@ -85,7 +85,7 @@ def test_loc(self): def test_renamed(self): renamed = self.dv.renamed('bar') self.assertEqual(renamed.dataset, self.ds.renamed({'foo': 'bar'})) - self.assertEqual(renamed.name, 'bar') + self.assertEqual(renamed.focus, 'bar') def test_dataset_getitem(self): dv = self.ds['foo'] @@ -125,20 +125,27 @@ def test_inplace_math(self): self.assertIs(b.data, x) self.assertIs(b.dataset, self.ds) - def test_collapsed(self): - self.assertVarEqual(self.dv.collapsed(np.mean, 'x'), - self.v.collapsed(np.mean, 'x')) + def test_collapse(self): + self.assertVarEqual(self.dv.collapse(np.mean, 'x'), + self.v.collapse(np.mean, 'x')) # needs more... # should check which extra dimensions are dropped - def test_aggregated_by(self): + def test_aggregate(self): agg_var = Variable(['y'], np.array(['a'] * 9 + ['c'] + ['b'] * 7 + ['c'] * 3)) self.ds.add_variable('abc', agg_var) expected_unique, expected_var = \ - self.dv.variable.aggregated_by(np.mean, 'abc', agg_var) + self.dv.variable.aggregate(np.mean, 'abc', agg_var) expected = DataView(Dataset( {'foo': expected_var, 'x': self.ds.variables['x'], 'abc': expected_unique}), 'foo') - actual = self.dv.aggregated_by(np.mean, 'abc') + actual = self.dv.aggregate(np.mean, 'abc') self.assertViewEqual(expected, actual) + + def test_intersection(self): + with self.assertRaises(ValueError): + self.dv + self.dv[:5] + dv1, dv2 = intersection(self.dv, self.dv[:5]) + self.assertViewEqual(dv1, self.dv[:5]) + self.assertViewEqual(dv2, self.dv[:5]) diff --git a/test/test_variable.py b/test/test_variable.py index a01b25a57af..289b1c87585 100644 --- a/test/test_variable.py +++ b/test/test_variable.py @@ -3,7 +3,6 @@ import numpy as np from scidata import Variable, Dataset -from scidata.variable import stack_variables from . 
import TestCase @@ -177,23 +176,23 @@ def test_array_interface(self): # test ufuncs self.assertVarEqual(np.sin(v), Variable(['x'], np.sin(x))) - def test_collapsed(self): + def test_collapse(self): v = Variable(['time', 'x'], self.d) # intentionally test with an operation for which order matters - self.assertVarEqual(v.collapsed(np.std, 'time'), + self.assertVarEqual(v.collapse(np.std, 'time'), Variable(['x'], self.d.std(axis=0), {'cell_methods': 'time: std'})) - self.assertVarEqual(v.collapsed(np.std, axis=0), - v.collapsed(np.std, dimension='time')) - self.assertVarEqual(v.collapsed(np.std, ['x', 'time']), + self.assertVarEqual(v.collapse(np.std, axis=0), + v.collapse(np.std, dimension='time')) + self.assertVarEqual(v.collapse(np.std, ['x', 'time']), Variable([], self.d.std(axis=1).std(axis=0), {'cell_methods': 'x: std time: std'})) - self.assertVarEqual(v.collapsed(np.std), + self.assertVarEqual(v.collapse(np.std), Variable([], self.d.std(), {'cell_methods': 'time: x: std'})) - self.assertVarEqual(v.mean('time'), v.collapsed(np.mean, 'time')) + self.assertVarEqual(v.mean('time'), v.collapse(np.mean, 'time')) - def test_aggregated_by(self): + def test_aggregate(self): agg_var = Variable(['y'], np.array(['a', 'a', 'b'])) v = Variable(['x', 'y'], self.d) expected_unique = Variable(['abc'], np.array(['a', 'b'])) @@ -201,32 +200,30 @@ def test_aggregated_by(self): np.array([self.d[:, :2].sum(axis=1), self.d[:, 2:].sum(axis=1)]).T, {'cell_methods': 'y: sum'}) - actual_unique, actual_aggregated = v.aggregated_by(np.sum, 'abc', agg_var) + actual_unique, actual_aggregated = v.aggregate(np.sum, 'abc', agg_var) self.assertVarEqual(expected_unique, actual_unique) self.assertVarEqual(expected_aggregated, actual_aggregated) # should be equivalent to aggregate by a dataview, too alt_agg_var = Dataset({'abc': agg_var})['abc'] - actual_unique, actual_aggregated = v.aggregated_by(np.sum, 'abc', + actual_unique, actual_aggregated = v.aggregate(np.sum, 'abc', alt_agg_var) self.assertVarEqual(expected_unique, actual_unique) self.assertVarEqual(expected_aggregated, actual_aggregated) - def test_stack_variables(self): + def test_from_stack(self): x = np.arange(5) y = np.ones(5) v = Variable(['a'], x) w = Variable(['a'], y) self.assertVarEqual(Variable(['b', 'a'], np.array([x, y])), - stack_variables([v, w], 'b')) + Variable.from_stack([v, w], 'b')) self.assertVarEqual(Variable(['b', 'a'], np.array([x, y])), - stack_variables((v, w), 'b')) + Variable.from_stack((v, w), 'b')) self.assertVarEqual(Variable(['b', 'a'], np.array([x, y])), - stack_variables((v, w), 'b', length=2)) + Variable.from_stack((v, w), 'b', length=2)) with self.assertRaisesRegexp(ValueError, 'too many'): - stack_variables([v, w], 'b', length=1) + Variable.from_stack([v, w], 'b', length=1) with self.assertRaisesRegexp(ValueError, r'only \d+ stack'): - stack_variables([v, w, w], 'b', length=4) + Variable.from_stack([v, w, w], 'b', length=4) with self.assertRaisesRegexp(ValueError, 'inconsistent dimensions'): - stack_variables([v, Variable(['c'], y)], 'b') - - + Variable.from_stack([v, Variable(['c'], y)], 'b') From c4794c515f53af00c2625accbff8fbbba81eef28 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Thu, 6 Feb 2014 10:44:42 -0800 Subject: [PATCH 13/45] Patched in more numpy methods --- src/scidata/dataview.py | 6 ++++-- src/scidata/ops.py | 30 ++++++++++++++++++++++++++++++ src/scidata/variable.py | 5 +++-- test/test_dataview.py | 16 ++++++++++++++++ test/test_variable.py | 4 ++++ 5 files changed, 57 insertions(+), 4 deletions(-) diff 
--git a/src/scidata/dataview.py b/src/scidata/dataview.py index 75375a33fea..7ace6c1a8cc 100644 --- a/src/scidata/dataview.py +++ b/src/scidata/dataview.py @@ -182,6 +182,8 @@ def replace_focus(self, new_var): """Returns a copy of this DataView's dataset with this DataView's focus variable replaced by 'new_var' """ + if not hasattr(new_var, 'dimensions'): + new_var = type(self.variable)(self.variable.dimensions, new_var) ds = self.dataset.replace(self.focus, new_var) return type(self)(ds, self.focus) @@ -348,8 +350,8 @@ def __array_wrap__(self, result): @staticmethod def _unary_op(f): @functools.wraps(f) - def func(self): - return self.replace_focus(f(self.variable)) + def func(self, *args, **kwargs): + return self.replace_focus(f(self.variable, *args, **kwargs)) return func def _check_indices_compat(self, other): diff --git a/src/scidata/ops.py b/src/scidata/ops.py index ef1f7f3c469..7e450aa1645 100644 --- a/src/scidata/ops.py +++ b/src/scidata/ops.py @@ -7,11 +7,35 @@ CMP_BINARY_OPS = ['lt', 'le', 'eq', 'ne', 'ge', 'gt'] NUM_BINARY_OPS = ['add', 'sub', 'mul', 'div', 'truediv', 'floordiv', 'mod', 'pow', 'and', 'xor', 'or'] +# methods which should return the standard numpy return value unchanged +# some of these can probably be wrapped +NUMPY_CONVERT_METHODS = ['choose', 'compress', 'flatten', 'item', 'itemset', + 'nonzero', 'ravel', 'repeat', 'reshape', + 'searchsorted', 'squeeze', 'swapaxes', 'take', + 'trace', 'diagonal', 'dot'] +# methods which don't modify the data shape, so the result should still be +# wrapped in an Variable/DataView +NUMPY_UNARY_METHODS = ['argsort', 'clip', 'conj', 'conjugate', 'fill', + 'getfield', 'newbyteorder', 'put', 'round', 'setfield', + 'setflags', 'view'] +# methods which remove an axis NUMPY_COLLAPSE_METHODS = ['all', 'any', 'argmax', 'argmin', 'cumprod', 'cumsum', 'max', 'mean', 'min', 'prod', 'ptp', 'std', 'sum', 'var'] +def _data_method_wrapper(f): + def func(self, *args, **kwargs): + return getattr(self.data, f)(*args, **kwargs) + return func + + +def _method_wrapper(f): + def func(self, *args, **kwargs): + return getattr(self, f)(*args, **kwargs) + return func + + def inject_special_operations(cls, priority=50): # priortize our operations over those of numpy.ndarray (priority=1) # and numpy.matrix (priority=10) @@ -30,6 +54,12 @@ def inject_special_operations(cls, priority=50): cls._binary_op(op(name), reflexive=True)) setattr(cls, op_str('i' + name), cls._inplace_binary_op(op('i' + name))) + # patch in numpy methods + for name in NUMPY_CONVERT_METHODS: + setattr(cls, name, _data_method_wrapper(name)) + for name in NUMPY_UNARY_METHODS: + setattr(cls, name, cls._unary_op(_method_wrapper(name))) + # TODO: change these to use methods instead of numpy functions for name in NUMPY_COLLAPSE_METHODS: setattr(cls, name, cls._collapse_method(getattr(np, name), name, 'numpy')) diff --git a/src/scidata/variable.py b/src/scidata/variable.py index 9a36c1ad770..eb2d14d339a 100644 --- a/src/scidata/variable.py +++ b/src/scidata/variable.py @@ -422,8 +422,9 @@ def __array_wrap__(self, result): @staticmethod def _unary_op(f): @functools.wraps(f) - def func(self): - return type(self)(self.dimensions, f(self.data), self.attributes) + def func(self, *args, **kwargs): + return type(self)(self.dimensions, f(self.data, *args, **kwargs), + self.attributes) return func @staticmethod diff --git a/test/test_dataview.py b/test/test_dataview.py index 8b4d909a7cd..9bba606344c 100644 --- a/test/test_dataview.py +++ b/test/test_dataview.py @@ -87,10 +87,26 @@ def 
test_renamed(self): self.assertEqual(renamed.dataset, self.ds.renamed({'foo': 'bar'})) self.assertEqual(renamed.focus, 'bar') + def test_replace_focus(self): + self.assertVarEqual(self.dv, self.dv.replace_focus(self.v)) + self.assertVarEqual(self.dv, self.dv.replace_focus(self.x)) + def test_dataset_getitem(self): dv = self.ds['foo'] self.assertViewEqual(dv, self.dv) + def test_array_interface(self): + self.assertArrayEqual(np.asarray(self.dv), self.x) + # test patched in methods + self.assertArrayEqual(self.dv.take([2, 3]), self.x.take([2, 3])) + self.assertViewEqual(self.dv.argsort(), + self.dv.replace_focus(self.x.argsort())) + self.assertViewEqual(self.dv.clip(2, 3), + self.dv.replace_focus(self.x.clip(2, 3))) + # test ufuncs + self.assertViewEqual(np.sin(self.dv), + self.dv.replace_focus(np.sin(self.x))) + def test_math(self): x = self.x v = self.v diff --git a/test/test_variable.py b/test/test_variable.py index 289b1c87585..9ff2bbc097b 100644 --- a/test/test_variable.py +++ b/test/test_variable.py @@ -173,6 +173,10 @@ def test_array_interface(self): x = np.arange(5) v = Variable(['x'], x) self.assertArrayEqual(np.asarray(v), x) + # test patched in methods + self.assertArrayEqual(v.take([2, 3]), x.take([2, 3])) + self.assertVarEqual(v.argsort(), v) + self.assertVarEqual(v.clip(2, 3), Variable('x', x.clip(2, 3))) # test ufuncs self.assertVarEqual(np.sin(v), Variable(['x'], np.sin(x))) From d517bd146e7f88df1deefc0b7f826898cb001d82 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Thu, 6 Feb 2014 12:57:40 -0800 Subject: [PATCH 14/45] Faster aggregate Now O(n) instead of O(n^2), even though we do have to iterate through every element of an array in Python (gasp!). --- src/scidata/dataview.py | 24 +++++++++++++----------- src/scidata/variable.py | 41 ++++++++++++++++++++++++++++++++--------- test/test_dataview.py | 2 ++ 3 files changed, 47 insertions(+), 20 deletions(-) diff --git a/src/scidata/dataview.py b/src/scidata/dataview.py index 7ace6c1a8cc..0b1ce5e9c24 100644 --- a/src/scidata/dataview.py +++ b/src/scidata/dataview.py @@ -274,7 +274,7 @@ def collapse(self, func, dimension=None, axis=None, **kwargs): ds.add_variable(self.focus, var) return type(self)(ds, self.focus) - def aggregate(self, func, new_dim_name, **kwargs): + def aggregate(self, func, new_dim, **kwargs): """Aggregate this dataview by applying `func` to grouped elements Parameters @@ -283,27 +283,29 @@ def aggregate(self, func, new_dim_name, **kwargs): Function which can be called in the form `func(x, axis=axis, **kwargs)` to reduce an np.ndarray over an integer valued axis. - new_dim_name : str or sequence of str, optional - Name of the variable in this dataview's dataset by which to group - variable elements. The dimension along which this variable exists - will be replaced by this name. + new_dim : str or DataView + Name of a variable in this dataview's dataset or DataView by which + to group variable elements. The dimension along which this variable + exists will be replaced by this name. The variable or dataview must + be one-dimensional. **kwargs : dict Additional keyword arguments passed on to `func`. Returns ------- aggregated : DataView - DataView with aggregated data and the new dimension `new_dim_name`. + DataView with aggregated data and the new dimension `new_dim`. 
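+
+        Examples
+        --------
+        A sketch of typical use (the dataset ``ds`` and the variable names
+        ``'foo'`` and ``'abc'`` mirror the test suite and are illustrative
+        only)::
+
+            ds['foo'].aggregate(np.mean, 'abc')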
""" - agg_var = self.dataset[new_dim_name] + if isinstance(new_dim, basestring): + new_dim = self.dataset[new_dim] unique, aggregated = self.variable.aggregate( - func, new_dim_name, agg_var, **kwargs) + func, new_dim.focus, new_dim, **kwargs) # TODO: add options for how to summarize variables along aggregated - # dimensions instead of just dropping them + # dimensions instead of just dropping them? drop = ({self.focus} | - ({new_dim_name} if new_dim_name in self.dataset else set()) | + ({new_dim.focus} if new_dim.focus in self.dataset else set()) | {k for k, v in self.dataset.variables.iteritems() - if any(dim in agg_var.dimensions for dim in v.dimensions)}) + if any(dim in new_dim.dimensions for dim in v.dimensions)}) ds = self.dataset.unselect(*drop) ds.add_coordinate(unique) ds.add_variable(self.focus, aggregated) diff --git a/src/scidata/variable.py b/src/scidata/variable.py index eb2d14d339a..1d089eb7369 100644 --- a/src/scidata/variable.py +++ b/src/scidata/variable.py @@ -27,6 +27,29 @@ def _as_compatible_data(data): return data +def unique_value_groups(ar): + """Group an array by its unique values + + Parameters + ---------- + ar : array_like + Input array. This will be flattened if it is not already 1-D. + + Returns + ------- + values : np.ndarray + Sorted, unique values as returned by `np.unique`. + indices : list of lists of int + Each element provides the integer indices in `ar` with values given by + the corresponding value in `unique_values`. + """ + values, inverse = np.unique(ar, return_inverse=True) + groups = [[] for _ in range(len(values))] + for n, g in enumerate(inverse): + groups[g].append(n) + return values, groups + + class Variable(_DataWrapperMixin): """ A netcdf-like variable consisting of dimensions, data and attributes @@ -321,7 +344,7 @@ def _collapse(self, f, dim, **kwargs): + ': ' + f.__name__) return new_var - def aggregate(self, func, new_dim_name, groups, **kwargs): + def aggregate(self, func, new_dim_name, group_by, **kwargs): """Aggregate this variable by applying `func` to grouped elements Parameters @@ -332,7 +355,7 @@ def aggregate(self, func, new_dim_name, groups, **kwargs): integer valued axis. new_dim_name : str or sequence of str, optional Name of the new dimension to create. - groups : Variable + group_by : Variable 1D variable which contains the values by which to group. **kwargs : dict Additional keyword arguments passed on to `func`. @@ -344,21 +367,21 @@ def aggregate(self, func, new_dim_name, groups, **kwargs): `new_dim_name`. aggregated : Variable Variable with aggregated data and the original dimension from - `groups` replaced by `new_dim_name`. + `group_by` replaced by `new_dim_name`. """ - if groups.ndim != 1: + if group_by.ndim != 1: # TODO: remove this limitation? 
raise ValueError('group variables must be 1 dimensional') - dim = groups.dimensions[0] + dim = group_by.dimensions[0] axis = self.dimensions.index(dim) - if groups.size != self.shape[axis]: + if group_by.size != self.shape[axis]: raise ValueError('the group variable\'s length does not ' 'match the length of this variable along its ' 'dimension') - unique_values = np.unique(groups.data) - aggregated = (self.indexed_by(**{dim: groups.data == u}).collapse( + unique_values, group_indices = unique_value_groups(group_by.data) + aggregated = (self.indexed_by(**{dim: indices}).collapse( func, dim, axis=None, **kwargs) - for u in unique_values) + for indices in group_indices) stacked = type(self).from_stack(aggregated, new_dim_name, length=unique_values.size) ordered_dims = [new_dim_name if d == dim else d for d in self.dimensions] diff --git a/test/test_dataview.py b/test/test_dataview.py index 9bba606344c..fd281be6edc 100644 --- a/test/test_dataview.py +++ b/test/test_dataview.py @@ -158,6 +158,8 @@ def test_aggregate(self): 'abc': expected_unique}), 'foo') actual = self.dv.aggregate(np.mean, 'abc') self.assertViewEqual(expected, actual) + actual = self.dv.aggregate(np.mean, self.ds['abc']) + self.assertViewEqual(expected, actual) def test_intersection(self): with self.assertRaises(ValueError): From ba7083d60c2d4a4e0923ca947875c636c6ae7ec4 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Thu, 6 Feb 2014 15:26:37 -0800 Subject: [PATCH 15/45] More array interface tests for ufuncs --- test/test_dataview.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/test_dataview.py b/test/test_dataview.py index fd281be6edc..4f63e1ce904 100644 --- a/test/test_dataview.py +++ b/test/test_dataview.py @@ -106,6 +106,9 @@ def test_array_interface(self): # test ufuncs self.assertViewEqual(np.sin(self.dv), self.dv.replace_focus(np.sin(self.x))) + self.assertViewEqual(self.dv, np.maximum(self.v, self.dv)) + self.ds['bar'] = Variable(['x', 'y'], np.zeros((10, 20))) + self.assertViewEqual(self.dv, np.maximum(self.dv, self.ds['bar'])) def test_math(self): x = self.x From dcdabee38b047fd4f6c03e121c50d4e03b0e047c Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Thu, 6 Feb 2014 18:54:58 -0800 Subject: [PATCH 16/45] Fixed DataView.from_stack --- src/scidata/dataview.py | 10 ++++++---- test/test_dataview.py | 14 ++++++++++++++ 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/src/scidata/dataview.py b/src/scidata/dataview.py index 0b1ce5e9c24..9177f75b467 100644 --- a/src/scidata/dataview.py +++ b/src/scidata/dataview.py @@ -5,7 +5,9 @@ import numpy as np +import dataset import ops +import variable from common import _DataWrapperMixin from utils import expanded_indexer, FrozenOrderedDict @@ -333,18 +335,18 @@ def from_stack(cls, dataviews, new_dim_name='stacked_dimension'): if not views: raise ValueError('DataView.from_stack was supplied with an ' 'empty argument') - dataset = Dataset() + ds = dataset.Dataset() focus = default_focus = 'stacked_variable' for view in views: if isinstance(view, cls): - dataset.merge(view.unselected(), inplace=True) + ds.merge(view.unselected(), inplace=True) if focus == default_focus: focus = view.focus elif focus != view.focus: raise ValueError('DataView.from_stack requires that all ' 'stacked views have the same focus') - dataset[focus] = Variable.from_stack(dataviews, new_dim_name) - return cls(dataset, focus) + ds[focus] = variable.Variable.from_stack(dataviews, new_dim_name) + return cls(ds, focus) def __array_wrap__(self, result): return 
self.replace_focus(self.variable.__array_wrap__(result)) diff --git a/test/test_dataview.py b/test/test_dataview.py index 4f63e1ce904..010a6b33af4 100644 --- a/test/test_dataview.py +++ b/test/test_dataview.py @@ -164,6 +164,20 @@ def test_aggregate(self): actual = self.dv.aggregate(np.mean, self.ds['abc']) self.assertViewEqual(expected, actual) + def test_from_stack(self): + self.ds['bar'] = Variable(['x', 'y'], np.random.randn(10, 20)) + foo = self.ds['foo'] + bar = self.ds['bar'].renamed('foo') + # from dataviews: + self.assertVarEqual(Variable(['w', 'x', 'y'], + np.array([foo.data, bar.data])), + DataView.from_stack([foo, bar], 'w')) + # from variables: + self.assertVarEqual(Variable(['w', 'x', 'y'], + np.array([foo.data, bar.data])), + DataView.from_stack([foo.variable, + bar.variable], 'w')) + def test_intersection(self): with self.assertRaises(ValueError): self.dv + self.dv[:5] From 697f135a6804da7ffb87c77ec613e5313fbbfcfe Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Thu, 6 Feb 2014 19:00:39 -0800 Subject: [PATCH 17/45] Initial docs with Sphinx --- .gitignore | 2 + doc/Makefile | 177 ++++++++++++++++++++++++++ doc/conf.py | 272 ++++++++++++++++++++++++++++++++++++++++ doc/index.rst | 24 ++++ src/scidata/__init__.py | 2 + 5 files changed, 477 insertions(+) create mode 100644 doc/Makefile create mode 100644 doc/conf.py create mode 100644 doc/index.rst diff --git a/.gitignore b/.gitignore index d2d6f360b5d..e8f59a8999c 100644 --- a/.gitignore +++ b/.gitignore @@ -33,3 +33,5 @@ nosetests.xml .mr.developer.cfg .project .pydevproject + +doc/_build diff --git a/doc/Makefile b/doc/Makefile new file mode 100644 index 00000000000..78e298c02b5 --- /dev/null +++ b/doc/Makefile @@ -0,0 +1,177 @@ +# Makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +PAPER = +BUILDDIR = _build + +# User-friendly check for sphinx-build +ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) +$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) +endif + +# Internal variables. +PAPEROPT_a4 = -D latex_paper_size=a4 +PAPEROPT_letter = -D latex_paper_size=letter +ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . +# the i18n builder cannot share the environment and doctrees with the others +I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
+ +.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext + +help: + @echo "Please use \`make ' where is one of" + @echo " html to make standalone HTML files" + @echo " dirhtml to make HTML files named index.html in directories" + @echo " singlehtml to make a single large HTML file" + @echo " pickle to make pickle files" + @echo " json to make JSON files" + @echo " htmlhelp to make HTML files and a HTML help project" + @echo " qthelp to make HTML files and a qthelp project" + @echo " devhelp to make HTML files and a Devhelp project" + @echo " epub to make an epub" + @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" + @echo " latexpdf to make LaTeX files and run them through pdflatex" + @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" + @echo " text to make text files" + @echo " man to make manual pages" + @echo " texinfo to make Texinfo files" + @echo " info to make Texinfo files and run them through makeinfo" + @echo " gettext to make PO message catalogs" + @echo " changes to make an overview of all changed/added/deprecated items" + @echo " xml to make Docutils-native XML files" + @echo " pseudoxml to make pseudoxml-XML files for display purposes" + @echo " linkcheck to check all external links for integrity" + @echo " doctest to run all doctests embedded in the documentation (if enabled)" + +clean: + rm -rf $(BUILDDIR)/* + +html: + $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." + +dirhtml: + $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." + +singlehtml: + $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml + @echo + @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." + +pickle: + $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle + @echo + @echo "Build finished; now you can process the pickle files." + +json: + $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json + @echo + @echo "Build finished; now you can process the JSON files." + +htmlhelp: + $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp + @echo + @echo "Build finished; now you can run HTML Help Workshop with the" \ + ".hhp project file in $(BUILDDIR)/htmlhelp." + +qthelp: + $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp + @echo + @echo "Build finished; now you can run "qcollectiongenerator" with the" \ + ".qhcp project file in $(BUILDDIR)/qthelp, like this:" + @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/scidata.qhcp" + @echo "To view the help file:" + @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/scidata.qhc" + +devhelp: + $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp + @echo + @echo "Build finished." + @echo "To view the help file:" + @echo "# mkdir -p $$HOME/.local/share/devhelp/scidata" + @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/scidata" + @echo "# devhelp" + +epub: + $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub + @echo + @echo "Build finished. The epub file is in $(BUILDDIR)/epub." + +latex: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo + @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." + @echo "Run \`make' in that directory to run these through (pdf)latex" \ + "(use \`make latexpdf' here to do that automatically)." 
+ +latexpdf: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through pdflatex..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +latexpdfja: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through platex and dvipdfmx..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +text: + $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text + @echo + @echo "Build finished. The text files are in $(BUILDDIR)/text." + +man: + $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man + @echo + @echo "Build finished. The manual pages are in $(BUILDDIR)/man." + +texinfo: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo + @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." + @echo "Run \`make' in that directory to run these through makeinfo" \ + "(use \`make info' here to do that automatically)." + +info: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo "Running Texinfo files through makeinfo..." + make -C $(BUILDDIR)/texinfo info + @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." + +gettext: + $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale + @echo + @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." + +changes: + $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes + @echo + @echo "The overview file is in $(BUILDDIR)/changes." + +linkcheck: + $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck + @echo + @echo "Link check complete; look for any errors in the above output " \ + "or in $(BUILDDIR)/linkcheck/output.txt." + +doctest: + $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest + @echo "Testing of doctests in the sources finished, look at the " \ + "results in $(BUILDDIR)/doctest/output.txt." + +xml: + $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml + @echo + @echo "Build finished. The XML files are in $(BUILDDIR)/xml." + +pseudoxml: + $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml + @echo + @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." diff --git a/doc/conf.py b/doc/conf.py new file mode 100644 index 00000000000..6107216b9a4 --- /dev/null +++ b/doc/conf.py @@ -0,0 +1,272 @@ +# -*- coding: utf-8 -*- +# +# scidata documentation build configuration file, created by +# sphinx-quickstart on Thu Feb 6 18:57:54 2014. +# +# This file is execfile()d with the current directory set to its +# containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +import sys +import os + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +#sys.path.insert(0, os.path.abspath('.')) + +# -- General configuration ------------------------------------------------ + +# If your documentation needs a minimal Sphinx version, state it here. +#needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. 
+extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.autosummary', + 'sphinx.ext.intersphinx', + 'sphinx.ext.todo', + 'sphinx.ext.coverage', + 'sphinx.ext.viewcode', + 'numpydoc', +] + +numpydoc_class_members_toctree = True + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix of source filenames. +source_suffix = '.rst' + +# The encoding of source files. +#source_encoding = 'utf-8-sig' + +# The master toctree document. +master_doc = 'index' + +# General information about the project. +project = u'scidata' +copyright = u'2014, Stephan Hoyer and Alex Kleeman' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +version = '0.1-dev' +# The full version, including alpha/beta/rc tags. +release = '0.1-dev' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +#language = None + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +#today = '' +# Else, today_fmt is used as the format for a strftime call. +#today_fmt = '%B %d, %Y' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +exclude_patterns = ['_build'] + +# The reST default role (used for this markup: `text`) to use for all +# documents. +#default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. +#add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +#add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. +#show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# A list of ignored prefixes for module index sorting. +#modindex_common_prefix = [] + +# If true, keep warnings as "system message" paragraphs in the built documents. +#keep_warnings = False + + +# -- Options for HTML output ---------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +html_theme = 'default' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +#html_theme_options = {} + +# Add any paths that contain custom themes here, relative to this directory. +#html_theme_path = [] + +# The name for this set of Sphinx documents. If None, it defaults to +# " v documentation". +#html_title = None + +# A shorter title for the navigation bar. Default is the same as html_title. +#html_short_title = None + +# The name of an image file (relative to this directory) to place at the top +# of the sidebar. +#html_logo = None + +# The name of an image file (within the static path) to use as favicon of the +# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 +# pixels large. +#html_favicon = None + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". 
+html_static_path = ['_static'] + +# Add any extra paths that contain custom files (such as robots.txt or +# .htaccess) here, relative to this directory. These files are copied +# directly to the root of the documentation. +#html_extra_path = [] + +# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, +# using the given strftime format. +#html_last_updated_fmt = '%b %d, %Y' + +# If true, SmartyPants will be used to convert quotes and dashes to +# typographically correct entities. +#html_use_smartypants = True + +# Custom sidebar templates, maps document names to template names. +#html_sidebars = {} + +# Additional templates that should be rendered to pages, maps page names to +# template names. +#html_additional_pages = {} + +# If false, no module index is generated. +#html_domain_indices = True + +# If false, no index is generated. +#html_use_index = True + +# If true, the index is split into individual pages for each letter. +#html_split_index = False + +# If true, links to the reST sources are added to the pages. +#html_show_sourcelink = True + +# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. +#html_show_sphinx = True + +# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. +#html_show_copyright = True + +# If true, an OpenSearch description file will be output, and all pages will +# contain a tag referring to it. The value of this option must be the +# base URL from which the finished HTML is served. +#html_use_opensearch = '' + +# This is the file name suffix for HTML files (e.g. ".xhtml"). +#html_file_suffix = None + +# Output file base name for HTML help builder. +htmlhelp_basename = 'scidatadoc' + + +# -- Options for LaTeX output --------------------------------------------- + +latex_elements = { +# The paper size ('letterpaper' or 'a4paper'). +#'papersize': 'letterpaper', + +# The font size ('10pt', '11pt' or '12pt'). +#'pointsize': '10pt', + +# Additional stuff for the LaTeX preamble. +#'preamble': '', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + ('index', 'scidata.tex', u'scidata Documentation', + u'Stephan Hoyer and Alex Kleeman', 'manual'), +] + +# The name of an image file (relative to this directory) to place at the top of +# the title page. +#latex_logo = None + +# For "manual" documents, if this is true, then toplevel headings are parts, +# not chapters. +#latex_use_parts = False + +# If true, show page references after internal links. +#latex_show_pagerefs = False + +# If true, show URL addresses after external links. +#latex_show_urls = False + +# Documents to append as an appendix to all manuals. +#latex_appendices = [] + +# If false, no module index is generated. +#latex_domain_indices = True + + +# -- Options for manual page output --------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + ('index', 'scidata', u'scidata Documentation', + [u'Stephan Hoyer and Alex Kleeman'], 1) +] + +# If true, show URL addresses after external links. +#man_show_urls = False + + +# -- Options for Texinfo output ------------------------------------------- + +# Grouping the document tree into Texinfo files. 
List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + ('index', 'scidata', u'scidata Documentation', + u'Stephan Hoyer and Alex Kleeman', 'scidata', 'One line description of project.', + 'Miscellaneous'), +] + +# Documents to append as an appendix to all manuals. +#texinfo_appendices = [] + +# If false, no module index is generated. +#texinfo_domain_indices = True + +# How to display URL addresses: 'footnote', 'no', or 'inline'. +#texinfo_show_urls = 'footnote' + +# If true, do not generate a @detailmenu in the "Top" node's menu. +#texinfo_no_detailmenu = False + + +# Example configuration for intersphinx: refer to the Python standard library. +intersphinx_mapping = {'http://docs.python.org/': None} diff --git a/doc/index.rst b/doc/index.rst new file mode 100644 index 00000000000..09eae1412bb --- /dev/null +++ b/doc/index.rst @@ -0,0 +1,24 @@ +.. scidata documentation master file, created by + sphinx-quickstart on Thu Feb 6 18:57:54 2014. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +SciData reference +================= + +Contents: + +.. toctree:: + :maxdepth: 2 + +.. automodule:: scidata + :members: + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` + diff --git a/src/scidata/__init__.py b/src/scidata/__init__.py index 27f673cbeba..15cfaa68077 100644 --- a/src/scidata/__init__.py +++ b/src/scidata/__init__.py @@ -3,3 +3,5 @@ from variable import Variable import backends + +__all__ = ['open_dataset', 'Dataset', 'DataView', 'Variable', 'intersection'] From 941e83359ea81ab128d8f67e79b80cdd13f21b8b Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Thu, 6 Feb 2014 22:55:08 -0800 Subject: [PATCH 18/45] `to_dataframe` method implements pandas.DataFrame export Refactored the `broadcast_variables` function and fixed a bug in `variable.transpose` (see the new test case). 
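A rough sketch of the dimension-name-based broadcasting the refactored `broadcast_variables` helper is intended to provide (the dimension names and array sizes below are made up for illustration):

    import numpy as np
    from scidata import Variable, broadcast_variables

    a = Variable(['x'], np.arange(3))
    b = Variable(['y'], np.arange(4))
    a2, b2 = broadcast_variables(a, b)
    # both results now have dimensions ('x', 'y'); a2.data has shape (3, 1)
    # and b2.data has shape (1, 4), so a2.data * b2.data broadcasts to (3, 4)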
--- setup.py | 7 +-- src/scidata/__init__.py | 7 ++- src/scidata/dataset.py | 24 ++++++++- src/scidata/dataview.py | 9 ++++ src/scidata/variable.py | 111 ++++++++++++++++++++++------------------ test/test_dataset.py | 10 ++++ test/test_variable.py | 4 ++ 7 files changed, 116 insertions(+), 56 deletions(-) diff --git a/setup.py b/setup.py index 7743a55f8c5..ea96e79939b 100644 --- a/setup.py +++ b/setup.py @@ -6,11 +6,12 @@ from distutils.core import setup setup(name='scidata', - version='0.01', + version='0.1-dev', description='Objects for holding self describing scientific data in python', - author='Alex Kleeman', + author='Stephan Hoyer, Alex Kleeman', author_email='TODO', - install_requires=['scipy >= 0.10.0', 'numpy >= 1.7', 'netCDF4 >= 1.0.6'], + install_requires=['scipy >= 0.10.0', 'numpy >= 1.7', 'netCDF4 >= 1.0.6', + 'pandas >= 0.13.1'], tests_require=['nose >= 1.0'], url='https://github.com/akleeman/scidata', test_suite='nose.collector', diff --git a/src/scidata/__init__.py b/src/scidata/__init__.py index 15cfaa68077..36386d53c6e 100644 --- a/src/scidata/__init__.py +++ b/src/scidata/__init__.py @@ -1,7 +1,10 @@ from dataset import Dataset, open_dataset from dataview import DataView, intersection -from variable import Variable +from utils import orthogonal_indexer, num2datetimeindex, variable_equal +from variable import Variable, broadcast_variables import backends -__all__ = ['open_dataset', 'Dataset', 'DataView', 'Variable', 'intersection'] +__all__ = ['open_dataset', 'Dataset', 'DataView', 'Variable', 'intersection', + 'broadcast_variables', 'orthogonal_indexer', 'num2datetimeindex', + 'variable_equal'] diff --git a/src/scidata/dataset.py b/src/scidata/dataset.py index 4cf1bb793a7..cd60b44f597 100644 --- a/src/scidata/dataset.py +++ b/src/scidata/dataset.py @@ -9,7 +9,7 @@ from dataview import DataView from utils import FrozenOrderedDict, Frozen -from variable import Variable +from variable import Variable, broadcast_variables import backends, conventions, utils date2num = nc4.date2num @@ -902,6 +902,28 @@ def iterator(self, dimension): for i in xrange(self.dimensions[dimension]): yield (coord[i], self.indexed_by(**{dimension: i})) + def to_dataframe(self): + """Convert this dataset into a pandas.DataFrame + + Non-coordinate variables in this dataset form the columns of the + DataFrame. The DataFrame is be indexed by the Cartesian product of + this dataset's indices. 
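+
+        Examples
+        --------
+        A sketch modeled on the new test case (the variable and dimension
+        names are illustrative only)::
+
+            ds = Dataset({'a': Variable(['t'], np.arange(3)),
+                          'b': Variable(['t'], np.ones(3))})
+            df = ds.to_dataframe()  # columns 'a' and 'b', indexed along 't'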
+ """ + index_names = self.indices.keys() + columns = self.noncoordinates.keys() + data = [] + # we need a template to broadcast all dataset variables against + template = Variable(self.dimensions.keys(), + np.empty(self.dimensions.values())) + for k in columns: + _, var = broadcast_variables(template, self[k]) + _, var_data = np.broadcast_arrays(template.data, var.data) + data.append(var_data.reshape(-1)) + # note: pd.MultiIndex.from_product is new in pandas-0.13.1 + index = pd.MultiIndex.from_product(self.indices.values(), + names=index_names) + return pd.DataFrame(OrderedDict(zip(columns, data)), index=index) + if __name__ == "__main__": """ diff --git a/src/scidata/dataview.py b/src/scidata/dataview.py index 9177f75b467..cd7b81f41b2 100644 --- a/src/scidata/dataview.py +++ b/src/scidata/dataview.py @@ -348,6 +348,15 @@ def from_stack(cls, dataviews, new_dim_name='stacked_dimension'): ds[focus] = variable.Variable.from_stack(dataviews, new_dim_name) return cls(ds, focus) + def to_dataframe(self): + """Convert this dataview into a pandas.DataFrame + + Non-coordinate variables in this dataview's dataset (which include the + view's data) form the columns of the DataFrame. The DataFrame is be + indexed by the Cartesian product of the dataset's indices. + """ + return self.dataset.to_dataframe() + def __array_wrap__(self, result): return self.replace_focus(self.variable.__array_wrap__(result)) diff --git a/src/scidata/variable.py b/src/scidata/variable.py index 1d089eb7369..c1b56e25fb4 100644 --- a/src/scidata/variable.py +++ b/src/scidata/variable.py @@ -269,7 +269,7 @@ def transpose(self, *dimensions): """ if len(dimensions) == 0: dimensions = self.dimensions[::-1] - axes = [dimensions.index(dim) for dim in self.dimensions] + axes = [self.dimensions.index(dim) for dim in dimensions] data = self.data.transpose(*axes) return type(self)(dimensions, data, self.attributes) @@ -456,7 +456,7 @@ def _binary_op(f, reflexive=False): def func(self, other): if isinstance(other, dataview.DataView): return NotImplemented - self_data, other_data, new_dims = _broadcast_var_data(self, other) + self_data, other_data, dims = _broadcast_variable_data(self, other) new_data = (f(self_data, other_data) if not reflexive else f(other_data, self_data)) @@ -465,14 +465,14 @@ def func(self, other): other.attributes) else: new_attr = self.attributes - return type(self)(new_dims, new_data, new_attr) + return type(self)(dims, new_data, new_attr) return func @staticmethod def _inplace_binary_op(f): @functools.wraps(f) def func(self, other): - self_data, other_data, dims = _broadcast_var_data(self, other) + self_data, other_data, dims = _broadcast_variable_data(self, other) if dims != self.dimensions: raise ValueError('dimensions cannot change for in-place ' 'operations') @@ -485,57 +485,68 @@ def func(self, other): ops.inject_special_operations(Variable) -def _broadcast_var_data(self, other): - self_data = self.data +def broadcast_variables(first, second): + """Given two variables, return two variables with matching dimensions and + numpy broadcast compatible data + + Parameters + ---------- + first, second : Variable + Variable objects to broadcast. + + Returns + ------- + first_broadcast, second_broadcast : Variable + Broadcast variables. The data on each variable will be a view of the + data on the corresponding original variables, but dimensions will be + reordered and inserted so that both broadcast variables have the same + dimensions. 
The new dimensions are sorted in order of appearence in the + first variable's dimensions followed by the second variable's + dimensions. + """ + # TODO: add unit tests specifically for this function + # validate dimensions + dim_lengths = dict(zip(first.dimensions, first.shape)) + for k, v in zip(second.dimensions, second.shape): + if k in dim_lengths and dim_lengths[k] != v: + raise ValueError('operands could not be broadcast together ' + 'with mismatched lengths for dimension %r: %s' + % (k, (dim_lengths[k], v))) + for dimensions in [first.dimensions, second.dimensions]: + if len(set(dimensions)) < len(dimensions): + raise ValueError('broadcasting requires that neither operand ' + 'has duplicate dimensions: %r' + % list(dimensions)) + + # build dimensions for new Variable + second_only_dims = [d for d in second.dimensions + if d not in first.dimensions] + dimensions = list(first.dimensions) + second_only_dims + + # expand first_data's dimensions so it's broadcast compatible after + # adding second's dimensions at the end + first_data = first.data[(Ellipsis,) + (None,) * len(second_only_dims)] + new_first = Variable(dimensions, first_data) + # expand and reorder second_data so the dimensions line up + first_only_dims = [d for d in dimensions if d not in second.dimensions] + second_dims = list(second.dimensions) + first_only_dims + second_data = second.data[(Ellipsis,) + (None,) * len(first_only_dims)] + new_second = Variable(second_dims, second_data).transpose(*dimensions) + return new_first, new_second + + +def _broadcast_variable_data(self, other): if isinstance(other, dataset.Dataset): raise TypeError('datasets do not support mathematical operations') elif all(hasattr(other, attr) for attr in ['dimensions', 'data', 'shape']): - # validate dimensions - dim_lengths = dict(zip(self.dimensions, self.shape)) - for k, v in zip(other.dimensions, other.shape): - if k in dim_lengths and dim_lengths[k] != v: - raise ValueError('operands could not be broadcast together ' - 'with mismatched lengths for dimension %r: %s' - % (k, (dim_lengths[k], v))) - for dimensions in [self.dimensions, other.dimensions]: - if len(set(dimensions)) < len(dimensions): - raise ValueError('broadcasting requires that neither operand ' - 'has duplicate dimensions: %r' - % list(dimensions)) - - # build dimensions for new Variable - other_only_dims = [dim for dim in other.dimensions - if dim not in self.dimensions] - dimensions = list(self.dimensions) + other_only_dims - - # expand self_data's dimensions so it's broadcast compatible after - # adding other's dimensions to the end - for _ in xrange(len(other_only_dims)): - self_data = np.expand_dims(self_data, axis=-1) - - # expand and reorder other_data so the dimensions line up - self_only_dims = [dim for dim in dimensions - if dim not in other.dimensions] - other_data = other.data - for _ in xrange(len(self_only_dims)): - other_data = np.expand_dims(other_data, axis=-1) - other_dims = list(other.dimensions) + self_only_dims - axes = [other_dims.index(dim) for dim in dimensions] - other_data = other_data.transpose(axes) + # `other` satisfies the Variable API + new_self, new_other = broadcast_variables(self, other) + self_data = new_self.data + other_data = new_other.data + dimensions = new_self.dimensions else: # rely on numpy broadcasting rules + self_data = self.data other_data = other dimensions = self.dimensions return self_data, other_data, dimensions - - -def _math_safe_attributes(v): - """Given a variable, return the variables's attributes that are safe for - 
mathematical operations (e.g., all those except for 'units') - """ - try: - attr = v.attributes - except AttributeError: - return {} - else: - return OrderedDict((k, v) for k, v in attr.items() if k != 'units') diff --git a/test/test_dataset.py b/test/test_dataset.py index ce415d6a61b..8295046b17f 100644 --- a/test/test_dataset.py +++ b/test/test_dataset.py @@ -353,6 +353,16 @@ def test_write_store(self): actual = Dataset.load_store(store) self.assertEquals(expected, actual) + def test_to_dataframe(self): + x = np.random.randn(10) + y = np.random.randn(10) + ds = Dataset({'a': Variable('t', x), 'b': Variable('t', y)}) + expected = pd.DataFrame(np.array([x, y]).T, columns=['a', 'b'], + index=pd.Index(np.arange(10), name='t')) + actual = ds.to_dataframe() + # use the .equals method to check all DataFrame metadata + self.assertTrue(expected.equals(actual)) + class NetCDF4DataTest(DataTest): def get_store(self): diff --git a/test/test_variable.py b/test/test_variable.py index 9ff2bbc097b..ac5a022a416 100644 --- a/test/test_variable.py +++ b/test/test_variable.py @@ -86,7 +86,11 @@ def test_transpose(self): x = np.random.randn(2, 3, 4, 5) w = Variable(['a', 'b', 'c', 'd'], x) w2 = Variable(['d', 'b', 'c', 'a'], np.einsum('abcd->dbca', x)) + self.assertEqual(w2.shape, (5, 3, 4, 2)) self.assertVarEqual(w2, w.transpose('d', 'b', 'c', 'a')) + self.assertVarEqual(w, w2.transpose('a', 'b', 'c', 'd')) + w3 = Variable(['b', 'c', 'd', 'a'], np.einsum('abcd->bcda', x)) + self.assertVarEqual(w, w3.transpose('a', 'b', 'c', 'd')) def test_1d_math(self): x = np.arange(5) From d6c0b825539f1ed040cdabbea639556edfc39a1a Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Fri, 7 Feb 2014 00:41:02 -0800 Subject: [PATCH 19/45] New README.md --- README.md | 70 ++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 49 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index df617932e0b..d7c44a5513e 100644 --- a/README.md +++ b/README.md @@ -1,21 +1,49 @@ -scidata -======= - -Objects for holding self describing scientific data in python. The goal of this project is to -provide a Common Data Model (http://www.unidata.ucar.edu/software/thredds/current/netcdf-java/CDM/) -allowing users to read write and manipulate netcdf-like data without worrying about where the data -source lives. A dataset that is too large to fit in memory, served from an OpenDAP server, streamed -or stored as NetCDF3, NetCDF4, grib (?), HDF5 and others can all be inspected and manipulated using -the same methods. - -Of course there are already several packages in python that offer similar functionality (netCDF4, -scipy.io, pupynere, iris, ... ) but each of those packages have their own shortcomings: - -netCDF4 - Doesn't allow streaming. If you want to create a new object it needs to live on disk. -scipy.io / pupynere - Only works with NetCDF3 and doesn't support DAP making it difficult to work with large datasets. -iris - is REALLY close to what this project will provide, but iris strays further from the CDM, - than I would like. (if you read then write a netcdf file using iris all global attributes - are pushed down to variable level attributes. +# scidata: objects for working with scientific data in Python + +**scidata** is a Python package for working with aligned sets of homogeneous, +n-dimensional arrays. It implements flexible array operations and dataset +manipulation for in-memory datasets within the [Common Data Model][cdm] widely +used for self-describing scientific data (netCDF, OpenDAP, etc.). 
+ +## Main Features + + - A `DataView` object that is compatible with NumPy's ndarray and ufuncs + but keeps ancillary variables and metadata intact. + - Array broadcasting based on dimension names and coordinate indices + instead of only shapes. + - Aggregate variables across dimensions or grouped by other variables. + - Fast label-based indexing and time-series functionality built on + [pandas][pandas]. + +## Design Goals + + - Provide a data analysis toolkit as fast and powerful as pandas but + designed for working with datasets of aligned, homogeneous N-dimensional + arrays. + - Whenever possible, build on top of and interoperate with pandas and the + rest of the awesome [scientific python stack][scipy]. + - Be as fast as NumPy. + - Provide a uniform API for loading and saving scientific data in a variety + of formats (including streaming data). + - Use metadata according to [conventions][cf] when appropriate, but don't + strictly enforce it. Conflicting attributes (e.g., units) may be silently + dropped. + +## Prior Art + + - [Iris][iris] is an awesome package for working with meteorological data with + unfortunately complex data-structures and strict enforcement of metadata + conventions. Scidata's `DataView` is largely based on the Iris `Cube`. + - [netCDF4-python][nc4] provides scidata's primary interface for working with + netCDF and OpenDAP datasets. + - [pandas][pandas] fast and powerful but oriented around working with + tabular datasets. pandas has experimental N-dimensional panels, but they + don't support aligned math with other objects. We believe the `DataView`/ + `Cube` model is better suited to working with scientific datasets. + +[pandas]: http://pandas.pydata.org/ +[cdm]: http://www.unidata.ucar.edu/software/thredds/current/netcdf-java/CDM/ +[cf]: http://cf-pcmdi.llnl.gov/documents/cf-conventions/1.6/cf-conventions.html +[scipy]: http://scipy.org/ +[nc4]: http://netcdf4-python.googlecode.com/svn/trunk/docs/netCDF4-module.html +[iris]: http://scitools.org.uk/iris/ From fb3289d5d5cd8fdc3f6a8d92b2cc451a94899408 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Fri, 7 Feb 2014 01:01:12 -0800 Subject: [PATCH 20/45] Fixed intersection Added better test that uses string labels instead of just integers.
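A quick sketch of how the fixed function is meant to behave, mirroring the updated test (the variable names and letter labels are illustrative only):

    import numpy as np
    from scidata import Dataset, Variable, intersection

    ds = Dataset({'foo': Variable(['x', 'y'], np.random.random((10, 20)))})
    ds.set_variable('x', Variable(['x'], np.array(list('abcdefghij'))))
    dv = ds['foo']
    dv1, dv2 = intersection(dv, dv[:5])
    # both results are restricted to the 'x' labels shared by the inputs
    # ('a' through 'e'), so they are aligned and can be used in arithmetic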
--- src/scidata/dataset.py | 6 ++---- src/scidata/dataview.py | 5 ++++- src/scidata/utils.py | 2 ++ test/test_dataview.py | 1 + 4 files changed, 9 insertions(+), 5 deletions(-) diff --git a/src/scidata/dataset.py b/src/scidata/dataset.py index cd60b44f597..3134322d909 100644 --- a/src/scidata/dataset.py +++ b/src/scidata/dataset.py @@ -655,12 +655,10 @@ def _loc_to_int_indexer(self, dim, locations): try: indexer = index.get_loc(locations) except TypeError: - # value is an list or array - new_index, indexer = index.reindex(np.asarray(locations)) + # value is a list or array + indexer = index.get_indexer(np.asarray(locations)) if np.any(indexer < 0): raise ValueError('not all values found in index %r' % dim) - # FIXME: don't throw away new_index (we'll need to recreate it - # later) return indexer def labeled_by(self, **indexers): diff --git a/src/scidata/dataview.py b/src/scidata/dataview.py index cd7b81f41b2..778b332eae9 100644 --- a/src/scidata/dataview.py +++ b/src/scidata/dataview.py @@ -408,9 +408,12 @@ def intersection(dataview1, dataview2): """Given two dataview objects, returns two new dataviews where all indices found on both dataviews are replaced by their intersection """ + # TODO: automatically calculate the intersection when doing math with + # dataviews, or better yet calculate the union of the indices and fill in + # the mis-aligned data with NaN. overlapping_indices = {k: dataview1.indices[k] & dataview2.indices[k] for k in dataview1.indices if k in dataview2.indices} - return tuple(dv.indexed_by(**overlapping_indices) + return tuple(dv.labeled_by(**overlapping_indices) for dv in [dataview1, dataview2]) diff --git a/src/scidata/utils.py b/src/scidata/utils.py index 517e5869077..c6d9b3ed2ec 100644 --- a/src/scidata/utils.py +++ b/src/scidata/utils.py @@ -103,6 +103,8 @@ def variable_equal(v1, v2): # _data is not part of the public interface, so it's okay if its # missing pass + # TODO: replace this with a NaN safe version. 
+ # see: pandas.core.common.array_equivalent return np.array_equal(v1.data, v2.data) else: return False diff --git a/test/test_dataview.py b/test/test_dataview.py index 010a6b33af4..03e3b9ae5e6 100644 --- a/test/test_dataview.py +++ b/test/test_dataview.py @@ -179,6 +179,7 @@ def test_from_stack(self): bar.variable], 'w')) def test_intersection(self): + self.ds.set_variable('x', Variable(['x'], np.array(list('abcdefghij')))) with self.assertRaises(ValueError): self.dv + self.dv[:5] dv1, dv2 = intersection(self.dv, self.dv[:5]) From b5ff8d93cccaeaf91769c6811a54985ed1d8fbd2 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Fri, 7 Feb 2014 10:24:10 -0800 Subject: [PATCH 21/45] Renamed DataView.replace_focus to DataView.refocus --- src/scidata/backends.py | 2 ++ src/scidata/dataview.py | 10 +++++----- test/test_dataview.py | 12 ++++++------ 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/src/scidata/backends.py b/src/scidata/backends.py index 5941271944d..ae004804520 100644 --- a/src/scidata/backends.py +++ b/src/scidata/backends.py @@ -165,6 +165,8 @@ def convert_nc4_variable(var): class NetCDF4DataStore(AbstractDataStore): def __init__(self, filename, *args, **kwdargs): + # TODO: set auto_maskandscale=True so we can handle the array + # packing/unpacking ourselves (using NaN instead of masked arrays) self.ds = nc4.Dataset(filename, *args, **kwdargs) @property diff --git a/src/scidata/dataview.py b/src/scidata/dataview.py index 778b332eae9..413b4b26f6b 100644 --- a/src/scidata/dataview.py +++ b/src/scidata/dataview.py @@ -180,7 +180,7 @@ def unselected(self): """ return self.dataset.unselect(self.focus) - def replace_focus(self, new_var): + def refocus(self, new_var): """Returns a copy of this DataView's dataset with this DataView's focus variable replaced by 'new_var' """ @@ -231,7 +231,7 @@ def transpose(self, *dimensions): numpy.transpose Variable.tranpose """ - return self.replace_focus(self.variable.transpose(*dimensions)) + return self.refocus(self.variable.transpose(*dimensions)) def collapse(self, func, dimension=None, axis=None, **kwargs): """Collapse this variable by applying `func` along some dimension(s) @@ -358,13 +358,13 @@ def to_dataframe(self): return self.dataset.to_dataframe() def __array_wrap__(self, result): - return self.replace_focus(self.variable.__array_wrap__(result)) + return self.refocus(self.variable.__array_wrap__(result)) @staticmethod def _unary_op(f): @functools.wraps(f) def func(self, *args, **kwargs): - return self.replace_focus(f(self.variable, *args, **kwargs)) + return self.refocus(f(self.variable, *args, **kwargs)) return func def _check_indices_compat(self, other): @@ -381,7 +381,7 @@ def _binary_op(f, reflexive=False): def func(self, other): self._check_indices_compat(other) other_variable = getattr(other, 'variable', other) - dv = self.replace_focus(f(self.variable, other_variable) + dv = self.refocus(f(self.variable, other_variable) if not reflexive else f(other_variable, self.variable)) if hasattr(other, 'unselected'): diff --git a/test/test_dataview.py b/test/test_dataview.py index 03e3b9ae5e6..6c604a62132 100644 --- a/test/test_dataview.py +++ b/test/test_dataview.py @@ -87,9 +87,9 @@ def test_renamed(self): self.assertEqual(renamed.dataset, self.ds.renamed({'foo': 'bar'})) self.assertEqual(renamed.focus, 'bar') - def test_replace_focus(self): - self.assertVarEqual(self.dv, self.dv.replace_focus(self.v)) - self.assertVarEqual(self.dv, self.dv.replace_focus(self.x)) + def test_refocus(self): + self.assertVarEqual(self.dv, 
self.dv.refocus(self.v)) + self.assertVarEqual(self.dv, self.dv.refocus(self.x)) def test_dataset_getitem(self): dv = self.ds['foo'] @@ -100,12 +100,12 @@ def test_array_interface(self): # test patched in methods self.assertArrayEqual(self.dv.take([2, 3]), self.x.take([2, 3])) self.assertViewEqual(self.dv.argsort(), - self.dv.replace_focus(self.x.argsort())) + self.dv.refocus(self.x.argsort())) self.assertViewEqual(self.dv.clip(2, 3), - self.dv.replace_focus(self.x.clip(2, 3))) + self.dv.refocus(self.x.clip(2, 3))) # test ufuncs self.assertViewEqual(np.sin(self.dv), - self.dv.replace_focus(np.sin(self.x))) + self.dv.refocus(np.sin(self.x))) self.assertViewEqual(self.dv, np.maximum(self.v, self.dv)) self.ds['bar'] = Variable(['x', 'y'], np.zeros((10, 20))) self.assertViewEqual(self.dv, np.maximum(self.dv, self.ds['bar'])) From 3c5856fe778c8d8d2595c4785bb35e135499786b Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Fri, 7 Feb 2014 12:32:19 -0800 Subject: [PATCH 22/45] README edits --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index d7c44a5513e..96f6e3c217a 100644 --- a/README.md +++ b/README.md @@ -22,12 +22,12 @@ used for self-describing scientific data (netCDF, OpenDAP, etc.). arrays. - Whenever possible, build on top of and interoperate with pandas and the rest of the awesome [scientific python stack][scipy]. - - Be as fast as NumPy. - Provide a uniform API for loading and saving scientific data in a variety of formats (including streaming data). - Use metadata according to [conventions][cf] when appropriate, but don't - strictly enforce it. Conflicting attributes (e.g., units) may be silently - dropped. + strictly enforce them. Conflicting attributes (e.g., units) should be + silently dropped instead of causing errors. The onus is on the user to + make sure that operations make sense. ## Prior Art @@ -36,7 +36,7 @@ used for self-describing scientific data (netCDF, OpenDAP, etc.). conventions. Scidata's `DataView` is largely based on the Iris `Cube`. - [netCDF4-python][nc4] provides scidata's primary interface for working with netCDF and OpenDAP datasets. - - [pandas][pandas] fast and powerful but oriented around working with + - [pandas][pandas] is fast and powerful but oriented around working with tabular datasets. pandas has experimental N-dimensional panels, but they don't support aligned math with other objects. We believe the `DataView`/ `Cube` model is better suited to working with scientific datasets. 
From 828c6dd3d043bfa46c93d64fed2df4601fec5f43 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Fri, 7 Feb 2014 16:58:45 -0800 Subject: [PATCH 23/45] Indexing bug fixes --- src/scidata/dataset.py | 22 ++++--------------- src/scidata/dataview.py | 47 ++++++++++++++++++++++++++--------------- src/scidata/utils.py | 21 ++++++++++++++++++ test/test_dataview.py | 14 ++++++++++++ 4 files changed, 69 insertions(+), 35 deletions(-) diff --git a/src/scidata/dataset.py b/src/scidata/dataset.py index 3134322d909..4b07908374c 100644 --- a/src/scidata/dataset.py +++ b/src/scidata/dataset.py @@ -8,7 +8,7 @@ from collections import OrderedDict, Mapping, MutableMapping from dataview import DataView -from utils import FrozenOrderedDict, Frozen +from utils import FrozenOrderedDict, Frozen, remap_loc_indexers from variable import Variable, broadcast_variables import backends, conventions, utils @@ -641,26 +641,13 @@ def indexed_by(self, **indexers): # filter out non-indices (indices for which one value was selected) indices = {k: v for k, v in indices.iteritems() if isinstance(v, pd.Index)} + variables = OrderedDict((k, v) for k, v in variables.iteritems() + if v.ndim > 0) dimensions = OrderedDict((k, indices[k].size) for k in self.dimensions if k in indices) return type(self)(variables, dimensions, self.attributes, indices=indices) - def _loc_to_int_indexer(self, dim, locations): - index = self.indices[dim] - if isinstance(locations, slice): - indexer = index.slice_indexer(locations.start, locations.stop, - locations.step) - else: - try: - indexer = index.get_loc(locations) - except TypeError: - # value is a list or array - indexer = index.get_indexer(np.asarray(locations)) - if np.any(indexer < 0): - raise ValueError('not all values found in index %r' % dim) - return indexer - def labeled_by(self, **indexers): """Return a new dataset with each variable indexed by coordinate labels along the specified dimension(s) @@ -699,8 +686,7 @@ def labeled_by(self, **indexers): Dataset.indexed_by Variable.indexed_by """ - return self.indexed_by(**{k: self._loc_to_int_indexer(k, v) - for k, v in indexers.iteritems()}) + return self.indexed_by(**remap_loc_indexers(self.indices, indexers)) def renamed(self, name_dict): """ diff --git a/src/scidata/dataview.py b/src/scidata/dataview.py index 413b4b26f6b..d24937c2523 100644 --- a/src/scidata/dataview.py +++ b/src/scidata/dataview.py @@ -2,6 +2,7 @@ # like pandas import functools import re +from collections import OrderedDict import numpy as np @@ -9,7 +10,7 @@ import ops import variable from common import _DataWrapperMixin -from utils import expanded_indexer, FrozenOrderedDict +from utils import expanded_indexer, FrozenOrderedDict, remap_loc_indexers class _LocIndexer(object): @@ -17,8 +18,9 @@ def __init__(self, dataview): self.dataview = dataview def _remap_key(self, key): - return tuple(self.dataview.dataset._loc_to_int_indexer(k, v) - for k, v in self.dataview._key_to_indexers(key)) + indexers = remap_loc_indexers(self.dataview.indices, + self.dataview._key_to_indexers(key)) + return tuple(indexers.values()) def __getitem__(self, key): return self.dataview[self._remap_key(key)] @@ -81,7 +83,8 @@ def dimensions(self): return self.variable.dimensions def _key_to_indexers(self, key): - return zip(self.dimensions, expanded_indexer(key, self.ndim)) + return OrderedDict( + zip(self.dimensions, expanded_indexer(key, self.ndim))) def __getitem__(self, key): if isinstance(key, basestring): @@ -89,7 +92,7 @@ def __getitem__(self, key): return self.dataset[key] else: # 
orthogonal array indexing - return self.indexed_by(**dict(self._key_to_indexers(key))) + return self.indexed_by(**self._key_to_indexers(key)) def __setitem__(self, key, value): if isinstance(key, basestring): @@ -156,7 +159,12 @@ def indexed_by(self, **indexers): -------- Dataset.indexed_by """ - return type(self)(self.dataset.indexed_by(**indexers), self.focus) + ds = self.dataset.indexed_by(**indexers) + if self.focus not in ds: + # always keep focus variable in the dataset, even if it was + # unselected because indexing made it a scalar + ds[self.focus] = self.variable.indexed_by(**indexers) + return type(self)(ds, self.focus) def labeled_by(self, **indexers): """Return a new dataview whose dataset is given by selecting coordinate @@ -166,7 +174,7 @@ def labeled_by(self, **indexers): -------- Dataset.labeled_by """ - return type(self)(self.dataset.labeled_by(**indexers), self.focus) + return self.indexed_by(**remap_loc_indexers(self.indices, indexers)) def renamed(self, new_name): """Returns a new DataView with this DataView's focus variable renamed @@ -182,11 +190,14 @@ def unselected(self): def refocus(self, new_var): """Returns a copy of this DataView's dataset with this DataView's - focus variable replaced by 'new_var' + focus variable replaced by `new_var` + + If `new_var` is a dataview, its contents will be merged in. """ if not hasattr(new_var, 'dimensions'): new_var = type(self.variable)(self.variable.dimensions, new_var) - ds = self.dataset.replace(self.focus, new_var) + ds = self.unselected() + ds[self.focus] = new_var return type(self)(ds, self.focus) def iterator(self, dimension): @@ -206,8 +217,8 @@ def iterator(self, dimension): The returned iterator yields pairs of scalar-valued coordinate variables and DataView objects. """ - for (x, dataset) in self.dataset.iterator(dimension): - yield (x, type(self)(dataset, self.focus)) + for (x, ds) in self.dataset.iterator(dimension): + yield (x, type(self)(ds, self.focus)) def transpose(self, *dimensions): """Return a new DataView object with transposed dimensions @@ -379,14 +390,16 @@ def _check_indices_compat(self, other): def _binary_op(f, reflexive=False): @functools.wraps(f) def func(self, other): + # TODO: automatically group by other variable dimensions self._check_indices_compat(other) - other_variable = getattr(other, 'variable', other) - dv = self.refocus(f(self.variable, other_variable) - if not reflexive - else f(other_variable, self.variable)) + ds = self.unselected() if hasattr(other, 'unselected'): - dv.dataset.merge(other.unselected(), inplace=True) - return dv + ds.merge(other.unselected(), inplace=True) + other_variable = getattr(other, 'variable', other) + ds[self.focus] = (f(self.variable, other_variable) + if not reflexive + else f(other_variable, self.variable)) + return ds[self.focus] return func @staticmethod diff --git a/src/scidata/utils.py b/src/scidata/utils.py index c6d9b3ed2ec..2fd54089395 100644 --- a/src/scidata/utils.py +++ b/src/scidata/utils.py @@ -62,6 +62,27 @@ def expand_array(k, length): return tuple(key) +def remap_loc_indexers(indices, indexers): + """Given mappings of indices and label based indexers, return equivalent + location based indexers + """ + new_indexers = OrderedDict() + for dim, loc in indexers.iteritems(): + index = indices[dim] + if isinstance(loc, slice): + indexer = index.slice_indexer(loc.start, loc.stop, loc.step) + else: + try: + indexer = index.get_loc(loc) + except TypeError: + # value is a list or array + indexer = index.get_indexer(np.asarray(loc)) + if
np.any(indexer < 0): + raise ValueError('not all values found in index %r' % dim) + new_indexers[dim] = indexer + return new_indexers + + def num2datetimeindex(num_dates, units, calendar=None): """Convert an array of numeric dates in netCDF format into a pandas.DatetimeIndex diff --git a/test/test_dataview.py b/test/test_dataview.py index 6c604a62132..2b974464c55 100644 --- a/test/test_dataview.py +++ b/test/test_dataview.py @@ -50,6 +50,10 @@ def test_items(self): self.assertVarEqual(self.v[i], self.dv[i]) # check that the new index is consistent self.assertEqual(list(self.dv[0].indices), ['y']) + # we always want to keep the dataview variable around + self.assertVarEqual(self.dv[0, 0], self.dv.variable[0, 0]) + self.assertEqual(self.dv[0, 0].dataset, + Dataset({'foo': self.dv.variable[0, 0]})) def test_iteration(self): for ((act_x, act_dv), (exp_x, exp_ds)) in \ @@ -133,6 +137,16 @@ def test_math(self): with self.assertRaisesRegexp(ValueError, 'not aligned'): b + a + def test_item_math(self): + self.ds.set_variable('x', Variable(['x'], np.array(list('abcdefghij')))) + self.assertVarEqual(self.dv + self.dv[0, 0], + self.dv + self.dv[0, 0].data) + new_data = self.x[0][None, :] + self.x[:, 0][:, None] + self.assertVarEqual(self.dv[:, 0] + self.dv[0], + Variable(['x', 'y'], new_data)) + self.assertVarEqual(self.dv[0] + self.dv[:, 0], + Variable(['y', 'x'], new_data.T)) + def test_inplace_math(self): x = self.x v = self.v From ef5ac51374488dcedaed834efeaf1f2f199d4171 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Fri, 7 Feb 2014 18:37:19 -0800 Subject: [PATCH 24/45] Fully traverse dataset graphs with Dataset.select --- src/scidata/dataset.py | 47 +++++++++++++++++++++++++----------------- 1 file changed, 28 insertions(+), 19 deletions(-) diff --git a/src/scidata/dataset.py b/src/scidata/dataset.py index 4b07908374c..fabe8261306 100644 --- a/src/scidata/dataset.py +++ b/src/scidata/dataset.py @@ -358,9 +358,7 @@ def coordinates(self): """Coordinates are variables with names that match dimensions""" return FrozenOrderedDict([(dim, self.variables[dim]) for dim in self.dimensions - if dim in self.variables and - self.variables[dim].data.ndim == 1 and - self.variables[dim].dimensions == (dim,)]) + if dim in self.variables]) @property def noncoordinates(self): @@ -792,22 +790,33 @@ def select(self, *names): raise ValueError( "One or more of the specified variables does not exist") - def get_aux_names(var): - names = set(var.dimensions) - if 'coordinates' in var.attributes: - coords = var.attributes['coordinates'] - if coords != '': - names |= set(coords.split(' ')) - return names - - aux_names = [get_aux_names(self.variables[k]) for k in names] - names = set(names).union(*aux_names) - - variables = OrderedDict((k, v) for k, v in self.variables.iteritems() - if k in names) - dimensions = OrderedDict((k, v) for k, v in self.dimensions.iteritems() - if k in names) - indices = {k: v for k, v in self.indices.cache.items() if k in names} + def get_all_associated_names(name): + yield name + if name in self: + var = self.variables[name] + for dim in var.dimensions: + yield dim + if 'coordinates' in var.attributes: + coords = var.attributes['coordinates'] + if coords != '': + for coord in coords.split(' '): + yield coord + + queue = set(names) + selected_names = set() + while queue: + name = queue.pop() + new_names = set(get_all_associated_names(name)) + queue |= new_names - selected_names + selected_names |= new_names + + def ordered_keys_in(dictionary, selection): + return OrderedDict((k, v) for k, v in
dictionary.iteritems() + if k in selection) + + variables = ordered_keys_in(self.variables, selected_names) + dimensions = ordered_keys_in(self.dimensions, selected_names) + indices = ordered_keys_in(self.indices.cache, selected_names) return type(self)(variables, dimensions, self.attributes, indices=indices) From a4b6ad9840abb0da206cab7b02d507d0998c9437 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Sun, 9 Feb 2014 15:50:34 -0800 Subject: [PATCH 25/45] DataView.from_stack can concatenate along existing dimensions, too --- src/scidata/__init__.py | 2 + src/scidata/dataview.py | 54 ++++++++++++------- src/scidata/utils.py | 6 ++- src/scidata/variable.py | 111 +++++++++++++++++++++++++++------------- test/test_dataview.py | 4 ++ test/test_variable.py | 13 ++++- 6 files changed, 133 insertions(+), 57 deletions(-) diff --git a/src/scidata/__init__.py b/src/scidata/__init__.py index 36386d53c6e..4a4789b7e3c 100644 --- a/src/scidata/__init__.py +++ b/src/scidata/__init__.py @@ -5,6 +5,8 @@ import backends +concat = DataView.from_stack + __all__ = ['open_dataset', 'Dataset', 'DataView', 'Variable', 'intersection', 'broadcast_variables', 'orthogonal_indexer', 'num2datetimeindex', 'variable_equal'] diff --git a/src/scidata/dataview.py b/src/scidata/dataview.py index d24937c2523..4854b648639 100644 --- a/src/scidata/dataview.py +++ b/src/scidata/dataview.py @@ -315,48 +315,66 @@ def aggregate(self, func, new_dim, **kwargs): func, new_dim.focus, new_dim, **kwargs) # TODO: add options for how to summarize variables along aggregated # dimensions instead of just dropping them? - drop = ({self.focus} | - ({new_dim.focus} if new_dim.focus in self.dataset else set()) | - {k for k, v in self.dataset.variables.iteritems() - if any(dim in new_dim.dimensions for dim in v.dimensions)}) + drop = {k for k, v in self.dataset.variables.iteritems() + if any(dim in new_dim.dimensions for dim in v.dimensions)} ds = self.dataset.unselect(*drop) ds.add_coordinate(unique) ds.add_variable(self.focus, aggregated) return type(self)(ds, self.focus) @classmethod - def from_stack(cls, dataviews, new_dim_name='stacked_dimension'): - """Stack dataviews along a new dimension to form a new dataview + def from_stack(cls, dataviews, dimension='stacked_dimension'): + """Stack dataviews along a new or existing dimension to form a new + dataview Parameters ---------- - dataviews : iterable of Variable and/or DataView - Variables and/or DataView objects to stack together. - dim : str, optional - Name of the new dimension. + dataviews : iterable of DataView or Variable + Variables to stack together. Each variable is expected to have + matching dimensions and shape except for along the stacked + dimension. + dimension : str or DataView, optional + Name of the dimension to stack along. This can either be a new + dimension name, in which case it is added along axis=0, or an + existing dimension name, in which case the location of the + dimension is unchanged. Where to insert the new dimension is + determined by the first dataview. Returns ------- stacked : DataView Stacked dataview formed by stacking all the supplied variables - along the new dimension. The new dimension will be the first - dimension in the stacked dataview. + along the new dimension. 
""" - views = list(dataviews) - if not views: + dataviews = list(dataviews) + if not dataviews: raise ValueError('DataView.from_stack was supplied with an ' 'empty argument') + + # create an empty dataset in which to stack variables + # start by putting in the dimension variable ds = dataset.Dataset() - focus = default_focus = 'stacked_variable' - for view in views: + if isinstance(dimension, basestring): + dim_name = dimension + else: + dim_name = dimension.focus + ds[dim_name] = dimension + + # figure out metadata for each dataview + focus = None + for view in dataviews: if isinstance(view, cls): ds.merge(view.unselected(), inplace=True) - if focus == default_focus: + if focus is None: focus = view.focus elif focus != view.focus: raise ValueError('DataView.from_stack requires that all ' 'stacked views have the same focus') - ds[focus] = variable.Variable.from_stack(dataviews, new_dim_name) + if focus is None: + focus = 'stacked_variable' + + # finally, merge in the stacked variables + ds[focus] = variable.Variable.from_stack(dataviews, dim_name) return cls(ds, focus) def to_dataframe(self): diff --git a/src/scidata/utils.py b/src/scidata/utils.py index 2fd54089395..f9334321c95 100644 --- a/src/scidata/utils.py +++ b/src/scidata/utils.py @@ -135,7 +135,8 @@ def update_safety_check(first_dict, second_dict, compat=operator.eq): """Check the safety of updating one dictionary with another Raises ValueError if dictionaries have non-compatible values for any key, - where compatibility is determined by the `compat` function. + where compatibility is determined by identity (they are the same item) or + the `compat` function. Parameters ---------- @@ -147,7 +148,8 @@ def update_safety_check(first_dict, second_dict, compat=operator.eq): checks for equality. """ for k, v in second_dict.iteritems(): - if k in first_dict and not compat(v, first_dict[k]): + if (k in first_dict and + not (v is first_dict[k] or compat(v, first_dict[k]))): raise ValueError('unsafe to merge dictionaries without ' 'overriding values') diff --git a/src/scidata/variable.py b/src/scidata/variable.py index c1b56e25fb4..f5c48d8384c 100644 --- a/src/scidata/variable.py +++ b/src/scidata/variable.py @@ -70,10 +70,13 @@ def __init__(self, dims, data, attributes=None, indexing_mode='numpy'): Attributes to assign to the new variable. If None (default), an empty attribute dictionary is initialized. indexing_mode : {'numpy', 'orthogonal'} - String indicating how data handles to fancy indexing (with - arrays). Two modes are supported: 'numpy' (fancy indexing like - numpy.ndarray objects) and 'orthogonal' (array indexing accesses - different dimensions independently, like netCDF4 variables). + String indicating how the data parameter handles fancy indexing + (with arrays). Two modes are supported: 'numpy' (fancy indexing + like numpy.ndarray objects) and 'orthogonal' (array indexing + accesses different dimensions independently, like netCDF4 + variables). Accessing data from a Variable always uses orthogonal + indexing, so `indexing_mode` tells the variable whether index + lookups need to be internally converted to numpy-style indexing. 
""" if isinstance(dims, basestring): dims = [dims] @@ -389,55 +392,93 @@ def aggregate(self, func, new_dim_name, group_by, **kwargs): return unique, stacked.transpose(*ordered_dims) @classmethod - def from_stack(cls, variables, new_dim_name='stacked_dimension', + def from_stack(cls, variables, dimension='stacked_dimension', length=None): - """Stack variables along a new dimension to form a new variable + """Stack variables along a new or existing dimension to form a new + variable Parameters ---------- variables : iterable of Variable - Variables to stack together. - new_dim_name : str, optional - Name of the new dimension. + Variables to stack together. Each variable is expected to have + matching dimensions and shape except for along the stacked + dimension. + dimension : str, optional + Name of the dimension to stack along. This can either be a new + dimension name, in which case it is added along axis=0, or an + existing dimension name, in which case the location of the + dimension is unchanged. Where to insert the new dimension is + determined by the first variable. length : int, optional Length of the new dimension. This is used to allocate the new data array for the stacked variable data before iterating over all - items, which can be more memory efficient. + items, which is thus more memory efficient and a bit faster. Returns ------- stacked : Variable Stacked variable formed by stacking all the supplied variables - along the new dimension. The new dimension will be the first - dimension in the stacked variable. + along the new dimension. """ if length is None: - # so much for lazy evaluation! we need to figure out how many - # variables there are + # so much for lazy evaluation! we need to look at all the variables + # to figure out the dimensions of the stacked variable variables = list(variables) - length = len(variables) - - i = -1 - for i, var in enumerate(variables): + length = 0 + for var in variables: + if dimension in var.dimensions: + axis = var.dimensions.index(dimension) + length += var.shape[axis] + else: + length += 1 + + # manually keep track of progress along + i = 0 + for var in variables: if i == 0: - new_data = np.empty((length,) + var.shape, dtype=var.dtype) - old_dimensions = var.dimensions - attributes = OrderedDict(var.attributes) + # initialize the stacked variable with empty data + if dimension not in var.dimensions: + shape = (length,) + var.shape + dims = (dimension,) + var.dimensions + else: + shape = tuple(length if d == dimension else s + for d, s in zip(var.dimensions, var.shape)) + dims = var.dimensions + stacked = cls(dims, np.empty(shape, dtype=var.dtype), + var.attributes) + # required dimensions (including order) if we have any N - 1 + # dimensional variables + alt_dims = tuple(d for d in dims if d != dimension) + + if dimension in var.dimensions: + # transpose requires that the dimensions are equivalent + var = var.transpose(*stacked.dimensions) + axis = var.dimensions.index(dimension) + step = var.shape[axis] + elif var.dimensions == alt_dims: + step = 1 else: - if i == length: - raise ValueError('too many stack variables; supplied ' - 'length was %s' % length) - if var.dimensions != old_dimensions: - raise ValueError('inconsistent dimensions between merge ' - 'variables') - utils.remove_incompatible_items(attributes, var.attributes) - new_data[i] = var.data - - if i + 1 != length: - raise ValueError('only %s stack variables; supplied length ' - 'was %s' % (i + 1, length)) - - return cls((new_dim_name,) + old_dimensions, new_data, attributes) + 
raise ValueError('inconsistent dimensions') + + if i + step > length: + raise ValueError('actual length of stacked variables along %s ' + 'is greater than expected length %s' + % (dimension, length)) + + indexer = tuple((slice(i, i + step) if step > 1 else i) + if d == dimension else slice(None) + for d in stacked.dimensions) + # by-pass variable indexing for possible speedup + stacked.data[indexer] = var.data + utils.remove_incompatible_items(stacked.attributes, var.attributes) + i += step + + if i != length: + raise ValueError('actual length of stacked variables along %s is ' + '%s but expected length was %s' + % (dimension, i, length)) + + return stacked def __array_wrap__(self, result): return type(self)(self.dimensions, result, self.attributes) diff --git a/test/test_dataview.py b/test/test_dataview.py index 2b974464c55..fa5f2483dbd 100644 --- a/test/test_dataview.py +++ b/test/test_dataview.py @@ -191,6 +191,10 @@ def test_from_stack(self): np.array([foo.data, bar.data])), DataView.from_stack([foo.variable, bar.variable], 'w')) + # from iteration: + stacked = DataView.from_stack((v for _, v in foo.iterator('x')), + self.ds['x']) + self.assertViewEqual(foo, stacked) def test_intersection(self): self.ds.set_variable('x', Variable(['x'], np.array(list('abcdefghij')))) diff --git a/test/test_variable.py b/test/test_variable.py index ac5a022a416..ebb6b6bdd4f 100644 --- a/test/test_variable.py +++ b/test/test_variable.py @@ -229,9 +229,18 @@ def test_from_stack(self): Variable.from_stack((v, w), 'b')) self.assertVarEqual(Variable(['b', 'a'], np.array([x, y])), Variable.from_stack((v, w), 'b', length=2)) - with self.assertRaisesRegexp(ValueError, 'too many'): + with self.assertRaisesRegexp(ValueError, 'greater than expected'): Variable.from_stack([v, w], 'b', length=1) - with self.assertRaisesRegexp(ValueError, r'only \d+ stack'): + with self.assertRaisesRegexp(ValueError, 'but expected length was'): Variable.from_stack([v, w, w], 'b', length=4) with self.assertRaisesRegexp(ValueError, 'inconsistent dimensions'): Variable.from_stack([v, Variable(['c'], y)], 'b') + # test concatenating along a dimension + v = Variable(['time', 'x'], np.random.random((10, 8))) + self.assertVarEqual(v, Variable.from_stack([v[:5], v[5:]], 'time')) + self.assertVarEqual(v, Variable.from_stack([v[:5], v[5], v[6:]], 'time')) + self.assertVarEqual(v, Variable.from_stack([v[0], v[1:]], 'time')) + # test dimension order + self.assertVarEqual(v, Variable.from_stack([v[:, :5], v[:, 5:]], 'x')) + self.assertVarEqual(v.transpose(), + Variable.from_stack([v[:, 0], v[:, 1:]], 'x')) From f683b0757540c03f378f2b239eef02eaf7b4d347 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Sun, 9 Feb 2014 16:06:47 -0800 Subject: [PATCH 26/45] Added variable.T as shortcut for variable.transpose() --- src/scidata/common.py | 4 ++++ test/test_variable.py | 1 + 2 files changed, 5 insertions(+) diff --git a/src/scidata/common.py b/src/scidata/common.py index d0ae0a2e639..592d368b8b8 100644 --- a/src/scidata/common.py +++ b/src/scidata/common.py @@ -45,6 +45,10 @@ def __array__(self, dtype=None): # data = self.data # return dict(typestr=data.dtype.str, shape=data.shape, data=data) + @property + def T(self): + return self.transpose() + _collapse_method_docstring = \ """Collapse this {cls}'s data' by applying `{name}` along some dimension(s) diff --git a/test/test_variable.py b/test/test_variable.py index ebb6b6bdd4f..e00c5d6efdb 100644 --- a/test/test_variable.py +++ b/test/test_variable.py @@ -83,6 +83,7 @@ def test_transpose(self): v = 
Variable(['time', 'x'], self.d) v2 = Variable(['x', 'time'], self.d.T) self.assertVarEqual(v, v2.transpose()) + self.assertVarEqual(v.transpose(), v.T) x = np.random.randn(2, 3, 4, 5) w = Variable(['a', 'b', 'c', 'd'], x) w2 = Variable(['d', 'b', 'c', 'a'], np.einsum('abcd->dbca', x)) From 49edf3dea9b514a4fa24164476ce3548c02ff8d7 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Sun, 9 Feb 2014 21:07:31 -0500 Subject: [PATCH 27/45] Added Variable.apply and DataView.apply --- src/scidata/common.py | 11 ----------- src/scidata/dataview.py | 6 ++++++ src/scidata/utils.py | 1 - src/scidata/variable.py | 7 +++++++ test/test_variable.py | 8 ++++++++ 5 files changed, 21 insertions(+), 12 deletions(-) diff --git a/src/scidata/common.py b/src/scidata/common.py index 592d368b8b8..11133daa5da 100644 --- a/src/scidata/common.py +++ b/src/scidata/common.py @@ -89,14 +89,3 @@ def func(self, dimension=None, axis=None, **kwargs): name=('' if module is None else module + '.') + name, cls=cls.__name__) return func - - # we want something like this, right? - # def apply(self, func, dimension=None, axis=None, **kwargs): - # if dimension is not None and axis is not None: - # raise ValueError("cannot supply both 'axis' and 'dimension' " - # "arguments") - # if axis is None: - # axis = self.dimensions.index(dimension) - # f = self._unary_op(partial(func, axis=axis, **kwargs)) - # return f(self) - diff --git a/src/scidata/dataview.py b/src/scidata/dataview.py index 4854b648639..c01623cd1b4 100644 --- a/src/scidata/dataview.py +++ b/src/scidata/dataview.py @@ -377,6 +377,12 @@ def from_stack(cls, dataviews, dimension='stacked_dimension'): ds[focus] = variable.Variable.from_stack(dataviews, dim_name) return cls(ds, focus) + def apply(self, func, *args, **kwargs): + """Apply `func` with *args and **kwargs to this dataview's data and + return the result as a new dataview + """ + return self.refocus(self.variable.apply(func, *args, **kwargs)) + def to_dataframe(self): """Convert this dataview into a pandas.DataFrame diff --git a/src/scidata/utils.py b/src/scidata/utils.py index f9334321c95..79439c7538e 100644 --- a/src/scidata/utils.py +++ b/src/scidata/utils.py @@ -61,7 +61,6 @@ def expand_array(k, length): key[n] = array_indexers[i] return tuple(key) - def remap_loc_indexers(indices, indexers): """Given mappings of indices and label based indexers, return equivalent location based indexers diff --git a/src/scidata/variable.py b/src/scidata/variable.py index f5c48d8384c..0bb5754f11e 100644 --- a/src/scidata/variable.py +++ b/src/scidata/variable.py @@ -480,6 +480,13 @@ def from_stack(cls, variables, dimension='stacked_dimension', return stacked + def apply(self, func, *args, **kwargs): + """Apply `func` with *args and **kwargs to this variable's data and + return the result as a new variable with the same dimensions + """ + data = np.asarray(func(self.data, *args, **kwargs)) + return type(self)(self.dimensions, data, self.attributes) + def __array_wrap__(self, result): return type(self)(self.dimensions, result, self.attributes) diff --git a/test/test_variable.py b/test/test_variable.py index e00c5d6efdb..120160f4439 100644 --- a/test/test_variable.py +++ b/test/test_variable.py @@ -185,6 +185,14 @@ def test_array_interface(self): # test ufuncs self.assertVarEqual(np.sin(v), Variable(['x'], np.sin(x))) + def test_apply(self): + x = np.arange(5) + v = Variable(['x'], x) + def numpy_only_square(x): + return np.asarray(x) ** 2 + self.assertArrayEqual(x ** 2, numpy_only_square(v)) + self.assertVarEqual(v ** 2, 
v.apply(numpy_only_square)) def test_collapse(self): v = Variable(['time', 'x'], self.d) # intentionally test with an operation for which order matters From 1e2c47a90320610f9fbd3917acf4eea31c254462 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Sun, 9 Feb 2014 22:01:02 -0500 Subject: [PATCH 28/45] Revised and extended new README --- README.md | 37 ++++++++++++++++++++++++++++++------- 1 file changed, 30 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 96f6e3c217a..f2fbdc85385 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ used for self-describing scientific data (netCDF, OpenDAP, etc.). - Array broadcasting based on dimension names and coordinate indices instead of only shapes. - Aggregate variables across dimensions or grouped by other variables. - - Fast label-based indexing and time-series functionality built on + - Fast label-based indexing and (limited) time-series functionality built on [pandas][pandas]. ## Design Goals @@ -31,15 +31,38 @@ used for self-describing scientific data (netCDF, OpenDAP, etc.). ## Prior Art - - [Iris][iris] is an awesome package for working with meteorological data with - unfortunately complex data-structures and strict enforcement of metadata - conventions. Scidata's `DataView` is largely based on the Iris `Cube`. - - [netCDF4-python][nc4] provides scidata's primary interface for working with - netCDF and OpenDAP datasets. + - [Iris][iris] (supported by the UK Met office) is a similar package + designed for working with geophysical datasets in Python. Iris provided + much of the inspiration for scidata (e.g., scidata's `DataView` is largely + based on the Iris `Cube`), but it has several limitations that led us to + build scidata instead of extending Iris: + 1. Iris has essentially one first-class object (the `Cube`) on which it + attempts to build all functionality (`Coord` supports a much more + limited set of functionality). scidata has its equivalent of the Cube + (the `DataView` object), but it is only a thin wrapper on the more + primitive building blocks of Dataset and Variable objects. + 2. Iris has a strict interpretation of [CF conventions][cf], which, + although a principled choice, we have found to be impractical for + everyday uses. With Iris, every quantity has physical (SI) units, all + coordinates have cell-bounds, and all metadata (units, cell-bounds and + other attributes) is required to match before merging or doing + operations on multiple cubes. This means that a lot of time with + Iris is spent figuring out why cubes are incompatible and explicitly + removing possibly conflicting metadata. + 3. Iris can be slow and complex. Strictly interpreting metadata requires + a lot of work, and (in our experience) it can be difficult to build mental + models of how Iris functions work. Moreover, it means that a lot of + logic (e.g., constraint handling) uses non-vectorized operations. For + example, extracting all times within a range can be surprisingly slow + (e.g., 0.3 seconds vs 3 milliseconds in scidata to select along a time + dimension with 10000 elements). - [pandas][pandas] is fast and powerful but oriented around working with tabular datasets. pandas has experimental N-dimensional panels, but they don't support aligned math with other objects. We believe the `DataView`/ - `Cube` model is better suited to working with scientific datasets. + `Cube` model is better suited to working with scientific datasets. We use + pandas internally in scidata to support fast indexing.
+ - [netCDF4-python][nc4] provides scidata's primary interface for working with + netCDF and OpenDAP datasets. [pandas]: http://pandas.pydata.org/ [cdm]: http://www.unidata.ucar.edu/software/thredds/current/netcdf-java/CDM/ From 4cd1361370ed1112ac2436676d2c6074c515ec80 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Sun, 9 Feb 2014 23:25:50 -0500 Subject: [PATCH 29/45] Renamed "Variable" -> "Array" and "DataView" -> "DatasetArray" I think these names are much more straightforward. The only annoying aspect is that "array" is the name of a built-in module, which conflicts with naming the module that "Array" is in "array". --- README.md | 16 +- src/scidata/__init__.py | 14 +- src/scidata/{variable.py => array_.py} | 73 ++++---- src/scidata/backends.py | 8 +- src/scidata/common.py | 2 +- src/scidata/dataset.py | 84 ++++----- src/scidata/{dataview.py => dataset_array.py} | 154 ++++++++-------- src/scidata/utils.py | 1 + test/__init__.py | 2 +- test/{test_variable.py => test_array.py} | 174 +++++++++--------- test/test_dataset.py | 34 ++-- ...test_dataview.py => test_dataset_array.py} | 58 +++--- test/test_utils.py | 20 +- 13 files changed, 322 insertions(+), 318 deletions(-) rename src/scidata/{variable.py => array_.py} (92%) rename src/scidata/{dataview.py => dataset_array.py} (74%) rename test/{test_variable.py => test_array.py} (56%) rename test/{test_dataview.py => test_dataset_array.py} (79%) diff --git a/README.md b/README.md index f2fbdc85385..7f2b04b86db 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ used for self-describing scientific data (netCDF, OpenDAP, etc.). ## Main Features - - A `DataView` object that is compatible with NumPy's ndarray and ufuncs + - A `DatasetArray` object that is compatible with NumPy's ndarray and ufuncs but keeps ancillary variables and metadata intact. - Array broadcasting based on dimension names and coordinate indices instead of only shapes. @@ -33,13 +33,13 @@ used for self-describing scientific data (netCDF, OpenDAP, etc.). - [Iris][iris] (supported by the UK Met office) is a similar package designed for working with geophysical datasets in Python. Iris provided - much of the inspiration for scidata (e.g., scidata's `DataView` is largely - based on the Iris `Cube`), but it has several limitations that led us to - build scidata instead of extending Iris: + much of the inspiration for scidata (e.g., scidata's `DatasetArray` is + largely based on the Iris `Cube`), but it has several limitations that led + us to build scidata instead of extending Iris: 1. Iris has essentially one first-class object (the `Cube`) on which it attempts to build all functionality (`Coord` supports a much more limited set of functionality). scidata has its equivalent of the Cube - (the `DataView` object), but it is only a thin wrapper on the more + (the `DatasetArray` object), but it is only a thin wrapper on the more primitive building blocks of Dataset and Variable objects. 2. Iris has a strict interpretation of [CF conventions][cf], which, although a principled choice, we have found to be impractical for @@ -58,9 +58,9 @@ used for self-describing scientific data (netCDF, OpenDAP, etc.). dimension with 10000 elements). - [pandas][pandas] is fast and powerful but oriented around working with tabular datasets. pandas has experimental N-dimensional panels, but they - don't support aligned math with other objects. We believe the `DataView`/ - `Cube` model is better suited to working with scientific datasets. We use - pandas internally in scidata to support fast indexing. + don't support aligned math with other objects. We believe the + `DatasetArray`/ `Cube` model is better suited to working with scientific + datasets. We use pandas internally in scidata to support fast indexing. - [netCDF4-python][nc4] provides scidata's primary interface for working with netCDF and OpenDAP datasets. diff --git a/src/scidata/__init__.py b/src/scidata/__init__.py index 4a4789b7e3c..52bd1d7ae08 100644 --- a/src/scidata/__init__.py +++ b/src/scidata/__init__.py @@ -1,12 +1,12 @@ -from dataset import Dataset, open_dataset -from dataview import DataView, intersection -from utils import orthogonal_indexer, num2datetimeindex, variable_equal -from variable import Variable, broadcast_variables +from .array_ import Array, broadcast_variables +from .dataset import Dataset, open_dataset +from .dataset_array import DatasetArray, intersection +from .utils import orthogonal_indexer, num2datetimeindex, variable_equal -import backends +from . import backends -concat = DataView.from_stack +concat = DatasetArray.from_stack -__all__ = ['open_dataset', 'Dataset', 'DataView', 'Variable', 'intersection', +__all__ = ['open_dataset', 'Dataset', 'DatasetArray', 'Array', 'intersection', 'broadcast_variables', 'orthogonal_indexer', 'num2datetimeindex', 'variable_equal'] diff --git a/src/scidata/variable.py b/src/scidata/array_.py similarity index 92% rename from src/scidata/variable.py rename to src/scidata/array_.py index 0bb5754f11e..3a13208089a 100644 --- a/src/scidata/variable.py +++ b/src/scidata/array_.py @@ -6,10 +6,10 @@ import conventions import dataset -import dataview +import dataset_array import ops import utils -from common import _DataWrapperMixin +from common import AbstractArray def _as_compatible_data(data): @@ -24,6 +24,9 @@ def _as_compatible_data(data): 'the necessary attributes for direct use' % type(data).__name__, RuntimeWarning, stacklevel=3) data = np.asarray(data) + elif isinstance(data, AbstractArray): + # we don't want nested Array objects + data = np.asarray(data) return data @@ -50,10 +53,10 @@ def unique_value_groups(ar): return values, groups -class Variable(_DataWrapperMixin): +class Array(AbstractArray): """ A netcdf-like variable consisting of dimensions, data and attributes - which describe a single Variable. A single variable object is not + which describe a single Array. A single variable object is not fully described outside the context of its parent Dataset. """ def __init__(self, dims, data, attributes=None, indexing_mode='numpy'): @@ -74,7 +77,7 @@ def __init__(self, dims, data, attributes=None, indexing_mode='numpy'): (with arrays). Two modes are supported: 'numpy' (fancy indexing like numpy.ndarray objects) and 'orthogonal' (array indexing accesses different dimensions independently, like netCDF4 - variables). Accessing data from a Variable always uses orthogonal + variables). Accessing data from an Array always uses orthogonal indexing, so `indexing_mode` tells the variable whether index lookups need to be internally converted to numpy-style indexing.
""" @@ -103,7 +106,7 @@ def data(self): def data(self, value): value = np.asarray(value) if value.shape != self.shape: - raise ValueError("replacement data must match the Variable's " + raise ValueError("replacement data must match the Array's " "shape") self._data = value @@ -128,7 +131,7 @@ def _remap_indexer(self, key): return key def __getitem__(self, key): - """Return a new Variable object whose contents are consistent with + """Return a new Array object whose contents are consistent with getting the provided key from the underlying data NB. __getitem__ and __setitem__ implement "orthogonal indexing" like @@ -180,7 +183,7 @@ def _copy(self, deepcopy=False): data = np.array(self._data) if deepcopy else self.data # note: # dimensions is already an immutable tuple - # attributes will be copied when the new Variable is created + # attributes will be copied when the new Array is created return type(self)(self.dimensions, data, self.attributes) def __copy__(self): @@ -232,8 +235,8 @@ def indexed_by(self, **indexers): Returns ------- - obj : Variable object - A new Variable with the selected data and dimensions. In general, + obj : Array object + A new Array with the selected data and dimensions. In general, the new variable's data will be a view of this variable's data, unless numpy fancy indexing was triggered by using an array indexer, in which case the data will be a copy. @@ -249,7 +252,7 @@ def indexed_by(self, **indexers): return self[tuple(key)] def transpose(self, *dimensions): - """Return a new Variable object with transposed dimensions + """Return a new Array object with transposed dimensions Note: Although this operation returns a view of this variable's data, it is not lazy -- the data will be fully loaded. @@ -262,7 +265,7 @@ def transpose(self, *dimensions): Returns ------- - transposed : Variable + transposed : Array The returned object has transposed data and dimensions with the same attributes as the original. @@ -303,8 +306,8 @@ def collapse(self, func, dimension=None, axis=None, **kwargs): Returns ------- - collapsed : Variable - Variable with summarized data and the indicated dimension(s) + collapsed : Array + Array with summarized data and the indicated dimension(s) removed. """ if dimension is not None and axis is not None: @@ -358,18 +361,18 @@ def aggregate(self, func, new_dim_name, group_by, **kwargs): integer valued axis. new_dim_name : str or sequence of str, optional Name of the new dimension to create. - group_by : Variable + group_by : Array 1D variable which contains the values by which to group. **kwargs : dict Additional keyword arguments passed on to `func`. Returns ------- - unique : Variable + unique : Array 1D variable of unique values in group, along the dimension given by `new_dim_name`. - aggregated : Variable - Variable with aggregated data and the original dimension from + aggregated : Array + Array with aggregated data and the original dimension from `group_by` replaced by `new_dim_name`. """ if group_by.ndim != 1: @@ -399,8 +402,8 @@ def from_stack(cls, variables, dimension='stacked_dimension', Parameters ---------- - variables : iterable of Variable - Variables to stack together. Each variable is expected to have + variables : iterable of Array + Arrays to stack together. Each variable is expected to have matching dimensions and shape except for along the stacked dimension. 
dimension : str, optional @@ -416,7 +419,7 @@ def from_stack(cls, variables, dimension='stacked_dimension', Returns ------- - stacked : Variable + stacked : Array Stacked variable formed by stacking all the supplied variables along the new dimension. """ @@ -502,7 +505,7 @@ def func(self, *args, **kwargs): def _binary_op(f, reflexive=False): @functools.wraps(f) def func(self, other): - if isinstance(other, dataview.DataView): + if isinstance(other, dataset_array.DatasetArray): return NotImplemented self_data, other_data, dims = _broadcast_variable_data(self, other) new_data = (f(self_data, other_data) @@ -530,24 +533,24 @@ def func(self, other): return self return func -ops.inject_special_operations(Variable) +ops.inject_special_operations(Array) def broadcast_variables(first, second): - """Given two variables, return two variables with matching dimensions and - numpy broadcast compatible data + """Given two arrays, return two arrays with matching dimensions and numpy + broadcast compatible data Parameters ---------- - first, second : Variable - Variable objects to broadcast. + first, second : Array + Array objects to broadcast. Returns ------- - first_broadcast, second_broadcast : Variable - Broadcast variables. The data on each variable will be a view of the - data on the corresponding original variables, but dimensions will be - reordered and inserted so that both broadcast variables have the same + first_broadcast, second_broadcast : Array + Broadcast arrays. The data on each variable will be a view of the + data on the corresponding original arrays, but dimensions will be + reordered and inserted so that both broadcast arrays have the same dimensions. The new dimensions are sorted in order of appearence in the first variable's dimensions followed by the second variable's dimensions. 
@@ -566,7 +569,7 @@ def broadcast_variables(first, second): 'has duplicate dimensions: %r' % list(dimensions)) - # build dimensions for new Variable + # build dimensions for new Array second_only_dims = [d for d in second.dimensions if d not in first.dimensions] dimensions = list(first.dimensions) + second_only_dims @@ -574,12 +577,12 @@ def broadcast_variables(first, second): # expand first_data's dimensions so it's broadcast compatible after # adding second's dimensions at the end first_data = first.data[(Ellipsis,) + (None,) * len(second_only_dims)] - new_first = Variable(dimensions, first_data) + new_first = Array(dimensions, first_data) # expand and reorder second_data so the dimensions line up first_only_dims = [d for d in dimensions if d not in second.dimensions] second_dims = list(second.dimensions) + first_only_dims second_data = second.data[(Ellipsis,) + (None,) * len(first_only_dims)] - new_second = Variable(second_dims, second_data).transpose(*dimensions) + new_second = Array(second_dims, second_data).transpose(*dimensions) return new_first, new_second @@ -587,7 +590,7 @@ def _broadcast_variable_data(self, other): if isinstance(other, dataset.Dataset): raise TypeError('datasets do not support mathematical operations') elif all(hasattr(other, attr) for attr in ['dimensions', 'data', 'shape']): - # `other` satisfies the Variable API + # `other` satisfies the scidata.Array API new_self, new_other = broadcast_variables(self, other) self_data = new_self.data other_data = new_other.data diff --git a/src/scidata/backends.py b/src/scidata/backends.py index ae004804520..85e1645a595 100644 --- a/src/scidata/backends.py +++ b/src/scidata/backends.py @@ -11,9 +11,9 @@ from scipy.io import netcdf from collections import OrderedDict -from utils import FrozenOrderedDict, Frozen -from variable import Variable +import array_ as array import conventions +from utils import FrozenOrderedDict, Frozen class AbstractDataStore(object): @@ -65,7 +65,7 @@ def sync(self): def convert_scipy_variable(var): - return Variable(var.dimensions, var.data, var._attributes) + return array.Array(var.dimensions, var.data, var._attributes) class ScipyDataStore(AbstractDataStore): @@ -160,7 +160,7 @@ def convert_nc4_variable(var): # netcdf file would now have been scaled twice! 
attr = OrderedDict((k, var.getncattr(k)) for k in var.ncattrs() if k not in ['scale_factor', 'add_offset']) - return Variable(var.dimensions, var, attr, indexing_mode='orthogonal') + return array.Array(var.dimensions, var, attr, indexing_mode='orthogonal') class NetCDF4DataStore(AbstractDataStore): diff --git a/src/scidata/common.py b/src/scidata/common.py index 11133daa5da..12c7a9e3690 100644 --- a/src/scidata/common.py +++ b/src/scidata/common.py @@ -1,5 +1,5 @@ -class _DataWrapperMixin(object): +class AbstractArray(object): @property def dtype(self): return self._data.dtype diff --git a/src/scidata/dataset.py b/src/scidata/dataset.py index fabe8261306..1058f9b73a5 100644 --- a/src/scidata/dataset.py +++ b/src/scidata/dataset.py @@ -7,10 +7,12 @@ from cStringIO import StringIO from collections import OrderedDict, Mapping, MutableMapping -from dataview import DataView +import array_ as array +import backends +import conventions +import utils +from dataset_array import DatasetArray from utils import FrozenOrderedDict, Frozen, remap_loc_indexers -from variable import Variable, broadcast_variables -import backends, conventions, utils date2num = nc4.date2num num2date = nc4.num2date @@ -23,7 +25,7 @@ def construct_dimensions(variables): Parameters ---------- variables : mapping - Mapping from variable names to Variable objects. + Mapping from variable names to Array objects. Returns ------- @@ -58,7 +60,7 @@ def check_dims_and_vars_consistency(dimensions, variables): dimensions : mapping Mapping from dimension names to lengths. variables : mapping - Mapping from variable names to Variable objects. + Mapping from variable names to Array objects. Raises ------ @@ -163,7 +165,7 @@ class Dataset(Mapping): Coordinates are simply variables that are also dimensions. They must all have dimension 1. noncoordinates : {name: variable, ...} - Variables that are not coordinates. + Arrays that are not coordinates. attributes : {key: value, ...} indices : {dimension: index, ...} Mapping from dimensions to pandas.Index objects. 
@@ -277,17 +279,17 @@ def _datetimeindices(self): def _get_virtual_variable(self, key): if key in self.indices: - return Variable([key], self.indices[key].values) + return array.Array([key], self.indices[key].values) split_key = key.split('.') if len(split_key) == 2: var, suffix = split_key if var in self._datetimeindices: if suffix in _DATETIMEINDEX_COMPONENTS: - return Variable([var], getattr(self.indices[var], suffix)) + return array.Array([var], getattr(self.indices[var], suffix)) elif suffix == 'season': # seasons = np.array(['DJF', 'MAM', 'JJA', 'SON']) month = self.indices[var].month - return Variable([var], (month // 3) % 4 + 1) + return array.Array([var], (month // 3) % 4 + 1) raise ValueError('virtual variable %r not found' % key) def _get_virtual_dataview(self, key): @@ -295,11 +297,11 @@ def _get_virtual_dataview(self, key): new_vars = OrderedDict(self.variables.items() + [(key, virtual_var)]) ds = type(self)(new_vars, self.dimensions, self.attributes, indices=self.indices.cache) - return DataView(ds, key) + return DatasetArray(ds, key) @property def virtual_variables(self): - """Variables that don't exist in this dataset but for which dataviews + """Arrays that don't exist in this dataset but for which dataviews could be created on demand (because they can be calculated from other dataset variables or dimensions) """ @@ -317,19 +319,19 @@ def __getitem__(self, key): raise KeyError('dataset contains no variable with name %r ' % key) else: - return DataView(self.select(key), key) + return DatasetArray(self.select(key), key) def __setitem__(self, key, value): # TODO: allow this operation to be destructive, overriding existing # variables? If so, we may want to implement __delitem__, too. - # (We would need to change DataView.__setitem__ in that case, because + # (We would need to change DatasetArray.__setitem__ in that case, because # we definitely don't want to override focus variables.) - if isinstance(value, DataView): + if isinstance(value, DatasetArray): self.merge(value.renamed(key).dataset, inplace=True) - elif isinstance(value, Variable): + elif isinstance(value, array.Array): self.set_variable(key, value) else: - raise TypeError('only DataViews and Variables can be added to ' + raise TypeError('only DatasetArrays and Arrays can be added to ' 'datasets via `__setitem__`') # mutable objects should not be hashable @@ -452,11 +454,11 @@ def create_variable(self, name, dims, data, attributes=None): Returns ------- - var : Variable + var : Array Reference to the newly created variable. """ # any error checking should be taken care of by add_variable - v = Variable(dims, np.asarray(data), attributes) + v = array.Array(dims, np.asarray(data), attributes) return self.add_variable(name, v) def create_coordinate(self, name, data, attributes=None): @@ -484,11 +486,11 @@ def create_coordinate(self, name, data, attributes=None): Returns ------- - var : Variable + var : Array Reference to the newly created coordinate variable. """ # any error checking should be taken care of by add_coordinate - v = Variable((name,), np.asarray(data), attributes) + v = array.Array((name,), np.asarray(data), attributes) return self.add_coordinate(v) def add_dimension(self, name, length): @@ -516,17 +518,17 @@ def add_variable(self, name, var): ---------- name : string The name under which the variable will be added. - variable : Variable + variable : Array The variable to be added. If the desired action is to add a copy of the variable be sure to do so before passing it to this function. 
Returns ------- - variable - The variable object in the underlying datastore. + Array + An Array object attached to the underlying datastore. """ if name in self.variables: - raise ValueError("Variable named %r already exists" % name) + raise ValueError("Array named %r already exists" % name) return self.set_variable(name, var) def add_coordinate(self, var): @@ -534,14 +536,14 @@ def add_coordinate(self, var): Parameters ---------- - variable : Variable + variable : Array The coordinate variable to be added. Coordinate variables must be 1D, and will be added under the same name as their sole dimension. Returns ------- variable - The variable object in the underlying datastore. + An Array object attached to the underlying datastore. """ # We need to be cleanly roll back the effects of # create_dimension if create_variable fails, otherwise we will @@ -567,14 +569,14 @@ def set_variable(self, name, var): ---------- name : string The name under which the variable will be added. - variable : Variable + variable : Array The variable to be added. If the desired action is to add a copy of the variable be sure to do so before passing it to this function. Returns ------- variable - The variable object in the underlying datastore. + An Array object attached to the underlying datastore. """ # check old + new dimensions for consistency checks new_dims = OrderedDict() @@ -592,12 +594,12 @@ def set_variable(self, name, var): return new_var def indexed_by(self, **indexers): - """Return a new dataset with each variable indexed along the specified + """Return a new dataset with each array indexed along the specified dimension(s) - This method selects values from each variable using its `__getitem__` + This method selects values from each array using its `__getitem__` method, except this method does not require knowing the order of - each variable's dimensions. + each array's dimensions. Parameters ---------- @@ -609,8 +611,8 @@ def indexed_by(self, **indexers): ------- obj : Dataset A new Dataset with the same contents as this dataset, except each - variable and dimension is indexed by the appropriate indexers. In - general, each variable's data will be a view of the variable's data + array and dimension is indexed by the appropriate indexers. In + general, each array's data will be a view of the array's data in this dataset, unless numpy fancy indexing was triggered by using an array indexer, in which case the data will be a copy. @@ -618,7 +620,7 @@ def indexed_by(self, **indexers): -------- Dataset.labeled_by Dataset.indexed_by - Variable.indexed_by + Array.indexed_by """ invalid = [k for k in indexers if not k in self.dimensions] if invalid: @@ -682,7 +684,7 @@ def labeled_by(self, **indexers): -------- Dataset.labeled_by Dataset.indexed_by - Variable.indexed_by + Array.indexed_by """ return self.indexed_by(**remap_loc_indexers(self.indices, indexers)) @@ -706,7 +708,7 @@ def renamed(self, name_dict): dims = tuple(name_dict.get(dim, dim) for dim in v.dimensions) #TODO: public interface for renaming a variable without loading # data - variables[name] = Variable(dims, v._data, v.attributes) + variables[name] = array.Array(dims, v._data, v.attributes) dimensions = OrderedDict((name_dict.get(k, k), v) for k, v in self.dimensions.iteritems()) @@ -718,7 +720,7 @@ def renamed(self, name_dict): def merge(self, other, inplace=False): """Merge two datasets into a single new dataset - This method generally not allow for overriding data. Variables, + This method generally not allow for overriding data. 
Arrays, dimensions and indices are checked for conflicts. However, conflicting attributes are removed. @@ -862,7 +864,7 @@ def replace(self, name, variable): ---------- name : str Name of the variable to replace in this object. - variable : Variable + variable : Array Replacement variable. Returns @@ -906,10 +908,10 @@ def to_dataframe(self): columns = self.noncoordinates.keys() data = [] # we need a template to broadcast all dataset variables against - template = Variable(self.dimensions.keys(), - np.empty(self.dimensions.values())) + template = array.Array(self.dimensions.keys(), + np.empty(self.dimensions.values())) for k in columns: - _, var = broadcast_variables(template, self[k]) + _, var = array.broadcast_variables(template, self[k]) _, var_data = np.broadcast_arrays(template.data, var.data) data.append(var_data.reshape(-1)) # note: pd.MultiIndex.from_product is new in pandas-0.13.1 diff --git a/src/scidata/dataview.py b/src/scidata/dataset_array.py similarity index 74% rename from src/scidata/dataview.py rename to src/scidata/dataset_array.py index c01623cd1b4..3d70ab48a4e 100644 --- a/src/scidata/dataview.py +++ b/src/scidata/dataset_array.py @@ -6,40 +6,39 @@ import numpy as np +import array_ import dataset import ops -import variable -from common import _DataWrapperMixin +from common import AbstractArray from utils import expanded_indexer, FrozenOrderedDict, remap_loc_indexers class _LocIndexer(object): - def __init__(self, dataview): - self.dataview = dataview + def __init__(self, array): + self.array = array def _remap_key(self, key): - indexers = remap_loc_indexers(self.dataview.indices, - self.dataview._key_to_indexers(key)) + indexers = remap_loc_indexers(self.array.indices, + self.array._key_to_indexers(key)) return tuple(indexers.values()) def __getitem__(self, key): - return self.dataview[self._remap_key(key)] + return self.array[self._remap_key(key)] def __setitem__(self, key, value): - self.dataview[self._remap_key(key)] = value + self.array[self._remap_key(key)] = value -class DataView(_DataWrapperMixin): - """ - A Dataset wrapper oriented around a single Variable +class DatasetArray(AbstractArray): + """A Dataset wrapper oriented around a single Array Dataviews are the primary way to do computations with Dataset variables. They are designed to make it easy to manipulate variables in the context of an intact Dataset object. Getting items from or doing mathematical - operations with a dataview returns another dataview. + operations with a dataset array returns another dataset array. - The design of dataviews is strongly inspired by the Iris Cube. However, - dataviews are much lighter weight than cubes. They are simply aligned, + The design of DatasetArray is strongly inspired by the Iris Cube. However, + dataset arrays are much lighter weight than cubes. They are simply aligned, labeled datasets and do not explicitly guarantee or rely on the CF model. """ def __init__(self, dataset, focus): @@ -49,8 +48,8 @@ def __init__(self, dataset, focus): dataset : scidata.Dataset The dataset on which to build this data view. focus : str - The name of the "focus variable" in dataset on which this view is - oriented. + The name of the "focus variable" in `dataset` on which this object + is oriented. 
""" if not focus in dataset: raise ValueError('focus %r is not a variable in dataset %r' @@ -65,14 +64,14 @@ def variable(self): def variable(self, value): self.dataset.set_variable(self.focus, value) - # _data is necessary for _DataWrapperMixin + # _data is necessary for AbstractArray @property def _data(self): return self.variable._data @property def data(self): - """The dataview's data as a numpy.ndarray""" + """The dataset array's data as a numpy.ndarray""" return self.variable.data @data.setter def data(self, value): @@ -88,7 +87,7 @@ def _key_to_indexers(self, key): def __getitem__(self, key): if isinstance(key, basestring): - # grab another dataview from the dataset + # grab another dataset array from the dataset return self.dataset[key] else: # orthogonal array indexing @@ -96,7 +95,7 @@ def __getitem__(self, key): def __setitem__(self, key, value): if isinstance(key, basestring): - # add a variable or dataview to the dataset + # add an array to the dataset self.dataset[key] = value else: # orthogonal array indexing @@ -130,7 +129,7 @@ def copy(self): def __copy__(self): # shallow copy the underlying dataset - return DataView(self.dataset.copy(), self.focus) + return DatasetArray(self.dataset.copy(), self.focus) # mutable objects should not be hashable __hash__ = None @@ -152,8 +151,8 @@ def __repr__(self): return '' % (type(self).__name__, self.focus, contents) def indexed_by(self, **indexers): - """Return a new dataview whose dataset is given by indexing along the - specified dimension(s) + """Return a new dataset array whose dataset is given by indexing along + the specified dimension(s) See Also -------- @@ -167,8 +166,8 @@ def indexed_by(self, **indexers): return type(self)(ds, self.focus) def labeled_by(self, **indexers): - """Return a new dataview whose dataset is given by selecting coordinate - labels along the specified dimension(s) + """Return a new dataset array whose dataset is given by selecting + coordinate labels along the specified dimension(s) See Also -------- @@ -177,20 +176,21 @@ def labeled_by(self, **indexers): return self.indexed_by(**remap_loc_indexers(self.indices, indexers)) def renamed(self, new_name): - """Returns a new DataView with this DataView's focus variable renamed + """Returns a new DatasetArray with this DatasetArray's focus variable + renamed """ renamed_dataset = self.dataset.renamed({self.focus: new_name}) return type(self)(renamed_dataset, new_name) def unselected(self): - """Returns a copy of this DataView's dataset with this DataView's - focus variable removed + """Returns a copy of this DatasetArray's dataset with this + DatasetArray's focus variable removed """ return self.dataset.unselect(self.focus) def refocus(self, new_var): - """Returns a copy of this DataView's dataset with this DataView's - focus variable replaced by `new_var` + """Returns a copy of this DatasetArray's dataset with this + DatasetArray's focus variable replaced by `new_var` If `new_var` is a dataview, its contents will be merged in. """ @@ -215,16 +215,16 @@ def iterator(self, dimension): ------- it : iterator The returned iterator yields pairs of scalar-valued coordinate - variables and DataView objects. + arrays and DatasetArray objects. 
""" for (x, ds) in self.dataset.iterator(dimension): yield (x, type(self)(ds, self.focus)) def transpose(self, *dimensions): - """Return a new DataView object with transposed dimensions + """Return a new DatasetArray object with transposed dimensions - Note: Although this operation returns a view of this dataview's - variable's data, it is not lazy -- the data will be fully loaded. + Note: Although this operation returns a view of this array's data, it + is not lazy -- the data will be fully loaded. Parameters ---------- @@ -234,18 +234,18 @@ def transpose(self, *dimensions): Returns ------- - transposed : DataView - The returned DataView's variable is transposed. + transposed : DatasetArray + The returned DatasetArray's variable is transposed. See Also -------- numpy.transpose - Variable.tranpose + Array.tranpose """ return self.refocus(self.variable.transpose(*dimensions)) def collapse(self, func, dimension=None, axis=None, **kwargs): - """Collapse this variable by applying `func` along some dimension(s) + """Collapse this array by applying `func` along some dimension(s) Parameters ---------- @@ -271,9 +271,9 @@ def collapse(self, func, dimension=None, axis=None, **kwargs): Returns ------- - collapsed : DataView - DataView with this dataview's variable replaced with a variable - with summarized data and the indicated dimension(s) removed. + collapsed : DatasetArray + DatasetArray with this object's array replaced with an array with + summarized data and the indicated dimension(s) removed. """ var = self.variable.collapse(func, dimension, axis, **kwargs) dropped_dims = set(self.dimensions) - set(var.dimensions) @@ -288,7 +288,7 @@ def collapse(self, func, dimension=None, axis=None, **kwargs): return type(self)(ds, self.focus) def aggregate(self, func, new_dim, **kwargs): - """Aggregate this dataview by applying `func` to grouped elements + """Aggregate this array by applying `func` to grouped elements Parameters ---------- @@ -296,18 +296,18 @@ def aggregate(self, func, new_dim, **kwargs): Function which can be called in the form `func(x, axis=axis, **kwargs)` to reduce an np.ndarray over an integer valued axis. - new_dim : str or DataView - Name of a variable in this dataview's dataset or DataView by which + new_dim : str or DatasetArray + Name of a variable in this array's dataset or DatasetArray by which to group variable elements. The dimension along which this variable - exists will be replaced by this name. The variable or dataview must - be one-dimensional. + exists will be replaced by this name. The array must be one- + dimensional. **kwargs : dict Additional keyword arguments passed on to `func`. Returns ------- - aggregated : DataView - DataView with aggregated data and the new dimension `new_dim`. + aggregated : DatasetArray + DatasetArray with aggregated data and the new dimension `new_dim`. """ if isinstance(new_dim, basestring): new_dim = self.dataset[new_dim] @@ -323,17 +323,17 @@ def aggregate(self, func, new_dim, **kwargs): return type(self)(ds, self.focus) @classmethod - def from_stack(cls, dataviews, dimension='stacked_dimension'): - """Stack dataviews along a new or existing dimension to form a new + def from_stack(cls, arrays, dimension='stacked_dimension'): + """Stack arrays along a new or existing dimension to form a new dataview Parameters ---------- - dataviews : iterable of DataView or Variable - Variables to stack together. Each variable is expected to have + arrays : iterable of DatasetArray or Array + Arrays to stack together. 
Each variable is expected to have matching dimensions and shape except for along the stacked dimension. - dimension : str or DataView, optional + dimension : str or DatasetArray, optional Name of the dimension to stack along. This can either be a new dimension name, in which case it is added along axis=0, or an existing dimension name, in which case the location of the @@ -342,13 +342,13 @@ def from_stack(cls, dataviews, dimension='stacked_dimension'): Returns ------- - stacked : DataView + stacked : DatasetArray Stacked dataview formed by stacking all the supplied variables along the new dimension. """ - dataviews = list(dataviews) - if not dataviews: - raise ValueError('DataView.from_stack was supplied with an ' + arrays = list(arrays) + if not arrays: + raise ValueError('DatasetArray.from_stack was supplied with an ' 'empty argument') # create an empty dataset in which to stack variables @@ -362,31 +362,31 @@ def from_stack(cls, dataviews, dimension='stacked_dimension'): # figure out metadata for each dataview focus = None - for view in dataviews: - if isinstance(view, cls): - ds.merge(view.unselected(), inplace=True) + for array in arrays: + if isinstance(array, cls): + ds.merge(array.unselected(), inplace=True) if focus is None: - focus = view.focus - elif focus != view.focus: - raise ValueError('DataView.from_stack requires that all ' + focus = array.focus + elif focus != array.focus: + raise ValueError('DatasetArray.from_stack requires that all ' 'stacked views have the same focus') if focus is None: focus = 'stacked_variable' # finally, merge in the stacked variables - ds[focus] = variable.Variable.from_stack(dataviews, dim_name) + ds[focus] = array_.Array.from_stack(arrays, dim_name) return cls(ds, focus) def apply(self, func, *args, **kwargs): - """Apply `func` with *args and **kwargs to this dataview's data and - return the result as a new dataview + """Apply `func` with *args and **kwargs to this array's data and + return the result as a new array """ return self.refocus(self.variable.apply(func, *args, **kwargs)) def to_dataframe(self): - """Convert this dataview into a pandas.DataFrame + """Convert this array into a pandas.DataFrame - Non-coordinate variables in this dataview's dataset (which include the + Non-coordinate variables in this array's dataset (which include the view's data) form the columns of the DataFrame. The DataFrame is be indexed by the Cartesian product of the dataset's indices. """ @@ -438,19 +438,17 @@ def func(self, other): return self return func -ops.inject_special_operations(DataView, priority=60) +ops.inject_special_operations(DatasetArray, priority=60) -def intersection(dataview1, dataview2): - """Given two dataview objects, returns two new dataviews where all indices - found on both dataviews are replaced by their intersection +def intersection(array1, array2): + """Given two dataset array objects, returns two new dataset arrays where + all indices found on both arrays are replaced by their intersection """ # TODO: automatically calculate the intersection when doing math with - # dataviews, or better yet calculate the union of the indices and fill in + # arrays, or better yet calculate the union of the indices and fill in # the mis-aligned data with NaN. 
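    # Illustrative usage sketch (comments only, not part of this patch),
    # mirroring test_intersection later in this series: given a DatasetArray
    # `dv` with ten 'x' labels,
    #     dv1, dv2 = intersection(dv, dv[:5])
    # restricts both results to the five shared 'x' labels, so that
    # expressions like `dv1 + dv2` are aligned.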
- overlapping_indices = {k: dataview1.indices[k] & dataview2.indices[k] - for k in dataview1.indices - if k in dataview2.indices} + overlapping_indices = {k: array1.indices[k] & array2.indices[k] + for k in array1.indices if k in array2.indices} return tuple(dv.labeled_by(**overlapping_indices) - for dv in [dataview1, dataview2]) - + for dv in [array1, array2]) diff --git a/src/scidata/utils.py b/src/scidata/utils.py index 79439c7538e..f9334321c95 100644 --- a/src/scidata/utils.py +++ b/src/scidata/utils.py @@ -61,6 +61,7 @@ def expand_array(k, length): key[n] = array_indexers[i] return tuple(key) + def remap_loc_indexers(indices, indexers): """Given mappings of indices and label based indexers, return equivalent location based indexers diff --git a/test/__init__.py b/test/__init__.py index 517306bf909..8ba27cd3329 100644 --- a/test/__init__.py +++ b/test/__init__.py @@ -12,7 +12,7 @@ def assertVarEqual(self, v1, v2): def assertVarNotEqual(self, v1, v2): self.assertFalse(utils.variable_equal(v1, v2)) - def assertArrayEqual(self, a1, a2): + def assertNDArrayEqual(self, a1, a2): assert_array_equal(a1, a2) diff --git a/test/test_variable.py b/test/test_array.py similarity index 56% rename from test/test_variable.py rename to test/test_array.py index 120160f4439..82c03d41b9e 100644 --- a/test/test_variable.py +++ b/test/test_array.py @@ -2,16 +2,16 @@ import numpy as np -from scidata import Variable, Dataset +from scidata import Array, Dataset from . import TestCase -class TestVariable(TestCase): +class TestArray(TestCase): def setUp(self): self.d = np.random.random((10, 3)).astype(np.float64) def test_data(self): - v = Variable(['time', 'x'], self.d) + v = Array(['time', 'x'], self.d) self.assertIs(v.data, self.d) with self.assertRaises(ValueError): # wrong size @@ -21,16 +21,16 @@ def test_data(self): self.assertIs(v.data, d2) with warnings.catch_warnings(record=True) as w: - v = Variable(['x'], range(5)) + v = Array(['x'], range(5)) self.assertIn("converting data to np.ndarray", str(w[-1].message)) self.assertIsInstance(v.data, np.ndarray) with warnings.catch_warnings(record=True) as w: # don't warn for numpy numbers - v = Variable([], np.float32(1)) + v = Array([], np.float32(1)) self.assertFalse(w) def test_properties(self): - v = Variable(['time', 'x'], self.d, {'foo': 'bar'}) + v = Array(['time', 'x'], self.d, {'foo': 'bar'}) self.assertEqual(v.dimensions, ('time', 'x')) self.assertEqual(v.dtype, float) self.assertEqual(v.shape, (10, 3)) @@ -40,22 +40,22 @@ def test_properties(self): self.assertEqual(v.attributes, {'foo': u'bar'}) def test_repr(self): - v = Variable(['time', 'x'], self.d) - self.assertEqual('', + v = Array(['time', 'x'], self.d) + self.assertEqual('', repr(v)) def test_items(self): data = np.random.random((10, 11)) - v = Variable(['x', 'y'], data) + v = Array(['x', 'y'], data) # test slicing self.assertVarEqual(v, v[:]) self.assertVarEqual(v, v[...]) - self.assertVarEqual(Variable(['y'], data[0]), v[0]) - self.assertVarEqual(Variable(['x'], data[:, 0]), v[:, 0]) - self.assertVarEqual(Variable(['x', 'y'], data[:3, :2]), v[:3, :2]) + self.assertVarEqual(Array(['y'], data[0]), v[0]) + self.assertVarEqual(Array(['x'], data[:, 0]), v[:, 0]) + self.assertVarEqual(Array(['x', 'y'], data[:3, :2]), v[:3, :2]) # test array indexing - x = Variable(['x'], np.arange(10)) - y = Variable(['y'], np.arange(11)) + x = Array(['x'], np.arange(10)) + y = Array(['y'], np.arange(11)) self.assertVarEqual(v, v[x.data]) self.assertVarEqual(v, v[x]) self.assertVarEqual(v[:3], v[x < 3]) @@ -65,13 
+65,13 @@ def test_items(self): self.assertVarEqual(v[:3, :2], v[range(3), range(2)]) # test iteration for n, item in enumerate(v): - self.assertVarEqual(Variable(['y'], data[n]), item) + self.assertVarEqual(Array(['y'], data[n]), item) # test setting v.data[:] = 0 self.assertTrue(np.all(v.data == 0)) def test_indexed_by(self): - v = Variable(['time', 'x'], self.d) + v = Array(['time', 'x'], self.d) self.assertVarEqual(v.indexed_by(time=slice(None)), v) self.assertVarEqual(v.indexed_by(time=0), v[0]) self.assertVarEqual(v.indexed_by(time=slice(0, 3)), v[:3]) @@ -80,85 +80,85 @@ def test_indexed_by(self): v.indexed_by(not_a_dim=0) def test_transpose(self): - v = Variable(['time', 'x'], self.d) - v2 = Variable(['x', 'time'], self.d.T) + v = Array(['time', 'x'], self.d) + v2 = Array(['x', 'time'], self.d.T) self.assertVarEqual(v, v2.transpose()) self.assertVarEqual(v.transpose(), v.T) x = np.random.randn(2, 3, 4, 5) - w = Variable(['a', 'b', 'c', 'd'], x) - w2 = Variable(['d', 'b', 'c', 'a'], np.einsum('abcd->dbca', x)) + w = Array(['a', 'b', 'c', 'd'], x) + w2 = Array(['d', 'b', 'c', 'a'], np.einsum('abcd->dbca', x)) self.assertEqual(w2.shape, (5, 3, 4, 2)) self.assertVarEqual(w2, w.transpose('d', 'b', 'c', 'a')) self.assertVarEqual(w, w2.transpose('a', 'b', 'c', 'd')) - w3 = Variable(['b', 'c', 'd', 'a'], np.einsum('abcd->bcda', x)) + w3 = Array(['b', 'c', 'd', 'a'], np.einsum('abcd->bcda', x)) self.assertVarEqual(w, w3.transpose('a', 'b', 'c', 'd')) def test_1d_math(self): x = np.arange(5) y = np.ones(5) - v = Variable(['x'], x) + v = Array(['x'], x) # unary ops self.assertVarEqual(v, +v) self.assertVarEqual(v, abs(v)) - self.assertArrayEqual((-v).data, -x) + self.assertNDArrayEqual((-v).data, -x) # bianry ops with numbers self.assertVarEqual(v, v + 0) self.assertVarEqual(v, 0 + v) self.assertVarEqual(v, v * 1) - self.assertArrayEqual((v > 2).data, x > 2) - self.assertArrayEqual((0 == v).data, 0 == x) - self.assertArrayEqual((v - 1).data, x - 1) - self.assertArrayEqual((1 - v).data, 1 - x) + self.assertNDArrayEqual((v > 2).data, x > 2) + self.assertNDArrayEqual((0 == v).data, 0 == x) + self.assertNDArrayEqual((v - 1).data, x - 1) + self.assertNDArrayEqual((1 - v).data, 1 - x) # binary ops with numpy arrays - self.assertArrayEqual((v * x).data, x ** 2) - self.assertArrayEqual((x * v).data, x ** 2) - self.assertArrayEqual(v - y, v - 1) - self.assertArrayEqual(y - v, 1 - v) + self.assertNDArrayEqual((v * x).data, x ** 2) + self.assertNDArrayEqual((x * v).data, x ** 2) + self.assertNDArrayEqual(v - y, v - 1) + self.assertNDArrayEqual(y - v, 1 - v) # verify attributes - v2 = Variable(['x'], x, {'units': 'meters'}) + v2 = Array(['x'], x, {'units': 'meters'}) self.assertVarEqual(v2, +v2) self.assertVarEqual(v2, 0 + v2) # binary ops with all variables - self.assertArrayEqual(v + v, 2 * v) - w = Variable(['x'], y, {'foo': 'bar'}) - self.assertVarEqual(v + w, Variable(['x'], x + y)) - self.assertArrayEqual((v * w).data, x * y) + self.assertNDArrayEqual(v + v, 2 * v) + w = Array(['x'], y, {'foo': 'bar'}) + self.assertVarEqual(v + w, Array(['x'], x + y)) + self.assertNDArrayEqual((v * w).data, x * y) # something complicated - self.assertArrayEqual((v ** 2 * w - 1 + x).data, x ** 2 * y - 1 + x) + self.assertNDArrayEqual((v ** 2 * w - 1 + x).data, x ** 2 * y - 1 + x) def test_broadcasting_math(self): x = np.random.randn(2, 3) - v = Variable(['a', 'b'], x) + v = Array(['a', 'b'], x) # 1d to 2d broadcasting self.assertVarEqual( v * v, - Variable(['a', 'b'], np.einsum('ab,ab->ab', x, x))) + Array(['a', 
'b'], np.einsum('ab,ab->ab', x, x))) self.assertVarEqual( v * v[0], - Variable(['a', 'b'], np.einsum('ab,b->ab', x, x[0]))) + Array(['a', 'b'], np.einsum('ab,b->ab', x, x[0]))) self.assertVarEqual( v[0] * v, - Variable(['b', 'a'], np.einsum('b,ab->ba', x[0], x))) + Array(['b', 'a'], np.einsum('b,ab->ba', x[0], x))) self.assertVarEqual( v[0] * v[:, 0], - Variable(['b', 'a'], np.einsum('b,a->ba', x[0], x[:, 0]))) + Array(['b', 'a'], np.einsum('b,a->ba', x[0], x[:, 0]))) # higher dim broadcasting y = np.random.randn(3, 4, 5) - w = Variable(['b', 'c', 'd'], y) + w = Array(['b', 'c', 'd'], y) self.assertVarEqual( - v * w, Variable(['a', 'b', 'c', 'd'], + v * w, Array(['a', 'b', 'c', 'd'], np.einsum('ab,bcd->abcd', x, y))) self.assertVarEqual( - w * v, Variable(['b', 'c', 'd', 'a'], + w * v, Array(['b', 'c', 'd', 'a'], np.einsum('bcd,ab->bcda', y, x))) self.assertVarEqual( - v * w[0], Variable(['a', 'b', 'c', 'd'], + v * w[0], Array(['a', 'b', 'c', 'd'], np.einsum('ab,cd->abcd', x, y[0]))) def test_broadcasting_failures(self): - a = Variable(['x'], np.arange(10)) - b = Variable(['x'], np.arange(5)) - c = Variable(['x', 'x'], np.arange(100).reshape(10, 10)) + a = Array(['x'], np.arange(10)) + b = Array(['x'], np.arange(5)) + c = Array(['x', 'x'], np.arange(100).reshape(10, 10)) with self.assertRaisesRegexp(ValueError, 'mismatched lengths'): a + b with self.assertRaisesRegexp(ValueError, 'duplicate dimensions'): @@ -166,90 +166,90 @@ def test_broadcasting_failures(self): def test_inplace_math(self): x = np.arange(5) - v = Variable(['x'], x) + v = Array(['x'], x) v2 = v v2 += 1 self.assertIs(v, v2) # since we provided an ndarray for data, it is also modified in-place self.assertIs(v.data, x) - self.assertArrayEqual(v.data, np.arange(5) + 1) + self.assertNDArrayEqual(v.data, np.arange(5) + 1) def test_array_interface(self): x = np.arange(5) - v = Variable(['x'], x) - self.assertArrayEqual(np.asarray(v), x) + v = Array(['x'], x) + self.assertNDArrayEqual(np.asarray(v), x) # test patched in methods - self.assertArrayEqual(v.take([2, 3]), x.take([2, 3])) + self.assertNDArrayEqual(v.take([2, 3]), x.take([2, 3])) self.assertVarEqual(v.argsort(), v) - self.assertVarEqual(v.clip(2, 3), Variable('x', x.clip(2, 3))) + self.assertVarEqual(v.clip(2, 3), Array('x', x.clip(2, 3))) # test ufuncs - self.assertVarEqual(np.sin(v), Variable(['x'], np.sin(x))) + self.assertVarEqual(np.sin(v), Array(['x'], np.sin(x))) def test_apply(self): x = np.arange(5) - v = Variable(['x'], x) + v = Array(['x'], x) def numpy_only_square(x): return np.asarray(x) ** 2 - self.assertArrayEqual(x ** 2, numpy_only_square(v)) + self.assertNDArrayEqual(x ** 2, numpy_only_square(v)) self.assertVarEqual(v ** 2, v.apply(numpy_only_square)) def test_collapse(self): - v = Variable(['time', 'x'], self.d) + v = Array(['time', 'x'], self.d) # intentionally test with an operation for which order matters self.assertVarEqual(v.collapse(np.std, 'time'), - Variable(['x'], self.d.std(axis=0), - {'cell_methods': 'time: std'})) + Array(['x'], self.d.std(axis=0), + {'cell_methods': 'time: std'})) self.assertVarEqual(v.collapse(np.std, axis=0), v.collapse(np.std, dimension='time')) self.assertVarEqual(v.collapse(np.std, ['x', 'time']), - Variable([], self.d.std(axis=1).std(axis=0), - {'cell_methods': 'x: std time: std'})) + Array([], self.d.std(axis=1).std(axis=0), + {'cell_methods': 'x: std time: std'})) self.assertVarEqual(v.collapse(np.std), - Variable([], self.d.std(), + Array([], self.d.std(), {'cell_methods': 'time: x: std'})) 
self.assertVarEqual(v.mean('time'), v.collapse(np.mean, 'time')) def test_aggregate(self): - agg_var = Variable(['y'], np.array(['a', 'a', 'b'])) - v = Variable(['x', 'y'], self.d) - expected_unique = Variable(['abc'], np.array(['a', 'b'])) - expected_aggregated = Variable(['x', 'abc'], + agg_var = Array(['y'], np.array(['a', 'a', 'b'])) + v = Array(['x', 'y'], self.d) + expected_unique = Array(['abc'], np.array(['a', 'b'])) + expected_aggregated = Array(['x', 'abc'], np.array([self.d[:, :2].sum(axis=1), self.d[:, 2:].sum(axis=1)]).T, {'cell_methods': 'y: sum'}) actual_unique, actual_aggregated = v.aggregate(np.sum, 'abc', agg_var) self.assertVarEqual(expected_unique, actual_unique) self.assertVarEqual(expected_aggregated, actual_aggregated) - # should be equivalent to aggregate by a dataview, too + # should be equivalent to aggregate by a dataset array, too alt_agg_var = Dataset({'abc': agg_var})['abc'] actual_unique, actual_aggregated = v.aggregate(np.sum, 'abc', - alt_agg_var) + alt_agg_var) self.assertVarEqual(expected_unique, actual_unique) self.assertVarEqual(expected_aggregated, actual_aggregated) def test_from_stack(self): x = np.arange(5) y = np.ones(5) - v = Variable(['a'], x) - w = Variable(['a'], y) - self.assertVarEqual(Variable(['b', 'a'], np.array([x, y])), - Variable.from_stack([v, w], 'b')) - self.assertVarEqual(Variable(['b', 'a'], np.array([x, y])), - Variable.from_stack((v, w), 'b')) - self.assertVarEqual(Variable(['b', 'a'], np.array([x, y])), - Variable.from_stack((v, w), 'b', length=2)) + v = Array(['a'], x) + w = Array(['a'], y) + self.assertVarEqual(Array(['b', 'a'], np.array([x, y])), + Array.from_stack([v, w], 'b')) + self.assertVarEqual(Array(['b', 'a'], np.array([x, y])), + Array.from_stack((v, w), 'b')) + self.assertVarEqual(Array(['b', 'a'], np.array([x, y])), + Array.from_stack((v, w), 'b', length=2)) with self.assertRaisesRegexp(ValueError, 'greater than expected'): - Variable.from_stack([v, w], 'b', length=1) + Array.from_stack([v, w], 'b', length=1) with self.assertRaisesRegexp(ValueError, 'but expected length was'): - Variable.from_stack([v, w, w], 'b', length=4) + Array.from_stack([v, w, w], 'b', length=4) with self.assertRaisesRegexp(ValueError, 'inconsistent dimensions'): - Variable.from_stack([v, Variable(['c'], y)], 'b') + Array.from_stack([v, Array(['c'], y)], 'b') # test concatenating along a dimension - v = Variable(['time', 'x'], np.random.random((10, 8))) - self.assertVarEqual(v, Variable.from_stack([v[:5], v[5:]], 'time')) - self.assertVarEqual(v, Variable.from_stack([v[:5], v[5], v[6:]], 'time')) - self.assertVarEqual(v, Variable.from_stack([v[0], v[1:]], 'time')) + v = Array(['time', 'x'], np.random.random((10, 8))) + self.assertVarEqual(v, Array.from_stack([v[:5], v[5:]], 'time')) + self.assertVarEqual(v, Array.from_stack([v[:5], v[5], v[6:]], 'time')) + self.assertVarEqual(v, Array.from_stack([v[0], v[1:]], 'time')) # test dimension order - self.assertVarEqual(v, Variable.from_stack([v[:, :5], v[:, 5:]], 'x')) + self.assertVarEqual(v, Array.from_stack([v[:, :5], v[:, 5:]], 'x')) self.assertVarEqual(v.transpose(), - Variable.from_stack([v[:, 0], v[:, 1:]], 'x')) + Array.from_stack([v[:, 0], v[:, 1:]], 'x')) diff --git a/test/test_dataset.py b/test/test_dataset.py index 8295046b17f..504f56f725b 100644 --- a/test/test_dataset.py +++ b/test/test_dataset.py @@ -8,7 +8,7 @@ import numpy as np import pandas as pd -from scidata import Dataset, DataView, Variable, backends, open_dataset +from scidata import Dataset, DatasetArray, Array, backends, 
open_dataset from . import TestCase @@ -46,9 +46,9 @@ def test_repr(self): '@dim2: 50, @dim3: 10): var1 var2 var3>', repr(data)) def test_init(self): - var1 = Variable('x', np.arange(100)) - var2 = Variable('x', np.arange(1000)) - var3 = Variable(['x', 'y'], np.arange(1000).reshape(100, 10)) + var1 = Array('x', np.arange(100)) + var2 = Array('x', np.arange(1000)) + var3 = Array(['x', 'y'], np.arange(1000).reshape(100, 10)) with self.assertRaisesRegexp(ValueError, 'already is saved with len'): Dataset({'a': var1, 'b': var2}) with self.assertRaisesRegexp(ValueError, 'must be defined with 1-d'): @@ -96,17 +96,17 @@ def test_variable(self): # try to add variable with dim (10,3) with data that's (3,10) self.assertRaises(ValueError, a.create_variable, name='qux', dims=('time', 'x'), data=d.T) - # Variable equality + # Array equality d = np.random.rand(10, 3) - v1 = Variable(('dim1','dim2'), data=d, + v1 = Array(('dim1','dim2'), data=d, attributes={'att1': 3, 'att2': [1,2,3]}) - v2 = Variable(('dim1','dim2'), data=d, + v2 = Array(('dim1','dim2'), data=d, attributes={'att1': 3, 'att2': [1,2,3]}) - v5 = Variable(('dim1','dim2'), data=d, + v5 = Array(('dim1','dim2'), data=d, attributes={'att1': 3, 'att2': [1,2,3]}) - v3 = Variable(('dim1','dim3'), data=d, + v3 = Array(('dim1','dim3'), data=d, attributes={'att1': 3, 'att2': [1,2,3]}) - v4 = Variable(('dim1','dim2'), data=d, + v4 = Array(('dim1','dim2'), data=d, attributes={'att1': 3, 'att2': [1,2,4]}) v5 = deepcopy(v1) v5.data[:] = np.random.rand(10,3) @@ -319,19 +319,19 @@ def test_getitem(self): data = create_test_data(self.get_store()) data.create_variable('time', ['time'], np.arange(1000, dtype=np.int32), {'units': 'days since 2000-01-01'}) - self.assertIsInstance(data['var1'], DataView) + self.assertIsInstance(data['var1'], DatasetArray) self.assertVarEqual(data['var1'], data.variables['var1']) self.assertItemsEqual(data['var1'].dataset.variables, {'var1', 'dim1', 'dim2'}) # access virtual variables self.assertVarEqual(data['time.dayofyear'][:300], - Variable('time', 1 + np.arange(300))) - self.assertArrayEqual(data['time.month'].data, - data.indices['time'].month) + Array('time', 1 + np.arange(300))) + self.assertNDArrayEqual(data['time.month'].data, + data.indices['time'].month) def test_setitem(self): # assign a variable - var = Variable(['dim1'], np.random.randn(100)) + var = Array(['dim1'], np.random.randn(100)) data1 = create_test_data(self.get_store()) data1.set_variable('A', var) data2 = data1.copy() @@ -343,7 +343,7 @@ def test_setitem(self): data2['B'] = dv self.assertEqual(data1, data2) # assign an array - with self.assertRaisesRegexp(TypeError, 'DataViews and Variables'): + with self.assertRaisesRegexp(TypeError, 'DatasetArrays and Arrays'): data2['C'] = var.data def test_write_store(self): @@ -356,7 +356,7 @@ def test_write_store(self): def test_to_dataframe(self): x = np.random.randn(10) y = np.random.randn(10) - ds = Dataset({'a': Variable('t', x), 'b': Variable('t', y)}) + ds = Dataset({'a': Array('t', x), 'b': Array('t', y)}) expected = pd.DataFrame(np.array([x, y]).T, columns=['a', 'b'], index=pd.Index(np.arange(10), name='t')) actual = ds.to_dataframe() diff --git a/test/test_dataview.py b/test/test_dataset_array.py similarity index 79% rename from test/test_dataview.py rename to test/test_dataset_array.py index fa5f2483dbd..e53a1d1efcf 100644 --- a/test/test_dataview.py +++ b/test/test_dataset_array.py @@ -1,27 +1,27 @@ import numpy as np -from scidata import Dataset, DataView, Variable, intersection +from scidata import 
Dataset, DatasetArray, Array, intersection from . import TestCase, ReturnItem -class TestDataView(TestCase): +class TestDatasetArray(TestCase): def assertViewEqual(self, dv1, dv2): self.assertEqual(dv1.dataset, dv2.dataset) self.assertEqual(dv1.focus, dv2.focus) def setUp(self): self.x = np.random.random((10, 20)) - self.v = Variable(['x', 'y'], self.x) + self.v = Array(['x', 'y'], self.x) self.ds = Dataset({'foo': self.v}) self.ds.create_coordinate('x', np.arange(10)) self.ds.create_coordinate('y', np.arange(20)) - self.dv = DataView(self.ds, 'foo') + self.dv = DatasetArray(self.ds, 'foo') def test_properties(self): self.assertIs(self.dv.dataset, self.ds) self.assertEqual(self.dv.focus, 'foo') self.assertVarEqual(self.dv.variable, self.v) - self.assertArrayEqual(self.dv.data, self.v.data) + self.assertNDArrayEqual(self.dv.data, self.v.data) for attr in ['dimensions', 'dtype', 'shape', 'size', 'ndim', 'attributes']: self.assertEqual(getattr(self.dv, attr), getattr(self.v, attr)) @@ -29,15 +29,15 @@ def test_properties(self): self.assertVarEqual(self.dv, self.v) self.assertEqual(list(self.dv.indices), list(self.ds.indices)) for k, v in self.dv.indices.iteritems(): - self.assertArrayEqual(v, self.ds.indices[k]) + self.assertNDArrayEqual(v, self.ds.indices[k]) def test_items(self): # strings pull out dataviews self.assertViewEqual(self.dv, self.ds['foo']) x = self.dv['x'] y = self.dv['y'] - self.assertViewEqual(DataView(self.ds.select('x'), 'x'), x) - self.assertViewEqual(DataView(self.ds.select('y'), 'y'), y) + self.assertViewEqual(DatasetArray(self.ds.select('x'), 'x'), x) + self.assertViewEqual(DatasetArray(self.ds.select('y'), 'y'), y) # integer indexing I = ReturnItem() for i in [I[:], I[...], I[x.data], I[x.variable], I[x], I[x, y], @@ -59,7 +59,7 @@ def test_iteration(self): for ((act_x, act_dv), (exp_x, exp_ds)) in \ zip(self.dv.iterator('y'), self.ds.iterator('y')): self.assertVarEqual(exp_x, act_x) - self.assertViewEqual(DataView(exp_ds, 'foo'), act_dv) + self.assertViewEqual(DatasetArray(exp_ds, 'foo'), act_dv) for ((_, exp_dv), act_dv) in zip(self.dv.iterator('x'), self.dv): self.assertViewEqual(exp_dv, act_dv) @@ -71,13 +71,13 @@ def test_indexed_by(self): self.assertViewEqual(self.dv[:3], self.dv.indexed_by(x=slice(3))) def test_labeled_by(self): - self.ds.set_variable('x', Variable(['x'], np.array(list('abcdefghij')))) + self.ds.set_variable('x', Array(['x'], np.array(list('abcdefghij')))) self.assertViewEqual(self.dv, self.dv.labeled_by(x=slice(None))) self.assertViewEqual(self.dv[1], self.dv.labeled_by(x='b')) self.assertViewEqual(self.dv[:3], self.dv.labeled_by(x=slice('c'))) def test_loc(self): - self.ds.set_variable('x', Variable(['x'], np.array(list('abcdefghij')))) + self.ds.set_variable('x', Array(['x'], np.array(list('abcdefghij')))) self.assertViewEqual(self.dv[:3], self.dv.loc[:'c']) self.assertViewEqual(self.dv[1], self.dv.loc['b']) self.assertViewEqual(self.dv[:3], self.dv.loc[['a', 'b', 'c']]) @@ -100,9 +100,9 @@ def test_dataset_getitem(self): self.assertViewEqual(dv, self.dv) def test_array_interface(self): - self.assertArrayEqual(np.asarray(self.dv), self.x) + self.assertNDArrayEqual(np.asarray(self.dv), self.x) # test patched in methods - self.assertArrayEqual(self.dv.take([2, 3]), self.x.take([2, 3])) + self.assertNDArrayEqual(self.dv.take([2, 3]), self.x.take([2, 3])) self.assertViewEqual(self.dv.argsort(), self.dv.refocus(self.x.argsort())) self.assertViewEqual(self.dv.clip(2, 3), @@ -111,7 +111,7 @@ def test_array_interface(self): 
self.assertViewEqual(np.sin(self.dv), self.dv.refocus(np.sin(self.x))) self.assertViewEqual(self.dv, np.maximum(self.v, self.dv)) - self.ds['bar'] = Variable(['x', 'y'], np.zeros((10, 20))) + self.ds['bar'] = Array(['x', 'y'], np.zeros((10, 20))) self.assertViewEqual(self.dv, np.maximum(self.dv, self.ds['bar'])) def test_math(self): @@ -130,22 +130,22 @@ def test_math(self): self.assertViewEqual(a, a + 0 * a) self.assertViewEqual(a, 0 * a + a) # test different indices - ds2 = self.ds.replace('x', Variable(['x'], 3 + np.arange(10))) - b = DataView(ds2, 'foo') + ds2 = self.ds.replace('x', Array(['x'], 3 + np.arange(10))) + b = DatasetArray(ds2, 'foo') with self.assertRaisesRegexp(ValueError, 'not aligned'): a + b with self.assertRaisesRegexp(ValueError, 'not aligned'): b + a def test_item_math(self): - self.ds.set_variable('x', Variable(['x'], np.array(list('abcdefghij')))) + self.ds.set_variable('x', Array(['x'], np.array(list('abcdefghij')))) self.assertVarEqual(self.dv + self.dv[0, 0], self.dv + self.dv[0, 0].data) new_data = self.x[0][None, :] + self.x[:, 0][:, None] self.assertVarEqual(self.dv[:, 0] + self.dv[0], - Variable(['x', 'y'], new_data)) + Array(['x', 'y'], new_data)) self.assertVarEqual(self.dv[0] + self.dv[:, 0], - Variable(['y', 'x'], new_data.T)) + Array(['y', 'x'], new_data.T)) def test_inplace_math(self): x = self.x @@ -165,12 +165,12 @@ def test_collapse(self): # should check which extra dimensions are dropped def test_aggregate(self): - agg_var = Variable(['y'], np.array(['a'] * 9 + ['c'] + ['b'] * 7 + + agg_var = Array(['y'], np.array(['a'] * 9 + ['c'] + ['b'] * 7 + ['c'] * 3)) self.ds.add_variable('abc', agg_var) expected_unique, expected_var = \ self.dv.variable.aggregate(np.mean, 'abc', agg_var) - expected = DataView(Dataset( + expected = DatasetArray(Dataset( {'foo': expected_var, 'x': self.ds.variables['x'], 'abc': expected_unique}), 'foo') actual = self.dv.aggregate(np.mean, 'abc') @@ -179,25 +179,25 @@ def test_aggregate(self): self.assertViewEqual(expected, actual) def test_from_stack(self): - self.ds['bar'] = Variable(['x', 'y'], np.random.randn(10, 20)) + self.ds['bar'] = Array(['x', 'y'], np.random.randn(10, 20)) foo = self.ds['foo'] bar = self.ds['bar'].renamed('foo') # from dataviews: - self.assertVarEqual(Variable(['w', 'x', 'y'], + self.assertVarEqual(Array(['w', 'x', 'y'], np.array([foo.data, bar.data])), - DataView.from_stack([foo, bar], 'w')) + DatasetArray.from_stack([foo, bar], 'w')) # from variables: - self.assertVarEqual(Variable(['w', 'x', 'y'], + self.assertVarEqual(Array(['w', 'x', 'y'], np.array([foo.data, bar.data])), - DataView.from_stack([foo.variable, - bar.variable], 'w')) + DatasetArray.from_stack([foo.variable, + bar.variable], 'w')) # from iteration: - stacked = DataView.from_stack((v for _, v in foo.iterator('x')), + stacked = DatasetArray.from_stack((v for _, v in foo.iterator('x')), self.ds['x']) self.assertViewEqual(foo, stacked) def test_intersection(self): - self.ds.set_variable('x', Variable(['x'], np.array(list('abcdefghij')))) + self.ds.set_variable('x', Array(['x'], np.array(list('abcdefghij')))) with self.assertRaises(ValueError): self.dv + self.dv[:5] dv1, dv2 = intersection(self.dv, self.dv[:5]) diff --git a/test/test_utils.py b/test/test_utils.py index 429a6347f46..204afba3869 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -19,9 +19,9 @@ def test_expanded_indexer(self): for i in [I[:], I[...], I[0, :, 10], I[..., 10], I[:5, ..., 0], I[y], I[y, y], I[..., y, y], I[..., 0, 1, 2, 3, 4]]: j = 
utils.expanded_indexer(i, x.ndim) - self.assertArrayEqual(x[i], x[j]) - self.assertArrayEqual(self.set_to_zero(x, i), - self.set_to_zero(x, j)) + self.assertNDArrayEqual(x[i], x[j]) + self.assertNDArrayEqual(self.set_to_zero(x, i), + self.set_to_zero(x, j)) def test_orthogonal_indexer(self): x = np.random.randn(10, 11, 12, 13, 14) @@ -35,9 +35,9 @@ def test_orthogonal_indexer(self): I[:3, 0, :4], I[:3, 0, :4, 0], I[y], I[:, y], I[0, y], I[:2, :3, y], I[0, y, :, :4, 0]]: j = utils.orthogonal_indexer(i, x.shape) - self.assertArrayEqual(x[i], x[j]) - self.assertArrayEqual(self.set_to_zero(x, i), - self.set_to_zero(x, j)) + self.assertNDArrayEqual(x[i], x[j]) + self.assertNDArrayEqual(self.set_to_zero(x, i), + self.set_to_zero(x, j)) # for more complicated cases, check orthogonal indexing is still # equivalent to slicing z = np.arange(2, 8, 2) @@ -50,9 +50,9 @@ def test_orthogonal_indexer(self): (I[0, :, y, :, 0], I[0, :, :5, :, 0], (11, 5, 13))]: k = utils.orthogonal_indexer(i, x.shape) self.assertEqual(shape, x[k].shape) - self.assertArrayEqual(x[j], x[k]) - self.assertArrayEqual(self.set_to_zero(x, j), - self.set_to_zero(x, k)) + self.assertNDArrayEqual(x[j], x[k]) + self.assertNDArrayEqual(self.set_to_zero(x, j), + self.set_to_zero(x, k)) # standard numpy (non-orthogonal) indexing doesn't work anymore with self.assertRaisesRegexp(ValueError, 'only supports 1d'): utils.orthogonal_indexer(x > 0, x.shape) @@ -66,7 +66,7 @@ def test(self): for calendar in ['standard', 'gregorian', 'proleptic_gregorian']: expected = pd.Index(nc4.num2date(num_dates, units, calendar)) actual = utils.num2datetimeindex(num_dates, units, calendar) - self.assertArrayEqual(expected, actual) + self.assertNDArrayEqual(expected, actual) class TestDictionaries(TestCase): From ad9a9135c4140d6a622578e49638ad78cd7601a1 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Thu, 13 Feb 2014 00:25:45 -0500 Subject: [PATCH 30/45] Array.groupby We should probably remove Array.aggregate to reduce confusion, but for now I'll keep aggregate for error checks. --- README.md | 3 +- src/scidata/array_.py | 127 +++++++++--------- src/scidata/common.py | 30 +++-- src/scidata/dataset.py | 11 +- src/scidata/dataset_array.py | 111 ++++++++++------ src/scidata/groupby.py | 250 +++++++++++++++++++++++++++++++++++ src/scidata/ops.py | 12 +- src/scidata/utils.py | 9 +- test/test_array.py | 45 +++++-- test/test_dataset.py | 4 + test/test_dataset_array.py | 40 +++++- 11 files changed, 505 insertions(+), 137 deletions(-) create mode 100644 src/scidata/groupby.py diff --git a/README.md b/README.md index 7f2b04b86db..0ec48d57d0e 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,8 @@ used for self-describing scientific data (netCDF, OpenDAP, etc.). but keeps ancilliary variables and metadata intact. - Array broadcasting based on dimension names and coordinate indices instead of only shapes. - - Aggregate variables across dimensions or grouped by other variables. + - Flexible split-apply-combine functionality with the `Array.groupby` method + (patterned after [pandas][pandas]). - Fast label-based indexing and (limited) time-series functionality built on [pandas][pandas]. 
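As a rough sketch of the split-apply-combine feature added in this patch
(the dataset and variable names here are placeholders, not part of the patch
itself), grouping over a virtual date component and reducing each group looks
like:

    monthly_means = ds['temperature'].groupby('time.month').mean()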
diff --git a/src/scidata/array_.py b/src/scidata/array_.py index 3a13208089a..6579c5ba7e2 100644 --- a/src/scidata/array_.py +++ b/src/scidata/array_.py @@ -1,12 +1,14 @@ import functools import warnings from collections import OrderedDict +from itertools import izip import numpy as np import conventions import dataset import dataset_array +import groupby import ops import utils from common import AbstractArray @@ -350,6 +352,9 @@ def _collapse(self, f, dim, **kwargs): + ': ' + f.__name__) return new_var + def groupby(self, group_name, group_array, squeeze=True): + return groupby.GroupBy(self, group_name, group_array, squeeze=squeeze) + def aggregate(self, func, new_dim_name, group_by, **kwargs): """Aggregate this variable by applying `func` to grouped elements @@ -396,7 +401,7 @@ def aggregate(self, func, new_dim_name, group_by, **kwargs): @classmethod def from_stack(cls, variables, dimension='stacked_dimension', - length=None): + stacked_indexers=None, length=None, template=None): """Stack variables along a new or existing dimension to form a new variable @@ -406,12 +411,13 @@ def from_stack(cls, variables, dimension='stacked_dimension', Arrays to stack together. Each variable is expected to have matching dimensions and shape except for along the stacked dimension. - dimension : str, optional + dimension : str or DatasetArray, optional Name of the dimension to stack along. This can either be a new dimension name, in which case it is added along axis=0, or an existing dimension name, in which case the location of the dimension is unchanged. Where to insert the new dimension is determined by the first variable. + stacked_indexers : iterable of indexers, optional length : int, optional Length of the new dimension. This is used to allocate the new data array for the stacked variable data before iterating over all @@ -423,73 +429,68 @@ def from_stack(cls, variables, dimension='stacked_dimension', Stacked variable formed by stacking all the supplied variables along the new dimension. """ - if length is None: + if not isinstance(dimension, basestring): + length = dimension.size + dimension, = dimension.dimensions + + if length is None or stacked_indexers is None: # so much for lazy evaluation! 
we need to look at all the variables - # to figure out the dimensions of the stacked variable + # to figure out the indexers and/or dimensions of the stacked + # variable variables = list(variables) - length = 0 - for var in variables: + steps = [var.shape[var.dimensions.index(dimension)] + if dimension in var.dimensions else 1 + for var in variables] + if length is None: + length = sum(steps) + if stacked_indexers is None: + stacked_indexers = [] + i = 0 + for step in steps: + stacked_indexers.append(slice(i, i + step)) + i += step + if i != length: + raise ValueError('actual length of stacked variables ' + 'along %s is %r but expected length was ' + '%s' % (dimension, i, length)) + + # initialize the stacked variable with empty data + first_var, variables = groupby.peek_at(variables) + if dimension in first_var.dimensions: + axis = first_var.dimensions.index(dimension) + shape = tuple(length if n == axis else s + for n, s in enumerate(first_var.shape)) + dims = first_var.dimensions + else: + axis = 0 + shape = (length,) + first_var.shape + dims = (dimension,) + first_var.dimensions + attr = OrderedDict() if template is None else template.attributes + + stacked = cls(dims, np.empty(shape, dtype=first_var.dtype), attr) + stacked.attributes.update(first_var.attributes) + + alt_dims = tuple(d for d in dims if d != dimension) + + # copy in the data from the variables + for var, indexer in izip(variables, stacked_indexers): + if template is None: + # do sanity checks if we don't have a template if dimension in var.dimensions: - axis = var.dimensions.index(dimension) - length += var.shape[axis] - else: - length += 1 - - # manually keep track of progress along - i = 0 - for var in variables: - if i == 0: - # initialize the stacked variable with empty data - if dimension not in var.dimensions: - shape = (length,) + var.shape - dims = (dimension,) + var.dimensions - else: - shape = tuple(length if d == dimension else s - for d, s in zip(var.dimensions, var.shape)) - dims = var.dimensions - stacked = cls(dims, np.empty(shape, dtype=var.dtype), - var.attributes) - # required dimensions (including order) if we have any N - 1 - # dimensional variables - alt_dims = tuple(d for d in dims if d != dimension) - - if dimension in var.dimensions: - # transpose requires that the dimensions are equivalent - var = var.transpose(*stacked.dimensions) - axis = var.dimensions.index(dimension) - step = var.shape[axis] - elif var.dimensions == alt_dims: - step = 1 - else: - raise ValueError('inconsistent dimensions') - - if i + step > length: - raise ValueError('actual length of stacked variables along %s ' - 'is greater than expected length %s' - % (dimension, length)) - - indexer = tuple((slice(i, i + step) if step > 1 else i) - if d == dimension else slice(None) - for d in stacked.dimensions) - # by-pass variable indexing for possible speedup - stacked.data[indexer] = var.data - utils.remove_incompatible_items(stacked.attributes, var.attributes) - i += step - - if i != length: - raise ValueError('actual length of stacked variables along %s is ' - '%s but expected length was %s' - % (dimension, i, length)) + # transpose verifies that the dimensions are equivalent + if var.dimensions != stacked.dimensions: + var = var.transpose(*stacked.dimensions) + elif var.dimensions != alt_dims: + raise ValueError('inconsistent dimensions') + utils.remove_incompatible_items(stacked.attributes, + var.attributes) + + key = tuple(indexer if n == axis else slice(None) + for n in range(stacked.ndim)) + stacked.data[tuple(key)] = 
var.data return stacked - def apply(self, func, *args, **kwargs): - """Apply `func` with *args and **kwargs to this variable's data and - return the result as a new variable with the same dimensions - """ - data = np.asarray(func(self.data, *args, **kwargs)) - return type(self)(self.dimensions, data, self.attributes) - def __array_wrap__(self, result): return type(self)(self.dimensions, result, self.attributes) diff --git a/src/scidata/common.py b/src/scidata/common.py index 12c7a9e3690..eaa23d9fc90 100644 --- a/src/scidata/common.py +++ b/src/scidata/common.py @@ -1,5 +1,20 @@ -class AbstractArray(object): +class ImplementsCollapse(object): + @classmethod + def _collapse_method(cls, f, name=None, module=None): + def func(self, dimension=cls._collapse_dimension_default, + axis=cls._collapse_axis_default, **kwargs): + return self.collapse(f, dimension, axis, **kwargs) + if name is None: + name = f.__name__ + func.__name__ = name + func.__doc__ = cls._collapse_method_docstring.format( + name=('' if module is None else module + '.') + name, + cls=cls.__name__) + return func + + +class AbstractArray(ImplementsCollapse): @property def dtype(self): return self._data.dtype @@ -78,14 +93,5 @@ def T(self): indicated dimension(s) removed. """ - @classmethod - def _collapse_method(cls, f, name=None, module=None): - def func(self, dimension=None, axis=None, **kwargs): - return self.collapse(f, dimension, axis, **kwargs) - if name is None: - name = f.__name__ - func.__name__ = name - func.__doc__ = cls._collapse_method_docstring.format( - name=('' if module is None else module + '.') + name, - cls=cls.__name__) - return func + _collapse_dimension_default = None + _collapse_axis_default = None diff --git a/src/scidata/dataset.py b/src/scidata/dataset.py index 1058f9b73a5..5eb64f99609 100644 --- a/src/scidata/dataset.py +++ b/src/scidata/dataset.py @@ -309,7 +309,7 @@ def virtual_variables(self): for k in self._datetimeindices: for suffix in _DATETIMEINDEX_COMPONENTS + ['season']: possible_vars.append('%s.%s' % (k, suffix)) - return tuple(k for k in possible_vars if k not in self) + return tuple(k for k in possible_vars if k not in self.variables) def __getitem__(self, key): if key not in self.variables: @@ -327,6 +327,9 @@ def __setitem__(self, key, value): # (We would need to change DatasetArray.__setitem__ in that case, because # we definitely don't want to override focus variables.) if isinstance(value, DatasetArray): + # print 'value was ', repr(value) + # print 'renamed to ', repr(value.renamed()) + # print 'setting item', repr(value.renamed(key).dataset) self.merge(value.renamed(key).dataset, inplace=True) elif isinstance(value, array.Array): self.set_variable(key, value) @@ -830,7 +833,8 @@ def unselect(self, *names, **kwargs): *names : str Names of the variables to omit from the returned object. omit_dimensions : bool, optional (default True) - Whether or not to also omit dimensions with the given names. + Whether or not to also omit dimensions with the given names. All + variables along omited dimensions will also be removed. 
Returns ------- @@ -848,6 +852,9 @@ def unselect(self, *names, **kwargs): dimensions = OrderedDict((k, v) for k, v in self.dimensions.iteritems() if k not in names) + variables = OrderedDict((k, v) for k, v in variables.iteritems() + if all(d in dimensions + for d in v.dimensions)) indices = {k: v for k, v in self.indices.cache.items() if k not in names} else: diff --git a/src/scidata/dataset_array.py b/src/scidata/dataset_array.py index 3d70ab48a4e..369667ca044 100644 --- a/src/scidata/dataset_array.py +++ b/src/scidata/dataset_array.py @@ -8,6 +8,7 @@ import array_ import dataset +import groupby import ops from common import AbstractArray from utils import expanded_indexer, FrozenOrderedDict, remap_loc_indexers @@ -57,6 +58,12 @@ def __init__(self, dataset, focus): self.dataset = dataset self.focus = focus + @classmethod + def create(cls, focus, dimensions, data): + ds = dataset.Dataset() + ds.create_variable(focus, dimensions, data) + return ds[focus] + @property def variable(self): return self.dataset.variables[self.focus] @@ -188,6 +195,13 @@ def unselected(self): """ return self.dataset.unselect(self.focus) + def unselect(self, *names): + if self.focus in names: + raise ValueError('cannot unselect the focus variable of a ' + 'DatasetArray with unselect. Use the `unselected`' + 'method or the `unselect` method of the dataset.') + return type(self)(self.dataset.unselect(*names), self.focus) + def refocus(self, new_var): """Returns a copy of this DatasetArray's dataset with this DatasetArray's focus variable replaced by `new_var` @@ -220,6 +234,14 @@ def iterator(self, dimension): for (x, ds) in self.dataset.iterator(dimension): yield (x, type(self)(ds, self.focus)) + def groupby(self, group, squeeze=True): + if isinstance(group, basestring): + # merge in the group's dataset to allow group to be a virtual + # variable in this dataset + ds = self.dataset.merge(self.dataset[group].dataset) + group = DatasetArray(ds, group) + return groupby.GroupBy(self, group.focus, group, squeeze=squeeze) + def transpose(self, *dimensions): """Return a new DatasetArray object with transposed dimensions @@ -240,7 +262,7 @@ def transpose(self, *dimensions): See Also -------- numpy.transpose - Array.tranpose + Array.transpose """ return self.refocus(self.variable.transpose(*dimensions)) @@ -276,13 +298,12 @@ def collapse(self, func, dimension=None, axis=None, **kwargs): summarized data and the indicated dimension(s) removed. """ var = self.variable.collapse(func, dimension, axis, **kwargs) - dropped_dims = set(self.dimensions) - set(var.dimensions) + drop = set(self.dimensions) - set(var.dimensions) # For now, take an aggressive strategy of removing all variables # associated with any dropped dimensions # TODO: save some summary (mean? bounds?) 
of dropped variables - drop = ({self.focus} | dropped_dims | - {k for k, v in self.dataset.variables.iteritems() - if any(dim in dropped_dims for dim in v.dimensions)}) + drop |= {k for k, v in self.dataset.variables.iteritems() + if any(dim in drop for dim in v.dimensions)} ds = self.dataset.unselect(*drop) ds.add_variable(self.focus, var) return type(self)(ds, self.focus) @@ -323,65 +344,79 @@ def aggregate(self, func, new_dim, **kwargs): return type(self)(ds, self.focus) @classmethod - def from_stack(cls, arrays, dimension='stacked_dimension'): + def from_stack(cls, arrays, dimension='stacked_dimension', + stacked_indexers=None, length=None, template=None): """Stack arrays along a new or existing dimension to form a new dataview Parameters ---------- - arrays : iterable of DatasetArray or Array + arrays : iterable of Array Arrays to stack together. Each variable is expected to have matching dimensions and shape except for along the stacked dimension. - dimension : str or DatasetArray, optional + dimension : str or Array, optional Name of the dimension to stack along. This can either be a new dimension name, in which case it is added along axis=0, or an existing dimension name, in which case the location of the dimension is unchanged. Where to insert the new dimension is - determined by the first dataview. + determined by whether it is found in the first array. + stacked_indexers : optional + length : optional + template : optional Returns ------- stacked : DatasetArray - Stacked dataview formed by stacking all the supplied variables + Stacked dataset array formed by stacking all the supplied variables along the new dimension. """ - arrays = list(arrays) - if not arrays: - raise ValueError('DatasetArray.from_stack was supplied with an ' - 'empty argument') - # create an empty dataset in which to stack variables # start by putting in the dimension variable ds = dataset.Dataset() if isinstance(dimension, basestring): dim_name = dimension else: - dim_name = dimension.focus - ds[dim_name] = dimension - - # figure out metadata for each dataview - focus = None - for array in arrays: - if isinstance(array, cls): - ds.merge(array.unselected(), inplace=True) - if focus is None: - focus = array.focus - elif focus != array.focus: - raise ValueError('DatasetArray.from_stack requires that all ' - 'stacked views have the same focus') - if focus is None: - focus = 'stacked_variable' + dim_name, = dimension.dimensions + if hasattr(dimension, 'focus'): + ds[dimension.focus] = dimension + + if template is not None: + # use metadata from the template dataset array + focus = template.focus + drop = {k for k, v in template.dataset.variables.iteritems() + if k in [focus, dim_name]} + ds.merge(template.dataset.unselect(*drop), inplace=True) + else: + # figure out metadata by inspecting each array + focus = None + arrays = list(arrays) + for array in arrays: + if isinstance(array, cls): + unselected = array.unselected() + if dim_name in unselected: + unselected = unselected.unselect(dim_name) + ds.merge(unselected, inplace=True) + if focus is None: + focus = array.focus + elif focus != array.focus: + raise ValueError('DatasetArray.from_stack requires ' + 'that all stacked views have the ' + 'same focus') + if focus is None: + focus = 'stacked_variable' # finally, merge in the stacked variables - ds[focus] = array_.Array.from_stack(arrays, dim_name) - return cls(ds, focus) - - def apply(self, func, *args, **kwargs): - """Apply `func` with *args and **kwargs to this array's data and - return the result as a new 
array - """ - return self.refocus(self.variable.apply(func, *args, **kwargs)) + ds[focus] = array_.Array.from_stack(arrays, dimension, + stacked_indexers, length, template) + stacked = cls(ds, focus) + + if template is not None: + drop = set(template.dataset.dimensions) - set(stacked.dimensions) + drop |= {k for k, v in ds.variables.iteritems() + if any(dim in drop for dim in v.dimensions)} + stacked = stacked.unselect(*drop) + return stacked def to_dataframe(self): """Convert this array into a pandas.DataFrame diff --git a/src/scidata/groupby.py b/src/scidata/groupby.py new file mode 100644 index 00000000000..d335b909daa --- /dev/null +++ b/src/scidata/groupby.py @@ -0,0 +1,250 @@ +import itertools + +from common import ImplementsCollapse +from ops import inject_collapse_methods +import array_ +import dataset_array +import numpy as np + + +def unique_value_groups(ar): + """Group an array by its unique values + + Parameters + ---------- + ar : array_like + Input array. This will be flattened if it is not already 1-D. + + Returns + ------- + values : np.ndarray + Sorted, unique values as returned by `np.unique`. + indices : list of lists of int + Each element provides the integer indices in `ar` with values given by + the corresponding value in `unique_values`. + """ + values, inverse = np.unique(ar, return_inverse=True) + groups = [[] for _ in range(len(values))] + for n, g in enumerate(inverse): + groups[g].append(n) + return values, groups + + +def peek_at(iterable): + """Returns the first value from iterable, as well as a new iterable with + the same content as the original iterable + """ + gen = iter(iterable) + peek = gen.next() + return peek, itertools.chain([peek], gen) + + +class GroupBy(ImplementsCollapse): + """A object that implements the split-apply-combine pattern + + Modeled after `pandas.GroupBy`. The `GroupBy` object can be iterated over + (unique_value, grouped_array) pairs, but the main way to interact with a + groupby object are with the `apply` or `collapse` methods. You can also + directly call numpy methods like `mean` or `std`. + + See Also + -------- + Array.groupby + DatasetArray.groupby + """ + def __init__(self, array, group_name, group_coord, squeeze=True): + """See Array.groupby and DatasetArray.groupby + """ + if group_coord.ndim != 1: + # TODO: remove this limitation? 
+ raise ValueError('`group_coord` must be 1 dimensional') + + self.array = array + self.group_coord = group_coord + self.group_dim, = group_coord.dimensions + self.group_axis = array.dimensions.index(self.group_dim) + + if group_coord.size != array.shape[self.group_axis]: + raise ValueError('the group variable\'s length does not ' + 'match the length of this variable along its ' + 'dimension') + + if group_name in array.dimensions: + # assume that group_coord already has sorted, unique values + if group_coord.dimensions != (group_name,): + raise ValueError('`group_coord` is required to be a coordinate ' + 'variable along the `group_name` dimension ' + 'if `group_name` is a dimension in `array`') + group_indices = np.arange(group_coord.size) + if not squeeze: + # group_indices = group_indices.reshape(-1, 1) + # use slices to do views instead of fancy indexing + group_indices = [slice(i, i + 1) for i in group_indices] + unique_coord = group_coord + else: + # look through group_coord to find the unique values + unique_values, group_indices = unique_value_groups(group_coord) + unique_coord = dataset_array.DatasetArray.create( + group_name, [group_name], unique_values) + + self.group_indices = group_indices + self.unique_coord = unique_coord + self._groups = None + + @property + def groups(self): + # provided for compatibility with pandas.groupby + if self._groups is None: + self._groups = dict(zip(self.unique_coord, self.group_indices)) + return self._groups + + def __len__(self): + return self.unique_coord.size + + def __iter__(self): + return itertools.izip(self.unique_coord, self.iter_arrays()) + + def iter_fast(self): + # extract the underlying Array object + array = self.array + if hasattr(self.array, 'variable'): + array = array.variable + + # build the new dimensions + index_int = isinstance(self.group_indices[0], int) + if index_int: + dims = tuple(d for n, d in enumerate(array.dimensions) + if n != self.group_axis) + else: + dims = array.dimensions + + # slice the data and build the new Arrays directly + for indices in self.group_indices: + indexer = tuple(indices if n == self.group_axis else slice(None) + for n in range(array.ndim)) + data = array.data[indexer] + yield array_.Array(dims, data) + + def iter_arrays(self): + for indices in self.group_indices: + yield self.array.indexed_by(**{self.group_dim: indices}) + + def apply(self, func, shortcut=True, **kwargs): + """Apply a function over each array in the group and stack them + together into a new array + + `func` is called like `func(ar, *args, **kwargs)` for each array `ar` + in this group. + + Apply uses heuristics (like `pandas.GroupBy.apply`) to figure out how + to stack together the array. The rule is: + 1. If the dimension along which the group coordinate is defined is + still in the first grouped array after applying `func`, then stack + over this dimension. + 2. Otherwise, stack over the new dimension given by name of this + grouping (the argument to the `groupby` function). + + Parameters + ---------- + func : function + Callable to apply to each array. + shortcut : bool, optional + Whether or not to shortcut evaluation under the assumptions that: + (1) The action of `func` does not depend on any of the array + metadata (attributes, indices or other contained arrays) but + only on the data and dimensions. + (2) The action of `func` creates arrays with homogeneous metadata, + that is, with the same dimensions and attributes. 
+ If these conditions are satisfied (and they should be in most + cases), the `shortcut` provides significant speedup for common + groupby operations like applying numpy ufuncs. + **kwargs + Used to call `func(ar, **kwargs)` for each array `ar. + + Returns + ------- + applied : Array + A new Array of the same type from which this grouping was created. + """ + shortcut = kwargs.pop('shortcut', True) + applied = (func(ar, **kwargs) for ar in (self.iter_fast() if shortcut + else self.iter_array())) + + # peek at applied to determine which coordinate to stack over + applied_example, applied = peek_at(applied) + if self.group_dim in applied_example.dimensions: + stack_coord = self.group_coord + indexers = self.group_indices + else: + stack_coord = self.unique_coord + indexers = np.arange(self.unique_coord.size) + + from_stack_kwargs = {'template': self.array} if shortcut else {} + stacked = type(self.array).from_stack(applied, stack_coord, indexers, + **from_stack_kwargs) + + # now, reorder the stacked array's dimensions so that those that + # appeared in the original array appear in the same order they did + # originally + stack_dim, = stack_coord.dimensions + original_dims = [stack_dim if d == self.group_dim else d + for d in self.array.dimensions + if d in stacked.dimensions or d == self.group_dim] + iter_original_dims = iter(original_dims) + new_order = [iter_original_dims.next() if d in original_dims else d + for d in stacked.dimensions] + return stacked.transpose(*new_order) + + def collapse(self, func, dimension=Ellipsis, axis=Ellipsis, shortcut=True, + **kwargs): + # Ellipsis is used as a sentinel value for the altered default + if axis is Ellipsis and dimension is Ellipsis: + dimension = self.group_dim + if dimension is Ellipsis: + dimension = None + if axis is Ellipsis: + axis = None + def collapse_array(ar): + return ar.collapse(func, dimension, axis, **kwargs) + return self.apply(collapse_array, shortcut=shortcut) + + _collapse_method_docstring = \ + """Collapse this {cls}'s data' by applying `{name}` along some + dimension(s) + + Parameters + ---------- + dimension : str or sequence of str, optional + Dimension(s) over which to repeatedly apply `{name}`. + axis : int or sequence of int, optional + Axis(es) over which to repeatedly apply `{name}`. Only one of the + 'dimension' and 'axis' arguments can be supplied. If neither are + supplied, then `{name}` is calculated over the axis of the variable + over which the group was formed. + **kwargs : dict + Additional keyword arguments passed on to `{name}`. + + Note + ---- + If this method is called with multiple dimensions (or axes, which are + converted into dimensions), then `{name}` is performed repeatedly along + each dimension in turn from left to right. + + `Ellipsis` is used as the default dimension and axis for this method to + indicate that this operation is by default applied along the axis along + which the grouping variable lies. To instead apply `{name}` + simultaneously over all grouped values, use `dimension=None` (or + equivalently `axis=None`). + + Returns + ------- + collapsed : {cls} + New {cls} object with `{name}` applied to its data and the + indicated dimension(s) removed. 
+ """ + + _collapse_dimension_default = Ellipsis + _collapse_axis_default = Ellipsis + + +inject_collapse_methods(GroupBy) diff --git a/src/scidata/ops.py b/src/scidata/ops.py index 7e450aa1645..264232d9ff7 100644 --- a/src/scidata/ops.py +++ b/src/scidata/ops.py @@ -36,6 +36,13 @@ def func(self, *args, **kwargs): return func +def inject_collapse_methods(cls): + # TODO: change these to use methods instead of numpy functions + for name in NUMPY_COLLAPSE_METHODS: + setattr(cls, name, cls._collapse_method(getattr(np, name), + name, 'numpy')) + + def inject_special_operations(cls, priority=50): # priortize our operations over those of numpy.ndarray (priority=1) # and numpy.matrix (priority=10) @@ -59,7 +66,4 @@ def inject_special_operations(cls, priority=50): setattr(cls, name, _data_method_wrapper(name)) for name in NUMPY_UNARY_METHODS: setattr(cls, name, cls._unary_op(_method_wrapper(name))) - # TODO: change these to use methods instead of numpy functions - for name in NUMPY_COLLAPSE_METHODS: - setattr(cls, name, cls._collapse_method(getattr(np, name), - name, 'numpy')) + inject_collapse_methods(cls) diff --git a/src/scidata/utils.py b/src/scidata/utils.py index f9334321c95..d8cd0be39b8 100644 --- a/src/scidata/utils.py +++ b/src/scidata/utils.py @@ -107,7 +107,7 @@ def num2datetimeindex(num_dates, units, calendar=None): return pd.Index(dates) -def variable_equal(v1, v2): +def variable_equal(v1, v2, rtol=1e-05, atol=1e-08): """True if two objects have the same dimensions, attributes and data; otherwise False @@ -126,7 +126,12 @@ def variable_equal(v1, v2): pass # TODO: replace this with a NaN safe version. # see: pandas.core.common.array_equivalent - return np.array_equal(v1.data, v2.data) + data1 = v1.data + data2 = v2.data + if np.issubdtype(data1.dtype, (str, object)): + return np.array_equal(data1, data2) + else: + return np.allclose(data1, data2, rtol=rtol, atol=atol) else: return False diff --git a/test/test_array.py b/test/test_array.py index 82c03d41b9e..c80ec736d2c 100644 --- a/test/test_array.py +++ b/test/test_array.py @@ -185,14 +185,6 @@ def test_array_interface(self): # test ufuncs self.assertVarEqual(np.sin(v), Array(['x'], np.sin(x))) - def test_apply(self): - x = np.arange(5) - v = Array(['x'], x) - def numpy_only_square(x): - return np.asarray(x) ** 2 - self.assertNDArrayEqual(x ** 2, numpy_only_square(v)) - self.assertVarEqual(v ** 2, v.apply(numpy_only_square)) - def test_collapse(self): v = Array(['time', 'x'], self.d) # intentionally test with an operation for which order matters @@ -209,14 +201,41 @@ def test_collapse(self): {'cell_methods': 'time: x: std'})) self.assertVarEqual(v.mean('time'), v.collapse(np.mean, 'time')) + def test_groupby(self): + agg_var = Array(['y'], np.array(['a', 'a', 'b'])) + v = Array(['x', 'y'], self.d) + + expected_unique = Array(['abc'], np.array(['a', 'b'])) + expected_aggregated = Array(['x', 'abc'], + np.array([self.d[:, :2].sum(axis=1), + self.d[:, 2:].sum(axis=1)]).T, + {'cell_methods': 'y: sum'}) + + x = Array('x', np.arange(10)) + y = Array('y', np.arange(3)) + self.assertVarEqual(v, v.groupby('y', y).apply(lambda x: x)) + self.assertVarEqual(v, v.groupby('x', x).apply(lambda x: x)) + + grouped = v.groupby('abc', agg_var) + self.assertVarEqual(expected_unique, grouped.unique_coord) + self.assertVarEqual(v, grouped.apply(lambda x: x)) + self.assertVarEqual(expected_aggregated, grouped.collapse(np.sum)) + + actual = list(grouped) + expected = zip(expected_unique, [v[:, :2], v[:, 2:]]) + self.assertEqual(len(expected), len(actual)) + 
for (ke, ve), (ka, va) in zip(expected, actual): + self.assertVarEqual(ke, ka) + self.assertVarEqual(ve, va) + def test_aggregate(self): agg_var = Array(['y'], np.array(['a', 'a', 'b'])) v = Array(['x', 'y'], self.d) expected_unique = Array(['abc'], np.array(['a', 'b'])) expected_aggregated = Array(['x', 'abc'], - np.array([self.d[:, :2].sum(axis=1), - self.d[:, 2:].sum(axis=1)]).T, - {'cell_methods': 'y: sum'}) + np.array([self.d[:, :2].sum(axis=1), + self.d[:, 2:].sum(axis=1)]).T, + {'cell_methods': 'y: sum'}) actual_unique, actual_aggregated = v.aggregate(np.sum, 'abc', agg_var) self.assertVarEqual(expected_unique, actual_unique) self.assertVarEqual(expected_aggregated, actual_aggregated) @@ -238,9 +257,9 @@ def test_from_stack(self): Array.from_stack((v, w), 'b')) self.assertVarEqual(Array(['b', 'a'], np.array([x, y])), Array.from_stack((v, w), 'b', length=2)) - with self.assertRaisesRegexp(ValueError, 'greater than expected'): + with self.assertRaisesRegexp(ValueError, 'actual length'): Array.from_stack([v, w], 'b', length=1) - with self.assertRaisesRegexp(ValueError, 'but expected length was'): + with self.assertRaisesRegexp(ValueError, 'actual length'): Array.from_stack([v, w, w], 'b', length=4) with self.assertRaisesRegexp(ValueError, 'inconsistent dimensions'): Array.from_stack([v, Array(['c'], y)], 'b') diff --git a/test/test_dataset.py b/test/test_dataset.py index 504f56f725b..9b692e8e94e 100644 --- a/test/test_dataset.py +++ b/test/test_dataset.py @@ -262,6 +262,10 @@ def test_select(self): self.assertTrue(_vars.keys()[1] not in ret.variables) self.assertRaises(ValueError, data.select, (_testvar, 'not_a_var')) + @unittest.skip('need to write this test') + def test_unselect(self): + pass + def test_copy(self): data = create_test_data(self.get_store()) var = data.variables[_testvar] diff --git a/test/test_dataset_array.py b/test/test_dataset_array.py index e53a1d1efcf..1d4c03d3d65 100644 --- a/test/test_dataset_array.py +++ b/test/test_dataset_array.py @@ -164,9 +164,45 @@ def test_collapse(self): # needs more... 
# should check which extra dimensions are dropped + def test_groupby(self): + agg_var = Array(['y'], np.array(['a'] * 9 + ['c'] + ['b'] * 10)) + self.dv['abc'] = agg_var + self.dv['y'] = 20 + 100 * self.ds['y'].variable + + identity = lambda x: x + self.assertViewEqual(self.dv, self.dv.groupby('x').apply(identity)) + self.assertViewEqual(self.dv, self.dv.groupby('x', squeeze=False + ).apply(identity)) + self.assertViewEqual(self.dv, self.dv.groupby('y').apply(identity)) + self.assertViewEqual(self.dv, self.dv.groupby('y', squeeze=False + ).apply(identity)) + + grouped = self.dv.groupby('abc') + + expected_sum_all = DatasetArray(Dataset( + {'foo': Array(['abc'], np.array([self.x[:, :9].sum(), + self.x[:, 10:].sum(), + self.x[:, 9:10].sum()]).T, + {'cell_methods': 'x: y: sum'}), + 'abc': Array(['abc'], np.array(['a', 'b', 'c']))}), 'foo') + self.assertViewEqual(expected_sum_all, + grouped.collapse(np.sum, dimension=None)) + self.assertViewEqual(expected_sum_all, grouped.sum(dimension=None)) + + expected_sum_axis1 = DatasetArray(Dataset( + {'foo': Array(['x', 'abc'], np.array([self.x[:, :9].sum(1), + self.x[:, 10:].sum(1), + self.x[:, 9:10].sum(1)]).T, + {'cell_methods': 'y: sum'}), + 'x': self.ds.variables['x'], + 'abc': Array(['abc'], np.array(['a', 'b', 'c']))}), 'foo') + self.assertViewEqual(expected_sum_axis1, grouped.collapse(np.sum)) + self.assertViewEqual(expected_sum_axis1, grouped.sum()) + + self.assertViewEqual(self.dv, grouped.apply(identity)) + def test_aggregate(self): - agg_var = Array(['y'], np.array(['a'] * 9 + ['c'] + ['b'] * 7 + - ['c'] * 3)) + agg_var = Array(['y'], np.array(['a'] * 9 + ['c'] + ['b'] * 10)) self.ds.add_variable('abc', agg_var) expected_unique, expected_var = \ self.dv.variable.aggregate(np.mean, 'abc', agg_var) From f08e2ebc58e34485eccdc8e25f9deb9afc9ed4a9 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Fri, 14 Feb 2014 16:01:31 -0500 Subject: [PATCH 31/45] Renamed package from 'scidata' to 'xray' --- README.md | 23 +++++++++++++---------- doc/conf.py | 14 +++++++------- doc/index.rst | 4 ++-- setup.py | 4 ++-- src/{scidata => xray}/__init__.py | 0 src/{scidata => xray}/array_.py | 4 ++-- src/{scidata => xray}/backends.py | 0 src/{scidata => xray}/common.py | 0 src/{scidata => xray}/conventions.py | 0 src/{scidata => xray}/dataset.py | 7 +++---- src/{scidata => xray}/dataset_array.py | 4 ++-- src/{scidata => xray}/groupby.py | 0 src/{scidata => xray}/ops.py | 0 src/{scidata => xray}/utils.py | 0 test/__init__.py | 2 +- test/test_array.py | 4 ++-- test/test_dataset.py | 4 ++-- test/test_dataset_array.py | 2 +- test/test_utils.py | 2 +- 19 files changed, 38 insertions(+), 36 deletions(-) rename src/{scidata => xray}/__init__.py (100%) rename src/{scidata => xray}/array_.py (99%) rename src/{scidata => xray}/backends.py (100%) rename src/{scidata => xray}/common.py (100%) rename src/{scidata => xray}/conventions.py (100%) rename src/{scidata => xray}/dataset.py (99%) rename src/{scidata => xray}/dataset_array.py (99%) rename src/{scidata => xray}/groupby.py (100%) rename src/{scidata => xray}/ops.py (100%) rename src/{scidata => xray}/utils.py (100%) diff --git a/README.md b/README.md index 0ec48d57d0e..b79cd9e0343 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,13 @@ -# scidata: objects for working with scientific data in Python +# xray: transparently manipulate scientific datasets in Python -**scidata** is a Python package for working with aligned sets of homogeneous, +**xray** is a Python package for working with aligned sets of homogeneous, 
n-dimensional arrays. It implements flexible array operations and dataset manipulation for in-memory datasets within the [Common Data Model][cdm] widely used for self-describing scientific data (netCDF, OpenDAP, etc.). +***Warning: xray is still in its early development phase. Expect the API to +change.*** + ## Main Feaures - A `DatasetArray` object that is compatible with NumPy's ndarray and ufuncs @@ -34,14 +37,14 @@ used for self-describing scientific data (netCDF, OpenDAP, etc.). - [Iris][iris] (supported by the UK Met office) is a similar package designed for working with geophysical datasets in Python. Iris provided - much of the inspiration for scidata (e.g., scidata's `DatasetArray` is - largely based on the Iris `Cube`), but it has several limitations that led - us to build scidata instead of extending Iris: + much of the inspiration for xray (e.g., xray's `DatasetArray` is largely + based on the Iris `Cube`), but it has several limitations that led us to + build xray instead of extending Iris: 1. Iris has essentially one first-class object (the `Cube`) on which it attempts to build all functionality (`Coord` supports a much more - limited set of functionality). scidata has its equivalent of the Cube + limited set of functionality). xray has its equivalent of the Cube (the `DatasetArray` object), but it is only a thin wrapper on the more - primitive building blocks of Dataset and Variable objects. + primitive building blocks of Dataset and Array objects. 2. Iris has a strict interpretation of [CF conventions][cf], which, although a principled choice, we have found to be impractical for everyday uses. With Iris, every quantity has physical (SI) units, all @@ -55,14 +58,14 @@ used for self-describing scientific data (netCDF, OpenDAP, etc.). models of how Iris functions work. Moreover, it means that a lot of logic (e.g., constraint handling) uses non-vectorized operations. For example, extracting all times within a range can be surprisingly slow - (e.g., 0.3 seconds vs 3 milliseconds in scidata to select along a time + (e.g., 0.3 seconds vs 3 milliseconds in xray to select along a time dimension with 10000 elements). - [pandas][pandas] is fast and powerful but oriented around working with tabular datasets. pandas has experimental N-dimensional panels, but they don't support aligned math with other objects. We believe the `DatasetArray`/ `Cube` model is better suited to working with scientific - datasets. We use pandas internally in scidata to support fast indexing. - - [netCDF4-python][nc4] provides scidata's primary interface for working with + datasets. We use pandas internally in xray to support fast indexing. + - [netCDF4-python][nc4] provides xray's primary interface for working with netCDF and OpenDAP datasets. [pandas]: http://pandas.pydata.org/ diff --git a/doc/conf.py b/doc/conf.py index 6107216b9a4..b3774bfff87 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# scidata documentation build configuration file, created by +# xray documentation build configuration file, created by # sphinx-quickstart on Thu Feb 6 18:57:54 2014. # # This file is execfile()d with the current directory set to its @@ -53,7 +53,7 @@ master_doc = 'index' # General information about the project. -project = u'scidata' +project = u'xray' copyright = u'2014, Stephan Hoyer and Alex Kleeman' # The version info for the project you're documenting, acts as replacement for @@ -186,7 +186,7 @@ #html_file_suffix = None # Output file base name for HTML help builder. 
-htmlhelp_basename = 'scidatadoc' +htmlhelp_basename = 'xraydoc' # -- Options for LaTeX output --------------------------------------------- @@ -206,7 +206,7 @@ # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ - ('index', 'scidata.tex', u'scidata Documentation', + ('index', 'xray.tex', u'xray Documentation', u'Stephan Hoyer and Alex Kleeman', 'manual'), ] @@ -236,7 +236,7 @@ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [ - ('index', 'scidata', u'scidata Documentation', + ('index', 'xray', u'xray Documentation', [u'Stephan Hoyer and Alex Kleeman'], 1) ] @@ -250,8 +250,8 @@ # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - ('index', 'scidata', u'scidata Documentation', - u'Stephan Hoyer and Alex Kleeman', 'scidata', 'One line description of project.', + ('index', 'xray', u'xray Documentation', + u'Stephan Hoyer and Alex Kleeman', 'xray', 'One line description of project.', 'Miscellaneous'), ] diff --git a/doc/index.rst b/doc/index.rst index 09eae1412bb..29639696b24 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -3,8 +3,8 @@ You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. -SciData reference -================= +xray reference +============== Contents: diff --git a/setup.py b/setup.py index ea96e79939b..e2895baf583 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ except: from distutils.core import setup -setup(name='scidata', +setup(name='xray', version='0.1-dev', description='Objects for holding self describing scientific data in python', author='Stephan Hoyer, Alex Kleeman', @@ -15,5 +15,5 @@ tests_require=['nose >= 1.0'], url='https://github.com/akleeman/scidata', test_suite='nose.collector', - packages=['scidata'], + packages=['xray'], package_dir={'': 'src'}) diff --git a/src/scidata/__init__.py b/src/xray/__init__.py similarity index 100% rename from src/scidata/__init__.py rename to src/xray/__init__.py diff --git a/src/scidata/array_.py b/src/xray/array_.py similarity index 99% rename from src/scidata/array_.py rename to src/xray/array_.py index 6579c5ba7e2..9429f23acb2 100644 --- a/src/scidata/array_.py +++ b/src/xray/array_.py @@ -224,7 +224,7 @@ def __repr__(self): contents = ' (%s): %s' % (dim_summary, self.dtype) else: contents = ': %s' % self.data - return '' % (type(self).__name__, contents) + return '' % (type(self).__name__, contents) def indexed_by(self, **indexers): """Return a new variable indexed along the specified dimension(s) @@ -591,7 +591,7 @@ def _broadcast_variable_data(self, other): if isinstance(other, dataset.Dataset): raise TypeError('datasets do not support mathematical operations') elif all(hasattr(other, attr) for attr in ['dimensions', 'data', 'shape']): - # `other` satisfies the scidata.Array API + # `other` satisfies the xray.Array API new_self, new_other = broadcast_variables(self, other) self_data = new_self.data other_data = new_other.data diff --git a/src/scidata/backends.py b/src/xray/backends.py similarity index 100% rename from src/scidata/backends.py rename to src/xray/backends.py diff --git a/src/scidata/common.py b/src/xray/common.py similarity index 100% rename from src/scidata/common.py rename to src/xray/common.py diff --git a/src/scidata/conventions.py b/src/xray/conventions.py similarity index 100% rename from src/scidata/conventions.py rename to 
src/xray/conventions.py diff --git a/src/scidata/dataset.py b/src/xray/dataset.py similarity index 99% rename from src/scidata/dataset.py rename to src/xray/dataset.py index 5eb64f99609..4c542e031b2 100644 --- a/src/scidata/dataset.py +++ b/src/xray/dataset.py @@ -139,7 +139,7 @@ def __repr__(self): (k, str(v).replace( '\n', '\n' + ' ' * (len(k) + 4))) for k, v in self.items()) - return ("\n%s" + return ("\n%s" % (type(self).__name__, contents)) @@ -432,9 +432,8 @@ def __str__(self): def __repr__(self): dim_summary = ', '.join('%s%s: %s' % ('@' if k in self else '', k, v) for k, v in self.dimensions.iteritems()) - return '' % (type(self).__name__, - dim_summary, - ' '.join(self.noncoordinates)) + return '' % (type(self).__name__, dim_summary, + ' '.join(self.noncoordinates)) def create_variable(self, name, dims, data, attributes=None): """Create a new variable and add it to this dataset diff --git a/src/scidata/dataset_array.py b/src/xray/dataset_array.py similarity index 99% rename from src/scidata/dataset_array.py rename to src/xray/dataset_array.py index 369667ca044..546904af449 100644 --- a/src/scidata/dataset_array.py +++ b/src/xray/dataset_array.py @@ -46,7 +46,7 @@ def __init__(self, dataset, focus): """ Parameters ---------- - dataset : scidata.Dataset + dataset : xray.Dataset The dataset on which to build this data view. focus : str The name of the "focus variable" in `dataset` on which this object @@ -155,7 +155,7 @@ def __repr__(self): contents = ' (%s): %s' % (dim_summary, self.dtype) else: contents = ': %s' % self.data - return '' % (type(self).__name__, self.focus, contents) + return '' % (type(self).__name__, self.focus, contents) def indexed_by(self, **indexers): """Return a new dataset array whose dataset is given by indexing along diff --git a/src/scidata/groupby.py b/src/xray/groupby.py similarity index 100% rename from src/scidata/groupby.py rename to src/xray/groupby.py diff --git a/src/scidata/ops.py b/src/xray/ops.py similarity index 100% rename from src/scidata/ops.py rename to src/xray/ops.py diff --git a/src/scidata/utils.py b/src/xray/utils.py similarity index 100% rename from src/scidata/utils.py rename to src/xray/utils.py diff --git a/test/__init__.py b/test/__init__.py index 8ba27cd3329..efb7ef80aaa 100644 --- a/test/__init__.py +++ b/test/__init__.py @@ -2,7 +2,7 @@ from numpy.testing import assert_array_equal -from scidata import utils +from xray import utils class TestCase(unittest.TestCase): diff --git a/test/test_array.py b/test/test_array.py index c80ec736d2c..f6a5423815d 100644 --- a/test/test_array.py +++ b/test/test_array.py @@ -2,7 +2,7 @@ import numpy as np -from scidata import Array, Dataset +from xray import Array, Dataset from . import TestCase @@ -41,7 +41,7 @@ def test_properties(self): def test_repr(self): v = Array(['time', 'x'], self.d) - self.assertEqual('', + self.assertEqual('', repr(v)) def test_items(self): diff --git a/test/test_dataset.py b/test/test_dataset.py index 9b692e8e94e..bc9b06e9656 100644 --- a/test/test_dataset.py +++ b/test/test_dataset.py @@ -8,7 +8,7 @@ import numpy as np import pandas as pd -from scidata import Dataset, DatasetArray, Array, backends, open_dataset +from xray import Dataset, DatasetArray, Array, backends, open_dataset from . 
import TestCase @@ -42,7 +42,7 @@ def get_store(self): def test_repr(self): data = create_test_data(self.get_store()) - self.assertEqual('', repr(data)) def test_init(self): diff --git a/test/test_dataset_array.py b/test/test_dataset_array.py index 1d4c03d3d65..7861d0b6662 100644 --- a/test/test_dataset_array.py +++ b/test/test_dataset_array.py @@ -1,6 +1,6 @@ import numpy as np -from scidata import Dataset, DatasetArray, Array, intersection +from xray import Dataset, DatasetArray, Array, intersection from . import TestCase, ReturnItem diff --git a/test/test_utils.py b/test/test_utils.py index 204afba3869..b73b859adc9 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -2,7 +2,7 @@ import numpy as np import pandas as pd -from scidata import utils +from xray import utils from . import TestCase, ReturnItem From cf3d6e261e3c59902698aef8f29cd49f06c551d4 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Fri, 14 Feb 2014 21:03:53 -0500 Subject: [PATCH 32/45] Updated required numpy version to 1.8 We need this for our current use of np.array_equal. --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index e2895baf583..f80c7269616 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ description='Objects for holding self describing scientific data in python', author='Stephan Hoyer, Alex Kleeman', author_email='TODO', - install_requires=['scipy >= 0.10.0', 'numpy >= 1.7', 'netCDF4 >= 1.0.6', + install_requires=['scipy >= 0.10.0', 'numpy >= 1.8', 'netCDF4 >= 1.0.6', 'pandas >= 0.13.1'], tests_require=['nose >= 1.0'], url='https://github.com/akleeman/scidata', From 0bd4c2980d9a9057b285d33db03fd446507e4b11 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Sat, 15 Feb 2014 14:05:20 -0500 Subject: [PATCH 33/45] Added TODO notes --- src/xray/array_.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/xray/array_.py b/src/xray/array_.py index 9429f23acb2..24b6128deaf 100644 --- a/src/xray/array_.py +++ b/src/xray/array_.py @@ -164,6 +164,8 @@ def __setitem__(self, key, value): """__setitem__ is overloaded to access the underlying numpy data with orthogonal indexing (see __getitem__ for more details) """ + # TODO: change this to copy on write (by setting self.data) instead of + # modifying the original array (self._data). self._data[self._remap_indexer(key)] = value def __iter__(self): @@ -281,6 +283,7 @@ def transpose(self, *dimensions): data = self.data.transpose(*axes) return type(self)(dimensions, data, self.attributes) + # TODO: rename this method to 'reduce' def collapse(self, func, dimension=None, axis=None, **kwargs): """Collapse this variable by applying `func` along some dimension(s) @@ -355,6 +358,7 @@ def _collapse(self, f, dim, **kwargs): def groupby(self, group_name, group_array, squeeze=True): return groupby.GroupBy(self, group_name, group_array, squeeze=squeeze) + # TODO: remove this method (groupby encompasses its functionality) def aggregate(self, func, new_dim_name, group_by, **kwargs): """Aggregate this variable by applying `func` to grouped elements From 0af583e60398ea3f2d9a6e4a62d878993a48effe Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Sun, 16 Feb 2014 00:14:30 -0500 Subject: [PATCH 34/45] Simplified Dataset Implements most of GitHub issue #13. 
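
In rough terms, a Dataset now wraps an ordered mapping of variables:
dimension sizes are inferred from those variables, coordinates are stored as
ordinary variables whose data is a pandas.Index, and any missing coordinate is
created automatically as an integer range. A minimal usage sketch of the
simplified constructor follows (the variable names, values and sizes are
illustrative only, not part of this patch):

    import numpy as np
    from xray import Array, Dataset

    # variables may be Array objects or (dimensions, data[, attributes])
    # tuples; the 'time' coordinate is not given, so it is created
    # automatically as arange(10) backed by a pandas.Index
    ds = Dataset({'temperature': (('time', 'x'), np.zeros((10, 3))),
                  'x': Array(('x',), np.array([10.0, 20.0, 30.0]))})

    ds.dimensions            # inferred: time: 10, x: 3
    list(ds.coordinates)     # ['time', 'x']
    # assigning a DatasetArray merges its dataset under the new name
    ds['mean_temperature'] = ds['temperature'].mean('x')
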
--- src/xray/array_.py | 47 +-- src/xray/backends.py | 18 +- src/xray/common.py | 8 +- src/xray/dataset.py | 629 ++++++++----------------------------- src/xray/dataset_array.py | 163 +++++----- src/xray/groupby.py | 6 +- src/xray/ops.py | 3 + src/xray/utils.py | 9 +- test/test_array.py | 28 +- test/test_dataset.py | 106 ++----- test/test_dataset_array.py | 153 ++++----- 11 files changed, 400 insertions(+), 770 deletions(-) diff --git a/src/xray/array_.py b/src/xray/array_.py index 24b6128deaf..94548c06c32 100644 --- a/src/xray/array_.py +++ b/src/xray/array_.py @@ -21,14 +21,11 @@ def _as_compatible_data(data): # don't check for __len__ or __iter__ so as not to warn if data is a numpy # numeric type like np.float32 required = ['dtype', 'shape', 'size', 'ndim'] - if not all(hasattr(data, attr) for attr in required): - warnings.warn('converting data to np.ndarray because %s lacks some of ' - 'the necesssary attributes for direct use' - % type(data).__name__, RuntimeWarning, stacklevel=3) + if np.iterable(data) and not all(hasattr(data, attr) for attr in required): data = np.asarray(data) elif isinstance(data, AbstractArray): # we don't want nested Array objects - data = np.asarray(data) + data = data.data return data @@ -84,13 +81,12 @@ def __init__(self, dims, data, attributes=None, indexing_mode='numpy'): lookups need to be internally converted to numpy-style indexing. """ if isinstance(dims, basestring): - dims = [dims] - data = _as_compatible_data(data) - if len(dims) != data.ndim: + dims = (dims,) + self._dimensions = tuple(dims) + self._data = _as_compatible_data(data) + if len(dims) != self.ndim: raise ValueError('data and dimensions must have the same ' 'dimensionality') - self._dimensions = tuple(dims) - self._data = data if attributes is None: attributes = {} self._attributes = OrderedDict(attributes) @@ -106,27 +102,32 @@ def data(self): @data.setter def data(self, value): - value = np.asarray(value) + # allow any array to support pandas.Index objects + value = np.asanyarray(value) if value.shape != self.shape: raise ValueError("replacement data must match the Array's " "shape") self._data = value + self._indexing_mode = 'numpy' @property def dimensions(self): return self._dimensions - def _remap_indexer(self, key): + def _convert_indexer(self, key, indexing_mode=None): """Converts an orthogonal indexer into a fully expanded key (of the - same length as dimensions) suitable for indexing `_data` + same length as dimensions) suitable for indexing a data array with the + given indexing_mode. See Also -------- utils.expanded_indexer utils.orthogonal_indexer """ + if indexing_mode is None: + indexing_mode = self._indexing_mode key = utils.expanded_indexer(key, self.ndim) - if (self._indexing_mode == 'numpy' + if (indexing_mode == 'numpy' and any(not isinstance(k, (int, slice)) for k in key)): # key would trigger fancy indexing key = utils.orthogonal_indexer(key, self.shape) @@ -149,12 +150,20 @@ def __getitem__(self, key): If you really want to do indexing like `x[x > 0]`, manipulate the numpy array `x.data` directly. 
""" - key = self._remap_indexer(key) + key = self._convert_indexer(key) dimensions = [dim for k, dim in zip(key, self.dimensions) if not isinstance(k, int)] - new_data = self._data[key] + if len(key) == 1: + # unpack key so it can index a pandas.Index object (pandas.Index + # objects don't like tuples) + key, = key + # do location based indexing if supported by _data + new_data = getattr(self._data, 'iloc', self._data)[key] # orthogonal indexing should ensure the dimensionality is consistent - assert new_data.ndim == len(dimensions) + if hasattr(new_data, 'ndim'): + assert new_data.ndim == len(dimensions) + else: + assert len(dimensions) == 0 # return a variable with the same indexing_mode, because data should # still be the same type as _data return type(self)(dimensions, new_data, self.attributes, @@ -164,9 +173,7 @@ def __setitem__(self, key, value): """__setitem__ is overloaded to access the underlying numpy data with orthogonal indexing (see __getitem__ for more details) """ - # TODO: change this to copy on write (by setting self.data) instead of - # modifying the original array (self._data). - self._data[self._remap_indexer(key)] = value + self.data[self._convert_indexer(key, indexing_mode='numpy')] = value def __iter__(self): for n in range(len(self)): diff --git a/src/xray/backends.py b/src/xray/backends.py index 85e1645a595..7158e02e132 100644 --- a/src/xray/backends.py +++ b/src/xray/backends.py @@ -131,10 +131,13 @@ def set_attribute(self, key, value): def set_variable(self, name, variable): """Add a variable without checks""" - if name not in self.ds.variables: - self.ds.createVariable(name, variable.dtype, variable.dimensions) + data = variable.data + dtype_convert = {'int64': 'int32', 'float64': 'float32'} + if str(data.dtype) in dtype_convert: + data = np.asarray(data, dtype=dtype_convert[str(data.dtype)]) + self.ds.createVariable(name, data.dtype, variable.dimensions) scipy_var = self.ds.variables[name] - scipy_var[:] = variable.data[:] + scipy_var[:] = data[:] for k, v in variable.attributes.iteritems(): self._validate_attr_key(k) setattr(scipy_var, k, self._cast_attr_value(v)) @@ -197,11 +200,10 @@ def set_variable(self, name, variable): # we let the package handle the _FillValue attribute # instead of setting it ourselves. 
fill_value = variable.attributes.pop('_FillValue', None) - if name not in self.ds.variables: - self.ds.createVariable(varname=name, - datatype=variable.dtype, - dimensions=variable.dimensions, - fill_value=fill_value) + self.ds.createVariable(varname=name, + datatype=variable.dtype, + dimensions=variable.dimensions, + fill_value=fill_value) nc4_var = self.ds.variables[name] nc4_var[:] = variable.data[:] nc4_var.setncatts(variable.attributes) diff --git a/src/xray/common.py b/src/xray/common.py index eaa23d9fc90..64b47ee6a28 100644 --- a/src/xray/common.py +++ b/src/xray/common.py @@ -17,19 +17,19 @@ def func(self, dimension=cls._collapse_dimension_default, class AbstractArray(ImplementsCollapse): @property def dtype(self): - return self._data.dtype + return getattr(self._data, 'dtype', object) @property def shape(self): - return self._data.shape + return getattr(self._data, 'shape', ()) @property def size(self): - return self._data.size + return getattr(self._data, 'size', 1) @property def ndim(self): - return self._data.ndim + return getattr(self._data, 'ndim', 0) def __len__(self): return len(self._data) diff --git a/src/xray/dataset.py b/src/xray/dataset.py index 4c542e031b2..f1deba95d59 100644 --- a/src/xray/dataset.py +++ b/src/xray/dataset.py @@ -1,11 +1,9 @@ -# TODO Use various backend data stores. pytable, ncdf4, scipy.io, iris, memory -import os import numpy as np import netCDF4 as nc4 import pandas as pd from cStringIO import StringIO -from collections import OrderedDict, Mapping, MutableMapping +from collections import OrderedDict, Mapping import array_ as array import backends @@ -18,70 +16,6 @@ num2date = nc4.num2date -def construct_dimensions(variables): - """ - Given a dictionary of variables, construct a dimensions mapping - - Parameters - ---------- - variables : mapping - Mapping from variable names to Array objects. - - Returns - ------- - dimensions : mapping - Mapping from dimension names to lengths. - - Raises - ------ - ValueError if variable dimensions are inconsistent. - """ - dimensions = OrderedDict() - for k, var in variables.iteritems(): - if k in var.dimensions and var.ndim != 1: - raise ValueError('a coordinate variable must be defined with ' - '1-dimensional data') - for dim, length in zip(var.dimensions, var.shape): - if dim not in dimensions: - dimensions[dim] = length - elif dimensions[dim] != length: - raise ValueError('dimension %r on variable %r has length %s ' - 'but already is saved with length %s' % - (dim, k, length, dimensions[dim])) - return dimensions - - -def check_dims_and_vars_consistency(dimensions, variables): - """ - Validate dimensions and variables are consistent - - Parameters - ---------- - dimensions : mapping - Mapping from dimension names to lengths. - variables : mapping - Mapping from variable names to Array objects. - - Raises - ------ - ValueError if variable dimensions are inconsistent with the provided - dimensions. 
- """ - for k, var in variables.iteritems(): - if k in dimensions and var.ndim != 1: - raise ValueError('a coordinate variable must be defined with ' - '1-dimensional data') - for dim, length in zip(var.dimensions, var.shape): - if dim not in dimensions: - raise ValueError('dimension %r on variable %r is not one ' - 'of the dataset dimensions %r' % - (dim, k, list(dimensions))) - elif dimensions[dim] != length: - raise ValueError('dimension %r on variable %r has length ' - '%s but on the dataset has length %s' % - (dim, k, length, dimensions[dim])) - - def open_dataset(nc, *args, **kwargs): # move this to a classmethod Dataset.open? if isinstance(nc, basestring) and not nc.startswith('CDF'): @@ -96,53 +30,6 @@ def open_dataset(nc, *args, **kwargs): return Dataset.load_store(store) -class _IndicesCache(MutableMapping): - """Cache for Dataset indices""" - def __init__(self, dataset, cache=None): - self.dataset = dataset - self.cache = {} if cache is None else dict(cache) - # for performance reasons, we could remove this: - self.sync() - - def build_index(self, key): - """Cache the index for the dimension 'key'""" - self.cache[key] = self.dataset._create_index(key) - - def sync(self): - """Cache indices for all dimensions in this dataset""" - for key in self.dataset.dimensions: - self.build_index(key) - - def __getitem__(self, key): - if not key in self.cache: - assert key in self.dataset.dimensions - self.build_index(key) - return self.cache[key] - - def __setitem__(self, key, value): - self.cache[key] = value - - def __delitem__(self, key): - del self.cache[key] - - def __iter__(self): - return iter(self.dataset.dimensions) - - def __len__(self): - return len(self.dataset.dimensions) - - def __contains__(self, key): - return key in self.dataset.dimensions - - def __repr__(self): - contents = '\n'.join("'%s': %s" % - (k, str(v).replace( - '\n', '\n' + ' ' * (len(k) + 4))) - for k, v in self.items()) - return ("\n%s" - % (type(self).__name__, contents)) - - # list of attributes of pd.DatetimeIndex that are ndarrays of time info _DATETIMEINDEX_COMPONENTS = ['year', 'month', 'day', 'hour', 'minute', 'second', 'microsecond', 'nanosecond', 'date', @@ -157,6 +44,8 @@ class Dataset(Mapping): Datasets are mappings from variable names to dataviews focused on those variable. + Note: the size of dimensions in a dataset cannot be changed. + Attributes ---------- dimensions : {name: length, ...} @@ -173,78 +62,79 @@ class Dataset(Mapping): Don't modify the store directly unless you want to avoid all validation checks. """ - def __init__(self, variables=None, dimensions=None, attributes=None, - indices=None, store=None): - """ - If dimensions are not provided, they are inferred from the variables. - - In general, load data from a store using the `open_dataset` function or - the `from_store` class method. The `store` argument should only be used - if you want to Dataset operations to modify stored data in-place. - Note, however, that modifying datasets in-place is not entirely - implemented and thus may lead to unexpected behavior. + def __init__(self, variables=None, attributes=None): + """To load data from a file or file-like object, use the `open_dataset` + function. 
""" - # TODO: fill out this docstring - if store is None: - store = backends.InMemoryDataStore() - self.store = store - - if attributes is not None: - store.set_attributes(attributes) - - if dimensions is not None: - store.set_dimensions(dimensions) - + self._variables = OrderedDict() + self._dimensions = OrderedDict() if variables is not None: - if dimensions is None: - store.set_dimensions(construct_dimensions(variables)) - else: - check_dims_and_vars_consistency(dimensions, variables) - store.set_variables(variables) + self._set_variables(variables) + if attributes is None: + attributes = {} + self._attributes = OrderedDict(attributes) - if indices is None: - indices = {} - else: - for k, v in indices.iteritems(): - if k not in self.dimensions or v.size != self.dimensions[k]: - raise ValueError('inconsistent index %r' % k) - self._indices = _IndicesCache(self, indices) - - @classmethod - def load_store(cls, store): - return cls(store.variables, store.dimensions, store.attributes) - - def _create_index(self, dim): - if dim in self.variables: - var = self.variables[dim] - data = var.data + def _as_variable(self, name, var): + if not isinstance(var, array.Array): + try: + var = array.Array(*var) + except TypeError: + raise TypeError('Dataset variables must be of type ' + 'DatasetArray or Array, or a sequence of the ' + 'form (dimensions, data[, attributes])') + + if name in var.dimensions: + # convert the coordinate into a pandas.Index + if var.ndim != 1: + raise ValueError('a coordinate variable must be defined with ' + '1-dimensional data') attr = var.attributes if 'units' in attr and 'since' in attr['units']: - index = utils.num2datetimeindex(data, attr['units'], - attr.get('calendar')) + var.data = utils.num2datetimeindex(var.data, attr.pop('units'), + attr.pop('calendar', None)) else: - index = pd.Index(data) - elif dim in self.dimensions: - index = pd.Index(np.arange(self.dimensions[dim])) - else: - raise ValueError('cannot find index %r in dataset' % dim) - return index + var.data = pd.Index(var.data) + return var + + def _set_variables(self, variables): + """Set a mapping of variables and update dimensions""" + # save new variables into a temporary list so all the error checking + # can be done before updating _variables + new_variables = [] + for k, var in variables.iteritems(): + var = self._as_variable(k, var) + for dim, size in zip(var.dimensions, var.shape): + if dim not in self._dimensions: + self._dimensions[dim] = size + if dim not in variables and dim not in self._variables: + coord = self._as_variable(dim, (dim, np.arange(size))) + new_variables.append((dim, coord)) + elif self._dimensions[dim] != size: + raise ValueError('dimension %r on variable %r has size %s ' + 'but already is saved with size %s' % + (dim, k, size, self._dimensions[dim])) + new_variables.append((k, var)) + self._variables.update(new_variables) - @property - def indices(self): - return self._indices + @classmethod + def load_store(cls, store): + return cls(store.variables, store.attributes) @property def variables(self): - return Frozen(self.store.variables) + return Frozen(self._variables) @property def attributes(self): - return Frozen(self.store.attributes) + return self._attributes + + @attributes.setter + def attributes(self, value): + self._attributes = OrderedDict(value) @property def dimensions(self): - return Frozen(self.store.dimensions) + return Frozen(self._dimensions) def copy(self): """ @@ -256,8 +146,7 @@ def __copy__(self): """ Returns a shallow copy of the current object. 
""" - return type(self)(self.variables, self.dimensions, self.attributes, - indices=self.indices.cache) + return type(self)(self.variables, self.attributes) def __contains__(self, key): """ @@ -267,37 +156,35 @@ def __contains__(self, key): return key in self.variables def __len__(self): - return len(self.variable) + return len(self.variables) def __iter__(self): return iter(self.variables) @property def _datetimeindices(self): - return [k for k, v in self.indices.iteritems() - if isinstance(v, pd.DatetimeIndex)] + return [k for k, v in self.variables.iteritems() + if isinstance(v._data, pd.DatetimeIndex)] def _get_virtual_variable(self, key): - if key in self.indices: - return array.Array([key], self.indices[key].values) split_key = key.split('.') if len(split_key) == 2: - var, suffix = split_key - if var in self._datetimeindices: - if suffix in _DATETIMEINDEX_COMPONENTS: - return array.Array([var], getattr(self.indices[var], suffix)) - elif suffix == 'season': + name, suffix = split_key + if name in self._datetimeindices: + if suffix == 'season': # seasons = np.array(['DJF', 'MAM', 'JJA', 'SON']) - month = self.indices[var].month - return array.Array([var], (month // 3) % 4 + 1) + month = self.variables[name].data.month + data = (month // 3) % 4 + 1 + else: + data = getattr(self.variables[name].data, suffix) + return array.Array(self.variables[name].dimensions, data) raise ValueError('virtual variable %r not found' % key) - def _get_virtual_dataview(self, key): + def _get_virtual_dataset_array(self, key): virtual_var = self._get_virtual_variable(key) - new_vars = OrderedDict(self.variables.items() + [(key, virtual_var)]) - ds = type(self)(new_vars, self.dimensions, self.attributes, - indices=self.indices.cache) - return DatasetArray(ds, key) + ds = self.copy() + ds[key] = virtual_var + return ds[key] @property def virtual_variables(self): @@ -305,37 +192,34 @@ def virtual_variables(self): could be created on demand (because they can be calculated from other dataset variables or dimensions) """ - possible_vars = list(self.dimensions) + possible_vars = [] for k in self._datetimeindices: for suffix in _DATETIMEINDEX_COMPONENTS + ['season']: possible_vars.append('%s.%s' % (k, suffix)) return tuple(k for k in possible_vars if k not in self.variables) def __getitem__(self, key): - if key not in self.variables: + if key in self.variables: + return DatasetArray(self.select(key), key) + else: try: - return self._get_virtual_dataview(key) + return self._get_virtual_dataset_array(key) except ValueError: raise KeyError('dataset contains no variable with name %r ' % key) - else: - return DatasetArray(self.select(key), key) def __setitem__(self, key, value): - # TODO: allow this operation to be destructive, overriding existing - # variables? If so, we may want to implement __delitem__, too. - # (We would need to change DatasetArray.__setitem__ in that case, because - # we definitely don't want to override focus variables.) 
if isinstance(value, DatasetArray): - # print 'value was ', repr(value) - # print 'renamed to ', repr(value.renamed()) - # print 'setting item', repr(value.renamed(key).dataset) self.merge(value.renamed(key).dataset, inplace=True) - elif isinstance(value, array.Array): - self.set_variable(key, value) else: - raise TypeError('only DatasetArrays and Arrays can be added to ' - 'datasets via `__setitem__`') + self._set_variables({key: value}) + + def __delitem__(self, key): + del self._variables[key] + dims = set().union(v.dimensions for v in self._variables.itervalues()) + for dim in self._dimensions: + if dim not in dims: + del self._dimensions[dim] # mutable objects should not be hashable __hash__ = None @@ -344,9 +228,7 @@ def __eq__(self, other): try: # some stores (e.g., scipy) do not seem to preserve order, so don't # require matching dimension or variable order for equality - return (sorted(self.dimensions.items()) - == sorted(other.dimensions.items()) - and sorted(self.attributes.items()) + return (sorted(self.attributes.items()) == sorted(other.attributes.items()) and all(k1 == k2 and utils.variable_equal(v1, v2) for (k1, v1), (k2, v2) @@ -360,10 +242,13 @@ def __ne__(self, other): @property def coordinates(self): - """Coordinates are variables with names that match dimensions""" + """Coordinates are variables with names that match dimensions + + They are always stored internally as arrays with data that is a + pandas.Index object + """ return FrozenOrderedDict([(dim, self.variables[dim]) - for dim in self.dimensions - if dim in self.variables]) + for dim in self.dimensions]) @property def noncoordinates(self): @@ -372,14 +257,14 @@ def noncoordinates(self): """ return FrozenOrderedDict([(name, v) for (name, v) in self.variables.iteritems() - if name not in self.coordinates]) + if name not in self.dimensions]) def dump_to_store(self, store): """Store dataset contents to a backends.*DataStore object""" - target = type(self)(self.variables, self.dimensions, self.attributes, - store=store, indices=self.indices.cache) - target.store.sync() - return target + store.set_dimensions(self.dimensions) + store.set_variables(self.variables) + store.set_attributes(self.attributes) + store.sync() def dump(self, filepath, *args, **kwdargs): """Dump dataset contents to a location on disk using the netCDF4 @@ -430,171 +315,11 @@ def __str__(self): return '\n'.join(summary).replace('\t', ' ' * 4) def __repr__(self): - dim_summary = ', '.join('%s%s: %s' % ('@' if k in self else '', k, v) - for k, v in self.dimensions.iteritems()) + dim_summary = ', '.join('%s: %s' % (k, v) for k, v + in self.dimensions.iteritems()) return '' % (type(self).__name__, dim_summary, ' '.join(self.noncoordinates)) - def create_variable(self, name, dims, data, attributes=None): - """Create a new variable and add it to this dataset - - Parameters - ---------- - name : string - The name of the new variable. An exception will be raised if the - object already has a variable with this name. If name equals the - name of a dimension, then the new variable is treated as a - coordinate variable and must be 1-dimensional. - dims : tuple - The dimensions of the new variable. Elements must be dimensions of - the object. - data : numpy.ndarray - Data to populate the new variable. - attributes : dict_like or None, optional - Attributes to assign to the new variable. If None (default), an - empty attribute dictionary is initialized. - - Returns - ------- - var : Array - Reference to the newly created variable. 
- """ - # any error checking should be taken care of by add_variable - v = array.Array(dims, np.asarray(data), attributes) - return self.add_variable(name, v) - - def create_coordinate(self, name, data, attributes=None): - """Create a new dimension and a corresponding coordinate variable - - This method combines the create_dimension and create_variable methods - for the common case when the variable is a 1-dimensional coordinate - variable with the same name as the dimension. - - If the dimension already exists, this function proceeds unless there is - already a corresponding variable or if the lengths disagree. - - Parameters - ---------- - name : string - The name of the new dimension and variable. An exception will be - raised if the object already has a dimension or variable with this - name. - data : array_like - The coordinate values along this dimension; must be 1-dimensional. - The size of data is the length of the new dimension. - attributes : dict_like or None, optional - Attributes to assign to the new variable. If None (default), an - empty attribute dictionary is initialized. - - Returns - ------- - var : Array - Reference to the newly created coordinate variable. - """ - # any error checking should be taken care of by add_coordinate - v = array.Array((name,), np.asarray(data), attributes) - return self.add_coordinate(v) - - def add_dimension(self, name, length): - """Add a dimension to this dataset - - Parameters - ---------- - name : string - The name of the new dimension. An exception will be raised if the - object already has a dimension with this name. - length : int - The length of the new dimension; must a be non-negative integer. - """ - if name in self.dimensions: - raise ValueError('dimension named %r already exists' % name) - length = int(length) - if length < 0: - raise ValueError('length must be non-negative') - self.store.set_dimension(name, length) - - def add_variable(self, name, var): - """Add a variable to the dataset - - Parameters - ---------- - name : string - The name under which the variable will be added. - variable : Array - The variable to be added. If the desired action is to add a copy of - the variable be sure to do so before passing it to this function. - - Returns - ------- - Array - An Array object attached to the underlying datastore. - """ - if name in self.variables: - raise ValueError("Array named %r already exists" % name) - return self.set_variable(name, var) - - def add_coordinate(self, var): - """Add a coordinate variable to the dataset - - Parameters - ---------- - variable : Array - The coordinate variable to be added. Coordinate variables must be - 1D, and will be added under the same name as their sole dimension. - - Returns - ------- - variable - An Array object attached to the underlying datastore. - """ - # We need to be cleanly roll back the effects of - # create_dimension if create_variable fails, otherwise we will - # end up in a partial state. 
- name = var.dimensions[0] - if name in self.coordinates: - raise ValueError("coordinate named '%s' already exists" % name) - if var.ndim != 1: - raise ValueError("coordinate data must be 1-dimensional (vector)") - if name not in self.dimensions: - self.store.set_dimension(name, var.size) - elif self.dimensions[name] != var.size: - raise ValueError('dimension already exists with different length') - return self.store.set_variable(name, var) - - def set_variable(self, name, var): - """Set a variable in the dataset - - Unlike `add_variable`, this function allows for overriding existing - variables. - - Parameters - ---------- - name : string - The name under which the variable will be added. - variable : Array - The variable to be added. If the desired action is to add a copy of - the variable be sure to do so before passing it to this function. - - Returns - ------- - variable - An Array object attached to the underlying datastore. - """ - # check old + new dimensions for consistency checks - new_dims = OrderedDict() - for dim, length in zip(var.dimensions, var.shape): - if dim not in self.dimensions: - new_dims[dim] = length - check_dims_and_vars_consistency( - dict(self.dimensions.items() + new_dims.items()), - {name: var}) - # now set the new dimensions and variables, and rebuild the indices - self.store.set_dimensions(new_dims) - new_var = self.store.set_variable(name, var) - if name in list(self.indices) + list(new_dims): - self.indices.build_index(name) - return new_var - def indexed_by(self, **indexers): """Return a new dataset with each array indexed along the specified dimension(s) @@ -635,20 +360,13 @@ def indexed_by(self, **indexers): variables = OrderedDict() for name, var in self.variables.iteritems(): var_indexers = {k: v for k, v in indexers.iteritems() - if k in var.dimensions} - variables[name] = var.indexed_by(**var_indexers) - - indices = {k: (v[indexers[k]] if k in indexers else v) - for k, v in self.indices.iteritems()} - # filter out non-indices (indices for which one value was selected) - indices = {k: v for k, v in indices.iteritems() - if isinstance(v, pd.Index)} - variables = OrderedDict((k, v) for k, v in variables.iteritems() - if v.ndim > 0) - dimensions = OrderedDict((k, indices[k].size) for k in self.dimensions - if k in indices) - return type(self)(variables, dimensions, self.attributes, - indices=indices) + if k in var.dimensions} + new_var = var.indexed_by(**var_indexers) + if new_var.ndim > 0: + # filter out variables reduced to numbers + variables[name] = new_var + + return type(self)(variables, self.attributes) def labeled_by(self, **indexers): """Return a new dataset with each variable indexed by coordinate labels @@ -688,7 +406,7 @@ def labeled_by(self, **indexers): Dataset.indexed_by Array.indexed_by """ - return self.indexed_by(**remap_loc_indexers(self.indices, indexers)) + return self.indexed_by(**remap_loc_indexers(self.variables, indexers)) def renamed(self, name_dict): """ @@ -701,23 +419,19 @@ def renamed(self, name_dict): names and whose values are new names. 
""" for k in name_dict: - if k not in self.dimensions and k not in self.variables: + if k not in self.variables: raise ValueError("Cannot rename %r because it is not a " - "variable or dimension in this dataset" % k) + "variable in this dataset" % k) variables = OrderedDict() for k, v in self.variables.iteritems(): name = name_dict.get(k, k) dims = tuple(name_dict.get(dim, dim) for dim in v.dimensions) #TODO: public interface for renaming a variable without loading - # data - variables[name] = array.Array(dims, v._data, v.attributes) + # data? + variables[name] = array.Array(dims, v._data, v.attributes, + v._indexing_mode) - dimensions = OrderedDict((name_dict.get(k, k), v) - for k, v in self.dimensions.iteritems()) - indices = {name_dict.get(k, k): v - for k, v in self.indices.cache.items()} - return type(self)(variables, dimensions, self.attributes, - indices=indices) + return type(self)(variables, self.attributes) def merge(self, other, inplace=False): """Merge two datasets into a single new dataset @@ -745,27 +459,17 @@ def merge(self, other, inplace=False): are silently dropped. """ # check for conflicts - utils.update_safety_check(self.noncoordinates, other.noncoordinates, + utils.update_safety_check(self.variables, other.variables, compat=utils.variable_equal) - utils.update_safety_check(self.dimensions, other.dimensions) - # note: coordinates are checked by comparing indices instead of - # variables, which lets us merge two datasets even if they have - # different time units - utils.update_safety_check(self.indices, other.indices, - compat=np.array_equal) # update contents obj = self if inplace else self.copy() - obj.store.set_variables(OrderedDict((k, v) for k, v - in other.variables.iteritems() - if k not in obj.variables)) - obj.store.set_dimensions(OrderedDict((k, v) for k, v - in other.dimensions.iteritems() - if k not in obj.dimensions)) - obj._indices.update(other.indices.cache) + obj._set_variables(OrderedDict((k, v) for k, v + in other.variables.iteritems() + if k not in obj.variables)) # remove conflicting attributes for k, v in other.attributes.iteritems(): - if k in self.attributes and not v != self.attributes[k]: - obj.store.del_attribute(k) + if k in self.attributes and v != self.attributes[k]: + del self.attributes[k] return obj def select(self, *names): @@ -783,12 +487,10 @@ def select(self, *names): Returns ------- Dataset - The returned object has the same attributes as the original. A - dimension is included if at least one of the specified variables is - defined along that dimension. Coordinate variables (1-dimensional - variables with the same name as a dimension) that correspond to an - included dimension are also included. All other variables are - dropped. + The returned object has the same attributes as the original. + Variables are included (recursively) if at least one of the + specified variables refers to that variable in its dimensions or + "coordinates" attribute. All other variables are dropped. 
""" if not all(k in self.variables for k in names): raise ValueError( @@ -814,32 +516,23 @@ def get_all_associated_names(name): queue |= new_names - selected_names selected_names |= new_names - def ordered_keys_in(dictionary, selection): - return OrderedDict((k, v) for k, v in dictionary.iteritems() - if k in selection) - - variables = ordered_keys_in(self.variables, selected_names) - dimensions = ordered_keys_in(self.dimensions, selected_names) - indices = ordered_keys_in(self.indices.cache, selected_names) - return type(self)(variables, dimensions, self.attributes, - indices=indices) + variables = OrderedDict((k, v) for k, v in self.variables.iteritems() + if k in selected_names) + return type(self)(variables, self.attributes) - def unselect(self, *names, **kwargs): + def unselect(self, *names): """Returns a new dataset without the named variables Parameters ---------- *names : str Names of the variables to omit from the returned object. - omit_dimensions : bool, optional (default True) - Whether or not to also omit dimensions with the given names. All - variables along omited dimensions will also be removed. Returns ------- Dataset - New dataset based on this dataset. Only the named variables - /dimensions are removed. + New dataset based on this dataset. Only the named variables are + removed. """ if any(k not in self.variables and k not in self.dimensions for k in names): @@ -847,20 +540,7 @@ def unselect(self, *names, **kwargs): 'names does not exist on this dataset') variables = OrderedDict((k, v) for k, v in self.variables.iteritems() if k not in names) - if kwargs.get('omit_dimensions', True): - dimensions = OrderedDict((k, v) for k, v - in self.dimensions.iteritems() - if k not in names) - variables = OrderedDict((k, v) for k, v in variables.iteritems() - if all(d in dimensions - for d in v.dimensions)) - indices = {k: v for k, v in self.indices.cache.items() - if k not in names} - else: - dimensions = self.dimensions - indices = self.indices - return type(self)(variables, dimensions, self.attributes, - indices=indices) + return type(self)(variables, self.attributes) def replace(self, name, variable): """Returns a new dataset with the variable 'name' replaced with @@ -878,8 +558,8 @@ def replace(self, name, variable): Dataset New dataset based on this dataset. Dimensions are unchanged. """ - ds = self.unselect(name, omit_dimensions=False) - ds.add_variable(name, variable) + ds = self.unselect(name) + ds[name] = variable return ds def iterator(self, dimension): @@ -910,7 +590,7 @@ def to_dataframe(self): DataFrame. The DataFrame is be indexed by the Cartesian product of this dataset's indices. """ - index_names = self.indices.keys() + index_names = self.coordinates.keys() columns = self.noncoordinates.keys() data = [] # we need a template to broadcast all dataset variables against @@ -921,39 +601,6 @@ def to_dataframe(self): _, var_data = np.broadcast_arrays(template.data, var.data) data.append(var_data.reshape(-1)) # note: pd.MultiIndex.from_product is new in pandas-0.13.1 - index = pd.MultiIndex.from_product(self.indices.values(), + index = pd.MultiIndex.from_product(self.coordinates.values(), names=index_names) return pd.DataFrame(OrderedDict(zip(columns, data)), index=index) - - -if __name__ == "__main__": - """ - A bunch of regression tests. 
- """ - base_dir = os.path.dirname(__file__) - test_dir = os.path.join(base_dir, '..', '..', 'test', ) - write_test_path = os.path.join(test_dir, 'test_output.nc') - ecmwf_netcdf = os.path.join(test_dir, 'ECMWF_ERA-40_subset.nc') - - import time - st = time.time() - nc = Dataset(ecmwf_netcdf) - print "Seconds to read from filepath : ", time.time() - st - - st = time.time() - nc.dump(write_test_path) - print "Seconds to write : ", time.time() - st - - st = time.time() - nc_string = nc.dumps() - print "Seconds to serialize : ", time.time() - st - - st = time.time() - nc = Dataset(nc_string) - print "Seconds to deserialize : ", time.time() - st - - st = time.time() - with open(ecmwf_netcdf, 'r') as f: - nc = Dataset(f) - print "Seconds to read from fobj : ", time.time() - st - diff --git a/src/xray/dataset_array.py b/src/xray/dataset_array.py index 546904af449..e1af2e4b76e 100644 --- a/src/xray/dataset_array.py +++ b/src/xray/dataset_array.py @@ -7,7 +7,7 @@ import numpy as np import array_ -import dataset +import dataset as dataset_ import groupby import ops from common import AbstractArray @@ -15,19 +15,19 @@ class _LocIndexer(object): - def __init__(self, array): - self.array = array + def __init__(self, ds_array): + self.ds_array = ds_array def _remap_key(self, key): - indexers = remap_loc_indexers(self.array.indices, - self.array._key_to_indexers(key)) + indexers = remap_loc_indexers(self.ds_array.dataset.variables, + self.ds_array._key_to_indexers(key)) return tuple(indexers.values()) def __getitem__(self, key): - return self.array[self._remap_key(key)] + return self.ds_array[self._remap_key(key)] def __setitem__(self, key, value): - self.array[self._remap_key(key)] = value + self.ds_array[self._remap_key(key)] = value class DatasetArray(AbstractArray): @@ -47,46 +47,42 @@ def __init__(self, dataset, focus): Parameters ---------- dataset : xray.Dataset - The dataset on which to build this data view. + The dataset on which to build this dataset array. focus : str The name of the "focus variable" in `dataset` on which this object is oriented. 
""" + if not isinstance(dataset, dataset_.Dataset): + dataset = dataset_.Dataset(dataset) if not focus in dataset: raise ValueError('focus %r is not a variable in dataset %r' % (focus, dataset)) self.dataset = dataset self.focus = focus - @classmethod - def create(cls, focus, dimensions, data): - ds = dataset.Dataset() - ds.create_variable(focus, dimensions, data) - return ds[focus] - @property - def variable(self): + def array(self): return self.dataset.variables[self.focus] - @variable.setter - def variable(self, value): - self.dataset.set_variable(self.focus, value) + @array.setter + def array(self, value): + self.dataset[self.focus] = value # _data is necessary for AbstractArray @property def _data(self): - return self.variable._data + return self.array._data @property def data(self): """The dataset array's data as a numpy.ndarray""" - return self.variable.data + return self.array.data @data.setter def data(self, value): - self.variable.data = value + self.array.data = value @property def dimensions(self): - return self.variable.dimensions + return self.array.dimensions def _key_to_indexers(self, key): return OrderedDict( @@ -106,7 +102,10 @@ def __setitem__(self, key, value): self.dataset[key] = value else: # orthogonal array indexing - self.variable[key] = value + self.array[key] = value + + def __delitem__(self, key): + del self.dataset[key] def __contains__(self, key): return key in self.dataset @@ -123,13 +122,12 @@ def __iter__(self): @property def attributes(self): - return self.variable.attributes + return self.array.attributes @property - def indices(self): - return FrozenOrderedDict((k, v) for k, v - in self.dataset.indices.iteritems() - if k in self.dimensions) + def coordinates(self): + return FrozenOrderedDict((k, self.dataset.variables[k]) + for k in self.dimensions) def copy(self): return self.__copy__() @@ -148,10 +146,8 @@ def __str__(self): def __repr__(self): if self.ndim > 0: - dim_summary = ', '.join('%s%s: %s' % - ('@' if k in self.dataset else '', k, v) - for k, v in zip(self.dimensions, - self.shape)) + dim_summary = ', '.join('%s: %s' % (k, v) for k, v + in zip(self.dimensions, self.shape)) contents = ' (%s): %s' % (dim_summary, self.dtype) else: contents = ': %s' % self.data @@ -169,7 +165,7 @@ def indexed_by(self, **indexers): if self.focus not in ds: # always keep focus variable in the dataset, even if it was # unselected because indexing made it a scaler - ds[self.focus] = self.variable.indexed_by(**indexers) + ds[self.focus] = self.array.indexed_by(**indexers) return type(self)(ds, self.focus) def labeled_by(self, **indexers): @@ -180,7 +176,8 @@ def labeled_by(self, **indexers): -------- Dataset.labeled_by """ - return self.indexed_by(**remap_loc_indexers(self.indices, indexers)) + return self.indexed_by(**remap_loc_indexers(self.dataset.variables, + indexers)) def renamed(self, new_name): """Returns a new DatasetArray with this DatasetArray's focus variable @@ -202,17 +199,19 @@ def unselect(self, *names): 'method or the `unselect` method of the dataset.') return type(self)(self.dataset.unselect(*names), self.focus) - def refocus(self, new_var): + def refocus(self, new_var, name=None): """Returns a copy of this DatasetArray's dataset with this DatasetArray's focus variable replaced by `new_var` - If `new_var` is a dataview, its contents will be merged in. + If `new_var` is a dataset array, its contents will be merged in. 
""" if not hasattr(new_var, 'dimensions'): - new_var = type(self.variable)(self.variable.dimensions, new_var) + new_var = type(self.array)(self.array.dimensions, new_var) ds = self.unselected() - ds[self.focus] = new_var - return type(self)(ds, self.focus) + if name is None: + name = self.focus + ds[name] = new_var + return type(self)(ds, name) def iterator(self, dimension): """Iterate along a data dimension @@ -231,10 +230,12 @@ def iterator(self, dimension): The returned iterator yields pairs of scalar-valued coordinate arrays and DatasetArray objects. """ + # TODO: remove this method (replaced by groupby) for (x, ds) in self.dataset.iterator(dimension): yield (x, type(self)(ds, self.focus)) def groupby(self, group, squeeze=True): + # TODO: document this method if isinstance(group, basestring): # merge in the group's dataset to allow group to be a virtual # variable in this dataset @@ -264,7 +265,7 @@ def transpose(self, *dimensions): numpy.transpose Array.transpose """ - return self.refocus(self.variable.transpose(*dimensions)) + return self.refocus(self.array.transpose(*dimensions)) def collapse(self, func, dimension=None, axis=None, **kwargs): """Collapse this array by applying `func` along some dimension(s) @@ -297,7 +298,8 @@ def collapse(self, func, dimension=None, axis=None, **kwargs): DatasetArray with this object's array replaced with an array with summarized data and the indicated dimension(s) removed. """ - var = self.variable.collapse(func, dimension, axis, **kwargs) + # TODO: rename this method "reduce" + var = self.array.collapse(func, dimension, axis, **kwargs) drop = set(self.dimensions) - set(var.dimensions) # For now, take an aggressive strategy of removing all variables # associated with any dropped dimensions @@ -305,7 +307,7 @@ def collapse(self, func, dimension=None, axis=None, **kwargs): drop |= {k for k, v in self.dataset.variables.iteritems() if any(dim in drop for dim in v.dimensions)} ds = self.dataset.unselect(*drop) - ds.add_variable(self.focus, var) + ds[self.focus] = var return type(self)(ds, self.focus) def aggregate(self, func, new_dim, **kwargs): @@ -330,17 +332,18 @@ def aggregate(self, func, new_dim, **kwargs): aggregated : DatasetArray DatasetArray with aggregated data and the new dimension `new_dim`. """ + # TODO: remove this method (replaced by groupby) if isinstance(new_dim, basestring): new_dim = self.dataset[new_dim] - unique, aggregated = self.variable.aggregate( + unique, aggregated = self.array.aggregate( func, new_dim.focus, new_dim, **kwargs) # TODO: add options for how to summarize variables along aggregated # dimensions instead of just dropping them? drop = {k for k, v in self.dataset.variables.iteritems() if any(dim in new_dim.dimensions for dim in v.dimensions)} ds = self.dataset.unselect(*drop) - ds.add_coordinate(unique) - ds.add_variable(self.focus, aggregated) + ds[unique.dimensions[0]] = unique + ds[self.focus] = aggregated return type(self)(ds, self.focus) @classmethod @@ -371,9 +374,7 @@ def from_stack(cls, arrays, dimension='stacked_dimension', Stacked dataset array formed by stacking all the supplied variables along the new dimension. 
""" - # create an empty dataset in which to stack variables - # start by putting in the dimension variable - ds = dataset.Dataset() + ds = dataset_.Dataset() if isinstance(dimension, basestring): dim_name = dimension else: @@ -384,8 +385,9 @@ def from_stack(cls, arrays, dimension='stacked_dimension', if template is not None: # use metadata from the template dataset array focus = template.focus + old_dim_name, = template.dataset.variables[dim_name].dimensions drop = {k for k, v in template.dataset.variables.iteritems() - if k in [focus, dim_name]} + if old_dim_name in v.dimensions} ds.merge(template.dataset.unselect(*drop), inplace=True) else: # figure out metadata by inspecting each array @@ -406,68 +408,65 @@ def from_stack(cls, arrays, dimension='stacked_dimension', if focus is None: focus = 'stacked_variable' - # finally, merge in the stacked variables ds[focus] = array_.Array.from_stack(arrays, dimension, stacked_indexers, length, template) - stacked = cls(ds, focus) - - if template is not None: - drop = set(template.dataset.dimensions) - set(stacked.dimensions) - drop |= {k for k, v in ds.variables.iteritems() - if any(dim in drop for dim in v.dimensions)} - stacked = stacked.unselect(*drop) - return stacked + return cls(ds, focus) def to_dataframe(self): """Convert this array into a pandas.DataFrame Non-coordinate variables in this array's dataset (which include the view's data) form the columns of the DataFrame. The DataFrame is be - indexed by the Cartesian product of the dataset's indices. + indexed by the Cartesian product of the dataset's coordinates. """ return self.dataset.to_dataframe() def __array_wrap__(self, result): - return self.refocus(self.variable.__array_wrap__(result)) + return self.refocus(self.array.__array_wrap__(result), + self.focus + '_') @staticmethod def _unary_op(f): @functools.wraps(f) def func(self, *args, **kwargs): - return self.refocus(f(self.variable, *args, **kwargs)) + return self.refocus(f(self.array, *args, **kwargs), + self.focus + '_' + f.__name__) return func - def _check_indices_compat(self, other): + def _check_coordinates_compat(self, other): # TODO: possibly automatically select index intersection instead? 
- if hasattr(other, 'indices'): - for k, v in self.indices.iteritems(): - if (k in other.indices - and not np.array_equal(v, other.indices[k])): - raise ValueError('index %r is not aligned' % k) + if hasattr(other, 'coordinates'): + for k, v in self.coordinates.iteritems(): + if (k in other.coordinates + and not np.array_equal(v, other.coordinates[k])): + raise ValueError('coordinate %r is not aligned' % k) @staticmethod def _binary_op(f, reflexive=False): @functools.wraps(f) def func(self, other): - # TODO: automatically group by other variable dimensions - self._check_indices_compat(other) + # TODO: automatically group by other variable dimensions to allow + # for broadcasting dimensions like 'dayofyear' against 'time' + self._check_coordinates_compat(other) ds = self.unselected() if hasattr(other, 'unselected'): ds.merge(other.unselected(), inplace=True) - other_variable = getattr(other, 'variable', other) - ds[self.focus] = (f(self.variable, other_variable) - if not reflexive - else f(other_variable, self.variable)) - return ds[self.focus] + other_array = getattr(other, 'array', other) + other_focus = getattr(other, 'focus', 'other') + focus = self.focus + '_' + f.__name__ + '_' + other_focus + ds[focus] = (f(self.array, other_array) + if not reflexive + else f(other_array, self.array)) + return type(self)(ds, focus) return func @staticmethod def _inplace_binary_op(f): @functools.wraps(f) def func(self, other): - self._check_indices_compat(other) - other_variable = getattr(other, 'variable', other) - self.variable = f(self.variable, other_variable) + self._check_coordinates_compat(other) + other_array = getattr(other, 'array', other) + self.array = f(self.array, other_array) if hasattr(other, 'unselected'): self.dataset.merge(other.unselected(), inplace=True) return self @@ -483,7 +482,9 @@ def intersection(array1, array2): # TODO: automatically calculate the intersection when doing math with # arrays, or better yet calculate the union of the indices and fill in # the mis-aligned data with NaN. 
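+    # Illustrative example: if array1 is labeled by x = ['a', ..., 'j'] and
+    # array2 only by x = ['a', ..., 'e'], both returned arrays are restricted
+    # to the shared labels 'a' through 'e' (cf. test_intersection).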
- overlapping_indices = {k: array1.indices[k] & array2.indices[k] - for k in array1.indices if k in array2.indices} - return tuple(dv.labeled_by(**overlapping_indices) - for dv in [array1, array2]) + overlapping_coords = {k: (array1.coordinates[k].data + & array2.coordinates[k].data) + for k in array1.coordinates + if k in array2.coordinates} + return tuple(ar.labeled_by(**overlapping_coords) + for ar in [array1, array2]) diff --git a/src/xray/groupby.py b/src/xray/groupby.py index d335b909daa..88190e29dfe 100644 --- a/src/xray/groupby.py +++ b/src/xray/groupby.py @@ -3,7 +3,7 @@ from common import ImplementsCollapse from ops import inject_collapse_methods import array_ -import dataset_array +import dataset import numpy as np @@ -84,8 +84,8 @@ def __init__(self, array, group_name, group_coord, squeeze=True): else: # look through group_coord to find the unique values unique_values, group_indices = unique_value_groups(group_coord) - unique_coord = dataset_array.DatasetArray.create( - group_name, [group_name], unique_values) + unique_coord = dataset.Dataset( + {group_name: (group_name, unique_values)})[group_name] self.group_indices = group_indices self.unique_coord = unique_coord diff --git a/src/xray/ops.py b/src/xray/ops.py index 264232d9ff7..56632062cf7 100644 --- a/src/xray/ops.py +++ b/src/xray/ops.py @@ -1,3 +1,4 @@ +import functools import operator import numpy as np @@ -27,12 +28,14 @@ def _data_method_wrapper(f): def func(self, *args, **kwargs): return getattr(self.data, f)(*args, **kwargs) + func.__name__ = f return func def _method_wrapper(f): def func(self, *args, **kwargs): return getattr(self, f)(*args, **kwargs) + func.__name__ = f return func diff --git a/src/xray/utils.py b/src/xray/utils.py index d8cd0be39b8..06cd9551e61 100644 --- a/src/xray/utils.py +++ b/src/xray/utils.py @@ -68,7 +68,7 @@ def remap_loc_indexers(indices, indexers): """ new_indexers = OrderedDict() for dim, loc in indexers.iteritems(): - index = indices[dim] + index = indices[dim].data if isinstance(loc, slice): indexer = index.slice_indexer(loc.start, loc.stop, loc.step) else: @@ -128,7 +128,10 @@ def variable_equal(v1, v2, rtol=1e-05, atol=1e-08): # see: pandas.core.common.array_equivalent data1 = v1.data data2 = v2.data - if np.issubdtype(data1.dtype, (str, object)): + if hasattr(data1, 'equals'): + # handle pandas.Index objects + return data1.equals(data2) + elif np.issubdtype(data1.dtype, (str, object)): return np.array_equal(data1, data2) else: return np.allclose(data1, data2, rtol=rtol, atol=atol) @@ -156,7 +159,7 @@ def update_safety_check(first_dict, second_dict, compat=operator.eq): if (k in first_dict and not (v is first_dict[k] or compat(v, first_dict[k]))): raise ValueError('unsafe to merge dictionaries without ' - 'overriding values') + 'overriding values; conflicting key %r' % k) def remove_incompatible_items(first_dict, second_dict, compat=operator.eq): diff --git a/test/test_array.py b/test/test_array.py index f6a5423815d..73a50d91f99 100644 --- a/test/test_array.py +++ b/test/test_array.py @@ -1,3 +1,4 @@ +from copy import deepcopy import warnings import numpy as np @@ -11,7 +12,7 @@ def setUp(self): self.d = np.random.random((10, 3)).astype(np.float64) def test_data(self): - v = Array(['time', 'x'], self.d) + v = Array(['time', 'x'], self.d, indexing_mode='not-supported') self.assertIs(v.data, self.d) with self.assertRaises(ValueError): # wrong size @@ -19,15 +20,24 @@ def test_data(self): d2 = np.random.random((10, 3)) v.data = d2 self.assertIs(v.data, d2) + 
self.assertEqual(v._indexing_mode, 'numpy') - with warnings.catch_warnings(record=True) as w: - v = Array(['x'], range(5)) - self.assertIn("converting data to np.ndarray", str(w[-1].message)) - self.assertIsInstance(v.data, np.ndarray) - with warnings.catch_warnings(record=True) as w: - # don't warn for numpy numbers - v = Array([], np.float32(1)) - self.assertFalse(w) + def test_array_equality(self): + d = np.random.rand(10, 3) + v1 = Array(('dim1', 'dim2'), data=d, + attributes={'att1': 3, 'att2': [1, 2, 3]}) + v2 = Array(('dim1', 'dim2'), data=d, + attributes={'att1': 3, 'att2': [1, 2, 3]}) + v3 = Array(('dim1', 'dim3'), data=d, + attributes={'att1': 3, 'att2': [1, 2, 3]}) + v4 = Array(('dim1', 'dim2'), data=d, + attributes={'att1': 3, 'att2': [1, 2, 4]}) + v5 = deepcopy(v1) + v5.data[:] = np.random.rand(10, 3) + self.assertVarEqual(v1, v2) + self.assertVarNotEqual(v1, v3) + self.assertVarNotEqual(v1, v4) + self.assertVarNotEqual(v1, v5) def test_properties(self): v = Array(['time', 'x'], self.d, {'foo': 'bar'}) diff --git a/test/test_dataset.py b/test/test_dataset.py index bc9b06e9656..fe1e341b497 100644 --- a/test/test_dataset.py +++ b/test/test_dataset.py @@ -23,16 +23,12 @@ def create_test_data(store=None): obj = Dataset() if store is None else Dataset.load_store(store) - obj.add_dimension('time', 1000) - for d, l in sorted(_dims.items()): - obj.add_dimension(d, l) - var = obj.create_variable(name=d, dims=(d,), - data=np.arange(l, dtype=np.int32), - attributes={'units':'integers'}) + obj['time'] = ('time', pd.date_range('2000-01-01', periods=1000)) + for k, d in sorted(_dims.items()): + obj[k] = (k, np.arange(d)) for v, dims in sorted(_vars.items()): - var = obj.create_variable(name=v, dims=tuple(dims), - data=np.random.normal(size=tuple([_dims[d] for d in dims]))) - var.attributes['foo'] = 'variable' + data = np.random.normal(size=tuple(_dims[d] for d in dims)) + obj[v] = (dims, data, {'foo': 'variable'}) return obj @@ -42,14 +38,14 @@ def get_store(self): def test_repr(self): data = create_test_data(self.get_store()) - self.assertEqual('', repr(data)) + self.assertEqual('', repr(data)) def test_init(self): var1 = Array('x', np.arange(100)) var2 = Array('x', np.arange(1000)) var3 = Array(['x', 'y'], np.arange(1000).reshape(100, 10)) - with self.assertRaisesRegexp(ValueError, 'already is saved with len'): + with self.assertRaisesRegexp(ValueError, 'but already is saved'): Dataset({'a': var1, 'b': var2}) with self.assertRaisesRegexp(ValueError, 'must be defined with 1-d'): Dataset({'a': var1, 'x': var3}) @@ -62,77 +58,41 @@ def test_iterator(self): self.assertVarEqual(data['var2'][n], sub['var2']) self.assertVarEqual(data['var3'][:, n], sub['var3']) - def test_dimension(self): - a = Dataset() - a.add_dimension('time', 10) - a.add_dimension('x', 5) - # prevent duplicate creation - self.assertRaises(ValueError, a.add_dimension, 'time', 0) - # length must be integer - self.assertRaises(ValueError, a.add_dimension, 'foo', 'a') - self.assertRaises(TypeError, a.add_dimension, 'foo', [1,]) - self.assertRaises(ValueError, a.add_dimension, 'foo', -1) - self.assertTrue('foo' not in a.dimensions) - def test_variable(self): a = Dataset() - a.add_dimension('time', 10) - a.add_dimension('x', 3) d = np.random.random((10, 3)) - a.create_variable(name='foo', dims=('time', 'x',), data=d) + a['foo'] = (('time', 'x',), d) self.assertTrue('foo' in a.variables) self.assertTrue('foo' in a) - a.create_variable(name='bar', dims=('time', 'x',), data=d) + a['bar'] = (('time', 'x',), d) # order of creation is 
preserved - self.assertTrue(a.variables.keys() == ['foo', 'bar']) + self.assertTrue(a.variables.keys() == ['time', 'x', 'foo', 'bar']) self.assertTrue(all([a.variables['foo'][i].data == d[i] for i in np.ndindex(*d.shape)])) - # prevent duplicate creation - self.assertRaises(ValueError, a.create_variable, - name='foo', dims=('time', 'x',), data=d) - # dimension must be defined - # self.assertRaises(ValueError, a.create_variable, - # name='qux', dims=('time', 'missing_dim',), data=d) # try to add variable with dim (10,3) with data that's (3,10) - self.assertRaises(ValueError, a.create_variable, - name='qux', dims=('time', 'x'), data=d.T) - # Array equality - d = np.random.rand(10, 3) - v1 = Array(('dim1','dim2'), data=d, - attributes={'att1': 3, 'att2': [1,2,3]}) - v2 = Array(('dim1','dim2'), data=d, - attributes={'att1': 3, 'att2': [1,2,3]}) - v5 = Array(('dim1','dim2'), data=d, - attributes={'att1': 3, 'att2': [1,2,3]}) - v3 = Array(('dim1','dim3'), data=d, - attributes={'att1': 3, 'att2': [1,2,3]}) - v4 = Array(('dim1','dim2'), data=d, - attributes={'att1': 3, 'att2': [1,2,4]}) - v5 = deepcopy(v1) - v5.data[:] = np.random.rand(10,3) - self.assertVarEqual(v1, v2) - self.assertVarNotEqual(v1, v3) - self.assertVarNotEqual(v1, v4) - self.assertVarNotEqual(v1, v5) + with self.assertRaises(ValueError): + a['qux'] = (('time', 'x'), d.T) def test_coordinate(self): a = Dataset() vec = np.random.random((10,)) attributes = {'foo': 'bar'} - a.create_coordinate('x', data=vec, attributes=attributes) + a['x'] = ('x', vec, attributes) self.assertTrue('x' in a.coordinates) + self.assertIsInstance(a.coordinates['x'].data, pd.Index) self.assertVarEqual(a.coordinates['x'], a.variables['x']) b = Dataset() - b.add_dimension('x', vec.size) - b.create_variable('x', dims=('x',), data=vec, attributes=attributes) + b['x'] = ('x', vec, attributes) self.assertVarEqual(a['x'], b['x']) self.assertEquals(a.dimensions, b.dimensions) + with self.assertRaises(ValueError): + a['x'] = ('x', vec[:5]) arr = np.random.random((10, 1,)) scal = np.array(0) - self.assertRaises(ValueError, a.create_coordinate, - name='y', data=arr) - self.assertRaises(ValueError, a.create_coordinate, - name='y', data=scal) + with self.assertRaises(ValueError): + a['y'] = ('y', arr) + with self.assertRaises(ValueError): + a['y'] = ('y', scal) self.assertTrue('y' not in a.dimensions) @unittest.skip('attribute checks are not yet backend specific') @@ -207,10 +167,6 @@ def test_indexed_by(self): expected = data[v].data[slice_list] actual = ret[v].data np.testing.assert_array_equal(expected, actual) - # Test that our view accesses the same underlying array - # This test doesn't make sense for the netCDF4 backend - # actual.fill(np.pi) - # np.testing.assert_array_equal(expected, actual) with self.assertRaises(ValueError): data.indexed_by(not_a_dim=slice(0, 2)) @@ -230,8 +186,8 @@ def test_labeled_by(self): loc_slicers = {'dim1': slice(None, None, 2), 'dim2': slice(0, 1)} self.assertEqual(data.indexed_by(**int_slicers), data.labeled_by(**loc_slicers)) - data.create_variable('time', ['time'], np.arange(1000, dtype=np.int32), - {'units': 'days since 2000-01-01'}) + data['time'] = ('time', np.arange(1000, dtype=np.int32), + {'units': 'days since 2000-01-01'}) self.assertEqual(data.indexed_by(time=0), data.labeled_by(time='2000-01-01')) self.assertEqual(data.indexed_by(time=slice(10)), @@ -321,8 +277,8 @@ def test_merge(self): def test_getitem(self): data = create_test_data(self.get_store()) - data.create_variable('time', ['time'], np.arange(1000, 
dtype=np.int32), - {'units': 'days since 2000-01-01'}) + data['time'] = ('time', np.arange(1000, dtype=np.int32), + {'units': 'days since 2000-01-01'}) self.assertIsInstance(data['var1'], DatasetArray) self.assertVarEqual(data['var1'], data.variables['var1']) self.assertItemsEqual(data['var1'].dataset.variables, @@ -331,23 +287,23 @@ def test_getitem(self): self.assertVarEqual(data['time.dayofyear'][:300], Array('time', 1 + np.arange(300))) self.assertNDArrayEqual(data['time.month'].data, - data.indices['time'].month) + data.variables['time'].data.month) def test_setitem(self): # assign a variable var = Array(['dim1'], np.random.randn(100)) data1 = create_test_data(self.get_store()) - data1.set_variable('A', var) + data1['A'] = var data2 = data1.copy() data2['A'] = var self.assertEqual(data1, data2) - # assign a dataview + # assign a dataset array dv = 2 * data2['A'] - data1.set_variable('B', dv.variable) + data1['B'] = dv.array data2['B'] = dv self.assertEqual(data1, data2) # assign an array - with self.assertRaisesRegexp(TypeError, 'DatasetArrays and Arrays'): + with self.assertRaisesRegexp(TypeError, 'variables must be of type'): data2['C'] = var.data def test_write_store(self): diff --git a/test/test_dataset_array.py b/test/test_dataset_array.py index 7861d0b6662..3991210fead 100644 --- a/test/test_dataset_array.py +++ b/test/test_dataset_array.py @@ -5,83 +5,84 @@ class TestDatasetArray(TestCase): - def assertViewEqual(self, dv1, dv2): - self.assertEqual(dv1.dataset, dv2.dataset) - self.assertEqual(dv1.focus, dv2.focus) + def assertDSArrayEqual(self, ar1, ar2): + self.assertEqual(ar1.dataset, ar2.dataset) + self.assertEqual(ar1.focus, ar2.focus) + + def assertDSArrayEquiv(self, ar1, ar2): + random_name = 'randomly-renamed-variable' + self.assertEqual(ar1.renamed(random_name).dataset, + ar2.renamed(random_name).dataset) def setUp(self): self.x = np.random.random((10, 20)) self.v = Array(['x', 'y'], self.x) self.ds = Dataset({'foo': self.v}) - self.ds.create_coordinate('x', np.arange(10)) - self.ds.create_coordinate('y', np.arange(20)) self.dv = DatasetArray(self.ds, 'foo') def test_properties(self): self.assertIs(self.dv.dataset, self.ds) self.assertEqual(self.dv.focus, 'foo') - self.assertVarEqual(self.dv.variable, self.v) + self.assertVarEqual(self.dv.array, self.v) self.assertNDArrayEqual(self.dv.data, self.v.data) for attr in ['dimensions', 'dtype', 'shape', 'size', 'ndim', 'attributes']: self.assertEqual(getattr(self.dv, attr), getattr(self.v, attr)) self.assertEqual(len(self.dv), len(self.v)) self.assertVarEqual(self.dv, self.v) - self.assertEqual(list(self.dv.indices), list(self.ds.indices)) - for k, v in self.dv.indices.iteritems(): - self.assertNDArrayEqual(v, self.ds.indices[k]) + self.assertEqual(list(self.dv.coordinates), list(self.ds.coordinates)) + for k, v in self.dv.coordinates.iteritems(): + self.assertNDArrayEqual(v, self.ds.coordinates[k]) def test_items(self): # strings pull out dataviews - self.assertViewEqual(self.dv, self.ds['foo']) + self.assertDSArrayEqual(self.dv, self.ds['foo']) x = self.dv['x'] y = self.dv['y'] - self.assertViewEqual(DatasetArray(self.ds.select('x'), 'x'), x) - self.assertViewEqual(DatasetArray(self.ds.select('y'), 'y'), y) + self.assertDSArrayEqual(DatasetArray(self.ds.select('x'), 'x'), x) + self.assertDSArrayEqual(DatasetArray(self.ds.select('y'), 'y'), y) # integer indexing I = ReturnItem() - for i in [I[:], I[...], I[x.data], I[x.variable], I[x], I[x, y], - I[x.data > -1], I[x.variable > -1], I[x > -1], + for i in [I[:], I[...], 
I[x.data], I[x.array], I[x], I[x, y], + I[x.data > -1], I[x.array > -1], I[x > -1], I[x > -1, y > -1]]: self.assertVarEqual(self.dv, self.dv[i]) for i in [I[0], I[:, 0], I[:3, :2], - I[x.data[:3]], I[x.variable[:3]], I[x[:3]], I[x[:3], y[:4]], - I[x.data > 3], I[x.variable > 3], I[x > 3], I[x > 3, y > 3]]: + I[x.data[:3]], I[x.array[:3]], I[x[:3]], I[x[:3], y[:4]], + I[x.data > 3], I[x.array > 3], I[x > 3], I[x > 3, y > 3]]: self.assertVarEqual(self.v[i], self.dv[i]) - # check that the new index is consistent - self.assertEqual(list(self.dv[0].indices), ['y']) - # we always to keep the dataview variable around - self.assertVarEqual(self.dv[0, 0], self.dv.variable[0, 0]) + # make sure we always keep the array around, even if it's a scalar + self.assertVarEqual(self.dv[0, 0], self.dv.array[0, 0]) self.assertEqual(self.dv[0, 0].dataset, - Dataset({'foo': self.dv.variable[0, 0]})) + Dataset({'foo': self.dv.array[0, 0]})) def test_iteration(self): for ((act_x, act_dv), (exp_x, exp_ds)) in \ zip(self.dv.iterator('y'), self.ds.iterator('y')): self.assertVarEqual(exp_x, act_x) - self.assertViewEqual(DatasetArray(exp_ds, 'foo'), act_dv) + self.assertDSArrayEqual(DatasetArray(exp_ds, 'foo'), act_dv) for ((_, exp_dv), act_dv) in zip(self.dv.iterator('x'), self.dv): - self.assertViewEqual(exp_dv, act_dv) + self.assertDSArrayEqual(exp_dv, act_dv) def test_indexed_by(self): self.assertEqual(self.dv[0].dataset, self.ds.indexed_by(x=0)) self.assertEqual(self.dv[:3, :5].dataset, self.ds.indexed_by(x=slice(3), y=slice(5))) - self.assertViewEqual(self.dv, self.dv.indexed_by(x=slice(None))) - self.assertViewEqual(self.dv[:3], self.dv.indexed_by(x=slice(3))) + self.assertDSArrayEqual(self.dv, self.dv.indexed_by(x=slice(None))) + self.assertDSArrayEqual(self.dv[:3], self.dv.indexed_by(x=slice(3))) def test_labeled_by(self): - self.ds.set_variable('x', Array(['x'], np.array(list('abcdefghij')))) - self.assertViewEqual(self.dv, self.dv.labeled_by(x=slice(None))) - self.assertViewEqual(self.dv[1], self.dv.labeled_by(x='b')) - self.assertViewEqual(self.dv[:3], self.dv.labeled_by(x=slice('c'))) + self.ds['x'] = ('x', np.array(list('abcdefghij'))) + self.assertDSArrayEqual(self.dv, self.dv.labeled_by(x=slice(None))) + self.assertDSArrayEqual(self.dv[1], self.dv.labeled_by(x='b')) + self.assertDSArrayEqual(self.dv[:3], self.dv.labeled_by(x=slice('c'))) def test_loc(self): - self.ds.set_variable('x', Array(['x'], np.array(list('abcdefghij')))) - self.assertViewEqual(self.dv[:3], self.dv.loc[:'c']) - self.assertViewEqual(self.dv[1], self.dv.loc['b']) - self.assertViewEqual(self.dv[:3], self.dv.loc[['a', 'b', 'c']]) - self.assertViewEqual(self.dv[:3, :4], + self.ds['x'] = ('x', np.array(list('abcdefghij'))) + self.assertDSArrayEqual(self.dv[:3], self.dv.loc[:'c']) + self.assertDSArrayEqual(self.dv[1], self.dv.loc['b']) + self.assertDSArrayEqual(self.dv[:3], self.dv.loc[['a', 'b', 'c']]) + self.assertDSArrayEqual(self.dv[:3, :4], self.dv.loc[['a', 'b', 'c'], np.arange(4)]) self.dv.loc['a':'j'] = 0 self.assertTrue(np.all(self.dv.data == 0)) @@ -97,22 +98,22 @@ def test_refocus(self): def test_dataset_getitem(self): dv = self.ds['foo'] - self.assertViewEqual(dv, self.dv) + self.assertDSArrayEqual(dv, self.dv) def test_array_interface(self): self.assertNDArrayEqual(np.asarray(self.dv), self.x) # test patched in methods self.assertNDArrayEqual(self.dv.take([2, 3]), self.x.take([2, 3])) - self.assertViewEqual(self.dv.argsort(), - self.dv.refocus(self.x.argsort())) - self.assertViewEqual(self.dv.clip(2, 3), - 
self.dv.refocus(self.x.clip(2, 3))) + self.assertDSArrayEquiv(self.dv.argsort(), + self.dv.refocus(self.x.argsort())) + self.assertDSArrayEquiv(self.dv.clip(2, 3), + self.dv.refocus(self.x.clip(2, 3))) # test ufuncs - self.assertViewEqual(np.sin(self.dv), - self.dv.refocus(np.sin(self.x))) - self.assertViewEqual(self.dv, np.maximum(self.v, self.dv)) + self.assertDSArrayEquiv(np.sin(self.dv), + self.dv.refocus(np.sin(self.x))) + self.assertDSArrayEquiv(self.dv, np.maximum(self.v, self.dv)) self.ds['bar'] = Array(['x', 'y'], np.zeros((10, 20))) - self.assertViewEqual(self.dv, np.maximum(self.dv, self.ds['bar'])) + self.assertDSArrayEquiv(self.dv, np.maximum(self.dv, self.ds['bar'])) def test_math(self): x = self.x @@ -120,15 +121,15 @@ def test_math(self): a = self.dv # variable math was already tested extensively, so let's just make sure # that all types are properly converted here - self.assertViewEqual(a, +a) - self.assertViewEqual(a, a + 0) - self.assertViewEqual(a, 0 + a) - self.assertViewEqual(a, a + 0 * v) - self.assertViewEqual(a, 0 * v + a) - self.assertViewEqual(a, a + 0 * x) - self.assertViewEqual(a, 0 * x + a) - self.assertViewEqual(a, a + 0 * a) - self.assertViewEqual(a, 0 * a + a) + self.assertDSArrayEquiv(a, +a) + self.assertDSArrayEquiv(a, a + 0) + self.assertDSArrayEquiv(a, 0 + a) + self.assertDSArrayEquiv(a, a + 0 * v) + self.assertDSArrayEquiv(a, 0 * v + a) + self.assertDSArrayEquiv(a, a + 0 * x) + self.assertDSArrayEquiv(a, 0 * x + a) + self.assertDSArrayEquiv(a, a + 0 * a) + self.assertDSArrayEquiv(a, 0 * a + a) # test different indices ds2 = self.ds.replace('x', Array(['x'], 3 + np.arange(10))) b = DatasetArray(ds2, 'foo') @@ -138,7 +139,7 @@ def test_math(self): b + a def test_item_math(self): - self.ds.set_variable('x', Array(['x'], np.array(list('abcdefghij')))) + self.ds['x'] = ('x', np.array(list('abcdefghij'))) self.assertVarEqual(self.dv + self.dv[0, 0], self.dv + self.dv[0, 0].data) new_data = self.x[0][None, :] + self.x[:, 0][:, None] @@ -154,7 +155,7 @@ def test_inplace_math(self): b = a b += 1 self.assertIs(b, a) - self.assertIs(b.variable, v) + self.assertIs(b.array, v) self.assertIs(b.data, x) self.assertIs(b.dataset, self.ds) @@ -167,14 +168,14 @@ def test_collapse(self): def test_groupby(self): agg_var = Array(['y'], np.array(['a'] * 9 + ['c'] + ['b'] * 10)) self.dv['abc'] = agg_var - self.dv['y'] = 20 + 100 * self.ds['y'].variable + self.dv['y'] = 20 + 100 * self.ds['y'].array identity = lambda x: x - self.assertViewEqual(self.dv, self.dv.groupby('x').apply(identity)) - self.assertViewEqual(self.dv, self.dv.groupby('x', squeeze=False + self.assertDSArrayEqual(self.dv, self.dv.groupby('x').apply(identity)) + self.assertDSArrayEqual(self.dv, self.dv.groupby('x', squeeze=False ).apply(identity)) - self.assertViewEqual(self.dv, self.dv.groupby('y').apply(identity)) - self.assertViewEqual(self.dv, self.dv.groupby('y', squeeze=False + self.assertDSArrayEqual(self.dv, self.dv.groupby('y').apply(identity)) + self.assertDSArrayEqual(self.dv, self.dv.groupby('y', squeeze=False ).apply(identity)) grouped = self.dv.groupby('abc') @@ -185,9 +186,9 @@ def test_groupby(self): self.x[:, 9:10].sum()]).T, {'cell_methods': 'x: y: sum'}), 'abc': Array(['abc'], np.array(['a', 'b', 'c']))}), 'foo') - self.assertViewEqual(expected_sum_all, - grouped.collapse(np.sum, dimension=None)) - self.assertViewEqual(expected_sum_all, grouped.sum(dimension=None)) + self.assertDSArrayEqual(expected_sum_all, + grouped.collapse(np.sum, dimension=None)) + 
self.assertDSArrayEqual(expected_sum_all, grouped.sum(dimension=None)) expected_sum_axis1 = DatasetArray(Dataset( {'foo': Array(['x', 'abc'], np.array([self.x[:, :9].sum(1), @@ -196,23 +197,23 @@ def test_groupby(self): {'cell_methods': 'y: sum'}), 'x': self.ds.variables['x'], 'abc': Array(['abc'], np.array(['a', 'b', 'c']))}), 'foo') - self.assertViewEqual(expected_sum_axis1, grouped.collapse(np.sum)) - self.assertViewEqual(expected_sum_axis1, grouped.sum()) + self.assertDSArrayEqual(expected_sum_axis1, grouped.collapse(np.sum)) + self.assertDSArrayEqual(expected_sum_axis1, grouped.sum()) - self.assertViewEqual(self.dv, grouped.apply(identity)) + self.assertDSArrayEqual(self.dv, grouped.apply(identity)) def test_aggregate(self): - agg_var = Array(['y'], np.array(['a'] * 9 + ['c'] + ['b'] * 10)) - self.ds.add_variable('abc', agg_var) + agg_var = Array('y', np.array(['a'] * 9 + ['c'] + ['b'] * 10)) + self.ds['abc'] = agg_var expected_unique, expected_var = \ - self.dv.variable.aggregate(np.mean, 'abc', agg_var) + self.dv.array.aggregate(np.mean, 'abc', agg_var) expected = DatasetArray(Dataset( {'foo': expected_var, 'x': self.ds.variables['x'], 'abc': expected_unique}), 'foo') actual = self.dv.aggregate(np.mean, 'abc') - self.assertViewEqual(expected, actual) + self.assertDSArrayEqual(expected, actual) actual = self.dv.aggregate(np.mean, self.ds['abc']) - self.assertViewEqual(expected, actual) + self.assertDSArrayEqual(expected, actual) def test_from_stack(self): self.ds['bar'] = Array(['x', 'y'], np.random.randn(10, 20)) @@ -225,17 +226,17 @@ def test_from_stack(self): # from variables: self.assertVarEqual(Array(['w', 'x', 'y'], np.array([foo.data, bar.data])), - DatasetArray.from_stack([foo.variable, - bar.variable], 'w')) + DatasetArray.from_stack([foo.array, + bar.array], 'w')) # from iteration: stacked = DatasetArray.from_stack((v for _, v in foo.iterator('x')), self.ds['x']) - self.assertViewEqual(foo, stacked) + self.assertDSArrayEqual(foo, stacked) def test_intersection(self): - self.ds.set_variable('x', Array(['x'], np.array(list('abcdefghij')))) + self.ds['x'] = ('x', np.array(list('abcdefghij'))) with self.assertRaises(ValueError): self.dv + self.dv[:5] dv1, dv2 = intersection(self.dv, self.dv[:5]) - self.assertViewEqual(dv1, self.dv[:5]) - self.assertViewEqual(dv2, self.dv[:5]) + self.assertDSArrayEqual(dv1, self.dv[:5]) + self.assertDSArrayEqual(dv2, self.dv[:5]) From dd81b0f478109ad52e858776a738f557c32854c9 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Sun, 16 Feb 2014 11:47:35 -0500 Subject: [PATCH 35/45] Added utils.datetimeindex2num --- src/xray/backends.py | 38 ++++++++++++++++++++++---------------- src/xray/utils.py | 35 +++++++++++++++++++++++++++++++++++ test/test_utils.py | 15 +++++++++++++-- 3 files changed, 70 insertions(+), 18 deletions(-) diff --git a/src/xray/backends.py b/src/xray/backends.py index 7158e02e132..7c72d50dae8 100644 --- a/src/xray/backends.py +++ b/src/xray/backends.py @@ -7,28 +7,26 @@ # for directly manipulating Dataset.variables and the like? 
import netCDF4 as nc4 import numpy as np +import pandas as pd from scipy.io import netcdf from collections import OrderedDict -import array_ as array +import array_ import conventions -from utils import FrozenOrderedDict, Frozen +from utils import FrozenOrderedDict, Frozen, datetimeindex2num class AbstractDataStore(object): def set_dimensions(self, dimensions): - """Set the dimensions without checking validity""" for d, l in dimensions.iteritems(): self.set_dimension(d, l) def set_attributes(self, attributes): - """Set the attributes without checking validity""" for k, v in attributes.iteritems(): self.set_attribute(k, v) def set_variables(self, variables): - """Set the variables without checking validity""" for vn, v in variables.iteritems(): self.set_variable(vn, v) @@ -45,15 +43,12 @@ def __init__(self): self.attributes = OrderedDict() def set_dimension(self, name, length): - """Set a dimension length""" self.dimensions[name] = length def set_attribute(self, key, value): - """Set the attributes without checking validity""" self.attributes[key] = value def set_variable(self, name, variable): - """Set a variable without checks""" self.variables[name] = variable return self.variables[name] @@ -64,8 +59,18 @@ def sync(self): pass +def convert_to_cf_variable(array): + data = array.data + attributes = array.attributes.copy() + if isinstance(array.data, pd.DatetimeIndex): + (data, units, calendar) = datetimeindex2num(array.data) + attributes['units'] = units + attributes['calendar'] = calendar + return array_.Array(array.dimensions, data, attributes) + + def convert_scipy_variable(var): - return array.Array(var.dimensions, var.data, var._attributes) + return array_.Array(var.dimensions, var.data, var._attributes) class ScipyDataStore(AbstractDataStore): @@ -92,7 +97,6 @@ def dimensions(self): return Frozen(self.ds.dimensions) def set_dimension(self, name, length): - """Set a dimension length""" if name in self.dimensions: raise ValueError('%s does not support modifying dimensions' % type(self).__name__) @@ -130,7 +134,7 @@ def set_attribute(self, key, value): setattr(self.ds, key, self._cast_attr_value(value)) def set_variable(self, name, variable): - """Add a variable without checks""" + variable = convert_to_cf_variable(variable) data = variable.data dtype_convert = {'int64': 'int32', 'float64': 'float32'} if str(data.dtype) in dtype_convert: @@ -141,7 +145,6 @@ def set_variable(self, name, variable): for k, v in variable.attributes.iteritems(): self._validate_attr_key(k) setattr(scipy_var, k, self._cast_attr_value(v)) - return convert_scipy_variable(scipy_var) def del_attribute(self, key): delattr(self.ds, key) @@ -163,7 +166,7 @@ def convert_nc4_variable(var): # netcdf file would now have been scaled twice! 
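+    # Concrete instance of the problem described above: a variable stored as
+    # int16 with scale_factor=0.01 already comes back from netCDF4-python as
+    # floats multiplied by 0.01, so keeping scale_factor in the attributes
+    # would scale the data a second time when the dataset is written back out.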
attr = OrderedDict((k, var.getncattr(k)) for k in var.ncattrs() if k not in ['scale_factor', 'add_offset']) - return array.Array(var.dimensions, var, attr, indexing_mode='orthogonal') + return array_.Array(var.dimensions, var, attr, indexing_mode='orthogonal') class NetCDF4DataStore(AbstractDataStore): @@ -187,14 +190,18 @@ def dimensions(self): return FrozenOrderedDict((k, len(v)) for k, v in self.ds.dimensions.iteritems()) def set_dimension(self, name, length): - """Set a dimension length""" self.ds.createDimension(name, size=length) def set_attribute(self, key, value): self.ds.setncatts({key: value}) + def _cast_data(self, data): + if isinstance(data, pd.DatetimeIndex): + data = datetimeindex2num(data) + return data + def set_variable(self, name, variable): - """Set a variable without checks""" + variable = convert_to_cf_variable(variable) # netCDF4 will automatically assign a fill value # depending on the datatype of the variable. Here # we let the package handle the _FillValue attribute @@ -207,7 +214,6 @@ def set_variable(self, name, variable): nc4_var = self.ds.variables[name] nc4_var[:] = variable.data[:] nc4_var.setncatts(variable.attributes) - return convert_nc4_variable(nc4_var) def del_attribute(self, key): self.ds.delncattr(key) diff --git a/src/xray/utils.py b/src/xray/utils.py index 06cd9551e61..48555f64a8f 100644 --- a/src/xray/utils.py +++ b/src/xray/utils.py @@ -107,6 +107,41 @@ def num2datetimeindex(num_dates, units, calendar=None): return pd.Index(dates) +def guess_time_units(dates): + """Given an array of dates suitable for input to `pandas.DatetimeIndex`, + returns a CF compatible time-unit string of the form "{time_unit} since + {date[0]}", where `time_unit` is 'days', 'hours', 'minutes' or 'seconds' + (the first one that can evenly divide all unique time deltas in `dates`) + """ + dates = pd.DatetimeIndex(dates) + unique_timedeltas = np.unique(np.diff(dates.values)) + for time_unit, delta in [('days', '1 days'), ('hours', '3600s'), + ('minutes', '60s'), ('seconds', '1s')]: + unit_delta = pd.to_timedelta(delta) + diffs = unique_timedeltas / unit_delta + if np.all(diffs == diffs.astype(int)): + break + else: + raise ValueError('could not automatically determine time units') + return '%s since %s' % (time_unit, dates[0]) + + +def datetimeindex2num(dates, units=None, calendar=None): + """Given an array of dates suitable for input to `pandas.DatetimeIndex`, + returns the tuple `(num, units, calendar)` suitable for CF complient time + variable. 
+ """ + dates = pd.DatetimeIndex(dates) + if units is None: + units = guess_time_units(dates) + if calendar is None: + calendar = 'proleptic_gregorian' + # for now, don't bother doing any trickery like num2datetimeindex to + # convert dates to numbers faster + num = nc4.date2num(dates.to_pydatetime(), units, calendar) + return (num, units, calendar) + + def variable_equal(v1, v2, rtol=1e-05, atol=1e-08): """True if two objects have the same dimensions, attributes and data; otherwise False diff --git a/test/test_utils.py b/test/test_utils.py index b73b859adc9..11e9fedbe3d 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -58,8 +58,8 @@ def test_orthogonal_indexer(self): utils.orthogonal_indexer(x > 0, x.shape) -class TestNum2DatetimeIndex(TestCase): - def test(self): +class TestDatetime(TestCase): + def test_num2datetimeindex(self): for num_dates, units in [ (np.arange(1000), 'days since 2000-01-01'), (12300 + np.arange(500), 'hours since 1680-01-01 00:00:00')]: @@ -68,6 +68,17 @@ def test(self): actual = utils.num2datetimeindex(num_dates, units, calendar) self.assertNDArrayEqual(expected, actual) + def test_guess_time_units(self): + for dates, expected in [(pd.date_range('1900-01-01', periods=5), + 'days since 1900-01-01 00:00:00'), + (pd.date_range('1900-01-01 12:00:00', freq='H', + periods=2), + 'hours since 1900-01-01 12:00:00'), + (['1900-01-01', '1900-01-02', + '1900-01-02 00:00:01'], + 'seconds since 1900-01-01 00:00:00')]: + self.assertEquals(expected, utils.guess_time_units(dates)) + class TestDictionaries(TestCase): def setUp(self): From 782a93334e84fc37952b79b09abe36059c9930a0 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Sun, 16 Feb 2014 12:16:17 -0500 Subject: [PATCH 36/45] Reworked virtual variables --- src/xray/dataset.py | 100 +++++++++++++++++++++++++------------------- 1 file changed, 58 insertions(+), 42 deletions(-) diff --git a/src/xray/dataset.py b/src/xray/dataset.py index f1deba95d59..7fa8115036e 100644 --- a/src/xray/dataset.py +++ b/src/xray/dataset.py @@ -37,6 +37,56 @@ def open_dataset(nc, *args, **kwargs): 'quarter'] +class _VariablesDict(OrderedDict): + """_VariablesDict is an OrderedDict subclass that also implements "virtual" + variables that are created from other variables on demand + + Currently, virtual variables are restricted to attributes of + pandas.DatetimeIndex objects (e.g., 'year', 'month', 'day', etc., plus + 'season' for climatological season), which are accessed by getting the item + 'time.year'. 
+ """ + def _datetimeindices(self): + return [k for k, v in self.iteritems() + if isinstance(v._data, pd.DatetimeIndex)] + + @property + def virtual(self): + """Variables that don't exist in this dataset but for which could be + created on demand (because they can be calculated from other dataset + variables) + """ + virtual_vars = [] + for k in self._datetimeindices(): + for suffix in _DATETIMEINDEX_COMPONENTS + ['season']: + name = '%s.%s' % (k, suffix) + if name not in self: + virtual_vars.append(name) + return virtual_vars + + def _get_virtual_variable(self, key): + split_key = key.split('.') + if len(split_key) == 2: + ref_var, suffix = split_key + if ref_var in self._datetimeindices(): + if suffix == 'season': + # seasons = np.array(['DJF', 'MAM', 'JJA', 'SON']) + month = self[ref_var].data.month + data = (month // 3) % 4 + 1 + else: + data = getattr(self[ref_var].data, suffix) + return array.Array(self[ref_var].dimensions, data) + raise KeyError('virtual variable %r not found' % key) + + def __getitem__(self, key): + if key in self: + return OrderedDict.__getitem__(self, key) + elif key in self.virtual: + return self._get_virtual_variable(key) + else: + raise KeyError(repr(key)) + + class Dataset(Mapping): """A netcdf-like data object consisting of dimensions, variables and attributes which together form a self describing data set @@ -66,7 +116,7 @@ def __init__(self, variables=None, attributes=None): """To load data from a file or file-like object, use the `open_dataset` function. """ - self._variables = OrderedDict() + self._variables = _VariablesDict() self._dimensions = OrderedDict() if variables is not None: self._set_variables(variables) @@ -161,52 +211,16 @@ def __len__(self): def __iter__(self): return iter(self.variables) - @property - def _datetimeindices(self): - return [k for k, v in self.variables.iteritems() - if isinstance(v._data, pd.DatetimeIndex)] - - def _get_virtual_variable(self, key): - split_key = key.split('.') - if len(split_key) == 2: - name, suffix = split_key - if name in self._datetimeindices: - if suffix == 'season': - # seasons = np.array(['DJF', 'MAM', 'JJA', 'SON']) - month = self.variables[name].data.month - data = (month // 3) % 4 + 1 - else: - data = getattr(self.variables[name].data, suffix) - return array.Array(self.variables[name].dimensions, data) - raise ValueError('virtual variable %r not found' % key) - - def _get_virtual_dataset_array(self, key): - virtual_var = self._get_virtual_variable(key) - ds = self.copy() - ds[key] = virtual_var - return ds[key] - @property def virtual_variables(self): """Arrays that don't exist in this dataset but for which dataviews could be created on demand (because they can be calculated from other dataset variables or dimensions) """ - possible_vars = [] - for k in self._datetimeindices: - for suffix in _DATETIMEINDEX_COMPONENTS + ['season']: - possible_vars.append('%s.%s' % (k, suffix)) - return tuple(k for k in possible_vars if k not in self.variables) + return self._variables.virtual def __getitem__(self, key): - if key in self.variables: - return DatasetArray(self.select(key), key) - else: - try: - return self._get_virtual_dataset_array(key) - except ValueError: - raise KeyError('dataset contains no variable with name %r ' - % key) + return DatasetArray(self.select(key), key) def __setitem__(self, key, value): if isinstance(value, DatasetArray): @@ -492,13 +506,14 @@ def select(self, *names): specified variables refers to that variable in its dimensions or "coordinates" attribute. 
All other variables are dropped. """ - if not all(k in self.variables for k in names): + possible_vars = set(self) | set(self.virtual_variables) + if not set(names) <= possible_vars: raise ValueError( "One or more of the specified variables does not exist") def get_all_associated_names(name): yield name - if name in self: + if name in possible_vars: var = self.variables[name] for dim in var.dimensions: yield dim @@ -516,7 +531,8 @@ def get_all_associated_names(name): queue |= new_names - selected_names selected_names |= new_names - variables = OrderedDict((k, v) for k, v in self.variables.iteritems() + variables = OrderedDict((k, self.variables[k]) + for k in list(self) + self.virtual_variables if k in selected_names) return type(self)(variables, self.attributes) From e8738dbcbb8f2983459a0b62e809be729ee2228a Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Sun, 16 Feb 2014 12:27:14 -0500 Subject: [PATCH 37/45] Fix performance regression in array_._as_compatible_data --- src/xray/array_.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/xray/array_.py b/src/xray/array_.py index 94548c06c32..0f470902fae 100644 --- a/src/xray/array_.py +++ b/src/xray/array_.py @@ -21,8 +21,11 @@ def _as_compatible_data(data): # don't check for __len__ or __iter__ so as not to warn if data is a numpy # numeric type like np.float32 required = ['dtype', 'shape', 'size', 'ndim'] - if np.iterable(data) and not all(hasattr(data, attr) for attr in required): + if not all(hasattr(data, attr) for attr in required): data = np.asarray(data) + if data.ndim == 0: + # unpack 0d data + data = data[()] elif isinstance(data, AbstractArray): # we don't want nested Array objects data = data.data From 508e16fea01d283bd644b4cd168b68f7ec29ae7a Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Sun, 16 Feb 2014 13:38:16 -0500 Subject: [PATCH 38/45] Speedup orthogonal indexing --- src/xray/utils.py | 27 +++++++++++++++++++++++++-- test/test_utils.py | 5 ++++- 2 files changed, 29 insertions(+), 3 deletions(-) diff --git a/src/xray/utils.py b/src/xray/utils.py index 48555f64a8f..72e44954914 100644 --- a/src/xray/utils.py +++ b/src/xray/utils.py @@ -55,9 +55,32 @@ def expand_array(k, length): # arrays) because integers (and only integers) collapse axes when used with # __getitem__ non_int_keys = [n for n, k in enumerate(key) if not isinstance(k, int)] + + def full_slices_unselected(n_list): + def all_full_slices(key_index): + return all(isinstance(key[n], slice) and key[n] == slice(None) + for n in key_index) + if not n_list: + return n_list + elif all_full_slices(range(n_list[0] + 1)): + return full_slices_unselected(n_list[1:]) + elif all_full_slices(range(n_list[-1], len(key))): + return full_slices_unselected(n_list[:-1]) + else: + return n_list + + # However, testing suggests it is OK to keep contiguous sequences of full + # slices at the start or the end of the key. Keeping slices around (when + # possible) instead of converting slices to arrays significantly speeds up + # indexing. + # (Honestly, I don't understand when it's not OK to keep slices even in + # between integer indices if as array is somewhere in the key, but such are + # the admittedly mind-boggling ways of numpy's advanced indexing.) 
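+    # Concrete example: for key = (slice(None), np.array([0, 2]), 5,
+    # slice(None)), only the array at position 1 is expanded via np.ix_ below;
+    # the leading and trailing full slices (and the integer) are passed
+    # through unchanged, which avoids materializing large index arrays.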
+ array_keys = full_slices_unselected(non_int_keys) + array_indexers = np.ix_(*(expand_array(key[n], shape[n]) - for n in non_int_keys)) - for i, n in enumerate(non_int_keys): + for n in array_keys)) + for i, n in enumerate(array_keys): key[n] = array_indexers[i] return tuple(key) diff --git a/test/test_utils.py b/test/test_utils.py index 11e9fedbe3d..bd7e3438512 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -46,8 +46,11 @@ def test_orthogonal_indexer(self): (I[y, z], I[:5, 2:8:2], (5, 3, 12, 13, 14)), (I[0, y, y], I[0, :5, :5], (5, 5, 13, 14)), (I[y, 0, z], I[:5, 0, 2:8:2], (5, 3, 13, 14)), + (I[y, :, z], I[:5, :, 2:8:2], (5, 11, 3, 13, 14)), (I[0, :2, y, y, 0], I[0, :2, :5, :5, 0], (2, 5, 5)), - (I[0, :, y, :, 0], I[0, :, :5, :, 0], (11, 5, 13))]: + (I[0, :, y, :, 0], I[0, :, :5, :, 0], (11, 5, 13)), + (I[:, :, y, :, 0], I[:, :, :5, :, 0], (10, 11, 5, 13)), + (I[:, :, y, z, :], I[:, :, :5, 2:8:2], (10, 11, 5, 3, 14))]: k = utils.orthogonal_indexer(i, x.shape) self.assertEqual(shape, x[k].shape) self.assertNDArrayEqual(x[j], x[k]) From 0547123c1d8ff8473946df8544644128811d8a9e Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Sun, 16 Feb 2014 14:40:28 -0500 Subject: [PATCH 39/45] Documentation and name cleanup --- src/xray/__init__.py | 4 +-- src/xray/array_.py | 32 ++++++++++---------- src/xray/common.py | 24 +++++++-------- src/xray/dataset.py | 34 ++++++++++++--------- src/xray/dataset_array.py | 54 +++++++++++++++++++-------------- src/xray/groupby.py | 62 ++++++++++++++++++++++++++++---------- src/xray/ops.py | 17 +++++------ test/test_array.py | 16 +++++----- test/test_dataset_array.py | 16 +++++----- 9 files changed, 151 insertions(+), 108 deletions(-) diff --git a/src/xray/__init__.py b/src/xray/__init__.py index 52bd1d7ae08..bd4408b3ddb 100644 --- a/src/xray/__init__.py +++ b/src/xray/__init__.py @@ -1,12 +1,12 @@ from .array_ import Array, broadcast_variables from .dataset import Dataset, open_dataset -from .dataset_array import DatasetArray, intersection +from .dataset_array import DatasetArray, align from .utils import orthogonal_indexer, num2datetimeindex, variable_equal from . import backends concat = DatasetArray.from_stack -__all__ = ['open_dataset', 'Dataset', 'DatasetArray', 'Array', 'intersection', +__all__ = ['open_dataset', 'Dataset', 'DatasetArray', 'Array', 'align', 'broadcast_variables', 'orthogonal_indexer', 'num2datetimeindex', 'variable_equal'] diff --git a/src/xray/array_.py b/src/xray/array_.py index 0f470902fae..bbd298832c9 100644 --- a/src/xray/array_.py +++ b/src/xray/array_.py @@ -56,10 +56,10 @@ def unique_value_groups(ar): class Array(AbstractArray): - """ - A netcdf-like variable consisting of dimensions, data and attributes - which describe a single Array. A single variable object is not - fully described outside the context of its parent Dataset. + """A netcdf-like variable consisting of dimensions, data and attributes + which describe a single Array. A single Array object is not fully described + outside the context of its parent Dataset (if you want such a fully + described object, use a DatasetArray instead). 
""" def __init__(self, dims, data, attributes=None, indexing_mode='numpy'): """ @@ -239,7 +239,7 @@ def __repr__(self): return '' % (type(self).__name__, contents) def indexed_by(self, **indexers): - """Return a new variable indexed along the specified dimension(s) + """Return a new array indexed along the specified dimension(s) Parameters ---------- @@ -294,34 +294,34 @@ def transpose(self, *dimensions): return type(self)(dimensions, data, self.attributes) # TODO: rename this method to 'reduce' - def collapse(self, func, dimension=None, axis=None, **kwargs): - """Collapse this variable by applying `func` along some dimension(s) + def reduce(self, func, dimension=None, axis=None, **kwargs): + """Reduce this array by applying `func` along some dimension(s) Parameters ---------- func : function Function which can be called in the form - `func(x, axis=axis, **kwargs)` to return the result of collapsing an + `func(x, axis=axis, **kwargs)` to return the result of reducing an np.ndarray over an integer valued axis. dimension : str or sequence of str, optional Dimension(s) over which to repeatedly apply `func`. axis : int or sequence of int, optional Axis(es) over which to repeatedly apply `func`. Only one of the 'dimension' and 'axis' arguments can be supplied. If neither are - supplied, then the collapse is calculated over the flattened array + supplied, then the reduction is calculated over the flattened array (by calling `func(x)` without an axis argument). **kwargs : dict Additional keyword arguments passed on to `func`. Note ---- - If `collapse` is called with multiple dimensions (or axes, which - are converted into dimensions), then the collapse operation is + If `reduce` is called with multiple dimensions (or axes, which + are converted into dimensions), then the reduce operation is performed repeatedly along each dimension in turn from left to right. Returns ------- - collapsed : Array + reduced : Array Array with summarized data and the indicated dimension(s) removed. 
""" @@ -340,7 +340,7 @@ def collapse(self, func, dimension=None, axis=None, **kwargs): dimension = [dimension] var = self for dim in dimension: - var = var._collapse(func, dim, **kwargs) + var = var._reduce(func, dim, **kwargs) else: var = type(self)([], func(self.data, **kwargs), self.attributes) var._append_to_cell_methods(': '.join(self.dimensions) @@ -354,8 +354,8 @@ def _append_to_cell_methods(self, string): base = '' self.attributes['cell_methods'] = base + string - def _collapse(self, f, dim, **kwargs): - """Collapse a single dimension""" + def _reduce(self, f, dim, **kwargs): + """Reduce a single dimension""" axis = self.dimensions.index(dim) dims = tuple(dim for i, dim in enumerate(self.dimensions) if axis not in [i, i - self.ndim]) @@ -404,7 +404,7 @@ def aggregate(self, func, new_dim_name, group_by, **kwargs): 'match the length of this variable along its ' 'dimension') unique_values, group_indices = unique_value_groups(group_by.data) - aggregated = (self.indexed_by(**{dim: indices}).collapse( + aggregated = (self.indexed_by(**{dim: indices}).reduce( func, dim, axis=None, **kwargs) for indices in group_indices) stacked = type(self).from_stack(aggregated, new_dim_name, diff --git a/src/xray/common.py b/src/xray/common.py index 64b47ee6a28..3a6cde96eea 100644 --- a/src/xray/common.py +++ b/src/xray/common.py @@ -1,20 +1,20 @@ -class ImplementsCollapse(object): +class ImplementsReduce(object): @classmethod - def _collapse_method(cls, f, name=None, module=None): - def func(self, dimension=cls._collapse_dimension_default, - axis=cls._collapse_axis_default, **kwargs): - return self.collapse(f, dimension, axis, **kwargs) + def _reduce_method(cls, f, name=None, module=None): + def func(self, dimension=cls._reduce_dimension_default, + axis=cls._reduce_axis_default, **kwargs): + return self.reduce(f, dimension, axis, **kwargs) if name is None: name = f.__name__ func.__name__ = name - func.__doc__ = cls._collapse_method_docstring.format( + func.__doc__ = cls._reduce_method_docstring.format( name=('' if module is None else module + '.') + name, cls=cls.__name__) return func -class AbstractArray(ImplementsCollapse): +class AbstractArray(ImplementsReduce): @property def dtype(self): return getattr(self._data, 'dtype', object) @@ -64,8 +64,8 @@ def __array__(self, dtype=None): def T(self): return self.transpose() - _collapse_method_docstring = \ - """Collapse this {cls}'s data' by applying `{name}` along some + _reduce_method_docstring = \ + """Reduce this {cls}'s data' by applying `{name}` along some dimension(s) Parameters @@ -88,10 +88,10 @@ def T(self): Returns ------- - collapsed : {cls} + reduced : {cls} New {cls} object with `{name}` applied to its data and the indicated dimension(s) removed. """ - _collapse_dimension_default = None - _collapse_axis_default = None + _reduce_dimension_default = None + _reduce_axis_default = None diff --git a/src/xray/dataset.py b/src/xray/dataset.py index 7fa8115036e..85070e34cfa 100644 --- a/src/xray/dataset.py +++ b/src/xray/dataset.py @@ -88,33 +88,39 @@ def __getitem__(self, key): class Dataset(Mapping): - """A netcdf-like data object consisting of dimensions, variables and - attributes which together form a self describing data set + """A netcdf-like data object consisting of variables and attributes which + together form a self describing data set - Datasets are mappings from variable names to dataviews focused on those - variable. 
+ Dataset implements the mapping interface with keys given by variable names + and values given by DatasetArray objects focused on each variable name. Note: the size of dimensions in a dataset cannot be changed. Attributes ---------- - dimensions : {name: length, ...} variables : {name: variable, ...} + attributes : {key: value, ...} + dimensions : {name: length, ...} coordinates : {name: variable, ...} - Coordinates are simply variables that are also dimensions. They must - all have dimension 1. noncoordinates : {name: variable, ...} - Arrays that are not coordinates. - attributes : {key: value, ...} - indices : {dimension: index, ...} - Mapping from dimensions to pandas.Index objects. - store : backends.*DataStore - Don't modify the store directly unless you want to avoid all validation - checks. + virtual_variables : list """ def __init__(self, variables=None, attributes=None): """To load data from a file or file-like object, use the `open_dataset` function. + + Parameters + ---------- + variables : dict-like, optional + A mapping from variable names to `xray.Array` objects or sequences + of the form `(dimensions, data[, attributes])` which can be used as + arguments to create a new `xray.Array`. Each dimension must have + the same length in all variables in which it appears. One + dimensional variables with name equal to their dimension are + coordinate variables, which means they are saved in the dataset as + `pandas.Index` objects. + attributes : dict-like, optional + Global attributes to save on this dataset. """ self._variables = _VariablesDict() self._dimensions = OrderedDict() diff --git a/src/xray/dataset_array.py b/src/xray/dataset_array.py index e1af2e4b76e..dca35bc32a4 100644 --- a/src/xray/dataset_array.py +++ b/src/xray/dataset_array.py @@ -31,12 +31,15 @@ def __setitem__(self, key, value): class DatasetArray(AbstractArray): - """A Dataset wrapper oriented around a single Array + """Hybrid between Dataset and Array - Dataviews are the primary way to do computations with Dataset variables. - They are designed to make it easy to manipulate variables in the context of - an intact Dataset object. Getting items from or doing mathematical - operations with a dataset array returns another dataset array. + Dataset arrays are the primary way to do computations with Dataset + variables. They are designed to make it easy to manipulate arrays in the + context of an intact Dataset object. Indeed, the contents of a DatasetArray + are uniquely defined by its `dataset` and `focus` paramters. + + Getting items from or doing mathematical operations with a dataset array + returns another dataset array. The design of DatasetArray is strongly inspired by the Iris Cube. However, dataset arrays are much lighter weight than cubes. They are simply aligned, @@ -50,7 +53,8 @@ def __init__(self, dataset, focus): The dataset on which to build this dataset array. focus : str The name of the "focus variable" in `dataset` on which this object - is oriented. + is oriented. This is the variable on which mathematical operations + are applied. 
""" if not isinstance(dataset, dataset_.Dataset): dataset = dataset_.Dataset(dataset) @@ -74,7 +78,7 @@ def _data(self): @property def data(self): - """The dataset array's data as a numpy.ndarray""" + """The array's data as a numpy.ndarray""" return self.array.data @data.setter def data(self, value): @@ -112,7 +116,7 @@ def __contains__(self, key): @property def loc(self): - """Attribute for location based indexing with pandas + """Attribute for location based indexing like pandas """ return _LocIndexer(self) @@ -124,6 +128,10 @@ def __iter__(self): def attributes(self): return self.array.attributes + @property + def variables(self): + return self.dataset.variables + @property def coordinates(self): return FrozenOrderedDict((k, self.dataset.variables[k]) @@ -267,39 +275,38 @@ def transpose(self, *dimensions): """ return self.refocus(self.array.transpose(*dimensions)) - def collapse(self, func, dimension=None, axis=None, **kwargs): - """Collapse this array by applying `func` along some dimension(s) + def reduce(self, func, dimension=None, axis=None, **kwargs): + """Reduce this array by applying `func` along some dimension(s) Parameters ---------- func : function Function which can be called in the form - `f(x, axis=axis, **kwargs)` to return the result of collapsing an + `f(x, axis=axis, **kwargs)` to return the result of reducing an np.ndarray over an integer valued axis. dimension : str or sequence of str, optional Dimension(s) over which to repeatedly apply `func`. axis : int or sequence of int, optional Axis(es) over which to repeatedly apply `func`. Only one of the 'dimension' and 'axis' arguments can be supplied. If neither are - supplied, then the collapse is calculated over the flattened array + supplied, then the reduction is calculated over the flattened array (by calling `f(x)` without an axis argument). **kwargs : dict Additional keyword arguments passed on to `func`. Note ---- - If `collapse` is called with multiple dimensions (or axes, which - are converted into dimensions), then the collapse operation is + If `reduce` is called with multiple dimensions (or axes, which + are converted into dimensions), then the reduce operation is performed repeatedly along each dimension in turn from left to right. Returns ------- - collapsed : DatasetArray + reduced : DatasetArray DatasetArray with this object's array replaced with an array with summarized data and the indicated dimension(s) removed. """ - # TODO: rename this method "reduce" - var = self.array.collapse(func, dimension, axis, **kwargs) + var = self.array.reduce(func, dimension, axis, **kwargs) drop = set(self.dimensions) - set(var.dimensions) # For now, take an aggressive strategy of removing all variables # associated with any dropped dimensions @@ -475,13 +482,14 @@ def func(self, other): ops.inject_special_operations(DatasetArray, priority=60) -def intersection(array1, array2): - """Given two dataset array objects, returns two new dataset arrays where - all indices found on both arrays are replaced by their intersection +def align(array1, array2): + """Given two Dataset or DatasetArray objects, returns two new objects where + all coordinates found on both datasets are replaced by their intersection, + and thus are aligned for performing mathematical operations. """ - # TODO: automatically calculate the intersection when doing math with - # arrays, or better yet calculate the union of the indices and fill in - # the mis-aligned data with NaN. 
+ # TODO: automatically align when doing math with arrays, or better yet + # calculate the union of the indices and fill in the mis-aligned data with + # NaN. overlapping_coords = {k: (array1.coordinates[k].data & array2.coordinates[k].data) for k in array1.coordinates diff --git a/src/xray/groupby.py b/src/xray/groupby.py index 88190e29dfe..a1bf7469f0c 100644 --- a/src/xray/groupby.py +++ b/src/xray/groupby.py @@ -1,7 +1,7 @@ import itertools -from common import ImplementsCollapse -from ops import inject_collapse_methods +from common import ImplementsReduce +from ops import inject_reduce_methods import array_ import dataset import numpy as np @@ -39,12 +39,12 @@ def peek_at(iterable): return peek, itertools.chain([peek], gen) -class GroupBy(ImplementsCollapse): +class GroupBy(ImplementsReduce): """A object that implements the split-apply-combine pattern Modeled after `pandas.GroupBy`. The `GroupBy` object can be iterated over (unique_value, grouped_array) pairs, but the main way to interact with a - groupby object are with the `apply` or `collapse` methods. You can also + groupby object are with the `apply` or `reduce` methods. You can also directly call numpy methods like `mean` or `std`. See Also @@ -107,8 +107,8 @@ def __iter__(self): def iter_fast(self): # extract the underlying Array object array = self.array - if hasattr(self.array, 'variable'): - array = array.variable + if hasattr(self.array, 'array'): + array = array.array # build the new dimensions index_int = isinstance(self.group_indices[0], int) @@ -195,8 +195,38 @@ def apply(self, func, shortcut=True, **kwargs): for d in stacked.dimensions] return stacked.transpose(*new_order) - def collapse(self, func, dimension=Ellipsis, axis=Ellipsis, shortcut=True, + def reduce(self, func, dimension=Ellipsis, axis=Ellipsis, shortcut=True, **kwargs): + """Reduce this variable by applying `func` along some dimension(s) + + Parameters + ---------- + func : function + Function which can be called in the form + `func(x, axis=axis, **kwargs)` to return the result of collapsing an + np.ndarray over an integer valued axis. + dimension : str or sequence of str, optional + Dimension(s) over which to repeatedly apply `func`. + axis : int or sequence of int, optional + Axis(es) over which to repeatedly apply `func`. Only one of the + 'dimension' and 'axis' arguments can be supplied. If neither are + supplied, then the reduction is calculated over the flattened array + (by calling `func(x)` without an axis argument). + **kwargs : dict + Additional keyword arguments passed on to `func`. + + Note + ---- + If `reduce` is called with multiple dimensions (or axes, which + are converted into dimensions), then the reduce operation is + performed repeatedly along each dimension in turn from left to right. + + Returns + ------- + reduced : Array + Array with summarized data and the indicated dimension(s) + removed. 
+ """ # Ellipsis is used as a sentinel value for the altered default if axis is Ellipsis and dimension is Ellipsis: dimension = self.group_dim @@ -204,12 +234,12 @@ def collapse(self, func, dimension=Ellipsis, axis=Ellipsis, shortcut=True, dimension = None if axis is Ellipsis: axis = None - def collapse_array(ar): - return ar.collapse(func, dimension, axis, **kwargs) - return self.apply(collapse_array, shortcut=shortcut) + def reduce_array(ar): + return ar.reduce(func, dimension, axis, **kwargs) + return self.apply(reduce_array, shortcut=shortcut) - _collapse_method_docstring = \ - """Collapse this {cls}'s data' by applying `{name}` along some + _reduce_method_docstring = \ + """Reduce this {cls}'s data' by applying `{name}` along some dimension(s) Parameters @@ -238,13 +268,13 @@ def collapse_array(ar): Returns ------- - collapsed : {cls} + reduced : {cls} New {cls} object with `{name}` applied to its data and the indicated dimension(s) removed. """ - _collapse_dimension_default = Ellipsis - _collapse_axis_default = Ellipsis + _reduce_dimension_default = Ellipsis + _reduce_axis_default = Ellipsis -inject_collapse_methods(GroupBy) +inject_reduce_methods(GroupBy) diff --git a/src/xray/ops.py b/src/xray/ops.py index 56632062cf7..4bb6215dede 100644 --- a/src/xray/ops.py +++ b/src/xray/ops.py @@ -1,4 +1,3 @@ -import functools import operator import numpy as np @@ -20,9 +19,9 @@ 'getfield', 'newbyteorder', 'put', 'round', 'setfield', 'setflags', 'view'] # methods which remove an axis -NUMPY_COLLAPSE_METHODS = ['all', 'any', 'argmax', 'argmin', 'cumprod', - 'cumsum', 'max', 'mean', 'min', 'prod', 'ptp', 'std', - 'sum', 'var'] +NUMPY_REDUCE_METHODS = ['all', 'any', 'argmax', 'argmin', 'cumprod', + 'cumsum', 'max', 'mean', 'min', 'prod', 'ptp', 'std', + 'sum', 'var'] def _data_method_wrapper(f): @@ -39,11 +38,11 @@ def func(self, *args, **kwargs): return func -def inject_collapse_methods(cls): +def inject_reduce_methods(cls): # TODO: change these to use methods instead of numpy functions - for name in NUMPY_COLLAPSE_METHODS: - setattr(cls, name, cls._collapse_method(getattr(np, name), - name, 'numpy')) + for name in NUMPY_REDUCE_METHODS: + setattr(cls, name, cls._reduce_method(getattr(np, name), + name, 'numpy')) def inject_special_operations(cls, priority=50): @@ -69,4 +68,4 @@ def inject_special_operations(cls, priority=50): setattr(cls, name, _data_method_wrapper(name)) for name in NUMPY_UNARY_METHODS: setattr(cls, name, cls._unary_op(_method_wrapper(name))) - inject_collapse_methods(cls) + inject_reduce_methods(cls) diff --git a/test/test_array.py b/test/test_array.py index 73a50d91f99..720553e5c1c 100644 --- a/test/test_array.py +++ b/test/test_array.py @@ -195,21 +195,21 @@ def test_array_interface(self): # test ufuncs self.assertVarEqual(np.sin(v), Array(['x'], np.sin(x))) - def test_collapse(self): + def test_reduce(self): v = Array(['time', 'x'], self.d) # intentionally test with an operation for which order matters - self.assertVarEqual(v.collapse(np.std, 'time'), + self.assertVarEqual(v.reduce(np.std, 'time'), Array(['x'], self.d.std(axis=0), {'cell_methods': 'time: std'})) - self.assertVarEqual(v.collapse(np.std, axis=0), - v.collapse(np.std, dimension='time')) - self.assertVarEqual(v.collapse(np.std, ['x', 'time']), + self.assertVarEqual(v.reduce(np.std, axis=0), + v.reduce(np.std, dimension='time')) + self.assertVarEqual(v.reduce(np.std, ['x', 'time']), Array([], self.d.std(axis=1).std(axis=0), {'cell_methods': 'x: std time: std'})) - self.assertVarEqual(v.collapse(np.std), + 
self.assertVarEqual(v.reduce(np.std), Array([], self.d.std(), {'cell_methods': 'time: x: std'})) - self.assertVarEqual(v.mean('time'), v.collapse(np.mean, 'time')) + self.assertVarEqual(v.mean('time'), v.reduce(np.mean, 'time')) def test_groupby(self): agg_var = Array(['y'], np.array(['a', 'a', 'b'])) @@ -229,7 +229,7 @@ def test_groupby(self): grouped = v.groupby('abc', agg_var) self.assertVarEqual(expected_unique, grouped.unique_coord) self.assertVarEqual(v, grouped.apply(lambda x: x)) - self.assertVarEqual(expected_aggregated, grouped.collapse(np.sum)) + self.assertVarEqual(expected_aggregated, grouped.reduce(np.sum)) actual = list(grouped) expected = zip(expected_unique, [v[:, :2], v[:, 2:]]) diff --git a/test/test_dataset_array.py b/test/test_dataset_array.py index 3991210fead..1f3356cf20f 100644 --- a/test/test_dataset_array.py +++ b/test/test_dataset_array.py @@ -1,6 +1,6 @@ import numpy as np -from xray import Dataset, DatasetArray, Array, intersection +from xray import Dataset, DatasetArray, Array, align from . import TestCase, ReturnItem @@ -159,9 +159,9 @@ def test_inplace_math(self): self.assertIs(b.data, x) self.assertIs(b.dataset, self.ds) - def test_collapse(self): - self.assertVarEqual(self.dv.collapse(np.mean, 'x'), - self.v.collapse(np.mean, 'x')) + def test_reduce(self): + self.assertVarEqual(self.dv.reduce(np.mean, 'x'), + self.v.reduce(np.mean, 'x')) # needs more... # should check which extra dimensions are dropped @@ -187,7 +187,7 @@ def test_groupby(self): {'cell_methods': 'x: y: sum'}), 'abc': Array(['abc'], np.array(['a', 'b', 'c']))}), 'foo') self.assertDSArrayEqual(expected_sum_all, - grouped.collapse(np.sum, dimension=None)) + grouped.reduce(np.sum, dimension=None)) self.assertDSArrayEqual(expected_sum_all, grouped.sum(dimension=None)) expected_sum_axis1 = DatasetArray(Dataset( @@ -197,7 +197,7 @@ def test_groupby(self): {'cell_methods': 'y: sum'}), 'x': self.ds.variables['x'], 'abc': Array(['abc'], np.array(['a', 'b', 'c']))}), 'foo') - self.assertDSArrayEqual(expected_sum_axis1, grouped.collapse(np.sum)) + self.assertDSArrayEqual(expected_sum_axis1, grouped.reduce(np.sum)) self.assertDSArrayEqual(expected_sum_axis1, grouped.sum()) self.assertDSArrayEqual(self.dv, grouped.apply(identity)) @@ -233,10 +233,10 @@ def test_from_stack(self): self.ds['x']) self.assertDSArrayEqual(foo, stacked) - def test_intersection(self): + def test_align(self): self.ds['x'] = ('x', np.array(list('abcdefghij'))) with self.assertRaises(ValueError): self.dv + self.dv[:5] - dv1, dv2 = intersection(self.dv, self.dv[:5]) + dv1, dv2 = align(self.dv, self.dv[:5]) self.assertDSArrayEqual(dv1, self.dv[:5]) self.assertDSArrayEqual(dv2, self.dv[:5]) From 039bdd01a17b5410158f2a760f00598e217a421b Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Sun, 16 Feb 2014 12:44:43 -0800 Subject: [PATCH 40/45] Better tests for GroupBy; bug fixes --- src/xray/array_.py | 23 ++++++++++++++- src/xray/dataset_array.py | 58 +++++++++++++++++++++++--------------- src/xray/groupby.py | 31 +++++++++++--------- test/test_dataset_array.py | 20 +++++++++---- 4 files changed, 90 insertions(+), 42 deletions(-) diff --git a/src/xray/array_.py b/src/xray/array_.py index bbd298832c9..ac101152782 100644 --- a/src/xray/array_.py +++ b/src/xray/array_.py @@ -293,7 +293,6 @@ def transpose(self, *dimensions): data = self.data.transpose(*axes) return type(self)(dimensions, data, self.attributes) - # TODO: rename this method to 'reduce' def reduce(self, func, dimension=None, axis=None, **kwargs): """Reduce this array by 
applying `func` along some dimension(s) @@ -366,6 +365,28 @@ def _reduce(self, f, dim, **kwargs): return new_var def groupby(self, group_name, group_array, squeeze=True): + """Group this dataset by unique values of the indicated group + + Parameters + ---------- + group_name : str + Name of the group array. + group_array : Array + Array whose unique values should be used to group this array. + squeeze : boolean, optional + If "group" is a coordinate of this array, `squeeze` controls + whether the subarrays have a dimension of length 1 along that + coordinate or if the dimension is squeezed out. + + Returns + ------- + grouped : GroupBy + A `GroupBy` object patterned after `pandas.GroupBy` that can be + iterated over in the form of `(unique_value, grouped_array)` pairs + or over which grouped operations can be applied with the `apply` + and `reduce` methods (and the associated aliases `mean`, `sum`, + `std`, etc.). + """ return groupby.GroupBy(self, group_name, group_array, squeeze=squeeze) # TODO: remove this method (groupby encompasses its functionality) diff --git a/src/xray/dataset_array.py b/src/xray/dataset_array.py index dca35bc32a4..560d0fcab36 100644 --- a/src/xray/dataset_array.py +++ b/src/xray/dataset_array.py @@ -215,9 +215,14 @@ def refocus(self, new_var, name=None): """ if not hasattr(new_var, 'dimensions'): new_var = type(self.array)(self.array.dimensions, new_var) - ds = self.unselected() + if self.focus not in self.dimensions: + ds = self.unselected() + else: + ds = self.dataset if name is None: - name = self.focus + name = self.focus + '_' + print new_var + print name ds[name] = new_var return type(self)(ds, name) @@ -243,7 +248,27 @@ def iterator(self, dimension): yield (x, type(self)(ds, self.focus)) def groupby(self, group, squeeze=True): - # TODO: document this method + """Group this dataset by unique values of the indicated group + + Parameters + ---------- + group : str or DatasetArray + Array whose unique values should be used to group this array. If a + string, must be the name of a variable contained in this dataset. + squeeze : boolean, optional + If "group" is a coordinate of this array, `squeeze` controls + whether the subarrays have a dimension of length 1 along that + coordinate or if the dimension is squeezed out. + + Returns + ------- + grouped : GroupBy + A `GroupBy` object patterned after `pandas.GroupBy` that can be + iterated over in the form of `(unique_value, grouped_array)` pairs + or over which grouped operations can be applied with the `apply` + and `reduce` methods (and the associated aliases `mean`, `sum`, + `std`, etc.). + """ if isinstance(group, basestring): # merge in the group's dataset to allow group to be a virtual # variable in this dataset @@ -266,14 +291,14 @@ def transpose(self, *dimensions): Returns ------- transposed : DatasetArray - The returned DatasetArray's variable is transposed. + The returned DatasetArray's array is transposed. See Also -------- numpy.transpose Array.transpose """ - return self.refocus(self.array.transpose(*dimensions)) + return self.refocus(self.array.transpose(*dimensions), self.focus) def reduce(self, func, dimension=None, axis=None, **kwargs): """Reduce this array by applying `func` along some dimension(s) @@ -339,19 +364,7 @@ def aggregate(self, func, new_dim, **kwargs): aggregated : DatasetArray DatasetArray with aggregated data and the new dimension `new_dim`. 
""" - # TODO: remove this method (replaced by groupby) - if isinstance(new_dim, basestring): - new_dim = self.dataset[new_dim] - unique, aggregated = self.array.aggregate( - func, new_dim.focus, new_dim, **kwargs) - # TODO: add options for how to summarize variables along aggregated - # dimensions instead of just dropping them? - drop = {k for k, v in self.dataset.variables.iteritems() - if any(dim in new_dim.dimensions for dim in v.dimensions)} - ds = self.dataset.unselect(*drop) - ds[unique.dimensions[0]] = unique - ds[self.focus] = aggregated - return type(self)(ds, self.focus) + return self.groupby(new_dim).reduce(func, **kwargs) @classmethod def from_stack(cls, arrays, dimension='stacked_dimension', @@ -403,8 +416,10 @@ def from_stack(cls, arrays, dimension='stacked_dimension', for array in arrays: if isinstance(array, cls): unselected = array.unselected() - if dim_name in unselected: - unselected = unselected.unselect(dim_name) + drop = {k for k, v in unselected.variables.iteritems() + if k == dim_name or dim_name in v.dimensions} + if drop: + unselected = unselected.unselect(*drop) ds.merge(unselected, inplace=True) if focus is None: focus = array.focus @@ -429,8 +444,7 @@ def to_dataframe(self): return self.dataset.to_dataframe() def __array_wrap__(self, result): - return self.refocus(self.array.__array_wrap__(result), - self.focus + '_') + return self.refocus(self.array.__array_wrap__(result)) @staticmethod def _unary_op(f): diff --git a/src/xray/groupby.py b/src/xray/groupby.py index a1bf7469f0c..2d2db92fcf7 100644 --- a/src/xray/groupby.py +++ b/src/xray/groupby.py @@ -84,8 +84,8 @@ def __init__(self, array, group_name, group_coord, squeeze=True): else: # look through group_coord to find the unique values unique_values, group_indices = unique_value_groups(group_coord) - unique_coord = dataset.Dataset( - {group_name: (group_name, unique_values)})[group_name] + variables = {group_name: (group_name, unique_values)} + unique_coord = dataset.Dataset(variables)[group_name] self.group_indices = group_indices self.unique_coord = unique_coord @@ -129,7 +129,7 @@ def iter_arrays(self): for indices in self.group_indices: yield self.array.indexed_by(**{self.group_dim: indices}) - def apply(self, func, shortcut=True, **kwargs): + def apply(self, func, shortcut=False, **kwargs): """Apply a function over each array in the group and stack them together into a new array @@ -166,9 +166,8 @@ def apply(self, func, shortcut=True, **kwargs): applied : Array A new Array of the same type from which this grouping was created. """ - shortcut = kwargs.pop('shortcut', True) applied = (func(ar, **kwargs) for ar in (self.iter_fast() if shortcut - else self.iter_array())) + else self.iter_arrays())) # peek at applied to determine which coordinate to stack over applied_example, applied = peek_at(applied) @@ -196,7 +195,7 @@ def apply(self, func, shortcut=True, **kwargs): return stacked.transpose(*new_order) def reduce(self, func, dimension=Ellipsis, axis=Ellipsis, shortcut=True, - **kwargs): + **kwargs): """Reduce this variable by applying `func` along some dimension(s) Parameters @@ -210,8 +209,8 @@ def reduce(self, func, dimension=Ellipsis, axis=Ellipsis, shortcut=True, axis : int or sequence of int, optional Axis(es) over which to repeatedly apply `func`. Only one of the 'dimension' and 'axis' arguments can be supplied. If neither are - supplied, then the reduction is calculated over the flattened array - (by calling `func(x)` without an axis argument). 
+ supplied, then `{name}` is calculated over the axis of the variable + over which the group was formed. **kwargs : dict Additional keyword arguments passed on to `func`. @@ -221,6 +220,12 @@ def reduce(self, func, dimension=Ellipsis, axis=Ellipsis, shortcut=True, are converted into dimensions), then the reduce operation is performed repeatedly along each dimension in turn from left to right. + `Ellipsis` is used as a sentinel value for the default dimension and + axis to indicate that this operation is applied along the axis over + which the group was formed, instead of all axes. To instead apply + `{name}` simultaneously over all grouped values, use `dimension=None` + (or equivalently `axis=None`). + Returns ------- reduced : Array @@ -260,11 +265,11 @@ def reduce_array(ar): converted into dimensions), then `{name}` is performed repeatedly along each dimension in turn from left to right. - `Ellipsis` is used as the default dimension and axis for this method to - indicate that this operation is by default applied along the axis along - which the grouping variable lies. To instead apply `{name}` - simultaneously over all grouped values, use `dimension=None` (or - equivalently `axis=None`). + `Ellipsis` is used as a sentinel value for the default dimension and + axis to indicate that this operation is applied along the axis over + which the group was formed, instead of all axes. To instead apply + `{name}` simultaneously over all grouped values, use `dimension=None` + (or equivalently `axis=None`). Returns ------- diff --git a/test/test_dataset_array.py b/test/test_dataset_array.py index 1f3356cf20f..21771d302b3 100644 --- a/test/test_dataset_array.py +++ b/test/test_dataset_array.py @@ -95,6 +95,10 @@ def test_renamed(self): def test_refocus(self): self.assertVarEqual(self.dv, self.dv.refocus(self.v)) self.assertVarEqual(self.dv, self.dv.refocus(self.x)) + self.ds['x'] = ('x', np.array(list('abcdefghij'))) + self.assertVarEqual(self.dv.coordinates['x'], + self.dv['x'].refocus( + np.arange(10)).coordinates['x']) def test_dataset_getitem(self): dv = self.ds['foo'] @@ -171,12 +175,13 @@ def test_groupby(self): self.dv['y'] = 20 + 100 * self.ds['y'].array identity = lambda x: x - self.assertDSArrayEqual(self.dv, self.dv.groupby('x').apply(identity)) - self.assertDSArrayEqual(self.dv, self.dv.groupby('x', squeeze=False - ).apply(identity)) - self.assertDSArrayEqual(self.dv, self.dv.groupby('y').apply(identity)) - self.assertDSArrayEqual(self.dv, self.dv.groupby('y', squeeze=False - ).apply(identity)) + for g in ['x', 'y']: + for shortcut in [True, False]: + for squeeze in [True, False]: + expected = self.dv + actual = self.dv.groupby(g, squeeze=squeeze).apply( + identity, shortcut=shortcut) + self.assertDSArrayEqual(expected, actual) grouped = self.dv.groupby('abc') @@ -190,6 +195,9 @@ def test_groupby(self): grouped.reduce(np.sum, dimension=None)) self.assertDSArrayEqual(expected_sum_all, grouped.sum(dimension=None)) + grouped = self.dv.groupby('abc', squeeze=False) + self.assertDSArrayEqual(expected_sum_all, grouped.sum(dimension=None)) + expected_sum_axis1 = DatasetArray(Dataset( {'foo': Array(['x', 'abc'], np.array([self.x[:, :9].sum(1), self.x[:, 10:].sum(1), From fdba77f95e29bf4cb23da7ac1a7965b55e1820a3 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Sun, 16 Feb 2014 12:48:11 -0800 Subject: [PATCH 41/45] Removed aggregate and iteartor (they are replaced by groupby) --- src/xray/array_.py | 71 ++------------------------------------ src/xray/dataset.py | 32 ++++++++++------- 
src/xray/dataset_array.py | 51 ++------------------------- src/xray/groupby.py | 59 ++++++++++++++++++------------- test/test_array.py | 18 ---------- test/test_dataset.py | 4 +-- test/test_dataset_array.py | 31 +++++------------ 7 files changed, 70 insertions(+), 196 deletions(-) diff --git a/src/xray/array_.py b/src/xray/array_.py index ac101152782..6967079a7ac 100644 --- a/src/xray/array_.py +++ b/src/xray/array_.py @@ -32,29 +32,6 @@ def _as_compatible_data(data): return data -def unique_value_groups(ar): - """Group an array by its unique values - - Parameters - ---------- - ar : array_like - Input array. This will be flattened if it is not already 1-D. - - Returns - ------- - values : np.ndarray - Sorted, unique values as returned by `np.unique`. - indices : list of lists of int - Each element provides the integer indices in `ar` with values given by - the corresponding value in `unique_values`. - """ - values, inverse = np.unique(ar, return_inverse=True) - groups = [[] for _ in range(len(values))] - for n, g in enumerate(inverse): - groups[g].append(n) - return values, groups - - class Array(AbstractArray): """A netcdf-like variable consisting of dimensions, data and attributes which describe a single Array. A single Array object is not fully described @@ -387,52 +364,8 @@ def groupby(self, group_name, group_array, squeeze=True): and `reduce` methods (and the associated aliases `mean`, `sum`, `std`, etc.). """ - return groupby.GroupBy(self, group_name, group_array, squeeze=squeeze) - - # TODO: remove this method (groupby encompasses its functionality) - def aggregate(self, func, new_dim_name, group_by, **kwargs): - """Aggregate this variable by applying `func` to grouped elements - - Parameters - ---------- - func : function - Function which can be called in the form - `func(x, axis=axis, **kwargs)` to reduce an np.ndarray over an - integer valued axis. - new_dim_name : str or sequence of str, optional - Name of the new dimension to create. - group_by : Array - 1D variable which contains the values by which to group. - **kwargs : dict - Additional keyword arguments passed on to `func`. - - Returns - ------- - unique : Array - 1D variable of unique values in group, along the dimension given by - `new_dim_name`. - aggregated : Array - Array with aggregated data and the original dimension from - `group_by` replaced by `new_dim_name`. - """ - if group_by.ndim != 1: - # TODO: remove this limitation? 
- raise ValueError('group variables must be 1 dimensional') - dim = group_by.dimensions[0] - axis = self.dimensions.index(dim) - if group_by.size != self.shape[axis]: - raise ValueError('the group variable\'s length does not ' - 'match the length of this variable along its ' - 'dimension') - unique_values, group_indices = unique_value_groups(group_by.data) - aggregated = (self.indexed_by(**{dim: indices}).reduce( - func, dim, axis=None, **kwargs) - for indices in group_indices) - stacked = type(self).from_stack(aggregated, new_dim_name, - length=unique_values.size) - ordered_dims = [new_dim_name if d == dim else d for d in self.dimensions] - unique = type(self)([new_dim_name], unique_values) - return unique, stacked.transpose(*ordered_dims) + return groupby.ArrayGroupBy( + self, group_name, group_array, squeeze=squeeze) @classmethod def from_stack(cls, variables, dimension='stacked_dimension', diff --git a/src/xray/dataset.py b/src/xray/dataset.py index 85070e34cfa..a45f81ca433 100644 --- a/src/xray/dataset.py +++ b/src/xray/dataset.py @@ -8,6 +8,7 @@ import array_ as array import backends import conventions +import groupby import utils from dataset_array import DatasetArray from utils import FrozenOrderedDict, Frozen, remap_loc_indexers @@ -584,26 +585,31 @@ def replace(self, name, variable): ds[name] = variable return ds - def iterator(self, dimension): - """Iterate along a data dimension - - Returns an iterator yielding (coordinate, dataset) pairs for each - coordinate value along the specified dimension. + def groupby(self, group, squeeze=True): + """Group this dataset by unique values of the indicated group Parameters ---------- - dimension : string - The dimension along which to iterate. + group : str or DatasetArray + Array whose unique values should be used to group this array. If a + string, must be the name of a variable contained in this dataset. + squeeze : boolean, optional + If "group" is a coordinate of this array, `squeeze` controls + whether the subarrays have a dimension of length 1 along that + coordinate or if the dimension is squeezed out. Returns ------- - it : iterator - The returned iterator yields pairs of scalar-valued coordinate - variables and Dataset objects. + grouped : GroupBy + A `GroupBy` object patterned after `pandas.GroupBy` that can be + iterated over in the form of `(unique_value, grouped_array)` pairs. 
""" - coord = self.variables[dimension] - for i in xrange(self.dimensions[dimension]): - yield (coord[i], self.indexed_by(**{dimension: i})) + if isinstance(group, basestring): + # merge in the group's dataset to allow group to be a virtual + # variable in this dataset + ds = self.merge(self[group].dataset) + group = DatasetArray(ds, group) + return groupby.GroupBy(self, group.focus, group, squeeze=squeeze) def to_dataframe(self): """Convert this dataset into a pandas.DataFrame diff --git a/src/xray/dataset_array.py b/src/xray/dataset_array.py index 560d0fcab36..f2047595d97 100644 --- a/src/xray/dataset_array.py +++ b/src/xray/dataset_array.py @@ -216,37 +216,16 @@ def refocus(self, new_var, name=None): if not hasattr(new_var, 'dimensions'): new_var = type(self.array)(self.array.dimensions, new_var) if self.focus not in self.dimensions: + # only unselect the focus from the dataset if it isn't a coordinate + # variable ds = self.unselected() else: ds = self.dataset if name is None: name = self.focus + '_' - print new_var - print name ds[name] = new_var return type(self)(ds, name) - def iterator(self, dimension): - """Iterate along a data dimension - - Returns an iterator yielding (coordinate, dataview) pairs for each - coordinate value along the specified dimension. - - Parameters - ---------- - dimension : string - The dimension along which to iterate. - - Returns - ------- - it : iterator - The returned iterator yields pairs of scalar-valued coordinate - arrays and DatasetArray objects. - """ - # TODO: remove this method (replaced by groupby) - for (x, ds) in self.dataset.iterator(dimension): - yield (x, type(self)(ds, self.focus)) - def groupby(self, group, squeeze=True): """Group this dataset by unique values of the indicated group @@ -274,7 +253,7 @@ def groupby(self, group, squeeze=True): # variable in this dataset ds = self.dataset.merge(self.dataset[group].dataset) group = DatasetArray(ds, group) - return groupby.GroupBy(self, group.focus, group, squeeze=squeeze) + return groupby.ArrayGroupBy(self, group.focus, group, squeeze=squeeze) def transpose(self, *dimensions): """Return a new DatasetArray object with transposed dimensions @@ -342,30 +321,6 @@ def reduce(self, func, dimension=None, axis=None, **kwargs): ds[self.focus] = var return type(self)(ds, self.focus) - def aggregate(self, func, new_dim, **kwargs): - """Aggregate this array by applying `func` to grouped elements - - Parameters - ---------- - func : function - Function which can be called in the form - `func(x, axis=axis, **kwargs)` to reduce an np.ndarray over an - integer valued axis. - new_dim : str or DatasetArray - Name of a variable in this array's dataset or DatasetArray by which - to group variable elements. The dimension along which this variable - exists will be replaced by this name. The array must be one- - dimensional. - **kwargs : dict - Additional keyword arguments passed on to `func`. - - Returns - ------- - aggregated : DatasetArray - DatasetArray with aggregated data and the new dimension `new_dim`. 
- """ - return self.groupby(new_dim).reduce(func, **kwargs) - @classmethod def from_stack(cls, arrays, dimension='stacked_dimension', stacked_indexers=None, length=None, template=None): diff --git a/src/xray/groupby.py b/src/xray/groupby.py index 2d2db92fcf7..bab4ff33298 100644 --- a/src/xray/groupby.py +++ b/src/xray/groupby.py @@ -39,7 +39,7 @@ def peek_at(iterable): return peek, itertools.chain([peek], gen) -class GroupBy(ImplementsReduce): +class GroupBy(object): """A object that implements the split-apply-combine pattern Modeled after `pandas.GroupBy`. The `GroupBy` object can be iterated over @@ -52,29 +52,34 @@ class GroupBy(ImplementsReduce): Array.groupby DatasetArray.groupby """ - def __init__(self, array, group_name, group_coord, squeeze=True): + def __init__(self, obj, group_name, group_coord, squeeze=True): """See Array.groupby and DatasetArray.groupby """ if group_coord.ndim != 1: # TODO: remove this limitation? raise ValueError('`group_coord` must be 1 dimensional') - self.array = array + self.obj = obj self.group_coord = group_coord self.group_dim, = group_coord.dimensions - self.group_axis = array.dimensions.index(self.group_dim) - if group_coord.size != array.shape[self.group_axis]: + dimensions = obj.dimensions + try: + expected_size = dimensions[self.group_dim] + except TypeError: + expected_size = obj.shape[obj.dimensions.index(self.group_dim)] + + if group_coord.size != expected_size: raise ValueError('the group variable\'s length does not ' 'match the length of this variable along its ' 'dimension') - if group_name in array.dimensions: + if group_name in obj.dimensions: # assume that group_coord already has sorted, unique values if group_coord.dimensions != (group_name,): raise ValueError('`group_coord` is required to be a coordinate ' 'variable along the `group_name` dimension ' - 'if `group_name` is a dimension in `array`') + 'if `group_name` is a dimension in `obj`') group_indices = np.arange(group_coord.size) if not squeeze: # group_indices = group_indices.reshape(-1, 1) @@ -102,33 +107,39 @@ def __len__(self): return self.unique_coord.size def __iter__(self): - return itertools.izip(self.unique_coord, self.iter_arrays()) + return itertools.izip(self.unique_coord, self.iter_indexed()) + + def iter_indexed(self): + for indices in self.group_indices: + yield self.obj.indexed_by(**{self.group_dim: indices}) + - def iter_fast(self): +class ArrayGroupBy(GroupBy, ImplementsReduce): + def iter_shortcut(self): + """Fast version of `iter_groups` that yields Arrays without metadata + """ # extract the underlying Array object - array = self.array - if hasattr(self.array, 'array'): + array = self.obj + if hasattr(array, 'array'): array = array.array + group_axis = array.dimensions.index(self.group_dim) + # build the new dimensions index_int = isinstance(self.group_indices[0], int) if index_int: dims = tuple(d for n, d in enumerate(array.dimensions) - if n != self.group_axis) + if n != group_axis) else: dims = array.dimensions # slice the data and build the new Arrays directly for indices in self.group_indices: - indexer = tuple(indices if n == self.group_axis else slice(None) + indexer = tuple(indices if n == group_axis else slice(None) for n in range(array.ndim)) data = array.data[indexer] yield array_.Array(dims, data) - def iter_arrays(self): - for indices in self.group_indices: - yield self.array.indexed_by(**{self.group_dim: indices}) - def apply(self, func, shortcut=False, **kwargs): """Apply a function over each array in the group and stack them together into a new 
array @@ -166,8 +177,8 @@ def apply(self, func, shortcut=False, **kwargs): applied : Array A new Array of the same type from which this grouping was created. """ - applied = (func(ar, **kwargs) for ar in (self.iter_fast() if shortcut - else self.iter_arrays())) + applied = (func(ar, **kwargs) for ar in (self.iter_shortcut() if shortcut + else self.iter_indexed())) # peek at applied to determine which coordinate to stack over applied_example, applied = peek_at(applied) @@ -178,16 +189,16 @@ def apply(self, func, shortcut=False, **kwargs): stack_coord = self.unique_coord indexers = np.arange(self.unique_coord.size) - from_stack_kwargs = {'template': self.array} if shortcut else {} - stacked = type(self.array).from_stack(applied, stack_coord, indexers, - **from_stack_kwargs) + from_stack_kwargs = {'template': self.obj} if shortcut else {} + stacked = type(self.obj).from_stack(applied, stack_coord, indexers, + **from_stack_kwargs) # now, reorder the stacked array's dimensions so that those that # appeared in the original array appear in the same order they did # originally stack_dim, = stack_coord.dimensions original_dims = [stack_dim if d == self.group_dim else d - for d in self.array.dimensions + for d in self.obj.dimensions if d in stacked.dimensions or d == self.group_dim] iter_original_dims = iter(original_dims) new_order = [iter_original_dims.next() if d in original_dims else d @@ -282,4 +293,4 @@ def reduce_array(ar): _reduce_axis_default = Ellipsis -inject_reduce_methods(GroupBy) +inject_reduce_methods(ArrayGroupBy) diff --git a/test/test_array.py b/test/test_array.py index 720553e5c1c..224a71af4ae 100644 --- a/test/test_array.py +++ b/test/test_array.py @@ -238,24 +238,6 @@ def test_groupby(self): self.assertVarEqual(ke, ka) self.assertVarEqual(ve, va) - def test_aggregate(self): - agg_var = Array(['y'], np.array(['a', 'a', 'b'])) - v = Array(['x', 'y'], self.d) - expected_unique = Array(['abc'], np.array(['a', 'b'])) - expected_aggregated = Array(['x', 'abc'], - np.array([self.d[:, :2].sum(axis=1), - self.d[:, 2:].sum(axis=1)]).T, - {'cell_methods': 'y: sum'}) - actual_unique, actual_aggregated = v.aggregate(np.sum, 'abc', agg_var) - self.assertVarEqual(expected_unique, actual_unique) - self.assertVarEqual(expected_aggregated, actual_aggregated) - # should be equivalent to aggregate by a dataset array, too - alt_agg_var = Dataset({'abc': agg_var})['abc'] - actual_unique, actual_aggregated = v.aggregate(np.sum, 'abc', - alt_agg_var) - self.assertVarEqual(expected_unique, actual_unique) - self.assertVarEqual(expected_aggregated, actual_aggregated) - def test_from_stack(self): x = np.arange(5) y = np.ones(5) diff --git a/test/test_dataset.py b/test/test_dataset.py index fe1e341b497..29ff0a54701 100644 --- a/test/test_dataset.py +++ b/test/test_dataset.py @@ -50,9 +50,9 @@ def test_init(self): with self.assertRaisesRegexp(ValueError, 'must be defined with 1-d'): Dataset({'a': var1, 'x': var3}) - def test_iterator(self): + def test_groupby(self): data = create_test_data(self.get_store()) - for n, (t, sub) in enumerate(list(data.iterator('dim1'))[:3]): + for n, (t, sub) in enumerate(list(data.groupby('dim1'))[:3]): self.assertEqual(data['dim1'][n], t) self.assertVarEqual(data['var1'][n], sub['var1']) self.assertVarEqual(data['var2'][n], sub['var2']) diff --git a/test/test_dataset_array.py b/test/test_dataset_array.py index 21771d302b3..e0e42ce894b 100644 --- a/test/test_dataset_array.py +++ b/test/test_dataset_array.py @@ -56,14 +56,6 @@ def test_items(self): self.assertEqual(self.dv[0, 
0].dataset, Dataset({'foo': self.dv.array[0, 0]})) - def test_iteration(self): - for ((act_x, act_dv), (exp_x, exp_ds)) in \ - zip(self.dv.iterator('y'), self.ds.iterator('y')): - self.assertVarEqual(exp_x, act_x) - self.assertDSArrayEqual(DatasetArray(exp_ds, 'foo'), act_dv) - for ((_, exp_dv), act_dv) in zip(self.dv.iterator('x'), self.dv): - self.assertDSArrayEqual(exp_dv, act_dv) - def test_indexed_by(self): self.assertEqual(self.dv[0].dataset, self.ds.indexed_by(x=0)) self.assertEqual(self.dv[:3, :5].dataset, @@ -169,6 +161,14 @@ def test_reduce(self): # needs more... # should check which extra dimensions are dropped + def test_groupby_iter(self): + for ((act_x, act_dv), (exp_x, exp_ds)) in \ + zip(self.dv.groupby('y'), self.ds.groupby('y')): + self.assertVarEqual(exp_x, act_x) + self.assertDSArrayEqual(DatasetArray(exp_ds, 'foo'), act_dv) + for ((_, exp_dv), act_dv) in zip(self.dv.groupby('x'), self.dv): + self.assertDSArrayEqual(exp_dv, act_dv) + def test_groupby(self): agg_var = Array(['y'], np.array(['a'] * 9 + ['c'] + ['b'] * 10)) self.dv['abc'] = agg_var @@ -210,19 +210,6 @@ def test_groupby(self): self.assertDSArrayEqual(self.dv, grouped.apply(identity)) - def test_aggregate(self): - agg_var = Array('y', np.array(['a'] * 9 + ['c'] + ['b'] * 10)) - self.ds['abc'] = agg_var - expected_unique, expected_var = \ - self.dv.array.aggregate(np.mean, 'abc', agg_var) - expected = DatasetArray(Dataset( - {'foo': expected_var, 'x': self.ds.variables['x'], - 'abc': expected_unique}), 'foo') - actual = self.dv.aggregate(np.mean, 'abc') - self.assertDSArrayEqual(expected, actual) - actual = self.dv.aggregate(np.mean, self.ds['abc']) - self.assertDSArrayEqual(expected, actual) - def test_from_stack(self): self.ds['bar'] = Array(['x', 'y'], np.random.randn(10, 20)) foo = self.ds['foo'] @@ -237,7 +224,7 @@ def test_from_stack(self): DatasetArray.from_stack([foo.array, bar.array], 'w')) # from iteration: - stacked = DatasetArray.from_stack((v for _, v in foo.iterator('x')), + stacked = DatasetArray.from_stack((v for _, v in foo.groupby('x')), self.ds['x']) self.assertDSArrayEqual(foo, stacked) From 1ab3f4d1e9d74b2912915a2b28fe488a40ec6db4 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Wed, 19 Feb 2014 10:48:54 -0800 Subject: [PATCH 42/45] to_dataframe() no longer creates a large empty array --- src/xray/dataset.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/xray/dataset.py b/src/xray/dataset.py index a45f81ca433..a0f46a635e1 100644 --- a/src/xray/dataset.py +++ b/src/xray/dataset.py @@ -622,8 +622,12 @@ def to_dataframe(self): columns = self.noncoordinates.keys() data = [] # we need a template to broadcast all dataset variables against - template = array.Array(self.dimensions.keys(), - np.empty(self.dimensions.values())) + # using stride_tricks lets us make the ndarray for broadcasting without + # having to allocate memory + shape = tuple(self.dimensions.values()) + empty_data = np.lib.stride_tricks.as_strided(np.array(0), shape=shape, + strides=[0] * len(shape)) + template = array.Array(self.dimensions.keys(), empty_data) for k in columns: _, var = array.broadcast_variables(template, self[k]) _, var_data = np.broadcast_arrays(template.data, var.data) From d8abfd3033497acf689ebd38bb97e20e8c53f8a8 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Thu, 20 Feb 2014 08:08:47 -0800 Subject: [PATCH 43/45] added DatasetArray.to_series() method --- src/xray/dataset_array.py | 12 ++++++++++++ test/test_dataset_array.py | 7 +++++++ 2 files changed, 19 insertions(+) 
diff --git a/src/xray/dataset_array.py b/src/xray/dataset_array.py
index f2047595d97..626dea6efa9 100644
--- a/src/xray/dataset_array.py
+++ b/src/xray/dataset_array.py
@@ -5,6 +5,7 @@
 from collections import OrderedDict
 
 import numpy as np
+import pandas as pd
 
 import array_
 import dataset as dataset_
@@ -398,6 +399,17 @@ def to_dataframe(self):
         """
         return self.dataset.to_dataframe()
 
+    def to_series(self):
+        """Convert this array into a pandas.Series
+
+        The Series is indexed by the Cartesian product of the coordinates.
+        Unlike `to_dataframe`, only the variable at the focus of this array is
+        included in the returned series.
+        """
+        index = pd.MultiIndex.from_product(self.coordinates.values(),
+                                           names=self.coordinates.keys())
+        return pd.Series(self.data.reshape(-1), index=index, name=self.focus)
+
     def __array_wrap__(self, result):
         return self.refocus(self.array.__array_wrap__(result))
 
     @staticmethod
     def _unary_op(f):
diff --git a/test/test_dataset_array.py b/test/test_dataset_array.py
index e0e42ce894b..90428e412cc 100644
--- a/test/test_dataset_array.py
+++ b/test/test_dataset_array.py
@@ -235,3 +235,10 @@ def test_align(self):
         dv1, dv2 = align(self.dv, self.dv[:5])
         self.assertDSArrayEqual(dv1, self.dv[:5])
         self.assertDSArrayEqual(dv2, self.dv[:5])
+
+    def test_to_series(self):
+        expected = self.dv.to_dataframe()['foo']
+        actual = self.dv.to_series()
+        self.assertNDArrayEqual(expected.values, actual.values)
+        self.assertNDArrayEqual(expected.index.values, actual.index.values)
+        self.assertEqual('foo', actual.name)

From 3f5bea2ad618194b99bfabe289783d8b98159cd6 Mon Sep 17 00:00:00 2001
From: Stephan Hoyer
Date: Thu, 20 Feb 2014 08:09:54 -0800
Subject: [PATCH 44/45] added unused context argument to __array_wrap__

---
 src/xray/array_.py        | 4 ++--
 src/xray/dataset_array.py | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/xray/array_.py b/src/xray/array_.py
index 6967079a7ac..874b46242cf 100644
--- a/src/xray/array_.py
+++ b/src/xray/array_.py
@@ -459,8 +459,8 @@ def from_stack(cls, variables, dimension='stacked_dimension',
 
         return stacked
 
-    def __array_wrap__(self, result):
-        return type(self)(self.dimensions, result, self.attributes)
+    def __array_wrap__(self, obj, context=None):
+        return type(self)(self.dimensions, obj, self.attributes)
 
     @staticmethod
     def _unary_op(f):
diff --git a/src/xray/dataset_array.py b/src/xray/dataset_array.py
index 626dea6efa9..84d4daf7f46 100644
--- a/src/xray/dataset_array.py
+++ b/src/xray/dataset_array.py
@@ -410,8 +410,8 @@ def to_series(self):
                                            names=self.coordinates.keys())
         return pd.Series(self.data.reshape(-1), index=index, name=self.focus)
 
-    def __array_wrap__(self, result):
-        return self.refocus(self.array.__array_wrap__(result))
+    def __array_wrap__(self, obj, context=None):
+        return self.refocus(self.array.__array_wrap__(obj, context))
 
     @staticmethod
     def _unary_op(f):

From 9488463c3388fbda04419208a794ef2f6ff49959 Mon Sep 17 00:00:00 2001
From: akleeman
Date: Thu, 20 Feb 2014 16:12:58 -0800
Subject: [PATCH 45/45] Update setup.py

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index f80c7269616..18b124cc994 100644
--- a/setup.py
+++ b/setup.py
@@ -8,7 +8,7 @@
 setup(name='xray',
       version='0.1-dev',
       description='Objects for holding self describing scientific data in python',
-      author='Stephan Hoyer, Alex Kleeman',
+      author='Stephan Hoyer, Alex Kleeman, Eugene Brevdo',
      author_email='TODO',
      install_requires=['scipy >= 0.10.0', 'numpy >= 1.8', 'netCDF4 >= 1.0.6',
                        'pandas >= 0.13.1'],
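
Usage sketch for the API as it stands after this series: the patches above rename `collapse` to `reduce` and `intersection` to `align`, fold `aggregate` and `iterator` into `groupby`, and add `DatasetArray.to_series()`. The snippet below is illustrative only and is not part of any patch; the constructor form and the injected `mean`/`sum` shorthands are assumptions based on the tests in these patches.

import numpy as np
from xray import Dataset, Array

# Hypothetical example data; the names ('x', 'y', 'abc', 'foo') mirror the tests above.
ds = Dataset({'x': Array(['x'], np.arange(10)),
              'y': Array(['y'], np.arange(20)),
              'abc': Array(['y'], np.array(['a'] * 9 + ['c'] + ['b'] * 10)),
              'foo': Array(['x', 'y'], np.random.randn(10, 20))})
foo = ds['foo']                # indexing a Dataset yields a DatasetArray

foo.reduce(np.std, 'x')        # formerly foo.collapse(np.std, 'x')
foo.mean('x')                  # injected shorthand for reduce(np.mean, 'x')
foo.groupby('abc').sum()       # split-apply-combine; replaces aggregate()
foo.to_series()                # pandas.Series indexed by the coordinate product

The to_dataframe() change in PATCH 42 builds its broadcasting template from a zero-stride view, which has the full broadcast shape without allocating a matching block of memory. Roughly (shape values here are only an example):

shape = (10, 20)
template_data = np.lib.stride_tricks.as_strided(np.array(0), shape=shape,
                                                strides=[0] * len(shape))
# template_data.shape == (10, 20), but every element aliases the single scalar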