From 54c17ccc8569b3aa45e4c5a9d94d311a958de24e Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Tue, 19 Mar 2019 21:24:47 -0400 Subject: [PATCH 1/5] pass asclass so that TObjArrays can contain custom classes --- uproot/rootio.py | 2 +- uproot/version.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/uproot/rootio.py b/uproot/rootio.py index 60e303eb..7da5afed 100644 --- a/uproot/rootio.py +++ b/uproot/rootio.py @@ -666,7 +666,7 @@ def _defineclasses(streamerinfos, classes): if isinstance(streamerinfo, TStreamerInfo) and pyclassname not in builtin_classes and (pyclassname not in classes or hasattr(classes[pyclassname], "_versions")): code = [" @classmethod", - " def _readinto(cls, self, source, cursor, context, parent):", + " def _readinto(cls, self, source, cursor, context, parent, asclass=None):", " start, cnt, classversion = _startcheck(source, cursor)", " if cls._classversion != classversion:", " cursor.index = start", diff --git a/uproot/version.py b/uproot/version.py index 217273e7..b5741a0e 100644 --- a/uproot/version.py +++ b/uproot/version.py @@ -30,7 +30,7 @@ import re -__version__ = "3.4.15" +__version__ = "3.4.16" version = __version__ version_info = tuple(re.split(r"[-\.]", __version__)) From dba5d71f9bb30ee716e8e3575c37ec0a94924bfe Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Wed, 27 Mar 2019 06:49:25 -0500 Subject: [PATCH 2/5] [skip ci] fixing iterate over Pandas bug, adding new interfaces for it --- uproot/__init__.py | 4 ++++ uproot/_connect/to_pandas.py | 19 +++++++++++++++++-- uproot/tree.py | 14 +++++++++++--- 3 files changed, 32 insertions(+), 5 deletions(-) diff --git a/uproot/__init__.py b/uproot/__init__.py index 861cfb90..856828a3 100644 --- a/uproot/__init__.py +++ b/uproot/__init__.py @@ -143,6 +143,10 @@ from uproot.write.TFile import TFileRecreate as recreate from uproot.write.TFile import TFileUpdate as update +# import uproot._connect.to_pandas +# pandas = uproot._connect.to_pandas.Pandas +# del 
uproot._connect.to_pandas + from uproot.source.memmap import MemmapSource from uproot.source.file import FileSource from uproot.source.xrootd import XRootDSource diff --git a/uproot/_connect/to_pandas.py b/uproot/_connect/to_pandas.py index 3cbc7490..7f007593 100644 --- a/uproot/_connect/to_pandas.py +++ b/uproot/_connect/to_pandas.py @@ -36,19 +36,34 @@ import awkward as awkwardbase +import uproot.tree import uproot.interp.numerical from uproot.interp.jagged import asjagged from uproot.interp.numerical import asdtype from uproot.interp.objects import asobj from uproot.interp.objects import astable +from uproot.source.memmap import MemmapSource +from uproot.source.xrootd import XRootDSource +from uproot.source.http import HTTPSource + +class Pandas(object): + @staticmethod + def iterate(path, treepath, branches=None, entrysteps=None, namedecode="utf-8", reportpath=False, reportfile=False, flatten=True, flatname=None, awkwardlib=None, cache=None, basketcache=None, keycache=None, executor=None, blocking=True, localsource=MemmapSource.defaults, xrootdsource=XRootDSource.defaults, httpsource=HTTPSource.defaults, **options): + import pandas + return uproot.tree.iterate(path, treepath, branches=branches, entrysteps=entrysteps, outputtype=pandas.DataFrame, namedecode=namedecode, reportpath=reportpath, reportfile=reportfile, reportentries=False, flatten=flatten, flatname=flatname, awkwardlib=awkwardlib, cache=cache, basketcache=basketcache, keycache=keycache, executor=executor, blocking=blocking, localsource=localsource, xrootdsource=xrootdsource, httpsource=httpsource, **options) + class TTreeMethods_pandas(object): def __init__(self, tree): self._tree = tree - def df(self, branches=None, namedecode="utf-8", entrystart=None, entrystop=None, flatten=True, flatname=None, cache=None, basketcache=None, keycache=None, executor=None, blocking=True): + def df(self, branches=None, namedecode="utf-8", entrystart=None, entrystop=None, flatten=True, flatname=None, awkwardlib=None, 
cache=None, basketcache=None, keycache=None, executor=None, blocking=True): + import pandas + return self._tree.arrays(branches=branches, outputtype=pandas.DataFrame, namedecode=namedecode, entrystart=entrystart, entrystop=entrystop, flatten=flatten, flatname=flatname, awkwardlib=awkwardlib, cache=cache, basketcache=basketcache, keycache=keycache, executor=executor, blocking=blocking) + + def iterate(self, branches=None, entrysteps=None, namedecode="utf-8", entrystart=None, entrystop=None, flatten=True, flatname=None, awkwardlib=None, cache=None, basketcache=None, keycache=None, executor=None, blocking=True): import pandas - return self._tree.arrays(branches=branches, outputtype=pandas.DataFrame, namedecode=namedecode, entrystart=entrystart, entrystop=entrystop, flatten=flatten, flatname=flatname, cache=cache, basketcache=basketcache, keycache=keycache, executor=executor, blocking=blocking) + return self._tree.iterate(branches=branches, entrysteps=entrysteps, outputtype=pandas.DataFrame, namedecode=namedecode, reportentries=False, entrystart=entrystart, entrystop=entrystop, flatten=flatten, flatname=flatname, awkwardlib=awkwardlib, cache=cache, basketcache=basketcache, keycache=keycache, executor=executor, blocking=blocking) def default_flatname(branchname, fieldname, index): out = branchname diff --git a/uproot/tree.py b/uproot/tree.py index 49e11082..36fdccc1 100644 --- a/uproot/tree.py +++ b/uproot/tree.py @@ -519,6 +519,9 @@ def lazyarrays(self, branches=None, outputtype=dict, namedecode=None, limitbytes def iterate(self, branches=None, entrysteps=None, outputtype=dict, namedecode=None, reportentries=False, entrystart=None, entrystop=None, flatten=False, flatname=None, awkwardlib=None, cache=None, basketcache=None, keycache=None, executor=None, blocking=True): entrystart, entrystop = self._normalize_entrystartstop(entrystart, entrystop) + # for the case of outputtype == pandas.DataFrame, do some preparation to fill DataFrames efficiently + ispandas = 
getattr(outputtype, "__name__", None) == "DataFrame" and getattr(outputtype, "__module__", None) == "pandas.core.frame" + if entrysteps is None: entrysteps = self.clusters(branches, entrystart=entrystart, entrystop=entrystop, strict=False) @@ -561,7 +564,7 @@ def evaluate(branch, interpretation, future, past, cachekey, pythonize): if cache is not None: cache[cachekey] = out if flatten and isinstance(interpretation, asjagged): - return out.content + return out.flatten() elif pythonize: return list(out) else: @@ -571,15 +574,20 @@ def evaluate(branch, interpretation, future, past, cachekey, pythonize): outputtype = namedtuple("Arrays", [codecs.ascii_decode(branch.name, "replace")[0] if namedecode is None else branch.name.decode(namedecode) for branch, interpretation in branches]) def wrap_for_python_scope(futures, start, stop): return lambda: outputtype(*[evaluate(branch, interpretation, future, past, cachekey, False) for branch, interpretation, future, past, cachekey in futures]) - elif getattr(outputtype, "__name__", None) == "DataFrame" and getattr(outputtype, "__module__", None) == "pandas.core.frame": + + elif ispandas: + import uproot._connect.to_pandas def wrap_for_python_scope(futures, start, stop): - return lambda: outputtype(data=OrderedDict((branch.name if namedecode is None else branch.name.decode(namedecode), evaluate(branch, interpretation, future, past, cachekey, isinstance(interpretation, asjagged))) for branch, interpretation, future, past, cachekey in futures), index=awkward.numpy.arange(start, stop)) + return lambda: uproot._connect.to_pandas.futures2df([(branch.name, interpretation, lambda: interpretation.finalize(future(), branch)) for branch, interpretation, future, past, cachekey in futures], outputtype, start, stop, flatten, flatname, awkward) + elif isinstance(outputtype, type) and issubclass(outputtype, dict): def wrap_for_python_scope(futures, start, stop): return lambda: outputtype((branch.name if namedecode is None else 
branch.name.decode(namedecode), evaluate(branch, interpretation, future, past, cachekey, False)) for branch, interpretation, future, past, cachekey in futures) + elif isinstance(outputtype, type) and issubclass(outputtype, (list, tuple)): def wrap_for_python_scope(futures, start, stop): return lambda: outputtype(evaluate(branch, interpretation, future, past, cachekey, False) for branch, interpretation, future, past, cachekey in futures) + else: def wrap_for_python_scope(futures, start, stop): return lambda: outputtype(*[evaluate(branch, interpretation, future, past, cachekey, False) for branch, interpretation, future, past, cachekey in futures]) From 06913b15b3b45fb9260dedf51d25bc1530b3568d Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Wed, 27 Mar 2019 08:09:24 -0500 Subject: [PATCH 3/5] [skip ci] tree.arrays and tree.iterate are good with outputtype=DataFrame; tree.df and tree.iterate are good shortcuts; uproot.pandas.iterate doesn't work yet --- uproot/__init__.py | 24 ++++++++++++++++++------ uproot/_connect/to_pandas.py | 13 +++++-------- uproot/tree.py | 4 +++- 3 files changed, 26 insertions(+), 15 deletions(-) diff --git a/uproot/__init__.py b/uproot/__init__.py index 856828a3..c6a3cdef 100644 --- a/uproot/__init__.py +++ b/uproot/__init__.py @@ -143,10 +143,6 @@ from uproot.write.TFile import TFileRecreate as recreate from uproot.write.TFile import TFileUpdate as update -# import uproot._connect.to_pandas -# pandas = uproot._connect.to_pandas.Pandas -# del uproot._connect.to_pandas - from uproot.source.memmap import MemmapSource from uproot.source.file import FileSource from uproot.source.xrootd import XRootDSource @@ -168,6 +164,23 @@ from uproot.interp.objects import STLString asdebug = asjagged(asdtype("u1")) +from uproot.source.memmap import MemmapSource +from uproot.source.xrootd import XRootDSource +from uproot.source.http import HTTPSource +def iterate(path, treepath, branches=None, entrysteps=None, namedecode="utf-8", reportpath=False, 
reportfile=False, flatten=True, flatname=None, awkwardlib=None, cache=None, basketcache=None, keycache=None, executor=None, blocking=True, localsource=MemmapSource.defaults, xrootdsource=XRootDSource.defaults, httpsource=HTTPSource.defaults, **options): + import pandas + import uproot.tree + return uproot.tree.iterate(path, treepath, branches=branches, entrysteps=entrysteps, outputtype=pandas.DataFrame, namedecode=namedecode, reportpath=reportpath, reportfile=reportfile, reportentries=False, flatten=flatten, flatname=flatname, awkwardlib=awkwardlib, cache=cache, basketcache=basketcache, keycache=keycache, executor=executor, blocking=blocking, localsource=localsource, xrootdsource=xrootdsource, httpsource=httpsource, **options) + +from types import ModuleType +pandas = ModuleType("uproot.pandas") +pandas.iterate = iterate +del ModuleType +del iterate +del MemmapSource +del XRootDSource +del HTTPSource + # put help strings on everything (they're long, too disruptive to intersperse # in the code, and are built programmatically to avoid duplication; Python's # inline docstring method doesn't accept non-literals) @@ -176,7 +189,6 @@ # convenient access to the version number from uproot.version import __version__ -# don't expose uproot.uproot; it's ugly del uproot -__all__ = ["open", "xrootd", "http", "iterate", "numentries", "lazyarray", "lazyarrays", "daskarray", "daskarrays", "daskframe", "create", "recreate", "update", "MemmapSource", "FileSource", "XRootDSource", "HTTPSource", "interpret", "asdtype", "asarray", "asdouble32", "asstlbitset", "asjagged", "astable", "asobj", "asgenobj", "asstring", "asdebug", "SimpleArray", "STLVector", "STLMap", "STLString", "__version__"] +__all__ = ["open", "xrootd", "http", "iterate", "numentries", "lazyarray", "lazyarrays", "daskarray", "daskarrays", "daskframe", "create", "recreate", "update", "MemmapSource", "FileSource", "XRootDSource", "HTTPSource", "interpret", "asdtype", "asarray", "asdouble32", "asstlbitset", "asjagged", 
"astable", "asobj", "asgenobj", "asstring", "asdebug", "SimpleArray", "STLVector", "STLMap", "STLString", "pandas", "__version__"] diff --git a/uproot/_connect/to_pandas.py b/uproot/_connect/to_pandas.py index 7f007593..1c65357d 100644 --- a/uproot/_connect/to_pandas.py +++ b/uproot/_connect/to_pandas.py @@ -47,12 +47,6 @@ from uproot.source.xrootd import XRootDSource from uproot.source.http import HTTPSource -class Pandas(object): - @staticmethod - def iterate(path, treepath, branches=None, entrysteps=None, namedecode="utf-8", reportpath=False, reportfile=False, flatten=True, flatname=None, awkwardlib=None, cache=None, basketcache=None, keycache=None, executor=None, blocking=True, localsource=MemmapSource.defaults, xrootdsource=XRootDSource.defaults, httpsource=HTTPSource.defaults, **options): - import pandas - return uproot.tree.iterate(path, treepath, branches=branches, entrysteps=entrysteps, outputtype=pandas.DataFrame, namedecode=namedecode, reportpath=reportpath, reportfile=reportfile, reportentries=False, flatten=flatten, flatname=flatname, awkwardlib=awkwardlib, cache=cache, basketcache=basketcache, keycache=keycache, executor=executor, blocking=blocking, localsource=localsource, xrootdsource=xrootdsource, httpsource=httpsource, **options) - class TTreeMethods_pandas(object): def __init__(self, tree): self._tree = tree @@ -146,7 +140,7 @@ def futures2df(futures, outputtype, entrystart, entrystop, flatten, flatname, aw interpretation = interpretation.content # justifies the assumption that array.content == array.flatten() and array.stops.max() == array.stops[-1] - assert array._canuseoffset() and len(array.starts) > 0 and array.starts[0] == 0 + assert array._canuseoffset() and (len(array.starts) == 0 or array.starts[0] == 0) if starts is None: starts = array.starts @@ -156,7 +150,10 @@ def futures2df(futures, outputtype, entrystart, entrystop, flatten, flatname, aw if starts is not array.starts and not awkward.numpy.array_equal(starts, array.starts): raise 
ValueError("cannot use flatten=True on branches with different jagged structure, such as electrons and muons (different, variable number of each per event); either explicitly select compatible branches, such as [\"MET_*\", \"Muon_*\"] (scalar and variable per event is okay), or set flatten=False") - array = array.content + if len(array.starts) == 0: + array = array.content[0:0] + else: + array = array.content needbroadcasts.append(False) else: diff --git a/uproot/tree.py b/uproot/tree.py index 36fdccc1..d9e4898b 100644 --- a/uproot/tree.py +++ b/uproot/tree.py @@ -578,7 +578,9 @@ def wrap_for_python_scope(futures, start, stop): elif ispandas: import uproot._connect.to_pandas def wrap_for_python_scope(futures, start, stop): - return lambda: uproot._connect.to_pandas.futures2df([(branch.name, interpretation, lambda: interpretation.finalize(future(), branch)) for branch, interpretation, future, past, cachekey in futures], outputtype, start, stop, flatten, flatname, awkward) + def wrap_again(branch, interpretation, future): + return lambda: interpretation.finalize(future(), branch) + return lambda: uproot._connect.to_pandas.futures2df([(branch.name, interpretation, wrap_again(branch, interpretation, future)) for branch, interpretation, future, past, cachekey in futures], outputtype, start, stop, flatten, flatname, awkward) elif isinstance(outputtype, type) and issubclass(outputtype, dict): def wrap_for_python_scope(futures, start, stop): From a4424237188996b808c63657d3ee9d46515f197b Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Wed, 27 Mar 2019 08:58:41 -0500 Subject: [PATCH 4/5] uproot.pandas.iterate works, including globalentrystarts --- uproot/tree.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/uproot/tree.py b/uproot/tree.py index d9e4898b..c6a6f581 100644 --- a/uproot/tree.py +++ b/uproot/tree.py @@ -115,8 +115,15 @@ def iterate(path, treepath, branches=None, entrysteps=None, outputtype=dict, nam for tree, newbranches, 
globalentrystart, thispath, thisfile in _iterate(path, treepath, branches, awkward, localsource, xrootdsource, httpsource, **options): for start, stop, arrays in tree.iterate(branches=newbranches, entrysteps=entrysteps, outputtype=outputtype, namedecode=namedecode, reportentries=True, entrystart=0, entrystop=tree.numentries, flatten=flatten, flatname=flatname, awkwardlib=awkward, cache=cache, basketcache=basketcache, keycache=keycache, executor=executor, blocking=blocking): if getattr(outputtype, "__name__", None) == "DataFrame" and getattr(outputtype, "__module__", None) == "pandas.core.frame": - index = awkward.numpy.frombuffer(arrays.index.data, dtype=arrays.index.dtype) - awkward.numpy.add(index, globalentrystart, index) + if type(arrays.index).__name__ == "MultiIndex": + index = arrays.index.levels[0].to_numpy() + awkward.numpy.add(index, globalentrystart, out=index) + elif type(arrays.index).__name__ == "RangeIndex": + arrays.index._start += globalentrystart + arrays.index._stop += globalentrystart + else: + index = arrays.index.to_numpy() + awkward.numpy.add(index, globalentrystart, out=index) out = (arrays,) if reportentries: out = (globalentrystart + start, globalentrystart + stop) + out From 922cd1c75b4d388420965ac2a3e61a3aa7186c45 Mon Sep 17 00:00:00 2001 From: Jim Pivarski Date: Wed, 27 Mar 2019 10:15:55 -0500 Subject: [PATCH 5/5] add uproot.pandas.iterate in a way that isn't overly clever --- uproot/__init__.py | 18 ++---------------- uproot/pandas.py | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 16 deletions(-) create mode 100644 uproot/pandas.py diff --git a/uproot/__init__.py b/uproot/__init__.py index c6a3cdef..e099a7d3 100644 --- a/uproot/__init__.py +++ b/uproot/__init__.py @@ -164,22 +164,7 @@ from uproot.interp.objects import STLString asdebug = asjagged(asdtype("u1")) -from uproot.source.memmap import MemmapSource -from uproot.source.xrootd import XRootDSource -from uproot.source.http import HTTPSource -def 
iterate(path, treepath, branches=None, entrysteps=None, namedecode="utf-8", reportpath=False, reportfile=False, flatten=True, flatname=None, awkwardlib=None, cache=None, basketcache=None, keycache=None, executor=None, blocking=True, localsource=MemmapSource.defaults, xrootdsource=XRootDSource.defaults, httpsource=HTTPSource.defaults, **options): - import pandas - import uproot.tree - return uproot.tree.iterate(path, treepath, branches=branches, entrysteps=entrysteps, outputtype=pandas.DataFrame, namedecode=namedecode, reportpath=reportpath, reportfile=reportfile, reportentries=False, flatten=flatten, flatname=flatname, awkwardlib=awkwardlib, cache=cache, basketcache=basketcache, keycache=keycache, executor=executor, blocking=blocking, localsource=localsource, xrootdsource=xrootdsource, httpsource=httpsource, **options) - -from types import ModuleType -pandas = ModuleType("uproot.pandas") -pandas.iterate = iterate -del ModuleType -del iterate -del MemmapSource -del XRootDSource -del HTTPSource +from uproot import pandas # put help strings on everything (they're long, too disruptive to intersperse # in the code, and are built programmatically to avoid duplication; Python's @@ -189,6 +174,7 @@ def iterate(path, treepath, branches=None, entrysteps=None, namedecode="utf-8", # convenient access to the version number from uproot.version import __version__ +# don't expose uproot.uproot; it's ugly del uproot __all__ = ["open", "xrootd", "http", "iterate", "numentries", "lazyarray", "lazyarrays", "daskarray", "daskarrays", "daskframe", "create", "recreate", "update", "MemmapSource", "FileSource", "XRootDSource", "HTTPSource", "interpret", "asdtype", "asarray", "asdouble32", "asstlbitset", "asjagged", "astable", "asobj", "asgenobj", "asstring", "asdebug", "SimpleArray", "STLVector", "STLMap", "STLString", "pandas", "__version__"] diff --git a/uproot/pandas.py b/uproot/pandas.py new file mode 100644 index 00000000..9e7cd0ad --- /dev/null +++ b/uproot/pandas.py @@ -0,0 +1,40 @@ 
+#!/usr/bin/env python + +# Copyright (c) 2019, IRIS-HEP +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+
+"""Top-level functions for Pandas."""
+from __future__ import absolute_import  # this module is itself named "pandas": without this, Python 2 resolves "import pandas" below to uproot.pandas instead of the third-party library
+import uproot.tree
+from uproot.source.memmap import MemmapSource
+from uproot.source.xrootd import XRootDSource
+from uproot.source.http import HTTPSource
+
+def iterate(path, treepath, branches=None, entrysteps=None, namedecode="utf-8", reportpath=False, reportfile=False, flatten=True, flatname=None, awkwardlib=None, cache=None, basketcache=None, keycache=None, executor=None, blocking=True, localsource=MemmapSource.defaults, xrootdsource=XRootDSource.defaults, httpsource=HTTPSource.defaults, **options):  # thin wrapper: forwards every argument to uproot.tree.iterate with outputtype fixed to pandas.DataFrame and reportentries fixed to False
+    import pandas  # imported lazily, only when iterate is called; must be the third-party library, not this module (hence absolute_import above)
+    return uproot.tree.iterate(path, treepath, branches=branches, entrysteps=entrysteps, outputtype=pandas.DataFrame, namedecode=namedecode, reportpath=reportpath, reportfile=reportfile, reportentries=False, flatten=flatten, flatname=flatname, awkwardlib=awkwardlib, cache=cache, basketcache=basketcache, keycache=keycache, executor=executor, blocking=blocking, localsource=localsource, xrootdsource=xrootdsource, httpsource=httpsource, **options)