Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

First version of ak._v2.from_parquet #1338

Merged
merged 12 commits into from
Mar 15, 2022
29 changes: 26 additions & 3 deletions src/awkward/_v2/_connect/pyarrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,30 @@ def import_pyarrow_parquet(name):
if pyarrow is None:
raise ImportError(error_message.format(name))

import pyarrow.parquet
import pyarrow.parquet as out

return pyarrow.parquet
return out


def import_fsspec(name):
    """Import and return the ``fsspec`` module, or explain how to install it.

    Args:
        name: Name of the feature that needs fsspec; interpolated into the
            error message.

    Returns:
        The imported ``fsspec`` module.

    Raises:
        ImportError: If ``fsspec`` cannot be found. The original
            ``ModuleNotFoundError`` is attached as the explicit cause.
            ``import_pyarrow_parquet(name)`` is also called and may raise.
    """
    try:
        import fsspec

    except ModuleNotFoundError as err:
        # Chain explicitly so the traceback shows which module was missing.
        raise ImportError(
            f"""to use {name}, you must install fsspec:

pip install fsspec

or

conda install -c conda-forge fsspec
"""
        ) from err

    # Called only for its import check; its return value is not used here.
    import_pyarrow_parquet(name)

    return fsspec


if pyarrow is not None:
Expand Down Expand Up @@ -946,7 +967,9 @@ def handle_arrow(obj, conservative_optiontype=False, pass_empty_field=False):
if pass_empty_field and list(obj.schema.names) == [""]:
return child_array[0]
else:
return ak._v2.contents.RecordArray(child_array, obj.schema.names)
return ak._v2.contents.RecordArray(
child_array, obj.schema.names, length=len(obj)
)

elif isinstance(obj, pyarrow.lib.Table):
batches = obj.combine_chunks().to_batches()
Expand Down
33 changes: 27 additions & 6 deletions src/awkward/_v2/_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,11 @@
# First, transition all the _v2 code to start using implementations in this file.
# Then build up the high-level replacements.

# import re
# import os.path
# import warnings
import setuptools
import os
import itertools
import numbers
import os
import re
import setuptools
import threading
import traceback

Expand Down Expand Up @@ -859,7 +858,6 @@ def direct_Content_subclass_name(node):


def merge_parameters(one, two, merge_equal=False):

if one is None and two is None:
return None

Expand All @@ -883,3 +881,26 @@ def merge_parameters(one, two, merge_equal=False):
if v is not None:
out[k] = v
return out


def expand_braces(text, seen=None):
    """Yield every distinct string obtained by expanding ``{a,b,...}`` groups.

    Only innermost groups (braces containing no other brace) are matched on
    each pass; the function recurses until no group is left, so nested
    groups are expanded from the inside out.

    Args:
        text: String that may contain comma-separated brace groups.
        seen: Set of strings already yielded; shared across the recursion so
            that each result is produced once. A fresh set is used if None.

    Yields:
        Each fully expanded string, without duplicates.
    """
    emitted = set() if seen is None else seen

    groups = list(expand_braces.regex.finditer(text))

    if not groups:
        # Nothing left to expand: this is a final result.
        if text not in emitted:
            yield text
            emitted.add(text)
        return

    # Substitute right-to-left so earlier spans keep their indices.
    groups.reverse()
    choices = [m.group()[1:-1].split(",") for m in groups]

    for picks in itertools.product(*choices):
        chars = list(text)
        for m, pick in zip(groups, picks):
            chars[m.start() : m.end()] = pick
        yield from expand_braces("".join(chars), emitted)


expand_braces.regex = re.compile(r"\{[^\{\}]*\}")
2 changes: 1 addition & 1 deletion src/awkward/_v2/contents/unmaskedarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -286,7 +286,7 @@ def mergeable(self, other, mergebool):
def _reverse_merge(self, other):
return self.toIndexedOptionArray64()._reverse_merge(other)

def _mergemany(self, others):
def mergemany(self, others):
if len(others) == 0:
return self
return self.toIndexedOptionArray64().mergemany(others)
Expand Down
14 changes: 14 additions & 0 deletions src/awkward/_v2/forms/bitmaskedform.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,3 +226,17 @@ def is_tuple(self):
@property
def dimension_optiontype(self):
return True

def _columns(self, path, output, list_indicator):
self._content._columns(path, output, list_indicator)

def _select_columns(self, index, specifier, matches, output):
    """Return a BitMaskedForm whose content is the column-selected content.

    All arguments are forwarded unchanged to the content's
    ``_select_columns``; every other attribute is copied from this form.
    """
    selected = self._content._select_columns(index, specifier, matches, output)
    return BitMaskedForm(
        self._mask,
        selected,
        self._valid_when,
        self._lsb_order,
        self._has_identifier,
        self._parameters,
        self._form_key,
    )
13 changes: 13 additions & 0 deletions src/awkward/_v2/forms/bytemaskedform.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,3 +205,16 @@ def is_tuple(self):
@property
def dimension_optiontype(self):
return True

def _columns(self, path, output, list_indicator):
self._content._columns(path, output, list_indicator)

def _select_columns(self, index, specifier, matches, output):
    """Return a ByteMaskedForm whose content is the column-selected content.

    All arguments are forwarded unchanged to the content's
    ``_select_columns``; every other attribute is copied from this form.
    """
    selected = self._content._select_columns(index, specifier, matches, output)
    return ByteMaskedForm(
        self._mask,
        selected,
        self._valid_when,
        self._has_identifier,
        self._parameters,
        self._form_key,
    )
8 changes: 8 additions & 0 deletions src/awkward/_v2/forms/emptyform.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,3 +97,11 @@ def is_tuple(self):
@property
def dimension_optiontype(self):
return False

def _columns(self, path, output, list_indicator):
output.append(".".join(path))

def _select_columns(self, index, specifier, matches, output):
if any(match and index >= len(item) for item, match in zip(specifier, matches)):
output.append(None)
return self
28 changes: 28 additions & 0 deletions src/awkward/_v2/forms/form.py
Original file line number Diff line number Diff line change
Expand Up @@ -314,3 +314,31 @@ def simplify_optiontype(self):

def simplify_uniontype(self, merge=True, mergebool=False):
return self

def columns(self, list_indicator=None):
    """Return the list filled in by ``self._columns``, starting from an empty path.

    ``list_indicator`` is passed through to ``_columns`` unchanged.
    """
    names = []
    root_path = ()
    self._columns(root_path, names, list_indicator)
    return names

def select_columns(self, specifier, expand_braces=True):
    """Return the result of ``self._select_columns`` for the given specifier.

    Args:
        specifier: A string or an iterable of strings. Each string is split
            on ``"."`` into path components; the empty string becomes an
            empty component list.
        expand_braces: If True, each string is first passed through
            ``ak._v2._util.expand_braces``.

    Raises:
        TypeError (wrapped by ``ak._v2._util.error``): If any item is not a
            string.
    """
    isstr = ak._v2._util.isstr

    if isstr(specifier):
        specifier = [specifier]

    for item in specifier:
        if not isstr(item):
            raise ak._v2._util.error(
                TypeError("a column-selection specifier must be a list of strings")
            )

    if expand_braces:
        specifier = [
            expanded
            for item in specifier
            for expanded in ak._v2._util.expand_braces(item)
        ]

    # set() drops duplicate specifier strings before they are split.
    split_items = [item.split(".") if item != "" else [] for item in set(specifier)]
    all_matching = [True] * len(split_items)

    return self._select_columns(0, split_items, all_matching, [])
12 changes: 12 additions & 0 deletions src/awkward/_v2/forms/indexedform.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,3 +170,15 @@ def is_tuple(self):
@property
def dimension_optiontype(self):
return False

def _columns(self, path, output, list_indicator):
self._content._columns(path, output, list_indicator)

def _select_columns(self, index, specifier, matches, output):
    """Return an IndexedForm whose content is the column-selected content.

    All arguments are forwarded unchanged to the content's
    ``_select_columns``; every other attribute is copied from this form.
    """
    selected = self._content._select_columns(index, specifier, matches, output)
    return IndexedForm(
        self._index,
        selected,
        self._has_identifier,
        self._parameters,
        self._form_key,
    )
12 changes: 12 additions & 0 deletions src/awkward/_v2/forms/indexedoptionform.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,3 +187,15 @@ def is_tuple(self):
@property
def dimension_optiontype(self):
return True

def _columns(self, path, output, list_indicator):
self._content._columns(path, output, list_indicator)

def _select_columns(self, index, specifier, matches, output):
    """Return an IndexedOptionForm whose content is the column-selected content.

    All arguments are forwarded unchanged to the content's
    ``_select_columns``; every other attribute is copied from this form.
    """
    selected = self._content._select_columns(index, specifier, matches, output)
    return IndexedOptionForm(
        self._index,
        selected,
        self._has_identifier,
        self._parameters,
        self._form_key,
    )
18 changes: 18 additions & 0 deletions src/awkward/_v2/forms/listform.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,3 +196,21 @@ def is_tuple(self):
@property
def dimension_optiontype(self):
return False

def _columns(self, path, output, list_indicator):
if (
self.parameter("__array__") not in ("string", "bytestring")
and list_indicator is not None
):
path = path + (list_indicator,)
self._content._columns(path, output, list_indicator)

def _select_columns(self, index, specifier, matches, output):
    """Return a ListForm whose content is the column-selected content.

    All arguments are forwarded unchanged to the content's
    ``_select_columns``; every other attribute is copied from this form.
    """
    selected = self._content._select_columns(index, specifier, matches, output)
    return ListForm(
        self._starts,
        self._stops,
        selected,
        self._has_identifier,
        self._parameters,
        self._form_key,
    )
17 changes: 17 additions & 0 deletions src/awkward/_v2/forms/listoffsetform.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,3 +163,20 @@ def is_tuple(self):
@property
def dimension_optiontype(self):
return False

def _columns(self, path, output, list_indicator):
if (
self.parameter("__array__") not in ("string", "bytestring")
and list_indicator is not None
):
path = path + (list_indicator,)
self._content._columns(path, output, list_indicator)

def _select_columns(self, index, specifier, matches, output):
    """Return a ListOffsetForm whose content is the column-selected content.

    All arguments are forwarded unchanged to the content's
    ``_select_columns``; every other attribute is copied from this form.
    """
    selected = self._content._select_columns(index, specifier, matches, output)
    return ListOffsetForm(
        self._offsets,
        selected,
        self._has_identifier,
        self._parameters,
        self._form_key,
    )
8 changes: 8 additions & 0 deletions src/awkward/_v2/forms/numpyform.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,3 +206,11 @@ def is_tuple(self):
@property
def dimension_optiontype(self):
return False

def _columns(self, path, output, list_indicator):
output.append(".".join(path))

def _select_columns(self, index, specifier, matches, output):
if any(match and index >= len(item) for item, match in zip(specifier, matches)):
output.append(None)
return self
31 changes: 31 additions & 0 deletions src/awkward/_v2/forms/recordform.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE

import glob
from collections.abc import Iterable

import awkward as ak
Expand Down Expand Up @@ -303,3 +304,33 @@ def branch_depth(self):
@property
def dimension_optiontype(self):
return False

def _columns(self, path, output, list_indicator):
for content, field in zip(self._contents, self.fields):
content._columns(path + (field,), output, list_indicator)

def _select_columns(self, index, specifier, matches, output):
    """Return a RecordForm restricted to the fields selected by ``specifier``.

    For each field, a specifier item stays matching if it was matching
    before and either has no component at ``index`` or that component
    matches the field name under ``fnmatchcase`` (case-sensitive wildcard
    matching). A field is kept only if at least one item still matches and
    the recursive call into its content grew ``output``.
    """
    kept_contents = []
    kept_fields = []

    for content, field in zip(self._contents, self.fields):
        field_matches = []
        for i, item in enumerate(specifier):
            field_matches.append(
                matches[i]
                and (
                    index >= len(item)
                    or glob.fnmatch.fnmatchcase(field, item[index])
                )
            )

        if not any(field_matches):
            continue

        # A change in len(output) is how descendants report a selection hit.
        size_before = len(output)
        selected = content._select_columns(
            index + 1, specifier, field_matches, output
        )
        if size_before != len(output):
            kept_contents.append(selected)
            kept_fields.append(field)

    return RecordForm(
        kept_contents,
        kept_fields,
        self._has_identifier,
        self._parameters,
        self._form_key,
    )
17 changes: 17 additions & 0 deletions src/awkward/_v2/forms/regularform.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,3 +168,20 @@ def is_tuple(self):
@property
def dimension_optiontype(self):
return False

def _columns(self, path, output, list_indicator):
if (
self.parameter("__array__") not in ("string", "bytestring")
and list_indicator is not None
):
path = path + (list_indicator,)
self._content._columns(path, output, list_indicator)

def _select_columns(self, index, specifier, matches, output):
    """Return a RegularForm whose content is the column-selected content.

    All arguments are forwarded unchanged to the content's
    ``_select_columns``; every other attribute is copied from this form.
    """
    selected = self._content._select_columns(index, specifier, matches, output)
    return RegularForm(
        selected,
        self._size,
        self._has_identifier,
        self._parameters,
        self._form_key,
    )
28 changes: 28 additions & 0 deletions src/awkward/_v2/forms/unionform.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,3 +244,31 @@ def dimension_optiontype(self):
if content.dimension_optiontype:
return True
return False

def _columns(self, path, output, list_indicator):
for content, field in zip(self._contents, self.fields):
content._columns(path + (field,), output, list_indicator)

def _select_columns(self, index, specifier, matches, output):
contents = []
for content in self._contents:
len_output = len(output)
next_content = content._select_columns(index, specifier, matches, output)
if len_output != len(output):
contents.append(next_content)

if len(contents) == 0:
return ak._v2.forms.EmptyForm(
self._has_identifier, self._parameters, self._form_key
)
elif len(contents) == 1:
return contents[0]
else:
return UnionForm(
self._tags,
self._index,
contents,
self._has_identifier,
self._parameters,
self._form_key,
)
11 changes: 11 additions & 0 deletions src/awkward/_v2/forms/unmaskedform.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,3 +153,14 @@ def is_tuple(self):
@property
def dimension_optiontype(self):
return True

def _columns(self, path, output, list_indicator):
self._content._columns(path, output, list_indicator)

def _select_columns(self, index, specifier, matches, output):
    """Return an UnmaskedForm whose content is the column-selected content.

    All arguments are forwarded unchanged to the content's
    ``_select_columns``; every other attribute is copied from this form.
    """
    selected = self._content._select_columns(index, specifier, matches, output)
    return UnmaskedForm(
        selected,
        self._has_identifier,
        self._parameters,
        self._form_key,
    )
3 changes: 3 additions & 0 deletions src/awkward/_v2/operations/convert/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@
)
from awkward._v2.operations.convert.ak_to_parquet import to_parquet # noqa: F401
from awkward._v2.operations.convert.ak_from_parquet import from_parquet # noqa: F401
from awkward._v2.operations.convert.ak_metadata_from_parquet import ( # noqa: F401
metadata_from_parquet,
)
from awkward._v2.operations.convert.ak_to_buffers import to_buffers # noqa: F401
from awkward._v2.operations.convert.ak_from_buffers import from_buffers # noqa: F401
from awkward._v2.operations.convert.ak_to_pandas import to_pandas # noqa: F401
Loading