diff --git a/src/awkward/_v2/_connect/pyarrow.py b/src/awkward/_v2/_connect/pyarrow.py index 80977d7988..6d4d53ccc2 100644 --- a/src/awkward/_v2/_connect/pyarrow.py +++ b/src/awkward/_v2/_connect/pyarrow.py @@ -43,9 +43,30 @@ def import_pyarrow_parquet(name): if pyarrow is None: raise ImportError(error_message.format(name)) - import pyarrow.parquet + import pyarrow.parquet as out - return pyarrow.parquet + return out + + +def import_fsspec(name): + try: + import fsspec + + except ModuleNotFoundError: + raise ImportError( + f"""to use {name}, you must install fsspec: + + pip install fsspec + +or + + conda install -c conda-forge fsspec +""" + ) + + import_pyarrow_parquet(name) + + return fsspec if pyarrow is not None: @@ -946,7 +967,9 @@ def handle_arrow(obj, conservative_optiontype=False, pass_empty_field=False): if pass_empty_field and list(obj.schema.names) == [""]: return child_array[0] else: - return ak._v2.contents.RecordArray(child_array, obj.schema.names) + return ak._v2.contents.RecordArray( + child_array, obj.schema.names, length=len(obj) + ) elif isinstance(obj, pyarrow.lib.Table): batches = obj.combine_chunks().to_batches() diff --git a/src/awkward/_v2/_util.py b/src/awkward/_v2/_util.py index e371dbad36..28d8909dd5 100644 --- a/src/awkward/_v2/_util.py +++ b/src/awkward/_v2/_util.py @@ -3,12 +3,11 @@ # First, transition all the _v2 code to start using implementations in this file. # Then build up the high-level replacements. -# import re -# import os.path -# import warnings -import setuptools -import os +import itertools import numbers +import os +import re +import setuptools import threading import traceback @@ -859,7 +858,6 @@ def direct_Content_subclass_name(node): def merge_parameters(one, two, merge_equal=False): - if one is None and two is None: return None @@ -883,3 +881,26 @@ def merge_parameters(one, two, merge_equal=False): if v is not None: out[k] = v return out + + +def expand_braces(text, seen=None): + if seen is None: + seen = set() + + spans = [m.span() for m in expand_braces.regex.finditer(text)][::-1] + alts = [text[start + 1 : stop - 1].split(",") for start, stop in spans] + + if len(spans) == 0: + if text not in seen: + yield text + seen.add(text) + + else: + for combo in itertools.product(*alts): + replaced = list(text) + for (start, stop), replacement in zip(spans, combo): + replaced[start:stop] = replacement + yield from expand_braces("".join(replaced), seen) + + +expand_braces.regex = re.compile(r"\{[^\{\}]*\}") diff --git a/src/awkward/_v2/contents/unmaskedarray.py b/src/awkward/_v2/contents/unmaskedarray.py index 4b5cb82d02..5fb14a8234 100644 --- a/src/awkward/_v2/contents/unmaskedarray.py +++ b/src/awkward/_v2/contents/unmaskedarray.py @@ -286,7 +286,7 @@ def mergeable(self, other, mergebool): def _reverse_merge(self, other): return self.toIndexedOptionArray64()._reverse_merge(other) - def _mergemany(self, others): + def mergemany(self, others): if len(others) == 0: return self return self.toIndexedOptionArray64().mergemany(others) diff --git a/src/awkward/_v2/forms/bitmaskedform.py b/src/awkward/_v2/forms/bitmaskedform.py index d768df8959..81be730be9 100644 --- a/src/awkward/_v2/forms/bitmaskedform.py +++ b/src/awkward/_v2/forms/bitmaskedform.py @@ -226,3 +226,17 @@ def is_tuple(self): @property def dimension_optiontype(self): return True + + def _columns(self, path, output, list_indicator): + self._content._columns(path, output, list_indicator) + + def _select_columns(self, index, specifier, matches, output): + return BitMaskedForm( + self._mask, + self._content._select_columns(index, specifier, matches, output), + self._valid_when, + self._lsb_order, + self._has_identifier, + self._parameters, + self._form_key, + ) diff --git a/src/awkward/_v2/forms/bytemaskedform.py b/src/awkward/_v2/forms/bytemaskedform.py index 670b3e8c12..36268048cf 100644 --- a/src/awkward/_v2/forms/bytemaskedform.py +++ b/src/awkward/_v2/forms/bytemaskedform.py @@ -205,3 +205,16 @@ def is_tuple(self): @property def dimension_optiontype(self): return True + + def _columns(self, path, output, list_indicator): + self._content._columns(path, output, list_indicator) + + def _select_columns(self, index, specifier, matches, output): + return ByteMaskedForm( + self._mask, + self._content._select_columns(index, specifier, matches, output), + self._valid_when, + self._has_identifier, + self._parameters, + self._form_key, + ) diff --git a/src/awkward/_v2/forms/emptyform.py b/src/awkward/_v2/forms/emptyform.py index 198770a840..dd25b2d64b 100644 --- a/src/awkward/_v2/forms/emptyform.py +++ b/src/awkward/_v2/forms/emptyform.py @@ -97,3 +97,11 @@ def is_tuple(self): @property def dimension_optiontype(self): return False + + def _columns(self, path, output, list_indicator): + output.append(".".join(path)) + + def _select_columns(self, index, specifier, matches, output): + if any(match and index >= len(item) for item, match in zip(specifier, matches)): + output.append(None) + return self diff --git a/src/awkward/_v2/forms/form.py b/src/awkward/_v2/forms/form.py index 676a73dac8..ac85c18d5f 100644 --- a/src/awkward/_v2/forms/form.py +++ b/src/awkward/_v2/forms/form.py @@ -314,3 +314,31 @@ def simplify_optiontype(self): def simplify_uniontype(self, merge=True, mergebool=False): return self + + def columns(self, list_indicator=None): + output = [] + self._columns((), output, list_indicator) + return output + + def select_columns(self, specifier, expand_braces=True): + if ak._v2._util.isstr(specifier): + specifier = [specifier] + + for item in specifier: + if not ak._v2._util.isstr(item): + raise ak._v2._util.error( + TypeError("a column-selection specifier must be a list of strings") + ) + + if expand_braces: + next_specifier = [] + for item in specifier: + for result in ak._v2._util.expand_braces(item): + next_specifier.append(result) + specifier = next_specifier + + specifier = [[] if item == "" else item.split(".") for item in set(specifier)] + matches = [True] * len(specifier) + + output = [] + return self._select_columns(0, specifier, matches, output) diff --git a/src/awkward/_v2/forms/indexedform.py b/src/awkward/_v2/forms/indexedform.py index 76722855f2..e117f1162b 100644 --- a/src/awkward/_v2/forms/indexedform.py +++ b/src/awkward/_v2/forms/indexedform.py @@ -170,3 +170,15 @@ def is_tuple(self): @property def dimension_optiontype(self): return False + + def _columns(self, path, output, list_indicator): + self._content._columns(path, output, list_indicator) + + def _select_columns(self, index, specifier, matches, output): + return IndexedForm( + self._index, + self._content._select_columns(index, specifier, matches, output), + self._has_identifier, + self._parameters, + self._form_key, + ) diff --git a/src/awkward/_v2/forms/indexedoptionform.py b/src/awkward/_v2/forms/indexedoptionform.py index e500da9289..418fb10eee 100644 --- a/src/awkward/_v2/forms/indexedoptionform.py +++ b/src/awkward/_v2/forms/indexedoptionform.py @@ -187,3 +187,15 @@ def is_tuple(self): @property def dimension_optiontype(self): return True + + def _columns(self, path, output, list_indicator): + self._content._columns(path, output, list_indicator) + + def _select_columns(self, index, specifier, matches, output): + return IndexedOptionForm( + self._index, + self._content._select_columns(index, specifier, matches, output), + self._has_identifier, + self._parameters, + self._form_key, + ) diff --git a/src/awkward/_v2/forms/listform.py b/src/awkward/_v2/forms/listform.py index 3cfa3e1d82..c92eedba00 100644 --- a/src/awkward/_v2/forms/listform.py +++ b/src/awkward/_v2/forms/listform.py @@ -196,3 +196,21 @@ def is_tuple(self): @property def dimension_optiontype(self): return False + + def _columns(self, path, output, list_indicator): + if ( + self.parameter("__array__") not in ("string", "bytestring") + and list_indicator is not None + ): + path = path + (list_indicator,) + self._content._columns(path, output, list_indicator) + + def _select_columns(self, index, specifier, matches, output): + return ListForm( + self._starts, + self._stops, + self._content._select_columns(index, specifier, matches, output), + self._has_identifier, + self._parameters, + self._form_key, + ) diff --git a/src/awkward/_v2/forms/listoffsetform.py b/src/awkward/_v2/forms/listoffsetform.py index 25caf5bceb..1ed268cd11 100644 --- a/src/awkward/_v2/forms/listoffsetform.py +++ b/src/awkward/_v2/forms/listoffsetform.py @@ -163,3 +163,20 @@ def is_tuple(self): @property def dimension_optiontype(self): return False + + def _columns(self, path, output, list_indicator): + if ( + self.parameter("__array__") not in ("string", "bytestring") + and list_indicator is not None + ): + path = path + (list_indicator,) + self._content._columns(path, output, list_indicator) + + def _select_columns(self, index, specifier, matches, output): + return ListOffsetForm( + self._offsets, + self._content._select_columns(index, specifier, matches, output), + self._has_identifier, + self._parameters, + self._form_key, + ) diff --git a/src/awkward/_v2/forms/numpyform.py b/src/awkward/_v2/forms/numpyform.py index e0051b91f8..612b533b56 100644 --- a/src/awkward/_v2/forms/numpyform.py +++ b/src/awkward/_v2/forms/numpyform.py @@ -206,3 +206,11 @@ def is_tuple(self): @property def dimension_optiontype(self): return False + + def _columns(self, path, output, list_indicator): + output.append(".".join(path)) + + def _select_columns(self, index, specifier, matches, output): + if any(match and index >= len(item) for item, match in zip(specifier, matches)): + output.append(None) + return self diff --git a/src/awkward/_v2/forms/recordform.py b/src/awkward/_v2/forms/recordform.py index 7028a3d67d..33378665c9 100644 --- a/src/awkward/_v2/forms/recordform.py +++ b/src/awkward/_v2/forms/recordform.py @@ -1,5 +1,6 @@ # BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE +import glob from collections.abc import Iterable import awkward as ak @@ -303,3 +304,33 @@ def branch_depth(self): @property def dimension_optiontype(self): return False + + def _columns(self, path, output, list_indicator): + for content, field in zip(self._contents, self.fields): + content._columns(path + (field,), output, list_indicator) + + def _select_columns(self, index, specifier, matches, output): + contents = [] + fields = [] + for content, field in zip(self._contents, self.fields): + next_matches = [ + matches[i] + and (index >= len(item) or glob.fnmatch.fnmatchcase(field, item[index])) + for i, item in enumerate(specifier) + ] + if any(next_matches): + len_output = len(output) + next_content = content._select_columns( + index + 1, specifier, next_matches, output + ) + if len_output != len(output): + contents.append(next_content) + fields.append(field) + + return RecordForm( + contents, + fields, + self._has_identifier, + self._parameters, + self._form_key, + ) diff --git a/src/awkward/_v2/forms/regularform.py b/src/awkward/_v2/forms/regularform.py index 8a481e445a..cfbc364875 100644 --- a/src/awkward/_v2/forms/regularform.py +++ b/src/awkward/_v2/forms/regularform.py @@ -168,3 +168,20 @@ def is_tuple(self): @property def dimension_optiontype(self): return False + + def _columns(self, path, output, list_indicator): + if ( + self.parameter("__array__") not in ("string", "bytestring") + and list_indicator is not None + ): + path = path + (list_indicator,) + self._content._columns(path, output, list_indicator) + + def _select_columns(self, index, specifier, matches, output): + return RegularForm( + self._content._select_columns(index, specifier, matches, output), + self._size, + self._has_identifier, + self._parameters, + self._form_key, + ) diff --git a/src/awkward/_v2/forms/unionform.py b/src/awkward/_v2/forms/unionform.py index 056e369c1f..dfd63f51ef 100644 --- a/src/awkward/_v2/forms/unionform.py +++ b/src/awkward/_v2/forms/unionform.py @@ -244,3 +244,31 @@ def dimension_optiontype(self): if content.dimension_optiontype: return True return False + + def _columns(self, path, output, list_indicator): + for content, field in zip(self._contents, self.fields): + content._columns(path + (field,), output, list_indicator) + + def _select_columns(self, index, specifier, matches, output): + contents = [] + for content in self._contents: + len_output = len(output) + next_content = content._select_columns(index, specifier, matches, output) + if len_output != len(output): + contents.append(next_content) + + if len(contents) == 0: + return ak._v2.forms.EmptyForm( + self._has_identifier, self._parameters, self._form_key + ) + elif len(contents) == 1: + return contents[0] + else: + return UnionForm( + self._tags, + self._index, + contents, + self._has_identifier, + self._parameters, + self._form_key, + ) diff --git a/src/awkward/_v2/forms/unmaskedform.py b/src/awkward/_v2/forms/unmaskedform.py index ada8c73cb2..7b14e87ba0 100644 --- a/src/awkward/_v2/forms/unmaskedform.py +++ b/src/awkward/_v2/forms/unmaskedform.py @@ -153,3 +153,14 @@ def is_tuple(self): @property def dimension_optiontype(self): return True + + def _columns(self, path, output, list_indicator): + self._content._columns(path, output, list_indicator) + + def _select_columns(self, index, specifier, matches, output): + return UnmaskedForm( + self._content._select_columns(index, specifier, matches, output), + self._has_identifier, + self._parameters, + self._form_key, + ) diff --git a/src/awkward/_v2/operations/convert/__init__.py b/src/awkward/_v2/operations/convert/__init__.py index 390302663b..09b6792e53 100644 --- a/src/awkward/_v2/operations/convert/__init__.py +++ b/src/awkward/_v2/operations/convert/__init__.py @@ -24,6 +24,9 @@ ) from awkward._v2.operations.convert.ak_to_parquet import to_parquet # noqa: F401 from awkward._v2.operations.convert.ak_from_parquet import from_parquet # noqa: F401 +from awkward._v2.operations.convert.ak_metadata_from_parquet import ( # noqa: F401 + metadata_from_parquet, +) from awkward._v2.operations.convert.ak_to_buffers import to_buffers # noqa: F401 from awkward._v2.operations.convert.ak_from_buffers import from_buffers # noqa: F401 from awkward._v2.operations.convert.ak_to_pandas import to_pandas # noqa: F401 diff --git a/src/awkward/_v2/operations/convert/ak_from_parquet.py b/src/awkward/_v2/operations/convert/ak_from_parquet.py index 32aa45edbb..75185bacfa 100644 --- a/src/awkward/_v2/operations/convert/ak_from_parquet.py +++ b/src/awkward/_v2/operations/convert/ak_from_parquet.py @@ -2,6 +2,265 @@ import awkward as ak +np = ak.nplike.NumpyMetadata.instance() +numpy = ak.nplike.Numpy.instance() -def from_parquet(): - raise ak._v2._util.error(NotImplementedError) + +def from_parquet( + path, + columns=None, + row_groups=None, + storage_options=None, + max_gap=64_000, + max_block=256_000_000, + footer_sample_size=1_000_000, + conservative_optiontype=False, + highlevel=True, + behavior=None, +): + """ + Args: + path (str): Local filename or remote URL, passed to fsspec for resolution. + May contain glob patterns. + columns (None, str, or list of str): Glob pattern(s) with bash-like curly + brackets for matching column names. Nested records are separated by dots. + If a list of patterns, the logical-or is matched. If None, all columns + are read. + row_groups (None or set of int): Row groups to read; must be non-negative. + Order is ignored: the output array is presented in the order specified by + Parquet metadata. If None, all row groups/all rows are read. + storage_options: Passed to `fsspec.parquet.open_parquet_file`. + max_gap (int): Passed to `fsspec.parquet.open_parquet_file`. + max_block (int): Passed to `fsspec.parquet.open_parquet_file`. + footer_sample_size (int): Passed to `fsspec.parquet.open_parquet_file`. + conservative_optiontype (bool): Passed to `ak.from_arrow`. + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.layout.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + Reads data from a local or remote Parquet file or collection of files. + + The data are eagerly (not lazily) read and must fit into memory. Use `columns` + and/or `row_groups` to select and filter manageable subsets of the data, and + use #ak.metadata_from_parquet to find column names and the range of row groups + that a dataset has. + + See also #ak.to_parquet, #ak.metadata_from_parquet. + """ + with ak._v2._util.OperationErrorContext( + "ak._v2.from_parquet", + dict( + path=path, + columns=columns, + row_groups=row_groups, + storage_options=storage_options, + max_gap=max_gap, + max_block=max_block, + footer_sample_size=footer_sample_size, + conservative_optiontype=conservative_optiontype, + highlevel=highlevel, + behavior=behavior, + ), + ): + return _impl( + path, + columns, + row_groups, + storage_options, + max_gap, + max_block, + footer_sample_size, + conservative_optiontype, + highlevel, + behavior, + ) + + +def _impl( + path, + columns, + row_groups, + storage_options, + max_gap, + max_block, + footer_sample_size, + conservative_optiontype, + highlevel, + behavior, +): + import awkward._v2._connect.pyarrow # noqa: F401 + + name = "ak._v2.from_parquet" + pyarrow_parquet = ak._v2._connect.pyarrow.import_pyarrow_parquet(name) + fsspec = ak._v2._connect.pyarrow.import_fsspec(name) + + import fsspec.parquet + + if row_groups is not None: + if not all(ak._v2._util.isint(x) and x >= 0 for x in row_groups): + raise ak._v2._util.error( + TypeError("row_groups must be a set of non-negative integers") + ) + + fs, _, paths = fsspec.get_fs_token_paths( + path, mode="rb", storage_options=storage_options + ) + + all_paths, path_for_metadata = _all_and_metadata_paths(path, fs, paths) + + parquet_columns = None + subform = None + subrg = [None] * len(all_paths) + actual_paths = all_paths + if columns is not None or row_groups is not None: + with fsspec.parquet.open_parquet_file( + path_for_metadata, + fs=fs, + engine="pyarrow", + row_groups=[], + storage_options=storage_options, + max_gap=max_gap, + max_block=max_block, + footer_sample_size=footer_sample_size, + ) as file_for_metadata: + parquetfile_for_metadata = pyarrow_parquet.ParquetFile(file_for_metadata) + + if columns is not None: + # FIXME: get this from parquetfile_for_metadata + list_indicator = "list.item" + + form = ak._v2._connect.pyarrow.form_handle_arrow( + parquetfile_for_metadata.schema_arrow, pass_empty_field=True + ) + subform = form.select_columns(columns) + parquet_columns = subform.columns(list_indicator=list_indicator) + + if row_groups is not None: + metadata = parquetfile_for_metadata.metadata + if any(not 0 <= rg < metadata.num_row_groups for rg in row_groups): + raise ak._v2._util.error( + ValueError( + f"one of the requested row_groups is out of range (must be less than {metadata.num_row_groups})" + ) + ) + + split_paths = [p.split("/") for p in all_paths] + prev_index = None + prev_i = 0 + actual_paths = [] + subrg = [] + for i in range(metadata.num_row_groups): + split_path = metadata.row_group(i).column(0).file_path.split("/") + index = None + for j, compare in enumerate(split_paths): + if split_path == compare[-len(split_path) :]: + index = j + break + if index is None: + eoln = "\n " + raise ak._v2._util.error( + LookupError( + f"""path {'/'.join(split_path)!r} from metadata not found in path matches: + + {eoln.join(all_paths)}""" + ) + ) + + if prev_index != index: + prev_index = index + prev_i = i + actual_paths.append(all_paths[index]) + subrg.append([]) + + if i in row_groups: + subrg[-1].append(i - prev_i) + + for k in range(len(subrg) - 1, -1, -1): + if len(subrg[k]) == 0: + del actual_paths[k] + del subrg[k] + + arrays = [] + for i, p in enumerate(actual_paths): + with fsspec.parquet.open_parquet_file( + p, + fs=fs, + engine="pyarrow", + columns=parquet_columns, + row_groups=subrg[i], + storage_options=storage_options, + max_gap=max_gap, + max_block=max_block, + footer_sample_size=footer_sample_size, + ) as file: + parquetfile = pyarrow_parquet.ParquetFile(file) + + if subform is None: + subform = ak._v2._connect.pyarrow.form_handle_arrow( + parquetfile.schema_arrow, pass_empty_field=True + ) + + if row_groups is None: + arrow_table = parquetfile.read(parquet_columns) + else: + arrow_table = parquetfile.read_row_groups(subrg[i], parquet_columns) + + arrays.append( + ak._v2._connect.pyarrow.handle_arrow( + arrow_table, + conservative_optiontype=conservative_optiontype, + pass_empty_field=True, + ) + ) + + if len(arrays) == 0: + return ak._v2.operations.convert.ak_from_buffers._impl( + subform, 0, _DictOfEmptyBuffers(), "", numpy, highlevel, behavior + ) + else: + return ak._v2.operations.structure.ak_concatenate._impl( + arrays, 0, True, True, highlevel, behavior + ) + + +class _DictOfEmptyBuffers: + def __getitem__(self, where): + return b"\x00\x00\x00\x00\x00\x00\x00\x00" + + +def _all_and_metadata_paths(path, fs, paths): + all_paths = [] + for x in paths: + if fs.isfile(x): + is_meta = x.split("/")[-1] == "_metadata" + is_comm = x.split("/")[-1] == "_common_metadata" + all_paths.append((x, is_meta, is_comm)) + elif fs.isdir(x): + for prefix, _, files in fs.walk(x): + for f in files: + is_meta = f == "_metadata" + is_comm = f == "_common_metadata" + if f.endswith((".parq", ".parquet")) or is_meta or is_comm: + if fs.isfile("/".join((prefix, f))): + all_paths.append(("/".join((prefix, f)), is_meta, is_comm)) + + path_for_metadata = [x for x, is_meta, is_comm in all_paths if is_meta] + if len(path_for_metadata) != 0: + path_for_metadata = path_for_metadata[0] + else: + path_for_metadata = [x for x, is_meta, is_comm in all_paths if is_comm] + if len(path_for_metadata) != 0: + path_for_metadata = path_for_metadata[0] + else: + if len(all_paths) != 0: + path_for_metadata = all_paths[0][0] + + all_paths = [x for x, is_meta, is_comm in all_paths if not is_meta and not is_comm] + + if len(all_paths) == 0: + raise ak._v2._util.error( + ValueError(f"no *.parquet or *.parq matches for path {path!r}") + ) + + return all_paths, path_for_metadata diff --git a/src/awkward/_v2/operations/convert/ak_metadata_from_parquet.py b/src/awkward/_v2/operations/convert/ak_metadata_from_parquet.py new file mode 100644 index 0000000000..3a09eb5c57 --- /dev/null +++ b/src/awkward/_v2/operations/convert/ak_metadata_from_parquet.py @@ -0,0 +1,106 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +import collections + +import awkward as ak + +np = ak.nplike.NumpyMetadata.instance() + + +ParquetMetadata = collections.namedtuple( + "ParquetMetadata", + ["form", "fs", "paths", "metadata"], +) + + +def metadata_from_parquet( + path, + storage_options=None, + max_gap=64_000, + max_block=256_000_000, + footer_sample_size=1_000_000, +): + """ + Args: + path (str): Local filename or remote URL, passed to fsspec for resolution. + May contain glob patterns. + storage_options: Passed to `fsspec.parquet.open_parquet_file`. + max_gap (int): Passed to `fsspec.parquet.open_parquet_file`. + max_block (int): Passed to `fsspec.parquet.open_parquet_file`. + footer_sample_size (int): Passed to `fsspec.parquet.open_parquet_file`. + + Returns a named tuple containing + + * `form`: an Awkward Form representing the low-level type of the data + (use `.type` to get a high-level type), + * `fs`: the fsspec filesystem object, + * `paths`: a list of matching path names, + * `metadata`: the Parquet metadata, which includes `.num_rows` for the length + of the array that would be read by #ak.from_parquet and `.num_row_groups` + for the units that can be filtered (for the #ak.from_parquet `row_groups` + argument). + + See also #ak.from_parquet, #ak.to_parquet. + """ + with ak._v2._util.OperationErrorContext( + "ak._v2.metadata_from_parquet", + dict( + path=path, + storage_options=storage_options, + max_gap=max_gap, + max_block=max_block, + footer_sample_size=footer_sample_size, + ), + ): + return _impl( + path, + storage_options, + max_gap, + max_block, + footer_sample_size, + ) + + +def _impl( + path, + storage_options, + max_gap, + max_block, + footer_sample_size, +): + import awkward._v2._connect.pyarrow # noqa: F401 + + name = "ak._v2.from_parquet" + pyarrow_parquet = ak._v2._connect.pyarrow.import_pyarrow_parquet(name) + fsspec = ak._v2._connect.pyarrow.import_fsspec(name) + + import fsspec.parquet + + fs, _, paths = fsspec.get_fs_token_paths( + path, mode="rb", storage_options=storage_options + ) + + ( + all_paths, + path_for_metadata, + ) = ak._v2.operations.convert.ak_from_parquet._all_and_metadata_paths( + path, fs, paths + ) + + with fsspec.parquet.open_parquet_file( + path_for_metadata, + fs=fs, + engine="pyarrow", + row_groups=[], + storage_options=storage_options, + max_gap=max_gap, + max_block=max_block, + footer_sample_size=footer_sample_size, + ) as file_for_metadata: + parquetfile_for_metadata = pyarrow_parquet.ParquetFile(file_for_metadata) + + form = ak._v2._connect.pyarrow.form_handle_arrow( + parquetfile_for_metadata.schema_arrow, pass_empty_field=True + ) + + return ParquetMetadata(form, fs, all_paths, parquetfile_for_metadata.metadata)