Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add the ability to read RNTuple alias columns #1004

Merged
merged 14 commits into from
Jan 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
118 changes: 78 additions & 40 deletions src/uproot/models/RNTuple.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

import struct
import zlib
from collections import defaultdict

import numpy

Expand Down Expand Up @@ -50,7 +51,7 @@ def _keys(self):
keys = []
field_records = self.header.field_records
for i, fr in enumerate(field_records):
if fr.parent_field_id == i:
if fr.parent_field_id == i and fr.type_name != "":
keys.append(fr.field_name)
return keys

Expand Down Expand Up @@ -144,8 +145,6 @@ def header(self):
self._header = h
assert h.crc32 == zlib.crc32(self._header_chunk.raw_data[:-4])

# cursor = self._header_cursor.copy()
# cursor.debug(self._header_chunk)
return self._header

@property
Expand Down Expand Up @@ -238,15 +237,23 @@ def base_col_form(self, cr, col_id, parameters=None):
def col_form(self, field_id):
ak = uproot.extras.awkward()

# FIXME remove this ugly logic
rel_crs = []
rel_crs_idxs = []
for i, cr in enumerate(self.header.column_records):
if cr.field_id == field_id:
rel_crs.append(cr)
rel_crs_idxs.append(i)
if cr.field_id > field_id:
break
if field_id in self._column_recorrds_dict:
rel_crs = self._column_recorrds_dict[field_id]["rel_crs"]
rel_crs_idxs = self._column_recorrds_dict[field_id]["rel_crs_idxs"]
elif field_id in self._alias_columns_dict:
rel_crs = self._column_recorrds_dict[self._alias_columns_dict[field_id]][
"rel_crs"
]
rel_crs_idxs = self._column_recorrds_dict[
self._alias_columns_dict[field_id]
]["rel_crs_idxs"]
else:
raise (
RuntimeError(
f"The filed_id: {field_id} is missing both from the columns records and the alias columns."
)
)

if len(rel_crs) == 1: # base case
return self.base_col_form(rel_crs[0], rel_crs_idxs[0])
elif (
Expand All @@ -269,7 +276,7 @@ def field_form(self, this_id, seen):

field_records = self.header.field_records
this_record = field_records[this_id]
seen.append(this_id)
seen.add(this_id)
structural_role = this_record.struct_role
if (
structural_role == uproot.const.rntuple_role_leaf
Expand All @@ -279,40 +286,32 @@ def field_form(self, this_id, seen):
# n.b. the split may happen in column
return self.col_form(this_id)
elif structural_role == uproot.const.rntuple_role_leaf:
# std::array
child_id = next(
filter(
lambda i: field_records[i].parent_field_id == this_id,
range(this_id + 1, len(field_records)),
)
)
# std::array: it has exactly one child
if this_id in self._related_ids:
child_id = self._related_ids[this_id][0]

inner = self.field_form(child_id, seen)
return ak.forms.RegularForm(inner, this_record.repetition)
elif structural_role == uproot.const.rntuple_role_vector:
keyname = self.col_form(this_id)
child_id = next(
filter(
lambda i: field_records[i].parent_field_id == this_id,
range(this_id + 1, len(field_records)),
)
)
keyname = f"column-{this_id}"
# this only has one child
if this_id in self._related_ids:
child_id = self._related_ids[this_id][0]
inner = self.field_form(child_id, seen)
return ak.forms.ListOffsetForm("u32", inner, form_key=keyname)
elif structural_role == uproot.const.rntuple_role_struct:
newids = []
for i, fr in enumerate(field_records):
if i not in seen and fr.parent_field_id == this_id:
newids.append(i)
if this_id in self._related_ids:
newids = self._related_ids[this_id]
# go find N in the rest, N is the # of fields in struct
recordlist = [self.field_form(i, seen) for i in newids]
namelist = [field_records[i].field_name for i in newids]
return ak.forms.RecordForm(recordlist, namelist, form_key="whatever")
elif structural_role == uproot.const.rntuple_role_union:
keyname = self.col_form(this_id)
newids = []
for i, fr in enumerate(field_records):
if i not in seen and fr.parent_field_id == this_id:
newids.append(i)
if this_id in self._related_ids:
newids = self._related_ids[this_id]
recordlist = [self.field_form(i, seen) for i in newids]
return ak.forms.UnionForm("i8", "i64", recordlist, form_key=keyname)
else:
Expand All @@ -325,7 +324,7 @@ def to_akform(self):
field_records = self.header.field_records
recordlist = []
topnames = self.keys()
seen = []
seen = set()
for i in range(len(field_records)):
if i not in seen:
recordlist.append(self.field_form(i, seen))
Expand Down Expand Up @@ -430,7 +429,7 @@ def read_col_page(self, ncol, cluster_i):
if zigzag:
res = from_zigzag(res)
elif delta:
numpy.cumsum(res)
res = numpy.cumsum(res)
return res

def arrays(
Expand All @@ -457,17 +456,56 @@ def arrays(
[c.num_entries for c in clusters[start_cluster_idx:stop_cluster_idx]]
)

self._alias_columns_dict = {
el.field_id: el.physical_id
for i, el in enumerate(self.header.alias_columns)
}
self._column_recorrds_dict = {}
self._column_recorrds_dict = {
el.field_id: {
"rel_crs": [
*(self._column_recorrds_dict.get(el.field_id) or {}).get(
"rel_crs", []
),
el,
],
"rel_crs_idxs": [
*(self._column_recorrds_dict.get(el.field_id) or {}).get(
"rel_crs_idxs", []
),
i,
],
}
for i, el in enumerate(self.header.column_records)
}

self._related_ids = defaultdict(list)
for i, el in enumerate(self.header.field_records):
if el.parent_field_id != i:
self._related_ids[el.parent_field_id].append(i)

form = self.to_akform().select_columns(filter_names)
# only read columns mentioned in the awkward form
target_cols = []
container_dict = {}
_recursive_find(form, target_cols)
for i, cr in enumerate(self.column_records):
key = f"column-{i}"
dtype_byte = cr.type
if key in target_cols:
for key in target_cols:
if "column" in key:
key_nr = int(key.split("-")[1])
if key_nr in self._column_recorrds_dict:
id = key_nr
elif key_nr in self._alias_columns_dict:
id = self._alias_columns_dict[key_nr]
else:
raise (
RuntimeError(
f"The key: {key} is missing both from the columns records and the alias columns."
)
)

dtype_byte = self._column_recorrds_dict[id]["rel_crs"][0].type
content = self.read_col_pages(
i, range(start_cluster_idx, stop_cluster_idx)
id, range(start_cluster_idx, stop_cluster_idx)
)
if dtype_byte == uproot.const.rntuple_col_type_to_num_dict["switch"]:
kindex, tags = _split_switch_bits(content)
Expand Down
29 changes: 29 additions & 0 deletions tests/test_0962_RNTuple_update.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import pytest
import uproot
import awkward as ak
import skhep_testdata
import numpy as np

Expand Down Expand Up @@ -35,3 +36,31 @@ def test_new_support_RNTuple_split_int16_reading():
assert df.one_integers[-1] == 1
assert np.all(np.unique(df.one_integers[: len(df.one_integers) // 2]) == [2])
assert np.all(np.unique(df.one_integers[len(df.one_integers) / 2 + 1 :]) == [1])


# Skip (rather than fail) when the optional `zstandard` package cannot be
# imported. NOTE(review): presumably required because the remote file read by
# the test below is zstd-compressed (its URL contains "/zstd/") -- confirm.
pytest.importorskip("zstandard")


@pytest.mark.xfail(
    reason="Uproot can now read the data from event files (CMS/Atlas), but this test fails because the column matching logic is wrong.",
    strict=True,
)
def test_new_support_RNTuple_event_data():
    """Read the ``nTau`` field of a remote ``Events`` RNTuple and check it.

    The test asserts the total number of entries and the first ten ``nTau``
    values. It is marked as a strict expected failure, so an unexpected pass
    is reported as a failure.
    """
    remote_file = "https://xrootd-local.unl.edu:1094//store/user/AGC/nanoaod-rntuple/zstd/TT_TuneCUETP8M1_13TeV-powheg-pythia8/cmsopendata2015_ttbar_19980_PU25nsData2015v1_76X_mcRun2_asymptotic_v12_ext3-v1_00000_0000.root"
    expected_head = [0, 0, 2, 0, 1, 1, 1, 1, 2, 0]

    with uproot.open(remote_file) as root_file:
        events = root_file["Events"].arrays(["nTau"])
        assert len(events) == 1334428
        assert ak.to_list(events["nTau"][:10]) == expected_head