Skip to content

Commit

Permalink
ServiceX Exploratory Notebook (#4) (#7)
Browse files Browse the repository at this point in the history
* Use `uproot.dask` directly to open up files rather than letting the automated process take care of it. This fixed the bug.
* Remove the awkward array restriction
* Update documentation to note the removal of the limitation
  • Loading branch information
gordonwatts authored Apr 2, 2024
1 parent 49c6a55 commit e83612f
Show file tree
Hide file tree
Showing 3 changed files with 81 additions and 28 deletions.
5 changes: 4 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,11 @@ awkward
hist[dask]
# Necessary due to bug in uproot/dask-awkward that prevents
# ak.concat working.
dask_awkward==2024.2.0
uproot
# Get the version with PHYSLITE support sort-of built in
func_adl_servicex_xaodr21>=2.0a1
ipywidgets

# To load the ServiceX outputs from S3 into uproot, we needed
# to add a few things by hand! :-(
fsspec-xrootd
100 changes: 76 additions & 24 deletions servicex/00-exploring-the-data.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,10 @@
"outputs": [],
"source": [
"ttbar_all_rucio_dataset_name = \"mc23_13p6TeV.601229.PhPy8EG_A14_ttbar_hdamp258p75_SingleLep.deriv.DAOD_PHYSLITE.e8514_s4162_r14622_p6026\"\n",
"ttbar_all = f\"rucio://{ttbar_all_rucio_dataset_name}?files=1\"\n",
"ds = SXDSAtlasxAODR21(ttbar_all, backend='atlasr22')"
"ttbar_all = f\"rucio://{ttbar_all_rucio_dataset_name}?files=4\"\n",
"ds = SXDSAtlasxAODR21(ttbar_all, backend='atlasr22')\n",
"\n",
"ds.return_qastle = True"
]
},
{
Expand Down Expand Up @@ -100,11 +102,68 @@
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"\"(call Select (call Select (call MetaData (call MetaData (call EventDataset 'bogus.root') (dict (list 'metadata_type' 'name' 'include_files' 'container_type' 'contains_collection' 'link_libraries') (list 'add_atlas_event_collection_info' 'EventInfo' (list 'xAODEventInfo/versions/EventInfo_v1.h') 'xAOD::EventInfo_v1' False (list 'xAODEventInfo')))) (dict (list 'metadata_type' 'name' 'include_files' 'container_type' 'element_type' 'contains_collection' 'link_libraries') (list 'add_atlas_event_collection_info' 'Jets' (list 'xAODJet/JetContainer.h') 'DataVector<xAOD::Jet_v1>' 'xAOD::Jet_v1' True (list 'xAODJet')))) (lambda (list e) (dict (list 'evt' 'jet') (list (call (attr e 'EventInfo') 'EventInfo') (call (attr e 'Jets') 'AnalysisJets'))))) (lambda (list ei) (dict (list 'event_number' 'run_number' 'jet_pt') (list (call (attr (attr ei 'evt') 'eventNumber')) (call (attr (attr ei 'evt') 'runNumber')) (call (attr (attr ei 'jet') 'Select') (lambda (list j) (/ (call (attr j 'pt')) 1000)))))))\""
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Start by grabbing the data as an awkward array\n",
"# TODO: Files should remain in the S3 cache and be read directly from there\n",
"data = query.AsAwkwardArray().value()"
"qastle_text = query.value()\n",
"qastle_text"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[WindowsPath('C:/Users/gordo/AppData/Local/Temp/servicex_gordo/data/0efb1cfc-144d-4191-a534-df85eee6e499/aa1e6d40432ec1f582cf3ae6669eaddb-TE.37223155._000310.pool.root.1'),\n",
" WindowsPath('C:/Users/gordo/AppData/Local/Temp/servicex_gordo/data/0efb1cfc-144d-4191-a534-df85eee6e499/612ea1fbd7d3d65292f5f9e4572d9c84-TE.37223155._000309.pool.root.1'),\n",
" WindowsPath('C:/Users/gordo/AppData/Local/Temp/servicex_gordo/data/0efb1cfc-144d-4191-a534-df85eee6e499/78ec35c7cec2203f7e0f1d88d1ca744e-TE.37223155._000289.pool.root.1'),\n",
" WindowsPath('C:/Users/gordo/AppData/Local/Temp/servicex_gordo/data/0efb1cfc-144d-4191-a534-df85eee6e499/765c5edfec0e370202db27045132f1e7-TE.37223155._000125.pool.root.1')]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from servicex import ServiceXDataset\n",
"ds_prime = ServiceXDataset(ttbar_all, backend_name='atlasr22')\n",
"# TODO: Why does `get_data_parquet` return no files, but `get_data_rootfiles` does?\n",
"# Download the files locally\n",
"files = ds_prime.get_data_rootfiles(qastle_text, title=\"First Request\")\n",
"# Get a URL so we can open over the internet\n",
"# files = ds_prime.get_data_rootfiles_uri(qastle_text, title=\"First Request\")\n",
"# files = [f.url for f in files]\n",
"\n",
"files"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"import uproot\n",
"data = uproot.dask({\n",
" f: 'atlas_xaod_tree'\n",
" for f in files\n",
"})"
]
},
{
Expand All @@ -118,7 +177,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -132,7 +191,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -146,7 +205,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 9,
"metadata": {},
"outputs": [
{
Expand All @@ -166,23 +225,23 @@
"<text text-anchor=\"middle\" x=\"125.0\" y=\"15\" style=\"fill:currentColor;\">\n",
"x-axis\n",
"</text>\n",
"<polyline points=\" 0,0 0,-0 12.5,-0 12.5,-0 25,-0 25,-0 37.5,-0 37.5,-0 50,-0 50,-0 62.5,-0 62.5,-0 75,-0 75,-0 87.5,-0 87.5,-0 100,-0 100,-0 112.5,-0 112.5,-0 125,-0 125,-0 137.5,-0 137.5,-0 150,-0 150,-0 162.5,-0 162.5,-0 175,-0 175,-0 187.5,-0 187.5,-100 200,-100 200,-0 212.5,-0 212.5,-0 225,-0 225,-0 237.5,-0 237.5,-0 250,-0 250,0\" style=\"fill:none; stroke:currentColor;\"/>\n",
"<polyline points=\" 0,0 0,-0 12.5,-0 12.5,-0 25,-0 25,-0 37.5,-0 37.5,-0 50,-0 50,-0 62.5,-0 62.5,-0 75,-0 75,-0 87.5,-0 87.5,-0 100,-0 100,-0 112.5,-0 112.5,-0 125,-0 125,-0 137.5,-0 137.5,-2.35 150,-2.35 150,-87.1 162.5,-87.1 162.5,-78.8 175,-78.8 175,-76.5 187.5,-76.5 187.5,-100 200,-100 200,-2.35 212.5,-2.35 212.5,-0 225,-0 225,-0 237.5,-0 237.5,-0 250,-0 250,0\" style=\"fill:none; stroke:currentColor;\"/>\n",
"</svg>\n",
"</div>\n",
"<div style=\"flex=grow:1;\">\n",
"Regular(20, 0, 1e+08, name='x', label='x-axis')<br/>\n",
"<hr style=\"margin-top:.2em; margin-bottom:.2em;\"/>\n",
"Int64() Σ=150000.0\n",
"Int64() Σ=590000.0\n",
"\n",
"</div>\n",
"</div>\n",
"</html>"
],
"text/plain": [
"Hist(Regular(20, 0, 1e+08, name='x', label='x-axis'), storage=Int64()) # Sum: 150000.0"
"Hist(Regular(20, 0, 1e+08, name='x', label='x-axis'), storage=Int64()) # Sum: 590000.0"
]
},
"execution_count": 7,
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -193,7 +252,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 10,
"metadata": {},
"outputs": [
{
Expand All @@ -213,37 +272,30 @@
"<text text-anchor=\"middle\" x=\"125.0\" y=\"15\" style=\"fill:currentColor;\">\n",
"Jet $p_T$\n",
"</text>\n",
"<polyline points=\" 0,0 0,-2.02 12.5,-2.02 12.5,-100 25,-100 25,-59.4 37.5,-59.4 37.5,-31.2 50,-31.2 50,-22.1 62.5,-22.1 62.5,-17.1 75,-17.1 75,-13.7 87.5,-13.7 87.5,-10.6 100,-10.6 100,-8.31 112.5,-8.31 112.5,-6.49 125,-6.49 125,-5.12 137.5,-5.12 137.5,-4 150,-4 150,-3.04 162.5,-3.04 162.5,-2.39 175,-2.39 175,-1.88 187.5,-1.88 187.5,-1.45 200,-1.45 200,-1.16 212.5,-1.16 212.5,-0.881 225,-0.881 225,-0.726 237.5,-0.726 237.5,-0.596 250,-0.596 250,0\" style=\"fill:none; stroke:currentColor;\"/>\n",
"<polyline points=\" 0,0 0,-1.99 12.5,-1.99 12.5,-100 25,-100 25,-59.3 37.5,-59.3 37.5,-31 50,-31 50,-22 62.5,-22 62.5,-17.2 75,-17.2 75,-13.6 87.5,-13.6 87.5,-10.6 100,-10.6 100,-8.35 112.5,-8.35 112.5,-6.52 125,-6.52 125,-5.11 137.5,-5.11 137.5,-3.96 150,-3.96 150,-3.06 162.5,-3.06 162.5,-2.37 175,-2.37 175,-1.86 187.5,-1.86 187.5,-1.44 200,-1.44 200,-1.15 212.5,-1.15 212.5,-0.897 225,-0.897 225,-0.734 237.5,-0.734 237.5,-0.583 250,-0.583 250,0\" style=\"fill:none; stroke:currentColor;\"/>\n",
"</svg>\n",
"</div>\n",
"<div style=\"flex=grow:1;\">\n",
"Regular(20, 0, 200, name='x', label='Jet $p_T$')<br/>\n",
"<hr style=\"margin-top:.2em; margin-bottom:.2em;\"/>\n",
"Int64() Σ=1435200.0 <em>(1450989.0 with flow)</em>\n",
"Int64() Σ=5643591.0 <em>(5705961.0 with flow)</em>\n",
"\n",
"</div>\n",
"</div>\n",
"</html>"
],
"text/plain": [
"Hist(Regular(20, 0, 200, name='x', label='Jet $p_T$'), storage=Int64()) # Sum: 1435200.0 (1450989.0 with flow)"
"Hist(Regular(20, 0, 200, name='x', label='Jet $p_T$'), storage=Int64()) # Sum: 5643591.0 (5705961.0 with flow)"
]
},
"execution_count": 8,
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"r2.compute()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
Expand All @@ -262,7 +314,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.10"
"version": "3.10.8"
}
},
"nbformat": 4,
Expand Down
4 changes: 1 addition & 3 deletions servicex/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,8 @@ This directory contains scripts and notebooks to implement fetching the data loc

The default `servicex.yaml` file was used from the UChicago AF.

Note that you'll need to be aware of the `requirements.txt` as a bug in `dask_awkward` means this can't run on the most recent version.

## Files

| File | Description |
|------|-------------|
| 00-exploring-the-data | Outlines the raw ServiceX code that we can use. We'll need to develop libraries which will obscure this code quite a bit given how many branches we'll need to load. This notebook can't run on the most recent version of `dask_awkward` - until [this bug](https://github.com/dask-contrib/dask-awkward/issues/456) is fixed. |
| 00-exploring-the-data | Outlines the raw ServiceX code that we can use. We'll need to develop libraries which will obscure this code quite a bit given how many branches we'll need to load. Working around [this bug](https://github.com/dask-contrib/dask-awkward/issues/456) makes the code a little more complex than it needs to be. |

0 comments on commit e83612f

Please sign in to comment.