feat: include complete list of files #28

Merged: 12 commits, Apr 12, 2024
1 change: 1 addition & 0 deletions .gitattributes
@@ -0,0 +1 @@
*.zip filter=lfs diff=lfs merge=lfs -text
2 changes: 2 additions & 0 deletions .gitignore
@@ -169,3 +169,5 @@ servicex.yaml
*.pstats
servicex/test.py
*.html
.DS_Store
figures/
4 changes: 3 additions & 1 deletion README.md
@@ -36,13 +36,15 @@ with additional files:
* `input_files/container_list.txt`: list of containers to run over
* `input_files/produce_container_metadata.py`: query metadata for containers: number of files / events, size
* `input_files/container_metadata.json`: output of `input_files/produce_container_metadata.py` with container metadata
* `input_files/get_file_list.py`: for a given dataset creates a txt file listing file access paths that include apropriate xcache. The same kind of output can be obtained by doing:
* `input_files/get_file_list.py`: for a given dataset creates a txt file listing file access paths that include appropriate xcache. The same kind of output can be obtained by doing:

```
export SITE_NAME=AF_200
rucio list-file-replicas mc20_13TeV:mc20_13TeV.364126.Sherpa_221_NNPDF30NNLO_Zee_MAXHTPTV500_1000.deriv.DAOD_PHYSLITE.e5299_s3681_r13145_p6026 --protocol root --pfns --rses MWT2_UC_LOCALGROUPDISK
```

* `input_files/containers_to_files.py`: processes the list of containers into a list of files per container with hardcoded xcache instances and writes the output to `input_files/file_lists/*`.

### Branch list determination

Branches to be read are determined with a 2018 data file.
30 changes: 30 additions & 0 deletions input_files/containers_to_files.py
@@ -0,0 +1,30 @@
# process list of containers into list of files with hardcoded xcache instances

# to run get_file_list.py, use e.g. a venv on uchicago via ssh
# python3 -m venv venv
# source venv/bin/activate
# pip install xmltodict
# (assuming setupATLAS / lsetup rucio + proxy present)

import os
import shutil

if __name__ == "__main__":
with open("container_list.txt") as f:
containers = f.readlines()

for container in containers:
container = container.strip()

if "#" in container:
continue # skip comments

cmd = f"python get_file_list.py {container}"
print(cmd)
os.system(cmd) # produce file list

# create zipped version of folder with file lists
shutil.make_archive("file_lists", "zip", "file_lists")

# cleanup: delete non-zipped version
shutil.rmtree("file_lists")
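
The loop above shells out with `os.system`, which does not surface failures from `get_file_list.py`. Below is a minimal sketch (not part of this PR) of the same loop using `subprocess.run` with a return-code check; file names match the ones used above.

```
# Sketch only, not part of this PR: the same loop with basic error checking.
import subprocess
import sys

with open("container_list.txt") as f:
    containers = [line.strip() for line in f]

for container in containers:
    if not container or container.startswith("#"):
        continue  # skip blank lines and comments

    result = subprocess.run(
        [sys.executable, "get_file_list.py", container],
        capture_output=True,
        text=True,
    )
    if result.returncode != 0:
        print(f"file list generation failed for {container}:\n{result.stderr}")
```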
3 changes: 3 additions & 0 deletions input_files/file_lists.zip
Git LFS file not shown
7 changes: 6 additions & 1 deletion input_files/get_file_list.py
@@ -13,6 +13,7 @@
import logging
import xmltodict
import hashlib
import os

from rucio.common.exception import DataIdentifierNotFound
from rucio.client.scopeclient import ScopeClient
@@ -140,12 +141,16 @@ def hash_string(input_string):
return int_value


output_directory = "file_lists"
if not os.path.exists(output_directory):
os.mkdir(output_directory)

cf = []
for f in files:
c = hash_string(f) % len(caches)
cf.append(f'root://{caches[c]}//{f}')
print(f)

with open(f'{did}.txt', 'w') as file:
with open(f'{output_directory}/{did.replace(":", "-")}.txt', 'w') as file:
for f in cf:
file.write(f + '\n')
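
The definitions of `hash_string` and `caches` sit in the collapsed part of the diff. The cache assignment itself is a stable hash modulo the number of xcache instances; below is a minimal sketch, assuming an md5-based `hash_string` and using placeholder cache hostnames and file path (both hypothetical, not taken from this PR).

```
# Sketch only: deterministic file -> xcache assignment, mirroring the loop above.
# hash_string here is a hypothetical md5-based stand-in; caches and the file
# path are placeholders, not values from this repository.
import hashlib

caches = ["xcache01.example.org:1094", "xcache02.example.org:1094"]  # placeholders


def hash_string(input_string):
    # stable integer so the same file always maps to the same cache
    return int(hashlib.md5(input_string.encode()).hexdigest(), 16)


f = "atlasdatadisk/rucio/mc20_13TeV/DAOD_PHYSLITE.12345._000001.pool.root.1"  # placeholder
c = hash_string(f) % len(caches)
print(f"root://{caches[c]}//{f}")
```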
69 changes: 69 additions & 0 deletions input_files/utils.py
@@ -0,0 +1,69 @@
from collections import defaultdict
import json
from pathlib import Path
import zipfile

from . import find_containers

DIR = Path(__file__).parent.resolve()


def get_dsids(process):
if "data" not in process:
return find_containers.container_dict[process]
else:
return [process]


def get_fileset(processes_to_use, max_files_per_container=None):
with open(DIR / "container_metadata.json") as f:
container_metadata = json.load(f) # name -> metadata

container_to_file_list = {} # container name -> list of files

# read from zipped file
with zipfile.ZipFile(DIR / "file_lists.zip") as z:
for filename in sorted(z.namelist()):
container_name = filename.split("/")[-1:][0].replace("-", ":").replace(".txt", "")
with z.open(filename) as f:
file_list = f.readlines()

# limit amount of files per container
if max_files_per_container is not None:
file_list = file_list[:max_files_per_container]

container_to_file_list[container_name] = [p.decode("utf-8").strip() for p in file_list]

fileset = defaultdict(lambda: defaultdict(dict)) # process -> list of files
total_nfiles = 0
total_size_TB = 0
total_nevts = 0
for process in processes_to_use:
dsids = get_dsids(process)
for dsid in dsids:
# find matching containers
matching_containers = [c for c in list(container_to_file_list.keys()) if str(dsid) in c]
# for each container, add full list of files
for container in matching_containers:
file_list = container_to_file_list[container]
total_nfiles += len(file_list)
if max_files_per_container is None:
assert len(file_list) == container_metadata[container]["nfiles"]
total_size_TB += container_metadata[container]["size_TB"]
total_nevts += container_metadata[container]["nevts"]
fileset[process]["files"].update(dict(zip(file_list, ["CollectionTree"]*len(file_list))))

print("fileset summary")
print(f" - number of files: {total_nfiles:,}")
if max_files_per_container is None:
print(f" - total size: {total_size_TB:.3f} TB")
print(f" - number of nevts: {total_nevts:,}")
else:
print("cannot determine total size / number of events when max_files_per_container is being used")

return fileset


if __name__ == "__main__":
processes = ["db", "zjets", "wjets", "ttV", "othertop", "ttbar", "data15_13TeV", "data16_13TeV", "data17_13TeV", "data18_13TeV"]
get_fileset(processes)
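
For reference, `get_fileset` returns a nested mapping of process name to a `"files"` dictionary keyed by file path, with the tree name (`CollectionTree`) as the value. A small usage sketch, assuming the module is importable as a package from the repository root (the relative `find_containers` import requires that); the import path is an assumption.

```
# Sketch only: inspecting the fileset structure produced by get_fileset.
# Assumes running from the repository root so the relative import in utils.py
# resolves (e.g. via python -m ...); the import path below is hypothetical.
from input_files.utils import get_fileset

fileset = get_fileset(["ttbar"], max_files_per_container=2)

for process, spec in fileset.items():
    files = spec["files"]  # {file path: tree name}, always "CollectionTree" here
    print(process, "->", len(files), "files")
    for path, tree in list(files.items())[:2]:
        print("  ", path, tree)
```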