feat: include complete list of files #28

Merged: 12 commits, Apr 12, 2024
1 change: 1 addition & 0 deletions .gitattributes
@@ -0,0 +1 @@
*.zip filter=lfs diff=lfs merge=lfs -text
2 changes: 2 additions & 0 deletions .gitignore
@@ -169,3 +169,5 @@ servicex.yaml
*.pstats
servicex/test.py
*.html
.DS_Store
figures/
4 changes: 3 additions & 1 deletion README.md
@@ -36,13 +36,15 @@ with additional files:
* `input_files/container_list.txt`: list of containers to run over
* `input_files/produce_container_metadata.py`: query metadata for containers: number of files / events, size
* `input_files/container_metadata.json`: output of `input_files/produce_container_metadata.py` with container metadata
* `input_files/get_file_list.py`: for a given dataset creates a txt file listing file access paths that include apropriate xcache. The same kind of output can be obtained by doing:
* `input_files/get_file_list.py`: for a given dataset creates a txt file listing file access paths that include appropriate xcache. The same kind of output can be obtained by doing:

```
export SITE_NAME=AF_200
rucio list-file-replicas mc20_13TeV:mc20_13TeV.364126.Sherpa_221_NNPDF30NNLO_Zee_MAXHTPTV500_1000.deriv.DAOD_PHYSLITE.e5299_s3681_r13145_p6026 --protocol root --pfns --rses MWT2_UC_LOCALGROUPDISK
```

* `input_files/containers_to_files.py`: processes the list of containers into a list of files per container with hardcoded xcache instances and writes the output to `input_files/file_lists/*`.

### Branch list determination

Branches to be read are determined with a 2018 data file.
30 changes: 30 additions & 0 deletions input_files/containers_to_files.py
@@ -0,0 +1,30 @@
# process list of containers into list of files with hardcoded xcache instances

# to run get_file_list.py, use e.g. a venv on uchicago via ssh
# python3 -m venv venv
# source venv/bin/activate
# pip install xmltodict
# (assuming setupATLAS / lsetup rucio + proxy present)

import os
import shutil

if __name__ == "__main__":
with open("container_list.txt") as f:
containers = f.readlines()

for container in containers:
container = container.strip()

if "#" in container:
continue # skip comments

cmd = f"python get_file_list.py {container}"
print(cmd)
os.system(cmd) # produce file list

# create zipped version of folder with file lists
shutil.make_archive("file_lists", "zip", "file_lists")

# cleanup: delete non-zipped version
shutil.rmtree("file_lists")
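
The loop above shells out with `os.system`, which does not surface failures from `get_file_list.py`. Below is a minimal sketch (not part of this PR) of the same loop using `subprocess.run` with a return-code check; file names match the ones used above.

```
# Sketch only, not part of this PR: the same loop with basic error checking.
import subprocess
import sys

with open("container_list.txt") as f:
    containers = [line.strip() for line in f]

for container in containers:
    if not container or container.startswith("#"):
        continue  # skip blank lines and comments

    result = subprocess.run(
        [sys.executable, "get_file_list.py", container],
        capture_output=True,
        text=True,
    )
    if result.returncode != 0:
        print(f"file list generation failed for {container}:\n{result.stderr}")
```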
3 changes: 3 additions & 0 deletions input_files/file_lists.zip
Git LFS file not shown
7 changes: 6 additions & 1 deletion input_files/get_file_list.py
@@ -13,6 +13,7 @@
import logging
import xmltodict
import hashlib
import os

from rucio.common.exception import DataIdentifierNotFound
from rucio.client.scopeclient import ScopeClient
@@ -140,12 +141,16 @@ def hash_string(input_string):
return int_value


output_directory = "file_lists"
if not os.path.exists(output_directory):
os.mkdir(output_directory)

cf = []
for f in files:
c = hash_string(f) % len(caches)
cf.append(f'root://{caches[c]}//{f}')
print(f)

with open(f'{did}.txt', 'w') as file:
with open(f'{output_directory}/{did.replace(":", "-")}.txt', 'w') as file:
for f in cf:
file.write(f + '\n')
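
The definitions of `hash_string` and `caches` sit in the collapsed part of the diff. The cache assignment itself is a stable hash modulo the number of xcache instances; below is a minimal sketch, assuming an md5-based `hash_string` and using placeholder cache hostnames and file path (both hypothetical, not taken from this PR).

```
# Sketch only: deterministic file -> xcache assignment, mirroring the loop above.
# hash_string here is a hypothetical md5-based stand-in; caches and the file
# path are placeholders, not values from this repository.
import hashlib

caches = ["xcache01.example.org:1094", "xcache02.example.org:1094"]  # placeholders


def hash_string(input_string):
    # stable integer so the same file always maps to the same cache
    return int(hashlib.md5(input_string.encode()).hexdigest(), 16)


f = "atlasdatadisk/rucio/mc20_13TeV/DAOD_PHYSLITE.12345._000001.pool.root.1"  # placeholder
c = hash_string(f) % len(caches)
print(f"root://{caches[c]}//{f}")
```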
69 changes: 69 additions & 0 deletions input_files/utils.py
@@ -0,0 +1,69 @@
from collections import defaultdict
import json
from pathlib import Path
import zipfile

from . import find_containers

DIR = Path(__file__).parent.resolve()


def get_dsids(process):
if "data" not in process:
return find_containers.container_dict[process]
else:
return [process]


def get_fileset(processes_to_use, max_files_per_container=None):
with open(DIR / "container_metadata.json") as f:
container_metadata = json.load(f) # name -> metadata

container_to_file_list = {} # container name -> list of files

# read from zipped file
with zipfile.ZipFile(DIR / "file_lists.zip") as z:
for filename in sorted(z.namelist()):
container_name = filename.split("/")[-1:][0].replace("-", ":").replace(".txt", "")
with z.open(filename) as f:
file_list = f.readlines()

# limit amount of files per container
if max_files_per_container is not None:
file_list = file_list[:max_files_per_container]

container_to_file_list[container_name] = [p.decode("utf-8").strip() for p in file_list]

fileset = defaultdict(lambda: defaultdict(dict)) # process -> list of files
total_nfiles = 0
total_size_TB = 0
total_nevts = 0
for process in processes_to_use:
dsids = get_dsids(process)
for dsid in dsids:
# find matching containers
matching_containers = [c for c in list(container_to_file_list.keys()) if str(dsid) in c]
# for each container, add full list of files
for container in matching_containers:
file_list = container_to_file_list[container]
total_nfiles += len(file_list)
if max_files_per_container is None:
assert len(file_list) == container_metadata[container]["nfiles"]
total_size_TB += container_metadata[container]["size_TB"]
total_nevts += container_metadata[container]["nevts"]
fileset[process]["files"].update(dict(zip(file_list, ["CollectionTree"]*len(file_list))))

print("fileset summary")
print(f" - number of files: {total_nfiles:,}")
if max_files_per_container is None:
print(f" - total size: {total_size_TB:.3f} TB")
print(f" - number of nevts: {total_nevts:,}")
else:
print("cannot determine total size / number of events when max_files_per_container is being used")

return fileset


if __name__ == "__main__":
processes = ["db", "zjets", "wjets", "ttV", "othertop", "ttbar", "data15_13TeV", "data16_13TeV", "data17_13TeV", "data18_13TeV"]
get_fileset(processes)
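
For reference, `get_fileset` returns a nested mapping of process name to a `"files"` dictionary keyed by file path, with the tree name (`CollectionTree`) as the value. A small usage sketch, assuming the module is importable as a package from the repository root (the relative `find_containers` import requires that); the import path is an assumption.

```
# Sketch only: inspecting the fileset structure produced by get_fileset.
# Assumes running from the repository root so the relative import in utils.py
# resolves (e.g. via python -m ...); the import path below is hypothetical.
from input_files.utils import get_fileset

fileset = get_fileset(["ttbar"], max_files_per_container=2)

for process, spec in fileset.items():
    files = spec["files"]  # {file path: tree name}, always "CollectionTree" here
    print(process, "->", len(files), "files")
    for path, tree in list(files.items())[:2]:
        print("  ", path, tree)
```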