Some fixes from Kieran

HenryDayHall · Oct 1, 2023 · d75e099 · d75e099
1 parent aa1fdfc
commit d75e099
Show file tree

Hide file tree

Showing 5 changed files with 78 additions and 87 deletions.
diff --git a/jet_tools/CompareClusters.py b/jet_tools/CompareClusters.py
@@ -12,6 +12,7 @@
 import functools
 from .spectraljet import Components, FormJets, Constants, TypeTools
 from jet_tools import TrueTag, InputTools, JetQuality, PlottingTools, RemovePileup
+import pandas as pd
 
 SCORE_COLS = ["QualityWidth", "QualityFraction",
               "AveSignalMassRatio", "AveBGMassRatio",
@@ -846,8 +847,8 @@ def tabulate_scores(eventWise_paths, variable_cols=None, score_cols=None):
         names of the variable columns of table
     score_cols : list of string
         names of the score columns of table
-    table : awkward array in 2d
-        awkward array where each row represents a jet
+    table : pandas DataFrame
+        dataframe where each row represents a jet
         and each column is a hyperparameter
 
     """
@@ -877,7 +878,12 @@ def tabulate_scores(eventWise_paths, variable_cols=None, score_cols=None):
                 value = getattr(eventWise, f"{name}_{part}", "Undefined")
                 row.append(value)
             table.append(row)
-    table = TypeTools.StringyArray(table)
+    # Convert the collected rows into a pandas DataFrame
+    table = pd.DataFrame(table)
+
+    # Only name the columns if there are rows; an empty DataFrame has no columns to rename
+    if not table.empty:
+        table.columns = all_cols
     return all_cols, variable_cols, score_cols, table
 
 
@@ -1092,34 +1098,28 @@ def filter_traditional(all_cols, variable_cols, score_cols, table):
 def filter_matching(all_cols, table, exact=None, approx=None):
     """
     Construct a mask that selects rows of the table
-    which match the exact and approimate equality requirements.
+    which match the exact and approximate equality requirements.
     
     Parameters
     ----------
     all_cols : list of string
         names of the columns of table in the order found in table
-    table : awkward array in 2d
-        awkward array where each row represents a jet
+    table : pandas DataFrame or awkward array in 2D
+        DataFrame or awkward array where each row represents a jet
         and each column is a hyperparameter
 
     Returns
     -------
-    mask : array of bool
+    mask : iterable of bool
         rows to keep
     """
-    mask = np.full(len(table), True, dtype=bool)
+    mask = pd.Series([True]*table.shape[0])
     if exact is not None:
-        for name in exact:
-            column = [row[all_cols.index(name)] ==
-                      TypeTools.restring(exact[name])
-                      for row in table]
-            mask *= column
+        for name, value in exact.items():
+            mask &= (table[name] == TypeTools.restring(value))
     if approx is not None:
-        for name in approx:
-            column = [TypeTools.soft_equality(row[all_cols.index(name)],
-                                              approx[name])
-                      for row in table]
-            mask *= column
+        for name, value in approx.items():
+            mask &= table[name].apply(TypeTools.soft_equality, args=(value,))
     return mask
 
 
@@ -1628,8 +1628,8 @@ def project_2d(show_params, best_slice, all_cols, variable_cols, score_cols,
         names of the variable columns of table
     score_cols : list of string
         names of the score columns of table
-    table : awkward array in 2d
-        awkward array where each row represents a jet
+    table : pandas DataFrame
+        pandas dataframe where each row represents a jet
         and each column is a hyperparameter
     print_best : bool
         print the lowest loss in the slice and assocated parameter choices
@@ -1660,39 +1660,29 @@ def project_2d(show_params, best_slice, all_cols, variable_cols, score_cols,
     assert len(show_params) == 2
     show_indices = [all_cols.index(s) for s in show_params]
     distinct_values = []
-    for i in show_indices:
-        values = table[:, i]
-        distinct_values.append(np.array(TypeTools.generic_sort(set(values))))
+    for s in show_params:
+        values = table[s]
+        distinct_values.append(np.array(TypeTools.generic_sort(values.unique())))
 
-    bg_index = all_cols.index("SeperateAveDistanceBG")
-    sg_index = all_cols.index("SeperateAveDistanceSignal")
-    mult_index = all_cols.index("AveSeperateJets")
+    bg_col = "SeperateAveDistanceBG"
+    sg_col = "SeperateAveDistanceSignal"
+    mult_col = "AveSeperateJets"
     # if we are taking the best slice remove all other table rows
     if best_slice:
-        # old way (awkward 1.8.0)
-        #combined = np.sqrt(0.53*np.array(ak.to_list(table[:, bg_index]))**2 +
-        #                   np.array(ak.to_list(table[:, sg_index]))**2)
-        # TODO revert if 1.8.0 does bug fix
-        #combined = np.sqrt(0.53*np.array(list(table[:, bg_index]))**2 +
-        #                   np.array(list(table[:, sg_index]))**2)
-
-        #combined = np.sqrt(0.2*np.array(ak.to_list(table[:, bg_index]))**2 +
-        #                   np.array(ak.to_list(table[:, sg_index]))**2)
-        #combined = (np.sqrt(0.53*np.array(ak.to_list(table[:, bg_index]))**2 +
-        #             np.array(ak.to_list(table[:, sg_index]))**2)/
-        #             np.array(ak.to_list(table[:, mult_index])))
+        bg_values = convert_to_float(table[bg_col].to_numpy())
+        sg_values = convert_to_float(table[sg_col].to_numpy())
+        combined = np.sqrt(0.53 * np.square(bg_values) + np.square(sg_values))
         if isinstance(best_slice, bool):
             best_idx = np.nanargmin(combined.tolist())
         elif isinstance(best_slice, dict):
-            # then is is the values of the best slice
+            # then it is the values of the best slice
             mask = filter_matching(all_cols, table, approx=best_slice)
-            possible = np.where(mask)[0]
+            possible = mask.index[mask].tolist()
             distances = np.zeros_like(possible, dtype=float)
             for i in possible:
                 for v in variable_cols:
-                    col = all_cols.index(v)
-                    if table[i, col] != best_slice[v]:
-                        distances += abs(table[i, col] - best_slice[v])
+                    if table.loc[i, v] != best_slice[v]:
+                        distances += abs(table.loc[i, v] - best_slice[v])
             best_idx = possible[np.argmin(distances)]
         elif isinstance(best_slice, int):
             best_idx = best_slice
@@ -1701,39 +1691,33 @@ def project_2d(show_params, best_slice, all_cols, variable_cols, score_cols,
         if print_best:
             print(f"best score; {combined[best_idx]}")
             print(f"best row={best_idx};", end=' ')
-            print({v: table[best_idx, all_cols.index(v)] for v in
-                   ["jet_name", "eventWise_name"] + variable_cols})
+            print({v: table.loc[best_idx, v] for v in ["jet_name", "eventWise_name"] + variable_cols})
             print()
             print_best = False
-        close_filter = np.ones(num_configurations, dtype=bool)
-        for i in variable_idxs:
-            if i not in show_indices:
-                close_filter *= TypeTools.soft_equality(table[:, i],
-                                                        table[best_idx, i])
-            if not close_filter[best_idx]:
-                # the best must be close to the best...
-                raise ValueError(
-                    f"Failed to find the {all_cols[i]} that match {table[best_idx, i]}")
+        close_filter = pd.Series([True] * len(table), index=table.index)
+        for i in variable_cols:
+            if i not in show_params:
+                close_filter &= TypeTools.soft_equality(table[i], table.loc[best_idx, i])
+            if not close_filter.loc[best_idx]:
+                raise ValueError(f"Failed to find the {i} that match {table.loc[best_idx, i]}")
         table = table[close_filter]
     # make the images
     image_sg = np.full(tuple(len(val) for val in distinct_values), np.nan)
     image_bg = np.full(tuple(len(val) for val in distinct_values), np.nan)
     image_mult = np.full(tuple(len(val) for val in distinct_values), np.nan)
-    x_values = np.array(table[:, show_indices[0]].tolist())
-    y_values = np.array(table[:, show_indices[1]].tolist())
+    x_values = table[show_params[0]].values
+    y_values = table[show_params[1]].values
     for x, x_val in enumerate(distinct_values[0]):
         matches_x = TypeTools.soft_equality(x_values, x_val)
         for y, y_val in enumerate(distinct_values[1]):
             matches_y = TypeTools.soft_equality(y_values, y_val)
-            matches = matches_x*matches_y
-            image_bg[x, y] = np.nanmean(table[matches, bg_index].tolist())
-            image_sg[x, y] = np.nanmean(table[matches, sg_index].tolist())
-            image_mult[x, y] = np.nanmean(table[matches, mult_index].tolist())
-    combined = np.sqrt(0.53*image_bg**2 + image_sg**2)
-    #combined = np.sqrt(0.2*image_bg**2 + image_sg**2)
-    #combined = np.sqrt(0.53*image_bg**2 + image_sg**2)/image_mult
-    show_mappings = [{v:i for i, v in enumerate(dis)}
-                     for dis in distinct_values]
+            matches = matches_x & matches_y
+            image_bg[x, y] = np.nanmean(table[matches][bg_col].values)
+            image_sg[x, y] = np.nanmean(table[matches][sg_col].values)
+            image_mult[x, y] = np.nanmean(table[matches][mult_col].values)
+
+    combined = np.sqrt(0.53 * image_bg**2 + image_sg**2)
+    show_mappings = [{v: i for i, v in enumerate(dis)} for dis in distinct_values]
     return image_sg, image_bg, image_mult, combined, show_params, show_mappings
 
 
@@ -1838,13 +1822,13 @@ def tabulate_matching_scores(eventWise, jet_name_base, jet_PS_mask=None):
         names of the variable columns of table
     score_cols : list of string
         names of the score columns of table
-    table : awkward array in 2d
-        awkward array where each row represents a jet
+    table : pandas DataFrame
+        dataframe where each row represents a jet
         and each column is a hyperparameter
 
     """
     all_cols, variable_cols, score_cols, table = tabulate_scores(eventWise)
-    name_col = all_cols.index("jet_name")
+    name_col = "jet_name"
     if jet_PS_mask is None:
         def test(name):
             name = str(name)
@@ -1858,9 +1842,8 @@ def test(name):
                 return False
             return name.startswith(jet_name_base)
 
-    matches = np.fromiter((test(n) for n in table[:, name_col]),
-                          dtype=bool)
-    table = table[matches]
+    # Keep only the rows whose jet_name passes test
+    table = table[table[name_col].apply(test)]
     return all_cols, variable_cols, score_cols, table
 
 

diff --git a/jet_tools/CustomSampler.py b/jet_tools/CustomSampler.py
@@ -1,8 +1,7 @@
-import torch
-###from ipdb import set_trace as st
 import numpy as np
 from torch.utils.data.sampler import Sampler
 
+
 class ValidationRandomSampler(Sampler):
     """
     Samples elements with a validation set held out,

diff --git a/jet_tools/ParallelFormJets.py b/jet_tools/ParallelFormJets.py
@@ -527,7 +527,7 @@ def batch_cluster(eventWise, jet_class,
     jet_class : str or callable
         The algorithm to do the clustering.
         If it's a string it is the algorithms name
-        in the module FormJets
+        in the module FormJets or SGWTFormJets
     jet_name : str
         prefix of the jet variables being worked on in the file
     cluster_parameters : dict
@@ -541,6 +541,8 @@ def batch_cluster(eventWise, jet_class,
         condition.
     
     """
+    if isinstance(jet_class, str):
+        jet_class = FormJets.cluster_classes[jet_class]
     finished = FormJets.cluster_multiapply(eventWise, jet_class, cluster_parameters,
                                            jet_name=jet_name, batch_length=batch_size,
                                            silent=True, save_frequency=30)
@@ -657,7 +659,7 @@ def create_scan_lists(eventWise_path, jet_class,
         jet_class_name = jet_class.__name__
     all_cols, var_cols, table = tabulate_fragments(eventWise_path)
     if len(table):
-        existing_names = table[:, all_cols.index("jet_name")]
+        existing_names = table["jet_name"].tolist()
         for params in full_param_list:
             matches = CompareClusters.filter_matching(all_cols, table, params)
             if not np.any(matches):
@@ -715,11 +717,11 @@ def benchmark_inputs():
     #name_list.append("V2SpectralJet")
     #param_list.append(params)
     #class_list.append(FormJets.Spectral)
-    ## old spectral, new params
-    #params = oldSpectralarxiv
-    #name_list.append("V1SpectralJet")
-    #param_list.append(params)
-    #class_list.append(FormJets.Spectral)
+    # old spectral, new params
+    params = oldSpectralarxivIRSafe
+    name_list.append("V1SpectralJet")
+    param_list.append(params)
+    class_list.append(FormJets.Spectral)
     ## old spectral, new params
     #params = newUnsafeSpectralarxiv
     #name_list.append("UnsafeV2SpectralJet")
@@ -905,8 +907,6 @@ def run_list(eventWise_path, end_time, class_list, param_list, name_list):
                   }
 
 
-
-
 oldSpectraloldParams = {'DeltaR': 1.26,
                         'NumEigenvectors': np.inf,
                         'EigenvalueLimit': 0.4,
@@ -1201,6 +1201,12 @@ def run_list(eventWise_path, end_time, class_list, param_list, name_list):
                'CutoffKNN': [3, 5, 7, 9, None]}
 
 
+fix_SGWJ1 = {'Cutoff': 0,
+            'Normalised': True}
+scan_SGWJ1 = {'Sigma': [0.01, .1, 1.],
+              'NRounds': [5, 15, 30]}
+
+
 def tabulate_fragments(eventWise_paths):
     """ Makes the assumption that jets with the same name are the same jet"""
     if isinstance(eventWise_paths, Components.EventWise):
@@ -1214,7 +1220,7 @@ def tabulate_fragments(eventWise_paths):
     all_cols, var_cols, _, table = \
         CompareClusters.tabulate_scores(eventWise_paths, score_cols=[])
     if len(table):
-        jet_names = list(table[:, all_cols.index('jet_name')])
+        jet_names = list(table['jet_name'])
         single_mask = [name not in jet_names[:i]
                        for i, name in enumerate(jet_names)]
         table = table[single_mask]
@@ -1471,6 +1477,9 @@ def complete_sequence(eventWise_path, jet_class, end_time, scan_parameters,
         should the file be preped for irc calculations?
     
     """
+    if end_time < time.time():
+       print("Assuming end_time is a duration in seconds, as end_time < time.time()")
+       end_time += time.time()
     # inputs
     run_jetInputs(eventWise_path, end_time)
     if time.time() > end_time:
@@ -1507,7 +1516,7 @@ def complete_sequence(eventWise_path, jet_class, end_time, scan_parameters,
     if time.time() > end_time:
         return
     # score clusterings
-    if False:
+    if dijet_mass is not None:
         if dijet_mass == 'semileptonictop':
             run_semileptonic(eventWise_path, end_time)
         else:
@@ -1517,7 +1526,7 @@ def complete_sequence(eventWise_path, jet_class, end_time, scan_parameters,
     # calculate mass peaks
     if dijet_mass == 'semileptonictop':
         run_correct_semileptonic_masses(eventWise_path, end_time)
-    elif jet_pt_cut is not None:
+    elif jet_pt_cut is not None and dijet_mass is not None:
         run_correct_masses(eventWise_path, end_time, jet_pt_cut)
     if time.time() > end_time:
         return

diff --git a/jet_tools/spectraljet b/jet_tools/spectraljet
diff --git a/requirements.txt b/requirements.txt
@@ -17,8 +17,8 @@ tabulate >= 0.8.6
 torch >= 1.3.1
 scikit-hep >= 0.5.1
 bokeh >= 1.4.0
-psutil >= 5.6.7
 pygit2 >= 1.2.1
+pandas
 # cannot get mpi4py to install in github actions or travis.
 #nevergrad >= 0.4.2
 #mpi4py >= 3.0.3  # needed for abcpy, but abcpy is bad at installing it