Skip to content

Commit

Permalink
Some fixes from Kieran
Browse files Browse the repository at this point in the history
  • Loading branch information
HenryDayHall committed Oct 1, 2023
1 parent aa1fdfc commit d75e099
Show file tree
Hide file tree
Showing 5 changed files with 78 additions and 87 deletions.
125 changes: 54 additions & 71 deletions jet_tools/CompareClusters.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import functools
from .spectraljet import Components, FormJets, Constants, TypeTools
from jet_tools import TrueTag, InputTools, JetQuality, PlottingTools, RemovePileup
import pandas as pd

SCORE_COLS = ["QualityWidth", "QualityFraction",
"AveSignalMassRatio", "AveBGMassRatio",
Expand Down Expand Up @@ -846,8 +847,8 @@ def tabulate_scores(eventWise_paths, variable_cols=None, score_cols=None):
names of the variable columns of table
score_cols : list of string
names of the score columns of table
table : awkward array in 2d
awkward array where each row represents a jet
table : pandas DataFrame
DataFrame where each row represents a jet
and each column is a hyperparameter
"""
Expand Down Expand Up @@ -877,7 +878,12 @@ def tabulate_scores(eventWise_paths, variable_cols=None, score_cols=None):
value = getattr(eventWise, f"{name}_{part}", "Undefined")
row.append(value)
table.append(row)
table = TypeTools.StringyArray(table)
# Convert the list of rows into a DataFrame (column names are assigned below)
table = pd.DataFrame(table)

# Check if table is empty
if not table.empty:
table.columns = all_cols
return all_cols, variable_cols, score_cols, table


Expand Down Expand Up @@ -1092,34 +1098,28 @@ def filter_traditional(all_cols, variable_cols, score_cols, table):
def filter_matching(all_cols, table, exact=None, approx=None):
"""
Construct a mask that selects rows of the table
which match the exact and approimate equality requirements.
which match the exact and approximate equality requirements.
Parameters
----------
all_cols : list of string
names of the columns of table in the order found in table
table : awkward array in 2d
awkward array where each row represents a jet
table : pandas DataFrame or awkward array in 2D
DataFrame or awkward array where each row represents a jet
and each column is a hyperparameter
Returns
-------
mask : array of bool
mask : iterable of bool
rows to keep
"""
mask = np.full(len(table), True, dtype=bool)
mask = pd.Series([True]*table.shape[0])
if exact is not None:
for name in exact:
column = [row[all_cols.index(name)] ==
TypeTools.restring(exact[name])
for row in table]
mask *= column
for name, value in exact.items():
mask &= (table[name] == TypeTools.restring(value))
if approx is not None:
for name in approx:
column = [TypeTools.soft_equality(row[all_cols.index(name)],
approx[name])
for row in table]
mask *= column
for name, value in approx.items():
mask &= table[name].apply(TypeTools.soft_equality, args=(value,))
return mask


Expand Down Expand Up @@ -1628,8 +1628,8 @@ def project_2d(show_params, best_slice, all_cols, variable_cols, score_cols,
names of the variable columns of table
score_cols : list of string
names of the score columns of table
table : awkward array in 2d
awkward array where each row represents a jet
table : pandas DataFrame
DataFrame where each row represents a jet
and each column is a hyperparameter
print_best : bool
print the lowest loss in the slice and associated parameter choices
Expand Down Expand Up @@ -1660,39 +1660,29 @@ def project_2d(show_params, best_slice, all_cols, variable_cols, score_cols,
assert len(show_params) == 2
show_indices = [all_cols.index(s) for s in show_params]
distinct_values = []
for i in show_indices:
values = table[:, i]
distinct_values.append(np.array(TypeTools.generic_sort(set(values))))
for s in show_params:
values = table[s]
distinct_values.append(np.array(TypeTools.generic_sort(values.unique())))

bg_index = all_cols.index("SeperateAveDistanceBG")
sg_index = all_cols.index("SeperateAveDistanceSignal")
mult_index = all_cols.index("AveSeperateJets")
bg_col = "SeperateAveDistanceBG"
sg_col = "SeperateAveDistanceSignal"
mult_col = "AveSeperateJets"
# if we are taking the best slice remove all other table rows
if best_slice:
# old way (awkward 1.8.0)
#combined = np.sqrt(0.53*np.array(ak.to_list(table[:, bg_index]))**2 +
# np.array(ak.to_list(table[:, sg_index]))**2)
# TODO revert if 1.8.0 does bug fix
#combined = np.sqrt(0.53*np.array(list(table[:, bg_index]))**2 +
# np.array(list(table[:, sg_index]))**2)

#combined = np.sqrt(0.2*np.array(ak.to_list(table[:, bg_index]))**2 +
# np.array(ak.to_list(table[:, sg_index]))**2)
#combined = (np.sqrt(0.53*np.array(ak.to_list(table[:, bg_index]))**2 +
# np.array(ak.to_list(table[:, sg_index]))**2)/
# np.array(ak.to_list(table[:, mult_index])))
bg_values = convert_to_float(table[bg_col].to_numpy())
sg_values = convert_to_float(table[sg_col].to_numpy())
combined = np.sqrt(0.53 * np.square(bg_values) + np.square(sg_values))
if isinstance(best_slice, bool):
best_idx = np.nanargmin(combined.tolist())
elif isinstance(best_slice, dict):
# then is is the values of the best slice
# then it is the values of the best slice
mask = filter_matching(all_cols, table, approx=best_slice)
possible = np.where(mask)[0]
possible = mask.index[mask].tolist()
distances = np.zeros_like(possible, dtype=float)
for i in possible:
for v in variable_cols:
col = all_cols.index(v)
if table[i, col] != best_slice[v]:
distances += abs(table[i, col] - best_slice[v])
if table.loc[i, v] != best_slice[v]:
distances += abs(table.loc[i, v] - best_slice[v])
best_idx = possible[np.argmin(distances)]
elif isinstance(best_slice, int):
best_idx = best_slice
Expand All @@ -1701,39 +1691,33 @@ def project_2d(show_params, best_slice, all_cols, variable_cols, score_cols,
if print_best:
print(f"best score; {combined[best_idx]}")
print(f"best row={best_idx};", end=' ')
print({v: table[best_idx, all_cols.index(v)] for v in
["jet_name", "eventWise_name"] + variable_cols})
print({v: table.loc[best_idx, v] for v in ["jet_name", "eventWise_name"] + variable_cols})
print()
print_best = False
close_filter = np.ones(num_configurations, dtype=bool)
for i in variable_idxs:
if i not in show_indices:
close_filter *= TypeTools.soft_equality(table[:, i],
table[best_idx, i])
if not close_filter[best_idx]:
# the best must be close to the best...
raise ValueError(
f"Failed to find the {all_cols[i]} that match {table[best_idx, i]}")
close_filter = pd.Series([True] * len(table), index=table.index)
for i in variable_cols:
if i not in show_params:
close_filter &= TypeTools.soft_equality(table[i], table.loc[best_idx, i])
if not close_filter.loc[best_idx]:
raise ValueError(f"Failed to find the {i} that match {table.loc[best_idx, i]}")
table = table[close_filter]
# make the images
image_sg = np.full(tuple(len(val) for val in distinct_values), np.nan)
image_bg = np.full(tuple(len(val) for val in distinct_values), np.nan)
image_mult = np.full(tuple(len(val) for val in distinct_values), np.nan)
x_values = np.array(table[:, show_indices[0]].tolist())
y_values = np.array(table[:, show_indices[1]].tolist())
x_values = table[show_params[0]].values
y_values = table[show_params[1]].values
for x, x_val in enumerate(distinct_values[0]):
matches_x = TypeTools.soft_equality(x_values, x_val)
for y, y_val in enumerate(distinct_values[1]):
matches_y = TypeTools.soft_equality(y_values, y_val)
matches = matches_x*matches_y
image_bg[x, y] = np.nanmean(table[matches, bg_index].tolist())
image_sg[x, y] = np.nanmean(table[matches, sg_index].tolist())
image_mult[x, y] = np.nanmean(table[matches, mult_index].tolist())
combined = np.sqrt(0.53*image_bg**2 + image_sg**2)
#combined = np.sqrt(0.2*image_bg**2 + image_sg**2)
#combined = np.sqrt(0.53*image_bg**2 + image_sg**2)/image_mult
show_mappings = [{v:i for i, v in enumerate(dis)}
for dis in distinct_values]
matches = matches_x & matches_y
image_bg[x, y] = np.nanmean(table[matches][bg_col].values)
image_sg[x, y] = np.nanmean(table[matches][sg_col].values)
image_mult[x, y] = np.nanmean(table[matches][mult_col].values)

combined = np.sqrt(0.53 * image_bg**2 + image_sg**2)
show_mappings = [{v: i for i, v in enumerate(dis)} for dis in distinct_values]
return image_sg, image_bg, image_mult, combined, show_params, show_mappings


Expand Down Expand Up @@ -1838,13 +1822,13 @@ def tabulate_matching_scores(eventWise, jet_name_base, jet_PS_mask=None):
names of the variable columns of table
score_cols : list of string
names of the score columns of table
table : awkward array in 2d
awkward array where each row represents a jet
table : pandas DataFrame
DataFrame where each row represents a jet
and each column is a hyperparameter
"""
all_cols, variable_cols, score_cols, table = tabulate_scores(eventWise)
name_col = all_cols.index("jet_name")
name_col = "jet_name"
if jet_PS_mask is None:
def test(name):
name = str(name)
Expand All @@ -1858,9 +1842,8 @@ def test(name):
return False
return name.startswith(jet_name_base)

matches = np.fromiter((test(n) for n in table[:, name_col]),
dtype=bool)
table = table[matches]
# Using pandas to filter rows
table = table[table[name_col].apply(test)]
return all_cols, variable_cols, score_cols, table


Expand Down
3 changes: 1 addition & 2 deletions jet_tools/CustomSampler.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
import torch
###from ipdb import set_trace as st
import numpy as np
from torch.utils.data.sampler import Sampler


class ValidationRandomSampler(Sampler):
"""
Samples elements with a validation set held out,
Expand Down
33 changes: 21 additions & 12 deletions jet_tools/ParallelFormJets.py
Original file line number Diff line number Diff line change
Expand Up @@ -527,7 +527,7 @@ def batch_cluster(eventWise, jet_class,
jet_class : str or callable
The algorithm to do the clustering.
If it's a string it is the algorithm's name
in the module FormJets
in the module FormJets or SGWTFormJets
jet_name : str
prefix of the jet variables being worked on in the file
cluster_parameters : dict
Expand All @@ -541,6 +541,8 @@ def batch_cluster(eventWise, jet_class,
condition.
"""
if isinstance(jet_class, str):
jet_class = FormJets.cluster_classes[jet_class]
finished = FormJets.cluster_multiapply(eventWise, jet_class, cluster_parameters,
jet_name=jet_name, batch_length=batch_size,
silent=True, save_frequency=30)
Expand Down Expand Up @@ -657,7 +659,7 @@ def create_scan_lists(eventWise_path, jet_class,
jet_class_name = jet_class.__name__
all_cols, var_cols, table = tabulate_fragments(eventWise_path)
if len(table):
existing_names = table[:, all_cols.index("jet_name")]
existing_names = table["jet_name"].tolist()
for params in full_param_list:
matches = CompareClusters.filter_matching(all_cols, table, params)
if not np.any(matches):
Expand Down Expand Up @@ -715,11 +717,11 @@ def benchmark_inputs():
#name_list.append("V2SpectralJet")
#param_list.append(params)
#class_list.append(FormJets.Spectral)
## old spectral, new params
#params = oldSpectralarxiv
#name_list.append("V1SpectralJet")
#param_list.append(params)
#class_list.append(FormJets.Spectral)
# old spectral, new params
params = oldSpectralarxivIRSafe
name_list.append("V1SpectralJet")
param_list.append(params)
class_list.append(FormJets.Spectral)
## old spectral, new params
#params = newUnsafeSpectralarxiv
#name_list.append("UnsafeV2SpectralJet")
Expand Down Expand Up @@ -905,8 +907,6 @@ def run_list(eventWise_path, end_time, class_list, param_list, name_list):
}




oldSpectraloldParams = {'DeltaR': 1.26,
'NumEigenvectors': np.inf,
'EigenvalueLimit': 0.4,
Expand Down Expand Up @@ -1201,6 +1201,12 @@ def run_list(eventWise_path, end_time, class_list, param_list, name_list):
'CutoffKNN': [3, 5, 7, 9, None]}


fix_SGWJ1 = {'Cutoff': 0,
'Normalised': True}
scan_SGWJ1 = {'Sigma': [0.01, .1, 1.],
'NRounds': [5, 15, 30]}


def tabulate_fragments(eventWise_paths):
""" Makes the assumption that jets with the same name are the same jet"""
if isinstance(eventWise_paths, Components.EventWise):
Expand All @@ -1214,7 +1220,7 @@ def tabulate_fragments(eventWise_paths):
all_cols, var_cols, _, table = \
CompareClusters.tabulate_scores(eventWise_paths, score_cols=[])
if len(table):
jet_names = list(table[:, all_cols.index('jet_name')])
jet_names = list(table['jet_name'])
single_mask = [name not in jet_names[:i]
for i, name in enumerate(jet_names)]
table = table[single_mask]
Expand Down Expand Up @@ -1471,6 +1477,9 @@ def complete_sequence(eventWise_path, jet_class, end_time, scan_parameters,
should the file be prepped for irc calculations?
"""
if end_time < time.time():
print("Assuming end_time is a duration in seconds, as end_time < time.time()")
end_time += time.time()
# inputs
run_jetInputs(eventWise_path, end_time)
if time.time() > end_time:
Expand Down Expand Up @@ -1507,7 +1516,7 @@ def complete_sequence(eventWise_path, jet_class, end_time, scan_parameters,
if time.time() > end_time:
return
# score clusterings
if False:
if dijet_mass is not None:
if dijet_mass == 'semileptonictop':
run_semileptonic(eventWise_path, end_time)
else:
Expand All @@ -1517,7 +1526,7 @@ def complete_sequence(eventWise_path, jet_class, end_time, scan_parameters,
# calculate mass peaks
if dijet_mass == 'semileptonictop':
run_correct_semileptonic_masses(eventWise_path, end_time)
elif jet_pt_cut is not None:
elif jet_pt_cut is not None and dijet_mass is not None:
run_correct_masses(eventWise_path, end_time, jet_pt_cut)
if time.time() > end_time:
return
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@ tabulate >= 0.8.6
torch >= 1.3.1
scikit-hep >= 0.5.1
bokeh >= 1.4.0
psutil >= 5.6.7
pygit2 >= 1.2.1
pandas
# cannot get mpi4py to install in github actions or travis.
#nevergrad >= 0.4.2
#mpi4py >= 3.0.3 # needed for abcpy, but abcpy is bad at installing it
Expand Down

0 comments on commit d75e099

Please sign in to comment.