Skip to content

Commit

Permalink
Cluster over stat instead of ifar. Make the background calculation _much_ faster.
Browse files Browse the repository at this point in the history
  • Loading branch information
Gareth Davies committed Jul 2, 2019
1 parent 2a84923 commit 618c6d7
Show file tree
Hide file tree
Showing 2 changed files with 74 additions and 103 deletions.
164 changes: 66 additions & 98 deletions bin/hdfcoinc/pycbc_multiifo_add_statmap
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,11 @@ parser.add_argument("--version", action="version", version=pycbc.version.git_ver
parser.add_argument('--verbose', action='store_true')
parser.add_argument('--statmap-files', nargs='+',
help="List of coinc files to be redistributed")
parser.add_argument('--censor-ifar-threshold', type=float, default=0.003,
help="If provided, only window out foreground triggers with IFAR (years)"
"above the threshold [default=0.003yr]")
parser.add_argument('--veto-window', type=float, default=0.1,
help="Time around each zerolag trigger to window out [default=.1s]")
parser.add_argument('--cluster-window', type=float)
parser.add_argument('--output-file', help="name of output file")
args = parser.parse_args()
Expand All @@ -28,34 +33,27 @@ f = h5py.File(args.output_file, "w")

logging.info('Copying segments and attributes to %s' % args.output_file)
# Move segments information into the final file - remove some duplication
# in earlier files
for fi in files:
for key in fi['segments']:
if key.startswith('foreground') or key.startswith('background'):
continue
f['segments/%s/end' % key] = fi['segments/%s/end' % key][:]
f['segments/%s/start' % key] = fi['segments/%s/start' % key][:]
if 'segments/foreground_veto' in fi:
f['segments/%s/foreground_veto/end' % key] = \
fi['segments/foreground_veto/end'][:]
f['segments/%s/foreground_veto/start' % key] = \
fi['segments/foreground_veto/start'][:]
for attr_name in fi.attrs:
if key not in f:
f.create_group(key)
f[key].attrs[attr_name] = fi.attrs[attr_name]

logging.info('Combining foreground and foreground excluded segments')
# Set up dictionaries to contain segments from the individual statmap files
# in earlier files. Also set up dictionaries to contain segments from the
# individual statmap files
indiv_segs = segments.segmentlistdict({})

# loop through statmap files and put segments into segmentlistdicts
for fi in files:
key = fi.attrs['ifos'].replace(' ','')
# get analysed segments from individual statmap files
starts = fi['segments/{}/start'.format(key)][:]
ends = fi['segments/{}/end'.format(key)][:]
indiv_segs[key] = pycbc.events.veto.start_end_to_segments(starts, ends)
f['segments/{}/start'.format(key)] = starts
f['segments/{}/end'.format(key)] = ends
if 'segments/foreground_veto' in fi:
f['segments/%s/foreground_veto/end' % key] = \
fi['segments/foreground_veto/end'][:]
f['segments/%s/foreground_veto/start' % key] = \
fi['segments/foreground_veto/start'][:]
for attr_name in fi.attrs:
if key not in f:
f.create_group(key)
f[key].attrs[attr_name] = fi.attrs[attr_name]

logging.info('Combining foreground segments')

# Convert segmentlistdict to a list ('seglists') of segmentlists
# then np.sum(seglists, axis=0) does seglists[0] + seglists[1] + ...
Expand All @@ -67,17 +65,10 @@ all_ifos = np.unique([ifo for fi in files
# output to file
f.attrs['foreground_time'] = abs(foreground_segs)

logging.info('Copying foreground & background common datasets')
keys_to_copy = ['decimation_factor', 'stat']
fg_bg = ['foreground','background', 'background_exc']
for fg_type in fg_bg:
for k in keys_to_copy:
pycbc.io.combine_and_copy(f, files, fg_type + '/' + k)

fg_only_keys_to_copy = ['template_id','timeslide_id', 'ifar', 'ifar_exc']
logging.info('Copying foreground-only datasets')
for k in fg_only_keys_to_copy:
pycbc.io.combine_and_copy(f, files, 'foreground/' + k)
logging.info('Copying foreground datasets')
for k in files[0]['foreground']:
if not k.startswith('fap') and k not in all_ifos:
pycbc.io.combine_and_copy(f, files, 'foreground/' + k)

logging.info('Collating triggers into single structure')

Expand Down Expand Up @@ -108,51 +99,18 @@ for ifo in all_ifos:
f['foreground/{}/time'.format(ifo)] = all_trig_times[ifo]
f['foreground/{}/trigger_id'.format(ifo)] = all_trig_ids[ifo]

logging.info('Getting ifo combination information for each coincidence')
for f_in in files:
key = f_in.attrs['ifos'].replace(' ','')

for fg_type in fg_bg:
ifo_combo_key = fg_type + '/ifo_combination'
fg_comb_repeat = np.array(np.repeat(key.encode('utf8'),
f_in[fg_type + '/stat'].size))
if ifo_combo_key in f:
ifo_comb_fg = f[ifo_combo_key][:]
del f[ifo_combo_key]
ifo_comb_fg = np.concatenate([ifo_comb_fg, fg_comb_repeat])
else:
ifo_comb_fg = fg_comb_repeat

f[ifo_combo_key]=ifo_comb_fg

del fg_comb_repeat, ifo_comb_fg

logging.info('Working available ifo combinations are available for each '
'coincidence')

logging.info('Finding indices of which background events are from which detector combination')

where_combo = {ifo_c:np.where(f['background/ifo_combination'][:]==ifo_c)[0]
for ifo_c in f['segments'] if ifo_c is not 'foreground_veto'}
where_combo_exc = {ifo_c:np.where(f['background_exc/ifo_combination'][:]==ifo_c)[0]
for ifo_c in f['segments'] if ifo_c is not 'foreground_veto'}

logging.info('{} triggers'.format(f['foreground/ifar'].size))
ifar_stat = np.core.records.fromarrays([f['foreground/ifar'][:],
f['foreground/stat'][:]],
names='ifar,stat')
n_triggers = f['foreground/ifar'].size
logging.info('{} triggers'.format(n_triggers))

# all_times is a tuple of trigger time arrays
all_times = (f['foreground/%s/time' % ifo][:] for ifo in all_ifos)

def argmax(v):
    """Return the index of the maximum value in *v*.

    Uses np.argmax (O(n)) rather than the original np.argsort(v)[-1]
    (O(n log n)). On ties this returns the index of the *first* maximum;
    the argsort form's tie order was unspecified (default sort is not
    stable), so no caller could rely on it.
    """
    return np.argmax(v)

# Currently only clustering zerolag, i.e. foreground, so set all timeslide_ids
# to zero
cidx = pycbc.events.cluster_coincs_multiifo(ifar_stat, all_times,
np.zeros(len(ifar_stat)), 0,
args.cluster_window, argmax)
cidx = pycbc.events.cluster_coincs_multiifo(f['foreground/stat'][:], all_times,
np.zeros(n_triggers), 0,
args.cluster_window, argmax=np.argmax)


def filter_dataset(h5file, name, idx):
# Dataset needs to be deleted and remade as it is a different size
Expand All @@ -169,35 +127,49 @@ for key in f['foreground'].keys():
for k in f['foreground/%s' % key].keys():
filter_dataset(f, 'foreground/{}/{}'.format(key, k), cidx)

n_triggers = f['foreground/ifar'].size

times_tuple = (f['foreground/{}/time'.format(ifo)] for ifo in all_ifos)
test_times = np.array([pycbc.events.mean_if_greater_than_zero(tc)[0]
for tc in zip(*times_tuple)])

is_in_combo_time = {}
for key in f['segments']:
is_in_combo_time[key] = np.zeros_like(f['foreground/decimation_factor'][:])
is_in_combo_time[key] = np.zeros(n_triggers)
if key.startswith('foreground') or key.startswith('background'):
continue
end_times = np.array(f['segments/%s/end' % key][:])
start_times = np.array(f['segments/%s/start' % key][:])
idx_within_segment = pycbc.events.indices_within_times(test_times,
start_times,
end_times)
is_in_combo_time[key][idx_within_segment] += np.ones_like(idx_within_segment)
is_in_combo_time[key][idx_within_segment] = np.ones_like(idx_within_segment)
del idx_within_segment


all_indices = np.arange(f['foreground/decimation_factor'].size)
available_combos =[' '.join(sorted([key for key in is_in_combo_time if is_in_combo_time[key][i]])).encode('utf8') for i in all_indices]
del all_indices
f['foreground/available_combinations'] = available_combos
available_combos =[' '.join(sorted([key for key in is_in_combo_time if is_in_combo_time[key][i]])).encode('utf8') for i in np.arange(n_triggers)]

all_combo_types = np.unique(available_combos)
idx = {ct:np.where(np.array(available_combos)==ct)[0]
for ct in all_combo_types}

del available_combos

logging.info('Calculating n_louder background triggers in each type for all foreground events')

fnlouder = {}
fnlouder_exc = {}
for f_in in files:
ifo_combo_key = f_in.attrs['ifos'].replace(' ','')
_, fnlouder[ifo_combo_key] = coinc.calculate_n_louder(
f_in['background/stat'][:],
f['foreground/stat'][:],
f_in['background/decimation_factor'][:]
)
_, fnlouder_exc[ifo_combo_key] = coinc.calculate_n_louder(
f_in['background_exc/stat'][:],
f['foreground/stat'][:],
f_in['background_exc/decimation_factor'][:]
)
logging.info('Recalculating ifar according to summed trigger distributions')

fg_ifar = np.zeros_like(f['foreground/decimation_factor'][:])
Expand All @@ -216,30 +188,26 @@ for ct in all_combo_types:
largest_combination = cts[np.argmax([len(ifo_c) for ifo_c in cts])]
bg_time = f[largest_combination].attrs['background_time']
bg_time_exc = f[largest_combination].attrs['background_time_exc']
inc_bg_list = [where_combo[ifo_c] for ifo_c in cts]
inc_bg = list(itertools.chain(*inc_bg_list))
inc_bg_exc_list = [where_combo_exc[ifo_c] for ifo_c in cts]
inc_bg_exc = list(itertools.chain(*inc_bg_exc_list))
_, fnlouder = coinc.calculate_n_louder(f['background/stat'][:][inc_bg],
f['foreground/stat'][:][idx[ct]],
f['background/decimation_factor'][:][inc_bg])
_, fnlouder_exc = coinc.calculate_n_louder(
f['background_exc/stat'][:][inc_bg_exc],
f['foreground/stat'][:][idx[ct]],
f['background_exc/decimation_factor'][:][inc_bg_exc]
)
fnlouder = np.sum([fnlouder[ifo_combo_key][idx[ct]] for ifo_combo_key in cts])
fnlouder_exc = np.sum([fnlouder_exc[ifo_combo_key][idx[ct]] for ifo_combo_key in cts])
ifar = bg_time / (fnlouder + 1)
ifar_exc = bg_time_exc / (fnlouder_exc + 1)
fg_ifar[idx[ct]] = conv.sec_to_year(ifar)
fg_ifar_exc[idx[ct]] = conv.sec_to_year(ifar_exc)

for bg_type in ['background', 'background_exc']:
for k in ['stat','decimation_factor', 'ifo_combination']:
print(bg_type + '/' + k)
if bg_type + '/' + k in f:
print('deleting')
del f[bg_type + '/' + k]
else: print('not deleting')
f.attrs['foreground_time_exc'] = f.attrs['foreground_time']

# Construct the foreground censor veto from the clustered candidate times
# above the ifar threshold
thr = test_times[fg_ifar > args.censor_ifar_threshold]
vstart = thr - args.veto_window
vend = thr + args.veto_window
vtime = segments.segmentlist([segments.segment(s, e)
for s, e in zip(vstart, vend)])
logging.info('Censoring %.2f seconds', abs(vtime))
f.attrs['foreground_time_exc'] -= abs(vtime)
f['segments/foreground_veto/start'] = vstart
f['segments/foreground_veto/end'] = vend

f['foreground/ifar'][:] = fg_ifar
f['foreground/fap'] = 1 - np.exp(-f.attrs['foreground_time'] / fg_ifar)
Expand Down
13 changes: 8 additions & 5 deletions bin/hdfcoinc/pycbc_multiifo_coinc_statmap
Original file line number Diff line number Diff line change
Expand Up @@ -109,14 +109,17 @@ else:

logging.info("We have %s triggers" % len(all_trigs.stat))
fore_locs = all_trigs.timeslide_id == 0
# Foreground trigger times for ifo.
# Foreground trigger times for ifos
fore_time = {}
for ifo in ifos:
fore_time[ifo] = all_trigs.data['%s/time' % ifo][fore_locs]
# Average times of triggers from ifo1 and ifo2
ave_fore_time = 0
for ifo in ifos:
ave_fore_time += fore_time[ifo] / len(ifos)
# Average times of triggers (note that coincs where not all ifos have triggers
# will have some -1 sentinel values)
fore_time_zip = zip(*fore_time.values())
ave_fore_time = []
for ts in fore_time_zip:
ave_fore_time.append(coinc.mean_if_greater_than_zero(ts)[0])
ave_fore_time = numpy.array(ave_fore_time)

# Remove start and end time around every average foreground trigger time to
# window around.
Expand Down

0 comments on commit 618c6d7

Please sign in to comment.