diff --git a/docs/mva/index.rst b/docs/mva/index.rst
index 6e93e1f1..9b76c6ac 100644
--- a/docs/mva/index.rst
+++ b/docs/mva/index.rst
@@ -10,20 +10,30 @@ Introduction
 classification problems. It is based on machine learning methods available in
 scikit-learn_. Internally, the tables are dealt with the Pandas_ Python module.
 
-For each type of camera a regressor/classifier should be trained. For both type of models
-an average of the image estimates is later computed to determine a global
-output for the event (energy or score/gammaness).
+For each type of camera a regressor/classifier should be trained.
+For both types of models an average of the image estimates is later computed to
+determine a global output for the event (energy or score/gammaness).
 
 Details
 -------
 
-Data is split in train and test subsamples by images.
+Data is split into train and test subsamples by single-telescope images.
 
-The class `TrainModel` uses a training sample composed of gamma-rays for a
+The class ``TrainModel`` uses a training sample composed of gamma-rays for a
 regression model. In addition of a gamma-ray sample, a sample of
-protons is also used to build a classifier. The training of a model is done via
-the GridSearchCV_ algorithm which allows to find the best hyper-parameters of
-the models.
+protons is also used to build a classifier.
+
+The training of a model can also be done via the GridSearchCV_ algorithm,
+which finds the best hyper-parameters of the models.
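+
+When this option is enabled, the hyper-parameters are given as lists of values
+and the best combination is selected by cross-validation. A minimal,
+self-contained illustration of the underlying scikit-learn machinery (the
+estimator and the grid values below are only examples):
+
+.. code-block:: python
+
+    from sklearn.ensemble import RandomForestRegressor
+    from sklearn.model_selection import GridSearchCV
+
+    # hyper-parameters to scan, each given as a list of values
+    param_grid = {"n_estimators": [50, 100], "max_depth": [None, 10]}
+
+    search = GridSearchCV(RandomForestRegressor(), param_grid,
+                          scoring="explained_variance", cv=2)
+    # after search.fit(X_train, y_train) the tuned model is
+    # available as search.best_estimator_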
+
+Supported models:
+
+- ``sklearn.ensemble.RandomForestClassifier``
+- ``sklearn.ensemble.RandomForestRegressor``
+- ``sklearn.ensemble.AdaBoostRegressor``
+
+For details about the generation of each model type, please refer to
+:ref:`model_building`.
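+
+Internally, the model class is imported dynamically from its full name as
+written in the configuration file. The following sketch mirrors what
+``protopipe-MODEL`` does (the class name is an example; the hyper-parameters
+come from the ``Method`` section of the configuration file):
+
+.. code-block:: python
+
+    import importlib
+
+    model_to_use = "sklearn.ensemble.RandomForestRegressor"
+
+    # "sklearn.ensemble.RandomForestRegressor" -> "sklearn.ensemble" + "RandomForestRegressor"
+    module_name = ".".join(model_to_use.split(".", 2)[:-1])
+    class_name = model_to_use.split(".")[-1]
+
+    model_class = getattr(importlib.import_module(module_name), class_name)
+    model = model_class(n_estimators=50)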
 
 Reference/API
 -------------
diff --git a/docs/scripts/DL2.rst b/docs/scripts/DL2.rst
index a19028af..79e34048 100644
--- a/docs/scripts/DL2.rst
+++ b/docs/scripts/DL2.rst
@@ -15,14 +15,11 @@ By invoking the help argument, you can get help about how the script works:
 
 .. code-block::
 
-    usage: protopipe-DL2 [-h] --config_file CONFIG_FILE -o OUTFILE [-m MAX_EVENTS]
-                         [-i INDIR] [-f [INFILE_LIST [INFILE_LIST ...]]]
-                         [--cam_ids [CAM_IDS [CAM_IDS ...]]] [--wave_dir WAVE_DIR]
-                         [--wave_temp_dir WAVE_TEMP_DIR] [--wave | --tail]
-                         [--debug] [--regressor_dir REGRESSOR_DIR]
-                         [--classifier_dir CLASSIFIER_DIR]
-                         [--force_tailcut_for_extended_cleaning FORCE_TAILCUT_FOR_EXTENDED_CLEANING]
-                         [--save_images]
+    usage: protopipe-DL2 [-h] --config_file CONFIG_FILE -o OUTFILE [-m MAX_EVENTS] [-i INDIR] [-f [INFILE_LIST [INFILE_LIST ...]]]
+                         [--cam_ids [CAM_IDS [CAM_IDS ...]]] [--wave_dir WAVE_DIR] [--wave_temp_dir WAVE_TEMP_DIR] [--wave | --tail] [--debug]
+                         [--regressor_dir REGRESSOR_DIR] [--classifier_dir CLASSIFIER_DIR]
+                         [--force_tailcut_for_extended_cleaning FORCE_TAILCUT_FOR_EXTENDED_CLEANING] [--save_images]
+                         [--regressor_config REGRESSOR_CONFIG] [--classifier_config CLASSIFIER_CONFIG]
 
     optional arguments:
      -h, --help            show this help message and exit
@@ -35,11 +32,9 @@ By invoking the help argument, you can get help about how the script works:
                            give a specific list of files to run on
      --cam_ids [CAM_IDS [CAM_IDS ...]]
                            give the specific list of camera types to run on
-     --wave_dir WAVE_DIR   directory where to find mr_filter. if not set look in
-                           $PATH
+     --wave_dir WAVE_DIR   directory where to find mr_filter. if not set look in $PATH
      --wave_temp_dir WAVE_TEMP_DIR
-                           directory where mr_filter to store the temporary fits
-                           files
+                           directory where mr_filter to store the temporary fits files
      --wave                if set, use wavelet cleaning -- default
      --tail                if set, use tail cleaning, otherwise wavelets
      --debug               Print debugging information
@@ -50,3 +45,7 @@ By invoking the help argument, you can get help about how the script works:
      --force_tailcut_for_extended_cleaning FORCE_TAILCUT_FOR_EXTENDED_CLEANING
                            For tailcut cleaning for energy/score estimation
      --save_images         Save images in images.h5 (one file testing)
+     --regressor_config REGRESSOR_CONFIG
+                           Configuration file used to produce regressor model
+     --classifier_config CLASSIFIER_CONFIG
+                           Configuration file used to produce classification model
diff --git a/docs/scripts/data_training.rst b/docs/scripts/data_training.rst
index daf31c7e..24ee6300 100644
--- a/docs/scripts/data_training.rst
+++ b/docs/scripts/data_training.rst
@@ -19,15 +19,10 @@ By invoking the help argument, you can get help about how the script works:
 
 .. code-block::
 
-    usage: protopipe-TRAINING [-h] --config_file CONFIG_FILE -o OUTFILE
-                              [-m MAX_EVENTS] [-i INDIR]
-                              [-f [INFILE_LIST [INFILE_LIST ...]]]
-                              [--cam_ids [CAM_IDS [CAM_IDS ...]]]
-                              [--wave_dir WAVE_DIR]
-                              [--wave_temp_dir WAVE_TEMP_DIR] [--wave | --tail]
-                              [--debug] [--save_images]
-                              [--estimate_energy ESTIMATE_ENERGY]
-                              [--regressor_dir REGRESSOR_DIR]
+    usage: protopipe-TRAINING [-h] --config_file CONFIG_FILE -o OUTFILE [-m MAX_EVENTS] [-i INDIR] [-f [INFILE_LIST [INFILE_LIST ...]]]
+                              [--cam_ids [CAM_IDS [CAM_IDS ...]]] [--wave_dir WAVE_DIR] [--wave_temp_dir WAVE_TEMP_DIR] [--wave | --tail]
+                              [--debug] [--save_images] [--estimate_energy ESTIMATE_ENERGY] [--regressor_dir REGRESSOR_DIR]
+                              [--regressor_config REGRESSOR_CONFIG]
 
     optional arguments:
      -h, --help            show this help message and exit
@@ -40,20 +35,19 @@ By invoking the help argument, you can get help about how the script works:
                            give a specific list of files to run on
      --cam_ids [CAM_IDS [CAM_IDS ...]]
                            give the specific list of camera types to run on
-     --wave_dir WAVE_DIR   directory where to find mr_filter. if not set look in
-                           $PATH
+     --wave_dir WAVE_DIR   directory where to find mr_filter. if not set look in $PATH
      --wave_temp_dir WAVE_TEMP_DIR
-                           directory where mr_filter to store the temporary fits
-                           files
+                           directory where mr_filter to store the temporary fits files
      --wave                if set, use wavelet cleaning -- default
      --tail                if set, use tail cleaning, otherwise wavelets
      --debug               Print debugging information
      --save_images         Save also all images
      --estimate_energy ESTIMATE_ENERGY
-                           Estimate the events' energy with a regressor from
-                           protopipe.scripts.build_model
+                           Estimate the events' energy with a regressor from protopipe.scripts.build_model
      --regressor_dir REGRESSOR_DIR
                            regressors directory
+     --regressor_config REGRESSOR_CONFIG
+                           Configuration file used to produce regressor model
 
 The configuration file used by this script is ``analysis.yaml``,
diff --git a/docs/scripts/model_building.rst b/docs/scripts/model_building.rst
index 7e83f5cb..341167c5 100644
--- a/docs/scripts/model_building.rst
+++ b/docs/scripts/model_building.rst
@@ -15,10 +15,8 @@ The following is the help output which shows required arguments and options.
 
 .. code-block::
 
     >$ protopipe-MODEL -h
-    usage: protopipe-MODEL [-h] --config_file CONFIG_FILE
-                           [--max_events MAX_EVENTS] [--wave | --tail]
-                           (--cameras_from_config | --cameras_from_file | --cam_id_list CAM_ID_LIST)
-                           [-i INDIR] [--infile_signal INFILE_SIGNAL]
+    usage: protopipe-MODEL [-h] --config_file CONFIG_FILE [--max_events MAX_EVENTS] [--wave | --tail]
+                           (--cameras_from_config | --cameras_from_file | --cam_id_list CAM_ID_LIST) [-i INDIR] [--infile_signal INFILE_SIGNAL]
                            [--infile_background INFILE_BACKGROUND] [-o OUTDIR]
 
     Build model for regression/classification
@@ -27,9 +25,9 @@ The following is the help output which shows required arguments and options.
      -h, --help            show this help message and exit
      --config_file CONFIG_FILE
      --max_events MAX_EVENTS
-                           maximum number of events for training
+                           maximum number of events to use
      --wave                if set, use wavelet cleaning
-     --tail                if set, use tail cleaning, otherwise wavelets
+     --tail                if set, use tail cleaning (default), otherwise wavelets
      --cameras_from_config
                            Get cameras configuration file (Priority 1)
      --cameras_from_file   Get cameras from input file (Priority 2)
@@ -44,10 +42,13 @@ The following is the help output which shows required arguments and options.
      -o OUTDIR, --outdir OUTDIR
 
 The script takes along its arguments a configuration file which depends on what
-type of estimator needs to be trained:
+type of model needs to be built.
 
-* ``regressor.yaml`` is used to train an energy regressor,
-* ``classifier.yaml`` is used to train a gamma/hadron classifier.
+The available choices can be found under ``protopipe.aux.example_config_files``:
+
+* ``AdaBoostRegressor.yaml`` is used to train an energy regressor,
+* ``RandomForestRegressor.yaml`` is used to train an energy regressor,
+* ``RandomForestClassifier.yaml`` is used to train a gamma/hadron classifier.
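+
+All of these configurations share the ``FeatureList`` section: ``Basic``
+features are read directly from the input table, while ``Derived`` ones are
+evaluated from it with ``pandas.DataFrame.eval``. A small illustration of this
+mechanism (the column values below are made up):
+
+.. code-block:: python
+
+    import pandas as pd
+
+    ds = pd.DataFrame({"hillas_width": [0.1],
+                       "hillas_length": [0.4],
+                       "hillas_intensity": [150.0]})
+
+    # column name : expression, as in the ``Derived`` section
+    derived = {"log10_WLS": "log10(hillas_width*hillas_length/hillas_intensity)"}
+
+    for name, expression in derived.items():
+        ds.eval(f"{name} = {expression}", inplace=True)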
 
 Energy regressor
 ----------------
@@ -57,44 +58,86 @@
 and some event characteristics (the features) to reconstruct the energy.
 This table is created in the :ref:`data_training` step.
 
 The following is a commented example of the required configuration file
-``regressor.yaml``:
+``AdaBoostRegressor.yaml``, whose options are similar to those of
+``RandomForestRegressor.yaml``:
 
 .. code-block:: yaml
 
    General:
-     model_type: 'regressor'
-     # [...] = your analysis local full path OUTSIDE the Vagrant box
-     data_dir: '[...]/shared_folder/analyses/v0.4.0_dev1/data/TRAINING/for_energy_estimation'
-     data_file: 'TRAINING_energy_tail_gamma_merged.h5'
-     outdir: '[...]/shared_folder/analyses/v0.4.0_dev1/estimators/energy_regressor'
-     cam_id_list: ['LSTCam', 'NectarCam']
-     table_name_template: '' # leave empty (TO BE REMOVED)
-
+     # [...] = your analysis local full path OUTSIDE the Vagrant box
+     data_dir: '../../data/'
+     data_sig_file: 'TRAINING_energy_tail_gamma_merged.h5'
+     outdir: './'
+
+     # List of cameras to use (you can override this from the CLI)
+     cam_id_list: ['LSTCam', 'NectarCam']
+
+   # If train_fraction is 1, all the TRAINING dataset will be used to train the
+   # model and benchmarking can only be done from the benchmarking notebook
+   # TRAINING/benchmarks_DL2_to_classification.ipynb
   Split:
-     train_fraction: 0.8
+     train_fraction: 0.8
+     use_same_number_of_sig_and_bkg_for_training: False # Lowest statistics will drive the split
+
+   # Optimize the hyper-parameters of the estimator with a grid search
+   # If True parameters should be provided as lists
+   # If False the model used will be the one based on the chosen single-valued hyper-parameters
+   GridSearchCV:
+     use: False # True or False
+     # if False the following two variables are irrelevant
+     scoring: 'explained_variance'
+     cv: 2
 
   Method:
-     name: 'AdaBoostRegressor'
-     target_name: 'true_energy'
-     tuned_parameters:
-       learning_rate: [0.3]
-       n_estimators: [100]
-       base_estimator__max_depth: [null] # null is equivalent to None
-       base_estimator__min_samples_split: [2]
-       base_estimator__min_samples_leaf: [10]
-     scoring: 'explained_variance'
-     cv: 2
-
+     name: 'sklearn.ensemble.AdaBoostRegressor'
+     target_name: 'true_energy'
+     # Please, see scikit-learn's API for what each parameter means
+     # NOTE: null == None
+     base_estimator:
+       name: 'sklearn.tree.DecisionTreeRegressor'
+       parameters:
+         # NOTE: here we set the parameters relevant for sklearn.tree.DecisionTreeRegressor
+         criterion: "mse" # "mse", "friedman_mse", "mae" or "poisson"
+         splitter: "best" # "best" or "random"
+         max_depth: null # null or integer
+         min_samples_split: 2 # integer or float
+         min_samples_leaf: 1 # int or float
+         min_weight_fraction_leaf: 0.0 # float
+         max_features: null # null, "auto", "sqrt", "log2", int or float
+         max_leaf_nodes: null # null or integer
+         min_impurity_decrease: 0.0 # float
+         random_state: 0 # null or integer or RandomState
+         ccp_alpha: 0.0 # non-negative float
+     tuned_parameters:
+       n_estimators: 50
+       learning_rate: 1
+       loss: 'linear' # 'linear', 'square' or 'exponential'
+       random_state: 0 # int, RandomState instance or None
+
+   # List of the features to use to train the model
+   # You can:
+   # - comment/uncomment the ones you see here,
+   # - add new ones here if they can be evaluated with pandas.DataFrame.eval
+   # - if not you can propose modifications to protopipe.mva.utils.prepare_data
  FeatureList:
-     - 'log10_hillas_intensity'
-     - 'log10_impact_dist'
-     - 'hillas_width_reco'
-     - 'hillas_length_reco'
-     - 'h_max'
-
+     Basic: # single-named, they need to correspond to input data columns
+       - 'h_max' # Height of shower maximum from stereoscopic reconstruction
+       - 'impact_dist' # Impact parameter from stereoscopic reconstruction
+       - 'hillas_width' # Image Width
+       - 'hillas_length' # Image Length
+       # - 'concentration_pixel' # Percentage of photo-electrons in the brightest pixel
+       - 'leakage_intensity_width_1_reco' # fraction of total Intensity which is contained in the outermost pixels of the camera
+     Derived: # custom evaluations of basic features that will be added to the data
+       # column name : expression to evaluate using basic column names
+       log10_WLS: log10(hillas_width*hillas_length/hillas_intensity)
+       log10_intensity: log10(hillas_intensity)
+       r_origin: (sqrt((hillas_x - az)**2 + (hillas_y - alt)**2))**2
+       phi_origin: arctan2(hillas_y - alt, hillas_x - az)
+
+   # These cuts select the input data BEFORE training
  SigFiducialCuts:
-     - 'good_image == 1'
-     - 'is_valid == True'
+     - 'good_image == 1'
+     - 'is_valid == True'
+     - 'hillas_intensity_reco > 0'
 
   Diagnostic:
    # Energy binning (used for reco and true energy)
@@ -136,58 +179,97 @@
 as a contamination). An alternative approach - yet to study - could be to train
 a classifier with gamma against a background sample composed of weighted hadrons
 and weighted electrons.
 
+The following is the example provided by the configuration file
+``RandomForestClassifier.yaml``:
+
 .. code-block:: yaml
 
   General:
-    model_type: 'classifier'
-    # [...] = your analysis local full path OUTSIDE the Vagrant box
-    data_dir: '[...]/shared_folder/analyses/v0.4.0_dev1/data/TRAINING/for_particle_classification/'
-    data_sig_file: 'TRAINING_classification_tail_gamma_merged.h5'
-    data_bkg_file: 'TRAINING_classification_tail_proton_merged.h5'
-    cam_id_list: ['LSTCam', 'NectarCam']
-    table_name_template: '' # leave empty (TO BE REMOVED)
-    outdir: '[...]/shared_folder/analyses/v0.4.0_dev1/estimators/gamma_hadron_classifier'
-
+    # [...] = your analysis local full path OUTSIDE the Vagrant box
+    data_dir: '../../data/' # '[...]/data/TRAINING/for_particle_classification/'
+    data_sig_file: 'TRAINING_classification_tail_gamma_merged.h5'
+    data_bkg_file: 'TRAINING_classification_tail_proton_merged.h5'
+    outdir: './' # [...]/estimators/gamma_hadron_classifier
+
+    # List of cameras to use (protopipe-MODEL help output for other options)
+    cam_id_list: ['LSTCam', 'NectarCam']
+
+  # If train_fraction is 1, all the TRAINING dataset will be used to train the
+  # model and benchmarking can only be done from the benchmarking notebook
+  # TRAINING/benchmarks_DL2_to_classification.ipynb
  Split:
-    train_fraction: 0.8
-    use_same_number_of_sig_and_bkg_for_training: False # Lowest statistics will drive the split
-
+    train_fraction: 0.8
+    use_same_number_of_sig_and_bkg_for_training: False # Lowest statistics will drive the split
+
+  # Optimize the hyper-parameters of the estimator with a grid search
+  # If 'True' parameters should be provided as lists (for None use [null])
+  # If 'False' the model used will be the one based on the chosen single-valued hyper-parameters
+  GridSearchCV:
+    use: False # 'True' or 'False'
+    # if False the following two variables are irrelevant
+    scoring: 'roc_auc'
+    cv: 2
+
+  # Definition of the algorithm/method used and its hyper-parameters
  Method:
-    name: 'RandomForestClassifier' # AdaBoostClassifier or RandomForestClassifier
-    target_name: 'label'
-    tuned_parameters: # these are lists of values used by the GridSearchCV algorithm
-      n_estimators: [200]
-      max_depth: [10] # null for None
-      max_features: [3] # possible choices are “auto”, “sqrt”, “log2”, int or float
-      min_samples_split: [10]
-      min_samples_leaf: [10]
-    scoring: 'roc_auc' # possible choices are 'roc_auc', 'explained_variance'
-    cv: 2
-    use_proba: True # If not output is score
-    calibrate_output: False # If true calibrate probability
-
+    name: 'sklearn.ensemble.RandomForestClassifier' # DO NOT CHANGE
+    target_name: 'label' # defined between 0 and 1 (DO NOT CHANGE)
+    tuned_parameters:
+      # Please, see scikit-learn's API for what each parameter means
+      # WARNING: null (not a string) == 'None'
+      n_estimators: 100 # integer
+      criterion: 'gini' # 'gini' or 'entropy'
+      max_depth: null # null or integer
+      min_samples_split: 2 # integer or float
+      min_samples_leaf: 1 # integer or float
+      min_weight_fraction_leaf: 0.0 # float
+      max_features: 3 # 'auto', 'sqrt', 'log2', integer or float
+      max_leaf_nodes: null # null or integer
+      min_impurity_decrease: 0.0 # float
+      bootstrap: False # True or False
+      oob_score: False # True or False
+      n_jobs: null # null or integer
+      random_state: 0 # null or integer or RandomState
+      verbose: 0 # integer
+      warm_start: False # True or False
+      class_weight: null # 'balanced', 'balanced_subsample', null, dict or list of dicts
+      ccp_alpha: 0.0 # non-negative float
+      max_samples: null # null, integer or float
+    calibrate_output: False # If True calibrate model on test data
+
+  # List of the features to use to train the model
+  # You can:
+  # - comment/uncomment the ones you see here,
+  # - add new ones here if they can be evaluated with pandas.DataFrame.eval
+  # - if not you can propose modifications to protopipe.mva.utils.prepare_data
  FeatureList:
-    - 'log10_reco_energy'
-    - 'log10_reco_energy_tel'
-    - 'log10_hillas_intensity'
-    - 'hillas_width'
-    - 'hillas_length'
-    - 'h_max'
-    - 'impact_dist'
-
+    Basic: # single-named, they need to correspond to input data columns
+      - 'h_max' # Height of shower maximum from stereoscopic reconstruction
+      - 'impact_dist' # Impact parameter from stereoscopic reconstruction
+      - 'hillas_width' # Image Width
+      - 'hillas_length' # Image Length
+      # - 'concentration_pixel' # Percentage of photo-electrons in the brightest pixel
+    Derived: # custom evaluations of basic features that will be added to the data
+      # column name : expression to evaluate using basic column names
+      log10_intensity: log10(hillas_intensity)
+      log10_reco_energy: log10(reco_energy) # Averaged-estimated energy of the shower
+      log10_reco_energy_tel: log10(reco_energy_tel) # Estimated energy of the shower per telescope
+
+  # These cuts select the input data BEFORE training
  SigFiducialCuts:
-    - 'good_image == 1'
-    - 'is_valid == True'
+    - 'good_image == 1'
+    - 'is_valid == True'
+    - 'hillas_intensity_reco > 0'
 
   BkgFiducialCuts:
     - 'good_image == 1'
     - 'is_valid == True'
+    - 'hillas_intensity_reco > 0'
 
   Diagnostic:
    # Energy binning (used for reco and true energy)
    energy:
     nbins: 4
-    min: 0.02
+    min: 0.0125
     max: 200
 
 We want to exploit parameters showing statistical differences in the shower
diff --git a/protopipe/aux/example_config_files/AdaBoostRegressor.yaml b/protopipe/aux/example_config_files/AdaBoostRegressor.yaml
new file mode 100644
index 00000000..5aee5f32
--- /dev/null
+++ b/protopipe/aux/example_config_files/AdaBoostRegressor.yaml
@@ -0,0 +1,83 @@
+General:
+  # [...] = your analysis local full path OUTSIDE the Vagrant box
+  data_dir: '../../data/'
+  data_sig_file: 'TRAINING_energy_tail_gamma_merged.h5'
+  outdir: './'
+
+  # List of cameras to use (you can override this from the CLI)
+  cam_id_list: ['LSTCam', 'NectarCam']
+
+# If train_fraction is 1, all the TRAINING dataset will be used to train the
+# model and benchmarking can only be done from the benchmarking notebook
+# TRAINING/benchmarks_DL2_to_classification.ipynb
+Split:
+  train_fraction: 0.8
+  use_same_number_of_sig_and_bkg_for_training: False # Lowest statistics will drive the split
+
+# Optimize the hyper-parameters of the estimator with a grid search
+# If True parameters should be provided as lists
+# If False the model used will be the one based on the chosen single-valued hyper-parameters
+GridSearchCV:
+  use: False # True or False
+  # if False the following two variables are irrelevant
+  scoring: 'explained_variance'
+  cv: 2
+
+Method:
+  name: 'sklearn.ensemble.AdaBoostRegressor'
+  target_name: 'true_energy'
+  # Please, see scikit-learn's API for what each parameter means
+  # NOTE: null == None
+  base_estimator:
+    name: 'sklearn.tree.DecisionTreeRegressor'
+    parameters:
+      # NOTE: here we set the parameters relevant for sklearn.tree.DecisionTreeRegressor
+      criterion: "mse" # "mse", "friedman_mse", "mae" or "poisson"
+      splitter: "best" # "best" or "random"
+      max_depth: null # null or integer
+      min_samples_split: 2 # integer or float
+      min_samples_leaf: 1 # int or float
+      min_weight_fraction_leaf: 0.0 # float
+      max_features: null # null, "auto", "sqrt", "log2", int or float
+      max_leaf_nodes: null # null or integer
+      min_impurity_decrease: 0.0 # float
+      random_state: 0 # null or integer or RandomState
+      ccp_alpha: 0.0 # non-negative float
+  tuned_parameters:
+    n_estimators: 50
+    learning_rate: 1
+    loss: 'linear' # 'linear', 'square' or 'exponential'
+    random_state: 0 # int, RandomState instance or None
+
+# List of the features to use to train the model
+# You can:
+# - comment/uncomment the ones you see here,
+# - add new ones here if they can be evaluated with pandas.DataFrame.eval
+# - if not you can propose modifications to protopipe.mva.utils.prepare_data
+FeatureList:
+  Basic: # single-named, they need to correspond to input data columns
+    - 'h_max' # Height of shower maximum from stereoscopic reconstruction
+    - 'impact_dist' # Impact parameter from stereoscopic reconstruction
+    - 'hillas_width' # Image Width
+    - 'hillas_length' # Image Length
+    # - 'concentration_pixel' # Percentage of photo-electrons in the brightest pixel
+    - 'leakage_intensity_width_1_reco' # fraction of total Intensity which is contained in the outermost pixels of the camera
+  Derived: # custom evaluations of basic features that will be added to the data
+    # column name : expression to evaluate using basic column names
+    log10_WLS: log10(hillas_width*hillas_length/hillas_intensity)
+    log10_intensity: log10(hillas_intensity)
+    r_origin: (sqrt((hillas_x - az)**2 + (hillas_y - alt)**2))**2
+    phi_origin: arctan2(hillas_y - alt, hillas_x - az)
+
+# These cuts select the input data BEFORE training
+SigFiducialCuts:
+  - 'good_image == 1'
+  - 'is_valid == True'
+  - 'hillas_intensity_reco > 0'
+
+Diagnostic:
+  # Energy binning (used for reco and true energy)
+  energy:
+    nbins: 15
+    min: 0.0125
+    max: 125
diff --git a/protopipe/aux/example_config_files/RandomForestClassifier.yaml b/protopipe/aux/example_config_files/RandomForestClassifier.yaml
new file mode 100644
index 00000000..2a4a8a3d
--- /dev/null
+++ b/protopipe/aux/example_config_files/RandomForestClassifier.yaml
@@ -0,0 +1,88 @@
+General:
+  # [...] = your analysis local full path OUTSIDE the Vagrant box
+  data_dir: '../../data/' # '[...]/data/TRAINING/for_particle_classification/'
+  data_sig_file: 'TRAINING_classification_tail_gamma_merged.h5'
+  data_bkg_file: 'TRAINING_classification_tail_proton_merged.h5'
+  outdir: './' # [...]/estimators/gamma_hadron_classifier
+
+  # List of cameras to use (protopipe-MODEL help output for other options)
+  cam_id_list: ['LSTCam', 'NectarCam']
+
+# If train_fraction is 1, all the TRAINING dataset will be used to train the
+# model and benchmarking can only be done from the benchmarking notebook
+# TRAINING/benchmarks_DL2_to_classification.ipynb
+Split:
+  train_fraction: 0.8
+  use_same_number_of_sig_and_bkg_for_training: False # Lowest statistics will drive the split
+
+# Optimize the hyper-parameters of the estimator with a grid search
+# If True parameters should be provided as lists (for None use [null])
+# If False the model used will be the one based on the chosen single-valued hyper-parameters
+GridSearchCV:
+  use: False # True or False
+  # if False the following two variables are irrelevant
+  scoring: 'roc_auc'
+  cv: 2
+
+# Definition of the algorithm/method used and its hyper-parameters
+Method:
+  name: 'sklearn.ensemble.RandomForestClassifier' # DO NOT CHANGE
+  target_name: 'label' # defined between 0 and 1 (DO NOT CHANGE)
+  tuned_parameters:
+    # Please, see scikit-learn's API for what each parameter means
+    # WARNING: null (not a string) == 'None'
+    n_estimators: 100 # integer
+    criterion: 'gini' # 'gini' or 'entropy'
+    max_depth: null # null or integer
+    min_samples_split: 2 # integer or float
+    min_samples_leaf: 1 # integer or float
+    min_weight_fraction_leaf: 0.0 # float
+    max_features: 3 # 'auto', 'sqrt', 'log2', integer or float
+    max_leaf_nodes: null # null or integer
+    min_impurity_decrease: 0.0 # float
+    bootstrap: False # True or False
+    oob_score: False # True or False
+    n_jobs: null # null or integer
+    random_state: 0 # null or integer or RandomState
+    verbose: 0 # integer
+    warm_start: False # True or False
+    class_weight: null # 'balanced', 'balanced_subsample', null, dict or list of dicts
+    ccp_alpha: 0.0 # non-negative float
+    max_samples: null # null, integer or float
+  calibrate_output: False # If True calibrate model on test data
+
+# List of the features to use to train the model
+# You can:
+# - comment/uncomment the ones you see here,
+# - add new ones here if they can be evaluated with pandas.DataFrame.eval
+# - if not you can propose modifications to protopipe.mva.utils.prepare_data
+FeatureList:
+  Basic: # single-named, they need to correspond to input data columns
+    - 'h_max' # Height of shower maximum from stereoscopic reconstruction
+    - 'impact_dist' # Impact parameter from stereoscopic reconstruction
+    - 'hillas_width' # Image Width
+    - 'hillas_length' # Image Length
+    # - 'concentration_pixel' # Percentage of photo-electrons in the brightest pixel
+  Derived: # custom evaluations of basic features that will be added to the data
+    # column name : expression to evaluate using basic column names
+    log10_intensity: log10(hillas_intensity)
+    log10_reco_energy: log10(reco_energy) # Averaged-estimated energy of the shower
+    log10_reco_energy_tel: log10(reco_energy_tel) # Estimated energy of the shower per telescope
+
+# These cuts select the input data BEFORE training
+SigFiducialCuts:
+  - 'good_image == 1'
+  - 'is_valid == True'
+  - 'hillas_intensity_reco > 0'
+
+BkgFiducialCuts:
+  - 'good_image == 1'
+  - 'is_valid == True'
+  - 'hillas_intensity_reco > 0'
+
+Diagnostic:
+  # Energy binning (used for reco and true energy)
+  energy:
+    nbins: 4
+    min: 0.0125
+    max: 200
diff --git a/protopipe/aux/example_config_files/RandomForestRegressor.yaml b/protopipe/aux/example_config_files/RandomForestRegressor.yaml
new file mode 100644
index 00000000..069eab29
--- /dev/null
+++ b/protopipe/aux/example_config_files/RandomForestRegressor.yaml
@@ -0,0 +1,83 @@
+General:
+  # [...] = your analysis local full path OUTSIDE the Vagrant box
+  data_dir: '../../data/' # '[...]/data/TRAINING/for_energy_estimation/'
+  data_sig_file: 'TRAINING_energy_tail_gamma_merged.h5'
+  outdir: './' # '[...]/estimators/energy_regressor'
+
+  # List of cameras to use (protopipe-MODEL help output for other options)
+  cam_id_list: ['LSTCam', 'NectarCam']
+
+# If train_fraction is 1, all the TRAINING dataset will be used to train the
+# model and benchmarking can only be done from the benchmarking notebook
+# TRAINING/benchmarks_DL2_to_classification.ipynb
+Split:
+  train_fraction: 0.8
+  use_same_number_of_sig_and_bkg_for_training: False # Lowest statistics will drive the split
+
+# Optimize the hyper-parameters of the estimator with a grid search
+# If True parameters should be provided as lists
+# If False the model used will be the one based on the chosen single-valued hyper-parameters
+GridSearchCV:
+  use: False # True or False
+  # if False the following two variables are irrelevant
+  scoring: 'explained_variance'
+  cv: 2
+
+# Definition of the model algorithm/method used and its hyper-parameters
+Method:
+  name: 'sklearn.ensemble.RandomForestRegressor' # DO NOT CHANGE
+  target_name: 'log10_true_energy'
+  tuned_parameters:
+    # Please, see scikit-learn's API for what each parameter means
+    # NOTE: null == None
+    n_estimators: 50 # integer
+    criterion: "mse" # "mse" or "mae"
+    max_depth: null # null or integer
+    min_samples_split: 5 # integer
+    min_samples_leaf: 5 # integer
+    min_weight_fraction_leaf: 0.0 # float
+    max_features: 3 # {"auto", "sqrt", "log2"}, int or float
+    max_leaf_nodes: null # null or integer
+    min_impurity_decrease: 0.0 # float
+    bootstrap: False # True or False
+    oob_score: False # True or False
+    n_jobs: null # null or integer
+    random_state: 0 # null or integer or RandomState
+    verbose: 0 # integer
+    warm_start: False # True or False
+    ccp_alpha: 0.0 # non-negative float
+    max_samples: null # null, integer or float
+
+# List of the features to use to train the model
+# You can:
+# - comment/uncomment the ones you see here,
+# - add new ones here if they can be evaluated with pandas.DataFrame.eval
+# - if not you can propose modifications to protopipe.mva.utils.prepare_data
+FeatureList:
+  Basic: # single-named, they need to correspond to input data columns
+    - 'h_max' # Height of shower maximum from stereoscopic reconstruction
+    - 'impact_dist' # Impact parameter from stereoscopic reconstruction
+    - 'hillas_width' # Image Width
+    - 'hillas_length' # Image Length
+    # - 'concentration_pixel' # Percentage of photo-electrons in the brightest pixel
+    - 'leakage_intensity_width_1_reco' # fraction of total Intensity which is contained in the outermost pixels of the camera
+  Derived: # custom evaluations of basic features that will be added to the data
+    # column name : expression to evaluate using basic column names
+    log10_WLS: log10(hillas_width*hillas_length/hillas_intensity)
+    log10_intensity: log10(hillas_intensity)
+    r_origin: (sqrt((hillas_x - az)**2 + (hillas_y - alt)**2))**2
+    phi_origin: arctan2(hillas_y - alt, hillas_x - az)
+
+# These cuts select the input data BEFORE training
+SigFiducialCuts:
+  - 'good_image == 1'
+  - 'is_valid == True'
+  - 'hillas_intensity_reco > 0'
+
+# Information used by the benchmarking notebook related to this model
+Diagnostic:
+  # Energy binning (used for reco and true energy)
+  energy:
+    nbins: 15
+    min: 0.0125
+    max: 125
diff --git a/protopipe/aux/example_config_files/classifier.yaml b/protopipe/aux/example_config_files/classifier.yaml
deleted file mode 100644
index 39013a9a..00000000
--- a/protopipe/aux/example_config_files/classifier.yaml
+++ /dev/null
@@ -1,51 +0,0 @@
-General:
-  model_type: 'classifier'
-  # [...] = your analysis local full path OUTSIDE the Vagrant box
-  data_dir: '[...]/shared_folder/analyses/v0.4.0_dev1/data/TRAINING/for_particle_classification/'
-  data_sig_file: 'TRAINING_classification_tail_gamma_merged.h5'
-  data_bkg_file: 'TRAINING_classification_tail_proton_merged.h5'
-  cam_id_list: ['LSTCam', 'NectarCam']
-  table_name_template: '' # leave empty (TO BE REMOVED)
-  outdir: '[...]/shared_folder/analyses/v0.4.0_dev1/estimators/gamma_hadron_classifier'
-
-Split:
-  train_fraction: 0.8
-  use_same_number_of_sig_and_bkg_for_training: False # Lowest statistics will drive the split
-
-Method:
-  name: 'RandomForestClassifier' # AdaBoostClassifier or RandomForestClassifier
-  target_name: 'label'
-  tuned_parameters: # these are lists of values used by the GridSearchCV algorithm
-    n_estimators: [200]
-    max_depth: [10] # null for None
-    max_features: [3] # possible choices are “auto”, “sqrt”, “log2”, int or float
-    min_samples_split: [10]
-    min_samples_leaf: [10]
-  scoring: 'roc_auc' # possible choices are 'roc_auc', 'explained_variance'
-  cv: 2
-  use_proba: True # If not output is score
-  calibrate_output: False # If true calibrate probability
-
-FeatureList:
-  - 'log10_reco_energy'
-  - 'log10_reco_energy_tel'
-  - 'log10_hillas_intensity'
-  - 'hillas_width'
-  - 'hillas_length'
-  - 'h_max'
-  - 'impact_dist'
-
-SigFiducialCuts:
-  - 'good_image == 1'
-  - 'is_valid == True'
-
-BkgFiducialCuts:
-  - 'good_image == 1'
-  - 'is_valid == True'
-
-Diagnostic:
-  # Energy binning (used for reco and true energy)
-  energy:
-    nbins: 4
-    min: 0.02
-    max: 200
diff --git a/protopipe/aux/example_config_files/regressor.yaml b/protopipe/aux/example_config_files/regressor.yaml
deleted file mode 100644
index 392bda52..00000000
--- a/protopipe/aux/example_config_files/regressor.yaml
+++ /dev/null
@@ -1,41 +0,0 @@
-General:
-  model_type: 'regressor'
-  # [...] = your analysis local full path OUTSIDE the Vagrant box
-  data_dir: '[...]/shared_folder/analyses/v0.4.0_dev1/data/TRAINING/for_energy_estimation'
-  data_file: 'TRAINING_energy_tail_gamma_merged.h5'
-  outdir: '[...]/shared_folder/analyses/v0.4.0_dev1/estimators/energy_regressor'
-  cam_id_list: ['LSTCam', 'NectarCam']
-  table_name_template: '' # leave empty (TO BE REMOVED)
-
-Split:
-  train_fraction: 0.8
-
-Method:
-  name: 'AdaBoostRegressor'
-  target_name: 'true_energy'
-  tuned_parameters:
-    learning_rate: [0.3]
-    n_estimators: [100]
-    base_estimator__max_depth: [null] # null is equivalent to None
-    base_estimator__min_samples_split: [2]
-    base_estimator__min_samples_leaf: [10]
-  scoring: 'explained_variance'
-  cv: 2
-
-FeatureList:
-  - 'log10_hillas_intensity'
-  - 'log10_impact_dist'
-  - 'hillas_width_reco'
-  - 'hillas_length_reco'
-  - 'h_max'
-
-SigFiducialCuts:
-  - 'good_image == 1'
-  - 'is_valid == True'
-
-Diagnostic:
-  # Energy binning (used for reco and true energy)
-  energy:
-    nbins: 15
-    min: 0.0125
-    max: 125
diff --git a/protopipe/mva/__init__.py b/protopipe/mva/__init__.py
index fc180b8f..cf9c422c 100644
--- a/protopipe/mva/__init__.py
+++ b/protopipe/mva/__init__.py
@@ -1,6 +1,7 @@
 """
-Classes to buil models based on machine learning methods.
+Classes to build models based on machine learning methods.
 """
 from .train_model import *
 from .diagnostic import *
 from .utils import *
+from .io import *
diff --git a/protopipe/mva/io.py b/protopipe/mva/io.py
new file mode 100644
index 00000000..963d053a
--- /dev/null
+++ b/protopipe/mva/io.py
@@ -0,0 +1,147 @@
+"""Input/output functions for the initialization and storage of models."""
+
+import argparse
+import joblib
+from os import path
+
+from protopipe.mva.utils import save_obj
+
+
+def initialize_script_arguments():
+    """Initialize the parser of protopipe.scripts.build_model.
+
+    Returns
+    -------
+    args : argparse.Namespace
+        Populated argparse namespace.
+ """ + + parser = argparse.ArgumentParser( + description="Build model for regression/classification" + ) + parser.add_argument("--config_file", type=str, required=True) + + parser.add_argument( + "--max_events", + type=int, + default=None, + help="maximum number of events to use", + ) + + mode_group = parser.add_mutually_exclusive_group() + mode_group.add_argument( + "--wave", + dest="mode", + action="store_const", + const="wave", + default="tail", + help="if set, use wavelet cleaning", + ) + mode_group.add_argument( + "--tail", + dest="mode", + action="store_const", + const="tail", + help="if set, use tail cleaning (default), otherwise wavelets", + ) + + # These last CL arguments can overwrite the values from the config + + group = parser.add_mutually_exclusive_group(required=True) + group.add_argument('--cameras_from_config', + action='store_true', + help="Get cameras configuration file (Priority 1)",) + group.add_argument('--cameras_from_file', + action='store_true', + help="Get cameras from input file (Priority 2)",) + group.add_argument('--cam_id_list', + type=str, + default=None, + help="Select cameras like 'LSTCam CHEC' (Priority 3)",) + + parser.add_argument( + "-i", + "--indir", + type=str, + default=None, + help="Directory containing the required input file(s)" + ) + parser.add_argument( + "--infile_signal", + type=str, + default=None, + help="SIGNAL file (default: read from config file)", + ) + parser.add_argument( + "--infile_background", + type=str, + default=None, + help="BACKGROUND file (default: read from config file)", + ) + parser.add_argument("-o", "--outdir", type=str, default=None) + + args = parser.parse_args() + + return args + + +def save_output(models, + cam_id, + factory, + best_model, + model_types, + method_name, + outdir): + """Save model and data used to produce it per camera-type. + + Parameters + ---------- + models: dict + Dictionary of models with camera names as keys. + cam_id: str + Name of the analyzed camera. + factory: protopipe.mva.TrainModel + Wrapper around trained model containing references to train/test samples. + best_model: + Fit of the model from factory. + model_types: dict + Dictionary that maps type of model to method name. + method_name: str + Name of the scikit-learn model. + outdir: str + Path to output directory where to save the trained model and train/test samples. 
+ """ + + models[cam_id] = best_model + model_type = [k for k, v in model_types.items() if method_name in v][0] + outname = "{}_{}_{}.pkl.gz".format( + model_type, cam_id, method_name + ) + joblib.dump(best_model, path.join(outdir, outname)) + + # SAVE DATA + save_obj( + factory.data_scikit, + path.join( + outdir, + "data_scikit_{}_{}_{}.pkl.gz".format( + model_type, method_name, cam_id + ), + ), + ) + factory.data_train.to_pickle( + path.join( + outdir, + "data_train_{}_{}_{}.pkl.gz".format( + model_type, method_name, cam_id + ), + ) + ) + factory.data_test.to_pickle( + path.join( + outdir, + "data_test_{}_{}_{}.pkl.gz".format( + model_type, method_name, cam_id + ), + ) + ) diff --git a/protopipe/mva/train_model.py b/protopipe/mva/train_model.py index ca536dee..fd25c2b0 100644 --- a/protopipe/mva/train_model.py +++ b/protopipe/mva/train_model.py @@ -48,7 +48,7 @@ def split_data( to build a classifier """ - if self.case in "regressor": + if self.case == "regressor": ( X_train, X_test, @@ -65,7 +65,7 @@ def split_data( weight = np.ones(len(self.data_train)) weight_train = weight / sum(weight) - elif self.case in "classifier": + else: ( X_train_sig, X_test_sig, diff --git a/protopipe/mva/utils.py b/protopipe/mva/utils.py index f7171e53..bd20ef93 100644 --- a/protopipe/mva/utils.py +++ b/protopipe/mva/utils.py @@ -18,21 +18,48 @@ def load_obj(name): return pickle.load(f) -def prepare_data(ds, cuts, label=None): - """Add variables in data frame""" - ds["log10_hillas_intensity"] = np.log10( - ds["hillas_intensity_reco"] - ) # THIS SHOULDN'T BE HARDCODED!!! - ds["log10_impact_dist"] = np.log10(ds["impact_dist"]) +def prepare_data(ds, derived_features, cuts, select_data=True, label=None): + """Add custom variables to the input data and optionally select it. + + Parameters + ---------- + ds : pandas.DataFrame + Input data not yet selected. + derived_features: dict + Dictionary of more complex featuresread from the configuration file. + cuts: str + Fiducial cuts from protopipe.mva.utils.make_cut_list + select_data: bool + If True apply cuts to the final dataframe. + label: str + Name of the classifier target label if any. + + Returns + ------- + ds : pandas.DataFrame + Input data integrated with new variables and optionally selected for + the fiducial cuts. + """ + + # This is always useful ds["log10_true_energy"] = np.log10(ds["true_energy"]) - try: # for classification - ds["log10_reco_energy"] = np.log10(ds["reco_energy"]) - ds["log10_reco_energy_tel"] = np.log10(ds["reco_energy_tel"]) + + if label is not None: # only for classification ds["label"] = np.full(len(ds), label) - except: - pass - ds = ds.query(cuts) + # This is needed because our reference analysis uses energy as + # feature for classification + # We should propably support a more elastic choice in the future. 
+        if self.case == "regressor":
             (
                 X_train,
                 X_test,
@@ -65,7 +65,7 @@ def split_data(
             weight = np.ones(len(self.data_train))
             weight_train = weight / sum(weight)
 
-        elif self.case in "classifier":
+        else:
             (
                 X_train_sig,
                 X_test_sig,
diff --git a/protopipe/mva/utils.py b/protopipe/mva/utils.py
index f7171e53..bd20ef93 100644
--- a/protopipe/mva/utils.py
+++ b/protopipe/mva/utils.py
@@ -18,21 +18,48 @@ def load_obj(name):
         return pickle.load(f)
 
 
-def prepare_data(ds, cuts, label=None):
-    """Add variables in data frame"""
-    ds["log10_hillas_intensity"] = np.log10(
-        ds["hillas_intensity_reco"]
-    )  # THIS SHOULDN'T BE HARDCODED!!!
-    ds["log10_impact_dist"] = np.log10(ds["impact_dist"])
+def prepare_data(ds, derived_features, cuts, select_data=True, label=None):
+    """Add custom variables to the input data and optionally select it.
+
+    Parameters
+    ----------
+    ds : pandas.DataFrame
+        Input data not yet selected.
+    derived_features : dict
+        Dictionary of more complex features read from the configuration file.
+    cuts : str
+        Fiducial cuts from protopipe.mva.utils.make_cut_list
+    select_data : bool
+        If True apply cuts to the final dataframe.
+    label : int
+        Value of the classifier target label, if any
+        (1 for signal, 0 for background).
+
+    Returns
+    -------
+    ds : pandas.DataFrame
+        Input data extended with the new variables and optionally filtered by
+        the fiducial cuts.
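+
+    Examples
+    --------
+    Illustrative call for a classification dataset (the column names follow
+    the example configuration files):
+
+    >>> ds = prepare_data(ds,
+    ...                   derived_features={
+    ...                       "log10_reco_energy": "log10(reco_energy)",
+    ...                       "log10_reco_energy_tel": "log10(reco_energy_tel)",
+    ...                   },
+    ...                   cuts="(good_image == 1) and (is_valid == True)",
+    ...                   select_data=True,
+    ...                   label=1)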
+    """
+
+    # This is always useful
     ds["log10_true_energy"] = np.log10(ds["true_energy"])
-    try:  # for classification
-        ds["log10_reco_energy"] = np.log10(ds["reco_energy"])
-        ds["log10_reco_energy_tel"] = np.log10(ds["reco_energy_tel"])
+
+    if label is not None:  # only for classification
         ds["label"] = np.full(len(ds), label)
-    except:
-        pass
-    ds = ds.query(cuts)
+        # This is needed because our reference analysis uses energy as a
+        # feature for classification.
+        # We should probably support a more flexible choice in the future.
+        if not all(i in derived_features for i in ["log10_reco_energy", "log10_reco_energy_tel"]):
+            raise ValueError('log10_reco_energy and log10_reco_energy_tel need to be model features.')
+
+    # Compute derived features and add them to the dataframe
+    for feature_name, feature_expression in derived_features.items():
+        ds.eval(f'{feature_name} = {feature_expression}',
+                inplace=True)
+
+    if select_data:
+        ds = ds.query(cuts)
 
     return ds
diff --git a/protopipe/scripts/build_model.py b/protopipe/scripts/build_model.py
index 49659715..30625625 100755
--- a/protopipe/scripts/build_model.py
+++ b/protopipe/scripts/build_model.py
@@ -1,97 +1,32 @@
 #!/usr/bin/env python
 import os
-import pandas as pd
-import argparse
 from os import path
 
-from sklearn.ensemble import (
-    AdaBoostRegressor,
-    AdaBoostClassifier,
-    RandomForestClassifier,
-)
-from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
-import joblib
+import importlib
+
+import pandas as pd
+
 from sklearn.metrics import classification_report
 from sklearn.calibration import CalibratedClassifierCV
 
 from protopipe.pipeline.utils import load_config, get_camera_names
-
 from protopipe.mva import TrainModel
-from protopipe.mva.utils import make_cut_list, prepare_data, save_obj
+from protopipe.mva.io import initialize_script_arguments, save_output
+from protopipe.mva.utils import (
+    make_cut_list,
+    prepare_data
+)
 
 
 def main():
 
-    # Read arguments
-    parser = argparse.ArgumentParser(
-        description="Build model for regression/classification"
-    )
-    parser.add_argument("--config_file", type=str, required=True)
-    parser.add_argument(
-        "--max_events",
-        type=int,
-        default=None,
-        help="maximum number of events for training",
-    )
-    mode_group = parser.add_mutually_exclusive_group()
-    mode_group.add_argument(
-        "--wave",
-        dest="mode",
-        action="store_const",
-        const="wave",
-        default="tail",
-        help="if set, use wavelet cleaning",
-    )
-    mode_group.add_argument(
-        "--tail",
-        dest="mode",
-        action="store_const",
-        const="tail",
-        help="if set, use tail cleaning, otherwise wavelets",
-    )
-
-    # These last CL arguments can overwrite the values from the config
-
-    group = parser.add_mutually_exclusive_group(required=True)
-    group.add_argument('--cameras_from_config',
-                       action='store_true',
-                       help="Get cameras configuration file (Priority 1)",)
-    group.add_argument('--cameras_from_file',
-                       action='store_true',
-                       help="Get cameras from input file (Priority 2)",)
-    group.add_argument('--cam_id_list',
-                       type=str,
-                       default=None,
-                       help="Select cameras like 'LSTCam CHEC' (Priority 3)",)
-
-    parser.add_argument(
-        "-i",
-        "--indir",
-        type=str,
-        default=None,
-        help="Directory containing the required input file(s)"
-    )
-    parser.add_argument(
-        "--infile_signal",
-        type=str,
-        default=None,
-        help="SIGNAL file (default: read from config file)",
-    )
-    parser.add_argument(
-        "--infile_background",
-        type=str,
-        default=None,
-        help="BACKGROUND file (default: read from config file)",
-    )
-    parser.add_argument("-o", "--outdir", type=str, default=None)
-
-    args = parser.parse_args()
-
-    # Read configuration file
+    # INITIALIZE CLI arguments
+    args = initialize_script_arguments()
+
+    # LOAD CONFIGURATION FILE
     cfg = load_config(args.config_file)
 
-    # Type of model (regressor or classifier)
-    model_type = cfg["General"]["model_type"]
+    # INPUT CONFIGURATION
 
     # Import parameters
     if args.indir is None:
@@ -106,52 +41,92 @@ def main():
     if not os.path.exists(outdir):
         os.makedirs(outdir)
 
-    table_name_template = cfg["General"]["table_name_template"]
-
-    # List of features
-    feature_list = cfg["FeatureList"]
-
-    # Optimisation parameters
-    method_name = cfg["Method"]["name"]
-    tuned_parameters = [cfg["Method"]["tuned_parameters"]]
-    scoring = cfg["Method"]["scoring"]
-    cv = cfg["Method"]["cv"]
+    # Get file containing gammas (signal)
+    if args.infile_signal is None:
+        data_sig_file = cfg["General"]["data_sig_file"].format(args.mode)
+    else:
+        data_sig_file = args.infile_signal
 
-    # Split fraction
-    train_fraction = cfg["Split"]["train_fraction"]
+    filename_sig = path.join(data_dir, data_sig_file)
 
-    if model_type in "regressor":
+    print(f"INPUT SIGNAL FILE PATH= {filename_sig}")
 
-        if args.infile_signal is None:
-            data_file = cfg["General"]["data_file"].format(args.mode)
-        else:
-            data_file = args.infile_signal
+    # Cameras to use
+    if args.cameras_from_config:
+        print("GETTING CAMERAS FROM CONFIGURATION FILE")
+        cam_ids = cfg["General"]["cam_id_list"]
+    elif args.cameras_from_file:
+        print("GETTING CAMERAS FROM SIGNAL TRAINING FILE")
+        # in the same analysis all particle types are analyzed in the
+        # same way so we can just use gammas
+        cam_ids = get_camera_names(filename_sig)
+    else:
+        print("GETTING CAMERAS FROM CLI")
+        cam_ids = args.cam_id_list.split()
 
-        filename = path.join(data_dir, data_file)
+    # The names of the tables inside the HDF5 file are the cameras' names
+    table_name = [cam_id for cam_id in cam_ids]
 
+    # Dataset split train-test fraction
+    train_fraction = cfg["Split"]["train_fraction"]
+
+    # Name of target quantity
+    target_name = cfg["Method"]["target_name"]
+
+    # Get list of features
+    features_basic = cfg["FeatureList"]["Basic"]
+    features_derived = cfg["FeatureList"]["Derived"]
+    feature_list = features_basic + list(features_derived)
+    print("Going to use the following features to train the model:")
+    print(feature_list)
+    # sort features_to_use alphabetically to ensure order
+    # preservation with model.predict in protopipe.scripts
+    feature_list = sorted(feature_list)
+
+    # GridSearchCV
+    use_GridSearchCV = cfg["GridSearchCV"]["use"]
+    scoring = cfg["GridSearchCV"]["scoring"]
+    cv = cfg["GridSearchCV"]["cv"]
+
+    # Hyper-parameters of the main model
+    tuned_parameters = cfg["Method"]["tuned_parameters"]
+
+    # Initialize the model dynamically
+
+    # There is always at least one (main) model to initialize
+    model_to_use = cfg['Method']['name']
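+    # e.g. 'sklearn.ensemble.AdaBoostRegressor' gives
+    # module_name = 'sklearn.ensemble' and class_name = 'AdaBoostRegressor'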
cam_ids = cfg["General"]["cam_id_list"] - elif args.cameras_from_file: - cam_ids = get_camera_names(filename) - else: - cam_ids = args.cam_id_list.split() + # Map model types to the models supported by the script + model_types = {"regressor": ["RandomForestRegressor", + "AdaBoostRegressor"], + "classifier": ["RandomForestClassifier"]} - table_name = [table_name_template + cam_id for cam_id in cam_ids] + if class_name in model_types["regressor"]: - # List of cuts + # Get the selection cuts cuts = make_cut_list(cfg["SigFiducialCuts"]) - init_model = AdaBoostRegressor(DecisionTreeRegressor(max_depth=None)) - - # Name of target - target_name = cfg["Method"]["target_name"] - elif model_type in "classifier": - - # read signal file from either config file or CLI - if args.infile_signal is None: - data_sig_file = cfg["General"]["data_sig_file"].format(args.mode) - else: - data_sig_file = args.infile_signal + elif class_name in model_types["classifier"]: # read background file from either config file or CLI if args.infile_background is None: @@ -159,50 +134,23 @@ def main(): else: data_bkg_file = args.infile_background - filename_sig = path.join(data_dir, data_sig_file) + # filename_sig = path.join(data_dir, data_sig_file) filename_bkg = path.join(data_dir, data_bkg_file) - if args.cameras_from_config: - print("TAKING CAMERAS FROM CONFIG") - cam_ids = cfg["General"]["cam_id_list"] - elif args.cameras_from_file: - print("TAKING CAMERAS FROM TRAINING FILE") - # in the same analysis all particle types are analyzed in the - # same way so we can just use gammas - cam_ids = get_camera_names(filename_sig) - else: - print("TAKING CAMERAS FROM CLI") - cam_ids = args.cam_id_lists.split() - - table_name = [table_name_template + cam_id for cam_id in cam_ids] + # table_name = [table_name_template + cam_id for cam_id in cam_ids] - # List of cuts + # Get the selection cuts sig_cuts = make_cut_list(cfg["SigFiducialCuts"]) bkg_cuts = make_cut_list(cfg["BkgFiducialCuts"]) - # Model - if method_name in "AdaBoostClassifier": - init_model = AdaBoostClassifier(DecisionTreeClassifier(max_depth=4)) - elif method_name in "RandomForestClassifier": - init_model = RandomForestClassifier( - n_estimators=500, - max_depth=None, - min_samples_split=0.05, - max_features="sqrt", - bootstrap=True, - random_state=None, - criterion="gini", - class_weight="balanced_subsample", # Tree-wise re-weighting - ) - - # Name of target - target_name = cfg["Method"]["target_name"] - use_same_number_of_sig_and_bkg_for_training = cfg["Split"][ "use_same_number_of_sig_and_bkg_for_training" ] - print("### Using {} for model construction".format(method_name)) + else: + raise ValueError("ERROR: not a supported model") + + print("### Using {} for model construction".format(model_to_use)) print(f"LIST OF CAMERAS TO USE = {cam_ids}") @@ -211,43 +159,65 @@ def main(): print("### Building model for {}".format(cam_id)) - if model_type in "regressor": + if class_name in model_types["regressor"]: + # Load data - data = pd.read_hdf(filename, table_name[idx], mode="r") - data = prepare_data(ds=data, cuts=cuts)[0:args.max_events] + data_sig = pd.read_hdf(filename_sig, table_name[idx], mode="r") + # Add any derived feature and apply fiducial cuts + data_sig = prepare_data(ds=data_sig, + derived_features=features_derived, + select_data=True, + cuts=cuts) + + if args.max_events: + data_sig = data_sig[0:args.max_events] - print(f"Going to split {len(data)} SIGNAL images...") + print(f"Going to split {len(data_sig)} SIGNAL images...") - # Init model factory + # 
diff --git a/protopipe/scripts/data_training.py b/protopipe/scripts/data_training.py
index f92388b8..66a63c43 100755
--- a/protopipe/scripts/data_training.py
+++ b/protopipe/scripts/data_training.py
@@ -8,6 +8,7 @@
 from glob import glob
 import signal
 import tables as tb
+import pandas as pd
 
 from ctapipe.utils.CutFlow import CutFlow
 from ctapipe.io import EventSource
@@ -47,6 +48,13 @@ def main():
     parser.add_argument(
         "--regressor_dir", type=str, default="./", help="regressors directory"
     )
+    parser.add_argument(
+        "--regressor_config",
+        type=str,
+        default=None,
+        help="Configuration file used to produce regressor model"
+    )
+
     args = parser.parse_args()
 
     # Read configuration file
@@ -104,8 +112,12 @@ def main():
 
     # wrapper for the scikit-learn regressor
     if args.estimate_energy is True:
+
+        # Read configuration file
+        regressor_config = load_config(args.regressor_config)
+
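+        # The "{}" placeholders are filled per camera below, giving
+        # e.g. "regressor_LSTCam_RandomForestRegressor.pkl.gz"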
         regressor_files = (
-            args.regressor_dir + "/regressor_{mode}_{cam_id}_{regressor}.pkl.gz"
+            args.regressor_dir + "/regressor_{cam_id}_{regressor}.pkl.gz"
         )
         reg_file = regressor_files.format(
             **{
@@ -294,17 +306,60 @@ def main():
                     moments = hillas_dict[tel_id]
                     model = regressors[cam_id]
 
-                    features_img = np.array(
-                        [
-                            np.log10(moments.intensity),
-                            np.log10(impact_dict[tel_id].value),
-                            moments.width.value,
-                            moments.length.value,
-                            h_max.value,
-                        ]
-                    )
-
-                    energy_tel[idx] = model.predict([features_img])
+                    ############################################################
+                    # GET FEATURES
+                    ############################################################
+
+                    # Read the feature list from the model configuration file
+                    features_basic = regressor_config["FeatureList"]["Basic"]
+                    features_derived = regressor_config["FeatureList"]["Derived"]
+                    features = features_basic + list(features_derived)
+
+                    # Create a pandas Dataframe with basic quantities
+                    # This is needed in order to connect the I/O system of the
+                    # model inputs to the in-memory computation of this script
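+                    # NOTE: this dictionary has to provide every column that a
+                    # model configuration file can list under "Basic" features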
+                    data = pd.DataFrame({
+                        "hillas_intensity": [moments.intensity],
+                        "hillas_width": [moments.width.to("deg").value],
+                        "hillas_length": [moments.length.to("deg").value],
+                        "hillas_x": [moments.x.to("deg").value],
+                        "hillas_y": [moments.y.to("deg").value],
+                        "hillas_phi": [moments.phi.to("deg").value],
+                        "hillas_r": [moments.r.to("deg").value],
+                        "leakage_intensity_width_1_reco": [leakage_dict[tel_id]['leak1_reco']],
+                        "leakage_intensity_width_2_reco": [leakage_dict[tel_id]['leak2_reco']],
+                        "leakage_intensity_width_1": [leakage_dict[tel_id]['leak1']],
+                        "leakage_intensity_width_2": [leakage_dict[tel_id]['leak2']],
+                        "az": [reco_result.az.to("deg").value],
+                        "alt": [reco_result.alt.to("deg").value],
+                        "h_max": [h_max.value],
+                        "impact_dist": [impact_dict[tel_id].to("m").value],
+                    })
+
+                    # Compute derived features and add them to the dataframe
+                    for key, expression in features_derived.items():
+                        data.eval(f'{key} = {expression}', inplace=True)
+
+                    # sort features_to_use alphabetically to ensure order
+                    # preservation with model.fit in protopipe.mva
+                    features = sorted(features)
+
+                    # Select the values for the full set of features
+                    features_values = data[features].to_numpy()
+
+                    ############################################################
+
+                    energy_tel[idx] = model.predict(features_values)
                     weight_tel[idx] = moments.intensity
 
                     reco_energy_tel[tel_id] = energy_tel[idx]
diff --git a/protopipe/scripts/tests/test_AdaBoostRegressor.yaml b/protopipe/scripts/tests/test_AdaBoostRegressor.yaml
new file mode 100644
index 00000000..2941dbdb
--- /dev/null
+++ b/protopipe/scripts/tests/test_AdaBoostRegressor.yaml
@@ -0,0 +1,85 @@
+General:
+  # [...] = your analysis local full path OUTSIDE the Vagrant box
+  # NOTE: not used here since the testing suite needs to work from the CLI
+  data_dir: '../../data/'
+  data_sig_file: 'TRAINING_energy_tail_gamma_merged.h5'
+  outdir: './'
+
+  # List of cameras to use (you can override this from the CLI)
+  # NOTE: not used here since the testing suite needs to work from the CLI
+  cam_id_list: ['LSTCam', 'NectarCam']
+
+# If train_fraction is 1, all the TRAINING dataset will be used to train the
+# model and benchmarking can only be done from the benchmarking notebook
+# TRAINING/benchmarks_DL2_to_classification.ipynb
+Split:
+  train_fraction: 0.8
+  use_same_number_of_sig_and_bkg_for_training: False # Lowest statistics will drive the split
+
+# Optimize the hyper-parameters of the estimator with a grid search
+# If True parameters should be provided as lists
+# If False the model used will be the one based on the chosen single-valued hyper-parameters
+GridSearchCV:
+  use: False # True or False
+  # if False the following two variables are irrelevant
+  scoring: 'explained_variance'
+  cv: 2
+
+Method:
+  name: 'sklearn.ensemble.AdaBoostRegressor'
+  target_name: 'true_energy'
+  # Please, see scikit-learn's API for what each parameter means
+  # NOTE: null == None
+  base_estimator:
+    name: 'sklearn.tree.DecisionTreeRegressor'
+    parameters:
+      # NOTE: here we set the parameters relevant for sklearn.tree.DecisionTreeRegressor
+      criterion: "mse" # "mse", "friedman_mse", "mae" or "poisson"
+      splitter: "best" # "best" or "random"
+      max_depth: null # null or integer
+      min_samples_split: 2 # integer or float
+      min_samples_leaf: 1 # int or float
+      min_weight_fraction_leaf: 0.0 # float
+      max_features: null # null, "auto", "sqrt", "log2", int or float
+      max_leaf_nodes: null # null or integer
+      min_impurity_decrease: 0.0 # float
+      random_state: 0 # null or integer or RandomState
+      ccp_alpha: 0.0 # non-negative float
+  tuned_parameters:
+    n_estimators: 50
+    learning_rate: 1
+    loss: 'linear' # 'linear', 'square' or 'exponential'
+    random_state: 0 # int, RandomState instance or None
+
+# List of the features to use to train the model
+# You can:
+# - comment/uncomment the ones you see here,
+# - add new ones here if they can be evaluated with pandas.DataFrame.eval
+# - if not you can propose modifications to protopipe.mva.utils.prepare_data
+FeatureList:
+  Basic: # single-named, they need to correspond to input data columns
+    - 'h_max' # Height of shower maximum from stereoscopic reconstruction
+    - 'impact_dist' # Impact parameter from stereoscopic reconstruction
+    - 'hillas_width' # Image Width
+    - 'hillas_length' # Image Length
diff --git a/protopipe/scripts/tests/test_RandomForestClassifier.yaml b/protopipe/scripts/tests/test_RandomForestClassifier.yaml
new file mode 100644
index 00000000..0f406e38
--- /dev/null
+++ b/protopipe/scripts/tests/test_RandomForestClassifier.yaml
@@ -0,0 +1,89 @@
+General:
+  # NOTE: not used here since the testing suite needs to work from the CLI
+  data_dir: '../../data/'
+  data_sig_file: 'TRAINING_classification_tail_gamma_merged.h5'
+  data_bkg_file: 'TRAINING_classification_tail_proton_merged.h5'
+  outdir: './'
+
+  # List of cameras to use (see the protopipe-MODEL help output for other options)
+  # NOTE: not used here since the testing suite needs to work from the CLI
+  cam_id_list: ['LSTCam', 'NectarCam']
+
+# If train_fraction is 1, all the TRAINING dataset will be used to train the
+# model and benchmarking can only be done from the benchmarking notebook
+# TRAINING/benchmarks_DL2_to_classification.ipynb
+Split:
+  train_fraction: 0.8
+  use_same_number_of_sig_and_bkg_for_training: False  # Lowest statistics will drive the split
+
+# Optimize the hyper-parameters of the estimator with a grid search
+# If True parameters should be provided as lists (for None use [null])
+# If False the model used will be the one based on the chosen single-valued hyper-parameters
+GridSearchCV:
+  use: False  # True or False
+  # if False the following two variables are irrelevant
+  scoring: 'roc_auc'
+  cv: 2
+
+# Definition of the algorithm/method used and its hyper-parameters
+Method:
+  name: 'sklearn.ensemble.RandomForestClassifier'  # DO NOT CHANGE
+  target_name: 'label'  # defined between 0 and 1 (DO NOT CHANGE)
+  tuned_parameters:
+    # Please, see scikit-learn's API for what each parameter means
+    # WARNING: null (not a string) == None
+    n_estimators: 100  # integer
+    criterion: 'gini'  # 'gini' or 'entropy'
+    max_depth: null  # null or integer
+    min_samples_split: 2  # integer or float
+    min_samples_leaf: 1  # integer or float
+    min_weight_fraction_leaf: 0.0  # float
+    max_features: 3  # 'auto', 'sqrt', 'log2', integer or float
+    max_leaf_nodes: null  # null or integer
+    min_impurity_decrease: 0.0  # float
+    bootstrap: False  # True or False
+    oob_score: False  # True or False
+    n_jobs: null  # null or integer
+    random_state: 0  # null or integer or RandomState
+    verbose: 0  # integer
+    warm_start: False  # True or False
+    class_weight: null  # 'balanced', 'balanced_subsample', null, dict or list of dicts
+    ccp_alpha: 0.0  # non-negative float
+    max_samples: null  # null, integer or float
+  calibrate_output: False  # If True calibrate model on test data
+
+# List of the features to use to train the model
+# You can:
+# - comment/uncomment the ones you see here,
+# - add new ones here if they can be evaluated with pandas.DataFrame.eval
+# - if not you can propose modifications to protopipe.mva.utils.prepare_data
+FeatureList:
+  Basic:  # single-named, they need to correspond to input data columns
+    - 'h_max'  # Height of shower maximum from stereoscopic reconstruction
+    - 'impact_dist'  # Impact parameter from stereoscopic reconstruction
+    - 'hillas_width'  # Image Width
+    - 'hillas_length'  # Image Length
+    # - 'concentration_pixel'  # Percentage of photo-electrons in the brightest pixel
+  Derived:  # custom evaluations of basic features that will be added to the data
+    # column name : expression to evaluate using basic column names
+    log10_intensity: log10(hillas_intensity)
+    log10_reco_energy: log10(reco_energy)  # Average estimated energy of the shower
+    log10_reco_energy_tel: log10(reco_energy_tel)  # Estimated energy of the shower per telescope
+
+# These cuts select the input data BEFORE training
+SigFiducialCuts:
+  - 'good_image == 1'
+  - 'is_valid == True'
+  - 'hillas_intensity_reco > 0'
+
+BkgFiducialCuts:
+  - 'good_image == 1'
+  - 'is_valid == True'
+  - 'hillas_intensity_reco > 0'
+
+Diagnostic:
+  # Energy binning (used for reco and true energy)
+  energy:
+    nbins: 4
+    min: 0.0125
+    max: 200
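When ``GridSearchCV.use`` is ``True``, the single-valued hyper-parameters above are given as lists instead, and training is delegated to scikit-learn's grid search with the ``scoring`` and ``cv`` values from this section. A sketch of the equivalent scikit-learn call (the parameter grid here is invented for illustration; protopipe builds it from the YAML lists):

.. code-block:: python

    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import GridSearchCV

    search = GridSearchCV(
        RandomForestClassifier(random_state=0),
        param_grid={"n_estimators": [100, 200], "max_features": [3, "sqrt"]},
        scoring="roc_auc",  # the 'scoring' key above
        cv=2,               # the 'cv' key above
    )
    # search.fit(X_train, y_train)
    # search.best_estimator_ is then the model that gets serialized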
diff --git a/protopipe/scripts/tests/test_RandomForestRegressor.yaml b/protopipe/scripts/tests/test_RandomForestRegressor.yaml
new file mode 100644
index 00000000..802aefee
--- /dev/null
+++ b/protopipe/scripts/tests/test_RandomForestRegressor.yaml
@@ -0,0 +1,84 @@
+General:
+  # NOTE: not used here since the testing suite needs to work from the CLI
+  data_dir: './'
+  data_sig_file: 'test_TRAINING_energy_{}_gamma_merged.h5'
+  outdir: './'
+
+  # List of cameras to use (see the protopipe-MODEL help output for other options)
+  # NOTE: not used here since the testing suite needs to work from the CLI
+  cam_id_list: ['LSTCam', 'NectarCam']
+
+# If train_fraction is 1, all the TRAINING dataset will be used to train the
+# model and benchmarking can only be done from the benchmarking notebook
+# TRAINING/benchmarks_DL2_to_classification.ipynb
+Split:
+  train_fraction: 0.8
+  use_same_number_of_sig_and_bkg_for_training: False  # Lowest statistics will drive the split
+
+# Optimize the hyper-parameters of the estimator with a grid search
+# If True parameters should be provided as lists
+# If False the model used will be the one based on the chosen single-valued hyper-parameters
+GridSearchCV:
+  use: False  # True or False
+  # if False the following two variables are irrelevant
+  scoring: 'explained_variance'
+  cv: 2
+
+# Definition of the model algorithm/method used and its hyper-parameters
+Method:
+  name: 'sklearn.ensemble.RandomForestRegressor'  # DO NOT CHANGE
+  target_name: 'log10_true_energy'
+  tuned_parameters:
+    # Please, see scikit-learn's API for what each parameter means
+    # NOTE: null == None
+    n_estimators: 50  # integer
+    criterion: "mse"  # "mse" or "mae"
+    max_depth: null  # null or integer
+    min_samples_split: 5  # integer
+    min_samples_leaf: 5  # integer
+    min_weight_fraction_leaf: 0.0  # float
+    max_features: 3  # "auto", "sqrt", "log2", int or float
+    max_leaf_nodes: null  # null or integer
+    min_impurity_decrease: 0.0  # float
+    bootstrap: False  # True or False
+    oob_score: False  # True or False
+    n_jobs: null  # null or integer
+    random_state: 0  # null or integer or RandomState
+    verbose: 0  # integer
+    warm_start: False  # True or False
+    ccp_alpha: 0.0  # non-negative float
+    max_samples: null  # null, integer or float
+
+# List of the features to use to train the model
+# You can:
+# - comment/uncomment the ones you see here,
+# - add new ones here if they can be evaluated with pandas.DataFrame.eval
+# - if not you can propose modifications to protopipe.mva.utils.prepare_data
+FeatureList:
+  Basic:  # single-named, they need to correspond to input data columns
+    - 'h_max'  # Height of shower maximum from stereoscopic reconstruction
+    - 'impact_dist'  # Impact parameter from stereoscopic reconstruction
+    - 'hillas_width'  # Image Width
+    - 'hillas_length'  # Image Length
+    # - 'concentration_pixel'  # Percentage of photo-electrons in the brightest pixel
+    - 'leakage_intensity_width_1_reco'  # fraction of total Intensity which is contained in the outermost pixels of the camera
+  Derived:  # custom evaluations of basic features that will be added to the data
+    # column name : expression to evaluate using basic column names
+    log10_WLS: log10(hillas_width*hillas_length/hillas_intensity)
+    log10_intensity: log10(hillas_intensity)
+    r_origin: (sqrt((hillas_x - az)**2 + (hillas_y - alt)**2))**2
+    phi_origin: arctan2(hillas_y - alt, hillas_x - az)
+
+# These cuts select the input data BEFORE training
+SigFiducialCuts:
+  - 'good_image == 1'
+  - 'is_valid == True'
+  - 'hillas_intensity_reco > 0'
+
+# Information used by the benchmarking notebook related to this model
+Diagnostic:
+  # Energy binning (used for reco and true energy)
+  energy:
+    nbins: 15
+    min: 0.0125
+    max: 125
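Note that ``target_name`` here is ``log10_true_energy`` rather than ``true_energy``: the forest regresses the energy in log space, so downstream code has to undo the transformation. A toy example of the inversion (TeV units assumed, values invented):

.. code-block:: python

    import numpy as np

    # Stand-in for model.predict(features_values) on three images
    log10_energy_pred = np.array([-0.5, 0.0, 1.2])

    # Back to linear energy units
    energy_pred = 10 ** log10_energy_pred  # ~[0.32, 1.0, 15.8] TeV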
diff --git a/protopipe/scripts/tests/test_classifier.yaml b/protopipe/scripts/tests/test_classifier.yaml
deleted file mode 100644
index e1d54bb5..00000000
--- a/protopipe/scripts/tests/test_classifier.yaml
+++ /dev/null
@@ -1,51 +0,0 @@
-General:
-  model_type: 'classifier'
-  # [...] = your analysis local full path OUTSIDE the Vagrant box
-  data_dir: '[...]/shared_folder/analyses/v0.4.0_dev1/data/TRAINING/for_particle_classification/'
-  data_sig_file: 'TRAINING_classification_{}_gamma_merged.h5'
-  data_bkg_file: 'TRAINING_classification_{}_proton_merged.h5'
-  cam_id_list: ['LSTCam', 'NectarCam']
-  table_name_template: ''  # leave empty (TO BE REMOVED)
-  outdir: '[...]/shared_folder/analyses/v0.4.0_dev1/estimators/gamma_hadron_classifier'
-
-Split:
-  train_fraction: 0.5
-  use_same_number_of_sig_and_bkg_for_training: False  # Lowest statistics will drive the split
-
-Method:
-  name: 'RandomForestClassifier'  # AdaBoostClassifier or RandomForestClassifier
-  target_name: 'label'
-  tuned_parameters:  # these are lists of values used by the GridSearchCV algorithm
-    n_estimators: [200]
-    max_depth: [10]  # null for None
-    max_features: [3]  # possible choices are "auto", "sqrt", "log2", int or float
-    min_samples_split: [10]
-    min_samples_leaf: [10]
-  scoring: 'roc_auc'  # possible choices are 'roc_auc', 'explained_variance'
-  cv: 2
-  use_proba: True  # If not, output is a score
-  calibrate_output: False  # If True calibrate probability
-
-FeatureList:
-  # - 'log10_reco_energy'
-  # - 'log10_reco_energy_tel'
-  - 'log10_hillas_intensity'
-  - 'hillas_width'
-  - 'hillas_length'
-  - 'h_max'
-  - 'impact_dist'
-
-SigFiducialCuts:
-  - 'good_image == 1'
-  - 'is_valid == True'
-
-BkgFiducialCuts:
-  - 'good_image == 1'
-  - 'is_valid == True'
-
-Diagnostic:
-  # Energy binning (used for reco and true energy)
-  energy:
-    nbins: 4
-    min: 0.02
-    max: 200
diff --git a/protopipe/scripts/tests/test_config_analysis_north.yaml b/protopipe/scripts/tests/test_config_analysis_north.yaml
index 2391676b..6bce806d 100644
--- a/protopipe/scripts/tests/test_config_analysis_north.yaml
+++ b/protopipe/scripts/tests/test_config_analysis_north.yaml
@@ -96,7 +96,7 @@ Reconstruction:
 # Parameters for energy estimation
 EnergyRegressor:
   # Name of the regression method (e.g. AdaBoostRegressor, etc.)
-  method_name: 'AdaBoostRegressor'
+  method_name: 'RandomForestRegressor'

 # Parameters for g/h separation
 GammaHadronClassifier:
diff --git a/protopipe/scripts/tests/test_config_analysis_south.yaml b/protopipe/scripts/tests/test_config_analysis_south.yaml
index 3ad4ee3b..09aa981a 100644
--- a/protopipe/scripts/tests/test_config_analysis_south.yaml
+++ b/protopipe/scripts/tests/test_config_analysis_south.yaml
@@ -96,7 +96,7 @@ Reconstruction:
 # Parameters for energy estimation
 EnergyRegressor:
   # Name of the regression method (e.g. AdaBoostRegressor, etc.)
-  method_name: 'AdaBoostRegressor'
+  method_name: 'RandomForestRegressor'

 # Parameters for g/h separation
 GammaHadronClassifier:
diff --git a/protopipe/scripts/tests/test_pipeline.py b/protopipe/scripts/tests/test_pipeline.py
index dc330276..23ff4fe6 100644
--- a/protopipe/scripts/tests/test_pipeline.py
+++ b/protopipe/scripts/tests/test_pipeline.py
@@ -13,6 +13,9 @@
 # CONFIG FILES
 config_prod3b_CTAN = resource_filename("protopipe", "scripts/tests/test_config_analysis_north.yaml")
 config_prod3b_CTAS = resource_filename("protopipe", "scripts/tests/test_config_analysis_south.yaml")
+config_AdaBoostRegressor = resource_filename("protopipe", "scripts/tests/test_AdaBoostRegressor.yaml")
+config_RandomForestRegressor = resource_filename("protopipe", "scripts/tests/test_RandomForestRegressor.yaml")
+config_RandomForestClassifier = resource_filename("protopipe", "scripts/tests/test_RandomForestClassifier.yaml")

 # TEST FILES
@@ -57,7 +60,7 @@ def test_GET_GAMMAS_FOR_ENERGY_MODEL_WITH_IMAGES(test_case, pipeline_testdir):

     print(  # only with "pytest -s"
         f'''
-        /n You can reproduce this test by running the following command,
+        You can reproduce this test by running the following command,

         {command}
         '''
@@ -89,7 +92,7 @@ def test_GET_GAMMAS_FOR_ENERGY_MODEL(test_case, pipeline_testdir):

     print(  # only with "pytest -s"
         f'''
-        /n You can reproduce this test by running the following command,
+        You can reproduce this test by running the following command,

         {command}
         '''
@@ -106,28 +109,56 @@


 @pytest.mark.parametrize("test_case", [
-    pytest.param("PROD3B_CTA_NORTH", marks=pytest.mark.dependency(name="EN",
+    pytest.param("PROD3B_CTA_NORTH", marks=pytest.mark.dependency(name="EN_1",
                                                                   depends=["g1N"])),
-    pytest.param("PROD3B_CTA_SOUTH", marks=pytest.mark.dependency(name="ES",
+    pytest.param("PROD3B_CTA_SOUTH", marks=pytest.mark.dependency(name="ES_1",
                                                                   depends=["g1S"])),
 ])
 def test_BUILD_ENERGY_MODEL_AdaBoost_DecisionTreeRegressor(test_case, pipeline_testdir):
-    """Launch protopipe.scripts.build_model for a AdaBoost DecisionTreeRegressor."""
+    """Launch protopipe.scripts.build_model for an AdaBoostRegressor based on a DecisionTreeRegressor."""

     infile = pipeline_testdir / f"test_gamma1_noImages_{test_case}.h5"
     outdir = pipeline_testdir / f"energy_model_{test_case}"

-    config = resource_filename("protopipe", "scripts/tests/test_regressor.yaml")
-
+    command = f"python {build_model.__file__}\
+    --config_file {config_AdaBoostRegressor}\
+    --infile_signal {infile}\
+    --outdir {outdir}\
+    --cameras_from_file"
+
+    print(  # only with "pytest -s"
+        f'''
+        You can reproduce this test by running the following command,
+
+        {command}
+        '''
+    )
+
+    exit_status = system(command)
+    assert exit_status == 0
+
+
+@pytest.mark.parametrize("test_case", [
+    pytest.param("PROD3B_CTA_NORTH", marks=pytest.mark.dependency(name="EN_2",
+                                                                  depends=["g1N"])),
+    pytest.param("PROD3B_CTA_SOUTH", marks=pytest.mark.dependency(name="ES_2",
+                                                                  depends=["g1S"])),
+])
+def test_BUILD_ENERGY_MODEL_RandomForestRegressor(test_case, pipeline_testdir):
+    """Launch protopipe.scripts.build_model for a RandomForestRegressor."""
+
+    infile = pipeline_testdir / f"test_gamma1_noImages_{test_case}.h5"
+    outdir = pipeline_testdir / f"energy_model_{test_case}"

     command = f"python {build_model.__file__}\
-    --config_file {config}\
+    --config_file {config_RandomForestRegressor}\
     --infile_signal {infile}\
     --outdir {outdir}\
     --cameras_from_file"

     print(  # only with "pytest -s"
         f'''
-        /n You can reproduce this test by running the following command,
+        You can reproduce this test by running the following command,

         {command}
         '''
@@ -139,9 +170,9 @@

 @pytest.mark.parametrize("test_case", [
     pytest.param("PROD3B_CTA_NORTH", marks=pytest.mark.dependency(name="g2N",
-                                                                  depends=["EN"])),
+                                                                  depends=["EN_2"])),
     pytest.param("PROD3B_CTA_SOUTH", marks=pytest.mark.dependency(name="g2S",
-                                                                  depends=["ES"])),
+                                                                  depends=["ES_2"])),
 ])
 def test_GET_GAMMAS_FOR_CLASSIFICATION_MODEL(test_case, pipeline_testdir):
@@ -154,11 +185,12 @@ def test_GET_GAMMAS_FOR_CLASSIFICATION_MODEL(test_case, pipeline_testdir):
     -i {input_data[test_case]['gamma2'].parent}\
     -f {input_data[test_case]['gamma2'].name}\
     --estimate_energy True\
+    --regressor_config {config_RandomForestRegressor}\
     --regressor_dir {modelpath}"

     print(  # only with "pytest -s"
         f'''
-        /n You can reproduce this test by running the following command,
+        You can reproduce this test by running the following command,

         {command}
         '''
@@ -176,9 +208,9 @@

 @pytest.mark.parametrize("test_case", [
     pytest.param("PROD3B_CTA_NORTH", marks=pytest.mark.dependency(name="p1N",
-                                                                  depends=["EN"])),
+                                                                  depends=["EN_2"])),
     pytest.param("PROD3B_CTA_SOUTH", marks=pytest.mark.dependency(name="p1S",
-                                                                  depends=["ES"])),
+                                                                  depends=["ES_2"])),
 ])
 def test_GET_PROTONS_FOR_CLASSIFICATION_MODEL(test_case, pipeline_testdir):
@@ -191,11 +223,12 @@ def test_GET_PROTONS_FOR_CLASSIFICATION_MODEL(test_case, pipeline_testdir):
     -i {input_data[test_case]['proton1'].parent}\
     -f {input_data[test_case]['proton1'].name}\
     --estimate_energy True\
+    --regressor_config {config_RandomForestRegressor}\
     --regressor_dir {modelpath}"

     print(  # only with "pytest -s"
         f'''
-        /n You can reproduce this test by running the following command,
+        You can reproduce this test by running the following command,

         {command}
         '''
@@ -217,17 +250,15 @@
     pytest.param("PROD3B_CTA_SOUTH", marks=pytest.mark.dependency(name="C2",
                                                                   depends=["g2S", "p1S"])),
 ])
-def test_BUILD_CLASSIFICATION_MODEL_RandomForest(test_case, pipeline_testdir):
+def test_BUILD_CLASSIFICATION_MODEL_RandomForestClassifier(test_case, pipeline_testdir):
     """Launch protopipe.scripts.build_model for a Random Forest classifier."""

     infile_signal = pipeline_testdir / f"test_gamma2_noImages_{test_case}.h5"
     infile_background = pipeline_testdir / f"test_proton1_noImages_{test_case}.h5"
     outdir = pipeline_testdir / f"classification_model_{test_case}"

-    config = resource_filename("protopipe", "scripts/tests/test_classifier.yaml")
-
     command = f"python {build_model.__file__}\
-    --config_file {config}\
+    --config_file {config_RandomForestClassifier}\
     --infile_signal {infile_signal}\
     --infile_background {infile_background}\
     --outdir {outdir}\
@@ -235,7 +266,7 @@ def test_BUILD_CLASSIFICATION_MODEL_RandomForestClassifier(test_case, pipeline_testdir):

     print(  # only with "pytest -s"
         f'''
-        /n You can reproduce this test by running the following command,
+        You can reproduce this test by running the following command,

         {command}
         '''
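The renamed markers (``EN_1``/``EN_2``, ``ES_1``/``ES_2``) keep the dependency chain unambiguous now that two energy models are built from the same training files. The chaining itself comes from the ``pytest-dependency`` plugin; a minimal sketch of the pattern (test names invented for illustration):

.. code-block:: python

    import pytest

    @pytest.mark.dependency(name="g1N")
    def test_get_gammas_north():
        assert True  # produce the training data

    # Skipped automatically if test_get_gammas_north failed or was skipped
    @pytest.mark.dependency(name="EN_2", depends=["g1N"])
    def test_build_energy_model_north():
        assert True  # build the model from that data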
diff --git a/protopipe/scripts/tests/test_regressor.yaml b/protopipe/scripts/tests/test_regressor.yaml
deleted file mode 100644
index 9041fe5e..00000000
--- a/protopipe/scripts/tests/test_regressor.yaml
+++ /dev/null
@@ -1,41 +0,0 @@
-General:
-  model_type: 'regressor'
-  # [...] = your analysis local full path OUTSIDE the Vagrant box
-  data_dir: './'
-  data_file: 'test_TRAINING_energy_{}_gamma_merged.h5'
-  outdir: './'
-  cam_id_list: ['LSTCam', 'NectarCam']
-  table_name_template: ''  # leave empty (TO BE REMOVED)
-
-Split:
-  train_fraction: 0.5
-
-Method:
-  name: 'AdaBoostRegressor'
-  target_name: 'true_energy'
-  tuned_parameters:
-    learning_rate: [0.3]
-    n_estimators: [100]
-    base_estimator__max_depth: [null]  # null is equivalent to None
-    base_estimator__min_samples_split: [2]
-    base_estimator__min_samples_leaf: [10]
-  scoring: 'explained_variance'
-  cv: 2
-
-FeatureList:
-  - 'log10_hillas_intensity'
-  - 'log10_impact_dist'
-  - 'hillas_width_reco'
-  - 'hillas_length_reco'
-  - 'h_max'
-
-SigFiducialCuts:
-  - 'good_image == 1'
-  - 'is_valid == True'
-
-Diagnostic:
-  # Energy binning (used for reco and true energy)
-  energy:
-    nbins: 15
-    min: 0.0125
-    max: 125
diff --git a/protopipe/scripts/write_dl2.py b/protopipe/scripts/write_dl2.py
index dbb94da0..f30beacb 100755
--- a/protopipe/scripts/write_dl2.py
+++ b/protopipe/scripts/write_dl2.py
@@ -2,6 +2,7 @@
 from sys import exit

 import numpy as np
+import pandas as pd
 from glob import glob
 import signal
 from astropy.coordinates.angle_utilities import angular_separation
@@ -47,6 +48,20 @@ def main():
         action="store_true",
         help="Save images in images.h5 (one file testing)",
     )
+
+    parser.add_argument(
+        "--regressor_config",
+        type=str,
+        default=None,
+        help="Configuration file used to produce regressor model"
+    )
+    parser.add_argument(
+        "--classifier_config",
+        type=str,
+        default=None,
+        help="Configuration file used to produce classification model"
+    )
+
     args = parser.parse_args()

     # Read configuration file
@@ -133,8 +148,12 @@ def main():

     # Classifiers
     if use_classifier:
+
+        # Read configuration file
+        classifier_config = load_config(args.classifier_config)
+
         classifier_files = (
-            args.classifier_dir + "/classifier_{mode}_{cam_id}_{classifier}.pkl.gz"
+            args.classifier_dir + "/classifier_{cam_id}_{classifier}.pkl.gz"
         )
         clf_file = classifier_files.format(
             **{
@@ -156,8 +175,12 @@ def main():

     # Regressors
     if use_regressor:
+
+        # Read configuration file
+        regressor_config = load_config(args.regressor_config)
+
         regressor_files = (
-            args.regressor_dir + "/regressor_{mode}_{cam_id}_{regressor}.pkl.gz"
+            args.regressor_dir + "/regressor_{cam_id}_{regressor}.pkl.gz"
        )
         reg_file = regressor_files.format(
             **{
@@ -271,13 +294,10 @@ class RecoEvent(tb.IsDescription):
         source, save_images=args.save_images, debug=args.debug
     ):

-        # True energy
-        true_energy = event.simulation.shower.energy.value
-
         # True direction
         true_az = event.simulation.shower.az
         true_alt = event.simulation.shower.alt
-
+
         # Array pointing in AltAz frame
         pointing_az = event.pointing.array_azimuth
         pointing_alt = event.pointing.array_altitude
@@ -339,21 +359,55 @@ class RecoEvent(tb.IsDescription):
                 cam_id = source.subarray.tel[tel_id].camera.camera_name
                 moments = hillas_dict[tel_id]
+                model = regressors[cam_id]

-                # Features to be fed in the regressor
-                features_img = np.array(
-                    [
-                        np.log10(moments.intensity),
-                        np.log10(impact_dict[tel_id].value),
-                        moments.width.value,
-                        moments.length.value,
-                        h_max.value,
-                    ]
-                )
+                ############################################################
+                # GET FEATURES
+                ############################################################
+
+                # Read feature list from model configuration file
+                features_basic = regressor_config["FeatureList"]["Basic"]
+                features_derived = regressor_config["FeatureList"]["Derived"]
+                features = features_basic + list(features_derived)
+
+                # Create a pandas DataFrame with basic quantities
+                # This is needed in order to connect the I/O system of the
+                # model inputs to the in-memory computation of this script
+                data = pd.DataFrame({
+                    "hillas_intensity": [moments.intensity],
+                    "hillas_width": [moments.width.to("deg").value],
+                    "hillas_length": [moments.length.to("deg").value],
+                    "hillas_x": [moments.x.to("deg").value],
+                    "hillas_y": [moments.y.to("deg").value],
+                    "hillas_phi": [moments.phi.to("deg").value],
+                    "hillas_r": [moments.r.to("deg").value],
+                    "leakage_intensity_width_1_reco": [leakage_dict[tel_id]['leak1_reco']],
+                    "leakage_intensity_width_2_reco": [leakage_dict[tel_id]['leak2_reco']],
+                    "leakage_intensity_width_1": [leakage_dict[tel_id]['leak1']],
+                    "leakage_intensity_width_2": [leakage_dict[tel_id]['leak2']],
+                    "az": [reco_result.az.to("deg").value],
+                    "alt": [reco_result.alt.to("deg").value],
+                    "h_max": [h_max.value],
+                    "impact_dist": [impact_dict[tel_id].to("m").value],
+                })
+
+                # Compute derived features and add them to the dataframe
+                for key, expression in features_derived.items():
+                    if key not in data:
+                        data.eval(f'{key} = {expression}', inplace=True)
+
+                # Sort features alphabetically to ensure order
+                # preservation with model.fit in protopipe.mva
+                features = sorted(features)
+
+                # Select the values for the full set of features
+                features_values = data[features].to_numpy()
+
+                ############################################################

                 if good_for_reco[tel_id] == 1:
-                    energy_tel[idx] = model.predict([features_img])
+                    energy_tel[idx] = model.predict(features_values)
                 else:
                     energy_tel[idx] = np.nan
@@ -389,24 +443,57 @@ class RecoEvent(tb.IsDescription):
             weight_tel = np.zeros(len(hillas_dict.keys()))

             for idx, tel_id in enumerate(hillas_dict.keys()):
+
                 cam_id = source.subarray.tel[tel_id].camera.camera_name
                 moments = hillas_dict[tel_id]
+
                 model = classifiers[cam_id]

-                # Features to be fed in the classifier
-                # this should be read in some way from
-                # the classifier configuration file!!!!!
-
-                features_img = np.array(
-                    [
-                        np.log10(reco_energy),
-                        np.log10(energy_tel_classifier[tel_id]),
-                        np.log10(moments.intensity),
-                        moments.width.value,
-                        moments.length.value,
-                        h_max.value,
-                        impact_dict[tel_id].value,
-                    ]
-                )
+                ############################################################
+                # GET FEATURES
+                ############################################################
+
+                # Read feature list from model configuration file
+                features_basic = classifier_config["FeatureList"]["Basic"]
+                features_derived = classifier_config["FeatureList"]["Derived"]
+                features = features_basic + list(features_derived)
+
+                # Create a pandas DataFrame with basic quantities
+                # This is needed in order to connect the I/O system of the
+                # model inputs to the in-memory computation of this script
+                data = pd.DataFrame({
+                    "hillas_intensity": [moments.intensity],
+                    "hillas_width": [moments.width.to("deg").value],
+                    "hillas_length": [moments.length.to("deg").value],
+                    "hillas_x": [moments.x.to("deg").value],
+                    "hillas_y": [moments.y.to("deg").value],
+                    "hillas_phi": [moments.phi.to("deg").value],
+                    "hillas_r": [moments.r.to("deg").value],
+                    "leakage_intensity_width_1_reco": [leakage_dict[tel_id]['leak1_reco']],
+                    "leakage_intensity_width_2_reco": [leakage_dict[tel_id]['leak2_reco']],
+                    "leakage_intensity_width_1": [leakage_dict[tel_id]['leak1']],
+                    "leakage_intensity_width_2": [leakage_dict[tel_id]['leak2']],
+                    "az": [reco_result.az.to("deg").value],
+                    "alt": [reco_result.alt.to("deg").value],
+                    "h_max": [h_max.value],
+                    "impact_dist": [impact_dict[tel_id].to("m").value],
+                    "reco_energy": reco_energy,
+                    "reco_energy_tel": energy_tel_classifier[tel_id],
+                })
+
+                # Compute derived features and add them to the dataframe
+                for key, expression in features_derived.items():
+                    if key not in data:
+                        data.eval(f'{key} = {expression}', inplace=True)
+
+                # Sort features alphabetically to ensure order
+                # preservation with model.fit in protopipe.mva
+                features = sorted(features)
+
+                # Select the values for the full set of features
+                features_values = data[features].to_numpy()
+
+                ############################################################

                 # Here we check for valid telescope-wise energies
                 # Because it means that it's a good image
@@ -415,11 +502,9 @@ class RecoEvent(tb.IsDescription):
                 if not np.isnan(energy_tel_classifier[tel_id]):
                     # Output of classifier according to type of classifier
                     if use_proba_for_classifier is False:
-                        score_tel[idx] = model.decision_function([features_img])
+                        score_tel[idx] = model.decision_function(features_values)
                     else:
-                        gammaness_tel[idx] = model.predict_proba([features_img])[
-                            :, 1
-                        ]
+                        gammaness_tel[idx] = model.predict_proba(features_values)[:, 1]
                     weight_tel[idx] = np.sqrt(moments.intensity)
                 else:
                     # WARNING:
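For the classifier output, ``features_values`` is passed either to ``decision_function`` (score mode) or to ``predict_proba`` (gammaness mode); the gammaness is the predicted probability of the signal class, i.e. column 1. A self-contained toy example of the gammaness path (synthetic data; note that ``RandomForestClassifier`` itself exposes no ``decision_function``, so the score branch applies only to margin-based classifiers):

.. code-block:: python

    import numpy as np
    from sklearn.ensemble import RandomForestClassifier

    rng = np.random.RandomState(0)
    X = rng.normal(size=(100, 4))  # 100 events, 4 features
    y = (X[:, 0] > 0).astype(int)  # 1 == "gamma" in this toy setup

    clf = RandomForestClassifier(n_estimators=10, random_state=0).fit(X, y)

    # Probability of the signal class for one event: the 'gammaness'
    gammaness = clf.predict_proba(X[:1])[:, 1]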
@@ -481,7 +566,7 @@ class RecoEvent(tb.IsDescription):
             for idx, tel_id in enumerate(hillas_dict.keys()):
                 cam_id = source.subarray.tel[tel_id].camera.camera_name
                 if cam_id not in images_phe:
-
+
                     n_pixels = source.subarray.tel[tel_id].camera.geometry.n_pixels
                     StoredImages["true_image"] = tb.Float32Col(
                         shape=(n_pixels), pos=2
@@ -495,7 +580,7 @@ class RecoEvent(tb.IsDescription):
                     StoredImages["cleaning_mask_clusters"] = tb.BoolCol(
                         shape=(n_pixels), pos=5
                     )  # not in ctapipe
-
+
                     images_table[cam_id] = images_outfile.create_table(
                         "/", "_".join(["images", cam_id]), StoredImages
                     )