diff --git a/docs/mva/index.rst b/docs/mva/index.rst
index 6e93e1f1..9b76c6ac 100644
--- a/docs/mva/index.rst
+++ b/docs/mva/index.rst
@@ -10,20 +10,30 @@ Introduction
 classification problems. It is based on machine learning methods available in
 scikit-learn_. Internally, the tables are dealt with the Pandas_ Python module.
 
-For each type of camera a regressor/classifier should be trained. For both type of models
-an average of the image estimates is later computed to determine a global
-output for the event (energy or score/gammaness).
+For each type of camera a regressor/classifier should be trained.
+For both types of models an average of the image estimates is later computed to
+determine a global output for the event (energy or score/gammaness).
 
 Details
 -------
 
-Data is split in train and test subsamples by images.
+Data is split into train and test subsamples by single-telescope images.
 
-The class `TrainModel` uses a training sample composed of gamma-rays for a
+The class ``TrainModel`` uses a training sample composed of gamma-rays for a
 regression model. In addition of a gamma-ray sample, a sample of
-protons is also used to build a classifier. The training of a model is done via
-the GridSearchCV_ algorithm which allows to find the best hyper-parameters of
-the models.
+protons is also used to build a classifier.
+
+The training of a model can also be done via the GridSearchCV_ algorithm,
+which finds the best hyper-parameters of the models.
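+
+When this option is enabled, the hyper-parameters are given as lists of values
+and the best combination is selected by cross-validation. A minimal,
+self-contained illustration of the underlying scikit-learn machinery (the
+estimator and the grid values below are only examples):
+
+.. code-block:: python
+
+    from sklearn.ensemble import RandomForestRegressor
+    from sklearn.model_selection import GridSearchCV
+
+    # hyper-parameters to scan, each given as a list of values
+    param_grid = {"n_estimators": [50, 100], "max_depth": [None, 10]}
+
+    search = GridSearchCV(RandomForestRegressor(), param_grid,
+                          scoring="explained_variance", cv=2)
+    # after search.fit(X_train, y_train) the tuned model is
+    # available as search.best_estimator_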
+
+Supported models:
+
+- ``sklearn.ensemble.RandomForestClassifier``
+- ``sklearn.ensemble.RandomForestRegressor``
+- ``sklearn.ensemble.AdaBoostRegressor``
+
+For details about the generation of each model type, please refer to
+:ref:`model_building`.
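+
+Internally, the model class is imported dynamically from its full name as
+written in the configuration file. The following sketch mirrors what
+``protopipe-MODEL`` does (the class name is an example; the hyper-parameters
+come from the ``Method`` section of the configuration file):
+
+.. code-block:: python
+
+    import importlib
+
+    model_to_use = "sklearn.ensemble.RandomForestRegressor"
+
+    # "sklearn.ensemble.RandomForestRegressor" -> "sklearn.ensemble" + "RandomForestRegressor"
+    module_name = ".".join(model_to_use.split(".", 2)[:-1])
+    class_name = model_to_use.split(".")[-1]
+
+    model_class = getattr(importlib.import_module(module_name), class_name)
+    model = model_class(n_estimators=50)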
 
 Reference/API
 -------------
diff --git a/docs/scripts/DL2.rst b/docs/scripts/DL2.rst
index a19028af..79e34048 100644
--- a/docs/scripts/DL2.rst
+++ b/docs/scripts/DL2.rst
@@ -15,14 +15,11 @@ By invoking the help argument, you can get help about how the script works:
 
 .. code-block::
 
-    usage: protopipe-DL2 [-h] --config_file CONFIG_FILE -o OUTFILE [-m MAX_EVENTS]
-                         [-i INDIR] [-f [INFILE_LIST [INFILE_LIST ...]]]
-                         [--cam_ids [CAM_IDS [CAM_IDS ...]]] [--wave_dir WAVE_DIR]
-                         [--wave_temp_dir WAVE_TEMP_DIR] [--wave | --tail]
-                         [--debug] [--regressor_dir REGRESSOR_DIR]
-                         [--classifier_dir CLASSIFIER_DIR]
-                         [--force_tailcut_for_extended_cleaning FORCE_TAILCUT_FOR_EXTENDED_CLEANING]
-                         [--save_images]
+    usage: protopipe-DL2 [-h] --config_file CONFIG_FILE -o OUTFILE [-m MAX_EVENTS] [-i INDIR] [-f [INFILE_LIST [INFILE_LIST ...]]]
+                         [--cam_ids [CAM_IDS [CAM_IDS ...]]] [--wave_dir WAVE_DIR] [--wave_temp_dir WAVE_TEMP_DIR] [--wave | --tail] [--debug]
+                         [--regressor_dir REGRESSOR_DIR] [--classifier_dir CLASSIFIER_DIR]
+                         [--force_tailcut_for_extended_cleaning FORCE_TAILCUT_FOR_EXTENDED_CLEANING] [--save_images]
+                         [--regressor_config REGRESSOR_CONFIG] [--classifier_config CLASSIFIER_CONFIG]
 
     optional arguments:
      -h, --help            show this help message and exit
@@ -35,11 +32,9 @@ By invoking the help argument, you can get help about how the script works:
                            give a specific list of files to run on
      --cam_ids [CAM_IDS [CAM_IDS ...]]
                            give the specific list of camera types to run on
-     --wave_dir WAVE_DIR   directory where to find mr_filter. if not set look in
-                           $PATH
+     --wave_dir WAVE_DIR   directory where to find mr_filter. if not set look in $PATH
      --wave_temp_dir WAVE_TEMP_DIR
-                           directory where mr_filter to store the temporary fits
-                           files
+                           directory where mr_filter to store the temporary fits files
      --wave                if set, use wavelet cleaning -- default
      --tail                if set, use tail cleaning, otherwise wavelets
      --debug               Print debugging information
@@ -50,3 +45,7 @@ By invoking the help argument, you can get help about how the script works:
      --force_tailcut_for_extended_cleaning FORCE_TAILCUT_FOR_EXTENDED_CLEANING
                            For tailcut cleaning for energy/score estimation
      --save_images         Save images in images.h5 (one file testing)
+     --regressor_config REGRESSOR_CONFIG
+                           Configuration file used to produce regressor model
+     --classifier_config CLASSIFIER_CONFIG
+                           Configuration file used to produce classification model
diff --git a/docs/scripts/data_training.rst b/docs/scripts/data_training.rst
index daf31c7e..24ee6300 100644
--- a/docs/scripts/data_training.rst
+++ b/docs/scripts/data_training.rst
@@ -19,15 +19,10 @@ By invoking the help argument, you can get help about how the script works:
 
 .. code-block::
 
-    usage: protopipe-TRAINING [-h] --config_file CONFIG_FILE -o OUTFILE
-                              [-m MAX_EVENTS] [-i INDIR]
-                              [-f [INFILE_LIST [INFILE_LIST ...]]]
-                              [--cam_ids [CAM_IDS [CAM_IDS ...]]]
-                              [--wave_dir WAVE_DIR]
-                              [--wave_temp_dir WAVE_TEMP_DIR] [--wave | --tail]
-                              [--debug] [--save_images]
-                              [--estimate_energy ESTIMATE_ENERGY]
-                              [--regressor_dir REGRESSOR_DIR]
+    usage: protopipe-TRAINING [-h] --config_file CONFIG_FILE -o OUTFILE [-m MAX_EVENTS] [-i INDIR] [-f [INFILE_LIST [INFILE_LIST ...]]]
+                              [--cam_ids [CAM_IDS [CAM_IDS ...]]] [--wave_dir WAVE_DIR] [--wave_temp_dir WAVE_TEMP_DIR] [--wave | --tail]
+                              [--debug] [--save_images] [--estimate_energy ESTIMATE_ENERGY] [--regressor_dir REGRESSOR_DIR]
+                              [--regressor_config REGRESSOR_CONFIG]
 
     optional arguments:
      -h, --help            show this help message and exit
@@ -40,20 +35,19 @@ By invoking the help argument, you can get help about how the script works:
                            give a specific list of files to run on
      --cam_ids [CAM_IDS [CAM_IDS ...]]
                            give the specific list of camera types to run on
-     --wave_dir WAVE_DIR   directory where to find mr_filter. if not set look in
-                           $PATH
+     --wave_dir WAVE_DIR   directory where to find mr_filter. if not set look in $PATH
      --wave_temp_dir WAVE_TEMP_DIR
-                           directory where mr_filter to store the temporary fits
-                           files
+                           directory where mr_filter to store the temporary fits files
      --wave                if set, use wavelet cleaning -- default
      --tail                if set, use tail cleaning, otherwise wavelets
      --debug               Print debugging information
      --save_images         Save also all images
      --estimate_energy ESTIMATE_ENERGY
-                           Estimate the events' energy with a regressor from
-                           protopipe.scripts.build_model
+                           Estimate the events' energy with a regressor from protopipe.scripts.build_model
      --regressor_dir REGRESSOR_DIR
                            regressors directory
+     --regressor_config REGRESSOR_CONFIG
+                           Configuration file used to produce regressor model
 
 The configuration file used by this script is ``analysis.yaml``,
diff --git a/docs/scripts/model_building.rst b/docs/scripts/model_building.rst
index 7e83f5cb..341167c5 100644
--- a/docs/scripts/model_building.rst
+++ b/docs/scripts/model_building.rst
@@ -15,10 +15,8 @@ The following is the help output which shows required arguments and options.
 
 .. code-block::
 
     >$ protopipe-MODEL -h
-    usage: protopipe-MODEL [-h] --config_file CONFIG_FILE
-                           [--max_events MAX_EVENTS] [--wave | --tail]
-                           (--cameras_from_config | --cameras_from_file | --cam_id_list CAM_ID_LIST)
-                           [-i INDIR] [--infile_signal INFILE_SIGNAL]
+    usage: protopipe-MODEL [-h] --config_file CONFIG_FILE [--max_events MAX_EVENTS] [--wave | --tail]
+                           (--cameras_from_config | --cameras_from_file | --cam_id_list CAM_ID_LIST) [-i INDIR] [--infile_signal INFILE_SIGNAL]
                            [--infile_background INFILE_BACKGROUND] [-o OUTDIR]
 
     Build model for regression/classification
@@ -27,9 +25,9 @@ The following is the help output which shows required arguments and options.
      -h, --help            show this help message and exit
      --config_file CONFIG_FILE
      --max_events MAX_EVENTS
-                           maximum number of events for training
+                           maximum number of events to use
      --wave                if set, use wavelet cleaning
-     --tail                if set, use tail cleaning, otherwise wavelets
+     --tail                if set, use tail cleaning (default), otherwise wavelets
      --cameras_from_config
                            Get cameras configuration file (Priority 1)
      --cameras_from_file   Get cameras from input file (Priority 2)
@@ -44,10 +42,13 @@ The following is the help output which shows required arguments and options.
      -o OUTDIR, --outdir OUTDIR
 
 The script takes along its arguments a configuration file which depends on what
-type of estimator needs to be trained:
+type of model needs to be built.
 
-* ``regressor.yaml`` is used to train an energy regressor,
-* ``classifier.yaml`` is used to train a gamma/hadron classifier.
+The available choices can be found under ``protopipe.aux.example_config_files``:
+
+* ``AdaBoostRegressor.yaml`` is used to train an energy regressor,
+* ``RandomForestRegressor.yaml`` is used to train an energy regressor,
+* ``RandomForestClassifier.yaml`` is used to train a gamma/hadron classifier.
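+
+All of these configurations share the ``FeatureList`` section: ``Basic``
+features are read directly from the input table, while ``Derived`` ones are
+evaluated from it with ``pandas.DataFrame.eval``. A small illustration of this
+mechanism (the column values below are made up):
+
+.. code-block:: python
+
+    import pandas as pd
+
+    ds = pd.DataFrame({"hillas_width": [0.1],
+                       "hillas_length": [0.4],
+                       "hillas_intensity": [150.0]})
+
+    # column name : expression, as in the ``Derived`` section
+    derived = {"log10_WLS": "log10(hillas_width*hillas_length/hillas_intensity)"}
+
+    for name, expression in derived.items():
+        ds.eval(f"{name} = {expression}", inplace=True)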
 
 Energy regressor
 ----------------
@@ -57,44 +58,86 @@
 and some event characteristics (the features) to reconstruct the energy.
 This table is created in the :ref:`data_training` step.
 
 The following is a commented example of the required configuration file
-``regressor.yaml``:
+``AdaBoostRegressor.yaml``, whose options are similar to those of
+``RandomForestRegressor.yaml``:
 
 .. code-block:: yaml
 
    General:
-     model_type: 'regressor'
-     # [...] = your analysis local full path OUTSIDE the Vagrant box
-     data_dir: '[...]/shared_folder/analyses/v0.4.0_dev1/data/TRAINING/for_energy_estimation'
-     data_file: 'TRAINING_energy_tail_gamma_merged.h5'
-     outdir: '[...]/shared_folder/analyses/v0.4.0_dev1/estimators/energy_regressor'
-     cam_id_list: ['LSTCam', 'NectarCam']
-     table_name_template: '' # leave empty (TO BE REMOVED)
-
+     # [...] = your analysis local full path OUTSIDE the Vagrant box
+     data_dir: '../../data/'
+     data_sig_file: 'TRAINING_energy_tail_gamma_merged.h5'
+     outdir: './'
+
+     # List of cameras to use (you can override this from the CLI)
+     cam_id_list: ['LSTCam', 'NectarCam']
+
+   # If train_fraction is 1, all the TRAINING dataset will be used to train the
+   # model and benchmarking can only be done from the benchmarking notebook
+   # TRAINING/benchmarks_DL2_to_classification.ipynb
   Split:
-     train_fraction: 0.8
+     train_fraction: 0.8
+     use_same_number_of_sig_and_bkg_for_training: False # Lowest statistics will drive the split
+
+   # Optimize the hyper-parameters of the estimator with a grid search
+   # If True parameters should be provided as lists
+   # If False the model used will be the one based on the chosen single-valued hyper-parameters
+   GridSearchCV:
+     use: False # True or False
+     # if False the following two variables are irrelevant
+     scoring: 'explained_variance'
+     cv: 2
 
   Method:
-     name: 'AdaBoostRegressor'
-     target_name: 'true_energy'
-     tuned_parameters:
-       learning_rate: [0.3]
-       n_estimators: [100]
-       base_estimator__max_depth: [null] # null is equivalent to None
-       base_estimator__min_samples_split: [2]
-       base_estimator__min_samples_leaf: [10]
-     scoring: 'explained_variance'
-     cv: 2
-
+     name: 'sklearn.ensemble.AdaBoostRegressor'
+     target_name: 'true_energy'
+     # Please, see scikit-learn's API for what each parameter means
+     # NOTE: null == None
+     base_estimator:
+       name: 'sklearn.tree.DecisionTreeRegressor'
+       parameters:
+         # NOTE: here we set the parameters relevant for sklearn.tree.DecisionTreeRegressor
+         criterion: "mse" # "mse", "friedman_mse", "mae" or "poisson"
+         splitter: "best" # "best" or "random"
+         max_depth: null # null or integer
+         min_samples_split: 2 # integer or float
+         min_samples_leaf: 1 # int or float
+         min_weight_fraction_leaf: 0.0 # float
+         max_features: null # null, "auto", "sqrt", "log2", int or float
+         max_leaf_nodes: null # null or integer
+         min_impurity_decrease: 0.0 # float
+         random_state: 0 # null or integer or RandomState
+         ccp_alpha: 0.0 # non-negative float
+     tuned_parameters:
+       n_estimators: 50
+       learning_rate: 1
+       loss: 'linear' # 'linear', 'square' or 'exponential'
+       random_state: 0 # int, RandomState instance or None
+
+   # List of the features to use to train the model
+   # You can:
+   # - comment/uncomment the ones you see here,
+   # - add new ones here if they can be evaluated with pandas.DataFrame.eval
+   # - if not you can propose modifications to protopipe.mva.utils.prepare_data
  FeatureList:
-     - 'log10_hillas_intensity'
-     - 'log10_impact_dist'
-     - 'hillas_width_reco'
-     - 'hillas_length_reco'
-     - 'h_max'
-
+     Basic: # single-named, they need to correspond to input data columns
+       - 'h_max' # Height of shower maximum from stereoscopic reconstruction
+       - 'impact_dist' # Impact parameter from stereoscopic reconstruction
+       - 'hillas_width' # Image Width
+       - 'hillas_length' # Image Length
+       # - 'concentration_pixel' # Percentage of photo-electrons in the brightest pixel
+       - 'leakage_intensity_width_1_reco' # fraction of total Intensity which is contained in the outermost pixels of the camera
+     Derived: # custom evaluations of basic features that will be added to the data
+       # column name : expression to evaluate using basic column names
+       log10_WLS: log10(hillas_width*hillas_length/hillas_intensity)
+       log10_intensity: log10(hillas_intensity)
+       r_origin: (sqrt((hillas_x - az)**2 + (hillas_y - alt)**2))**2
+       phi_origin: arctan2(hillas_y - alt, hillas_x - az)
+
+   # These cuts select the input data BEFORE training
  SigFiducialCuts:
-     - 'good_image == 1'
-     - 'is_valid == True'
+     - 'good_image == 1'
+     - 'is_valid == True'
+     - 'hillas_intensity_reco > 0'
 
   Diagnostic:
    # Energy binning (used for reco and true energy)
@@ -136,58 +179,97 @@
 as a contamination). An alternative approach - yet to study - could be to train
 a classifier with gamma against a background sample composed of weighted hadrons
 and weighted electrons.
 
+The following is the example provided by the configuration file
+``RandomForestClassifier.yaml``:
+
 .. code-block:: yaml
 
   General:
-    model_type: 'classifier'
-    # [...] = your analysis local full path OUTSIDE the Vagrant box
-    data_dir: '[...]/shared_folder/analyses/v0.4.0_dev1/data/TRAINING/for_particle_classification/'
-    data_sig_file: 'TRAINING_classification_tail_gamma_merged.h5'
-    data_bkg_file: 'TRAINING_classification_tail_proton_merged.h5'
-    cam_id_list: ['LSTCam', 'NectarCam']
-    table_name_template: '' # leave empty (TO BE REMOVED)
-    outdir: '[...]/shared_folder/analyses/v0.4.0_dev1/estimators/gamma_hadron_classifier'
-
+    # [...] = your analysis local full path OUTSIDE the Vagrant box
+    data_dir: '../../data/' # '[...]/data/TRAINING/for_particle_classification/'
+    data_sig_file: 'TRAINING_classification_tail_gamma_merged.h5'
+    data_bkg_file: 'TRAINING_classification_tail_proton_merged.h5'
+    outdir: './' # [...]/estimators/gamma_hadron_classifier
+
+    # List of cameras to use (protopipe-MODEL help output for other options)
+    cam_id_list: ['LSTCam', 'NectarCam']
+
+  # If train_fraction is 1, all the TRAINING dataset will be used to train the
+  # model and benchmarking can only be done from the benchmarking notebook
+  # TRAINING/benchmarks_DL2_to_classification.ipynb
  Split:
-    train_fraction: 0.8
-    use_same_number_of_sig_and_bkg_for_training: False # Lowest statistics will drive the split
-
+    train_fraction: 0.8
+    use_same_number_of_sig_and_bkg_for_training: False # Lowest statistics will drive the split
+
+  # Optimize the hyper-parameters of the estimator with a grid search
+  # If 'True' parameters should be provided as lists (for None use [null])
+  # If 'False' the model used will be the one based on the chosen single-valued hyper-parameters
+  GridSearchCV:
+    use: False # 'True' or 'False'
+    # if False the following two variables are irrelevant
+    scoring: 'roc_auc'
+    cv: 2
+
+  # Definition of the algorithm/method used and its hyper-parameters
  Method:
-    name: 'RandomForestClassifier' # AdaBoostClassifier or RandomForestClassifier
-    target_name: 'label'
-    tuned_parameters: # these are lists of values used by the GridSearchCV algorithm
-      n_estimators: [200]
-      max_depth: [10] # null for None
-      max_features: [3] # possible choices are “auto”, “sqrt”, “log2”, int or float
-      min_samples_split: [10]
-      min_samples_leaf: [10]
-    scoring: 'roc_auc' # possible choices are 'roc_auc', 'explained_variance'
-    cv: 2
-    use_proba: True # If not output is score
-    calibrate_output: False # If true calibrate probability
-
+    name: 'sklearn.ensemble.RandomForestClassifier' # DO NOT CHANGE
+    target_name: 'label' # defined between 0 and 1 (DO NOT CHANGE)
+    tuned_parameters:
+      # Please, see scikit-learn's API for what each parameter means
+      # WARNING: null (not a string) == 'None'
+      n_estimators: 100 # integer
+      criterion: 'gini' # 'gini' or 'entropy'
+      max_depth: null # null or integer
+      min_samples_split: 2 # integer or float
+      min_samples_leaf: 1 # integer or float
+      min_weight_fraction_leaf: 0.0 # float
+      max_features: 3 # 'auto', 'sqrt', 'log2', integer or float
+      max_leaf_nodes: null # null or integer
+      min_impurity_decrease: 0.0 # float
+      bootstrap: False # True or False
+      oob_score: False # True or False
+      n_jobs: null # null or integer
+      random_state: 0 # null or integer or RandomState
+      verbose: 0 # integer
+      warm_start: False # True or False
+      class_weight: null # 'balanced', 'balanced_subsample', null, dict or list of dicts
+      ccp_alpha: 0.0 # non-negative float
+      max_samples: null # null, integer or float
+    calibrate_output: False # If True calibrate model on test data
+
+  # List of the features to use to train the model
+  # You can:
+  # - comment/uncomment the ones you see here,
+  # - add new ones here if they can be evaluated with pandas.DataFrame.eval
+  # - if not you can propose modifications to protopipe.mva.utils.prepare_data
  FeatureList:
-    - 'log10_reco_energy'
-    - 'log10_reco_energy_tel'
-    - 'log10_hillas_intensity'
-    - 'hillas_width'
-    - 'hillas_length'
-    - 'h_max'
-    - 'impact_dist'
-
+    Basic: # single-named, they need to correspond to input data columns
+      - 'h_max' # Height of shower maximum from stereoscopic reconstruction
+      - 'impact_dist' # Impact parameter from stereoscopic reconstruction
+      - 'hillas_width' # Image Width
+      - 'hillas_length' # Image Length
+      # - 'concentration_pixel' # Percentage of photo-electrons in the brightest pixel
+    Derived: # custom evaluations of basic features that will be added to the data
+      # column name : expression to evaluate using basic column names
+      log10_intensity: log10(hillas_intensity)
+      log10_reco_energy: log10(reco_energy) # Averaged-estimated energy of the shower
+      log10_reco_energy_tel: log10(reco_energy_tel) # Estimated energy of the shower per telescope
+
+  # These cuts select the input data BEFORE training
  SigFiducialCuts:
-    - 'good_image == 1'
-    - 'is_valid == True'
+    - 'good_image == 1'
+    - 'is_valid == True'
+    - 'hillas_intensity_reco > 0'
 
   BkgFiducialCuts:
     - 'good_image == 1'
     - 'is_valid == True'
+    - 'hillas_intensity_reco > 0'
 
   Diagnostic:
    # Energy binning (used for reco and true energy)
    energy:
     nbins: 4
-    min: 0.02
+    min: 0.0125
     max: 200
 
 We want to exploit parameters showing statistical differences in the shower
diff --git a/protopipe/aux/example_config_files/AdaBoostRegressor.yaml b/protopipe/aux/example_config_files/AdaBoostRegressor.yaml
new file mode 100644
index 00000000..5aee5f32
--- /dev/null
+++ b/protopipe/aux/example_config_files/AdaBoostRegressor.yaml
@@ -0,0 +1,83 @@
+General:
+  # [...] = your analysis local full path OUTSIDE the Vagrant box
+  data_dir: '../../data/'
+  data_sig_file: 'TRAINING_energy_tail_gamma_merged.h5'
+  outdir: './'
+
+  # List of cameras to use (you can override this from the CLI)
+  cam_id_list: ['LSTCam', 'NectarCam']
+
+# If train_fraction is 1, all the TRAINING dataset will be used to train the
+# model and benchmarking can only be done from the benchmarking notebook
+# TRAINING/benchmarks_DL2_to_classification.ipynb
+Split:
+  train_fraction: 0.8
+  use_same_number_of_sig_and_bkg_for_training: False # Lowest statistics will drive the split
+
+# Optimize the hyper-parameters of the estimator with a grid search
+# If True parameters should be provided as lists
+# If False the model used will be the one based on the chosen single-valued hyper-parameters
+GridSearchCV:
+  use: False # True or False
+  # if False the following two variables are irrelevant
+  scoring: 'explained_variance'
+  cv: 2
+
+Method:
+  name: 'sklearn.ensemble.AdaBoostRegressor'
+  target_name: 'true_energy'
+  # Please, see scikit-learn's API for what each parameter means
+  # NOTE: null == None
+  base_estimator:
+    name: 'sklearn.tree.DecisionTreeRegressor'
+    parameters:
+      # NOTE: here we set the parameters relevant for sklearn.tree.DecisionTreeRegressor
+      criterion: "mse" # "mse", "friedman_mse", "mae" or "poisson"
+      splitter: "best" # "best" or "random"
+      max_depth: null # null or integer
+      min_samples_split: 2 # integer or float
+      min_samples_leaf: 1 # int or float
+      min_weight_fraction_leaf: 0.0 # float
+      max_features: null # null, "auto", "sqrt", "log2", int or float
+      max_leaf_nodes: null # null or integer
+      min_impurity_decrease: 0.0 # float
+      random_state: 0 # null or integer or RandomState
+      ccp_alpha: 0.0 # non-negative float
+  tuned_parameters:
+    n_estimators: 50
+    learning_rate: 1
+    loss: 'linear' # 'linear', 'square' or 'exponential'
+    random_state: 0 # int, RandomState instance or None
+
+# List of the features to use to train the model
+# You can:
+# - comment/uncomment the ones you see here,
+# - add new ones here if they can be evaluated with pandas.DataFrame.eval
+# - if not you can propose modifications to protopipe.mva.utils.prepare_data
+FeatureList:
+  Basic: # single-named, they need to correspond to input data columns
+    - 'h_max' # Height of shower maximum from stereoscopic reconstruction
+    - 'impact_dist' # Impact parameter from stereoscopic reconstruction
+    - 'hillas_width' # Image Width
+    - 'hillas_length' # Image Length
+    # - 'concentration_pixel' # Percentage of photo-electrons in the brightest pixel
+    - 'leakage_intensity_width_1_reco' # fraction of total Intensity which is contained in the outermost pixels of the camera
+  Derived: # custom evaluations of basic features that will be added to the data
+    # column name : expression to evaluate using basic column names
+    log10_WLS: log10(hillas_width*hillas_length/hillas_intensity)
+    log10_intensity: log10(hillas_intensity)
+    r_origin: (sqrt((hillas_x - az)**2 + (hillas_y - alt)**2))**2
+    phi_origin: arctan2(hillas_y - alt, hillas_x - az)
+
+# These cuts select the input data BEFORE training
+SigFiducialCuts:
+  - 'good_image == 1'
+  - 'is_valid == True'
+  - 'hillas_intensity_reco > 0'
+
+Diagnostic:
+  # Energy binning (used for reco and true energy)
+  energy:
+    nbins: 15
+    min: 0.0125
+    max: 125
diff --git a/protopipe/aux/example_config_files/RandomForestClassifier.yaml b/protopipe/aux/example_config_files/RandomForestClassifier.yaml
new file mode 100644
index 00000000..2a4a8a3d
--- /dev/null
+++ b/protopipe/aux/example_config_files/RandomForestClassifier.yaml
@@ -0,0 +1,88 @@
+General:
+  # [...] = your analysis local full path OUTSIDE the Vagrant box
+  data_dir: '../../data/' # '[...]/data/TRAINING/for_particle_classification/'
+  data_sig_file: 'TRAINING_classification_tail_gamma_merged.h5'
+  data_bkg_file: 'TRAINING_classification_tail_proton_merged.h5'
+  outdir: './' # [...]/estimators/gamma_hadron_classifier
+
+  # List of cameras to use (protopipe-MODEL help output for other options)
+  cam_id_list: ['LSTCam', 'NectarCam']
+
+# If train_fraction is 1, all the TRAINING dataset will be used to train the
+# model and benchmarking can only be done from the benchmarking notebook
+# TRAINING/benchmarks_DL2_to_classification.ipynb
+Split:
+  train_fraction: 0.8
+  use_same_number_of_sig_and_bkg_for_training: False # Lowest statistics will drive the split
+
+# Optimize the hyper-parameters of the estimator with a grid search
+# If True parameters should be provided as lists (for None use [null])
+# If False the model used will be the one based on the chosen single-valued hyper-parameters
+GridSearchCV:
+  use: False # True or False
+  # if False the following two variables are irrelevant
+  scoring: 'roc_auc'
+  cv: 2
+
+# Definition of the algorithm/method used and its hyper-parameters
+Method:
+  name: 'sklearn.ensemble.RandomForestClassifier' # DO NOT CHANGE
+  target_name: 'label' # defined between 0 and 1 (DO NOT CHANGE)
+  tuned_parameters:
+    # Please, see scikit-learn's API for what each parameter means
+    # WARNING: null (not a string) == 'None'
+    n_estimators: 100 # integer
+    criterion: 'gini' # 'gini' or 'entropy'
+    max_depth: null # null or integer
+    min_samples_split: 2 # integer or float
+    min_samples_leaf: 1 # integer or float
+    min_weight_fraction_leaf: 0.0 # float
+    max_features: 3 # 'auto', 'sqrt', 'log2', integer or float
+    max_leaf_nodes: null # null or integer
+    min_impurity_decrease: 0.0 # float
+    bootstrap: False # True or False
+    oob_score: False # True or False
+    n_jobs: null # null or integer
+    random_state: 0 # null or integer or RandomState
+    verbose: 0 # integer
+    warm_start: False # True or False
+    class_weight: null # 'balanced', 'balanced_subsample', null, dict or list of dicts
+    ccp_alpha: 0.0 # non-negative float
+    max_samples: null # null, integer or float
+  calibrate_output: False # If True calibrate model on test data
+
+# List of the features to use to train the model
+# You can:
+# - comment/uncomment the ones you see here,
+# - add new ones here if they can be evaluated with pandas.DataFrame.eval
+# - if not you can propose modifications to protopipe.mva.utils.prepare_data
+FeatureList:
+  Basic: # single-named, they need to correspond to input data columns
+    - 'h_max' # Height of shower maximum from stereoscopic reconstruction
+    - 'impact_dist' # Impact parameter from stereoscopic reconstruction
+    - 'hillas_width' # Image Width
+    - 'hillas_length' # Image Length
+    # - 'concentration_pixel' # Percentage of photo-electrons in the brightest pixel
+  Derived: # custom evaluations of basic features that will be added to the data
+    # column name : expression to evaluate using basic column names
+    log10_intensity: log10(hillas_intensity)
+    log10_reco_energy: log10(reco_energy) # Averaged-estimated energy of the shower
+    log10_reco_energy_tel: log10(reco_energy_tel) # Estimated energy of the shower per telescope
+
+# These cuts select the input data BEFORE training
+SigFiducialCuts:
+  - 'good_image == 1'
+  - 'is_valid == True'
+  - 'hillas_intensity_reco > 0'
+
+BkgFiducialCuts:
+  - 'good_image == 1'
+  - 'is_valid == True'
+  - 'hillas_intensity_reco > 0'
+
+Diagnostic:
+  # Energy binning (used for reco and true energy)
+  energy:
+    nbins: 4
+    min: 0.0125
+    max: 200
diff --git a/protopipe/aux/example_config_files/RandomForestRegressor.yaml b/protopipe/aux/example_config_files/RandomForestRegressor.yaml
new file mode 100644
index 00000000..069eab29
--- /dev/null
+++ b/protopipe/aux/example_config_files/RandomForestRegressor.yaml
@@ -0,0 +1,83 @@
+General:
+  # [...] = your analysis local full path OUTSIDE the Vagrant box
+  data_dir: '../../data/' # '[...]/data/TRAINING/for_energy_estimation/'
+  data_sig_file: 'TRAINING_energy_tail_gamma_merged.h5'
+  outdir: './' # '[...]/estimators/energy_regressor'
+
+  # List of cameras to use (protopipe-MODEL help output for other options)
+  cam_id_list: ['LSTCam', 'NectarCam']
+
+# If train_fraction is 1, all the TRAINING dataset will be used to train the
+# model and benchmarking can only be done from the benchmarking notebook
+# TRAINING/benchmarks_DL2_to_classification.ipynb
+Split:
+  train_fraction: 0.8
+  use_same_number_of_sig_and_bkg_for_training: False # Lowest statistics will drive the split
+
+# Optimize the hyper-parameters of the estimator with a grid search
+# If True parameters should be provided as lists
+# If False the model used will be the one based on the chosen single-valued hyper-parameters
+GridSearchCV:
+  use: False # True or False
+  # if False the following two variables are irrelevant
+  scoring: 'explained_variance'
+  cv: 2
+
+# Definition of the model algorithm/method used and its hyper-parameters
+Method:
+  name: 'sklearn.ensemble.RandomForestRegressor' # DO NOT CHANGE
+  target_name: 'log10_true_energy'
+  tuned_parameters:
+    # Please, see scikit-learn's API for what each parameter means
+    # NOTE: null == None
+    n_estimators: 50 # integer
+    criterion: "mse" # "mse" or "mae"
+    max_depth: null # null or integer
+    min_samples_split: 5 # integer
+    min_samples_leaf: 5 # integer
+    min_weight_fraction_leaf: 0.0 # float
+    max_features: 3 # {"auto", "sqrt", "log2"}, int or float
+    max_leaf_nodes: null # null or integer
+    min_impurity_decrease: 0.0 # float
+    bootstrap: False # True or False
+    oob_score: False # True or False
+    n_jobs: null # null or integer
+    random_state: 0 # null or integer or RandomState
+    verbose: 0 # integer
+    warm_start: False # True or False
+    ccp_alpha: 0.0 # non-negative float
+    max_samples: null # null, integer or float
+
+# List of the features to use to train the model
+# You can:
+# - comment/uncomment the ones you see here,
+# - add new ones here if they can be evaluated with pandas.DataFrame.eval
+# - if not you can propose modifications to protopipe.mva.utils.prepare_data
+FeatureList:
+  Basic: # single-named, they need to correspond to input data columns
+    - 'h_max' # Height of shower maximum from stereoscopic reconstruction
+    - 'impact_dist' # Impact parameter from stereoscopic reconstruction
+    - 'hillas_width' # Image Width
+    - 'hillas_length' # Image Length
+    # - 'concentration_pixel' # Percentage of photo-electrons in the brightest pixel
+    - 'leakage_intensity_width_1_reco' # fraction of total Intensity which is contained in the outermost pixels of the camera
+  Derived: # custom evaluations of basic features that will be added to the data
+    # column name : expression to evaluate using basic column names
+    log10_WLS: log10(hillas_width*hillas_length/hillas_intensity)
+    log10_intensity: log10(hillas_intensity)
+    r_origin: (sqrt((hillas_x - az)**2 + (hillas_y - alt)**2))**2
+    phi_origin: arctan2(hillas_y - alt, hillas_x - az)
+
+# These cuts select the input data BEFORE training
+SigFiducialCuts:
+  - 'good_image == 1'
+  - 'is_valid == True'
+  - 'hillas_intensity_reco > 0'
+
+# Information used by the benchmarking notebook related to this model
+Diagnostic:
+  # Energy binning (used for reco and true energy)
+  energy:
+    nbins: 15
+    min: 0.0125
+    max: 125
diff --git a/protopipe/aux/example_config_files/classifier.yaml b/protopipe/aux/example_config_files/classifier.yaml
deleted file mode 100644
index 39013a9a..00000000
--- a/protopipe/aux/example_config_files/classifier.yaml
+++ /dev/null
@@ -1,51 +0,0 @@
-General:
-  model_type: 'classifier'
-  # [...] = your analysis local full path OUTSIDE the Vagrant box
-  data_dir: '[...]/shared_folder/analyses/v0.4.0_dev1/data/TRAINING/for_particle_classification/'
-  data_sig_file: 'TRAINING_classification_tail_gamma_merged.h5'
-  data_bkg_file: 'TRAINING_classification_tail_proton_merged.h5'
-  cam_id_list: ['LSTCam', 'NectarCam']
-  table_name_template: '' # leave empty (TO BE REMOVED)
-  outdir: '[...]/shared_folder/analyses/v0.4.0_dev1/estimators/gamma_hadron_classifier'
-
-Split:
-  train_fraction: 0.8
-  use_same_number_of_sig_and_bkg_for_training: False # Lowest statistics will drive the split
-
-Method:
-  name: 'RandomForestClassifier' # AdaBoostClassifier or RandomForestClassifier
-  target_name: 'label'
-  tuned_parameters: # these are lists of values used by the GridSearchCV algorithm
-    n_estimators: [200]
-    max_depth: [10] # null for None
-    max_features: [3] # possible choices are “auto”, “sqrt”, “log2”, int or float
-    min_samples_split: [10]
-    min_samples_leaf: [10]
-  scoring: 'roc_auc' # possible choices are 'roc_auc', 'explained_variance'
-  cv: 2
-  use_proba: True # If not output is score
-  calibrate_output: False # If true calibrate probability
-
-FeatureList:
-  - 'log10_reco_energy'
-  - 'log10_reco_energy_tel'
-  - 'log10_hillas_intensity'
-  - 'hillas_width'
-  - 'hillas_length'
-  - 'h_max'
-  - 'impact_dist'
-
-SigFiducialCuts:
-  - 'good_image == 1'
-  - 'is_valid == True'
-
-BkgFiducialCuts:
-  - 'good_image == 1'
-  - 'is_valid == True'
-
-Diagnostic:
-  # Energy binning (used for reco and true energy)
-  energy:
-    nbins: 4
-    min: 0.02
-    max: 200
diff --git a/protopipe/aux/example_config_files/regressor.yaml b/protopipe/aux/example_config_files/regressor.yaml
deleted file mode 100644
index 392bda52..00000000
--- a/protopipe/aux/example_config_files/regressor.yaml
+++ /dev/null
@@ -1,41 +0,0 @@
-General:
-  model_type: 'regressor'
-  # [...] = your analysis local full path OUTSIDE the Vagrant box
-  data_dir: '[...]/shared_folder/analyses/v0.4.0_dev1/data/TRAINING/for_energy_estimation'
-  data_file: 'TRAINING_energy_tail_gamma_merged.h5'
-  outdir: '[...]/shared_folder/analyses/v0.4.0_dev1/estimators/energy_regressor'
-  cam_id_list: ['LSTCam', 'NectarCam']
-  table_name_template: '' # leave empty (TO BE REMOVED)
-
-Split:
-  train_fraction: 0.8
-
-Method:
-  name: 'AdaBoostRegressor'
-  target_name: 'true_energy'
-  tuned_parameters:
-    learning_rate: [0.3]
-    n_estimators: [100]
-    base_estimator__max_depth: [null] # null is equivalent to None
-    base_estimator__min_samples_split: [2]
-    base_estimator__min_samples_leaf: [10]
-  scoring: 'explained_variance'
-  cv: 2
-
-FeatureList:
-  - 'log10_hillas_intensity'
-  - 'log10_impact_dist'
-  - 'hillas_width_reco'
-  - 'hillas_length_reco'
-  - 'h_max'
-
-SigFiducialCuts:
-  - 'good_image == 1'
-  - 'is_valid == True'
-
-Diagnostic:
-  # Energy binning (used for reco and true energy)
-  energy:
-    nbins: 15
-    min: 0.0125
-    max: 125
diff --git a/protopipe/mva/__init__.py b/protopipe/mva/__init__.py
index fc180b8f..cf9c422c 100644
--- a/protopipe/mva/__init__.py
+++ b/protopipe/mva/__init__.py
@@ -1,6 +1,7 @@
 """
-Classes to buil models based on machine learning methods.
+Classes to build models based on machine learning methods.
 """
 from .train_model import *
 from .diagnostic import *
 from .utils import *
+from .io import *
diff --git a/protopipe/mva/io.py b/protopipe/mva/io.py
new file mode 100644
index 00000000..963d053a
--- /dev/null
+++ b/protopipe/mva/io.py
@@ -0,0 +1,147 @@
+"""Input/output functions for the initialization and storage of models."""
+
+import argparse
+import joblib
+from os import path
+
+from protopipe.mva.utils import save_obj
+
+
+def initialize_script_arguments():
+    """Initialize the parser of protopipe.scripts.build_model.
+
+    Returns
+    -------
+    args : argparse.Namespace
+        Populated argparse namespace.
+ """ + + parser = argparse.ArgumentParser( + description="Build model for regression/classification" + ) + parser.add_argument("--config_file", type=str, required=True) + + parser.add_argument( + "--max_events", + type=int, + default=None, + help="maximum number of events to use", + ) + + mode_group = parser.add_mutually_exclusive_group() + mode_group.add_argument( + "--wave", + dest="mode", + action="store_const", + const="wave", + default="tail", + help="if set, use wavelet cleaning", + ) + mode_group.add_argument( + "--tail", + dest="mode", + action="store_const", + const="tail", + help="if set, use tail cleaning (default), otherwise wavelets", + ) + + # These last CL arguments can overwrite the values from the config + + group = parser.add_mutually_exclusive_group(required=True) + group.add_argument('--cameras_from_config', + action='store_true', + help="Get cameras configuration file (Priority 1)",) + group.add_argument('--cameras_from_file', + action='store_true', + help="Get cameras from input file (Priority 2)",) + group.add_argument('--cam_id_list', + type=str, + default=None, + help="Select cameras like 'LSTCam CHEC' (Priority 3)",) + + parser.add_argument( + "-i", + "--indir", + type=str, + default=None, + help="Directory containing the required input file(s)" + ) + parser.add_argument( + "--infile_signal", + type=str, + default=None, + help="SIGNAL file (default: read from config file)", + ) + parser.add_argument( + "--infile_background", + type=str, + default=None, + help="BACKGROUND file (default: read from config file)", + ) + parser.add_argument("-o", "--outdir", type=str, default=None) + + args = parser.parse_args() + + return args + + +def save_output(models, + cam_id, + factory, + best_model, + model_types, + method_name, + outdir): + """Save model and data used to produce it per camera-type. + + Parameters + ---------- + models: dict + Dictionary of models with camera names as keys. + cam_id: str + Name of the analyzed camera. + factory: protopipe.mva.TrainModel + Wrapper around trained model containing references to train/test samples. + best_model: + Fit of the model from factory. + model_types: dict + Dictionary that maps type of model to method name. + method_name: str + Name of the scikit-learn model. + outdir: str + Path to output directory where to save the trained model and train/test samples. 
+ """ + + models[cam_id] = best_model + model_type = [k for k, v in model_types.items() if method_name in v][0] + outname = "{}_{}_{}.pkl.gz".format( + model_type, cam_id, method_name + ) + joblib.dump(best_model, path.join(outdir, outname)) + + # SAVE DATA + save_obj( + factory.data_scikit, + path.join( + outdir, + "data_scikit_{}_{}_{}.pkl.gz".format( + model_type, method_name, cam_id + ), + ), + ) + factory.data_train.to_pickle( + path.join( + outdir, + "data_train_{}_{}_{}.pkl.gz".format( + model_type, method_name, cam_id + ), + ) + ) + factory.data_test.to_pickle( + path.join( + outdir, + "data_test_{}_{}_{}.pkl.gz".format( + model_type, method_name, cam_id + ), + ) + ) diff --git a/protopipe/mva/train_model.py b/protopipe/mva/train_model.py index ca536dee..fd25c2b0 100644 --- a/protopipe/mva/train_model.py +++ b/protopipe/mva/train_model.py @@ -48,7 +48,7 @@ def split_data( to build a classifier """ - if self.case in "regressor": + if self.case == "regressor": ( X_train, X_test, @@ -65,7 +65,7 @@ def split_data( weight = np.ones(len(self.data_train)) weight_train = weight / sum(weight) - elif self.case in "classifier": + else: ( X_train_sig, X_test_sig, diff --git a/protopipe/mva/utils.py b/protopipe/mva/utils.py index f7171e53..bd20ef93 100644 --- a/protopipe/mva/utils.py +++ b/protopipe/mva/utils.py @@ -18,21 +18,48 @@ def load_obj(name): return pickle.load(f) -def prepare_data(ds, cuts, label=None): - """Add variables in data frame""" - ds["log10_hillas_intensity"] = np.log10( - ds["hillas_intensity_reco"] - ) # THIS SHOULDN'T BE HARDCODED!!! - ds["log10_impact_dist"] = np.log10(ds["impact_dist"]) +def prepare_data(ds, derived_features, cuts, select_data=True, label=None): + """Add custom variables to the input data and optionally select it. + + Parameters + ---------- + ds : pandas.DataFrame + Input data not yet selected. + derived_features: dict + Dictionary of more complex featuresread from the configuration file. + cuts: str + Fiducial cuts from protopipe.mva.utils.make_cut_list + select_data: bool + If True apply cuts to the final dataframe. + label: str + Name of the classifier target label if any. + + Returns + ------- + ds : pandas.DataFrame + Input data integrated with new variables and optionally selected for + the fiducial cuts. + """ + + # This is always useful ds["log10_true_energy"] = np.log10(ds["true_energy"]) - try: # for classification - ds["log10_reco_energy"] = np.log10(ds["reco_energy"]) - ds["log10_reco_energy_tel"] = np.log10(ds["reco_energy_tel"]) + + if label is not None: # only for classification ds["label"] = np.full(len(ds), label) - except: - pass - ds = ds.query(cuts) + # This is needed because our reference analysis uses energy as + # feature for classification + # We should propably support a more elastic choice in the future. 
+        if self.case == "regressor":
             (
                 X_train,
                 X_test,
@@ -65,7 +65,7 @@ def split_data(
             weight = np.ones(len(self.data_train))
             weight_train = weight / sum(weight)
 
-        elif self.case in "classifier":
+        else:
             (
                 X_train_sig,
                 X_test_sig,
diff --git a/protopipe/mva/utils.py b/protopipe/mva/utils.py
index f7171e53..bd20ef93 100644
--- a/protopipe/mva/utils.py
+++ b/protopipe/mva/utils.py
@@ -18,21 +18,48 @@ def load_obj(name):
         return pickle.load(f)
 
 
-def prepare_data(ds, cuts, label=None):
-    """Add variables in data frame"""
-    ds["log10_hillas_intensity"] = np.log10(
-        ds["hillas_intensity_reco"]
-    )  # THIS SHOULDN'T BE HARDCODED!!!
-    ds["log10_impact_dist"] = np.log10(ds["impact_dist"])
+def prepare_data(ds, derived_features, cuts, select_data=True, label=None):
+    """Add custom variables to the input data and optionally select it.
+
+    Parameters
+    ----------
+    ds : pandas.DataFrame
+        Input data not yet selected.
+    derived_features : dict
+        Dictionary of more complex features read from the configuration file.
+    cuts : str
+        Fiducial cuts from protopipe.mva.utils.make_cut_list
+    select_data : bool
+        If True apply cuts to the final dataframe.
+    label : int
+        Value of the classifier target label, if any
+        (1 for signal, 0 for background).
+
+    Returns
+    -------
+    ds : pandas.DataFrame
+        Input data extended with the new variables and optionally filtered by
+        the fiducial cuts.
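+
+    Examples
+    --------
+    Illustrative call for a classification dataset (the column names follow
+    the example configuration files):
+
+    >>> ds = prepare_data(ds,
+    ...                   derived_features={
+    ...                       "log10_reco_energy": "log10(reco_energy)",
+    ...                       "log10_reco_energy_tel": "log10(reco_energy_tel)",
+    ...                   },
+    ...                   cuts="(good_image == 1) and (is_valid == True)",
+    ...                   select_data=True,
+    ...                   label=1)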
+    """
+
+    # This is always useful
     ds["log10_true_energy"] = np.log10(ds["true_energy"])
-    try:  # for classification
-        ds["log10_reco_energy"] = np.log10(ds["reco_energy"])
-        ds["log10_reco_energy_tel"] = np.log10(ds["reco_energy_tel"])
+
+    if label is not None:  # only for classification
         ds["label"] = np.full(len(ds), label)
-    except:
-        pass
-    ds = ds.query(cuts)
+        # This is needed because our reference analysis uses energy as a
+        # feature for classification.
+        # We should probably support a more flexible choice in the future.
+        if not all(i in derived_features for i in ["log10_reco_energy", "log10_reco_energy_tel"]):
+            raise ValueError('log10_reco_energy and log10_reco_energy_tel need to be model features.')
+
+    # Compute derived features and add them to the dataframe
+    for feature_name, feature_expression in derived_features.items():
+        ds.eval(f'{feature_name} = {feature_expression}',
+                inplace=True)
+
+    if select_data:
+        ds = ds.query(cuts)
 
     return ds
diff --git a/protopipe/scripts/build_model.py b/protopipe/scripts/build_model.py
index 49659715..30625625 100755
--- a/protopipe/scripts/build_model.py
+++ b/protopipe/scripts/build_model.py
@@ -1,97 +1,32 @@
 #!/usr/bin/env python
 import os
-import pandas as pd
-import argparse
 from os import path
 
-from sklearn.ensemble import (
-    AdaBoostRegressor,
-    AdaBoostClassifier,
-    RandomForestClassifier,
-)
-from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
-import joblib
+import importlib
+
+import pandas as pd
+
 from sklearn.metrics import classification_report
 from sklearn.calibration import CalibratedClassifierCV
 
 from protopipe.pipeline.utils import load_config, get_camera_names
-
 from protopipe.mva import TrainModel
-from protopipe.mva.utils import make_cut_list, prepare_data, save_obj
+from protopipe.mva.io import initialize_script_arguments, save_output
+from protopipe.mva.utils import (
+    make_cut_list,
+    prepare_data
+)
 
 
 def main():
 
-    # Read arguments
-    parser = argparse.ArgumentParser(
-        description="Build model for regression/classification"
-    )
-    parser.add_argument("--config_file", type=str, required=True)
-    parser.add_argument(
-        "--max_events",
-        type=int,
-        default=None,
-        help="maximum number of events for training",
-    )
-    mode_group = parser.add_mutually_exclusive_group()
-    mode_group.add_argument(
-        "--wave",
-        dest="mode",
-        action="store_const",
-        const="wave",
-        default="tail",
-        help="if set, use wavelet cleaning",
-    )
-    mode_group.add_argument(
-        "--tail",
-        dest="mode",
-        action="store_const",
-        const="tail",
-        help="if set, use tail cleaning, otherwise wavelets",
-    )
-
-    # These last CL arguments can overwrite the values from the config
-
-    group = parser.add_mutually_exclusive_group(required=True)
-    group.add_argument('--cameras_from_config',
-                       action='store_true',
-                       help="Get cameras configuration file (Priority 1)",)
-    group.add_argument('--cameras_from_file',
-                       action='store_true',
-                       help="Get cameras from input file (Priority 2)",)
-    group.add_argument('--cam_id_list',
-                       type=str,
-                       default=None,
-                       help="Select cameras like 'LSTCam CHEC' (Priority 3)",)
-
-    parser.add_argument(
-        "-i",
-        "--indir",
-        type=str,
-        default=None,
-        help="Directory containing the required input file(s)"
-    )
-    parser.add_argument(
-        "--infile_signal",
-        type=str,
-        default=None,
-        help="SIGNAL file (default: read from config file)",
-    )
-    parser.add_argument(
-        "--infile_background",
-        type=str,
-        default=None,
-        help="BACKGROUND file (default: read from config file)",
-    )
-    parser.add_argument("-o", "--outdir", type=str, default=None)
-
-    args = parser.parse_args()
-
-    # Read configuration file
+    # INITIALIZE CLI arguments
+    args = initialize_script_arguments()
+
+    # LOAD CONFIGURATION FILE
     cfg = load_config(args.config_file)
 
-    # Type of model (regressor or classifier)
-    model_type = cfg["General"]["model_type"]
+    # INPUT CONFIGURATION
 
     # Import parameters
     if args.indir is None:
@@ -106,52 +41,92 @@ def main():
     if not os.path.exists(outdir):
         os.makedirs(outdir)
 
-    table_name_template = cfg["General"]["table_name_template"]
-
-    # List of features
-    feature_list = cfg["FeatureList"]
-
-    # Optimisation parameters
-    method_name = cfg["Method"]["name"]
-    tuned_parameters = [cfg["Method"]["tuned_parameters"]]
-    scoring = cfg["Method"]["scoring"]
-    cv = cfg["Method"]["cv"]
+    # Get file containing gammas (signal)
+    if args.infile_signal is None:
+        data_sig_file = cfg["General"]["data_sig_file"].format(args.mode)
+    else:
+        data_sig_file = args.infile_signal
 
-    # Split fraction
-    train_fraction = cfg["Split"]["train_fraction"]
+    filename_sig = path.join(data_dir, data_sig_file)
 
-    if model_type in "regressor":
+    print(f"INPUT SIGNAL FILE PATH= {filename_sig}")
 
-        if args.infile_signal is None:
-            data_file = cfg["General"]["data_file"].format(args.mode)
-        else:
-            data_file = args.infile_signal
+    # Cameras to use
+    if args.cameras_from_config:
+        print("GETTING CAMERAS FROM CONFIGURATION FILE")
+        cam_ids = cfg["General"]["cam_id_list"]
+    elif args.cameras_from_file:
+        print("GETTING CAMERAS FROM SIGNAL TRAINING FILE")
+        # in the same analysis all particle types are analyzed in the
+        # same way so we can just use gammas
+        cam_ids = get_camera_names(filename_sig)
+    else:
+        print("GETTING CAMERAS FROM CLI")
+        cam_ids = args.cam_id_list.split()
 
-        filename = path.join(data_dir, data_file)
+    # The names of the tables inside the HDF5 file are the cameras' names
+    table_name = [cam_id for cam_id in cam_ids]
 
+    # Dataset split train-test fraction
+    train_fraction = cfg["Split"]["train_fraction"]
+
+    # Name of target quantity
+    target_name = cfg["Method"]["target_name"]
+
+    # Get list of features
+    features_basic = cfg["FeatureList"]["Basic"]
+    features_derived = cfg["FeatureList"]["Derived"]
+    feature_list = features_basic + list(features_derived)
+    print("Going to use the following features to train the model:")
+    print(feature_list)
+    # sort features_to_use alphabetically to ensure order
+    # preservation with model.predict in protopipe.scripts
+    feature_list = sorted(feature_list)
+
+    # GridSearchCV
+    use_GridSearchCV = cfg["GridSearchCV"]["use"]
+    scoring = cfg["GridSearchCV"]["scoring"]
+    cv = cfg["GridSearchCV"]["cv"]
+
+    # Hyper-parameters of the main model
+    tuned_parameters = cfg["Method"]["tuned_parameters"]
+
+    # Initialize the model dynamically
+
+    # There is always at least one (main) model to initialize
+    model_to_use = cfg['Method']['name']
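+    # e.g. 'sklearn.ensemble.AdaBoostRegressor' gives
+    # module_name = 'sklearn.ensemble' and class_name = 'AdaBoostRegressor'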
cam_ids = cfg["General"]["cam_id_list"] - elif args.cameras_from_file: - cam_ids = get_camera_names(filename) - else: - cam_ids = args.cam_id_list.split() + # Map model types to the models supported by the script + model_types = {"regressor": ["RandomForestRegressor", + "AdaBoostRegressor"], + "classifier": ["RandomForestClassifier"]} - table_name = [table_name_template + cam_id for cam_id in cam_ids] + if class_name in model_types["regressor"]: - # List of cuts + # Get the selection cuts cuts = make_cut_list(cfg["SigFiducialCuts"]) - init_model = AdaBoostRegressor(DecisionTreeRegressor(max_depth=None)) - - # Name of target - target_name = cfg["Method"]["target_name"] - elif model_type in "classifier": - - # read signal file from either config file or CLI - if args.infile_signal is None: - data_sig_file = cfg["General"]["data_sig_file"].format(args.mode) - else: - data_sig_file = args.infile_signal + elif class_name in model_types["classifier"]: # read background file from either config file or CLI if args.infile_background is None: @@ -159,50 +134,23 @@ def main(): else: data_bkg_file = args.infile_background - filename_sig = path.join(data_dir, data_sig_file) + # filename_sig = path.join(data_dir, data_sig_file) filename_bkg = path.join(data_dir, data_bkg_file) - if args.cameras_from_config: - print("TAKING CAMERAS FROM CONFIG") - cam_ids = cfg["General"]["cam_id_list"] - elif args.cameras_from_file: - print("TAKING CAMERAS FROM TRAINING FILE") - # in the same analysis all particle types are analyzed in the - # same way so we can just use gammas - cam_ids = get_camera_names(filename_sig) - else: - print("TAKING CAMERAS FROM CLI") - cam_ids = args.cam_id_lists.split() - - table_name = [table_name_template + cam_id for cam_id in cam_ids] + # table_name = [table_name_template + cam_id for cam_id in cam_ids] - # List of cuts + # Get the selection cuts sig_cuts = make_cut_list(cfg["SigFiducialCuts"]) bkg_cuts = make_cut_list(cfg["BkgFiducialCuts"]) - # Model - if method_name in "AdaBoostClassifier": - init_model = AdaBoostClassifier(DecisionTreeClassifier(max_depth=4)) - elif method_name in "RandomForestClassifier": - init_model = RandomForestClassifier( - n_estimators=500, - max_depth=None, - min_samples_split=0.05, - max_features="sqrt", - bootstrap=True, - random_state=None, - criterion="gini", - class_weight="balanced_subsample", # Tree-wise re-weighting - ) - - # Name of target - target_name = cfg["Method"]["target_name"] - use_same_number_of_sig_and_bkg_for_training = cfg["Split"][ "use_same_number_of_sig_and_bkg_for_training" ] - print("### Using {} for model construction".format(method_name)) + else: + raise ValueError("ERROR: not a supported model") + + print("### Using {} for model construction".format(model_to_use)) print(f"LIST OF CAMERAS TO USE = {cam_ids}") @@ -211,43 +159,65 @@ def main(): print("### Building model for {}".format(cam_id)) - if model_type in "regressor": + if class_name in model_types["regressor"]: + # Load data - data = pd.read_hdf(filename, table_name[idx], mode="r") - data = prepare_data(ds=data, cuts=cuts)[0:args.max_events] + data_sig = pd.read_hdf(filename_sig, table_name[idx], mode="r") + # Add any derived feature and apply fiducial cuts + data_sig = prepare_data(ds=data_sig, + derived_features=features_derived, + select_data=True, + cuts=cuts) + + if args.max_events: + data_sig = data_sig[0:args.max_events] - print(f"Going to split {len(data)} SIGNAL images...") + print(f"Going to split {len(data_sig)} SIGNAL images...") - # Init model factory + # 
diff --git a/protopipe/scripts/data_training.py b/protopipe/scripts/data_training.py
index f92388b8..66a63c43 100755
--- a/protopipe/scripts/data_training.py
+++ b/protopipe/scripts/data_training.py
@@ -8,6 +8,7 @@
 from glob import glob
 import signal
 import tables as tb
+import pandas as pd
 
 from ctapipe.utils.CutFlow import CutFlow
 from ctapipe.io import EventSource
@@ -47,6 +48,13 @@ def main():
     parser.add_argument(
         "--regressor_dir", type=str, default="./", help="regressors directory"
     )
+    parser.add_argument(
+        "--regressor_config",
+        type=str,
+        default=None,
+        help="Configuration file used to produce regressor model"
+    )
+
     args = parser.parse_args()
 
     # Read configuration file
@@ -104,8 +112,12 @@ def main():
 
     # wrapper for the scikit-learn regressor
     if args.estimate_energy is True:
+
+        # Read configuration file
+        regressor_config = load_config(args.regressor_config)
+
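+        # The "{}" placeholders are filled per camera below, giving
+        # e.g. "regressor_LSTCam_RandomForestRegressor.pkl.gz"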
         regressor_files = (
-            args.regressor_dir + "/regressor_{mode}_{cam_id}_{regressor}.pkl.gz"
+            args.regressor_dir + "/regressor_{cam_id}_{regressor}.pkl.gz"
         )
         reg_file = regressor_files.format(
             **{
@@ -294,17 +306,60 @@ def main():
                     moments = hillas_dict[tel_id]
                     model = regressors[cam_id]
 
-                    features_img = np.array(
-                        [
-                            np.log10(moments.intensity),
-                            np.log10(impact_dict[tel_id].value),
-                            moments.width.value,
-                            moments.length.value,
-                            h_max.value,
-                        ]
-                    )
-
-                    energy_tel[idx] = model.predict([features_img])
+                    ############################################################
+                    # GET FEATURES
+                    ############################################################
+
+                    # Read the feature list from the model configuration file
+                    features_basic = regressor_config["FeatureList"]["Basic"]
+                    features_derived = regressor_config["FeatureList"]["Derived"]
+                    features = features_basic + list(features_derived)
+
+                    # Create a pandas Dataframe with basic quantities
+                    # This is needed in order to connect the I/O system of the
+                    # model inputs to the in-memory computation of this script
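+                    # NOTE: this dictionary has to provide every column that a
+                    # model configuration file can list under "Basic" features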
+                    data = pd.DataFrame({
+                        "hillas_intensity": [moments.intensity],
+                        "hillas_width": [moments.width.to("deg").value],
+                        "hillas_length": [moments.length.to("deg").value],
+                        "hillas_x": [moments.x.to("deg").value],
+                        "hillas_y": [moments.y.to("deg").value],
+                        "hillas_phi": [moments.phi.to("deg").value],
+                        "hillas_r": [moments.r.to("deg").value],
+                        "leakage_intensity_width_1_reco": [leakage_dict[tel_id]['leak1_reco']],
+                        "leakage_intensity_width_2_reco": [leakage_dict[tel_id]['leak2_reco']],
+                        "leakage_intensity_width_1": [leakage_dict[tel_id]['leak1']],
+                        "leakage_intensity_width_2": [leakage_dict[tel_id]['leak2']],
+                        "az": [reco_result.az.to("deg").value],
+                        "alt": [reco_result.alt.to("deg").value],
+                        "h_max": [h_max.value],
+                        "impact_dist": [impact_dict[tel_id].to("m").value],
+                    })
+
+                    # Compute derived features and add them to the dataframe
+                    for key, expression in features_derived.items():
+                        data.eval(f'{key} = {expression}', inplace=True)
+
+                    # sort features_to_use alphabetically to ensure order
+                    # preservation with model.fit in protopipe.mva
+                    features = sorted(features)
+
+                    # Select the values for the full set of features
+                    features_values = data[features].to_numpy()
+
+                    ############################################################
+
+                    energy_tel[idx] = model.predict(features_values)
                     weight_tel[idx] = moments.intensity
 
                     reco_energy_tel[tel_id] = energy_tel[idx]
diff --git a/protopipe/scripts/tests/test_AdaBoostRegressor.yaml b/protopipe/scripts/tests/test_AdaBoostRegressor.yaml
new file mode 100644
index 00000000..2941dbdb
--- /dev/null
+++ b/protopipe/scripts/tests/test_AdaBoostRegressor.yaml
@@ -0,0 +1,85 @@
+General:
+  # [...] = your analysis local full path OUTSIDE the Vagrant box
+  # NOTE: not used here since the testing suite needs to work from the CLI
+  data_dir: '../../data/'
+  data_sig_file: 'TRAINING_energy_tail_gamma_merged.h5'
+  outdir: './'
+
+  # List of cameras to use (you can override this from the CLI)
+  # NOTE: not used here since the testing suite needs to work from the CLI
+  cam_id_list: ['LSTCam', 'NectarCam']
+
+# If train_fraction is 1, all the TRAINING dataset will be used to train the
+# model and benchmarking can only be done from the benchmarking notebook
+# TRAINING/benchmarks_DL2_to_classification.ipynb
+Split:
+  train_fraction: 0.8
+  use_same_number_of_sig_and_bkg_for_training: False # Lowest statistics will drive the split
+
+# Optimize the hyper-parameters of the estimator with a grid search
+# If True parameters should be provided as lists
+# If False the model used will be the one based on the chosen single-valued hyper-parameters
+GridSearchCV:
+  use: False # True or False
+  # if False the following two variables are irrelevant
+  scoring: 'explained_variance'
+  cv: 2
+
+Method:
+  name: 'sklearn.ensemble.AdaBoostRegressor'
+  target_name: 'true_energy'
+  # Please, see scikit-learn's API for what each parameter means
+  # NOTE: null == None
+  base_estimator:
+    name: 'sklearn.tree.DecisionTreeRegressor'
+    parameters:
+      # NOTE: here we set the parameters relevant for sklearn.tree.DecisionTreeRegressor
+      criterion: "mse" # "mse", "friedman_mse", "mae" or "poisson"
+      splitter: "best" # "best" or "random"
+      max_depth: null # null or integer
+      min_samples_split: 2 # integer or float
+      min_samples_leaf: 1 # int or float
+      min_weight_fraction_leaf: 0.0 # float
+      max_features: null # null, "auto", "sqrt", "log2", int or float
+      max_leaf_nodes: null # null or integer
+      min_impurity_decrease: 0.0 # float
+      random_state: 0 # null or integer or RandomState
+      ccp_alpha: 0.0 # non-negative float
+  tuned_parameters:
+    n_estimators: 50
+    learning_rate: 1
+    loss: 'linear' # 'linear', 'square' or 'exponential'
+    random_state: 0 # int, RandomState instance or None
+
+# List of the features to use to train the model
+# You can:
+# - comment/uncomment the ones you see here,
+# - add new ones here if they can be evaluated with pandas.DataFrame.eval
+# - if not you can propose modifications to protopipe.mva.utils.prepare_data
+FeatureList:
+  Basic: # single-named, they need to correspond to input data columns
+    - 'h_max' # Height of shower maximum from stereoscopic reconstruction
+    - 'impact_dist' # Impact parameter from stereoscopic reconstruction
+    - 'hillas_width' # Image Width
+    - 'hillas_length' # Image Length
diff --git a/protopipe/scripts/tests/test_RandomForestClassifier.yaml b/protopipe/scripts/tests/test_RandomForestClassifier.yaml
new file mode 100644
index 00000000..0f406e38
--- /dev/null
+++ b/protopipe/scripts/tests/test_RandomForestClassifier.yaml
@@ -0,0 +1,89 @@
+General:
+  # NOTE: not used here since the testing suite needs to work from the CLI
+  data_dir: '../../data/'
+  data_sig_file: 'TRAINING_classification_tail_gamma_merged.h5'
+  data_bkg_file: 'TRAINING_classification_tail_proton_merged.h5'
+  outdir: './'
+
+  # List of cameras to use (see the protopipe-MODEL help output for other options)
+  # NOTE: not used here since the testing suite needs to work from the CLI
+  cam_id_list: ['LSTCam', 'NectarCam']
+
+# If train_fraction is 1, all the TRAINING dataset will be used to train the
+# model and benchmarking can only be done from the benchmarking notebook
+# TRAINING/benchmarks_DL2_to_classification.ipynb
+Split:
+  train_fraction: 0.8
+  use_same_number_of_sig_and_bkg_for_training: False  # Lowest statistics will drive the split
+
+# Optimize the hyper-parameters of the estimator with a grid search
+# If True parameters should be provided as lists (for None use [null])
+# If False the model used will be the one based on the chosen single-valued hyper-parameters
+GridSearchCV:
+  use: False  # True or False
+  # if False the following two variables are irrelevant
+  scoring: 'roc_auc'
+  cv: 2
+
+# Definition of the algorithm/method used and its hyper-parameters
+Method:
+  name: 'sklearn.ensemble.RandomForestClassifier'  # DO NOT CHANGE
+  target_name: 'label'  # defined between 0 and 1 (DO NOT CHANGE)
+  tuned_parameters:
+    # Please, see scikit-learn's API for what each parameter means
+    # WARNING: null (not a string) == None
+    n_estimators: 100  # integer
+    criterion: 'gini'  # 'gini' or 'entropy'
+    max_depth: null  # null or integer
+    min_samples_split: 2  # integer or float
+    min_samples_leaf: 1  # integer or float
+    min_weight_fraction_leaf: 0.0  # float
+    max_features: 3  # 'auto', 'sqrt', 'log2', integer or float
+    max_leaf_nodes: null  # null or integer
+    min_impurity_decrease: 0.0  # float
+    bootstrap: False  # True or False
+    oob_score: False  # True or False
+    n_jobs: null  # null or integer
+    random_state: 0  # null or integer or RandomState
+    verbose: 0  # integer
+    warm_start: False  # True or False
+    class_weight: null  # 'balanced', 'balanced_subsample', null, dict or list of dicts
+    ccp_alpha: 0.0  # non-negative float
+    max_samples: null  # null, integer or float
+  calibrate_output: False  # If True calibrate model on test data
+
+# List of the features to use to train the model
+# You can:
+# - comment/uncomment the ones you see here,
+# - add new ones here if they can be evaluated with pandas.DataFrame.eval
+# - if not you can propose modifications to protopipe.mva.utils.prepare_data
+FeatureList:
+  Basic:  # single-named, they need to correspond to input data columns
+    - 'h_max'  # Height of shower maximum from stereoscopic reconstruction
+    - 'impact_dist'  # Impact parameter from stereoscopic reconstruction
+    - 'hillas_width'  # Image Width
+    - 'hillas_length'  # Image Length
+    # - 'concentration_pixel'  # Percentage of photo-electrons in the brightest pixel
+  Derived:  # custom evaluations of basic features that will be added to the data
+    # column name : expression to evaluate using basic column names
+    log10_intensity: log10(hillas_intensity)
+    log10_reco_energy: log10(reco_energy)  # Average estimated energy of the shower
+    log10_reco_energy_tel: log10(reco_energy_tel)  # Estimated energy of the shower per telescope
+
+# These cuts select the input data BEFORE training
+SigFiducialCuts:
+  - 'good_image == 1'
+  - 'is_valid == True'
+  - 'hillas_intensity_reco > 0'
+
+BkgFiducialCuts:
+  - 'good_image == 1'
+  - 'is_valid == True'
+  - 'hillas_intensity_reco > 0'
+
+Diagnostic:
+  # Energy binning (used for reco and true energy)
+  energy:
+    nbins: 4
+    min: 0.0125
+    max: 200
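When ``GridSearchCV.use`` is ``True``, the single-valued hyper-parameters above are given as lists instead, and training is delegated to scikit-learn's grid search with the ``scoring`` and ``cv`` values from this section. A sketch of the equivalent scikit-learn call (the parameter grid here is invented for illustration; protopipe builds it from the YAML lists):

.. code-block:: python

    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import GridSearchCV

    search = GridSearchCV(
        RandomForestClassifier(random_state=0),
        param_grid={"n_estimators": [100, 200], "max_features": [3, "sqrt"]},
        scoring="roc_auc",  # the 'scoring' key above
        cv=2,               # the 'cv' key above
    )
    # search.fit(X_train, y_train)
    # search.best_estimator_ is then the model that gets serialized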
diff --git a/protopipe/scripts/tests/test_RandomForestRegressor.yaml b/protopipe/scripts/tests/test_RandomForestRegressor.yaml
new file mode 100644
index 00000000..802aefee
--- /dev/null
+++ b/protopipe/scripts/tests/test_RandomForestRegressor.yaml
@@ -0,0 +1,84 @@
+General:
+  # NOTE: not used here since the testing suite needs to work from the CLI
+  data_dir: './'
+  data_sig_file: 'test_TRAINING_energy_{}_gamma_merged.h5'
+  outdir: './'
+
+  # List of cameras to use (see the protopipe-MODEL help output for other options)
+  # NOTE: not used here since the testing suite needs to work from the CLI
+  cam_id_list: ['LSTCam', 'NectarCam']
+
+# If train_fraction is 1, all the TRAINING dataset will be used to train the
+# model and benchmarking can only be done from the benchmarking notebook
+# TRAINING/benchmarks_DL2_to_classification.ipynb
+Split:
+  train_fraction: 0.8
+  use_same_number_of_sig_and_bkg_for_training: False  # Lowest statistics will drive the split
+
+# Optimize the hyper-parameters of the estimator with a grid search
+# If True parameters should be provided as lists
+# If False the model used will be the one based on the chosen single-valued hyper-parameters
+GridSearchCV:
+  use: False  # True or False
+  # if False the following two variables are irrelevant
+  scoring: 'explained_variance'
+  cv: 2
+
+# Definition of the model algorithm/method used and its hyper-parameters
+Method:
+  name: 'sklearn.ensemble.RandomForestRegressor'  # DO NOT CHANGE
+  target_name: 'log10_true_energy'
+  tuned_parameters:
+    # Please, see scikit-learn's API for what each parameter means
+    # NOTE: null == None
+    n_estimators: 50  # integer
+    criterion: "mse"  # "mse" or "mae"
+    max_depth: null  # null or integer
+    min_samples_split: 5  # integer
+    min_samples_leaf: 5  # integer
+    min_weight_fraction_leaf: 0.0  # float
+    max_features: 3  # "auto", "sqrt", "log2", int or float
+    max_leaf_nodes: null  # null or integer
+    min_impurity_decrease: 0.0  # float
+    bootstrap: False  # True or False
+    oob_score: False  # True or False
+    n_jobs: null  # null or integer
+    random_state: 0  # null or integer or RandomState
+    verbose: 0  # integer
+    warm_start: False  # True or False
+    ccp_alpha: 0.0  # non-negative float
+    max_samples: null  # null, integer or float
+
+# List of the features to use to train the model
+# You can:
+# - comment/uncomment the ones you see here,
+# - add new ones here if they can be evaluated with pandas.DataFrame.eval
+# - if not you can propose modifications to protopipe.mva.utils.prepare_data
+FeatureList:
+  Basic:  # single-named, they need to correspond to input data columns
+    - 'h_max'  # Height of shower maximum from stereoscopic reconstruction
+    - 'impact_dist'  # Impact parameter from stereoscopic reconstruction
+    - 'hillas_width'  # Image Width
+    - 'hillas_length'  # Image Length
+    # - 'concentration_pixel'  # Percentage of photo-electrons in the brightest pixel
+    - 'leakage_intensity_width_1_reco'  # fraction of total Intensity which is contained in the outermost pixels of the camera
+  Derived:  # custom evaluations of basic features that will be added to the data
+    # column name : expression to evaluate using basic column names
+    log10_WLS: log10(hillas_width*hillas_length/hillas_intensity)
+    log10_intensity: log10(hillas_intensity)
+    r_origin: (sqrt((hillas_x - az)**2 + (hillas_y - alt)**2))**2
+    phi_origin: arctan2(hillas_y - alt, hillas_x - az)
+
+# These cuts select the input data BEFORE training
+SigFiducialCuts:
+  - 'good_image == 1'
+  - 'is_valid == True'
+  - 'hillas_intensity_reco > 0'
+
+# Information used by the benchmarking notebook related to this model
+Diagnostic:
+  # Energy binning (used for reco and true energy)
+  energy:
+    nbins: 15
+    min: 0.0125
+    max: 125
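Note that ``target_name`` here is ``log10_true_energy`` rather than ``true_energy``: the forest regresses the energy in log space, so downstream code has to undo the transformation. A toy example of the inversion (TeV units assumed, values invented):

.. code-block:: python

    import numpy as np

    # Stand-in for model.predict(features_values) on three images
    log10_energy_pred = np.array([-0.5, 0.0, 1.2])

    # Back to linear energy units
    energy_pred = 10 ** log10_energy_pred  # ~[0.32, 1.0, 15.8] TeV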
diff --git a/protopipe/scripts/tests/test_classifier.yaml b/protopipe/scripts/tests/test_classifier.yaml
deleted file mode 100644
index e1d54bb5..00000000
--- a/protopipe/scripts/tests/test_classifier.yaml
+++ /dev/null
@@ -1,51 +0,0 @@
-General:
-  model_type: 'classifier'
-  # [...] = your analysis local full path OUTSIDE the Vagrant box
-  data_dir: '[...]/shared_folder/analyses/v0.4.0_dev1/data/TRAINING/for_particle_classification/'
-  data_sig_file: 'TRAINING_classification_{}_gamma_merged.h5'
-  data_bkg_file: 'TRAINING_classification_{}_proton_merged.h5'
-  cam_id_list: ['LSTCam', 'NectarCam']
-  table_name_template: ''  # leave empty (TO BE REMOVED)
-  outdir: '[...]/shared_folder/analyses/v0.4.0_dev1/estimators/gamma_hadron_classifier'
-
-Split:
-  train_fraction: 0.5
-  use_same_number_of_sig_and_bkg_for_training: False  # Lowest statistics will drive the split
-
-Method:
-  name: 'RandomForestClassifier'  # AdaBoostClassifier or RandomForestClassifier
-  target_name: 'label'
-  tuned_parameters:  # these are lists of values used by the GridSearchCV algorithm
-    n_estimators: [200]
-    max_depth: [10]  # null for None
-    max_features: [3]  # possible choices are "auto", "sqrt", "log2", int or float
-    min_samples_split: [10]
-    min_samples_leaf: [10]
-  scoring: 'roc_auc'  # possible choices are 'roc_auc', 'explained_variance'
-  cv: 2
-  use_proba: True  # If not, output is a score
-  calibrate_output: False  # If True calibrate probability
-
-FeatureList:
-  # - 'log10_reco_energy'
-  # - 'log10_reco_energy_tel'
-  - 'log10_hillas_intensity'
-  - 'hillas_width'
-  - 'hillas_length'
-  - 'h_max'
-  - 'impact_dist'
-
-SigFiducialCuts:
-  - 'good_image == 1'
-  - 'is_valid == True'
-
-BkgFiducialCuts:
-  - 'good_image == 1'
-  - 'is_valid == True'
-
-Diagnostic:
-  # Energy binning (used for reco and true energy)
-  energy:
-    nbins: 4
-    min: 0.02
-    max: 200
diff --git a/protopipe/scripts/tests/test_config_analysis_north.yaml b/protopipe/scripts/tests/test_config_analysis_north.yaml
index 2391676b..6bce806d 100644
--- a/protopipe/scripts/tests/test_config_analysis_north.yaml
+++ b/protopipe/scripts/tests/test_config_analysis_north.yaml
@@ -96,7 +96,7 @@ Reconstruction:
 # Parameters for energy estimation
 EnergyRegressor:
   # Name of the regression method (e.g. AdaBoostRegressor, etc.)
-  method_name: 'AdaBoostRegressor'
+  method_name: 'RandomForestRegressor'

 # Parameters for g/h separation
 GammaHadronClassifier:
diff --git a/protopipe/scripts/tests/test_config_analysis_south.yaml b/protopipe/scripts/tests/test_config_analysis_south.yaml
index 3ad4ee3b..09aa981a 100644
--- a/protopipe/scripts/tests/test_config_analysis_south.yaml
+++ b/protopipe/scripts/tests/test_config_analysis_south.yaml
@@ -96,7 +96,7 @@ Reconstruction:
 # Parameters for energy estimation
 EnergyRegressor:
   # Name of the regression method (e.g. AdaBoostRegressor, etc.)
-  method_name: 'AdaBoostRegressor'
+  method_name: 'RandomForestRegressor'

 # Parameters for g/h separation
 GammaHadronClassifier:
diff --git a/protopipe/scripts/tests/test_pipeline.py b/protopipe/scripts/tests/test_pipeline.py
index dc330276..23ff4fe6 100644
--- a/protopipe/scripts/tests/test_pipeline.py
+++ b/protopipe/scripts/tests/test_pipeline.py
@@ -13,6 +13,9 @@
 # CONFIG FILES
 config_prod3b_CTAN = resource_filename("protopipe", "scripts/tests/test_config_analysis_north.yaml")
 config_prod3b_CTAS = resource_filename("protopipe", "scripts/tests/test_config_analysis_south.yaml")
+config_AdaBoostRegressor = resource_filename("protopipe", "scripts/tests/test_AdaBoostRegressor.yaml")
+config_RandomForestRegressor = resource_filename("protopipe", "scripts/tests/test_RandomForestRegressor.yaml")
+config_RandomForestClassifier = resource_filename("protopipe", "scripts/tests/test_RandomForestClassifier.yaml")

 # TEST FILES
@@ -57,7 +60,7 @@ def test_GET_GAMMAS_FOR_ENERGY_MODEL_WITH_IMAGES(test_case, pipeline_testdir):

     print(  # only with "pytest -s"
         f'''
-        /n You can reproduce this test by running the following command,
+        You can reproduce this test by running the following command,

         {command}
         '''
@@ -89,7 +92,7 @@ def test_GET_GAMMAS_FOR_ENERGY_MODEL(test_case, pipeline_testdir):

     print(  # only with "pytest -s"
         f'''
-        /n You can reproduce this test by running the following command,
+        You can reproduce this test by running the following command,

         {command}
         '''
@@ -106,28 +109,56 @@


 @pytest.mark.parametrize("test_case", [
-    pytest.param("PROD3B_CTA_NORTH", marks=pytest.mark.dependency(name="EN",
+    pytest.param("PROD3B_CTA_NORTH", marks=pytest.mark.dependency(name="EN_1",
                                                                   depends=["g1N"])),
-    pytest.param("PROD3B_CTA_SOUTH", marks=pytest.mark.dependency(name="ES",
+    pytest.param("PROD3B_CTA_SOUTH", marks=pytest.mark.dependency(name="ES_1",
                                                                   depends=["g1S"])),
 ])
 def test_BUILD_ENERGY_MODEL_AdaBoost_DecisionTreeRegressor(test_case, pipeline_testdir):
-    """Launch protopipe.scripts.build_model for a AdaBoost DecisionTreeRegressor."""
+    """Launch protopipe.scripts.build_model for an AdaBoostRegressor based on a DecisionTreeRegressor."""

     infile = pipeline_testdir / f"test_gamma1_noImages_{test_case}.h5"
     outdir = pipeline_testdir / f"energy_model_{test_case}"

-    config = resource_filename("protopipe", "scripts/tests/test_regressor.yaml")
-
+    command = f"python {build_model.__file__}\
+    --config_file {config_AdaBoostRegressor}\
+    --infile_signal {infile}\
+    --outdir {outdir}\
+    --cameras_from_file"
+
+    print(  # only with "pytest -s"
+        f'''
+        You can reproduce this test by running the following command,
+
+        {command}
+        '''
+    )
+
+    exit_status = system(command)
+    assert exit_status == 0
+
+
+@pytest.mark.parametrize("test_case", [
+    pytest.param("PROD3B_CTA_NORTH", marks=pytest.mark.dependency(name="EN_2",
+                                                                  depends=["g1N"])),
+    pytest.param("PROD3B_CTA_SOUTH", marks=pytest.mark.dependency(name="ES_2",
+                                                                  depends=["g1S"])),
+])
+def test_BUILD_ENERGY_MODEL_RandomForestRegressor(test_case, pipeline_testdir):
+    """Launch protopipe.scripts.build_model for a RandomForestRegressor."""
+
+    infile = pipeline_testdir / f"test_gamma1_noImages_{test_case}.h5"
+    outdir = pipeline_testdir / f"energy_model_{test_case}"

     command = f"python {build_model.__file__}\
-    --config_file {config}\
+    --config_file {config_RandomForestRegressor}\
     --infile_signal {infile}\
     --outdir {outdir}\
     --cameras_from_file"

     print(  # only with "pytest -s"
         f'''
-        /n You can reproduce this test by running the following command,
+        You can reproduce this test by running the following command,

         {command}
         '''
@@ -139,9 +170,9 @@

 @pytest.mark.parametrize("test_case", [
     pytest.param("PROD3B_CTA_NORTH", marks=pytest.mark.dependency(name="g2N",
-                                                                  depends=["EN"])),
+                                                                  depends=["EN_2"])),
     pytest.param("PROD3B_CTA_SOUTH", marks=pytest.mark.dependency(name="g2S",
-                                                                  depends=["ES"])),
+                                                                  depends=["ES_2"])),
 ])
 def test_GET_GAMMAS_FOR_CLASSIFICATION_MODEL(test_case, pipeline_testdir):
@@ -154,11 +185,12 @@ def test_GET_GAMMAS_FOR_CLASSIFICATION_MODEL(test_case, pipeline_testdir):
     -i {input_data[test_case]['gamma2'].parent}\
     -f {input_data[test_case]['gamma2'].name}\
     --estimate_energy True\
+    --regressor_config {config_RandomForestRegressor}\
     --regressor_dir {modelpath}"

     print(  # only with "pytest -s"
         f'''
-        /n You can reproduce this test by running the following command,
+        You can reproduce this test by running the following command,

         {command}
         '''
@@ -176,9 +208,9 @@

 @pytest.mark.parametrize("test_case", [
     pytest.param("PROD3B_CTA_NORTH", marks=pytest.mark.dependency(name="p1N",
-                                                                  depends=["EN"])),
+                                                                  depends=["EN_2"])),
     pytest.param("PROD3B_CTA_SOUTH", marks=pytest.mark.dependency(name="p1S",
-                                                                  depends=["ES"])),
+                                                                  depends=["ES_2"])),
 ])
 def test_GET_PROTONS_FOR_CLASSIFICATION_MODEL(test_case, pipeline_testdir):
@@ -191,11 +223,12 @@ def test_GET_PROTONS_FOR_CLASSIFICATION_MODEL(test_case, pipeline_testdir):
     -i {input_data[test_case]['proton1'].parent}\
     -f {input_data[test_case]['proton1'].name}\
     --estimate_energy True\
+    --regressor_config {config_RandomForestRegressor}\
     --regressor_dir {modelpath}"

     print(  # only with "pytest -s"
         f'''
-        /n You can reproduce this test by running the following command,
+        You can reproduce this test by running the following command,

         {command}
         '''
@@ -217,17 +250,15 @@
     pytest.param("PROD3B_CTA_SOUTH", marks=pytest.mark.dependency(name="C2",
                                                                   depends=["g2S", "p1S"])),
 ])
-def test_BUILD_CLASSIFICATION_MODEL_RandomForest(test_case, pipeline_testdir):
+def test_BUILD_CLASSIFICATION_MODEL_RandomForestClassifier(test_case, pipeline_testdir):
     """Launch protopipe.scripts.build_model for a Random Forest classifier."""

     infile_signal = pipeline_testdir / f"test_gamma2_noImages_{test_case}.h5"
     infile_background = pipeline_testdir / f"test_proton1_noImages_{test_case}.h5"
     outdir = pipeline_testdir / f"classification_model_{test_case}"

-    config = resource_filename("protopipe", "scripts/tests/test_classifier.yaml")
-
     command = f"python {build_model.__file__}\
-    --config_file {config}\
+    --config_file {config_RandomForestClassifier}\
     --infile_signal {infile_signal}\
     --infile_background {infile_background}\
     --outdir {outdir}\
@@ -235,7 +266,7 @@ def test_BUILD_CLASSIFICATION_MODEL_RandomForestClassifier(test_case, pipeline_testdir):

     print(  # only with "pytest -s"
         f'''
-        /n You can reproduce this test by running the following command,
+        You can reproduce this test by running the following command,

         {command}
         '''
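The renamed markers (``EN_1``/``EN_2``, ``ES_1``/``ES_2``) keep the dependency chain unambiguous now that two energy models are built from the same training files. The chaining itself comes from the ``pytest-dependency`` plugin; a minimal sketch of the pattern (test names invented for illustration):

.. code-block:: python

    import pytest

    @pytest.mark.dependency(name="g1N")
    def test_get_gammas_north():
        assert True  # produce the training data

    # Skipped automatically if test_get_gammas_north failed or was skipped
    @pytest.mark.dependency(name="EN_2", depends=["g1N"])
    def test_build_energy_model_north():
        assert True  # build the model from that data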
diff --git a/protopipe/scripts/tests/test_regressor.yaml b/protopipe/scripts/tests/test_regressor.yaml
deleted file mode 100644
index 9041fe5e..00000000
--- a/protopipe/scripts/tests/test_regressor.yaml
+++ /dev/null
@@ -1,41 +0,0 @@
-General:
-  model_type: 'regressor'
-  # [...] = your analysis local full path OUTSIDE the Vagrant box
-  data_dir: './'
-  data_file: 'test_TRAINING_energy_{}_gamma_merged.h5'
-  outdir: './'
-  cam_id_list: ['LSTCam', 'NectarCam']
-  table_name_template: ''  # leave empty (TO BE REMOVED)
-
-Split:
-  train_fraction: 0.5
-
-Method:
-  name: 'AdaBoostRegressor'
-  target_name: 'true_energy'
-  tuned_parameters:
-    learning_rate: [0.3]
-    n_estimators: [100]
-    base_estimator__max_depth: [null]  # null is equivalent to None
-    base_estimator__min_samples_split: [2]
-    base_estimator__min_samples_leaf: [10]
-  scoring: 'explained_variance'
-  cv: 2
-
-FeatureList:
-  - 'log10_hillas_intensity'
-  - 'log10_impact_dist'
-  - 'hillas_width_reco'
-  - 'hillas_length_reco'
-  - 'h_max'
-
-SigFiducialCuts:
-  - 'good_image == 1'
-  - 'is_valid == True'
-
-Diagnostic:
-  # Energy binning (used for reco and true energy)
-  energy:
-    nbins: 15
-    min: 0.0125
-    max: 125
diff --git a/protopipe/scripts/write_dl2.py b/protopipe/scripts/write_dl2.py
index dbb94da0..f30beacb 100755
--- a/protopipe/scripts/write_dl2.py
+++ b/protopipe/scripts/write_dl2.py
@@ -2,6 +2,7 @@
 from sys import exit

 import numpy as np
+import pandas as pd
 from glob import glob
 import signal
 from astropy.coordinates.angle_utilities import angular_separation
@@ -47,6 +48,20 @@ def main():
         action="store_true",
         help="Save images in images.h5 (one file testing)",
     )
+
+    parser.add_argument(
+        "--regressor_config",
+        type=str,
+        default=None,
+        help="Configuration file used to produce regressor model"
+    )
+    parser.add_argument(
+        "--classifier_config",
+        type=str,
+        default=None,
+        help="Configuration file used to produce classification model"
+    )
+
     args = parser.parse_args()

     # Read configuration file
@@ -133,8 +148,12 @@ def main():

     # Classifiers
     if use_classifier:
+
+        # Read configuration file
+        classifier_config = load_config(args.classifier_config)
+
         classifier_files = (
-            args.classifier_dir + "/classifier_{mode}_{cam_id}_{classifier}.pkl.gz"
+            args.classifier_dir + "/classifier_{cam_id}_{classifier}.pkl.gz"
         )
         clf_file = classifier_files.format(
             **{
@@ -156,8 +175,12 @@ def main():

     # Regressors
     if use_regressor:
+
+        # Read configuration file
+        regressor_config = load_config(args.regressor_config)
+
         regressor_files = (
-            args.regressor_dir + "/regressor_{mode}_{cam_id}_{regressor}.pkl.gz"
+            args.regressor_dir + "/regressor_{cam_id}_{regressor}.pkl.gz"
        )
         reg_file = regressor_files.format(
             **{
@@ -271,13 +294,10 @@ class RecoEvent(tb.IsDescription):
         source, save_images=args.save_images, debug=args.debug
     ):

-        # True energy
-        true_energy = event.simulation.shower.energy.value
-
         # True direction
         true_az = event.simulation.shower.az
         true_alt = event.simulation.shower.alt
-
+
         # Array pointing in AltAz frame
         pointing_az = event.pointing.array_azimuth
         pointing_alt = event.pointing.array_altitude
@@ -339,21 +359,55 @@ class RecoEvent(tb.IsDescription):
                 cam_id = source.subarray.tel[tel_id].camera.camera_name
                 moments = hillas_dict[tel_id]
+                model = regressors[cam_id]

-                # Features to be fed in the regressor
-                features_img = np.array(
-                    [
-                        np.log10(moments.intensity),
-                        np.log10(impact_dict[tel_id].value),
-                        moments.width.value,
-                        moments.length.value,
-                        h_max.value,
-                    ]
-                )
+                ############################################################
+                # GET FEATURES
+                ############################################################
+
+                # Read feature list from model configuration file
+                features_basic = regressor_config["FeatureList"]["Basic"]
+                features_derived = regressor_config["FeatureList"]["Derived"]
+                features = features_basic + list(features_derived)
+
+                # Create a pandas DataFrame with basic quantities
+                # This is needed in order to connect the I/O system of the
+                # model inputs to the in-memory computation of this script
+                data = pd.DataFrame({
+                    "hillas_intensity": [moments.intensity],
+                    "hillas_width": [moments.width.to("deg").value],
+                    "hillas_length": [moments.length.to("deg").value],
+                    "hillas_x": [moments.x.to("deg").value],
+                    "hillas_y": [moments.y.to("deg").value],
+                    "hillas_phi": [moments.phi.to("deg").value],
+                    "hillas_r": [moments.r.to("deg").value],
+                    "leakage_intensity_width_1_reco": [leakage_dict[tel_id]['leak1_reco']],
+                    "leakage_intensity_width_2_reco": [leakage_dict[tel_id]['leak2_reco']],
+                    "leakage_intensity_width_1": [leakage_dict[tel_id]['leak1']],
+                    "leakage_intensity_width_2": [leakage_dict[tel_id]['leak2']],
+                    "az": [reco_result.az.to("deg").value],
+                    "alt": [reco_result.alt.to("deg").value],
+                    "h_max": [h_max.value],
+                    "impact_dist": [impact_dict[tel_id].to("m").value],
+                })
+
+                # Compute derived features and add them to the dataframe
+                for key, expression in features_derived.items():
+                    if key not in data:
+                        data.eval(f'{key} = {expression}', inplace=True)
+
+                # Sort features alphabetically to ensure order
+                # preservation with model.fit in protopipe.mva
+                features = sorted(features)
+
+                # Select the values for the full set of features
+                features_values = data[features].to_numpy()
+
+                ############################################################

                 if good_for_reco[tel_id] == 1:
-                    energy_tel[idx] = model.predict([features_img])
+                    energy_tel[idx] = model.predict(features_values)
                 else:
                     energy_tel[idx] = np.nan
@@ -389,24 +443,57 @@ class RecoEvent(tb.IsDescription):
             weight_tel = np.zeros(len(hillas_dict.keys()))

             for idx, tel_id in enumerate(hillas_dict.keys()):
+
                 cam_id = source.subarray.tel[tel_id].camera.camera_name
                 moments = hillas_dict[tel_id]
+
                 model = classifiers[cam_id]

-                # Features to be fed in the classifier
-                # this should be read in some way from
-                # the classifier configuration file!!!!!
-
-                features_img = np.array(
-                    [
-                        np.log10(reco_energy),
-                        np.log10(energy_tel_classifier[tel_id]),
-                        np.log10(moments.intensity),
-                        moments.width.value,
-                        moments.length.value,
-                        h_max.value,
-                        impact_dict[tel_id].value,
-                    ]
-                )
+                ############################################################
+                # GET FEATURES
+                ############################################################
+
+                # Read feature list from model configuration file
+                features_basic = classifier_config["FeatureList"]["Basic"]
+                features_derived = classifier_config["FeatureList"]["Derived"]
+                features = features_basic + list(features_derived)
+
+                # Create a pandas DataFrame with basic quantities
+                # This is needed in order to connect the I/O system of the
+                # model inputs to the in-memory computation of this script
+                data = pd.DataFrame({
+                    "hillas_intensity": [moments.intensity],
+                    "hillas_width": [moments.width.to("deg").value],
+                    "hillas_length": [moments.length.to("deg").value],
+                    "hillas_x": [moments.x.to("deg").value],
+                    "hillas_y": [moments.y.to("deg").value],
+                    "hillas_phi": [moments.phi.to("deg").value],
+                    "hillas_r": [moments.r.to("deg").value],
+                    "leakage_intensity_width_1_reco": [leakage_dict[tel_id]['leak1_reco']],
+                    "leakage_intensity_width_2_reco": [leakage_dict[tel_id]['leak2_reco']],
+                    "leakage_intensity_width_1": [leakage_dict[tel_id]['leak1']],
+                    "leakage_intensity_width_2": [leakage_dict[tel_id]['leak2']],
+                    "az": [reco_result.az.to("deg").value],
+                    "alt": [reco_result.alt.to("deg").value],
+                    "h_max": [h_max.value],
+                    "impact_dist": [impact_dict[tel_id].to("m").value],
+                    "reco_energy": reco_energy,
+                    "reco_energy_tel": energy_tel_classifier[tel_id],
+                })
+
+                # Compute derived features and add them to the dataframe
+                for key, expression in features_derived.items():
+                    if key not in data:
+                        data.eval(f'{key} = {expression}', inplace=True)
+
+                # Sort features alphabetically to ensure order
+                # preservation with model.fit in protopipe.mva
+                features = sorted(features)
+
+                # Select the values for the full set of features
+                features_values = data[features].to_numpy()
+
+                ############################################################

                 # Here we check for valid telescope-wise energies
                 # Because it means that it's a good image
@@ -415,11 +502,9 @@ class RecoEvent(tb.IsDescription):
                 if not np.isnan(energy_tel_classifier[tel_id]):
                     # Output of classifier according to type of classifier
                     if use_proba_for_classifier is False:
-                        score_tel[idx] = model.decision_function([features_img])
+                        score_tel[idx] = model.decision_function(features_values)
                     else:
-                        gammaness_tel[idx] = model.predict_proba([features_img])[
-                            :, 1
-                        ]
+                        gammaness_tel[idx] = model.predict_proba(features_values)[:, 1]
                     weight_tel[idx] = np.sqrt(moments.intensity)
                 else:
                     # WARNING:
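For the classifier output, ``features_values`` is passed either to ``decision_function`` (score mode) or to ``predict_proba`` (gammaness mode); the gammaness is the predicted probability of the signal class, i.e. column 1. A self-contained toy example of the gammaness path (synthetic data; note that ``RandomForestClassifier`` itself exposes no ``decision_function``, so the score branch applies only to margin-based classifiers):

.. code-block:: python

    import numpy as np
    from sklearn.ensemble import RandomForestClassifier

    rng = np.random.RandomState(0)
    X = rng.normal(size=(100, 4))  # 100 events, 4 features
    y = (X[:, 0] > 0).astype(int)  # 1 == "gamma" in this toy setup

    clf = RandomForestClassifier(n_estimators=10, random_state=0).fit(X, y)

    # Probability of the signal class for one event: the 'gammaness'
    gammaness = clf.predict_proba(X[:1])[:, 1]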
@@ -481,7 +566,7 @@ class RecoEvent(tb.IsDescription):
             for idx, tel_id in enumerate(hillas_dict.keys()):
                 cam_id = source.subarray.tel[tel_id].camera.camera_name
                 if cam_id not in images_phe:
-
+
                     n_pixels = source.subarray.tel[tel_id].camera.geometry.n_pixels
                     StoredImages["true_image"] = tb.Float32Col(
                         shape=(n_pixels), pos=2
@@ -495,7 +580,7 @@ class RecoEvent(tb.IsDescription):
                     StoredImages["cleaning_mask_clusters"] = tb.BoolCol(
                         shape=(n_pixels), pos=5
                     )  # not in ctapipe
-
+
                     images_table[cam_id] = images_outfile.create_table(
                         "/", "_".join(["images", cam_id]), StoredImages
                     )