Skip to content

Commit

Permalink
Bump v0.2.0
Browse files Browse the repository at this point in the history
  • Loading branch information
FloBay committed Oct 2, 2023
1 parent 133ef22 commit 0e2c3b2
Show file tree
Hide file tree
Showing 15 changed files with 1,178 additions and 1,116 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
v0.2.0
- Support multiple controls to calculate more accurate ratios
- Improved stability of dashboards

v0.1.1
- Switch to pip installation
- Minor improvements and fixes
Expand Down
30 changes: 17 additions & 13 deletions README.md

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion curve_curator/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,4 @@
# Florian P. Bayer - 2023
#

__version__ = '0.1.1'
__version__ = '0.2.0'
19 changes: 11 additions & 8 deletions curve_curator/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,7 @@
from . import quality_control
from .__init__ import __version__


def main():
if __name__ == '__main__':
# Build a command line parser for parsing multiple config files
command_line = argparse.ArgumentParser(
description='CurveCurator',
Expand Down Expand Up @@ -52,7 +51,7 @@ def main():
dest="path",
metavar="<PATH>",
type=str,
help="Relative path to the config.toml or batch.txt file to run the pipeline.")
help="Relative path to the config.toml file to run the pipeline.")

# Parse the terminal arguments
args = command_line.parse_args()
Expand All @@ -77,9 +76,17 @@ def main():
ui.setup_logger(Path(tf).parent, name=i)
ui.message(f' * Executing CurveCurator pipeline version {__version__}.')

# Load config
# Make a counter in batch mode only for the terminal
if args.batch:
ui.message(f' * Processing {i+1} of {len(toml_files)} data sets.', terminal_only=True)

# Check the input file is a toml file
if not ui.is_toml_file(tf):
ui.error(f' * The given file is not a TOML parameter file !\n * If it\'s a batch file make sure you activate the batch mode with --batch.')
ui.doneline()
continue

# Load config
config = ui.load_toml(tf, random_mode=bool(args.random))
config = ui.set_default_values(config)

Expand Down Expand Up @@ -112,7 +119,3 @@ def main():

# Done
ui.doneline()


if __name__ == '__main__':
main()
15 changes: 8 additions & 7 deletions curve_curator/dashboard.py
Original file line number Diff line number Diff line change
Expand Up @@ -678,11 +678,11 @@ def dashboard(df, title, out_path, drug_doses, drug_unit, cols_ratio, model, f_s
fig1.add_tools(dots_hover_tool)

# Add thresholds and potency line. Visibility depends on the used approach
volcano_threshold_line_v0 = fig1.line(x='x', y='y', line_width=2, source=threshold_v0, color='red', line_dash='dashed')
volcano_threshold_line_v0 = fig1.line(x='x', y='y', line_width=1.5, source=threshold_v0, color='crimson', line_dash='solid')
volcano_threshold_line_v0.visible = True
volcano_threshold_line_v1 = fig1.line(x='x', y='y', line_width=2, source=threshold_v1, color='red', line_dash='dashed')
volcano_threshold_line_v1 = fig1.line(x='x', y='y', line_width=1.5, source=threshold_v1, color='crimson', line_dash='solid')
volcano_threshold_line_v1.visible = False
potency_threshold_line_p = fig1.line(x='x', y='y', line_width=2, source=threshold_p, color='red', line_dash='dashed')
potency_threshold_line_p = fig1.line(x='x', y='y', line_width=1.5, source=threshold_p, color='crimson', line_dash='solid')
potency_threshold_line_p.visible = volcano_params['method'] != 'sam'

# Add color bar
Expand Down Expand Up @@ -716,7 +716,7 @@ def dashboard(df, title, out_path, drug_doses, drug_unit, cols_ratio, model, f_s
names=len(drug_doses) * ['example']))

# Plot the Curve plot with fit line and scatter points
fit_line = fig2.multi_line(xs='xs', ys='ys', color="red", line_width=5, alpha=0.6, source=curve_fit_source)
fit_line = fig2.multi_line(xs='xs', ys='ys', color="crimson", line_width=5, alpha=0.6, source=curve_fit_source)
curve_dots = fig2.circle(x='x', y='y', fill_color='black', fill_alpha=1, source=curve_dots_source, size=7, line_color='black')

# Add hover tooltips labels to figure 2 for fitted lines and curve dots
Expand Down Expand Up @@ -752,7 +752,7 @@ def dashboard(df, title, out_path, drug_doses, drug_unit, cols_ratio, model, f_s

# Plot the data distribution
hist_boxes_3 = fig3.quad(top=edges[:-1], bottom=edges[1:], left=0, right=hist, fill_color="gray", line_color="white", alpha=1)
quality_lines = fig3.multi_line(xs='xs', ys='ys', color="red", line_width=3, alpha=1, source=quality_source)
quality_lines = fig3.multi_line(xs='xs', ys='ys', color="crimson", line_width=2.5, alpha=1, source=quality_source)
threshold1_line = fig3.line(x='x', y='y', color="black", line_width=3, alpha=1, source=signal_threshold1_source, line_dash='dashed')
threshold2_line = fig3.line(x='x', y='y', color="black", line_width=3, alpha=1, source=signal_threshold2_source, line_dash='dashed')

Expand Down Expand Up @@ -792,7 +792,7 @@ def dashboard(df, title, out_path, drug_doses, drug_unit, cols_ratio, model, f_s

# Plot the data distribution
hist_boxes_4 = fig4.quad(top=edges[:-1], bottom=edges[1:], left=0, right=hist, fill_color="gray", line_color="white", alpha=1)
identification_lines = fig4.multi_line(xs='xs', ys='ys', color="red", line_width=3, alpha=1, source=identification_source)
identification_lines = fig4.multi_line(xs='xs', ys='ys', color="crimson", line_width=2.5, alpha=1, source=identification_source)
threshold_line1 = fig4.line(x='x', y='y', color="black", line_width=3, alpha=1, source=score_threshold1_source, line_dash='dashed')
threshold_line2 = fig4.line(x='x', y='y', color="black", line_width=3, alpha=1, source=score_threshold2_source, line_dash='dashed')

Expand Down Expand Up @@ -826,6 +826,7 @@ def dashboard(df, title, out_path, drug_doses, drug_unit, cols_ratio, model, f_s
potency_bins = int(abs(potency_range[1] - potency_range[0]) // 0.1)
if len(potency_array) > 0:
hist, edges = np.histogram(potency_array, density=True, bins=np.linspace(potency_range[0], potency_range[1], potency_bins))
hist = hist / max(hist)
else:
# If not a single significant curve is present, fall back to dummy values so that a nice plot is still drawn with an empty background
hist, edges = [0, 1], [-2, -1, 0]
Expand All @@ -835,7 +836,7 @@ def dashboard(df, title, out_path, drug_doses, drug_unit, cols_ratio, model, f_s

# Plot the data distribution
hist_boxes_5 = fig5.quad(top=edges[:-1], bottom=edges[1:], left=0, right=hist, fill_color="gray", line_color="white", alpha=1)
potency_lines = fig5.multi_line(xs='xs', ys='ys', color="red", line_width=3, alpha=1, source=potency_source)
potency_lines = fig5.multi_line(xs='xs', ys='ys', color="crimson", line_width=2.5, alpha=1, source=potency_source)
threshold_line1 = fig5.line(x='x', y='y', color="black", line_width=3, alpha=1, source=potency_threshold1_source, line_dash='dashed')
threshold_line2 = fig5.line(x='x', y='y', color="black", line_width=3, alpha=1, source=potency_threshold2_source, line_dash='dashed')

Expand Down
35 changes: 21 additions & 14 deletions curve_curator/quantification.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,8 @@ def get_imputation_value(df, col, pct=0.005):
----------
df : pd.DataFrame
A data frame containing at least col name.
col : str
column name in the df with values from which a good imputation value is drawn.
col : array-like of str
An array-like object of column name(s) in the df with values from which a good imputation value is drawn.
pct : float, optional
Percentile threshold which is used to find a good value for imputation. By default 0.005.
Expand All @@ -59,7 +59,7 @@ def get_imputation_value(df, col, pct=0.005):
value : float
imputation value.
"""
value = df[col].replace(0, np.nan).dropna().quantile(pct)
value = df[col].mean(axis=1).replace(0, np.nan).dropna().quantile(pct)
return value


Expand Down Expand Up @@ -132,10 +132,11 @@ def normalize_values(df, raw_cols, norm_cols, ref_col=None):
return df, normalization_factors


def add_ratios(df, cols, ratio_cols, ref_col):
def add_ratios(df, cols, ratio_cols, ref_cols):
"""
Calculate ratios of cols / ref_col.
The ratio values will be added to the df under the name of ratio_cols.
In case of multiple columns the mean of ref_cols is used to calculate ratios.
Parameters
----------
Expand All @@ -145,15 +146,15 @@ def add_ratios(df, cols, ratio_cols, ref_col):
A array-like object containing the column labels of the data.
ratio_cols : array-like
A array-like object containing the column labels of the future ratio data.
ref_col : string
A string indicating a column used as a reference for ratio calculations.
ref_cols : array-like
An array-like object of strings indicating one or multiple column(s) used as a reference for ratio calculations.
Returns
-------
df : pd.DataFrame
The result data frame with the added ratio_cols.
"""
df[ratio_cols] = df[cols].div(df[ref_col], axis=0).replace([np.inf], np.nan)
df[ratio_cols] = df[cols].div(df[ref_cols].mean(axis=1), axis=0).replace([np.inf], np.nan)
return df


Expand Down Expand Up @@ -213,7 +214,7 @@ def fit_model(y_data, x_data, M0, M1, fit_params, f_statistic_params):
M0 : MeanModel object
An MeanModel instance from curve_curator.models.
M1 : LogisticModel object
An LogisitcModel instance from curve_curator.models.
An LogisticModel instance from curve_curator.models.
fit_params : dict
parameter dictionary which adjust the specific fitting procedures. Must contain at least the fit speed and fit type.
f_statistic_params : dict
Expand Down Expand Up @@ -389,17 +390,19 @@ def run_pipeline(df, config, decoy_mode=False):
"""
# Load parameters from toml file
experiments = np.array(config['Experiment']['experiments'])
control_experiments = np.array(config['Experiment']['control_experiment'])
drug_concs = np.array(config['Experiment']['doses'])
drug_scale = config['Experiment']['dose_scale']
control_mask = (drug_concs != 0)
drug_scale = float(config['Experiment']['dose_scale'])
control_mask = (drug_concs != 0.0)
drug_log_concs = tool.build_drug_log_concentrations(drug_concs[control_mask], drug_scale)

# build the new column names based on experiment numbers
cols_raw = tool.build_col_names('Raw {}', experiments)
col_raw_control = tool.build_col_names('Raw {}', control_experiments) #f"Raw {config['Experiment']['control_experiment']}"
cols_normal = tool.build_col_names('Normalized {}', experiments)
col_normal_control = tool.build_col_names('Normalized {}', control_experiments) #f"Normalized {config['Experiment']['control_experiment']}"
cols_ratio = tool.build_col_names('Ratio {}', experiments)
col_raw_control = f"Raw {config['Experiment']['control_experiment']}"
col_normal_control = f"Normalized {config['Experiment']['control_experiment']}"
col_ratio_control = tool.build_col_names('Ratio {}', control_experiments)

# Setup the curve fit with default values unless specified in the toml file
proc_params = config['Processing']
Expand Down Expand Up @@ -434,8 +437,12 @@ def run_pipeline(df, config, decoy_mode=False):
else:
df = add_ratios(df, cols_raw, cols_ratio, col_raw_control)

# Signal Quality is the raw intensity in the control
df['Signal Quality'] = np.log2(df[col_raw_control])
# If multiple controls are provided, estimate the noise level in the controls alone
if len(col_raw_control) > 1:
df['Control Ratio Std'] = df[col_ratio_control].std(axis=1)

# Absolute signal quality is the raw intensity of the control(s)
df['Signal Quality'] = np.log2(df[col_raw_control].mean(axis=1))

# Sort concentrations and observations from low to high dose
sorted_doses = np.argsort(drug_log_concs)
Expand Down
28 changes: 20 additions & 8 deletions curve_curator/user_interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,14 +128,21 @@ def error(msg, end='\n\n\n\n'):
"""
prints an error message to the terminal and logging file
"""
error_line = "\n" + 27 * '#' + ' ERROR ' + 27 * '#' + "\n\n"
msg = f'{TerminalFormatting.WARNING}{error_line}{msg}{TerminalFormatting.ENDC}'
error_line = "\n" + 32 * '#' + ' ERROR ' + 31 * '#' + "\n\n"
msg = f'{TerminalFormatting.FAIL}{error_line}{msg}{TerminalFormatting.ENDC}'
if LOGGER:
LOGGER.error(msg)
else:
print(msg, end=end)


def is_toml_file(file_path):
    """
    Checks if the file_path leads to a toml file.

    Parameters
    ----------
    file_path : str
        Path to the file that should be checked.

    Returns
    -------
    bool
        True if the file extension is exactly '.toml' (case-insensitive), else False.
    """
    # splitext already isolates the extension, so compare it for equality;
    # the previous endswith('.toml') check wrongly accepted e.g. 'file.xtoml'.
    return os.path.splitext(file_path)[-1].lower() == '.toml'


def check_path(path, is_dir=False):
"""
check_path(path, is_dir=False)
Expand Down Expand Up @@ -197,9 +204,9 @@ def check_toml_params(config):
#
# ['Experiment']
#
experiments = config['Experiment']['experiments']
control_experiment = config['Experiment']['control_experiment']
doses = config['Experiment']['doses']
experiments = np.array(config['Experiment']['experiments'])
control_experiment = np.array([config['Experiment']['control_experiment']]).flatten()
doses = np.array(config['Experiment']['doses'])
dose_scale = config['Experiment']['dose_scale']
dose_unit = config['Experiment']['dose_unit']

Expand All @@ -209,8 +216,8 @@ def check_toml_params(config):
if len(experiments) != len(doses):
error("Error: [Experiment] 'experiments' and [Experiment] 'doses' do no correspond in length.")
raise ValueError("[Experiment] 'experiments' & 'doses' length")
if control_experiment not in experiments:
error("Error: [Experiment] 'control_experiment' is not in [Experiment] 'experiments'.")
if len(set(control_experiment) - set(experiments)) > 0:
error("Error: [Experiment] at least one 'control_experiment' is not in [Experiment] 'experiments'.")
raise ValueError("[Experiment] 'experiments'")
if not dose_scale:
error("Error: [Experiment] 'dose_scale' is empty.")
Expand Down Expand Up @@ -303,10 +310,15 @@ def set_default_values(config):
"""
Sets default values for optional parameters of the pipeline when the user didn't specify it.
"""
experiments = config['Experiment']['experiments']
experiments = np.array(config['Experiment']['experiments']).flatten()
control_experiments = np.array([config['Experiment']['control_experiment']]).flatten()
doses = np.array([config['Experiment']['doses']]).flatten()

# Experiment
exp_params = config['Experiment']
exp_params['experiments'] = experiments
exp_params['control_experiment'] = control_experiments
exp_params['doses'] = doses
exp_params['dose_scale'] = float(exp_params.get('dose_scale', 1e0))
config['Experiment'] = exp_params

Expand Down
23 changes: 12 additions & 11 deletions example_datasets/decryptM_Dasatinib/curveCurator.log
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
2023-08-04 14:33:03,588 - INFO - * Executing CurveCurator pipeline version 0.0.6.
2023-08-04 14:33:03,588 - INFO - * Reading parameter file of experiment.
2023-08-04 14:33:03,603 - INFO - * Loading data file ./evidence.txt.
2023-08-04 14:33:06,277 - INFO - * The following normalization factors were applied:
2023-08-04 14:33:06,284 - INFO - {'Raw 1': 0.58, 'Raw 2': 0.13, 'Raw 3': 0.2, 'Raw 4': -0.61, 'Raw 5': -0.31, 'Raw 6': 0.19, 'Raw 7': 0.24, 'Raw 8': -0.19, 'Raw 9': -0.08, 'Raw 10': -0.15}
2023-08-04 14:33:06,294 - INFO - * Fitting curves parameters by standard OLS with 5 cores:
2023-08-04 14:35:58,962 - INFO - * Fitting curves parameters done !
2023-08-04 14:35:59,052 - INFO - * Calculate Relevance Score and apply SAM user thresholds:
2023-08-04 14:35:59,062 - INFO - alpha=0.05, fc_lim=0.45, s0=0.2141
2023-08-04 14:36:01,232 - INFO - * Rendering interactive dashboard using webgl backend ...
2023-08-04 14:36:02,832 - INFO - * Dashboard successfully rendered.
2023-10-01 16:56:46,687 - INFO - * Executing CurveCurator pipeline version 0.2.0.
2023-10-01 16:56:46,687 - INFO - * Reading parameter file of experiment.
2023-10-01 16:56:46,734 - INFO - * Loading data file Z:\internal_projects\active\TOPAS\Publications\DecryptM_Finder\GitHubUpload\curve_curator\example_datasets\decryptM_Dasatinib\./evidence.txt.
2023-10-01 16:56:47,864 - INFO - * 67 Curves were removed because of >4 missing values.
2023-10-01 16:56:49,306 - INFO - * The following normalization factors were applied:
2023-10-01 16:56:49,306 - INFO - {'Raw 1': 0.58, 'Raw 2': 0.13, 'Raw 3': 0.2, 'Raw 4': -0.61, 'Raw 5': -0.31, 'Raw 6': 0.19, 'Raw 7': 0.24, 'Raw 8': -0.19, 'Raw 9': -0.08, 'Raw 10': -0.15}
2023-10-01 16:56:49,339 - INFO - * Fitting curves parameters by standard OLS with 5 cores:
2023-10-01 16:58:46,797 - INFO - * Fitting curves parameters done !
2023-10-01 16:58:46,912 - INFO - * Calculate Relevance Score and apply SAM user thresholds:
2023-10-01 16:58:46,912 - INFO - alpha=0.05, fc_lim=0.45, s0=0.2141
2023-10-01 16:58:48,859 - INFO - * Rendering interactive dashboard using webgl backend ...
2023-10-01 16:58:50,452 - INFO - * Dashboard successfully rendered.
Loading

0 comments on commit 0e2c3b2

Please sign in to comment.