From 9cdf87904172f38168c8862b6bcd604dc69b7002 Mon Sep 17 00:00:00 2001
From: zprobot <1727697083@qq.com>
Date: Thu, 28 Mar 2024 15:34:05 +0800
Subject: [PATCH 1/3] update: ibaq

---
 .../normalize_methods.cpython-310.pyc         |  Bin 0 -> 2375 bytes
 bin/normalize_methods.py                      |   85 ++
 bin/peptide_normalization.py                  | 1180 +++++++++++-----
 build/lib/bin/__init__.py                     |    0
 build/lib/bin/compute_ibaq.py                 |  201 +++
 build/lib/bin/compute_tpa.py                  |  229 ++++
 build/lib/bin/datasets_merger.py              |  111 ++
 build/lib/bin/merge_condition_files.py        |   50 +
 build/lib/bin/normalize_methods.py            |   85 ++
 build/lib/bin/peptide_normalization.py        | 1214 +++++++++++++++++
 build/lib/bin/tsne_visualization.py           |  187 +++
 build/lib/ibaq/__init__.py                    |    1 +
 build/lib/ibaq/combiner.py                    |  226 +++
 build/lib/ibaq/ibaqpy_commons.py              |  434 ++++++
 build/lib/ibaq/utils.py                       |  525 +++++++
 build/scripts-3.10/compute_ibaq.py            |  201 +++
 build/scripts-3.10/compute_tpa.py             |  229 ++++
 build/scripts-3.10/datasets_merger.py         |  111 ++
 build/scripts-3.10/merge_condition_files.py   |   50 +
 build/scripts-3.10/peptide_normalization.py   | 1214 +++++++++++++++++
 build/scripts-3.10/tsne_visualization.py      |  187 +++
 dist/ibaqpy-0.0.3-py3.10.egg                  |  Bin 0 -> 100087 bytes
 ibaq/ibaqpy_commons.py                        |   47 +-
 ibaqpy.egg-info/PKG-INFO                      |  354 +++++
 ibaqpy.egg-info/SOURCES.txt                   |   20 +
 ibaqpy.egg-info/dependency_links.txt          |    1 +
 ibaqpy.egg-info/requires.txt                  |    9 +
 ibaqpy.egg-info/top_level.txt                 |    2 +
 28 files changed, 6537 insertions(+), 416 deletions(-)
 create mode 100644 bin/__pycache__/normalize_methods.cpython-310.pyc
 create mode 100644 bin/normalize_methods.py
 create mode 100644 build/lib/bin/__init__.py
 create mode 100644 build/lib/bin/compute_ibaq.py
 create mode 100644 build/lib/bin/compute_tpa.py
 create mode 100644 build/lib/bin/datasets_merger.py
 create mode 100644 build/lib/bin/merge_condition_files.py
 create mode 100644 build/lib/bin/normalize_methods.py
 create mode 100644 build/lib/bin/peptide_normalization.py
 create mode 100644 build/lib/bin/tsne_visualization.py
 create mode 100644 build/lib/ibaq/__init__.py
 create mode 100644 build/lib/ibaq/combiner.py
 create mode 100644 build/lib/ibaq/ibaqpy_commons.py
 create mode 100644 build/lib/ibaq/utils.py
 create mode 100644 build/scripts-3.10/compute_ibaq.py
 create mode 100644 build/scripts-3.10/compute_tpa.py
 create mode 100644 build/scripts-3.10/datasets_merger.py
 create mode 100644 build/scripts-3.10/merge_condition_files.py
 create mode 100644 build/scripts-3.10/peptide_normalization.py
 create mode 100644 build/scripts-3.10/tsne_visualization.py
 create mode 100644 dist/ibaqpy-0.0.3-py3.10.egg
 create mode 100644 ibaqpy.egg-info/PKG-INFO
 create mode 100644 ibaqpy.egg-info/SOURCES.txt
 create mode 100644 ibaqpy.egg-info/dependency_links.txt
 create mode 100644 ibaqpy.egg-info/requires.txt
 create mode 100644 ibaqpy.egg-info/top_level.txt

diff --git a/bin/__pycache__/normalize_methods.cpython-310.pyc b/bin/__pycache__/normalize_methods.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..872356d544de50ef337491335ecfc1c7c88accd8
GIT binary patch
literal 2375
[base85-encoded binary payload elided: compiled CPython bytecode, not human-reviewable]
literal 0
HcmV?d00001

diff --git a/bin/normalize_methods.py b/bin/normalize_methods.py
new file mode 100644
index 0000000..fb3bd0e
--- /dev/null
+++ b/bin/normalize_methods.py
@@ -0,0 +1,85 @@
+import numpy as np
+import pandas as pd
+from sklearn.preprocessing import robust_scale
+from sklearn.preprocessing import power_transform
+from sklearn.preprocessing import quantile_transform
+
+def normalize(df, method):
+    match method:
+        case 'mean':
+            return mean_normalize(df)
+        case 'median':
+            return median_normalize(df)
+        case 'max':
+            return max_normalize(df)
+        case 'global':
+            return global_normalize(df)
+        case 'max_min':
+            return max_min_normalize(df)
+        case 'z_score':
+            return z_score_normalize(df)
+        case 'iqr':
+            return iqr_normalize(df)
+        case 'robust':
+            return robust_normalize(df)
+        case 'vsn':
+            return vsn_normalize(df)
+        case 'quantile':
+            return quantile_normalize(df)
+        case _:
+            # Fail loudly instead of silently returning a sentinel value
+            raise ValueError(f"Unknown normalization method: {method}")
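+
+# A minimal usage sketch (values illustrative). Each helper below works
+# column-wise, so normalize(df, 'median') divides every sample column by
+# its own median:
+#
+#     df = pd.DataFrame({'S1': [1.0, 3.0], 'S2': [2.0, 8.0]})
+#     normalized = normalize(df, 'median')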
+
+# mean
+def mean_normalize(df):
+    return df / df.mean(axis=0)
+
+# median
+def median_normalize(df):
+    return df / df.median(axis=0)
+
+# max
+def max_normalize(df):
+    return df / df.max(axis=0)
+
+# global
+def global_normalize(df):
+    return df / df.sum(axis=0)
+
+# max-min
+def max_min_normalize(df):
+    min_vals = df.min(axis=0)
+    return (df - min_vals) / (df.max(axis=0) - min_vals)
+
+# z-score
+def z_score_normalize(df):
+    # Center by the column mean and scale by the column standard deviation
+    return (df - df.mean(axis=0)) / df.std(axis=0)
+
+# IQR
+def iqr_normalize(df):
+    Q = df.quantile([0.75, 0.25], interpolation='linear', axis=0)
+    IQR = Q.loc[0.75, :] - Q.loc[0.25, :]
+    return (df - df.median(axis=0)) / IQR
+
+# robust
+def robust_normalize(df):
+    index = df.index
+    columns = df.columns
+    df = robust_scale(df, axis=0)
+    df = pd.DataFrame(df, columns=columns, index=index)
+    return df
+
+# vsn
+def vsn_normalize(df):
+    # box-cox requires strictly positive input values
+    index = df.index
+    columns = df.columns
+    df = power_transform(df, method='box-cox')
+    df = pd.DataFrame(df, columns=columns, index=index)
+    return df
+
+# quantile
+def quantile_normalize(df):
+    index = df.index
+    columns = df.columns
+    df = quantile_transform(df, axis=0)
+    df = pd.DataFrame(df, columns=columns, index=index)
+    return df
\ No newline at end of file
diff --git a/bin/peptide_normalization.py b/bin/peptide_normalization.py
index ddd5478..5cf18d0 100644
--- a/bin/peptide_normalization.py
+++ b/bin/peptide_normalization.py
@@ -4,15 +4,20 @@
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
-import qnorm
+from scipy.stats import rankdata
+import os
+import random
+import uuid
 from matplotlib.backends.backend_pdf import PdfPages
 from pandas import DataFrame
+import pyarrow.parquet as pq
+from normalize_methods import normalize
 
 from ibaq.ibaqpy_commons import (
     BIOREPLICATE,
     CHANNEL,
     CONDITION,
-    FEATURE_COLUMNS,
+    PARQUET_COLUMNS,
     FRACTION,
     FRAGMENT_ION,
     INTENSITY,
@@ -26,12 +31,12 @@
     RUN,
     SAMPLE_ID,
     STUDY_ID,
+    TMT16plex,
+    TMT11plex,
+    TMT10plex,
+    TMT6plex,
     ITRAQ4plex,
     ITRAQ8plex,
-    TMT6plex,
-    TMT10plex,
-    TMT11plex,
-    TMT16plex,
     get_canonical_peptide,
     get_spectrum_prefix,
     get_study_accession,
@@ -48,48 +53,281 @@
 )
 
 
-# TODO: The following two func are useless.
-def remove_outliers_iqr(dataset: DataFrame):
+def print_dataset_size(dataset: DataFrame, message: str, verbose: bool) -> None:
+    if verbose:
+        print(message + str(len(dataset.index)))
+
+def recover_df(df):
+    """
+    Melt a wide table (one column per sample) back into the long format used
+    downstream, keeping only non-null intensities.
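+
+    A sketch of the expected shape change (assumes the NORM_INTENSITY
+    constant is the column name "NormIntensity"; values illustrative):
+
+        wide = pd.DataFrame({"S1": [1.0], "S2": [2.0]})
+        long = recover_df(wide)
+        # one row per (index, sample) pair, with the former index plus
+        # "SampleID" and "NormIntensity" columns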
+ """ + samples = df.columns.tolist() + out = pd.DataFrame() + for sample in samples: + samples_df = df[sample].dropna() + samples_df = samples_df.reset_index() + samples_df['SampleID'] = sample + samples_df.rename(columns={ + sample:NORM_INTENSITY + },inplace=True) + out = pd.concat([out,samples_df]) + return out + +def analyse_sdrf(sdrf_path: str, compression: bool) -> tuple: """ - This method removes outliers from the dataframe inplace, the variable used for the outlier removal is Intensity - :param dataset: Peptide dataframe - :return: None + This function is aimed to parse SDRF and return four objects: + 1. sdrf_df: A dataframe with channels and references annoted. + 2. label: Label type of the experiment. LFQ, TMT or iTRAQ. + 3. sample_names: A list contains all sample names. + 4. choice: A dictionary caontains key-values between channel + names and numbers. + :param sdrf_path: File path of SDRF. + :param compression: Whether compressed. + :return: """ - q1 = dataset[INTENSITY].quantile(0.25) - q3 = dataset[INTENSITY].quantile(0.75) - iqr = q3 - q1 + sdrf_df = pd.read_csv(sdrf_path, sep="\t", compression=compression) + sdrf_df[REFERENCE] = sdrf_df["comment[data file]"].apply(get_spectrum_prefix) + + labels = set(sdrf_df["comment[label]"]) + # Determine label type + label, choice = get_label(labels) + if label == "TMT": + choice_df = ( + pd.DataFrame.from_dict(choice, orient="index", columns=[CHANNEL]) + .reset_index() + .rename(columns={"index": "comment[label]"}) + ) + sdrf_df = sdrf_df.merge(choice_df, on="comment[label]", how="left") + elif label == "ITRAQ": + choice_df = ( + pd.DataFrame.from_dict(choice, orient="index", columns=[CHANNEL]) + .reset_index() + .rename(columns={"index": "comment[label]"}) + ) + sdrf_df = sdrf_df.merge(choice_df, on="comment[label]", how="left") + sample_names = sdrf_df["source name"].unique().tolist() + + return sdrf_df, label, sample_names, choice - dataset.query("(@q1 - 1.5 * @iqr) <= Intensity <= (@q3 + 1.5 * @iqr)", inplace=True) +def analyse_feature_df(feature_df: pd.DataFrame) -> tuple: + """Return label type, sample names and choice dict by iterating parquet. -def remove_missing_values(normalize_df: DataFrame, ratio: float = 0.3) -> DataFrame: + :param parquet_path: Feature parquet path. + :param batch_size: Iterate batch size, defaults to 100000 + :return: Label type, sample names and choice dict """ - Remove missing values if the peptide do not have values in the most of the samples - :param normalize_df: data frame with the data - :param ratio: ratio of samples without intensity values. - :return: + samples = feature_df["sample_accession"].unique().tolist() + labels = feature_df["isotope_label_type"].unique().tolist() + # Determine label type + label, choice = get_label(labels) + + return label, samples, choice + + +def analyse_feature_parquet(parquet_path: str, batch_size: int = 100000) -> tuple: + """Return label type, sample names and choice dict by iterating parquet. + + :param parquet_path: Feature parquet path. 
+
+
+def get_label(labels: list) -> tuple:
+    """Return label type and choice dict according to the labels list.
+
+    :param labels: Labels from the SDRF.
+    :return: Tuple containing the label type and the choice dict.
+    """
+    choice = None
+    if len(labels) == 1:
+        label = "LFQ"
+    elif "TMT" in ",".join(labels) or "tmt" in ",".join(labels):
+        if (
+            len(labels) > 11
+            or "TMT134N" in labels
+            or "TMT133C" in labels
+            or "TMT133N" in labels
+            or "TMT132C" in labels
+            or "TMT132N" in labels
+        ):
+            choice = TMT16plex
+        elif len(labels) == 11 or "TMT131C" in labels:
+            choice = TMT11plex
+        elif len(labels) > 6:
+            choice = TMT10plex
+        else:
+            choice = TMT6plex
+        label = "TMT"
+    elif "ITRAQ" in ",".join(labels) or "itraq" in ",".join(labels):
+        if len(labels) > 4:
+            choice = ITRAQ8plex
+        else:
+            choice = ITRAQ4plex
+        label = "ITRAQ"
+    else:
+        exit("Only label-free, TMT and ITRAQ experiments are supported!")
+    return label, choice
+
+
+def msstats_common_process(data_df: pd.DataFrame) -> pd.DataFrame:
+    """Apply common processing to MSstats data.
+
+    :param data_df: Feature data in a dataframe.
+    :return: Processed data.
+    """
+    data_df.rename(
+        columns={
+            "ProteinName": PROTEIN_NAME,
+            "PeptideSequence": PEPTIDE_SEQUENCE,
+            "PrecursorCharge": PEPTIDE_CHARGE,
+            "Run": RUN,
+            "Condition": CONDITION,
+            "Intensity": INTENSITY,
+        },
+        inplace=True,
+    )
+    data_df[REFERENCE] = data_df[REFERENCE].apply(get_spectrum_prefix)
+
+    return data_df
+
+
+def parquet_common_process(
+    data_df: pd.DataFrame, label: str, choice: dict
+) -> pd.DataFrame:
+    """Apply common processing to parquet feature data.
+
+    :param data_df: Feature data in a dataframe.
+    :param label: Label type of the experiment (LFQ, TMT or ITRAQ).
+    :param choice: Mapping from channel names to channel numbers (None for LFQ).
+    :return: Processed data.
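+
+    A sketch of the intended effect (channel names illustrative): with
+    label="TMT", the raw channel column is remapped via
+    data_df[CHANNEL].map(choice); with label="LFQ" the channel column is
+    dropped instead.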
+ """ + data_df = data_df.rename(columns=parquet_map) + data_df[PROTEIN_NAME] = data_df.apply(lambda x: ",".join(x[PROTEIN_NAME]), axis=1) + if label == "LFQ": + data_df.drop(CHANNEL, inplace=True, axis=1) + else: + data_df[CHANNEL] = data_df[CHANNEL].map(choice) + + return data_df + + +def merge_sdrf( + label: str, sdrf_df: pd.DataFrame, data_df: pd.DataFrame +) -> pd.DataFrame: + if label == "LFQ": + result_df = pd.merge( + data_df, + sdrf_df[["source name", REFERENCE]], + how="left", + on=[REFERENCE], + ) + elif label == "TMT": + result_df = pd.merge( + data_df, + sdrf_df[["source name", REFERENCE, CHANNEL]], + how="left", + on=[REFERENCE, CHANNEL], + ) + elif label == "ITRAQ": + result_df = pd.merge( + data_df, + sdrf_df[["source name", REFERENCE, CHANNEL]], + how="left", + on=[REFERENCE, CHANNEL], + ) + result_df.rename(columns={"source name": SAMPLE_ID}, inplace=True) + result_df = result_df[result_df["Condition"] != "Empty"] + + return result_df + + +def data_common_process(data_df: pd.DataFrame, min_aa: int) -> pd.DataFrame: + # Remove 0 intensity signals from the data + data_df = data_df[data_df[INTENSITY] > 0] + data_df = data_df[data_df["Condition"] != "Empty"] + + def map_canonical_seq(data_df: pd.DataFrame) -> (pd.DataFrame, dict): + modified_seqs = data_df[PEPTIDE_SEQUENCE].unique().tolist() + canonical_seqs = [get_canonical_peptide(i) for i in modified_seqs] + inner_canonical_dict = dict(zip(modified_seqs, canonical_seqs)) + data_df[PEPTIDE_CANONICAL] = data_df[PEPTIDE_SEQUENCE].map(inner_canonical_dict) + + return data_df, inner_canonical_dict + + if PEPTIDE_CANONICAL not in data_df.columns: + data_df, inner_canonical_dict = map_canonical_seq(data_df) + data_df[PEPTIDE_CANONICAL] = data_df[PEPTIDE_SEQUENCE].map(inner_canonical_dict) + # Filter peptides with less amino acids than min_aa (default: 7) + data_df = data_df[ + data_df.apply(lambda x: len(x[PEPTIDE_CANONICAL]) >= min_aa, axis=1) + ] + data_df[PROTEIN_NAME] = data_df[PROTEIN_NAME].apply(parse_uniprot_accession) + data_df[STUDY_ID] = data_df[SAMPLE_ID].apply(get_study_accession) + if FRACTION not in data_df.columns: + data_df[FRACTION] = 1 + data_df = data_df[ + [ + PROTEIN_NAME, + PEPTIDE_SEQUENCE, + PEPTIDE_CANONICAL, + PEPTIDE_CHARGE, + INTENSITY, + REFERENCE, + CONDITION, + RUN, + BIOREPLICATE, + FRACTION, + FRAGMENT_ION, + ISOTOPE_LABEL_TYPE, + STUDY_ID, + SAMPLE_ID, + ] + ] + data_df[CONDITION] = pd.Categorical(data_df[CONDITION]) + data_df[STUDY_ID] = pd.Categorical(data_df[STUDY_ID]) + data_df[SAMPLE_ID] = pd.Categorical(data_df[SAMPLE_ID]) + + return data_df + def intensity_normalization( dataset: DataFrame, field: str, - class_field: str = "all", - scaling_method: str = "msstats", + class_field: str, + scaling_method: str = "quantile", ) -> DataFrame: + cols_to_keep = [ + PROTEIN_NAME, + PEPTIDE_CANONICAL, + PEPTIDE_SEQUENCE, + PEPTIDE_CHARGE, + SAMPLE_ID, + BIOREPLICATE, + CONDITION, + NORM_INTENSITY, + ] # TODO add imputation and/or removal to those two norm strategies if scaling_method == "msstats": # For TMT normalization if "Channel" in dataset.columns: - g = dataset.groupby(["Run", "Channel"])[field].apply(np.median) + g = dataset.groupby(["Run", "Channel"])[field].apply(np.nanmedian) g.name = "RunMedian" dataset = dataset.join(g, on=["Run", "Channel"]) median_baseline = dataset.drop_duplicates(subset=["Run", "Channel", field])[ @@ -99,7 +337,7 @@ def intensity_normalization( dataset[field] - dataset["RunMedian"] + median_baseline ) else: - g = dataset.groupby(["Run", 
"Fraction"])[field].apply(np.median) + g = dataset.groupby(["Run", "Fraction"])[field].apply(np.nanmedian) g.name = "RunMedian" dataset = dataset.join(g, on=["Run", "Fraction"]) dataset["FractionMedian"] = ( @@ -108,15 +346,16 @@ def intensity_normalization( dataset[NORM_INTENSITY] = ( dataset[field] - dataset["RunMedian"] + dataset["FractionMedian"] ) - return dataset + return dataset[cols_to_keep] - elif scaling_method == "qnorm": + else: # pivot to have one col per sample print("Transforming to wide format dataset size {}".format(len(dataset.index))) normalize_df = pd.pivot_table( dataset, index=[ PEPTIDE_SEQUENCE, + PEPTIDE_CANONICAL, PEPTIDE_CHARGE, FRACTION, RUN, @@ -127,28 +366,18 @@ def intensity_normalization( ], columns=class_field, values=field, - aggfunc={field: np.mean}, + aggfunc={field: np.nanmean}, observed=True, ) - normalize_df = qnorm.quantile_normalize(normalize_df, axis=1) - normalize_df = normalize_df.reset_index() - normalize_df = normalize_df.melt( - id_vars=[ - PEPTIDE_SEQUENCE, - PEPTIDE_CHARGE, - FRACTION, - RUN, - BIOREPLICATE, - PROTEIN_NAME, - STUDY_ID, - CONDITION, - ] - ) - normalize_df.rename(columns={"value": NORM_INTENSITY}, inplace=True) - print(dataset.head()) - return normalize_df + normalize_df = normalize(normalize_df,scaling_method) + # TODO: When restoring the pivot table here, the previous grouping caused + # the dataframe to produce a large number of rows with NORM_INTENSITY of + # NA at melt. This results in an unbearable memory consumption. - return dataset + normalize_df = recover_df(normalize_df) + normalize_df = normalize_df.drop_duplicates() + print(normalize_df.head()) + return normalize_df[cols_to_keep] def remove_low_frequency_peptides_( @@ -166,28 +395,23 @@ def remove_low_frequency_peptides_( index=[PEPTIDE_CANONICAL, PROTEIN_NAME], columns=SAMPLE_ID, values=NORM_INTENSITY, - aggfunc={NORM_INTENSITY: np.mean}, + aggfunc={NORM_INTENSITY: np.nanmean}, observed=True, ) # Count the number of null values in each row null_count = normalize_df.isnull().sum(axis=1) - # Find the rows that have null values above the threshold rows_to_drop = null_count[ null_count >= (1 - percentage_samples) * normalize_df.shape[1] ].index - # Drop the rows with too many null values normalize_df = normalize_df.drop(rows_to_drop) # Remove rows with non-null values in only one column normalize_df = normalize_df[ - normalize_df.notnull().sum(axis=1) != normalize_df.shape[1] - 1 + normalize_df.notnull().sum(axis=1) != 1 ] - normalize_df = normalize_df.reset_index() - normalize_df = normalize_df.melt(id_vars=[PEPTIDE_CANONICAL, PROTEIN_NAME]) - normalize_df.rename(columns={"value": NORM_INTENSITY}, inplace=True) - + normalize_df = recover_df(normalize_df) # recover condition column normalize_df = normalize_df.merge( dataset_df[[SAMPLE_ID, CONDITION]].drop_duplicates(subset=[SAMPLE_ID]), @@ -213,26 +437,18 @@ def peptide_intensity_normalization( :param scaling_method: method to use for the normalization :return: """ - if scaling_method == "qnorm": - # pivot to have one col per sample - normalize_df = pd.pivot_table( - dataset_df, - index=[PEPTIDE_CANONICAL, PROTEIN_NAME, CONDITION], - columns=class_field, - values=field, - aggfunc={field: np.mean}, - observed=True, - ) - normalize_df = qnorm.quantile_normalize(normalize_df, axis=1) - normalize_df = normalize_df.reset_index() - normalize_df = normalize_df.melt( - id_vars=[PEPTIDE_CANONICAL, PROTEIN_NAME, CONDITION] - ) - normalize_df.rename(columns={"value": NORM_INTENSITY}, inplace=True) - normalize_df = 
normalize_df[normalize_df[NORM_INTENSITY].notna()] - return normalize_df - - return dataset_df + # pivot to have one col per sample + normalize_df = pd.pivot_table( + dataset_df, + index=[PEPTIDE_CANONICAL, PROTEIN_NAME, CONDITION], + columns=class_field, + values=field, + aggfunc={field: np.nanmean}, + observed=True, + ) + # need nomalize? + normalize_df = recover_df(normalize_df) + return normalize_df def impute_peptide_intensities(dataset_df, field, class_field): @@ -252,7 +468,7 @@ def impute_peptide_intensities(dataset_df, field, class_field): index=[PEPTIDE_CANONICAL, PROTEIN_NAME, CONDITION], columns=class_field, values=field, - aggfunc={field: np.mean}, + aggfunc={field: np.nanmean}, observed=True, ) @@ -290,6 +506,12 @@ def impute_peptide_intensities(dataset_df, field, class_field): @click.option( "-s", "--sdrf", help="SDRF file import generated by quantms", default=None ) +@click.option("--stream", help="Stream processing normalization", is_flag=True) +@click.option( + "--chunksize", + help="The number of rows of MSstats or parquet read using pandas streaming", + default=1000000, +) @click.option( "--min_aa", help="Minimum number of amino acids to filter peptides", default=7 ) @@ -323,8 +545,8 @@ def impute_peptide_intensities(dataset_df, field, class_field): ) @click.option( "--nmethod", - help="Normalization method used to normalize intensities for all samples (options: qnorm)", - default="qnorm", + help="Normalization method used to normalize intensities for all samples (options: quantile, msstats, qnorm)", + default="quantile", ) @click.option( "--pnormalization", @@ -361,6 +583,8 @@ def peptide_normalization( msstats: str, parquet: str, sdrf: str, + stream: bool, + chunksize: int, min_aa: int, min_unique: int, remove_ids: str, @@ -384,208 +608,391 @@ def peptide_normalization( print_help_msg(peptide_normalization) exit(1) + if pnormalization and nmethod not in ["qnorm", "quantile"]: + exit( + "Peptide intensity normalization works only with qnorm or quantile methods!" 
+ ) + + if verbose: + log_after_norm = not log2 + + pd.set_option("display.max_columns", None) compression_method = "gzip" if compress else None + print("Loading data..") - if parquet is None: - # Read the msstats file - feature_df = pd.read_csv( - msstats, - sep=",", - compression=compression_method, - dtype={CONDITION: "category", ISOTOPE_LABEL_TYPE: "category"}, - ) + if not stream: + if parquet is None: + # Read the msstats file + feature_df = pd.read_csv( + msstats, + sep=",", + compression=compression_method, + dtype={CONDITION: "category", ISOTOPE_LABEL_TYPE: "category"}, + ) + + # Read the sdrf file + sdrf_df, label, sample_names, choice = analyse_sdrf( + sdrf, compression_method + ) + print(sdrf_df) + + # Merged the SDRF with the Resulted file + dataset_df = msstats_common_process(feature_df) + dataset_df = merge_sdrf(label, sdrf_df, feature_df) + # Remove the intermediate variables and free the memory + del feature_df, sdrf_df + gc.collect() + else: + dataset_df = pd.read_parquet(parquet,cols=PARQUET_COLUMNS) + label, sample_names, choice = analyse_feature_df(dataset_df) + dataset_df = parquet_common_process(dataset_df, label, choice) - feature_df.rename( - columns={ - "ProteinName": PROTEIN_NAME, - "PeptideSequence": PEPTIDE_SEQUENCE, - "PrecursorCharge": PEPTIDE_CHARGE, - "Run": RUN, - "Condition": CONDITION, - "Intensity": INTENSITY, - }, - inplace=True, + dataset_df = data_common_process(dataset_df, min_aa) + # Only proteins with unique peptides number greater than min_unique (default: 2) are retained + unique_peptides = set( + dataset_df.groupby(PEPTIDE_CANONICAL) + .filter(lambda x: len(set(x[PROTEIN_NAME])) == 1)[PEPTIDE_CANONICAL] + .tolist() ) + strong_proteins = set( + dataset_df[dataset_df[PEPTIDE_CANONICAL].isin(unique_peptides)] + .groupby(PROTEIN_NAME) + .filter(lambda x: len(set(x[PEPTIDE_CANONICAL])) >= min_unique)[ + PROTEIN_NAME + ] + .tolist() + ) + dataset_df = dataset_df[dataset_df[PROTEIN_NAME].isin(strong_proteins)] + + print(f"Number of unique peptides: {len(unique_peptides)}") + print(f"Number of strong proteins: {len(strong_proteins)}") + + print("Logarithmic if specified..") + dataset_df = dataset_df.rename(columns={INTENSITY: NORM_INTENSITY}) + if log2: + dataset_df[NORM_INTENSITY] = np.log2(dataset_df[NORM_INTENSITY]) + + # Print the distribution of the original peptide intensities from quantms analysis + if verbose: + sample_names = set(dataset_df[SAMPLE_ID]) + plot_width = len(sample_names) * 0.5 + 10 + pdf = PdfPages(qc_report) + density = plot_distributions( + dataset_df, + NORM_INTENSITY, + SAMPLE_ID, + log2=not log2, + width=plot_width, + title="Original peptidoform intensity distribution (no normalization)", + ) + #plt.show() + pdf.savefig(density) + """ + box = plot_box_plot( + dataset_df, + NORM_INTENSITY, + SAMPLE_ID, + log2=not log2, + width=plot_width, + title="Original peptidoform intensity distribution (no normalization)", + violin=violin, + ) + plt.show() + pdf.savefig(box) + """ + + # Remove high abundant and contaminants proteins and the outliers + if remove_ids is not None: + print("Remove proteins from file...") + dataset_df = remove_protein_by_ids(dataset_df, remove_ids) + if remove_decoy_contaminants: + print("Remove decoy and contaminants...") + dataset_df = remove_contaminants_entrapments_decoys(dataset_df) - feature_df[PROTEIN_NAME] = feature_df[PROTEIN_NAME].apply( - parse_uniprot_accession + print_dataset_size(dataset_df, "Peptides after contaminants removal: ", verbose) + print("Normalize intensities.. 
") + # dataset_df = dataset_df.dropna(how="any") + if not skip_normalization: + dataset_df = intensity_normalization( + dataset_df, + field=NORM_INTENSITY, + class_field=SAMPLE_ID, + scaling_method=nmethod, + ) + if verbose: + density = plot_distributions( + dataset_df, + NORM_INTENSITY, + SAMPLE_ID, + #log2=log_after_norm, + width=plot_width, + title="Peptidoform intensity distribution after normalization, method: " + + nmethod, + ) + #plt.show() + pdf.savefig(density) + """ + box = plot_box_plot( + dataset_df, + NORM_INTENSITY, + SAMPLE_ID, + log2=log_after_norm, + width=plot_width, + title="Peptidoform intensity distribution after normalization, method: " + + nmethod, + violin=violin, + ) + plt.show() + pdf.savefig(box) + """ + print("Number of peptides after normalization: " + str(len(dataset_df.index))) + print("Select the best peptidoform across fractions...") + dataset_df = get_peptidoform_normalize_intensities(dataset_df) + print( + "Number of peptides after peptidofrom selection: " + + str(len(dataset_df.index)) ) - # Read the sdrf file - sdrf_df = pd.read_csv(sdrf, sep="\t", compression=compression_method) - sdrf_df[REFERENCE] = sdrf_df["comment[data file]"].apply(get_spectrum_prefix) - print(sdrf_df) - - if FRACTION not in feature_df.columns: - feature_df[FRACTION] = 1 - feature_df = feature_df[ - [ - PROTEIN_NAME, - PEPTIDE_SEQUENCE, - PEPTIDE_CHARGE, - INTENSITY, - REFERENCE, - CONDITION, - RUN, - BIOREPLICATE, - FRACTION, - FRAGMENT_ION, - ISOTOPE_LABEL_TYPE, - ] - ] + print("Sum all peptidoforms per Sample...") + dataset_df = sum_peptidoform_intensities(dataset_df) + print("Number of peptides after selection: " + str(len(dataset_df.index))) - # Merged the SDRF with the Resulted file - labels = set(sdrf_df["comment[label]"]) - if CHANNEL not in feature_df.columns: - feature_df[REFERENCE] = feature_df[REFERENCE].apply(get_spectrum_prefix) - dataset_df = pd.merge( - feature_df, - sdrf_df[["source name", REFERENCE]], - how="left", - on=[REFERENCE], + print("Average all peptidoforms per Peptide/Sample...") + dataset_df = average_peptide_intensities(dataset_df) + print("Number of peptides after average: " + str(len(dataset_df.index))) + if verbose: + density = plot_distributions( + dataset_df, + NORM_INTENSITY, + SAMPLE_ID, + log2=log_after_norm, + width=plot_width, + title="Peptide intensity distribution method: " + nmethod, ) - elif "TMT" in ",".join(labels) or "tmt" in ",".join(labels): - if ( - len(labels) > 11 - or "TMT134N" in labels - or "TMT133C" in labels - or "TMT133N" in labels - or "TMT132C" in labels - or "TMT132N" in labels - ): - choice = TMT16plex - elif len(labels) == 11 or "TMT131C" in labels: - choice = TMT11plex - elif len(labels) > 6: - choice = TMT10plex - else: - choice = TMT6plex - choice = ( - pd.DataFrame.from_dict(choice, orient="index", columns=[CHANNEL]) - .reset_index() - .rename(columns={"index": "comment[label]"}) + plt.show() + pdf.savefig(density) + box = plot_box_plot( + dataset_df, + NORM_INTENSITY, + SAMPLE_ID, + log2=log_after_norm, + width=plot_width, + title="Peptide intensity distribution method: " + nmethod, + violin=violin, ) - sdrf_df = sdrf_df.merge(choice, on="comment[label]", how="left") - feature_df[REFERENCE] = feature_df[REFERENCE].apply(get_spectrum_prefix) - dataset_df = pd.merge( - feature_df, - sdrf_df[["source name", REFERENCE, CHANNEL]], - how="left", - on=[REFERENCE, CHANNEL], + plt.show() + pdf.savefig(box) + + if remove_low_frequency_peptides and len(sample_names) > 1: + print(dataset_df) + dataset_df = 
remove_low_frequency_peptides_(dataset_df, 0.20) + print_dataset_size( + dataset_df, "Peptides after remove low frequency peptides: ", verbose ) - # result_df.drop(CHANNEL, axis=1, inplace=True) - dataset_df = dataset_df[dataset_df["Condition"] != "Empty"] - dataset_df.rename(columns={"Charge": PEPTIDE_CHARGE}, inplace=True) - elif "ITRAQ" in ",".join(labels) or "itraq" in ",".join(labels): - if len(labels) > 4: - choice = ITRAQ8plex - else: - choice = ITRAQ4plex - choice = ( - pd.DataFrame.from_dict(choice, orient="index", columns=[CHANNEL]) - .reset_index() - .rename(columns={"index": "comment[label]"}) + # Perform imputation using Random Forest in Peptide Intensities + # TODO: Check if this is necessary (Probably we can do some research if imputation at peptide level is necessary + # if impute: + # dataset_df = impute_peptide_intensities(dataset_df, field=NORM_INTENSITY, class_field=SAMPLE_ID) + + if pnormalization: + print("Normalize at Peptide level...") + dataset_df = peptide_intensity_normalization( + dataset_df, + field=NORM_INTENSITY, + class_field=SAMPLE_ID, + scaling_method=nmethod, ) - sdrf_df = sdrf_df.merge(choice, on="comment[label]", how="left") - feature_df[REFERENCE] = feature_df[REFERENCE].apply(get_spectrum_prefix) - dataset_df = pd.merge( - feature_df, - sdrf_df[["source name", REFERENCE, CHANNEL]], - how="left", - on=[REFERENCE, CHANNEL], + + if verbose: + density = plot_distributions( + dataset_df, + NORM_INTENSITY, + SAMPLE_ID, + log2=log_after_norm, + width=plot_width, + title="Normalization at peptide level method: " + nmethod, ) - dataset_df = dataset_df[dataset_df["Condition"] != "Empty"] - dataset_df.rename(columns={"Charge": PEPTIDE_CHARGE}, inplace=True) - else: - print("Warning: Only support label free, TMT and ITRAQ experiment!") - exit(1) + plt.show() + pdf.savefig(density) + box = plot_box_plot( + dataset_df, + NORM_INTENSITY, + SAMPLE_ID, + log2=log_after_norm, + width=plot_width, + title="Normalization at peptide level method: " + nmethod, + violin=violin, + ) + plt.show() + pdf.savefig(box) + pdf.close() - # Remove the intermediate variables and free the memory - del feature_df, sdrf_df - gc.collect() + print("Save the normalized peptide intensities...") + dataset_df.to_csv(output, index=False, sep=",") else: - dataset_df = pd.read_parquet(parquet)[FEATURE_COLUMNS] - dataset_df = dataset_df.rename(columns=parquet_map) - dataset_df[PROTEIN_NAME] = dataset_df.apply( - lambda x: ",".join(x[PROTEIN_NAME]), axis=1 - ) - label_type = dataset_df[CHANNEL].unique().tolist() - if len(label_type) == 1: - dataset_df.drop(CHANNEL, inplace=True, axis=1) - dataset_df = dataset_df[dataset_df["Condition"] != "Empty"] - - # Remove 0 intensity signals from the msstats file - dataset_df = dataset_df[dataset_df[INTENSITY] > 0] - dataset_df[PEPTIDE_CANONICAL] = dataset_df.apply( - lambda x: get_canonical_peptide(x[PEPTIDE_SEQUENCE]), axis=1 - ) - # Only peptides with more than min_aa (default: 7) amino acids are retained - dataset_df = dataset_df[ - dataset_df.apply(lambda x: len(x[PEPTIDE_CANONICAL]) >= min_aa, axis=1) - ] - # Only proteins with unique peptides number greater than min_unique (default: 2) are retained - unique_peptides = set( - dataset_df.groupby(PEPTIDE_CANONICAL) - .filter(lambda x: len(set(x[PROTEIN_NAME])) == 1)[PEPTIDE_CANONICAL] - .tolist() - ) - strong_proteins = set( - dataset_df[dataset_df[PEPTIDE_CANONICAL].isin(unique_peptides)] - .groupby(PROTEIN_NAME) - .filter(lambda x: len(set(x[PEPTIDE_CANONICAL])) >= min_unique)[PROTEIN_NAME] - .tolist() - 
) - dataset_df = dataset_df[dataset_df[PROTEIN_NAME].isin(strong_proteins)] - - if msstats: - dataset_df.rename(columns={"source name": SAMPLE_ID}, inplace=True) - dataset_df[STUDY_ID] = dataset_df[SAMPLE_ID].apply(get_study_accession) - dataset_df = dataset_df.filter( - items=[ - PEPTIDE_SEQUENCE, - PEPTIDE_CHARGE, - FRACTION, - RUN, - BIOREPLICATE, - PROTEIN_NAME, - STUDY_ID, - CONDITION, - SAMPLE_ID, - INTENSITY, - ] - ) - dataset_df[CONDITION] = pd.Categorical(dataset_df[CONDITION]) - dataset_df[STUDY_ID] = pd.Categorical(dataset_df[STUDY_ID]) - dataset_df[SAMPLE_ID] = pd.Categorical(dataset_df[SAMPLE_ID]) + if parquet is None: + sdrf_df, label, sample_names, choice = analyse_sdrf( + sdrf, compression_method + ) + msstats_chunks = pd.read_csv( + msstats, + sep=",", + compression=compression_method, + dtype={CONDITION: "category", ISOTOPE_LABEL_TYPE: "category"}, + chunksize=chunksize, + ) + else: + label, sample_names, choice = analyse_feature_parquet( + parquet, batch_size=chunksize + ) + msstats_chunks = read_large_parquet(parquet, batch_size=chunksize) + sample_number = len(sample_names) - pd.set_option("display.max_columns", None) - print("Loading data..") - print_dataset_size(dataset_df, "Number of peptides: ", verbose) + # TODO: Stream processing to obtain strong proteins with more than 2 uniqe peptides + temp = f"Temp-{str(uuid.uuid4())}/" + os.mkdir(temp) + print(f"INFO: Writing files into {temp}...") + unique_peptides = {} + group_intensities = {} + quantile = {} + print("INFO: First iteration to get unique peptides and strong proteins...") + for msstats_df in msstats_chunks: + if parquet is None: + msstats_df = msstats_common_process(msstats_df) + msstats_df = merge_sdrf(label, sdrf_df, msstats_df) + else: + msstats_df = parquet_common_process(msstats_df, label, choice) + result_df = data_common_process(msstats_df, min_aa) - print("Logarithmic if specified..") - dataset_df.loc[dataset_df.Intensity == 0, INTENSITY] = 1 - dataset_df[NORM_INTENSITY] = ( - np.log2(dataset_df[INTENSITY]) if log2 else dataset_df[INTENSITY] - ) - dataset_df.drop(INTENSITY, axis=1, inplace=True) + # Write CSVs by Sample ID + for sample in sample_names: + file_name = f"{temp}/{sample}.csv" + write_mode = "a" if os.path.exists(file_name) else "w" + header = False if os.path.exists(file_name) else True + result_df[result_df[SAMPLE_ID] == sample].to_csv( + file_name, index=False, header=header, mode=write_mode + ) + unique_df = result_df.groupby([PEPTIDE_CANONICAL]).filter( + lambda x: len(set(x[PROTEIN_NAME])) == 1 + )[[PEPTIDE_CANONICAL, PROTEIN_NAME]] + unique_dict = dict( + zip(unique_df[PEPTIDE_CANONICAL], unique_df[PROTEIN_NAME]) + ) + for i in unique_dict.keys(): + if i in unique_peptides.keys() and unique_dict[i] != unique_peptides[i]: + unique_peptides.pop(i) + else: + unique_peptides[i] = unique_dict[i] - # Print the distribution of the original peptide intensities from quantms analysis - if verbose: - sample_names = set(dataset_df[SAMPLE_ID]) - plot_width = len(sample_names) * 0.5 + 10 + proteins_list = list(unique_peptides.values()) + count_dict = { + element: proteins_list.count(element) for element in set(proteins_list) + } + strong_proteins = [ + element for element in count_dict if count_dict[element] >= min_unique + ] + del proteins_list, count_dict + print(f"Number of unique peptides: {len(list(unique_peptides.keys()))}") + print(f"Number of strong proteins: {len(strong_proteins)}") + + # TODO: Filter proteins with less unique peptides than min_unique (default: 2) + plot_samples = 
random.sample(sample_names, min(len(sample_names), 20)) + plot_width = 10 + len(plot_samples) * 0.5 pdf = PdfPages(qc_report) + original_intensities_df = pd.DataFrame() + + print("INFO: Second iteration to filter data and prepare normalization...") + print("Logarithmic if specified..") + norm_record = [0] * 2 + for sample in sample_names: + msstats_df = pd.read_csv(f"{temp}/{sample}.csv", sep=",") + msstats_df = msstats_df[msstats_df[PROTEIN_NAME].isin(strong_proteins)] + # Remove high abundant and contaminants proteins and the outliers + if remove_ids is not None: + msstats_df = remove_protein_by_ids(msstats_df, remove_ids) + if remove_decoy_contaminants: + msstats_df = remove_contaminants_entrapments_decoys(msstats_df) + norm_record[0] += len(msstats_df) + msstats_df = msstats_df.rename(columns={INTENSITY: NORM_INTENSITY}) + if log2: + msstats_df[NORM_INTENSITY] = np.log2(msstats_df[NORM_INTENSITY]) + if sample in plot_samples: + original_intensities_df = pd.concat( + [original_intensities_df, msstats_df] + ) + if not skip_normalization: + if nmethod == "msstats": + if label in ["TMT", "ITRAQ"]: + g = msstats_df.groupby(["Run", "Channel"]) + else: + g = msstats_df.groupby(["Run", "Fraction"]) + for name, group in g: + group_intensity = group[NORM_INTENSITY].tolist() + if name not in group_intensities: + group_intensities[name] = group_intensity + else: + group_intensities.update( + { + name: group_intensities[NORM_INTENSITY] + + group_intensity + } + ) + elif nmethod == "quantile": + msstats_df = ( + msstats_df.groupby( + [ + PEPTIDE_SEQUENCE, + PEPTIDE_CANONICAL, + PEPTIDE_CHARGE, + FRACTION, + RUN, + BIOREPLICATE, + PROTEIN_NAME, + STUDY_ID, + CONDITION, + ] + )[NORM_INTENSITY] + .agg(np.nanmean) + .reset_index() + ) + rank = msstats_df[NORM_INTENSITY].rank(method="average") + dic = dict(zip(rank, msstats_df[NORM_INTENSITY])) + if len(quantile) == 0: + quantile = {k: (v, 1) for k, v in dic.items()} + else: + # update = min(len(quantile), len(dic)) + intersec = set(quantile.keys()) & set(dic.keys()) + update = set(dic.keys()) - set(quantile.keys()) + quantile.update( + { + i: (quantile[i][0] + dic[i], quantile[i][1] + 1) + for i in intersec + } + ) + if len(update) > 0: + quantile.update({k: (dic[k], 1) for k in update}) + msstats_df[SAMPLE_ID] = sample + else: + exit("Stream process only supports msstats and quantile methods!") + msstats_df.to_csv(f"{temp}/{sample}.csv", index=False, sep=",") + norm_record[1] += len(msstats_df) + if not skip_normalization and nmethod == "quantile": + quantile = {k: v[0] / v[1] for k, v in quantile.items()} + print(f"Peptides after contaminants removal: {norm_record[0]}") + print(f"Number of peptides after normalization: {norm_record[1]}") + # Save original intensities QC plots + original_intensities_df = original_intensities_df.reset_index(drop=True) density = plot_distributions( - dataset_df, + original_intensities_df, NORM_INTENSITY, SAMPLE_ID, log2=not log2, width=plot_width, title="Original peptidoform intensity distribution (no normalization)", ) - plt.show() pdf.savefig(density) box = plot_box_plot( - dataset_df, + original_intensities_df, NORM_INTENSITY, SAMPLE_ID, log2=not log2, @@ -595,34 +1002,118 @@ def peptide_normalization( ) plt.show() pdf.savefig(box) + del original_intensities_df - # Remove high abundant and contaminants proteins and the outliers - if remove_ids is not None: - print("Remove proteins from file...") - dataset_df = remove_protein_by_ids(dataset_df, remove_ids) - if remove_decoy_contaminants: - print("Remove decoy and 
contaminants...") - dataset_df = remove_contaminants_entrapments_decoys(dataset_df) - - print_dataset_size(dataset_df, "Peptides after contaminants removal: ", verbose) - - print("Normalize intensities.. ") - # dataset_df = dataset_df.dropna(how="any") - if not skip_normalization: - dataset_df = intensity_normalization( - dataset_df, - field=NORM_INTENSITY, - class_field=SAMPLE_ID, - scaling_method=nmethod, - ) - if verbose: - log_after_norm = ( - nmethod == "msstats" - or nmethod == "qnorm" - or ((nmethod == "quantile" or nmethod == "robust") and not log2) + # TODO: Peptide intensity normalization + peptides_count = pd.DataFrame( + columns=[PROTEIN_NAME, PEPTIDE_CANONICAL, "count"] ) + norm_intensities_df = pd.DataFrame() + if not skip_normalization and nmethod == "msstats": + # For ISO normalization + if label in ["TMT", "ITRAQ"]: + median_baseline = np.nanmedian( + list(set(sum(group_intensities.values(), []))) + ) + group_intensities = { + key: np.nanmedian(list(values)) + for key, values in group_intensities.items() + } + else: + fractions = [i[1] for i in group_intensities.keys()] + fraction_median = {} + for fraction in fractions: + fraction_keys = [ + i for i in group_intensities.keys() if i[1] == fraction + ] + fraction_intensities = [] + for key in fraction_keys: + fraction_intensities.extend(group_intensities[key]) + fraction_median[fraction] = np.nanmedian(fraction_intensities) + group_intensities = { + key: np.nanmedian(values) + for key, values in group_intensities.items() + } + print("INFO: Third iteration to normalize and counting peptides frequency...") + size_record = [0] * 3 + + def normalization( + dataset_df, label, sample, skip_normalization, nmethod, record + ): + if not skip_normalization: + field = NORM_INTENSITY + if nmethod == "msstats": + # For ISO normalization + if label in ["TMT", "ITRAQ"]: + dataset_df.loc[:, NORM_INTENSITY] = dataset_df.apply( + lambda x: x[field] + - group_intensities[(x["Run"], x["Channel"])] + + median_baseline, + axis=1, + ) + else: + dataset_df.loc[:, NORM_INTENSITY] = dataset_df.apply( + lambda x: x[field] + - group_intensities[(x["Run"], x["Fraction"])] + + np.nanmedian( + [ + group_intensities[i] + for i in group_intensities.keys() + if i[1] == x["Fraction"] + ] + ), + axis=1, + ) + elif nmethod == "quantile": + rank = dataset_df[NORM_INTENSITY].rank(method="average") + ref_dict = dict(zip(rank, dataset_df[NORM_INTENSITY])) + ref_dict = {v: quantile[k] for k, v in ref_dict.items()} + dataset_df.loc[:, NORM_INTENSITY] = dataset_df.apply( + lambda x: ref_dict.get(x[NORM_INTENSITY], np.nan), + axis=1, + ) + dataset_df = dataset_df.drop_duplicates() + dataset_df = dataset_df[dataset_df[NORM_INTENSITY].notna()] + dataset_df = get_peptidoform_normalize_intensities(dataset_df) + record[0] += len(dataset_df.index) + dataset_df = sum_peptidoform_intensities(dataset_df) + record[1] += len(dataset_df.index) + dataset_df = average_peptide_intensities(dataset_df) + record[2] += len(dataset_df.index) + + return dataset_df, record + + for sample in sample_names: + dataset_df = pd.read_csv(f"{temp}/{sample}.csv", sep=",") + if len(dataset_df) != 0: + norm_df, size_record = normalization( + dataset_df, label, sample, skip_normalization, nmethod, size_record + ) + else: + continue + sample_peptides = norm_df[PEPTIDE_CANONICAL].unique().tolist() + if remove_low_frequency_peptides and sample_number > 1: + sample_peptides = norm_df[ + [PROTEIN_NAME, PEPTIDE_CANONICAL] + ].drop_duplicates() + sample_peptides["count"] = 1 + peptides_count = ( + 
pd.concat([peptides_count, sample_peptides]) + .groupby([PROTEIN_NAME, PEPTIDE_CANONICAL]) + .agg(sum) + .reset_index() + ) + norm_df.to_csv(f"{temp}/{sample}.csv", sep=",", index=False) + if sample in plot_samples: + norm_intensities_df = pd.concat([norm_intensities_df, norm_df]) + del group_intensities, quantile + print(f"Number of peptides after peptidofrom selection: {size_record[0]}") + print(f"Number of peptides after selection: {size_record[1]}") + print(f"Number of peptides after average: {size_record[2]}") + # Save normalized intensities QC plots + norm_intensities_df = norm_intensities_df.reset_index(drop=True) density = plot_distributions( - dataset_df, + norm_intensities_df, NORM_INTENSITY, SAMPLE_ID, log2=log_after_norm, @@ -633,7 +1124,7 @@ def peptide_normalization( plt.show() pdf.savefig(density) box = plot_box_plot( - dataset_df, + norm_intensities_df, NORM_INTENSITY, SAMPLE_ID, log2=log_after_norm, @@ -644,94 +1135,46 @@ def peptide_normalization( ) plt.show() pdf.savefig(box) + del norm_intensities_df, strong_proteins - print("Select the best peptidoform across fractions...") - print( - "Number of peptides before peptidofrom selection: " + str(len(dataset_df.index)) - ) - dataset_df = get_peptidoform_normalize_intensities(dataset_df) - print( - "Number of peptides after peptidofrom selection: " + str(len(dataset_df.index)) - ) - - # Add the peptide sequence canonical without the modifications - if PEPTIDE_CANONICAL not in dataset_df.columns: - print("Add Canonical peptides to the dataframe...") - dataset_df[PEPTIDE_CANONICAL] = dataset_df[PEPTIDE_SEQUENCE].apply( - lambda x: get_canonical_peptide(x) - ) - - print("Sum all peptidoforms per Sample...") - print("Number of peptides before sum selection: " + str(len(dataset_df.index))) - dataset_df = sum_peptidoform_intensities(dataset_df) - print("Number of peptides after sum: " + str(len(dataset_df.index))) - - print("Average all peptidoforms per Peptide/Sample...") - print("Number of peptides before average: " + str(len(dataset_df.index))) - dataset_df = average_peptide_intensities(dataset_df) - print("Number of peptides after average: " + str(len(dataset_df.index))) - - if verbose: - log_after_norm = ( - nmethod == "msstats" - or nmethod == "qnorm" - or ((nmethod == "quantile" or nmethod == "robust") and not log2) - ) - density = plot_distributions( - dataset_df, - NORM_INTENSITY, - SAMPLE_ID, - log2=log_after_norm, - width=plot_width, - title="Peptide intensity distribution method: " + nmethod, - ) - plt.show() - pdf.savefig(density) - box = plot_box_plot( - dataset_df, - NORM_INTENSITY, - SAMPLE_ID, - log2=log_after_norm, - width=plot_width, - title="Peptide intensity distribution method: " + nmethod, - violin=violin, - ) - plt.show() - pdf.savefig(box) - - if remove_low_frequency_peptides: - print( - "Peptides before removing low frequency peptides: " - + str(len(dataset_df.index)) - ) - print(dataset_df) - dataset_df = remove_low_frequency_peptides_(dataset_df, 0.20) - print_dataset_size( - dataset_df, "Peptides after remove low frequency peptides: ", verbose - ) + print("INFO: Writing normalized intensities into CSV...") + if remove_low_frequency_peptides and sample_number > 1: + peptides_count = peptides_count.loc[ + (peptides_count["count"] > 0.20 * sample_number) + & (peptides_count["count"] != sample_number - 1) + ] - # Perform imputation using Random Forest in Peptide Intensities - # TODO: Check if this is necessary (Probably we can do some research if imputation at peptide level is necessary - # if impute: - # 
dataset_df = impute_peptide_intensities(dataset_df, field=NORM_INTENSITY, class_field=SAMPLE_ID) - - if pnormalization: - print("Normalize at Peptide level...") - dataset_df = peptide_intensity_normalization( - dataset_df, - field=NORM_INTENSITY, - class_field=SAMPLE_ID, - scaling_method=nmethod, - ) + final_norm_intensities_df = pd.DataFrame() + size_record = 0 + for sample in sample_names: + dataset_df = pd.read_csv(f"{temp}/{sample}.csv", sep=",") + if remove_low_frequency_peptides and sample_number > 1: + # Filter low-frequency peptides, which indicate whether the peptide occurs less than 20% in all samples or + # only in one sample + dataset_df = dataset_df.merge( + peptides_count[[PEPTIDE_CANONICAL, PROTEIN_NAME]], how="inner" + ) + size_record += len(dataset_df.index) + dataset_df = dataset_df[ + [PEPTIDE_CANONICAL, PROTEIN_NAME, SAMPLE_ID, NORM_INTENSITY, CONDITION] + ] + write_mode = "a" if os.path.exists(output) else "w" + header = False if os.path.exists(output) else True + dataset_df.to_csv(output, index=False, header=header, mode=write_mode) + dataset_df.to_csv(f"{temp}/{sample}.csv", sep=",", index=False) + if sample in plot_samples: + final_norm_intensities_df = pd.concat( + [final_norm_intensities_df, dataset_df] + ) + print(f"Peptides after remove low frequency peptides: {size_record}") + if remove_low_frequency_peptides: + del peptides_count - if verbose: - log_after_norm = ( - nmethod == "msstats" - or nmethod == "qnorm" - or ((nmethod == "quantile" or nmethod == "robust") and not log2) - ) + # TODO: No peptides intensity normalization applied in stream processing. + # Save final normalized intensities QC plots + final_norm_intensities_df = final_norm_intensities_df.reset_index(drop=True) density = plot_distributions( - dataset_df, + final_norm_intensities_df, NORM_INTENSITY, SAMPLE_ID, log2=log_after_norm, @@ -741,7 +1184,7 @@ def peptide_normalization( plt.show() pdf.savefig(density) box = plot_box_plot( - dataset_df, + final_norm_intensities_df, NORM_INTENSITY, SAMPLE_ID, log2=log_after_norm, @@ -753,9 +1196,6 @@ def peptide_normalization( pdf.savefig(box) pdf.close() - print("Save the normalized peptide intensities...") - dataset_df.to_csv(output, index=False, sep=",") - if __name__ == "__main__": peptide_normalization() diff --git a/build/lib/bin/__init__.py b/build/lib/bin/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/build/lib/bin/compute_ibaq.py b/build/lib/bin/compute_ibaq.py new file mode 100644 index 0000000..7158d59 --- /dev/null +++ b/build/lib/bin/compute_ibaq.py @@ -0,0 +1,201 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import math + +import click +import matplotlib.pyplot as plt +import pandas as pd +from matplotlib.backends.backend_pdf import PdfPages +from pandas import DataFrame, Series +from pyopenms import * + +from ibaq.ibaqpy_commons import (CONDITION, IBAQ, IBAQ_LOG, IBAQ_NORMALIZED, + IBAQ_PPB, NORM_INTENSITY, PROTEIN_NAME, + SAMPLE_ID, plot_box_plot, plot_distributions, + print_help_msg, get_accession) + + +def normalize(group): + group[IBAQ_NORMALIZED] = group[IBAQ] / group[IBAQ].sum() + return group + + +def normalize_ibaq(res: DataFrame) -> DataFrame: + """ + Normalize the ibaq values using the total ibaq of the sample. 
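+    (In formula form: ibaq_norm = ibaq / sum(ibaq), taken within each
+    SampleID/Condition group, as in the normalize helper above.)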
The resulted + ibaq values are then multiplied by 100'000'000 (PRIDE database noramalization) + for the ibaq ppb and log10 shifted by 10 (ProteomicsDB) + :param res: Dataframe + :return: + """ + + res = res.groupby([SAMPLE_ID, CONDITION]).apply(normalize) + + # Normalization method used by Proteomics DB 10 + log10(ibaq/sum(ibaq)) + res[IBAQ_LOG] = res[IBAQ_NORMALIZED].apply( + lambda x: (math.log10(x) + 10) if x > 0 else 0 + ) + + # Normalization used by PRIDE Team (no log transformation) (ibaq/total_ibaq) * 100'000'000 + res[IBAQ_PPB] = res[IBAQ_NORMALIZED].apply(lambda x: x * 100000000) + + return res + + +@click.command() +@click.option("-f", "--fasta", help="Protein database to compute IBAQ values") +@click.option( + "-p", + "--peptides", + help="Peptide identifications with intensities following the peptide intensity output", +) +@click.option( + "-e", + "--enzyme", + help="Enzyme used during the analysis of the dataset (default: Trypsin)", + default="Trypsin", +) +@click.option( + "-n", + "--normalize", + help="Normalize IBAQ values using by using the total IBAQ of the experiment", + is_flag=True, +) +@click.option( + "--min_aa", help="Minimum number of amino acids to consider a peptide", default=7 +) +@click.option( + "--max_aa", help="Maximum number of amino acids to consider a peptide", default=30 +) +@click.option("-o", "--output", help="Output file with the proteins and ibaq values") +@click.option( + "--verbose", + help="Print addition information about the distributions of the intensities, number of peptides remove " + "after normalization, etc.", + is_flag=True, +) +@click.option( + "--qc_report", + help="PDF file to store multiple QC images", + default="IBAQ-QCprofile.pdf", +) +def ibaq_compute( + fasta: str, + peptides: str, + enzyme: str, + normalize: bool, + min_aa: int, + max_aa: int, + output: str, + verbose: bool, + qc_report: str, +) -> None: + """ + This command computes the IBAQ values for a file output of peptides with the format described in + peptide_contaminants_file_generation.py. + :param min_aa: Minimum number of amino acids to consider a peptide. + :param max_aa: Maximum number of amino acids to consider a peptide. + :param fasta: Fasta file used to perform the peptide identification. + :param peptides: Peptide intensity file. + :param enzyme: Enzyme used to digest the protein sample. + :param normalize: use some basic normalization steps. + :param output: output format containing the ibaq values. + :param verbose: Print addition information. + :param qc_report: PDF file to store multiple QC images. 
+ :return: + """ + if peptides is None or fasta is None: + print_help_msg(ibaq_compute) + exit(1) + + fasta_proteins = list() # type: list[FASTAEntry] + protein_accessions = list() + FASTAFile().load(fasta, fasta_proteins) + uniquepepcounts = dict() # type: dict[str, int] + digestor = ProteaseDigestion() + digestor.setEnzyme(enzyme) + + def get_average_nr_peptides_unique_bygroup(pdrow: Series) -> Series: + """ + Get the average intensity for protein groups + :param pdrow: peptide row + :return: average intensity + """ + proteins = pdrow.name[0].split(";") + summ = 0 + for prot in proteins: + summ += uniquepepcounts[prot] + if len(proteins) > 0 and summ > 0: + return pdrow.NormIntensity / (summ / len(proteins)) + # If there is no protein in the group, return np nan + return np.nan # type: ignore + + for entry in fasta_proteins: + digest = list() # type: list[str] + digestor.digest(AASequence().fromString(entry.sequence), digest, min_aa, max_aa) + digestuniq = set(digest) + # TODO: Try to get protein accessions from multiple databases. + protein_name = get_accession(entry.identifier) + uniquepepcounts[protein_name] = len(digestuniq) + protein_accessions.append(protein_name) + + data = pd.read_csv(peptides, sep=",") + data = data[data[PROTEIN_NAME].isin(protein_accessions)] + print(data.head()) + # next line assumes unique peptides only (at least per indistinguishable group) + + res = pd.DataFrame( + data.groupby([PROTEIN_NAME, SAMPLE_ID, CONDITION])[NORM_INTENSITY].sum() + ).apply(get_average_nr_peptides_unique_bygroup, 1) + res = res.sort_values(ascending=False) + res = res.to_frame() + res = res.reset_index() + res = res.rename(columns={0: IBAQ}) + + if normalize: + res = normalize_ibaq(res) + # Remove IBAQ_NORMALIZED NAN values + res = res.dropna(subset=[IBAQ_NORMALIZED]) + plot_column = IBAQ_PPB + else: + # Remove IBAQ NAN values + res = res.dropna(subset=[IBAQ]) + plot_column = IBAQ + + # Print the distribution of the protein IBAQ values + if verbose: + plot_width = len(set(res["SampleID"])) * 0.5 + 10 + pdf = PdfPages(qc_report) + density = plot_distributions( + res, + plot_column, + SAMPLE_ID, + log2=True, + width=plot_width, + title="IBAQ Distribution", + ) + plt.show() + pdf.savefig(density) + box = plot_box_plot( + res, + plot_column, + SAMPLE_ID, + log2=True, + width=plot_width, + title="IBAQ Distribution", + violin=False, + ) + plt.show() + pdf.savefig(box) + pdf.close() + + # # For absolute expression the relation is one sample + one condition + # condition = data[CONDITION].unique()[0] + # res[CONDITION] = condition.lower() + + res.to_csv(output, index=False) + + +if __name__ == "__main__": + ibaq_compute() diff --git a/build/lib/bin/compute_tpa.py b/build/lib/bin/compute_tpa.py new file mode 100644 index 0000000..208289f --- /dev/null +++ b/build/lib/bin/compute_tpa.py @@ -0,0 +1,229 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import os + +import click +import matplotlib.pyplot as plt +import pandas as pd +from matplotlib.backends.backend_pdf import PdfPages +from pyopenms import * + +from ibaq.ibaqpy_commons import (CONDITION, NORM_INTENSITY, PROTEIN_NAME, SAMPLE_ID, + plot_box_plot, plot_distributions, print_help_msg, + remove_contaminants_entrapments_decoys, get_accession) + + +def handle_nonstandard_aa(aa_seq: str) -> (list, str): + """Any nonstandard amoni acid will be removed. + + :param aa_seq: Protein sequences from multiple database. + :return: One list contains nonstandard amoni acids and one remain sequence. 
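+
+    A doctest-style sketch (per the standard alphabet below, X and U are
+    nonstandard):
+    >>> handle_nonstandard_aa("PEPTIDEXU")
+    (['X', 'U'], 'PEPTIDE')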
+ """ + standard_aa = 'ARNDBCEQZGHILKMFPSTWYV' + nonstandard_aa_lst = [aa for aa in aa_seq if aa not in standard_aa] + considered_seq = ''.join([aa for aa in aa_seq if aa in standard_aa]) + return nonstandard_aa_lst, considered_seq + + +@click.command() +@click.option("-f", "--fasta", help="Protein database") +@click.option( + "-p", + "--peptides", + help="Peptide identifications with intensities following the peptide intensity output", +) +@click.option("-r", "--ruler", help="Whether to use ProteomicRuler", is_flag=True) +@click.option("-n", "--ploidy", help="Ploidy number", default=2) +@click.option("-m", "--organism", help="Organism source of the data", default="human") +@click.option("-c", "--cpc", help="Cellular protein concentration(g/L)", default=200) +@click.option("-o", "--output", help="Output file with the proteins and other values") +@click.option( + "--verbose", + help="Print addition information about the distributions of the intensities, number of peptides remove " + "after normalization, etc.", + is_flag=True, +) +@click.option( + "--qc_report", + help="PDF file to store multiple QC images", + default="TPA-QCprofile.pdf", +) +def tpa_compute( + fasta: str, + peptides: str, + ruler: bool, + organism: str, + ploidy: int, + cpc: float, + output: str, + verbose: bool, + qc_report: str, +) -> None: + """ + This command computes the protein copies and concentrations according to a file output of peptides with the + format described in peptide_contaminants_file_generation.py. + :param fasta: Fasta file used to perform the peptide identification. + :param peptides: Peptide intensity file without normalization. + :param ruler: Whether to compute protein copies, weight and concentration. + :param organism: Organism source of the data. + :param ploidy: Ploidy number. + :param cpc: Cellular protein concentration(g/L). + :param output: Output format containing the TPA values, protein copy numbers and concentrations. + :param verbose: Print addition information. + :param qc_report: PDF file to store multiple QC images. 
+    if peptides is None or fasta is None:
+        print_help_msg(tpa_compute)
+        exit(1)
+
+    data = pd.read_csv(
+        peptides, sep=",", usecols=[PROTEIN_NAME, NORM_INTENSITY, SAMPLE_ID, CONDITION]
+    )
+    data[NORM_INTENSITY] = data[NORM_INTENSITY].astype("float")
+    data = data.dropna(subset=[NORM_INTENSITY])
+    data = data[data[NORM_INTENSITY] > 0]
+    print(data.head())
+
+    res = pd.DataFrame(
+        data.groupby([PROTEIN_NAME, SAMPLE_ID, CONDITION])[NORM_INTENSITY].sum()
+    )
+    res = res.reset_index()
+    proteins = res[PROTEIN_NAME].unique().tolist()
+    proteins = sum([i.split(";") for i in proteins], [])
+
+    # calculate molecular weight of quantified proteins
+    mw_dict = dict()
+    fasta_proteins = list()  # type: list[FASTAEntry]
+    FASTAFile().load(fasta, fasta_proteins)
+    for entry in fasta_proteins:
+        accession = get_accession(entry.identifier)
+        if accession in proteins:
+            try:
+                mw = AASequence().fromString(entry.sequence).getMonoWeight()
+                mw_dict[accession] = mw
+            except Exception:
+                error_aa, seq = handle_nonstandard_aa(entry.sequence)
+                mw = AASequence().fromString(seq).getMonoWeight()
+                mw_dict[accession] = mw
+                print(f"Nonstandard amino acids found in {accession}: {error_aa}, ignored!")
+
+    res = res[res[PROTEIN_NAME].isin(mw_dict.keys())]
+
+    # calculate TPA for every protein group
+    def get_protein_group_mw(group: str) -> float:
+        mw_list = [mw_dict[i] for i in group.split(";")]
+        return sum(mw_list)
+
+    res["MolecularWeight"] = res.apply(
+        lambda x: get_protein_group_mw(x[PROTEIN_NAME]), axis=1
+    )
+    res["MolecularWeight"] = res["MolecularWeight"].fillna(1)
+    res["MolecularWeight"] = res["MolecularWeight"].replace(0, 1)
+    res["TPA"] = res[NORM_INTENSITY] / res["MolecularWeight"]
+    # Print the distribution of the protein TPA values
+    if verbose:
+        plot_width = len(set(res[SAMPLE_ID])) * 0.5 + 10
+        pdf = PdfPages(qc_report)
+        density = plot_distributions(
+            res, "TPA", SAMPLE_ID, log2=True, width=plot_width, title="TPA Distribution"
+        )
+        plt.show()
+        pdf.savefig(density)
+        box = plot_box_plot(
+            res,
+            "TPA",
+            SAMPLE_ID,
+            log2=True,
+            width=plot_width,
+            title="TPA Distribution",
+            violin=False,
+        )
+        plt.show()
+        pdf.savefig(box)
+
+    # calculate protein weight (ng) and concentration (nM)
+    if ruler:
+        avogadro = 6.02214129e23
+        average_base_pair_mass = 617.96  # 615.8771
+
+        organism = organism.lower()
+        histone_df = pd.read_json(
+            open(os.path.split(__file__)[0] + os.sep + "histones.json", "rb")
+        ).T
+        target_histones = histone_df[histone_df["name"] == organism.lower()]
+        genome_size = target_histones["genome_size"].values[0]
+        histones_list = target_histones["histone_entries"].values[0]
+        dna_mass = ploidy * genome_size * average_base_pair_mass / avogadro
+
+        def calculate(protein_intensity, histone_intensity, mw):
+            copy = (protein_intensity / histone_intensity) * dna_mass * avogadro / mw
+            # The number of moles is equal to the number of particles divided by Avogadro's constant
+            moles = copy * 1e9 / avogadro  # unit nmol
+            weight = moles * mw  # unit ng
+            return tuple([copy, moles, weight])
+
+        def proteomic_ruler(df):
+            histone_intensity = df[df[PROTEIN_NAME].isin(histones_list)][
+                NORM_INTENSITY
+            ].sum()
+            histone_intensity = histone_intensity if histone_intensity > 0 else 1
+            df[["Copy", "Moles[nmol]", "Weight[ng]"]] = df.apply(
+                lambda x: calculate(
+                    x[NORM_INTENSITY], histone_intensity, x["MolecularWeight"]
+                ),
+                axis=1,
+                result_type="expand",
+            )
+            volume = df["Weight[ng]"].sum() * 1e-9 / cpc  # unit L
+            df["Concentration[nM]"] = df["Moles[nmol]"] / volume  # unit nM
+            return df
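+
+        # Sketch of the proteomic-ruler relations implemented by calculate():
+        #   copies = intensity / histone_intensity * dna_mass * N_A / MW
+        #   moles  = copies / N_A  (scaled by 1e9 to report nmol)
+        #   weight = moles * MW    (ng)
+        # The histone MS signal serves as an internal standard assumed
+        # proportional to the DNA mass per cell.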
+
+        res = res.groupby([CONDITION]).apply(proteomic_ruler)
+
+        if verbose:
+            density = plot_distributions(
+                res, "Copy", SAMPLE_ID, width=plot_width, log2=True, title="Copy numbers Distribution"
+            )
+            plt.show()
+            pdf.savefig(density)
+            box = plot_box_plot(
+                res,
+                "Copy",
+                SAMPLE_ID,
+                width=plot_width,
+                log2=True,
+                title="Copy numbers Distribution",
+                violin=False,
+            )
+            plt.show()
+            pdf.savefig(box)
+
+            density = plot_distributions(
+                res,
+                "Concentration[nM]",
+                SAMPLE_ID,
+                width=plot_width,
+                log2=True,
+                title="Concentration[nM] Distribution",
+            )
+            plt.show()
+            pdf.savefig(density)
+            box = plot_box_plot(
+                res,
+                "Concentration[nM]",
+                SAMPLE_ID,
+                width=plot_width,
+                log2=True,
+                title="Concentration[nM] Distribution",
+                violin=False,
+            )
+            plt.show()
+            pdf.savefig(box)
+            pdf.close()
+    res.to_csv(output, index=False)
+
+
+if __name__ == "__main__":
+    tpa_compute()
diff --git a/build/lib/bin/datasets_merger.py b/build/lib/bin/datasets_merger.py
new file mode 100644
index 0000000..6954dd6
--- /dev/null
+++ b/build/lib/bin/datasets_merger.py
@@ -0,0 +1,111 @@
+#!/usr/bin/env python
+import re
+
+import click
+
+from ibaq import __version__
+from ibaq.combiner import Combiner
+
+CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"])
+
+
+@click.group(context_settings=CONTEXT_SETTINGS)
+def cli():
+    """
+    This is the main tool that gives access to all commands.
+    """
+
+
+@click.version_option(
+    version=__version__, package_name="ibaqpy", message="%(package)s %(version)s"
+)
+@click.command("datasets_merge", short_help="Merge ibaq results from compute_ibaq")
+@click.option(
+    "--data_folder",
+    "-d",
+    help="Data folder containing SDRFs and ibaq CSVs.",
+    required=True,
+)
+@click.option(
+    "--output", "-o", help="Output file after batch effect removal.", required=True
+)
+@click.option(
+    "--covariate",
+    "-c",
+    default=None,
+    help="Covariate to take into account when datasets are merged.",
+)
+@click.option("--organism", help="Organism to keep in input data.", default="HUMAN")
+@click.option(
+    "--covariate_to_keep",
+    "-k",
+    help="Keep tissue parts from metadata, e.g.
'LV,RV,LA,RA'.", + default=None, +) +@click.option( + "--non_missing_percent_to_keep", + "-m", + help="non-missing values in each group.", + default=0.3, +) +@click.option( + "--skip_outliers_removal", + help="Skip removing outliers in all datasets.", + default=False, + is_flag=True, +) +@click.option( + "--n_components", + help="Number of principal components to be computed.", + default=None, +) +@click.option("--min_cluster", help="The minimum size of clusters.", default=None) +@click.option( + "--min_sample_num", + help="The minimum number of samples in a neighborhood for a point to be considered as a core point.", + default=None, +) +@click.option("--n_iter", help="Number of iterations to be performed.", default=None) +@click.option( + "--verbose/--quiet", + "-v/-q", + help="Output debug information.", + default=False, + is_flag=True, +) +@click.pass_context +def datasets_merge( + ctx, + data_folder: str, + output: str, + covariate: str, + organism: str, + covariate_to_keep: list, + non_missing_percent_to_keep: float, + skip_outliers_removal: bool, + n_components: int, + min_cluster: int, + min_sample_num: int, + n_iter: int, + verbose: bool, +): + if covariate_to_keep: + covariate_to_keep = re.split(",\s*", covariate_to_keep) + combiner = Combiner(data_folder=data_folder, covariate=covariate) + combiner.imputer(covariate_to_keep) + if not skip_outliers_removal: + combiner.outlier_removal(n_components, min_cluster, min_sample_num, n_iter) + combiner.batch_correction(n_components, covariate_to_keep) + result = combiner.df_corrected + result.to_csv(output, sep=",", index=True) + + +cli.add_command(datasets_merge) + + +def main(): + cli() + + +if __name__ == "__main__": + main() diff --git a/build/lib/bin/merge_condition_files.py b/build/lib/bin/merge_condition_files.py new file mode 100644 index 0000000..c0898c6 --- /dev/null +++ b/build/lib/bin/merge_condition_files.py @@ -0,0 +1,50 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + + +import os + +from ibaq.ibaqpy_commons import * + + +def print_help_msg(command) -> None: + """ + Print help information + :param command: command to print helps + :return: print help + """ + with click.Context(command) as ctx: + click.echo(command.get_help(ctx)) + + +@click.command() +@click.option( + "-i", "--input", help="Folder with all the Intensity files", required=True +) +@click.option("-o", "--output", help="Prefix name for the file to group by condition") +@click.option( + "-p", "--pattern", help="Prefix of the pattern name for all the files in the folder" +) +def merge_condition_generation(input: str, output: str, pattern: str) -> None: + """ + Merge all the files in a folder with the specific pattern + :param input: Input folder containing all the peptide Intensity files + :param output: Output file prefix with all the intensities + :param pattern: pattern of the files with the corresponding file name prefix + :return: + """ + + files = [f for f in os.listdir(input) if pattern in f] + df_from_each_file = (pd.read_csv(input + "/" + f) for f in files) + concatenated_df = pd.concat(df_from_each_file, ignore_index=True) + concatenated_df[CONDITION] = concatenated_df[CONDITION].str.lower() + print(concatenated_df.head()) + + for k, g in concatenated_df.groupby([CONDITION]): + g.to_csv( + f"{output}/{k}-grouped-Intensities.csv", index=False + ) # '{}.csv'.format(k) + + +if __name__ == "__main__": + merge_condition_generation() diff --git a/build/lib/bin/normalize_methods.py b/build/lib/bin/normalize_methods.py new file mode 100644 index 
0000000..fb3bd0e --- /dev/null +++ b/build/lib/bin/normalize_methods.py @@ -0,0 +1,85 @@ +import numpy as np +import pandas as pd +from sklearn.preprocessing import robust_scale +from sklearn.preprocessing import power_transform +from sklearn.preprocessing import quantile_transform + +def normalize(df,method): + match method: + case 'mean': + return mean_normalize(df) + case 'median': + return median_normalize(df) + case 'max': + return max_normalize(df) + case 'global': + return global_normalize(df) + case 'max_min': + return max_min_mormalize(df) + case 'z_score': + return z_score_normalize(df) + case 'iqr': + return iqr_normalize(df) + case 'robust': + return robust_normalize(df) + case 'vsn': + return vsn_normalize(df) + case 'quantile': + return quantile_normalize(df) + case _: + return -1 + +# mean +def mean_normalize(df): + return df / df.mean(axis=0) + +# median +def median_normalize(df): + return df / df.median(axis=0) + +#max +def max_normalize(df): + return df / df.max(axis=0) + +#global +def global_normalize(df): + return df / df.sum(axis=0) + +#max-min +def max_min_mormalize(df): + min = df.min(axis=0) + return (df - min) / (df.max(axis=0) - min) + +#z-score +def z_score_normalize(df): + return (df - df.mean(axis=0)) / df.var(axis=0) + +#IQR +def iqr_normalize(df): + Q = df.quantile([0.75,0.25],interpolation='linear',axis=0) + IQR = Q.loc[0.75,:] - Q.loc[0.25,:] + return (df - df.median(axis=0)) / IQR + +#rubust +def robust_normalize(df): + index = df.index + columns = df.columns + df = robust_scale(df, axis=0) + df = pd.DataFrame(df,columns=columns,index=index) + return df + +#vsn +def vsn_normalize(df): + index = df.index + columns = df.columns + df = power_transform(df, method='box-cox') + df = pd.DataFrame(df,columns=columns,index=index) + return df + +#quantile +def quantile_normalize(df): + index = df.index + columns = df.columns + DF = quantile_transform(df,axis=0) + df = pd.DataFrame(df,columns=columns,index=index) + return df \ No newline at end of file diff --git a/build/lib/bin/peptide_normalization.py b/build/lib/bin/peptide_normalization.py new file mode 100644 index 0000000..28c10cb --- /dev/null +++ b/build/lib/bin/peptide_normalization.py @@ -0,0 +1,1214 @@ +#!/usr/bin/env python +import gc +import click +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +from scipy.stats import rankdata +import os +import random +import uuid +from matplotlib.backends.backend_pdf import PdfPages +from pandas import DataFrame +import pyarrow.parquet as pq +from normalize_methods import normalize + +from ibaq.ibaqpy_commons import ( + BIOREPLICATE, + CHANNEL, + CONDITION, + PARQUET_COLUMNS, + FRACTION, + FRAGMENT_ION, + INTENSITY, + ISOTOPE_LABEL_TYPE, + NORM_INTENSITY, + PEPTIDE_CANONICAL, + PEPTIDE_CHARGE, + PEPTIDE_SEQUENCE, + PROTEIN_NAME, + REFERENCE, + RUN, + SAMPLE_ID, + STUDY_ID, + TMT16plex, + TMT11plex, + TMT10plex, + TMT6plex, + ITRAQ4plex, + ITRAQ8plex, + get_canonical_peptide, + get_spectrum_prefix, + get_study_accession, + parquet_map, + parse_uniprot_accession, + plot_box_plot, + plot_distributions, + remove_contaminants_entrapments_decoys, + remove_protein_by_ids, + sum_peptidoform_intensities, + get_peptidoform_normalize_intensities, + average_peptide_intensities, + print_help_msg, +) + + +def print_dataset_size(dataset: DataFrame, message: str, verbose: bool) -> None: + if verbose: + print(message + str(len(dataset.index))) + +def recover_df(df): + """ + This function is aimed to recover data shape. 
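+    It melts a wide table (one column per sample) back into long format with
+    SampleID and NormIntensity columns. Illustrative sketch (hypothetical values):
+
+    >>> wide = pd.DataFrame({"S1": [1.0], "S2": [2.0]})
+    >>> recover_df(wide)[["SampleID", "NormIntensity"]].values.tolist()
+    [['S1', 1.0], ['S2', 2.0]]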
+ """ + samples = df.columns.tolist() + out = pd.DataFrame() + for sample in samples: + samples_df = df[sample].dropna() + samples_df = samples_df.reset_index() + samples_df['SampleID'] = sample + samples_df.rename(columns={ + sample:NORM_INTENSITY + },inplace=True) + out = pd.concat([out,samples_df]) + return out + +def analyse_sdrf(sdrf_path: str, compression: bool) -> tuple: + """ + This function is aimed to parse SDRF and return four objects: + 1. sdrf_df: A dataframe with channels and references annoted. + 2. label: Label type of the experiment. LFQ, TMT or iTRAQ. + 3. sample_names: A list contains all sample names. + 4. choice: A dictionary caontains key-values between channel + names and numbers. + :param sdrf_path: File path of SDRF. + :param compression: Whether compressed. + :return: + """ + sdrf_df = pd.read_csv(sdrf_path, sep="\t", compression=compression) + sdrf_df[REFERENCE] = sdrf_df["comment[data file]"].apply(get_spectrum_prefix) + + labels = set(sdrf_df["comment[label]"]) + # Determine label type + label, choice = get_label(labels) + if label == "TMT": + choice_df = ( + pd.DataFrame.from_dict(choice, orient="index", columns=[CHANNEL]) + .reset_index() + .rename(columns={"index": "comment[label]"}) + ) + sdrf_df = sdrf_df.merge(choice_df, on="comment[label]", how="left") + elif label == "ITRAQ": + choice_df = ( + pd.DataFrame.from_dict(choice, orient="index", columns=[CHANNEL]) + .reset_index() + .rename(columns={"index": "comment[label]"}) + ) + sdrf_df = sdrf_df.merge(choice_df, on="comment[label]", how="left") + sample_names = sdrf_df["source name"].unique().tolist() + + return sdrf_df, label, sample_names, choice + + +def analyse_feature_df(feature_df: pd.DataFrame) -> tuple: + """Return label type, sample names and choice dict by iterating parquet. + + :param parquet_path: Feature parquet path. + :param batch_size: Iterate batch size, defaults to 100000 + :return: Label type, sample names and choice dict + """ + samples = feature_df["sample_accession"].unique().tolist() + labels = feature_df["isotope_label_type"].unique().tolist() + # Determine label type + label, choice = get_label(labels) + + return label, samples, choice + + +def analyse_feature_parquet(parquet_path: str, batch_size: int = 100000) -> tuple: + """Return label type, sample names and choice dict by iterating parquet. + + :param parquet_path: Feature parquet path. + :param batch_size: Iterate batch size, defaults to 100000 + :return: Label type, sample names and choice dict + """ + parquet_chunks = read_large_parquet(parquet_path, batch_size) + labels, samples = list(), list() + for chunk in parquet_chunks: + samples.extend(chunk["sample_accession"].unique().tolist()) + labels.extend(chunk["isotope_label_type"].unique().tolist()) + samples = list(set(samples)) + labels = list(set(labels)) + # Determine label type + label, choice = get_label(labels) + + return label, samples, choice + + +def read_large_parquet(parquet_path: str, batch_size: int = 100000): + parquet_file = pq.ParquetFile(parquet_path) + for batch in parquet_file.iter_batches(batch_size=batch_size): + batch_df = batch.to_pandas() + yield batch_df + + +def get_label(labels: list) -> (str, dict): + """Return label type and choice dict according to labels list. + + :param labels: Labels from SDRF. + :return: Tuple contains label type and choice dict. 
+ """ + choice = None + if len(labels) == 1: + label = "LFQ" + elif "TMT" in ",".join(labels) or "tmt" in ",".join(labels): + if ( + len(labels) > 11 + or "TMT134N" in labels + or "TMT133C" in labels + or "TMT133N" in labels + or "TMT132C" in labels + or "TMT132N" in labels + ): + choice = TMT16plex + elif len(labels) == 11 or "TMT131C" in labels: + choice = TMT11plex + elif len(labels) > 6: + choice = TMT10plex + else: + choice = TMT6plex + label = "TMT" + elif "ITRAQ" in ",".join(labels) or "itraq" in ",".join(labels): + if len(labels) > 4: + choice = ITRAQ8plex + else: + choice = ITRAQ4plex + label = "ITRAQ" + else: + exit("Warning: Only support label free, TMT and ITRAQ experiment!") + return label, choice + + +def msstats_common_process(data_df: pd.DataFrame) -> pd.DataFrame: + """Apply common process on data. + + :param data_df: Feature data in dataframe. + :return: Processed data. + """ + data_df.rename( + columns={ + "ProteinName": PROTEIN_NAME, + "PeptideSequence": PEPTIDE_SEQUENCE, + "PrecursorCharge": PEPTIDE_CHARGE, + "Run": RUN, + "Condition": CONDITION, + "Intensity": INTENSITY, + }, + inplace=True, + ) + data_df[REFERENCE] = data_df[REFERENCE].apply(get_spectrum_prefix) + + return data_df + + +def parquet_common_process( + data_df: pd.DataFrame, label: str, choice: dict +) -> pd.DataFrame: + """Apply common process on data. + + :param data_df: Feature data in dataframe. + :return: Processed data. + """ + data_df = data_df.rename(columns=parquet_map) + data_df[PROTEIN_NAME] = data_df.apply(lambda x: ",".join(x[PROTEIN_NAME]), axis=1) + if label == "LFQ": + data_df.drop(CHANNEL, inplace=True, axis=1) + else: + data_df[CHANNEL] = data_df[CHANNEL].map(choice) + + return data_df + + +def merge_sdrf( + label: str, sdrf_df: pd.DataFrame, data_df: pd.DataFrame +) -> pd.DataFrame: + if label == "LFQ": + result_df = pd.merge( + data_df, + sdrf_df[["source name", REFERENCE]], + how="left", + on=[REFERENCE], + ) + elif label == "TMT": + result_df = pd.merge( + data_df, + sdrf_df[["source name", REFERENCE, CHANNEL]], + how="left", + on=[REFERENCE, CHANNEL], + ) + elif label == "ITRAQ": + result_df = pd.merge( + data_df, + sdrf_df[["source name", REFERENCE, CHANNEL]], + how="left", + on=[REFERENCE, CHANNEL], + ) + result_df.rename(columns={"source name": SAMPLE_ID}, inplace=True) + result_df = result_df[result_df["Condition"] != "Empty"] + + return result_df + + +def data_common_process(data_df: pd.DataFrame, min_aa: int) -> pd.DataFrame: + # Remove 0 intensity signals from the data + data_df = data_df[data_df[INTENSITY] > 0] + data_df = data_df[data_df["Condition"] != "Empty"] + + def map_canonical_seq(data_df: pd.DataFrame) -> (pd.DataFrame, dict): + modified_seqs = data_df[PEPTIDE_SEQUENCE].unique().tolist() + canonical_seqs = [get_canonical_peptide(i) for i in modified_seqs] + inner_canonical_dict = dict(zip(modified_seqs, canonical_seqs)) + data_df[PEPTIDE_CANONICAL] = data_df[PEPTIDE_SEQUENCE].map(inner_canonical_dict) + + return data_df, inner_canonical_dict + + if PEPTIDE_CANONICAL not in data_df.columns: + data_df, inner_canonical_dict = map_canonical_seq(data_df) + data_df[PEPTIDE_CANONICAL] = data_df[PEPTIDE_SEQUENCE].map(inner_canonical_dict) + # Filter peptides with less amino acids than min_aa (default: 7) + data_df = data_df[ + data_df.apply(lambda x: len(x[PEPTIDE_CANONICAL]) >= min_aa, axis=1) + ] + data_df[PROTEIN_NAME] = data_df[PROTEIN_NAME].apply(parse_uniprot_accession) + data_df[STUDY_ID] = data_df[SAMPLE_ID].apply(get_study_accession) + if FRACTION not in 
data_df.columns: + data_df[FRACTION] = 1 + data_df = data_df[ + [ + PROTEIN_NAME, + PEPTIDE_SEQUENCE, + PEPTIDE_CANONICAL, + PEPTIDE_CHARGE, + INTENSITY, + REFERENCE, + CONDITION, + RUN, + BIOREPLICATE, + FRACTION, + FRAGMENT_ION, + ISOTOPE_LABEL_TYPE, + STUDY_ID, + SAMPLE_ID, + ] + ] + data_df[CONDITION] = pd.Categorical(data_df[CONDITION]) + data_df[STUDY_ID] = pd.Categorical(data_df[STUDY_ID]) + data_df[SAMPLE_ID] = pd.Categorical(data_df[SAMPLE_ID]) + + return data_df + +def intensity_normalization( + dataset: DataFrame, + field: str, + class_field: str, + scaling_method: str = "quantile", +) -> DataFrame: + cols_to_keep = [ + PROTEIN_NAME, + PEPTIDE_CANONICAL, + PEPTIDE_SEQUENCE, + PEPTIDE_CHARGE, + SAMPLE_ID, + BIOREPLICATE, + CONDITION, + NORM_INTENSITY, + ] + # TODO add imputation and/or removal to those two norm strategies + if scaling_method == "msstats": + # For TMT normalization + if "Channel" in dataset.columns: + g = dataset.groupby(["Run", "Channel"])[field].apply(np.nanmedian) + g.name = "RunMedian" + dataset = dataset.join(g, on=["Run", "Channel"]) + median_baseline = dataset.drop_duplicates(subset=["Run", "Channel", field])[ + field + ].median() + dataset[NORM_INTENSITY] = ( + dataset[field] - dataset["RunMedian"] + median_baseline + ) + else: + g = dataset.groupby(["Run", "Fraction"])[field].apply(np.nanmedian) + g.name = "RunMedian" + dataset = dataset.join(g, on=["Run", "Fraction"]) + dataset["FractionMedian"] = ( + dataset["RunMedian"].groupby(dataset["Fraction"]).transform("median") + ) + dataset[NORM_INTENSITY] = ( + dataset[field] - dataset["RunMedian"] + dataset["FractionMedian"] + ) + return dataset[cols_to_keep] + + else: + # pivot to have one col per sample + print("Transforming to wide format dataset size {}".format(len(dataset.index))) + normalize_df = pd.pivot_table( + dataset, + index=[ + PEPTIDE_SEQUENCE, + PEPTIDE_CANONICAL, + PEPTIDE_CHARGE, + FRACTION, + RUN, + BIOREPLICATE, + PROTEIN_NAME, + STUDY_ID, + CONDITION, + ], + columns=class_field, + values=field, + aggfunc={field: np.nanmean}, + observed=True, + ) + normalize_df = normalize(normalize_df,scaling_method) + # TODO: When restoring the pivot table here, the previous grouping caused + # the dataframe to produce a large number of rows with NORM_INTENSITY of + # NA at melt. This results in an unbearable memory consumption. + + normalize_df = recover_df(normalize_df) + normalize_df = normalize_df.drop_duplicates() + print(normalize_df.head()) + return normalize_df[cols_to_keep] + + +def remove_low_frequency_peptides_( + dataset_df: DataFrame, percentage_samples: float = 0.20 +): + """ + Remove peptides that are present in less than 20% of the samples. 
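+    Rows detected in only a single sample are dropped as well. Worked example
+    (hypothetical numbers): with 10 sample columns and percentage_samples=0.20,
+    any row with null_count >= 0.8 * 10 = 8 missing values is removed.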
+ :param dataset_df: dataframe with the data + :param percentage_samples: percentage of samples + :return: + """ + + normalize_df = pd.pivot_table( + dataset_df, + index=[PEPTIDE_CANONICAL, PROTEIN_NAME], + columns=SAMPLE_ID, + values=NORM_INTENSITY, + aggfunc={NORM_INTENSITY: np.nanmean}, + observed=True, + ) + # Count the number of null values in each row + null_count = normalize_df.isnull().sum(axis=1) + # Find the rows that have null values above the threshold + rows_to_drop = null_count[ + null_count >= (1 - percentage_samples) * normalize_df.shape[1] + ].index + # Drop the rows with too many null values + normalize_df = normalize_df.drop(rows_to_drop) + + # Remove rows with non-null values in only one column + normalize_df = normalize_df[ + normalize_df.notnull().sum(axis=1) != 1 + ] + """ + normalize_df = normalize_df.reset_index() + normalize_df = normalize_df.melt(id_vars=[PEPTIDE_CANONICAL, PROTEIN_NAME]) + normalize_df.rename(columns={"value": NORM_INTENSITY}, inplace=True) + """ + normalize_df = recover_df(normalize_df) + # recover condition column + normalize_df = normalize_df.merge( + dataset_df[[SAMPLE_ID, CONDITION]].drop_duplicates(subset=[SAMPLE_ID]), + on=SAMPLE_ID, + how="left", + ) + + # Remove rows with null values in NORMALIZE_INTENSITY + normalize_df = normalize_df[normalize_df[NORM_INTENSITY].notna()] + + print(normalize_df.head()) + return normalize_df + + +def peptide_intensity_normalization( + dataset_df: DataFrame, field: str, class_field: str, scaling_method: str +): + """ + Normalize the peptide intensities using different methods. + :param dataset_df: dataframe with the data + :param field: field to normalize + :param class_field: field to use as class + :param scaling_method: method to use for the normalization + :return: + """ + # pivot to have one col per sample + normalize_df = pd.pivot_table( + dataset_df, + index=[PEPTIDE_CANONICAL, PROTEIN_NAME, CONDITION], + columns=class_field, + values=field, + aggfunc={field: np.nanmean}, + observed=True, + ) + # need nomalize? + normalize_df = recover_df(normalize_df) + """ + normalize_df = normalize_df.reset_index() + normalize_df = normalize_df.melt( + id_vars=[PEPTIDE_CANONICAL, PROTEIN_NAME, CONDITION] + ) + normalize_df.rename(columns={"value": NORM_INTENSITY}, inplace=True) + normalize_df = normalize_df[normalize_df[NORM_INTENSITY].notna()] + """ + return normalize_df + + +def impute_peptide_intensities(dataset_df, field, class_field): + """ + Impute the missing values using different methods. 
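+    Note: in its current form only condition groups with a single sample are
+    handled (melted back to long format); the MissForest-based imputation for
+    multi-sample groups is left commented out below.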
+ :param dataset_df: dataframe with the data + :param field: field to impute + :param class_field: field to use as class + :return: + """ + normalize_df = pd.DataFrame() + # group by condition to detect missing values + for c, g in dataset_df.groupby(CONDITION): + # pivot to have one col per sample + group_normalize_df = pd.pivot_table( + g, + index=[PEPTIDE_CANONICAL, PROTEIN_NAME, CONDITION], + columns=class_field, + values=field, + aggfunc={field: np.nanmean}, + observed=True, + ) + + # no missing values group -> only one sample + if len(group_normalize_df.columns) < 2: + group_normalize_df = group_normalize_df.reset_index() + group_normalize_df = group_normalize_df.melt( + id_vars=[PEPTIDE_CANONICAL, PROTEIN_NAME, CONDITION] + ) + group_normalize_df.rename(columns={"value": NORM_INTENSITY}, inplace=True) + normalize_df = normalize_df.append(group_normalize_df, ignore_index=True) + # else: + # # print ("nothing") + # # Impute the missing values + # # imputer = MissForest(max_iter=5) + # # imputed_data = imputer.fit_transform(group_normalize_df) + # # group_normalize_df = pd.DataFrame(imputed_data, columns=group_normalize_df.columns, + # # index=group_normalize_df.index) + # # # Melt the dataframe + # # group_normalize_df = group_normalize_df.reset_index() + # # group_normalize_df = group_normalize_df.melt(id_vars=[PEPTIDE_CANONICAL, PROTEIN_NAME, CONDITION]) + # # group_normalize_df.rename(columns={'value': NORM_INTENSITY}, inplace=True) + # # normalize_df = normalize_df.append(group_normalize_df, ignore_index=True) + + return normalize_df + + +@click.command() +@click.option( + "-m", "--msstats", help="MsStats file import generated by quantms", default=None +) +@click.option( + "-p", "--parquet", help="Parquet file import generated by quantms.io", default=None +) +@click.option( + "-s", "--sdrf", help="SDRF file import generated by quantms", default=None +) +@click.option("--stream", help="Stream processing normalization", is_flag=True) +@click.option( + "--chunksize", + help="The number of rows of MSstats or parquet read using pandas streaming", + default=1000000, +) +@click.option( + "--min_aa", help="Minimum number of amino acids to filter peptides", default=7 +) +@click.option( + "--min_unique", + help="Minimum number of unique peptides to filter proteins", + default=2, +) +@click.option( + "--remove_ids", + help="Remove specific protein ids from the analysis using a file with one id per line", +) +@click.option( + "--remove_decoy_contaminants", + help="Remove decoy and contaminants proteins from the analysis", + is_flag=True, + default=False, +) +@click.option( + "--remove_low_frequency_peptides", + help="Remove peptides that are present in less than 20% of the samples", + is_flag=True, + default=False, +) +@click.option( + "--output", + help="Peptide intensity file including other all properties for normalization", +) +@click.option( + "--skip_normalization", help="Skip normalization step", is_flag=True, default=False +) +@click.option( + "--nmethod", + help="Normalization method used to normalize intensities for all samples (options: quantile, msstats, qnorm)", + default="quantile", +) +@click.option( + "--pnormalization", + help="Normalize the peptide intensities using different methods (options: qnorm)", + is_flag=True, +) +@click.option( + "--compress", + help="Read the input peptides file in compress gzip file", + is_flag=True, +) +@click.option( + "--log2", + help="Transform to log2 the peptide intensity values before normalization", + is_flag=True, +) +@click.option( + 
"--violin", + help="Use violin plot instead of boxplot for distribution representations", + is_flag=True, +) +@click.option( + "--verbose", + help="Print addition information about the distributions of the intensities, number of peptides remove " + "after normalization, etc.", + is_flag=True, +) +@click.option( + "--qc_report", + help="PDF file to store multiple QC images", + default="peptideNorm-QCprofile.pdf", +) +def peptide_normalization( + msstats: str, + parquet: str, + sdrf: str, + stream: bool, + chunksize: int, + min_aa: int, + min_unique: int, + remove_ids: str, + remove_decoy_contaminants: bool, + remove_low_frequency_peptides: bool, + output: str, + skip_normalization: bool, + nmethod: str, + pnormalization: bool, + compress: bool, + log2: bool, + violin: bool, + verbose: bool, + qc_report: str, +) -> None: + if output is None: + print_help_msg(peptide_normalization) + exit(1) + + if parquet is None and (msstats is None or sdrf is None): + print_help_msg(peptide_normalization) + exit(1) + + if pnormalization and nmethod not in ["qnorm", "quantile"]: + exit( + "Peptide intensity normalization works only with qnorm or quantile methods!" + ) + + if verbose: + log_after_norm = not log2 + + pd.set_option("display.max_columns", None) + compression_method = "gzip" if compress else None + print("Loading data..") + + if not stream: + if parquet is None: + # Read the msstats file + feature_df = pd.read_csv( + msstats, + sep=",", + compression=compression_method, + dtype={CONDITION: "category", ISOTOPE_LABEL_TYPE: "category"}, + ) + + # Read the sdrf file + sdrf_df, label, sample_names, choice = analyse_sdrf( + sdrf, compression_method + ) + print(sdrf_df) + + # Merged the SDRF with the Resulted file + dataset_df = msstats_common_process(feature_df) + dataset_df = merge_sdrf(label, sdrf_df, feature_df) + # Remove the intermediate variables and free the memory + del feature_df, sdrf_df + gc.collect() + else: + dataset_df = pd.read_parquet(parquet)[PARQUET_COLUMNS] + label, sample_names, choice = analyse_feature_df(dataset_df) + dataset_df = parquet_common_process(dataset_df, label, choice) + + dataset_df = data_common_process(dataset_df, min_aa) + # Only proteins with unique peptides number greater than min_unique (default: 2) are retained + unique_peptides = set( + dataset_df.groupby(PEPTIDE_CANONICAL) + .filter(lambda x: len(set(x[PROTEIN_NAME])) == 1)[PEPTIDE_CANONICAL] + .tolist() + ) + strong_proteins = set( + dataset_df[dataset_df[PEPTIDE_CANONICAL].isin(unique_peptides)] + .groupby(PROTEIN_NAME) + .filter(lambda x: len(set(x[PEPTIDE_CANONICAL])) >= min_unique)[ + PROTEIN_NAME + ] + .tolist() + ) + dataset_df = dataset_df[dataset_df[PROTEIN_NAME].isin(strong_proteins)] + + print(f"Number of unique peptides: {len(unique_peptides)}") + print(f"Number of strong proteins: {len(strong_proteins)}") + + print("Logarithmic if specified..") + dataset_df = dataset_df.rename(columns={INTENSITY: NORM_INTENSITY}) + if log2: + dataset_df[NORM_INTENSITY] = np.log2(dataset_df[NORM_INTENSITY]) + + # Print the distribution of the original peptide intensities from quantms analysis + if verbose: + sample_names = set(dataset_df[SAMPLE_ID]) + plot_width = len(sample_names) * 0.5 + 10 + pdf = PdfPages(qc_report) + density = plot_distributions( + dataset_df, + NORM_INTENSITY, + SAMPLE_ID, + log2=not log2, + width=plot_width, + title="Original peptidoform intensity distribution (no normalization)", + ) + #plt.show() + pdf.savefig(density) + """ + box = plot_box_plot( + dataset_df, + NORM_INTENSITY, + 
SAMPLE_ID, + log2=not log2, + width=plot_width, + title="Original peptidoform intensity distribution (no normalization)", + violin=violin, + ) + plt.show() + pdf.savefig(box) + """ + + # Remove high abundant and contaminants proteins and the outliers + if remove_ids is not None: + print("Remove proteins from file...") + dataset_df = remove_protein_by_ids(dataset_df, remove_ids) + if remove_decoy_contaminants: + print("Remove decoy and contaminants...") + dataset_df = remove_contaminants_entrapments_decoys(dataset_df) + + print_dataset_size(dataset_df, "Peptides after contaminants removal: ", verbose) + print("Normalize intensities.. ") + # dataset_df = dataset_df.dropna(how="any") + if not skip_normalization: + dataset_df = intensity_normalization( + dataset_df, + field=NORM_INTENSITY, + class_field=SAMPLE_ID, + scaling_method=nmethod, + ) + if verbose: + density = plot_distributions( + dataset_df, + NORM_INTENSITY, + SAMPLE_ID, + #log2=log_after_norm, + width=plot_width, + title="Peptidoform intensity distribution after normalization, method: " + + nmethod, + ) + #plt.show() + pdf.savefig(density) + """ + box = plot_box_plot( + dataset_df, + NORM_INTENSITY, + SAMPLE_ID, + log2=log_after_norm, + width=plot_width, + title="Peptidoform intensity distribution after normalization, method: " + + nmethod, + violin=violin, + ) + plt.show() + pdf.savefig(box) + """ + print("Number of peptides after normalization: " + str(len(dataset_df.index))) + print("Select the best peptidoform across fractions...") + dataset_df = get_peptidoform_normalize_intensities(dataset_df) + print( + "Number of peptides after peptidofrom selection: " + + str(len(dataset_df.index)) + ) + + print("Sum all peptidoforms per Sample...") + dataset_df = sum_peptidoform_intensities(dataset_df) + print("Number of peptides after selection: " + str(len(dataset_df.index))) + + print("Average all peptidoforms per Peptide/Sample...") + dataset_df = average_peptide_intensities(dataset_df) + print("Number of peptides after average: " + str(len(dataset_df.index))) + if verbose: + density = plot_distributions( + dataset_df, + NORM_INTENSITY, + SAMPLE_ID, + log2=log_after_norm, + width=plot_width, + title="Peptide intensity distribution method: " + nmethod, + ) + plt.show() + pdf.savefig(density) + box = plot_box_plot( + dataset_df, + NORM_INTENSITY, + SAMPLE_ID, + log2=log_after_norm, + width=plot_width, + title="Peptide intensity distribution method: " + nmethod, + violin=violin, + ) + plt.show() + pdf.savefig(box) + + if remove_low_frequency_peptides and len(sample_names) > 1: + print(dataset_df) + dataset_df = remove_low_frequency_peptides_(dataset_df, 0.20) + print_dataset_size( + dataset_df, "Peptides after remove low frequency peptides: ", verbose + ) + # Perform imputation using Random Forest in Peptide Intensities + # TODO: Check if this is necessary (Probably we can do some research if imputation at peptide level is necessary + # if impute: + # dataset_df = impute_peptide_intensities(dataset_df, field=NORM_INTENSITY, class_field=SAMPLE_ID) + + if pnormalization: + print("Normalize at Peptide level...") + dataset_df = peptide_intensity_normalization( + dataset_df, + field=NORM_INTENSITY, + class_field=SAMPLE_ID, + scaling_method=nmethod, + ) + + if verbose: + density = plot_distributions( + dataset_df, + NORM_INTENSITY, + SAMPLE_ID, + log2=log_after_norm, + width=plot_width, + title="Normalization at peptide level method: " + nmethod, + ) + plt.show() + pdf.savefig(density) + box = plot_box_plot( + dataset_df, + NORM_INTENSITY, + 
SAMPLE_ID, + log2=log_after_norm, + width=plot_width, + title="Normalization at peptide level method: " + nmethod, + violin=violin, + ) + plt.show() + pdf.savefig(box) + pdf.close() + + print("Save the normalized peptide intensities...") + dataset_df.to_csv(output, index=False, sep=",") + else: + if parquet is None: + sdrf_df, label, sample_names, choice = analyse_sdrf( + sdrf, compression_method + ) + msstats_chunks = pd.read_csv( + msstats, + sep=",", + compression=compression_method, + dtype={CONDITION: "category", ISOTOPE_LABEL_TYPE: "category"}, + chunksize=chunksize, + ) + else: + label, sample_names, choice = analyse_feature_parquet( + parquet, batch_size=chunksize + ) + msstats_chunks = read_large_parquet(parquet, batch_size=chunksize) + sample_number = len(sample_names) + + # TODO: Stream processing to obtain strong proteins with more than 2 uniqe peptides + temp = f"Temp-{str(uuid.uuid4())}/" + os.mkdir(temp) + print(f"INFO: Writing files into {temp}...") + unique_peptides = {} + group_intensities = {} + quantile = {} + print("INFO: First iteration to get unique peptides and strong proteins...") + for msstats_df in msstats_chunks: + if parquet is None: + msstats_df = msstats_common_process(msstats_df) + msstats_df = merge_sdrf(label, sdrf_df, msstats_df) + else: + msstats_df = parquet_common_process(msstats_df, label, choice) + result_df = data_common_process(msstats_df, min_aa) + + # Write CSVs by Sample ID + for sample in sample_names: + file_name = f"{temp}/{sample}.csv" + write_mode = "a" if os.path.exists(file_name) else "w" + header = False if os.path.exists(file_name) else True + result_df[result_df[SAMPLE_ID] == sample].to_csv( + file_name, index=False, header=header, mode=write_mode + ) + unique_df = result_df.groupby([PEPTIDE_CANONICAL]).filter( + lambda x: len(set(x[PROTEIN_NAME])) == 1 + )[[PEPTIDE_CANONICAL, PROTEIN_NAME]] + unique_dict = dict( + zip(unique_df[PEPTIDE_CANONICAL], unique_df[PROTEIN_NAME]) + ) + for i in unique_dict.keys(): + if i in unique_peptides.keys() and unique_dict[i] != unique_peptides[i]: + unique_peptides.pop(i) + else: + unique_peptides[i] = unique_dict[i] + + proteins_list = list(unique_peptides.values()) + count_dict = { + element: proteins_list.count(element) for element in set(proteins_list) + } + strong_proteins = [ + element for element in count_dict if count_dict[element] >= min_unique + ] + del proteins_list, count_dict + print(f"Number of unique peptides: {len(list(unique_peptides.keys()))}") + print(f"Number of strong proteins: {len(strong_proteins)}") + + # TODO: Filter proteins with less unique peptides than min_unique (default: 2) + plot_samples = random.sample(sample_names, min(len(sample_names), 20)) + plot_width = 10 + len(plot_samples) * 0.5 + pdf = PdfPages(qc_report) + original_intensities_df = pd.DataFrame() + + print("INFO: Second iteration to filter data and prepare normalization...") + print("Logarithmic if specified..") + norm_record = [0] * 2 + for sample in sample_names: + msstats_df = pd.read_csv(f"{temp}/{sample}.csv", sep=",") + msstats_df = msstats_df[msstats_df[PROTEIN_NAME].isin(strong_proteins)] + # Remove high abundant and contaminants proteins and the outliers + if remove_ids is not None: + msstats_df = remove_protein_by_ids(msstats_df, remove_ids) + if remove_decoy_contaminants: + msstats_df = remove_contaminants_entrapments_decoys(msstats_df) + norm_record[0] += len(msstats_df) + msstats_df = msstats_df.rename(columns={INTENSITY: NORM_INTENSITY}) + if log2: + msstats_df[NORM_INTENSITY] = 
np.log2(msstats_df[NORM_INTENSITY]) + if sample in plot_samples: + original_intensities_df = pd.concat( + [original_intensities_df, msstats_df] + ) + if not skip_normalization: + if nmethod == "msstats": + if label in ["TMT", "ITRAQ"]: + g = msstats_df.groupby(["Run", "Channel"]) + else: + g = msstats_df.groupby(["Run", "Fraction"]) + for name, group in g: + group_intensity = group[NORM_INTENSITY].tolist() + if name not in group_intensities: + group_intensities[name] = group_intensity + else: + group_intensities.update( + { + name: group_intensities[NORM_INTENSITY] + + group_intensity + } + ) + elif nmethod == "quantile": + msstats_df = ( + msstats_df.groupby( + [ + PEPTIDE_SEQUENCE, + PEPTIDE_CANONICAL, + PEPTIDE_CHARGE, + FRACTION, + RUN, + BIOREPLICATE, + PROTEIN_NAME, + STUDY_ID, + CONDITION, + ] + )[NORM_INTENSITY] + .agg(np.nanmean) + .reset_index() + ) + rank = msstats_df[NORM_INTENSITY].rank(method="average") + dic = dict(zip(rank, msstats_df[NORM_INTENSITY])) + if len(quantile) == 0: + quantile = {k: (v, 1) for k, v in dic.items()} + else: + # update = min(len(quantile), len(dic)) + intersec = set(quantile.keys()) & set(dic.keys()) + update = set(dic.keys()) - set(quantile.keys()) + quantile.update( + { + i: (quantile[i][0] + dic[i], quantile[i][1] + 1) + for i in intersec + } + ) + if len(update) > 0: + quantile.update({k: (dic[k], 1) for k in update}) + msstats_df[SAMPLE_ID] = sample + else: + exit("Stream process only supports msstats and quantile methods!") + msstats_df.to_csv(f"{temp}/{sample}.csv", index=False, sep=",") + norm_record[1] += len(msstats_df) + if not skip_normalization and nmethod == "quantile": + quantile = {k: v[0] / v[1] for k, v in quantile.items()} + print(f"Peptides after contaminants removal: {norm_record[0]}") + print(f"Number of peptides after normalization: {norm_record[1]}") + # Save original intensities QC plots + original_intensities_df = original_intensities_df.reset_index(drop=True) + density = plot_distributions( + original_intensities_df, + NORM_INTENSITY, + SAMPLE_ID, + log2=not log2, + width=plot_width, + title="Original peptidoform intensity distribution (no normalization)", + ) + pdf.savefig(density) + box = plot_box_plot( + original_intensities_df, + NORM_INTENSITY, + SAMPLE_ID, + log2=not log2, + width=plot_width, + title="Original peptidoform intensity distribution (no normalization)", + violin=violin, + ) + plt.show() + pdf.savefig(box) + del original_intensities_df + + # TODO: Peptide intensity normalization + peptides_count = pd.DataFrame( + columns=[PROTEIN_NAME, PEPTIDE_CANONICAL, "count"] + ) + norm_intensities_df = pd.DataFrame() + if not skip_normalization and nmethod == "msstats": + # For ISO normalization + if label in ["TMT", "ITRAQ"]: + median_baseline = np.nanmedian( + list(set(sum(group_intensities.values(), []))) + ) + group_intensities = { + key: np.nanmedian(list(values)) + for key, values in group_intensities.items() + } + else: + fractions = [i[1] for i in group_intensities.keys()] + fraction_median = {} + for fraction in fractions: + fraction_keys = [ + i for i in group_intensities.keys() if i[1] == fraction + ] + fraction_intensities = [] + for key in fraction_keys: + fraction_intensities.extend(group_intensities[key]) + fraction_median[fraction] = np.nanmedian(fraction_intensities) + group_intensities = { + key: np.nanmedian(values) + for key, values in group_intensities.items() + } + print("INFO: Third iteration to normalize and counting peptides frequency...") + size_record = [0] * 3 + + def normalization( + 
dataset_df, label, sample, skip_normalization, nmethod, record + ): + if not skip_normalization: + field = NORM_INTENSITY + if nmethod == "msstats": + # For ISO normalization + if label in ["TMT", "ITRAQ"]: + dataset_df.loc[:, NORM_INTENSITY] = dataset_df.apply( + lambda x: x[field] + - group_intensities[(x["Run"], x["Channel"])] + + median_baseline, + axis=1, + ) + else: + dataset_df.loc[:, NORM_INTENSITY] = dataset_df.apply( + lambda x: x[field] + - group_intensities[(x["Run"], x["Fraction"])] + + np.nanmedian( + [ + group_intensities[i] + for i in group_intensities.keys() + if i[1] == x["Fraction"] + ] + ), + axis=1, + ) + elif nmethod == "quantile": + rank = dataset_df[NORM_INTENSITY].rank(method="average") + ref_dict = dict(zip(rank, dataset_df[NORM_INTENSITY])) + ref_dict = {v: quantile[k] for k, v in ref_dict.items()} + dataset_df.loc[:, NORM_INTENSITY] = dataset_df.apply( + lambda x: ref_dict.get(x[NORM_INTENSITY], np.nan), + axis=1, + ) + dataset_df = dataset_df.drop_duplicates() + dataset_df = dataset_df[dataset_df[NORM_INTENSITY].notna()] + dataset_df = get_peptidoform_normalize_intensities(dataset_df) + record[0] += len(dataset_df.index) + dataset_df = sum_peptidoform_intensities(dataset_df) + record[1] += len(dataset_df.index) + dataset_df = average_peptide_intensities(dataset_df) + record[2] += len(dataset_df.index) + + return dataset_df, record + + for sample in sample_names: + dataset_df = pd.read_csv(f"{temp}/{sample}.csv", sep=",") + if len(dataset_df) != 0: + norm_df, size_record = normalization( + dataset_df, label, sample, skip_normalization, nmethod, size_record + ) + else: + continue + sample_peptides = norm_df[PEPTIDE_CANONICAL].unique().tolist() + if remove_low_frequency_peptides and sample_number > 1: + sample_peptides = norm_df[ + [PROTEIN_NAME, PEPTIDE_CANONICAL] + ].drop_duplicates() + sample_peptides["count"] = 1 + peptides_count = ( + pd.concat([peptides_count, sample_peptides]) + .groupby([PROTEIN_NAME, PEPTIDE_CANONICAL]) + .agg(sum) + .reset_index() + ) + norm_df.to_csv(f"{temp}/{sample}.csv", sep=",", index=False) + if sample in plot_samples: + norm_intensities_df = pd.concat([norm_intensities_df, norm_df]) + del group_intensities, quantile + print(f"Number of peptides after peptidofrom selection: {size_record[0]}") + print(f"Number of peptides after selection: {size_record[1]}") + print(f"Number of peptides after average: {size_record[2]}") + # Save normalized intensities QC plots + norm_intensities_df = norm_intensities_df.reset_index(drop=True) + density = plot_distributions( + norm_intensities_df, + NORM_INTENSITY, + SAMPLE_ID, + log2=log_after_norm, + width=plot_width, + title="Peptidoform intensity distribution after normalization, method: " + + nmethod, + ) + plt.show() + pdf.savefig(density) + box = plot_box_plot( + norm_intensities_df, + NORM_INTENSITY, + SAMPLE_ID, + log2=log_after_norm, + width=plot_width, + title="Peptidoform intensity distribution after normalization, method: " + + nmethod, + violin=violin, + ) + plt.show() + pdf.savefig(box) + del norm_intensities_df, strong_proteins + + print("INFO: Writing normalized intensities into CSV...") + if remove_low_frequency_peptides and sample_number > 1: + peptides_count = peptides_count.loc[ + (peptides_count["count"] > 0.20 * sample_number) + & (peptides_count["count"] != sample_number - 1) + ] + + final_norm_intensities_df = pd.DataFrame() + size_record = 0 + for sample in sample_names: + dataset_df = pd.read_csv(f"{temp}/{sample}.csv", sep=",") + if remove_low_frequency_peptides and 
sample_number > 1: + # Filter low-frequency peptides, which indicate whether the peptide occurs less than 20% in all samples or + # only in one sample + dataset_df = dataset_df.merge( + peptides_count[[PEPTIDE_CANONICAL, PROTEIN_NAME]], how="inner" + ) + size_record += len(dataset_df.index) + dataset_df = dataset_df[ + [PEPTIDE_CANONICAL, PROTEIN_NAME, SAMPLE_ID, NORM_INTENSITY, CONDITION] + ] + write_mode = "a" if os.path.exists(output) else "w" + header = False if os.path.exists(output) else True + dataset_df.to_csv(output, index=False, header=header, mode=write_mode) + dataset_df.to_csv(f"{temp}/{sample}.csv", sep=",", index=False) + if sample in plot_samples: + final_norm_intensities_df = pd.concat( + [final_norm_intensities_df, dataset_df] + ) + print(f"Peptides after remove low frequency peptides: {size_record}") + if remove_low_frequency_peptides: + del peptides_count + + # TODO: No peptides intensity normalization applied in stream processing. + # Save final normalized intensities QC plots + final_norm_intensities_df = final_norm_intensities_df.reset_index(drop=True) + density = plot_distributions( + final_norm_intensities_df, + NORM_INTENSITY, + SAMPLE_ID, + log2=log_after_norm, + width=plot_width, + title="Normalization at peptide level method: " + nmethod, + ) + plt.show() + pdf.savefig(density) + box = plot_box_plot( + final_norm_intensities_df, + NORM_INTENSITY, + SAMPLE_ID, + log2=log_after_norm, + width=plot_width, + title="Normalization at peptide level method: " + nmethod, + violin=violin, + ) + plt.show() + pdf.savefig(box) + pdf.close() + + +if __name__ == "__main__": + peptide_normalization() diff --git a/build/lib/bin/tsne_visualization.py b/build/lib/bin/tsne_visualization.py new file mode 100644 index 0000000..82488fb --- /dev/null +++ b/build/lib/bin/tsne_visualization.py @@ -0,0 +1,187 @@ +# import libraries +import glob +import math + +import click +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import seaborn as sns +from sklearn.decomposition import PCA +from sklearn.manifold import TSNE + +from ibaq.ibaqpy_commons import (IBAQ_LOG, IBAQ_NORMALIZED, PROTEIN_NAME, + SAMPLE_ID) + + +# function to compute principal components +def compute_pca(df, n_components=5) -> pd.DataFrame: + """ + Compute principal components for a given dataframe. + + Parameters + ---------- + df : pd.DataFrame + A dataframe with samples as rows and features as columns. + n_components : int + Number of principal components to be computed. + + Returns + ------- + df_pca : pd.DataFrame + A dataframe with the principal components. 
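+
+    Example
+    -------
+    Assuming ``expr`` is a samples-by-proteins matrix (hypothetical variable):
+
+    >>> df_pca = compute_pca(expr, n_components=5)
+    >>> list(df_pca.columns)
+    ['PC1', 'PC2', 'PC3', 'PC4', 'PC5']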
+ """ + + pca = PCA(n_components=n_components) + pca.fit(df) + df_pca = pca.transform(df) + + df_pca = pd.DataFrame( + df_pca, index=df.index, columns=[f"PC{i}" for i in range(1, n_components + 1)] + ) + + plt.rcParams["figure.figsize"] = (12, 6) + + fig, ax = plt.subplots() + xi = np.arange(1, n_components + 1, step=1) + y = np.cumsum(pca.explained_variance_ratio_) + + plt.ylim(0.0, 1.1) + plt.plot(xi, y, marker="o", linestyle="--", color="b") + + plt.xlabel("Number of Components") + plt.xticks( + np.arange(0, n_components, step=1) + ) # change from 0-based array index to 1-based human-readable label + plt.ylabel("Cumulative variance (%)") + plt.title("The number of components needed to explain variance") + + plt.axhline(y=0.95, color="r", linestyle="-") + plt.text(0.5, 0.85, "95% cut-off threshold", color="red", fontsize=16) + + ax.grid(axis="x") + plt.show() + + return df_pca + + +def compute_tsne(df_pca, n_components=2, perplexity=30, learning_rate=200, n_iter=2000): + """ + Compute t-SNE components from PCA components. + + This function applies t-SNE (t-Distributed Stochastic Neighbor Embedding) to the input DataFrame, + which is expected to contain PCA components with samples as rows. The output is another DataFrame + that contains t-SNE components, also with samples as rows. + + Parameters + ---------- + df_pca : pandas DataFrame + Input DataFrame containing PCA components. Rows are samples and columns are PCA components. + n_components : int, optional + The number of dimensions for the t-SNE components (default is 2). + perplexity : float, optional + The perplexity parameter for t-SNE, which can influence the balance between maintaining + the local and global structure of the data (default is 30). + learning_rate : float, optional + The learning rate for t-SNE (default is 200). + n_iter : int, optional + The number of iterations for t-SNE optimization (default is 2000). + + Returns + ------- + df_tsne : pandas DataFrame + Output DataFrame containing t-SNE components. Rows are samples and columns are t-SNE components. + + Example + ------- + df_pca = pd.DataFrame(data, columns=['PC1', 'PC2', 'PC3']) + df_tsne = compute_tsne(df_pca) + """ + + tsne = TSNE( + n_components=n_components, + perplexity=perplexity, + learning_rate=learning_rate, + n_iter=n_iter, + ) + tsne_results = tsne.fit_transform(np.asarray(df_pca)) + + tsne_cols = [f"tSNE{i + 1}" for i in range(n_components)] + + df_tsne = pd.DataFrame(data=tsne_results, columns=tsne_cols) + df_tsne.index = df_pca.index + return df_tsne + + +def plot_tsne(df, x_col, y_col, hue_col, file_name): + fig, ax = plt.subplots(1, 1, figsize=(20, 10)) + sns.scatterplot( + x=x_col, y=y_col, hue=hue_col, data=df, ax=ax, markers=["o", "+", "x"] + ) + ax.set_xlabel(x_col) + ax.set_ylabel(y_col) + ax.set_title(f"{x_col} vs {y_col} with {hue_col} information") + # set legend inside the plot left an upper corner + plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0, fontsize=8) + plt.subplots_adjust(right=0.8) + plt.savefig(file_name) + + +@click.command() +@click.option( + "-f", "--folder", help="Folder that contains all the protein files", required=True +) +@click.option( + "-o", + "--pattern", + help="Protein file pattern", + required=False, + default="proteins.tsv", +) +def tsne_visualization(folder: str, pattern: str): + """ + Generate t-SNE plots from protein files in a folder. 
+    :param folder: Folder that contains all the protein files
+    :param pattern: Protein file pattern
+    """
+    # get all the files in the folder
+    files = glob.glob(f"{folder}/*{pattern}")
+
+    # get the files into pandas selected columns
+    # (Proteins accession, Sample ID, Reanalysis accession, Intensity)
+
+    dfs = []  # list of dataframes
+
+    for f in files:
+        reanalysis = (f.split("/")[-1].split("_")[0]).replace("-proteins.tsv", "")
+        dfs += [
+            pd.read_csv(f, usecols=[PROTEIN_NAME, SAMPLE_ID, IBAQ_LOG], sep=",").assign(
+                reanalysis=reanalysis
+            )
+        ]
+
+    total_proteins = pd.concat(dfs, ignore_index=True)
+
+    normalize_df = pd.pivot_table(
+        total_proteins,
+        index=[SAMPLE_ID, "reanalysis"],
+        columns=PROTEIN_NAME,
+        values=IBAQ_LOG,
+    )
+    normalize_df = normalize_df.fillna(0)
+    df_pca = compute_pca(normalize_df, n_components=30)
+    df_tsne = compute_tsne(df_pca)
+
+    batch = df_tsne.index.get_level_values("reanalysis").tolist()
+    df_tsne["batch"] = batch
+
+    # plot the t-SNE components tSNE1 vs tSNE2 with batch information using seaborn
+    plot_tsne(
+        df_tsne, "tSNE1", "tSNE2", "batch", "5.tsne_plot_with_batch_information.pdf"
+    )
+
+    print(total_proteins.shape)
+
+
+if __name__ == "__main__":
+    tsne_visualization()
diff --git a/build/lib/ibaq/__init__.py b/build/lib/ibaq/__init__.py
new file mode 100644
index 0000000..27fdca4
--- /dev/null
+++ b/build/lib/ibaq/__init__.py
@@ -0,0 +1 @@
+__version__ = "0.0.3"
diff --git a/build/lib/ibaq/combiner.py b/build/lib/ibaq/combiner.py
new file mode 100644
index 0000000..8625e4a
--- /dev/null
+++ b/build/lib/ibaq/combiner.py
@@ -0,0 +1,226 @@
+import logging
+import os
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+
+from ibaq.ibaqpy_commons import load_feature, load_sdrf
+from ibaq.utils import (apply_batch_correction, compute_pca, fill_samples,
+                        filter_missing_value_by_group, folder_retrieval,
+                        generate_meta, get_batch_info_from_sample_names,
+                        impute_missing_values, iterative_outlier_removal,
+                        plot_pca, remove_single_sample_batches,
+                        split_df_by_column)
+
+logging.basicConfig(
+    format="%(asctime)s [%(funcName)s] - %(message)s", level=logging.DEBUG
+)
+logger = logging.getLogger(__name__)
+
+
+class Combiner:
+    def __init__(
+        self, data_folder: os.PathLike, covariate: str = None, organism: str = "HUMAN"
+    ):
+        """Generate concatenated IbaqNorm and metadata."""
+        logger.info("Combining SDRFs and ibaq results ...")
+        self.data_folder = Path(data_folder)
+        if not self.data_folder.exists() or not self.data_folder.is_dir():
+            raise FileNotFoundError(f"Data folder {self.data_folder} does not exist!")
+        self.covariate = covariate
+        files = folder_retrieval(str(self.data_folder))
+        self.metadata, self.df = pd.DataFrame(), pd.DataFrame()
+        for sdrf in files["sdrf"]:
+            sdrf_df = load_sdrf(sdrf)
+            self.metadata = pd.concat([self.metadata, generate_meta(sdrf_df)])
+        self.metadata = self.metadata.drop_duplicates()
+        self.metadata.index = self.metadata["sample_id"]
+
+        for ibaq in files["ibaq"]:
+            self.df = pd.concat([self.df, load_feature(ibaq)])
+        self.df = self.df[self.df["ProteinName"].str.endswith(organism)]
+        self.df.index = self.df["SampleID"]
+        self.df = self.df.join(self.metadata, how="left")
+        print(self.metadata, self.df.head())
+
+    def read_data(self, meta: str, ibaq: str, organism="HUMAN", covariate=None):
+        """Read metadata and IbaqNorm locally.
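+
+        Sketch of intended use (paths are placeholders)::
+
+            combiner.read_data(meta="metadata.csv", ibaq="ibaq_norm.csv")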
+ + param meta: + param ibaq: + """ + self.covariate = covariate + self.df = pd.read_csv(ibaq, index_col=0) + self.metadata = pd.read_csv(meta) + self.df = self.df[self.df["ProteinName"].str.endswith(organism)] + self.df.index = self.df["SampleID"] + self.metadata = self.metadata.drop_duplicates() + self.df = self.df.join(self.metadata, how="left") + + def imputer(self, covariate_to_keep: list = None): + logger.info("Imputing merged ibaq results ...") + # Keep only columns 'sample_id' and covariate from df_metadata + if self.covariate: + if len(self.metadata[self.covariate].unique()) < 2: + raise SystemExit( + f"{self.covariate} should contain at least two different covariates!" + ) + + # Keep only rows within covariate_to_keep, you can keep tissue or tissue part you want. + if covariate_to_keep: + self.df = self.df[self.df[self.covariate].isin(covariate_to_keep)] + + # keep columns with at least 30% of non-missing values in each covariate_index group + self.df = filter_missing_value_by_group( + self.df, col="ProteinName", non_missing_percent_to_keep=0.3 + ) + self.proteins = self.df["ProteinName"].unique().tolist() + + # TODO: Data for imputation should take samples as columns, proteins as rows. [Expression Matrix] + # Also need to fill the proteins didn't show in original results for each sample. + if self.covariate: + # split df by covariates + df_list = split_df_by_column(self.df, cov_index_col=self.covariate) + df_list = [fill_samples(df, self.proteins) for df in df_list] + + # impute missing values with KNNImputer for every df in df_list + df_list = impute_missing_values(df_list) + + # concatenate all dataframes in df_list into one dataframe + self.df = pd.concat(df_list, axis=1) + else: + self.df = fill_samples(self.df, self.proteins) + self.df = impute_missing_values(self.df) + + self.samples = self.df.columns.tolist() + self.datasets = list(set([sample.split("-")[0] for sample in self.samples])) + print(self.df.head) + + def outlier_removal( + self, + n_components: int = None, + min_cluster_size: int = None, + min_samples_num: int = None, + n_iter: int = None, + ): + logger.info("Removing outliers from imputed data ...") + # Apply iterative outlier removal on imputed data + # get batch indices from the columns names + batches = [sample.split("-")[0] for sample in self.samples] + self.samples_number = { + dataset: batches.count(dataset) for dataset in self.datasets + } + min_samples = round(np.median(list(self.samples_number.values()))) + if min_samples == 1: + min_samples = 2 + self.batch_index = get_batch_info_from_sample_names(self.df.columns) + # apply iterative outlier removal + self.df_filtered_outliers = iterative_outlier_removal( + self.df, + self.batch_index, + n_components=n_components + if n_components + else round(len(set(self.batch_index)) / 3), + min_cluster_size=min_cluster_size if min_cluster_size else min_samples, + min_samples=min_samples_num if min_samples_num else min_samples, + n_iter=n_iter if n_iter else 5, + ) + print(self.df_filtered_outliers) + # plot PCA of corrected data with outliers removed + # transpose the dataframe to get samples as rows and features as columns + self.df_pca = compute_pca( + self.df_filtered_outliers.T, + n_components=n_components + if n_components + else round(len(set(self.batch_index)) / 3), + ) + + # add batch information to the dataframe + self.df_pca["batch"] = self.df_pca.index.str.split("-").str[0] + + # plot PC1 vs PC2 with batch information using seaborn + # put the legend outside the plot + # save the plot as a png file + 
plot_pca( + self.df_pca, + title="PCA plot of corrected data with outliers removed", + output_file="pca_corrected_outliers_removed.png", + ) + + def batch_correction( + self, n_components: int = None, tissue_parts_to_keep: int = None + ): + logger.info("Applying batch effect correction ...") + # Plot PCA of uncorrected imputed data + # transpose the dataframe to get samples as rows and features as columns + self.df_pca = compute_pca( + self.df.T, + n_components=n_components + if n_components + else round(len(set(self.batch_index)) / 3), + ) + + # add batch information to the dataframe + self.df_pca["batch"] = self.df_pca.index.str.split("-").str[0] + + # plot PC1 vs PC2 with batch information using seaborn + # put the legend outside the plot + # save the plot as a png file + plot_pca( + self.df_pca, + title="PCA plot of uncorrected data", + output_file="pca_uncorrected.png", + ) + + # keep samples only in tissue_part from metadata + ## TODO: specify covariates + if tissue_parts_to_keep: + self.metadata = self.metadata[ + self.metadata["tissue_part"].isin(tissue_parts_to_keep) + ] + samples_to_keep = self.metadata["sample_id"].tolist() + + # keep samples in df that are also in samples_to_keep + self.df = self.df[[s for s in self.df.columns if s in samples_to_keep]] + + # 2. Apply batch correction with covariate information + # Before apply batch correction, filter out batches with just one sample (otherwise the batch correction will fail). + batch_index = get_batch_info_from_sample_names(self.df.columns.tolist()) + self.df = remove_single_sample_batches(self.df, batch_index) + self.batch_index = get_batch_info_from_sample_names(self.df.columns.tolist()) + + # get covariate information from metadata. + columns = self.df.columns.tolist() + self.metadata = self.metadata[self.metadata["sample_id"].isin(columns)] + # reorder metadata to match the order of columns in df + self.metadata = self.metadata.reset_index(drop=True) + self.metadata = ( + self.metadata.set_index("sample_id").reindex(columns, axis=0).reset_index() + ) + if self.covariate: + # get the covariates from metadata as a list + covariates_index = self.metadata[self.covariate].tolist() + else: + covariates_index = [] + + # apply batch correction + self.df_corrected = apply_batch_correction( + self.df, self.batch_index, covs=covariates_index + ) + print(self.df_corrected) + + # plot PCA of corrected data + # transpose the dataframe to get samples as rows and features as columns + self.df_pca = compute_pca(self.df_corrected.T, n_components=5) + # add batch information to the dataframe + self.df_pca["batch"] = self.df_pca.index.str.split("-").str[0] + + # plot PC1 vs PC2 with batch information using seaborn + # put the legend outside the plot + # save the plot as a png file + plot_pca( + self.df_pca, + title="PCA plot of corrected data", + output_file="pca_corrected.png", + ) diff --git a/build/lib/ibaq/ibaqpy_commons.py b/build/lib/ibaq/ibaqpy_commons.py new file mode 100644 index 0000000..1d72e1f --- /dev/null +++ b/build/lib/ibaq/ibaqpy_commons.py @@ -0,0 +1,434 @@ +import os +import re + +import click +import matplotlib +import numpy as np +import pandas as pd +import seaborn as sns +from matplotlib import pyplot as plt +from pandas import DataFrame + +PARQUET_COLUMNS = [ + "protein_accessions", + "peptidoform", + "sequence", + "charge", + "fragment_ion", + "isotope_label_type", + "channel", + "condition", + "biological_replicate", + "run", + "fraction", + "intensity", + "reference_file_name", + "sample_accession", +] + 
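+# PARQUET_COLUMNS above is the column layout expected from quantms.io feature
+# parquet files; parquet_map (defined below) renames them to the internal
+# column names used throughout ibaqpy.
+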
+PROTEIN_NAME = "ProteinName" +PEPTIDE_SEQUENCE = "PeptideSequence" +PEPTIDE_CANONICAL = "PeptideCanonical" +PEPTIDE_CHARGE = "PrecursorCharge" +FRAGMENT_ION = "FragmentIon" +PRODUCT_CHARGE = "ProductCharge" +ISOTOPE_LABEL_TYPE = "IsotopeLabelType" +CHANNEL = "Channel" +MIXTRUE = "Mixture" +TECHREPMIXTURE = "TechRepMixture" +CONDITION = "Condition" +BIOREPLICATE = "BioReplicate" +RUN = "Run" +FRACTION = "Fraction" +INTENSITY = "Intensity" +NORM_INTENSITY = "NormIntensity" +RT = "Rt" +REFERENCE = "Reference" +SAMPLE_ID = "SampleID" +STUDY_ID = "StudyID" +SEARCH_ENGINE = "searchScore" +SCAN = "Scan" +MBR = "MatchBetweenRuns" +IBAQ = "Ibaq" +IBAQ_NORMALIZED = "IbaqNorm" +IBAQ_LOG = "IbaqLog" +IBAQ_PPB = "IbaqPpb" + +parquet_map = { + "protein_accessions": PROTEIN_NAME, + "peptidoform": PEPTIDE_SEQUENCE, + "sequence": PEPTIDE_CANONICAL, + "charge": PEPTIDE_CHARGE, + "fragment_ion": FRAGMENT_ION, + "isotope_label_type": ISOTOPE_LABEL_TYPE, + "channel": CHANNEL, + "condition": CONDITION, + "biological_replicate": BIOREPLICATE, + "run": RUN, + "fraction": FRACTION, + "intensity": INTENSITY, + "reference_file_name": REFERENCE, + "sample_accession": SAMPLE_ID, +} + +TMT16plex = { + "TMT126": 1, + "TMT127N": 2, + "TMT127C": 3, + "TMT128N": 4, + "TMT128C": 5, + "TMT129N": 6, + "TMT129C": 7, + "TMT130N": 8, + "TMT130C": 9, + "TMT131N": 10, + "TMT131C": 11, + "TMT132N": 12, + "TMT132C": 13, + "TMT133N": 14, + "TMT133C": 15, + "TMT134N": 16, +} + +TMT11plex = { + "TMT126": 1, + "TMT127N": 2, + "TMT127C": 3, + "TMT128N": 4, + "TMT128C": 5, + "TMT129N": 6, + "TMT129C": 7, + "TMT130N": 8, + "TMT130C": 9, + "TMT131N": 10, + "TMT131C": 11, +} + +TMT10plex = { + "TMT126": 1, + "TMT127N": 2, + "TMT127C": 3, + "TMT128N": 4, + "TMT128C": 5, + "TMT129N": 6, + "TMT129C": 7, + "TMT130N": 8, + "TMT130C": 9, + "TMT131": 10, +} + +TMT6plex = { + "TMT126": 1, + "TMT127": 2, + "TMT128": 3, + "TMT129": 4, + "TMT130": 5, + "TMT131": 6, +} + +ITRAQ4plex = {"ITRAQ114": 1, "ITRAQ115": 2, "ITRAQ116": 3, "ITRAQ117": 4} + +ITRAQ8plex = { + "ITRAQ113": 1, + "ITRAQ114": 2, + "ITRAQ115": 3, + "ITRAQ116": 4, + "ITRAQ117": 5, + "ITRAQ118": 6, + "ITRAQ119": 7, + "ITRAQ121": 8, +} + + +def print_help_msg(command: click.Command): + """ + Print the help of the command + :param command: click command object + :return: None + """ + with click.Context(command) as ctx: + click.echo(command.get_help(ctx)) + + +def get_accession(identifier: str) -> str: + """ + Get protein accession from the identifier (e.g. sp|P12345|PROT_NAME) + :param identifier: Protein identifier + :return: Protein accession + """ + identifier_lst = identifier.split("|") + if len(identifier_lst) == 1: + return identifier_lst[0] + else: + return identifier_lst[1] + + +def remove_protein_by_ids( + dataset: DataFrame, protein_file: str, protein_field=PROTEIN_NAME +) -> DataFrame: + """ + This method reads a file with a list of contaminants and high abudant proteins and + remove them from the dataset. 
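+    The file is expected to contain one identifier per line (hypothetical
+    example: a line "P02768" drops every row whose protein field contains
+    that accession, since matching is a substring regex).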
+    :param dataset: Peptide intensity DataFrame
+    :param protein_file: file with the protein identifiers to remove, one per line
+    :param protein_field: protein field
+    :return: dataset with the filtered proteins
+    """
+    contaminants_reader = open(protein_file, "r")
+    contaminants = contaminants_reader.read().split("\n")
+    contaminants = [cont for cont in contaminants if cont.strip()]
+    cregex = "|".join(contaminants)
+    return dataset[~dataset[protein_field].str.contains(cregex)]
+
+
+def remove_contaminants_entrapments_decoys(
+    dataset: DataFrame, protein_field=PROTEIN_NAME
+) -> DataFrame:
+    """
+    This method removes rows whose protein field is flagged with the
+    CONTAMINANT, ENTRAPMENT or DECOY markers.
+    :param dataset: Peptide intensity DataFrame
+    :param protein_field: protein field
+    :return: dataset with the filtered proteins
+    """
+    contaminants = []
+    contaminants.append("CONTAMINANT")
+    contaminants.append("ENTRAPMENT")
+    contaminants.append("DECOY")
+    cregex = "|".join(contaminants)
+    return dataset[~dataset[protein_field].str.contains(cregex)]
+
+
+def get_canonical_peptide(peptide_sequence: str) -> str:
+    """
+    This function returns a peptide sequence without the modification information
+    :param peptide_sequence: peptide sequence with mods
+    :return: peptide sequence
+    """
+    clean_peptide = re.sub(r"[\(\[].*?[\)\]]", "", peptide_sequence)
+    clean_peptide = clean_peptide.replace(".", "").replace("-", "")
+    return clean_peptide
+
+
+def plot_distributions(
+    dataset: DataFrame,
+    field: str,
+    class_field: str,
+    title: str = "",
+    log2: bool = True,
+    width: float = 10,
+) -> matplotlib.pyplot:
+    """
+    Plot the density distributions of the dataset
+    :param dataset: DataFrame
+    :param field: Field of the dataframe used to plot the distributions
+    :param class_field: Field to group the distributions into classes
+    :param title: Title of the plot
+    :param log2: Log2-transform the intensity values
+    :param width: size of the plot
+    :return:
+    """
+    pd.set_option("mode.chained_assignment", None)
+    normalize = dataset[[field, class_field]].reset_index(drop=True)
+    if log2:
+        normalize[field] = np.log2(normalize[field])
+    normalize.dropna(subset=[field], inplace=True)
+    data_wide = normalize.pivot(columns=class_field, values=field)
+    # plotting multiple density plots
+    data_wide.plot.kde(figsize=(width, 8), linewidth=2, legend=False)
+    plt.title(title)
+    pd.set_option("mode.chained_assignment", "warn")
+
+    return plt.gcf()
+
+
+def plot_box_plot(
+    dataset: DataFrame,
+    field: str,
+    class_field: str,
+    log2: bool = False,
+    width: float = 10,
+    rotation: int = 30,
+    title: str = "",
+    violin: bool = False,
+) -> matplotlib.pyplot:
+    """
+    Plot a box plot of the given intensity field grouped by the class field
+    :param violin: Also add a violin on top of the box plot
+    :param dataset: Dataframe with peptide intensities
+    :param field: Intensity field
+    :param class_field: class to group the peptides
+    :param log2: transform peptide intensities to log scale
+    :param width: size of the plot
+    :param rotation: rotation of the x-axis labels
+    :param title: Title of the box plot
+    :return:
+    """
+    pd.set_option("mode.chained_assignment", None)
+    normalized = dataset[[field, class_field]]
+    np.seterr(divide="ignore")
+    plt.figure(figsize=(width, 14))
+    if log2:
+        normalized[field] = np.log2(normalized[field])
+
+    if violin:
+        chart = sns.violinplot(
+            x=class_field,
+            y=field,
+            data=normalized,
+            boxprops=dict(alpha=0.3),
+            palette="muted",
+        )
+    else:
+        chart = sns.boxplot(
+            x=class_field,
+            y=field,
data=normalized, + boxprops=dict(alpha=0.3), + palette="muted", + ) + + chart.set(title=title) + chart.set_xticklabels(chart.get_xticklabels(), rotation=rotation, ha="right") + pd.set_option("mode.chained_assignment", "warn") + + return plt.gcf() + + +def sum_peptidoform_intensities(dataset: DataFrame) -> DataFrame: + """ + Sum the peptidoform intensities for all peptidofrom across replicates of the same sample. + :param dataset: Dataframe to be analyzed + :return: dataframe with the intensities + """ + dataset = dataset[dataset[NORM_INTENSITY].notna()] + normalize_df = dataset.groupby( + [PEPTIDE_CANONICAL, SAMPLE_ID, BIOREPLICATE, CONDITION], observed=True + )[NORM_INTENSITY].sum() + normalize_df = normalize_df.reset_index() + normalize_df = pd.merge( + normalize_df, + dataset[[PROTEIN_NAME, PEPTIDE_CANONICAL, SAMPLE_ID, BIOREPLICATE, CONDITION]], + how="left", + on=[PEPTIDE_CANONICAL, SAMPLE_ID, BIOREPLICATE, CONDITION], + ) + normalize_df.drop_duplicates(inplace=True) + return normalize_df + + +def parse_uniprot_accession(uniprot_id: str) -> str: + """ + Parse the uniprot accession from the uniprot id in the form of + tr|CONTAMINANT_Q3SX28|CONTAMINANT_TPM2_BOVIN and convert to CONTAMINANT_Q3SX28 + :param uniprot_id: uniprot id + :return: uniprot accession + """ + uniprot_list = uniprot_id.split(";") + result_uniprot_list = [] + for accession in uniprot_list: + if accession.count("|") == 2: + accession = accession.split("|")[1] + result_uniprot_list.append(accession) + return ";".join(result_uniprot_list) + + +def get_study_accession(sample_id: str) -> str: + """ + Get the project accession from the Sample accession. The function expected a sample accession in the following + format PROJECT-SAMPLEID + :param sample_id: Sample Accession + :return: study accession + """ + return sample_id.split("-")[0] + + +def get_spectrum_prefix(reference_spectrum: str) -> str: + """ + Get the reference name from Reference column. The function expected a reference name in the following format eg. + 20150820_Haura-Pilot-TMT1-bRPLC03-2.mzML_controllerType=0 controllerNumber=1 scan=16340. This function can also + remove suffix of spectrum files. + :param reference_spectrum: + :return: reference name + """ + return re.split(r"\.mzML|\.MZML|\.raw|\.RAW|\.d|\.wiff", reference_spectrum)[0] + + +# Common functions when normalizing peptide dataframe +def get_peptidoform_normalize_intensities( + dataset: DataFrame, higher_intensity: bool = True +) -> DataFrame: + """ + Select the best peptidoform for the same sample and the same replicates. A peptidoform is the combination of + a (PeptideSequence + Modifications) + Charge state. + :param dataset: dataset including all properties + :param higher_intensity: select based on normalize intensity, if false based on best scored peptide + :return: + """ + dataset = dataset[dataset[NORM_INTENSITY].notna()] + if higher_intensity: + dataset = dataset.loc[ + dataset.groupby( + [PEPTIDE_SEQUENCE, PEPTIDE_CHARGE, SAMPLE_ID, CONDITION, BIOREPLICATE], + observed=True, + )[NORM_INTENSITY].idxmax() + ].reset_index(drop=True) + else: + dataset = dataset.loc[ + dataset.groupby( + [PEPTIDE_SEQUENCE, PEPTIDE_CHARGE, SAMPLE_ID, CONDITION, BIOREPLICATE], + observed=True, + )[SEARCH_ENGINE].idxmax() + ].reset_index(drop=True) + return dataset + + +def average_peptide_intensities(dataset: DataFrame) -> DataFrame: + """ + Median the intensities of all the peptidoforms for a specific peptide sample combination. 
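+    For example (hypothetical values), peptidoform intensities 10, 20 and 40 for
+    the same canonical peptide in one sample collapse to their median, 20.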
+ :param dataset: Dataframe containing all the peptidoforms + :return: New dataframe + """ + dataset_df = dataset.groupby( + [PEPTIDE_CANONICAL, SAMPLE_ID, CONDITION], observed=True + )[NORM_INTENSITY].median() + dataset_df = dataset_df.reset_index() + dataset_df = pd.merge( + dataset_df, + dataset[[PROTEIN_NAME, PEPTIDE_CANONICAL, SAMPLE_ID, CONDITION]], + how="left", + on=[PEPTIDE_CANONICAL, SAMPLE_ID, CONDITION], + ) + dataset_df.drop_duplicates(inplace=True) + return dataset_df + + +# Functions needed by Combiner +def load_sdrf(sdrf_path: str) -> DataFrame: + """ + Load sdrf TSV as a dataframe. + :param sdrf_path: Path to SDRF TSV. + :return: + """ + if not os.path.exists(sdrf_path): + raise FileNotFoundError(f"{sdrf_path} does not exist!") + sdrf_df = pd.read_csv(sdrf_path, sep="\t") + sdrf_df.columns = [col.lower() for col in sdrf_df.columns] + return sdrf_df + + +def load_feature(feature_path: str) -> DataFrame: + """ + Load feature file as a dataframe. + :param feature_path: Path to feature file. + :return: + """ + suffix = os.path.splitext(feature_path)[1][1:] + if suffix == "parquet": + return pd.read_parquet(feature_path) + elif suffix == "csv": + return pd.read_csv(feature_path) + else: + raise SystemExit( + f"{suffix} is not allowed as input, please provide msstats_in or feature parquet." + ) diff --git a/build/lib/ibaq/utils.py b/build/lib/ibaq/utils.py new file mode 100644 index 0000000..3f4416b --- /dev/null +++ b/build/lib/ibaq/utils.py @@ -0,0 +1,525 @@ +# import libraries +import logging +import os +from typing import List, Optional, Union + +import matplotlib.pyplot as plt +import pandas as pd +import seaborn as sns +from sklearn.decomposition import PCA +from sklearn.impute import KNNImputer + +import hdbscan +from combat.pycombat import pycombat + +logging.basicConfig( + format="%(asctime)s [%(funcName)s] - %(message)s", level=logging.DEBUG +) +logger = logging.getLogger(__name__) + + +def folder_retrieval(folder: str) -> dict: + """Retrieval SDRF and ibaq.csv from a given folder. + + param folder: + return: + """ + + folder = folder + os.sep if not folder.endswith(os.sep) else folder + results = {"sdrf": [], "ibaq": []} + items = os.listdir(folder) + for item in items: + try: + results["sdrf"].extend( + [ + f"{folder}{item}/{i}" + for i in os.listdir(f"{folder}{item}/") + if i.endswith(".sdrf.tsv") + ] + ) + results["ibaq"].extend( + [ + f"{folder}{item}/{i}" + for i in os.listdir(f"{folder}{item}/") + if i.endswith("ibaq.csv") or i.endswith("ibaq.parquet") + ] + ) + except Exception as e: + logger.warning(f"Error: {e}") + if item.endswith(".sdrf.tsv"): + results["sdrf"].append(folder + item) + elif item.endswith("ibaq.csv"): + results["ibaq"].append(folder + item) + else: + pass + if len(results["sdrf"]) == 0: + raise SystemExit("No SDRF founded!") + if len(results["ibaq"]) == 0: + raise SystemExit("No ibaq results founded!") + if len(results["sdrf"]) != len(results["ibaq"]): + raise SystemExit("Number of SDRFs should be equal to ibaq results!") + + return results + + +def generate_meta(sdrf_df: pd.DataFrame) -> pd.DataFrame: + """Generate ibaqpy metadata from SDRF. Each metadata contains four columns: + - sample_id: Sample ID from every dataset (source name). + - batch: PXD of every dataset (source name). + - tissue: Tissue name of tissue-based dataset (characteristics[organism part]). + - tissue_part: Tissue part of tissue-based dataset (characteristics[organism part]). 
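+    An illustrative output row (hypothetical values):
+    sample_id=PXD000561-Sample-1, batch=PXD000561, tissue=heart, tissue_part=left ventricle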
+ + param sdrf_df: _description_ + return: pd.DataFrame + """ + sdrf_df.columns = [col.lower() for col in sdrf_df.columns] + pxd = sdrf_df["source name"].values[0].split("-")[0] + organism_part = [ + col + for col in sdrf_df.columns + if col.startswith("characteristics[organism part]") + ] + if len(organism_part) > 2: + print( + f"{pxd} Please provide a maximum of 2 characteristics[organism part], one for tissue name and the other for tissue part!" + ) + exit(1) + elif len(organism_part) == 0: + print("Missing characteristics[organism part], please check your SDRF!") + exit(1) + + meta_df = sdrf_df[["source name"] + organism_part] + meta_df = meta_df.drop_duplicates() + + if len(meta_df.columns) == 2: + meta_df["tissue_part"] = None + meta_df.columns = ["sample_id", "tissue", "tissue_part"] + else: + if sdrf_df[organism_part[0]].nunique() > sdrf_df[organism_part[1]].nunique(): + a = "tissue_part" + b = "tissue" + else: + a = "tissue" + b = "tissue_part" + meta_df.rename( + columns={ + "source name": "sample_id", + organism_part[0]: a, + organism_part[1]: b, + }, + inplace=True, + ) + + meta_df["batch"] = pxd + meta_df = meta_df[["sample_id", "batch", "tissue", "tissue_part"]] + meta_df = meta_df.drop_duplicates() + + return meta_df + + +def fill_samples(df, proteins): + df = pd.pivot_table( + df, index="ProteinName", columns="SampleID", values=["IbaqNorm"] + ) + df = df.reindex(proteins) + df.columns = [pair[1] for pair in df.columns] + df.index.rename(None, inplace=True) + + return df + + +def impute_missing_values( + data: Optional[Union[pd.DataFrame, List[pd.DataFrame]]], + n_neighbors=5, + weights="uniform", + metric="nan_euclidean", + keep_empty_features=True, +) -> Union[pd.DataFrame, List[pd.DataFrame]]: + """ + Impute missing values in a DataFrame or each DataFrame in a list using KNNImputer. + + Parameters + ---------- + data : Union[pd.DataFrame, List[pd.DataFrame]] + A pandas DataFrame or list of pandas DataFrames with missing values. + n_neighbors : int, optional + Number of neighboring samples to use for imputation. Default is 5. + weights : str, optional + Weight function used in prediction. Default is 'uniform'. + metric : str, optional + Distance metric for searching neighbors. Default is 'nan_euclidean'. + keep_empty_features : bool, optional + Whether to keep empty features (no known samples). Default is True. + + Returns + ------- + Union[pd.DataFrame, List[pd.DataFrame]] + A pandas DataFrame or list of pandas DataFrames with imputed missing values. + """ + imputer = KNNImputer( + n_neighbors=n_neighbors, + weights=weights, + metric=metric, + keep_empty_features=keep_empty_features, + ) + + if isinstance(data, pd.DataFrame): + # If it's a single DataFrame, transform it and return immediately + return pd.DataFrame( + imputer.fit_transform(data), columns=data.columns, index=data.index + ) + else: + # Otherwise, use list comprehension to apply the imputer to each DataFrame + return [ + pd.DataFrame(imputer.fit_transform(t), columns=t.columns, index=t.index) + for t in data + ] + + +def split_df_by_column(df: pd.DataFrame, cov_index_col: str) -> List[pd.DataFrame]: + """ + Split a DataFrame by unique values of a specified column. + + Parameters + ---------- + df : pd.DataFrame + A pandas DataFrame to be split. + cov_index_col : str + The name of the column to split the DataFrame by. + + Returns + ------- + List[pd.DataFrame] + A list of pandas DataFrames, each containing rows with the same value in the specified column. 
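+
+    Examples
+    --------
+    A minimal illustrative call (hypothetical data):
+
+    >>> df = pd.DataFrame({"tissue": ["heart", "liver", "liver"]})
+    >>> [len(part) for part in split_df_by_column(df, cov_index_col="tissue")]
+    [1, 2]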
+ """ + # Check if cov_index_col is in df + if cov_index_col not in df.columns: + raise ValueError( + f"'{cov_index_col}' is not a column in the provided DataFrame." + ) + + # Use list comprehension to create the list of dataframes + df_split = [df_group for _, df_group in df.groupby(cov_index_col)] + + return df_split + + +def filter_missing_value_by_group(df_input, col, non_missing_percent_to_keep): + """ + Filters the dataframe by keeping columns with at least a specified percent of non-missing values + in each group. + + Parameters: + df_input (pandas.DataFrame): The input dataframe. + col (str): The name of the column to group by. + non_missing_percent_to_keep (float): The minimum percentage of non-missing values to keep a column. + + Returns: + pandas.DataFrame: The filtered dataframe. + """ + return df_input.groupby(col, as_index=False).filter( + lambda x: len(x) < non_missing_percent_to_keep * len(df_input) + ) + + +# function to compute principal components +def compute_pca(df, n_components=5) -> pd.DataFrame: + """ + Compute principal components for a given dataframe. + + Parameters + ---------- + df : pd.DataFrame + A dataframe with samples as rows and features as columns. + n_components : int + Number of principal components to be computed. + + Returns + ------- + df_pca : pd.DataFrame + A dataframe with the principal components. + """ + + pca = PCA(n_components=n_components) + pca.fit(df) + df_pca = pca.transform(df) + + df_pca = pd.DataFrame( + df_pca, index=df.index, columns=[f"PC{i}" for i in range(1, n_components + 1)] + ) + + return df_pca + + +# get batch info from sample names +def get_batch_info_from_sample_names(sample_list: List[str]) -> List[int]: + """ + Expected as input a list of sample names with SDRF-like format: {PRIDE_PROJECT_ID}-{SAMPLE_ID} + and return a list of batch indices (a.k.a. factor levels) + + :param sample_list: list of sample names + :return: list of batch indices + """ + samples = [s.split("-")[0] for s in sample_list] + batches = list(set(samples)) + index = {i: batches.index(i) for i in batches} + + return [index[i] for i in samples] + + +# define a function to remove batches with only one sample. +# takes as input a dataframe with samples in columns and protein IDs in rows, and a list of batch indices +# returns a dataframe with batches with only one sample removed +def remove_single_sample_batches(df: pd.DataFrame, batch: list) -> pd.DataFrame: + """ + Remove batches with only one sample. + + Parameters + ---------- + df : pd.DataFrame + A dataframe with samples in columns and protein IDs in rows. + batch : list + A list of batch indices. + + Returns + ------- + df_filtered : pd.DataFrame + A filtered dataframe. + """ + + # make dict with columns as key and batch as value + batch_dict = dict(zip(df.columns, batch)) + + # from the batch_dict, get the batches with only one sample + single_sample_batch = [ + k for k, v in batch_dict.items() if list(batch_dict.values()).count(v) == 1 + ] + + # remove batches with only one sample + df_single_batches_removed = df.drop(single_sample_batch, axis=1) + + # update batch_dict based on the filtered dataframe + # batch_dict = {col: batch_dict[col] for col in df_single_batches_removed.columns} + + return df_single_batches_removed + + +def plot_pca( + df_pca, + output_file, + x_col="PC1", + y_col="PC2", + hue_col="batch", + palette="Set2", + title="PCA plot", + figsize=(8, 6), +): + """ + Plots a PCA scatter plot and saves it to a file. + + Args: + df_pca (pd.DataFrame): DataFrame containing PCA data. 
+ output_file (str): File name to save the plot as an image. + x_col (str, optional): Column name for x-axis. Defaults to "PC1". + y_col (str, optional): Column name for y-axis. Defaults to "PC2". + hue_col (str, optional): Column name for hue (grouping variable). Defaults to "batch". + palette (str, optional): Color palette for the plot. Defaults to "Set2". + title (str, optional): Title for the plot. Defaults to "PCA plot". + figsize (tuple, optional): Figure size as (width, height) in inches. Defaults to (5, 5). + """ + + # Create a new figure with the specified size + fig, ax = plt.subplots(figsize=figsize) + + # Create a scatterplot using seaborn + sns.scatterplot(x=x_col, y=y_col, hue=hue_col, data=df_pca, palette=palette, ax=ax) + + # Set the plot title, x-axis label, and y-axis label + ax.set_title(title) + ax.set_xlabel(x_col) + ax.set_ylabel(y_col) + + # Set the legend location and adjust the bounding box + ax.legend(loc="center left", bbox_to_anchor=(1.05, 0.5)) + + # Adjust the layout to fit the legend within the figure + plt.tight_layout() + + # Save the plot as an image file + plt.savefig(output_file, bbox_inches="tight") + + +# define a function for batch correction +def apply_batch_correction( + df: pd.DataFrame, batch: List[int], covs: Optional[List[int]] = None +) -> pd.DataFrame: + """ + Get a dataframe and a list of batch indices as input and + return a batch corrected dataframe with pycombat. + + Parameters + ---------- + df : pd.DataFrame + A dataframe with the data to apply batch correction. Expected to have samples as columns and features as rows. + batch : list + A list of batch indices. + covs : list + A list of covariates to be used for batch correction. + + Warning + ------- + Every batch should have at least 2 samples. + + Returns + ------- + df_corrected : pd.DataFrame + A batch-corrected dataframe. + + """ + + # check if the number of samples match the number of batch indices + if len(df.columns) != len(batch): + raise ValueError( + "The number of samples should match the number of batch indices." + ) + + # check if every batch factor has at least 2 samples + if any([batch.count(i) < 2 for i in set(batch)]): + raise ValueError("Every batch factor should have at least 2 samples.") + + # If not None, check if the number of covariates match the number of samples + if covs: + if len(df.columns) != len(covs): + raise ValueError( + "The number of samples should match the number of covariates." + ) + + df_co = pycombat(data=df, batch=batch, mod=covs, mean_only=False) + return df_co + + +# function to compute clusters +def find_clusters(df, min_cluster_size, min_samples) -> pd.DataFrame: + """ + Compute clusters for a given dataframe. + + Parameters + ---------- + df : pd.DataFrame + A dataframe with the data to be batched corrected. + min_cluster_size : int + The minimum size of clusters. + min_samples : int + The minimum number of samples in a neighborhood for a point to be considered as a core point. + + Returns + ------- + df_clusters : pd.DataFrame + A dataframe with the cluster assignments. + """ + + clusterer = hdbscan.HDBSCAN( + min_cluster_size=min_cluster_size, + min_samples=min_samples, + metric="euclidean", + cluster_selection_method="eom", + allow_single_cluster=True, + cluster_selection_epsilon=0.01, + ) + clusterer.fit(df) + df["cluster"] = clusterer.labels_ + + return df + + +# Function to run the iterative outlier removal pipeline. +# This function applies sequentially the following steps: +# 1. Compute principal components +# 2. 
Find clusters using HDBSCAN +# 3. Remove outliers +# 4. Repeat steps 1-3 until no outliers are found +def iterative_outlier_removal( + df: pd.DataFrame, + batch: List[int], + n_components: int = 5, + min_cluster_size: int = 10, + min_samples: int = 10, + n_iter: int = 10, + verbose: bool = True, +) -> pd.DataFrame: + """ + Get a dataframe and a list of batch indices as input and + return a batch corrected dataframe with pycombat. + + Parameters + ---------- + df : pd.DataFrame + A dataframe with the data to be batch corrected. + batch : list + A list of batch indices. + n_components : int + Number of principal components to be computed. + min_cluster_size : int + The minimum size of clusters. + min_samples : int + The minimum number of samples in a neighborhood for a point to be considered as a core point. + n_iter : int + Number of iterations to be performed. + verbose : bool + Whether to print and plot the number of outliers for each iteration. + + Returns + ------- + df_filtered : pd.DataFrame + A filtered dataframe. + """ + + # repeat steps 1-3 until no outliers are found + # or the maximum number of iterations is reached + # print the number of outliers for each iteration + # save a plot of the principal components for each iteration + + # make dict with columns as key and batch as value + batch_dict = dict(zip(df.columns, batch)) + + for i in range(n_iter): + print("Running iteration: ", i + 1) + + # compute principal components + # transpose the dataframe to get samples as rows and features as columns + df_pca = compute_pca(df.T, n_components=n_components) + + # compute clusters + df_clusters = find_clusters( + df_pca, min_cluster_size=min_cluster_size, min_samples=min_samples + ) + print(df_clusters) + # remove outliers from original dataframe + outliers = df_clusters[df_clusters["cluster"] == -1].index.tolist() + df_filtered_outliers = df.drop(outliers, axis=1) + print(f"Number of outliers in iteration {i + 1}: {len(outliers)}") + print(f"Outliers in iteration {i + 1}: {str(outliers)}") + + # update batch_dict based on the filtered dataframe + batch_dict = {col: batch_dict[col] for col in df_filtered_outliers.columns} + + df = df_filtered_outliers + + # plot principal components PC1 vs PC2 + # save the plot as a png file + # print the number of outliers for each iteration in the plot + if verbose: + plot_pca( + df_clusters, + output_file=f"iterative_outlier_removal_{i + 1}.png", + x_col="PC1", + y_col="PC2", + hue_col="cluster", + title=f"Iteration {i + 1}: Number of outliers: {len(outliers)}", + ) + + # break the loop if no outliers are found + if len(outliers) == 0: + break + + return df diff --git a/build/scripts-3.10/compute_ibaq.py b/build/scripts-3.10/compute_ibaq.py new file mode 100644 index 0000000..1dc018c --- /dev/null +++ b/build/scripts-3.10/compute_ibaq.py @@ -0,0 +1,201 @@ +#!D:\venv\ibaq\Scripts\python.exe +# -*- coding: utf-8 -*- + +import math + +import click +import matplotlib.pyplot as plt +import pandas as pd +from matplotlib.backends.backend_pdf import PdfPages +from pandas import DataFrame, Series +from pyopenms import * + +from ibaq.ibaqpy_commons import (CONDITION, IBAQ, IBAQ_LOG, IBAQ_NORMALIZED, + IBAQ_PPB, NORM_INTENSITY, PROTEIN_NAME, + SAMPLE_ID, plot_box_plot, plot_distributions, + print_help_msg, get_accession) + + +def normalize(group): + group[IBAQ_NORMALIZED] = group[IBAQ] / group[IBAQ].sum() + return group + + +def normalize_ibaq(res: DataFrame) -> DataFrame: + """ + Normalize the ibaq values using the total ibaq of the sample. 
The resulted + ibaq values are then multiplied by 100'000'000 (PRIDE database noramalization) + for the ibaq ppb and log10 shifted by 10 (ProteomicsDB) + :param res: Dataframe + :return: + """ + + res = res.groupby([SAMPLE_ID, CONDITION]).apply(normalize) + + # Normalization method used by Proteomics DB 10 + log10(ibaq/sum(ibaq)) + res[IBAQ_LOG] = res[IBAQ_NORMALIZED].apply( + lambda x: (math.log10(x) + 10) if x > 0 else 0 + ) + + # Normalization used by PRIDE Team (no log transformation) (ibaq/total_ibaq) * 100'000'000 + res[IBAQ_PPB] = res[IBAQ_NORMALIZED].apply(lambda x: x * 100000000) + + return res + + +@click.command() +@click.option("-f", "--fasta", help="Protein database to compute IBAQ values") +@click.option( + "-p", + "--peptides", + help="Peptide identifications with intensities following the peptide intensity output", +) +@click.option( + "-e", + "--enzyme", + help="Enzyme used during the analysis of the dataset (default: Trypsin)", + default="Trypsin", +) +@click.option( + "-n", + "--normalize", + help="Normalize IBAQ values using by using the total IBAQ of the experiment", + is_flag=True, +) +@click.option( + "--min_aa", help="Minimum number of amino acids to consider a peptide", default=7 +) +@click.option( + "--max_aa", help="Maximum number of amino acids to consider a peptide", default=30 +) +@click.option("-o", "--output", help="Output file with the proteins and ibaq values") +@click.option( + "--verbose", + help="Print addition information about the distributions of the intensities, number of peptides remove " + "after normalization, etc.", + is_flag=True, +) +@click.option( + "--qc_report", + help="PDF file to store multiple QC images", + default="IBAQ-QCprofile.pdf", +) +def ibaq_compute( + fasta: str, + peptides: str, + enzyme: str, + normalize: bool, + min_aa: int, + max_aa: int, + output: str, + verbose: bool, + qc_report: str, +) -> None: + """ + This command computes the IBAQ values for a file output of peptides with the format described in + peptide_contaminants_file_generation.py. + :param min_aa: Minimum number of amino acids to consider a peptide. + :param max_aa: Maximum number of amino acids to consider a peptide. + :param fasta: Fasta file used to perform the peptide identification. + :param peptides: Peptide intensity file. + :param enzyme: Enzyme used to digest the protein sample. + :param normalize: use some basic normalization steps. + :param output: output format containing the ibaq values. + :param verbose: Print addition information. + :param qc_report: PDF file to store multiple QC images. 
+ :return: + """ + if peptides is None or fasta is None: + print_help_msg(ibaq_compute) + exit(1) + + fasta_proteins = list() # type: list[FASTAEntry] + protein_accessions = list() + FASTAFile().load(fasta, fasta_proteins) + uniquepepcounts = dict() # type: dict[str, int] + digestor = ProteaseDigestion() + digestor.setEnzyme(enzyme) + + def get_average_nr_peptides_unique_bygroup(pdrow: Series) -> Series: + """ + Get the average intensity for protein groups + :param pdrow: peptide row + :return: average intensity + """ + proteins = pdrow.name[0].split(";") + summ = 0 + for prot in proteins: + summ += uniquepepcounts[prot] + if len(proteins) > 0 and summ > 0: + return pdrow.NormIntensity / (summ / len(proteins)) + # If there is no protein in the group, return np nan + return np.nan # type: ignore + + for entry in fasta_proteins: + digest = list() # type: list[str] + digestor.digest(AASequence().fromString(entry.sequence), digest, min_aa, max_aa) + digestuniq = set(digest) + # TODO: Try to get protein accessions from multiple databases. + protein_name = get_accession(entry.identifier) + uniquepepcounts[protein_name] = len(digestuniq) + protein_accessions.append(protein_name) + + data = pd.read_csv(peptides, sep=",") + data = data[data[PROTEIN_NAME].isin(protein_accessions)] + print(data.head()) + # next line assumes unique peptides only (at least per indistinguishable group) + + res = pd.DataFrame( + data.groupby([PROTEIN_NAME, SAMPLE_ID, CONDITION])[NORM_INTENSITY].sum() + ).apply(get_average_nr_peptides_unique_bygroup, 1) + res = res.sort_values(ascending=False) + res = res.to_frame() + res = res.reset_index() + res = res.rename(columns={0: IBAQ}) + + if normalize: + res = normalize_ibaq(res) + # Remove IBAQ_NORMALIZED NAN values + res = res.dropna(subset=[IBAQ_NORMALIZED]) + plot_column = IBAQ_PPB + else: + # Remove IBAQ NAN values + res = res.dropna(subset=[IBAQ]) + plot_column = IBAQ + + # Print the distribution of the protein IBAQ values + if verbose: + plot_width = len(set(res["SampleID"])) * 0.5 + 10 + pdf = PdfPages(qc_report) + density = plot_distributions( + res, + plot_column, + SAMPLE_ID, + log2=True, + width=plot_width, + title="IBAQ Distribution", + ) + plt.show() + pdf.savefig(density) + box = plot_box_plot( + res, + plot_column, + SAMPLE_ID, + log2=True, + width=plot_width, + title="IBAQ Distribution", + violin=False, + ) + plt.show() + pdf.savefig(box) + pdf.close() + + # # For absolute expression the relation is one sample + one condition + # condition = data[CONDITION].unique()[0] + # res[CONDITION] = condition.lower() + + res.to_csv(output, index=False) + + +if __name__ == "__main__": + ibaq_compute() diff --git a/build/scripts-3.10/compute_tpa.py b/build/scripts-3.10/compute_tpa.py new file mode 100644 index 0000000..8a801e5 --- /dev/null +++ b/build/scripts-3.10/compute_tpa.py @@ -0,0 +1,229 @@ +#!D:\venv\ibaq\Scripts\python.exe +# -*- coding: utf-8 -*- + +import os + +import click +import matplotlib.pyplot as plt +import pandas as pd +from matplotlib.backends.backend_pdf import PdfPages +from pyopenms import * + +from ibaq.ibaqpy_commons import (CONDITION, NORM_INTENSITY, PROTEIN_NAME, SAMPLE_ID, + plot_box_plot, plot_distributions, print_help_msg, + remove_contaminants_entrapments_decoys, get_accession) + + +def handle_nonstandard_aa(aa_seq: str) -> (list, str): + """Any nonstandard amoni acid will be removed. + + :param aa_seq: Protein sequences from multiple database. + :return: One list contains nonstandard amoni acids and one remain sequence. 
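+
+    A minimal illustrative example (hypothetical sequence):
+
+    >>> handle_nonstandard_aa("PEPTIDEXU")
+    (['X', 'U'], 'PEPTIDE')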
+ """ + standard_aa = 'ARNDBCEQZGHILKMFPSTWYV' + nonstandard_aa_lst = [aa for aa in aa_seq if aa not in standard_aa] + considered_seq = ''.join([aa for aa in aa_seq if aa in standard_aa]) + return nonstandard_aa_lst, considered_seq + + +@click.command() +@click.option("-f", "--fasta", help="Protein database") +@click.option( + "-p", + "--peptides", + help="Peptide identifications with intensities following the peptide intensity output", +) +@click.option("-r", "--ruler", help="Whether to use ProteomicRuler", is_flag=True) +@click.option("-n", "--ploidy", help="Ploidy number", default=2) +@click.option("-m", "--organism", help="Organism source of the data", default="human") +@click.option("-c", "--cpc", help="Cellular protein concentration(g/L)", default=200) +@click.option("-o", "--output", help="Output file with the proteins and other values") +@click.option( + "--verbose", + help="Print addition information about the distributions of the intensities, number of peptides remove " + "after normalization, etc.", + is_flag=True, +) +@click.option( + "--qc_report", + help="PDF file to store multiple QC images", + default="TPA-QCprofile.pdf", +) +def tpa_compute( + fasta: str, + peptides: str, + ruler: bool, + organism: str, + ploidy: int, + cpc: float, + output: str, + verbose: bool, + qc_report: str, +) -> None: + """ + This command computes the protein copies and concentrations according to a file output of peptides with the + format described in peptide_contaminants_file_generation.py. + :param fasta: Fasta file used to perform the peptide identification. + :param peptides: Peptide intensity file without normalization. + :param ruler: Whether to compute protein copies, weight and concentration. + :param organism: Organism source of the data. + :param ploidy: Ploidy number. + :param cpc: Cellular protein concentration(g/L). + :param output: Output format containing the TPA values, protein copy numbers and concentrations. + :param verbose: Print addition information. + :param qc_report: PDF file to store multiple QC images. 
+ :return: + """ + if peptides is None or fasta is None: + print_help_msg(tpa_compute) + exit(1) + + data = pd.read_csv( + peptides, sep=",", usecols=[PROTEIN_NAME, NORM_INTENSITY, SAMPLE_ID, CONDITION] + ) + data[NORM_INTENSITY] = data[NORM_INTENSITY].astype("float") + data = data.dropna(subset=[NORM_INTENSITY]) + data = data[data[NORM_INTENSITY] > 0] + print(data.head()) + + res = pd.DataFrame( + data.groupby([PROTEIN_NAME, SAMPLE_ID, CONDITION])[NORM_INTENSITY].sum() + ) + res = res.reset_index() + proteins = res[PROTEIN_NAME].unique().tolist() + proteins = sum([i.split(";") for i in proteins], []) + + # calculate molecular weight of quantified proteins + mw_dict = dict() + fasta_proteins = list() # type: list[FASTAEntry] + FASTAFile().load(fasta, fasta_proteins) + for entry in fasta_proteins: + accession = get_accession(entry.identifier) + if accession in proteins: + try: + mw = AASequence().fromString(entry.sequence).getMonoWeight() + mw_dict[accession] = mw + except: + error_aa, seq = handle_nonstandard_aa(entry.sequence) + mw = AASequence().fromString(seq).getMonoWeight() + mw_dict[accession] = mw + print(f"Nonstandard amimo acids found in {accession}: {error_aa}, ignored!") + + res = res[res[PROTEIN_NAME].isin(mw_dict.keys())] + + # calculate TPA for every protein group + def get_protein_group_mw(group: str) -> float: + mw_list = [mw_dict[i] for i in group.split(";")] + return sum(mw_list) + + res["MolecularWeight"] = res.apply( + lambda x: get_protein_group_mw(x[PROTEIN_NAME]), axis=1 + ) + res["MolecularWeight"] = res["MolecularWeight"].fillna(1) + res["MolecularWeight"] = res["MolecularWeight"].replace(0, 1) + res["TPA"] = res[NORM_INTENSITY] / res["MolecularWeight"] + # Print the distribution of the protein TPA values + if verbose: + plot_width = len(set(res[SAMPLE_ID])) * 0.5 + 10 + pdf = PdfPages(qc_report) + density = plot_distributions( + res, "TPA", SAMPLE_ID, log2=True, width=plot_width, title="TPA Distribution" + ) + plt.show() + pdf.savefig(density) + box = plot_box_plot( + res, + "TPA", + SAMPLE_ID, + log2=True, + width=plot_width, + title="TPA Distribution", + violin=False, + ) + plt.show() + pdf.savefig(box) + + # calculate protein weight(ng) and concentration(nM) + if ruler: + avogadro = 6.02214129e23 + average_base_pair_mass = 617.96 # 615.8771 + + organism = organism.lower() + histone_df = pd.read_json( + open(os.path.split(__file__)[0] + os.sep + "histones.json", "rb") + ).T + target_histones = histone_df[histone_df["name"] == organism.lower()] + genome_size = target_histones["genome_size"].values[0] + histones_list = target_histones["histone_entries"].values[0] + dna_mass = ploidy * genome_size * average_base_pair_mass / avogadro + + def calculate(protein_intensity, histone_intensity, mw): + copy = (protein_intensity / histone_intensity) * dna_mass * avogadro / mw + # The number of moles is equal to the number of particles divided by Avogadro's constant + moles = copy * 1e9 / avogadro # unit nmol + weight = moles * mw # unit ng + return tuple([copy, moles, weight]) + + def proteomic_ruler(df): + histone_intensity = df[df[PROTEIN_NAME].isin(histones_list)][ + NORM_INTENSITY + ].sum() + histone_intensity = histone_intensity if histone_intensity > 0 else 1 + df[["Copy", "Moles[nmol]", "Weight[ng]"]] = df.apply( + lambda x: calculate( + x[NORM_INTENSITY], histone_intensity, x["MolecularWeight"] + ), + axis=1, + result_type="expand", + ) + volume = df["Weight[ng]"].sum() * 1e-9 / cpc # unit L + df["Concentration[nM]"] = df["Moles[nmol]"] / volume # unit nM + return 
df + + res = res.groupby([CONDITION]).apply(proteomic_ruler) + + if verbose: + density = plot_distributions( + res, "Copy", SAMPLE_ID, width=plot_width, log2=True, title="Copy numbers Distribution" + ) + plt.show() + pdf.savefig(density) + box = plot_box_plot( + res, + "Copy", + SAMPLE_ID, + width=plot_width, + log2=True, + title="Copy numbers Distribution", + violin=False, + ) + plt.show() + pdf.savefig(box) + + density = plot_distributions( + res, + "Concentration[nM]", + SAMPLE_ID, + width=plot_width, + log2=True, + title="Concentration[nM] Distribution", + ) + plt.show() + pdf.savefig(density) + box = plot_box_plot( + res, + "Concentration[nM]", + SAMPLE_ID, + width=plot_width, + log2=True, + title="Concentration[nM] Distribution", + violin=False, + ) + plt.show() + pdf.savefig(box) + pdf.close() + res.to_csv(output, index=False) + + +if __name__ == "__main__": + tpa_compute() diff --git a/build/scripts-3.10/datasets_merger.py b/build/scripts-3.10/datasets_merger.py new file mode 100644 index 0000000..0a4156d --- /dev/null +++ b/build/scripts-3.10/datasets_merger.py @@ -0,0 +1,111 @@ +#!D:\venv\ibaq\Scripts\python.exe +import re + +import click + +from ibaq import __version__ +from ibaq.combiner import Combiner + +CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"]) + + +@click.group(context_settings=CONTEXT_SETTINGS) +def cli(): + """ + This is the main tool that gives access to all commands. + """ + + +@click.version_option( + version=__version__, package_name="ibaqpy", message="%(package)s %(version)s" +) +@click.command("datasets_merge", short_help="Merge ibaq results from compute_ibaq") +@click.option( + "--data_folder", + "-d", + help="Data dolfer contains SDRFs and ibaq CSVs.", + required=True, +) +@click.option( + "--output", "-o", help="Output file after batch effect removal.", required=True +) +@click.option( + "--covariate", + "-c", + default=None, + help="Indicator included in covariate consideration when datasets are merged.", +) +@click.option("--organism", help="Organism to keep in input data.", default="HUMAN") +@click.option( + "--covariate_to_keep", + "-k", + help="Keep tissue parts from metadata, e.g. 
'LV,RV,LA,RA'.", + default=None, +) +@click.option( + "--non_missing_percent_to_keep", + "-m", + help="non-missing values in each group.", + default=0.3, +) +@click.option( + "--skip_outliers_removal", + help="Skip removing outliers in all datasets.", + default=False, + is_flag=True, +) +@click.option( + "--n_components", + help="Number of principal components to be computed.", + default=None, +) +@click.option("--min_cluster", help="The minimum size of clusters.", default=None) +@click.option( + "--min_sample_num", + help="The minimum number of samples in a neighborhood for a point to be considered as a core point.", + default=None, +) +@click.option("--n_iter", help="Number of iterations to be performed.", default=None) +@click.option( + "--verbose/--quiet", + "-v/-q", + help="Output debug information.", + default=False, + is_flag=True, +) +@click.pass_context +def datasets_merge( + ctx, + data_folder: str, + output: str, + covariate: str, + organism: str, + covariate_to_keep: list, + non_missing_percent_to_keep: float, + skip_outliers_removal: bool, + n_components: int, + min_cluster: int, + min_sample_num: int, + n_iter: int, + verbose: bool, +): + if covariate_to_keep: + covariate_to_keep = re.split(",\s*", covariate_to_keep) + combiner = Combiner(data_folder=data_folder, covariate=covariate) + combiner.imputer(covariate_to_keep) + if not skip_outliers_removal: + combiner.outlier_removal(n_components, min_cluster, min_sample_num, n_iter) + combiner.batch_correction(n_components, covariate_to_keep) + result = combiner.df_corrected + result.to_csv(output, sep=",", index=True) + + +cli.add_command(datasets_merge) + + +def main(): + cli() + + +if __name__ == "__main__": + main() diff --git a/build/scripts-3.10/merge_condition_files.py b/build/scripts-3.10/merge_condition_files.py new file mode 100644 index 0000000..358b7fe --- /dev/null +++ b/build/scripts-3.10/merge_condition_files.py @@ -0,0 +1,50 @@ +#!D:\venv\ibaq\Scripts\python.exe +# -*- coding: utf-8 -*- + + +import os + +from ibaq.ibaqpy_commons import * + + +def print_help_msg(command) -> None: + """ + Print help information + :param command: command to print helps + :return: print help + """ + with click.Context(command) as ctx: + click.echo(command.get_help(ctx)) + + +@click.command() +@click.option( + "-i", "--input", help="Folder with all the Intensity files", required=True +) +@click.option("-o", "--output", help="Prefix name for the file to group by condition") +@click.option( + "-p", "--pattern", help="Prefix of the pattern name for all the files in the folder" +) +def merge_condition_generation(input: str, output: str, pattern: str) -> None: + """ + Merge all the files in a folder with the specific pattern + :param input: Input folder containing all the peptide Intensity files + :param output: Output file prefix with all the intensities + :param pattern: pattern of the files with the corresponding file name prefix + :return: + """ + + files = [f for f in os.listdir(input) if pattern in f] + df_from_each_file = (pd.read_csv(input + "/" + f) for f in files) + concatenated_df = pd.concat(df_from_each_file, ignore_index=True) + concatenated_df[CONDITION] = concatenated_df[CONDITION].str.lower() + print(concatenated_df.head()) + + for k, g in concatenated_df.groupby([CONDITION]): + g.to_csv( + f"{output}/{k}-grouped-Intensities.csv", index=False + ) # '{}.csv'.format(k) + + +if __name__ == "__main__": + merge_condition_generation() diff --git a/build/scripts-3.10/peptide_normalization.py 
b/build/scripts-3.10/peptide_normalization.py
new file mode 100644
index 0000000..9deb750
--- /dev/null
+++ b/build/scripts-3.10/peptide_normalization.py
@@ -0,0 +1,1214 @@
+#!D:\venv\quantms\Scripts\python.exe
+import gc
+import click
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+from scipy.stats import rankdata
+import os
+import random
+import uuid
+from matplotlib.backends.backend_pdf import PdfPages
+from pandas import DataFrame
+import pyarrow.parquet as pq
+from normalize_methods import normalize
+
+from ibaq.ibaqpy_commons import (
+    BIOREPLICATE,
+    CHANNEL,
+    CONDITION,
+    PARQUET_COLUMNS,
+    FRACTION,
+    FRAGMENT_ION,
+    INTENSITY,
+    ISOTOPE_LABEL_TYPE,
+    NORM_INTENSITY,
+    PEPTIDE_CANONICAL,
+    PEPTIDE_CHARGE,
+    PEPTIDE_SEQUENCE,
+    PROTEIN_NAME,
+    REFERENCE,
+    RUN,
+    SAMPLE_ID,
+    STUDY_ID,
+    TMT16plex,
+    TMT11plex,
+    TMT10plex,
+    TMT6plex,
+    ITRAQ4plex,
+    ITRAQ8plex,
+    get_canonical_peptide,
+    get_spectrum_prefix,
+    get_study_accession,
+    parquet_map,
+    parse_uniprot_accession,
+    plot_box_plot,
+    plot_distributions,
+    remove_contaminants_entrapments_decoys,
+    remove_protein_by_ids,
+    sum_peptidoform_intensities,
+    get_peptidoform_normalize_intensities,
+    average_peptide_intensities,
+    print_help_msg,
+)
+
+
+def print_dataset_size(dataset: DataFrame, message: str, verbose: bool) -> None:
+    if verbose:
+        print(message + str(len(dataset.index)))
+
+def recover_df(df):
+    """
+    Recover the long data shape from a wide (one column per sample) dataframe,
+    dropping missing cells instead of materializing them.
+    """
+    samples = df.columns.tolist()
+    out = pd.DataFrame()
+    for sample in samples:
+        samples_df = df[sample].dropna()
+        samples_df = samples_df.reset_index()
+        samples_df["SampleID"] = sample
+        samples_df.rename(columns={sample: NORM_INTENSITY}, inplace=True)
+        out = pd.concat([out, samples_df])
+    return out
+
+def analyse_sdrf(sdrf_path: str, compression: bool) -> tuple:
+    """
+    Parse an SDRF file and return four objects:
+    1. sdrf_df: A dataframe with channels and references annotated.
+    2. label: Label type of the experiment: LFQ, TMT or iTRAQ.
+    3. sample_names: A list containing all sample names.
+    4. choice: A dictionary mapping channel names to channel numbers.
+    :param sdrf_path: File path of SDRF.
+    :param compression: Compression mode of the SDRF file.
+    :return:
+    """
+    sdrf_df = pd.read_csv(sdrf_path, sep="\t", compression=compression)
+    sdrf_df[REFERENCE] = sdrf_df["comment[data file]"].apply(get_spectrum_prefix)
+
+    labels = set(sdrf_df["comment[label]"])
+    # Determine label type
+    label, choice = get_label(labels)
+    if label == "TMT":
+        choice_df = (
+            pd.DataFrame.from_dict(choice, orient="index", columns=[CHANNEL])
+            .reset_index()
+            .rename(columns={"index": "comment[label]"})
+        )
+        sdrf_df = sdrf_df.merge(choice_df, on="comment[label]", how="left")
+    elif label == "ITRAQ":
+        choice_df = (
+            pd.DataFrame.from_dict(choice, orient="index", columns=[CHANNEL])
+            .reset_index()
+            .rename(columns={"index": "comment[label]"})
+        )
+        sdrf_df = sdrf_df.merge(choice_df, on="comment[label]", how="left")
+    sample_names = sdrf_df["source name"].unique().tolist()
+
+    return sdrf_df, label, sample_names, choice
+
+
+def analyse_feature_df(feature_df: pd.DataFrame) -> tuple:
+    """Return label type, sample names and choice dict from a feature dataframe.
+
+    :param feature_df: Feature dataframe.
+    :return: Label type, sample names and choice dict
+    """
+    samples = feature_df["sample_accession"].unique().tolist()
+    labels = feature_df["isotope_label_type"].unique().tolist()
+    # Determine label type
+    label, choice = get_label(labels)
+
+    return label, samples, choice
+
+
+def analyse_feature_parquet(parquet_path: str, batch_size: int = 100000) -> tuple:
+    """Return label type, sample names and choice dict by iterating parquet.
+
+    :param parquet_path: Feature parquet path.
+    :param batch_size: Iteration batch size, defaults to 100000.
+    :return: Label type, sample names and choice dict
+    """
+    parquet_chunks = read_large_parquet(parquet_path, batch_size)
+    labels, samples = list(), list()
+    for chunk in parquet_chunks:
+        samples.extend(chunk["sample_accession"].unique().tolist())
+        labels.extend(chunk["isotope_label_type"].unique().tolist())
+    samples = list(set(samples))
+    labels = list(set(labels))
+    # Determine label type
+    label, choice = get_label(labels)
+
+    return label, samples, choice
+
+
+def read_large_parquet(parquet_path: str, batch_size: int = 100000):
+    """Yield a parquet file as pandas dataframes in batches of batch_size rows."""
+    parquet_file = pq.ParquetFile(parquet_path)
+    for batch in parquet_file.iter_batches(batch_size=batch_size):
+        batch_df = batch.to_pandas()
+        yield batch_df
+
+
+def get_label(labels: list) -> (str, dict):
+    """Return label type and choice dict according to the labels list.
+
+    :param labels: Labels from SDRF.
+    :return: A tuple containing the label type and the channel choice dict.
+    """
+    choice = None
+    if len(labels) == 1:
+        label = "LFQ"
+    elif "TMT" in ",".join(labels) or "tmt" in ",".join(labels):
+        if (
+            len(labels) > 11
+            or "TMT134N" in labels
+            or "TMT133C" in labels
+            or "TMT133N" in labels
+            or "TMT132C" in labels
+            or "TMT132N" in labels
+        ):
+            choice = TMT16plex
+        elif len(labels) == 11 or "TMT131C" in labels:
+            choice = TMT11plex
+        elif len(labels) > 6:
+            choice = TMT10plex
+        else:
+            choice = TMT6plex
+        label = "TMT"
+    elif "ITRAQ" in ",".join(labels) or "itraq" in ",".join(labels):
+        if len(labels) > 4:
+            choice = ITRAQ8plex
+        else:
+            choice = ITRAQ4plex
+        label = "ITRAQ"
+    else:
+        exit("Warning: Only label-free, TMT and ITRAQ experiments are supported!")
+    return label, choice
+
+
+def msstats_common_process(data_df: pd.DataFrame) -> pd.DataFrame:
+    """Apply common preprocessing to MSstats data.
+
+    :param data_df: Feature data in dataframe.
+    :return: Processed data.
+    """
+    data_df.rename(
+        columns={
+            "ProteinName": PROTEIN_NAME,
+            "PeptideSequence": PEPTIDE_SEQUENCE,
+            "PrecursorCharge": PEPTIDE_CHARGE,
+            "Run": RUN,
+            "Condition": CONDITION,
+            "Intensity": INTENSITY,
+        },
+        inplace=True,
+    )
+    data_df[REFERENCE] = data_df[REFERENCE].apply(get_spectrum_prefix)
+
+    return data_df
+
+
+def parquet_common_process(
+    data_df: pd.DataFrame, label: str, choice: dict
+) -> pd.DataFrame:
+    """Apply common preprocessing to feature parquet data.
+
+    :param data_df: Feature data in dataframe.
+    :param label: Label type returned by get_label ("LFQ", "TMT" or "ITRAQ").
+    :param choice: Mapping from channel name to channel number.
+    :return: Processed data.
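+
+    A minimal illustrative example (hypothetical single-row frame):
+
+    >>> df = pd.DataFrame({"protein_accessions": [["P12345", "Q67890"]],
+    ...                    "channel": ["TMT126"]})
+    >>> out = parquet_common_process(df, "TMT", TMT6plex)
+    >>> out[PROTEIN_NAME].iloc[0]
+    'P12345,Q67890'
+    >>> int(out[CHANNEL].iloc[0])
+    1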
+ """ + data_df = data_df.rename(columns=parquet_map) + data_df[PROTEIN_NAME] = data_df.apply(lambda x: ",".join(x[PROTEIN_NAME]), axis=1) + if label == "LFQ": + data_df.drop(CHANNEL, inplace=True, axis=1) + else: + data_df[CHANNEL] = data_df[CHANNEL].map(choice) + + return data_df + + +def merge_sdrf( + label: str, sdrf_df: pd.DataFrame, data_df: pd.DataFrame +) -> pd.DataFrame: + if label == "LFQ": + result_df = pd.merge( + data_df, + sdrf_df[["source name", REFERENCE]], + how="left", + on=[REFERENCE], + ) + elif label == "TMT": + result_df = pd.merge( + data_df, + sdrf_df[["source name", REFERENCE, CHANNEL]], + how="left", + on=[REFERENCE, CHANNEL], + ) + elif label == "ITRAQ": + result_df = pd.merge( + data_df, + sdrf_df[["source name", REFERENCE, CHANNEL]], + how="left", + on=[REFERENCE, CHANNEL], + ) + result_df.rename(columns={"source name": SAMPLE_ID}, inplace=True) + result_df = result_df[result_df["Condition"] != "Empty"] + + return result_df + + +def data_common_process(data_df: pd.DataFrame, min_aa: int) -> pd.DataFrame: + # Remove 0 intensity signals from the data + data_df = data_df[data_df[INTENSITY] > 0] + data_df = data_df[data_df["Condition"] != "Empty"] + + def map_canonical_seq(data_df: pd.DataFrame) -> (pd.DataFrame, dict): + modified_seqs = data_df[PEPTIDE_SEQUENCE].unique().tolist() + canonical_seqs = [get_canonical_peptide(i) for i in modified_seqs] + inner_canonical_dict = dict(zip(modified_seqs, canonical_seqs)) + data_df[PEPTIDE_CANONICAL] = data_df[PEPTIDE_SEQUENCE].map(inner_canonical_dict) + + return data_df, inner_canonical_dict + + if PEPTIDE_CANONICAL not in data_df.columns: + data_df, inner_canonical_dict = map_canonical_seq(data_df) + data_df[PEPTIDE_CANONICAL] = data_df[PEPTIDE_SEQUENCE].map(inner_canonical_dict) + # Filter peptides with less amino acids than min_aa (default: 7) + data_df = data_df[ + data_df.apply(lambda x: len(x[PEPTIDE_CANONICAL]) >= min_aa, axis=1) + ] + data_df[PROTEIN_NAME] = data_df[PROTEIN_NAME].apply(parse_uniprot_accession) + data_df[STUDY_ID] = data_df[SAMPLE_ID].apply(get_study_accession) + if FRACTION not in data_df.columns: + data_df[FRACTION] = 1 + data_df = data_df[ + [ + PROTEIN_NAME, + PEPTIDE_SEQUENCE, + PEPTIDE_CANONICAL, + PEPTIDE_CHARGE, + INTENSITY, + REFERENCE, + CONDITION, + RUN, + BIOREPLICATE, + FRACTION, + FRAGMENT_ION, + ISOTOPE_LABEL_TYPE, + STUDY_ID, + SAMPLE_ID, + ] + ] + data_df[CONDITION] = pd.Categorical(data_df[CONDITION]) + data_df[STUDY_ID] = pd.Categorical(data_df[STUDY_ID]) + data_df[SAMPLE_ID] = pd.Categorical(data_df[SAMPLE_ID]) + + return data_df + +def intensity_normalization( + dataset: DataFrame, + field: str, + class_field: str, + scaling_method: str = "quantile", +) -> DataFrame: + cols_to_keep = [ + PROTEIN_NAME, + PEPTIDE_CANONICAL, + PEPTIDE_SEQUENCE, + PEPTIDE_CHARGE, + SAMPLE_ID, + BIOREPLICATE, + CONDITION, + NORM_INTENSITY, + ] + # TODO add imputation and/or removal to those two norm strategies + if scaling_method == "msstats": + # For TMT normalization + if "Channel" in dataset.columns: + g = dataset.groupby(["Run", "Channel"])[field].apply(np.nanmedian) + g.name = "RunMedian" + dataset = dataset.join(g, on=["Run", "Channel"]) + median_baseline = dataset.drop_duplicates(subset=["Run", "Channel", field])[ + field + ].median() + dataset[NORM_INTENSITY] = ( + dataset[field] - dataset["RunMedian"] + median_baseline + ) + else: + g = dataset.groupby(["Run", "Fraction"])[field].apply(np.nanmedian) + g.name = "RunMedian" + dataset = dataset.join(g, on=["Run", "Fraction"]) + 
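+            # msstats-style shift for label-free data: subtract each run's median,
+            # then add back the per-fraction median so fractions stay on a common scale.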
+            dataset["FractionMedian"] = (
+                dataset["RunMedian"].groupby(dataset["Fraction"]).transform("median")
+            )
+            dataset[NORM_INTENSITY] = (
+                dataset[field] - dataset["RunMedian"] + dataset["FractionMedian"]
+            )
+        return dataset[cols_to_keep]
+
+    else:
+        # pivot to have one col per sample
+        print("Transforming to wide format, dataset size {}".format(len(dataset.index)))
+        normalize_df = pd.pivot_table(
+            dataset,
+            index=[
+                PEPTIDE_SEQUENCE,
+                PEPTIDE_CANONICAL,
+                PEPTIDE_CHARGE,
+                FRACTION,
+                RUN,
+                BIOREPLICATE,
+                PROTEIN_NAME,
+                STUDY_ID,
+                CONDITION,
+            ],
+            columns=class_field,
+            values=field,
+            aggfunc={field: np.nanmean},
+            observed=True,
+        )
+        normalize_df = normalize(normalize_df, scaling_method)
+        # TODO: When restoring the pivot table here, the previous grouping caused
+        # the dataframe to produce a large number of rows with NORM_INTENSITY of
+        # NA at melt, resulting in unbearable memory consumption.
+        normalize_df = recover_df(normalize_df)
+        normalize_df = normalize_df.drop_duplicates()
+        print(normalize_df.head())
+        return normalize_df[cols_to_keep]
+
+
+def remove_low_frequency_peptides_(
+    dataset_df: DataFrame, percentage_samples: float = 0.20
+):
+    """
+    Remove peptides that are present in fewer than 20% of the samples.
+    :param dataset_df: dataframe with the data
+    :param percentage_samples: percentage of samples
+    :return:
+    """
+
+    normalize_df = pd.pivot_table(
+        dataset_df,
+        index=[PEPTIDE_CANONICAL, PROTEIN_NAME],
+        columns=SAMPLE_ID,
+        values=NORM_INTENSITY,
+        aggfunc={NORM_INTENSITY: np.nanmean},
+        observed=True,
+    )
+    # Count the number of null values in each row
+    null_count = normalize_df.isnull().sum(axis=1)
+    # Find the rows that have null values above the threshold
+    rows_to_drop = null_count[
+        null_count >= (1 - percentage_samples) * normalize_df.shape[1]
+    ].index
+    # Drop the rows with too many null values
+    normalize_df = normalize_df.drop(rows_to_drop)
+
+    # Remove rows with non-null values in only one column
+    normalize_df = normalize_df[
+        normalize_df.notnull().sum(axis=1) != 1
+    ]
+    """
+    normalize_df = normalize_df.reset_index()
+    normalize_df = normalize_df.melt(id_vars=[PEPTIDE_CANONICAL, PROTEIN_NAME])
+    normalize_df.rename(columns={"value": NORM_INTENSITY}, inplace=True)
+    """
+    normalize_df = recover_df(normalize_df)
+    # recover condition column
+    normalize_df = normalize_df.merge(
+        dataset_df[[SAMPLE_ID, CONDITION]].drop_duplicates(subset=[SAMPLE_ID]),
+        on=SAMPLE_ID,
+        how="left",
+    )
+
+    # Remove rows with null values in NORM_INTENSITY
+    normalize_df = normalize_df[normalize_df[NORM_INTENSITY].notna()]
+
+    print(normalize_df.head())
+    return normalize_df
+
+
+def peptide_intensity_normalization(
+    dataset_df: DataFrame, field: str, class_field: str, scaling_method: str
+):
+    """
+    Normalize the peptide intensities using different methods.
+    :param dataset_df: dataframe with the data
+    :param field: field to normalize
+    :param class_field: field to use as class
+    :param scaling_method: method to use for the normalization
+    :return:
+    """
+    # pivot to have one col per sample
+    normalize_df = pd.pivot_table(
+        dataset_df,
+        index=[PEPTIDE_CANONICAL, PROTEIN_NAME, CONDITION],
+        columns=class_field,
+        values=field,
+        aggfunc={field: np.nanmean},
+        observed=True,
+    )
+    # TODO: scaling_method is currently unused here; should normalization be applied?
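+    # recover_df melts the wide pivot back to long format (one row per
+    # peptide/sample observation), mirroring the melt block kept below for
+    # reference.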
+    normalize_df = recover_df(normalize_df)
+    """
+    normalize_df = normalize_df.reset_index()
+    normalize_df = normalize_df.melt(
+        id_vars=[PEPTIDE_CANONICAL, PROTEIN_NAME, CONDITION]
+    )
+    normalize_df.rename(columns={"value": NORM_INTENSITY}, inplace=True)
+    normalize_df = normalize_df[normalize_df[NORM_INTENSITY].notna()]
+    """
+    return normalize_df
+
+
+def impute_peptide_intensities(dataset_df, field, class_field):
+    """
+    Impute the missing values using different methods.
+    :param dataset_df: dataframe with the data
+    :param field: field to impute
+    :param class_field: field to use as class
+    :return:
+    """
+    normalize_df = pd.DataFrame()
+    # group by condition to detect missing values
+    for c, g in dataset_df.groupby(CONDITION):
+        # pivot to have one col per sample
+        group_normalize_df = pd.pivot_table(
+            g,
+            index=[PEPTIDE_CANONICAL, PROTEIN_NAME, CONDITION],
+            columns=class_field,
+            values=field,
+            aggfunc={field: np.nanmean},
+            observed=True,
+        )
+
+        # A group with a single sample column has nothing to impute;
+        # melt it back to long format as-is.
+        if len(group_normalize_df.columns) < 2:
+            group_normalize_df = group_normalize_df.reset_index()
+            group_normalize_df = group_normalize_df.melt(
+                id_vars=[PEPTIDE_CANONICAL, PROTEIN_NAME, CONDITION]
+            )
+            group_normalize_df.rename(columns={"value": NORM_INTENSITY}, inplace=True)
+            normalize_df = pd.concat(
+                [normalize_df, group_normalize_df], ignore_index=True
+            )
+        # else:
+        #     # print ("nothing")
+        #     # Impute the missing values
+        #     # imputer = MissForest(max_iter=5)
+        #     # imputed_data = imputer.fit_transform(group_normalize_df)
+        #     # group_normalize_df = pd.DataFrame(imputed_data, columns=group_normalize_df.columns,
+        #     #                                   index=group_normalize_df.index)
+        #     # # Melt the dataframe
+        #     # group_normalize_df = group_normalize_df.reset_index()
+        #     # group_normalize_df = group_normalize_df.melt(id_vars=[PEPTIDE_CANONICAL, PROTEIN_NAME, CONDITION])
+        #     # group_normalize_df.rename(columns={'value': NORM_INTENSITY}, inplace=True)
+        #     # normalize_df = normalize_df.append(group_normalize_df, ignore_index=True)
+
+    return normalize_df
+
+
+@click.command()
+@click.option(
+    "-m", "--msstats", help="MSstats input file generated by quantms", default=None
+)
+@click.option(
+    "-p", "--parquet", help="Parquet input file generated by quantms.io", default=None
+)
+@click.option(
+    "-s", "--sdrf", help="SDRF file generated by quantms", default=None
+)
+@click.option("--stream", help="Stream processing normalization", is_flag=True)
+@click.option(
+    "--chunksize",
+    help="Number of rows read per chunk when streaming the MSstats or parquet input with pandas",
+    default=1000000,
+)
+@click.option(
+    "--min_aa", help="Minimum number of amino acids to filter peptides", default=7
+)
+@click.option(
+    "--min_unique",
+    help="Minimum number of unique peptides to filter proteins",
+    default=2,
+)
+@click.option(
+    "--remove_ids",
+    help="Remove specific protein ids from the analysis using a file with one id per line",
+)
+@click.option(
+    "--remove_decoy_contaminants",
+    help="Remove decoy and contaminant proteins from the analysis",
+    is_flag=True,
+    default=False,
+)
+@click.option(
+    "--remove_low_frequency_peptides",
+    help="Remove peptides that are present in less than 20% of the samples",
+    is_flag=True,
+    default=False,
+)
+@click.option(
+    "--output",
+    help="Peptide intensity file including all other properties needed for normalization",
+)
+@click.option(
+    "--skip_normalization", help="Skip normalization step", is_flag=True, default=False
+)
+@click.option(
+    "--nmethod",
+    help="Normalization method used to normalize intensities for all samples (options: quantile, msstats, qnorm)",
+    default="quantile",
+)
+@click.option(
+    "--pnormalization",
+    help="Normalize the peptide intensities using different methods (options: qnorm)",
+    is_flag=True,
+)
+@click.option(
+    "--compress",
+    help="Read the input peptides file as a gzip-compressed file",
+    is_flag=True,
+)
+@click.option(
+    "--log2",
+    help="Transform the peptide intensity values to log2 before normalization",
+    is_flag=True,
+)
+@click.option(
+    "--violin",
+    help="Use violin plots instead of box plots for distribution representations",
+    is_flag=True,
+)
+@click.option(
+    "--verbose",
+    help="Print additional information about the distributions of the intensities, the number of peptides removed "
+    "after normalization, etc.",
+    is_flag=True,
+)
+@click.option(
+    "--qc_report",
+    help="PDF file to store multiple QC images",
+    default="peptideNorm-QCprofile.pdf",
+)
+def peptide_normalization(
+    msstats: str,
+    parquet: str,
+    sdrf: str,
+    stream: bool,
+    chunksize: int,
+    min_aa: int,
+    min_unique: int,
+    remove_ids: str,
+    remove_decoy_contaminants: bool,
+    remove_low_frequency_peptides: bool,
+    output: str,
+    skip_normalization: bool,
+    nmethod: str,
+    pnormalization: bool,
+    compress: bool,
+    log2: bool,
+    violin: bool,
+    verbose: bool,
+    qc_report: str,
+) -> None:
+    if output is None:
+        print_help_msg(peptide_normalization)
+        exit(1)
+
+    if parquet is None and (msstats is None or sdrf is None):
+        print_help_msg(peptide_normalization)
+        exit(1)
+
+    if pnormalization and nmethod not in ["qnorm", "quantile"]:
+        exit(
+            "Peptide intensity normalization works only with qnorm or quantile methods!"
+        )
+
+    log_after_norm = not log2
+
+    pd.set_option("display.max_columns", None)
+    compression_method = "gzip" if compress else None
+    print("Loading data..")
+
+    if not stream:
+        if parquet is None:
+            # Read the msstats file
+            feature_df = pd.read_csv(
+                msstats,
+                sep=",",
+                compression=compression_method,
+                dtype={CONDITION: "category", ISOTOPE_LABEL_TYPE: "category"},
+            )
+
+            # Read the sdrf file
+            sdrf_df, label, sample_names, choice = analyse_sdrf(
+                sdrf, compression_method
+            )
+            print(sdrf_df)
+
+            # Merge the SDRF with the MSstats results
+            dataset_df = msstats_common_process(feature_df)
+            dataset_df = merge_sdrf(label, sdrf_df, dataset_df)
+            # Remove the intermediate variables and free the memory
+            del feature_df, sdrf_df
+            gc.collect()
+        else:
+            dataset_df = pd.read_parquet(parquet)[PARQUET_COLUMNS]
+            label, sample_names, choice = analyse_feature_df(dataset_df)
+            dataset_df = parquet_common_process(dataset_df, label, choice)
+
+        dataset_df = data_common_process(dataset_df, min_aa)
+        # Only proteins with at least min_unique (default: 2) unique peptides are retained
+        unique_peptides = set(
+            dataset_df.groupby(PEPTIDE_CANONICAL)
+            .filter(lambda x: len(set(x[PROTEIN_NAME])) == 1)[PEPTIDE_CANONICAL]
+            .tolist()
+        )
+        strong_proteins = set(
+            dataset_df[dataset_df[PEPTIDE_CANONICAL].isin(unique_peptides)]
+            .groupby(PROTEIN_NAME)
+            .filter(lambda x: len(set(x[PEPTIDE_CANONICAL])) >= min_unique)[
+                PROTEIN_NAME
+            ]
+            .tolist()
+        )
+        dataset_df = dataset_df[dataset_df[PROTEIN_NAME].isin(strong_proteins)]
+
+        print(f"Number of unique peptides: {len(unique_peptides)}")
+        print(f"Number of strong proteins: {len(strong_proteins)}")
+
+        print("Applying log2 transformation if specified..")
+        dataset_df = dataset_df.rename(columns={INTENSITY: NORM_INTENSITY})
+        if log2:
+            dataset_df[NORM_INTENSITY] = np.log2(dataset_df[NORM_INTENSITY])
+
+        # Print the distribution of the original peptide intensities from quantms analysis
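+        # QC snapshot of the raw intensities; the plot width grows with the
+        # number of samples so per-sample distributions stay readable.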
+        if verbose:
+            sample_names = set(dataset_df[SAMPLE_ID])
+            plot_width = len(sample_names) * 0.5 + 10
+            pdf = PdfPages(qc_report)
+            density = plot_distributions(
+                dataset_df,
+                NORM_INTENSITY,
+                SAMPLE_ID,
+                log2=not log2,
+                width=plot_width,
+                title="Original peptidoform intensity distribution (no normalization)",
+            )
+            # plt.show()
+            pdf.savefig(density)
+            """
+            box = plot_box_plot(
+                dataset_df,
+                NORM_INTENSITY,
+                SAMPLE_ID,
+                log2=not log2,
+                width=plot_width,
+                title="Original peptidoform intensity distribution (no normalization)",
+                violin=violin,
+            )
+            plt.show()
+            pdf.savefig(box)
+            """
+
+        # Remove highly abundant proteins, contaminants and outliers
+        if remove_ids is not None:
+            print("Remove proteins from file...")
+            dataset_df = remove_protein_by_ids(dataset_df, remove_ids)
+        if remove_decoy_contaminants:
+            print("Remove decoy and contaminants...")
+            dataset_df = remove_contaminants_entrapments_decoys(dataset_df)
+
+        print_dataset_size(dataset_df, "Peptides after contaminants removal: ", verbose)
+        print("Normalize intensities.. ")
+        # dataset_df = dataset_df.dropna(how="any")
+        if not skip_normalization:
+            dataset_df = intensity_normalization(
+                dataset_df,
+                field=NORM_INTENSITY,
+                class_field=SAMPLE_ID,
+                scaling_method=nmethod,
+            )
+            if verbose:
+                density = plot_distributions(
+                    dataset_df,
+                    NORM_INTENSITY,
+                    SAMPLE_ID,
+                    # log2=log_after_norm,
+                    width=plot_width,
+                    title="Peptidoform intensity distribution after normalization, method: "
+                    + nmethod,
+                )
+                # plt.show()
+                pdf.savefig(density)
+                """
+                box = plot_box_plot(
+                    dataset_df,
+                    NORM_INTENSITY,
+                    SAMPLE_ID,
+                    log2=log_after_norm,
+                    width=plot_width,
+                    title="Peptidoform intensity distribution after normalization, method: "
+                    + nmethod,
+                    violin=violin,
+                )
+                plt.show()
+                pdf.savefig(box)
+                """
+        print("Number of peptides after normalization: " + str(len(dataset_df.index)))
+        print("Select the best peptidoform across fractions...")
+        dataset_df = get_peptidoform_normalize_intensities(dataset_df)
+        print(
+            "Number of peptides after peptidoform selection: "
+            + str(len(dataset_df.index))
+        )
+
+        print("Sum all peptidoforms per Sample...")
+        dataset_df = sum_peptidoform_intensities(dataset_df)
+        print("Number of peptides after selection: " + str(len(dataset_df.index)))
+
+        print("Average all peptidoforms per Peptide/Sample...")
+        dataset_df = average_peptide_intensities(dataset_df)
+        print("Number of peptides after averaging: " + str(len(dataset_df.index)))
+        if verbose:
+            density = plot_distributions(
+                dataset_df,
+                NORM_INTENSITY,
+                SAMPLE_ID,
+                log2=log_after_norm,
+                width=plot_width,
+                title="Peptide intensity distribution, method: " + nmethod,
+            )
+            plt.show()
+            pdf.savefig(density)
+            box = plot_box_plot(
+                dataset_df,
+                NORM_INTENSITY,
+                SAMPLE_ID,
+                log2=log_after_norm,
+                width=plot_width,
+                title="Peptide intensity distribution, method: " + nmethod,
+                violin=violin,
+            )
+            plt.show()
+            pdf.savefig(box)
+
+        if remove_low_frequency_peptides and len(sample_names) > 1:
+            print(dataset_df)
+            dataset_df = remove_low_frequency_peptides_(dataset_df, 0.20)
+            print_dataset_size(
+                dataset_df, "Peptides after removing low-frequency peptides: ", verbose
+            )
+        # Perform imputation using Random Forest in Peptide Intensities
+        # TODO: Check if this is necessary (we should investigate whether imputation at the peptide level is needed)
+        # if impute:
+        #     dataset_df = impute_peptide_intensities(dataset_df, field=NORM_INTENSITY, class_field=SAMPLE_ID)
+
+        if pnormalization:
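+            # Optional second normalization pass at the peptide level;
+            # validated earlier to run only with the qnorm/quantile methods.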
+            print("Normalize at Peptide level...")
+            dataset_df = peptide_intensity_normalization(
+                dataset_df,
+                field=NORM_INTENSITY,
+                class_field=SAMPLE_ID,
+                scaling_method=nmethod,
+            )
+
+        if verbose:
+            density = plot_distributions(
+                dataset_df,
+                NORM_INTENSITY,
+                SAMPLE_ID,
+                log2=log_after_norm,
+                width=plot_width,
+                title="Normalization at peptide level, method: " + nmethod,
+            )
+            plt.show()
+            pdf.savefig(density)
+            box = plot_box_plot(
+                dataset_df,
+                NORM_INTENSITY,
+                SAMPLE_ID,
+                log2=log_after_norm,
+                width=plot_width,
+                title="Normalization at peptide level, method: " + nmethod,
+                violin=violin,
+            )
+            plt.show()
+            pdf.savefig(box)
+            pdf.close()
+
+        print("Save the normalized peptide intensities...")
+        dataset_df.to_csv(output, index=False, sep=",")
+    else:
+        if parquet is None:
+            sdrf_df, label, sample_names, choice = analyse_sdrf(
+                sdrf, compression_method
+            )
+            msstats_chunks = pd.read_csv(
+                msstats,
+                sep=",",
+                compression=compression_method,
+                dtype={CONDITION: "category", ISOTOPE_LABEL_TYPE: "category"},
+                chunksize=chunksize,
+            )
+        else:
+            label, sample_names, choice = analyse_feature_parquet(
+                parquet, batch_size=chunksize
+            )
+            msstats_chunks = read_large_parquet(parquet, batch_size=chunksize)
+        sample_number = len(sample_names)
+
+        # TODO: Stream processing to obtain strong proteins with at least min_unique (default: 2) unique peptides
+        temp = f"Temp-{str(uuid.uuid4())}/"
+        os.mkdir(temp)
+        print(f"INFO: Writing files into {temp}...")
+        unique_peptides = {}
+        group_intensities = {}
+        quantile = {}
+        print("INFO: First iteration to get unique peptides and strong proteins...")
+        for msstats_df in msstats_chunks:
+            if parquet is None:
+                msstats_df = msstats_common_process(msstats_df)
+                msstats_df = merge_sdrf(label, sdrf_df, msstats_df)
+            else:
+                msstats_df = parquet_common_process(msstats_df, label, choice)
+            result_df = data_common_process(msstats_df, min_aa)
+
+            # Write CSVs by Sample ID
+            for sample in sample_names:
+                file_name = f"{temp}/{sample}.csv"
+                write_mode = "a" if os.path.exists(file_name) else "w"
+                header = False if os.path.exists(file_name) else True
+                result_df[result_df[SAMPLE_ID] == sample].to_csv(
+                    file_name, index=False, header=header, mode=write_mode
+                )
+            unique_df = result_df.groupby([PEPTIDE_CANONICAL]).filter(
+                lambda x: len(set(x[PROTEIN_NAME])) == 1
+            )[[PEPTIDE_CANONICAL, PROTEIN_NAME]]
+            unique_dict = dict(
+                zip(unique_df[PEPTIDE_CANONICAL], unique_df[PROTEIN_NAME])
+            )
+            for i in unique_dict.keys():
+                if i in unique_peptides.keys() and unique_dict[i] != unique_peptides[i]:
+                    unique_peptides.pop(i)
+                else:
+                    unique_peptides[i] = unique_dict[i]
+
+        proteins_list = list(unique_peptides.values())
+        count_dict = {
+            element: proteins_list.count(element) for element in set(proteins_list)
+        }
+        strong_proteins = [
+            element for element in count_dict if count_dict[element] >= min_unique
+        ]
+        del proteins_list, count_dict
+        print(f"Number of unique peptides: {len(list(unique_peptides.keys()))}")
+        print(f"Number of strong proteins: {len(strong_proteins)}")
+
+        # TODO: Filter proteins with fewer unique peptides than min_unique (default: 2)
+        plot_samples = random.sample(sample_names, min(len(sample_names), 20))
+        plot_width = 10 + len(plot_samples) * 0.5
+        pdf = PdfPages(qc_report)
+        original_intensities_df = pd.DataFrame()
+
+        print("INFO: Second iteration to filter data and prepare normalization...")
+        print("Applying log2 transformation if specified..")
+        norm_record = [0] * 2
+        for sample in sample_names:
+            msstats_df = pd.read_csv(f"{temp}/{sample}.csv", sep=",")
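+            # Second pass over the per-sample CSVs: keep only strong proteins,
+            # drop unwanted IDs/contaminants, then accumulate the statistics
+            # needed by the chosen normalization method.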
+            msstats_df = msstats_df[msstats_df[PROTEIN_NAME].isin(strong_proteins)]
+            # Remove highly abundant proteins, contaminants and outliers
+            if remove_ids is not None:
+                msstats_df = remove_protein_by_ids(msstats_df, remove_ids)
+            if remove_decoy_contaminants:
+                msstats_df = remove_contaminants_entrapments_decoys(msstats_df)
+            norm_record[0] += len(msstats_df)
+            msstats_df = msstats_df.rename(columns={INTENSITY: NORM_INTENSITY})
+            if log2:
+                msstats_df[NORM_INTENSITY] = np.log2(msstats_df[NORM_INTENSITY])
+            if sample in plot_samples:
+                original_intensities_df = pd.concat(
+                    [original_intensities_df, msstats_df]
+                )
+            if not skip_normalization:
+                if nmethod == "msstats":
+                    if label in ["TMT", "ITRAQ"]:
+                        g = msstats_df.groupby(["Run", "Channel"])
+                    else:
+                        g = msstats_df.groupby(["Run", "Fraction"])
+                    for name, group in g:
+                        group_intensity = group[NORM_INTENSITY].tolist()
+                        if name not in group_intensities:
+                            group_intensities[name] = group_intensity
+                        else:
+                            group_intensities.update(
+                                {name: group_intensities[name] + group_intensity}
+                            )
+                elif nmethod == "quantile":
+                    msstats_df = (
+                        msstats_df.groupby(
+                            [
+                                PEPTIDE_SEQUENCE,
+                                PEPTIDE_CANONICAL,
+                                PEPTIDE_CHARGE,
+                                FRACTION,
+                                RUN,
+                                BIOREPLICATE,
+                                PROTEIN_NAME,
+                                STUDY_ID,
+                                CONDITION,
+                            ]
+                        )[NORM_INTENSITY]
+                        .agg(np.nanmean)
+                        .reset_index()
+                    )
+                    rank = msstats_df[NORM_INTENSITY].rank(method="average")
+                    dic = dict(zip(rank, msstats_df[NORM_INTENSITY]))
+                    if len(quantile) == 0:
+                        quantile = {k: (v, 1) for k, v in dic.items()}
+                    else:
+                        # update = min(len(quantile), len(dic))
+                        intersec = set(quantile.keys()) & set(dic.keys())
+                        update = set(dic.keys()) - set(quantile.keys())
+                        quantile.update(
+                            {
+                                i: (quantile[i][0] + dic[i], quantile[i][1] + 1)
+                                for i in intersec
+                            }
+                        )
+                        if len(update) > 0:
+                            quantile.update({k: (dic[k], 1) for k in update})
+                    msstats_df[SAMPLE_ID] = sample
+                else:
+                    exit("Stream processing only supports the msstats and quantile methods!")
+            msstats_df.to_csv(f"{temp}/{sample}.csv", index=False, sep=",")
+            norm_record[1] += len(msstats_df)
+        if not skip_normalization and nmethod == "quantile":
+            quantile = {k: v[0] / v[1] for k, v in quantile.items()}
+        print(f"Peptides after contaminants removal: {norm_record[0]}")
+        print(f"Number of peptides after normalization: {norm_record[1]}")
+        # Save original intensities QC plots
+        original_intensities_df = original_intensities_df.reset_index(drop=True)
+        density = plot_distributions(
+            original_intensities_df,
+            NORM_INTENSITY,
+            SAMPLE_ID,
+            log2=not log2,
+            width=plot_width,
+            title="Original peptidoform intensity distribution (no normalization)",
+        )
+        pdf.savefig(density)
+        box = plot_box_plot(
+            original_intensities_df,
+            NORM_INTENSITY,
+            SAMPLE_ID,
+            log2=not log2,
+            width=plot_width,
+            title="Original peptidoform intensity distribution (no normalization)",
+            violin=violin,
+        )
+        plt.show()
+        pdf.savefig(box)
+        del original_intensities_df
+
+        # TODO: Peptide intensity normalization
+        peptides_count = pd.DataFrame(
+            columns=[PROTEIN_NAME, PEPTIDE_CANONICAL, "count"]
+        )
+        norm_intensities_df = pd.DataFrame()
+        if not skip_normalization and nmethod == "msstats":
+            # For ISO normalization
+            if label in ["TMT", "ITRAQ"]:
+                median_baseline = np.nanmedian(
+                    list(set(sum(group_intensities.values(), [])))
+                )
+                group_intensities = {
+                    key: np.nanmedian(list(values))
+                    for key, values in group_intensities.items()
+                }
+            else:
+                fractions = [i[1] for i in group_intensities.keys()]
+                fraction_median = {}
+                for fraction in fractions:
+                    fraction_keys = [
+                        i for i in group_intensities.keys() if i[1] == fraction
+                    ]
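+                    # Pool the raw intensities of every (run, fraction) group in
+                    # this fraction before taking the overall fraction median.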
+                    fraction_intensities = []
+                    for key in fraction_keys:
+                        fraction_intensities.extend(group_intensities[key])
+                    fraction_median[fraction] = np.nanmedian(fraction_intensities)
+                group_intensities = {
+                    key: np.nanmedian(values)
+                    for key, values in group_intensities.items()
+                }
+        print("INFO: Third iteration to normalize and count peptide frequency...")
+        size_record = [0] * 3
+
+        def normalization(
+            dataset_df, label, sample, skip_normalization, nmethod, record
+        ):
+            if not skip_normalization:
+                field = NORM_INTENSITY
+                if nmethod == "msstats":
+                    # For ISO normalization
+                    if label in ["TMT", "ITRAQ"]:
+                        dataset_df.loc[:, NORM_INTENSITY] = dataset_df.apply(
+                            lambda x: x[field]
+                            - group_intensities[(x["Run"], x["Channel"])]
+                            + median_baseline,
+                            axis=1,
+                        )
+                    else:
+                        dataset_df.loc[:, NORM_INTENSITY] = dataset_df.apply(
+                            lambda x: x[field]
+                            - group_intensities[(x["Run"], x["Fraction"])]
+                            + np.nanmedian(
+                                [
+                                    group_intensities[i]
+                                    for i in group_intensities.keys()
+                                    if i[1] == x["Fraction"]
+                                ]
+                            ),
+                            axis=1,
+                        )
+                elif nmethod == "quantile":
+                    rank = dataset_df[NORM_INTENSITY].rank(method="average")
+                    ref_dict = dict(zip(rank, dataset_df[NORM_INTENSITY]))
+                    ref_dict = {v: quantile[k] for k, v in ref_dict.items()}
+                    dataset_df.loc[:, NORM_INTENSITY] = dataset_df.apply(
+                        lambda x: ref_dict.get(x[NORM_INTENSITY], np.nan),
+                        axis=1,
+                    )
+            dataset_df = dataset_df.drop_duplicates()
+            dataset_df = dataset_df[dataset_df[NORM_INTENSITY].notna()]
+            dataset_df = get_peptidoform_normalize_intensities(dataset_df)
+            record[0] += len(dataset_df.index)
+            dataset_df = sum_peptidoform_intensities(dataset_df)
+            record[1] += len(dataset_df.index)
+            dataset_df = average_peptide_intensities(dataset_df)
+            record[2] += len(dataset_df.index)
+
+            return dataset_df, record
+
+        for sample in sample_names:
+            dataset_df = pd.read_csv(f"{temp}/{sample}.csv", sep=",")
+            if len(dataset_df) != 0:
+                norm_df, size_record = normalization(
+                    dataset_df, label, sample, skip_normalization, nmethod, size_record
+                )
+            else:
+                continue
+            if remove_low_frequency_peptides and sample_number > 1:
+                sample_peptides = norm_df[
+                    [PROTEIN_NAME, PEPTIDE_CANONICAL]
+                ].drop_duplicates()
+                sample_peptides["count"] = 1
+                peptides_count = (
+                    pd.concat([peptides_count, sample_peptides])
+                    .groupby([PROTEIN_NAME, PEPTIDE_CANONICAL])
+                    .agg(sum)
+                    .reset_index()
+                )
+            norm_df.to_csv(f"{temp}/{sample}.csv", sep=",", index=False)
+            if sample in plot_samples:
+                norm_intensities_df = pd.concat([norm_intensities_df, norm_df])
+        del group_intensities, quantile
+        print(f"Number of peptides after peptidoform selection: {size_record[0]}")
+        print(f"Number of peptides after selection: {size_record[1]}")
+        print(f"Number of peptides after averaging: {size_record[2]}")
+        # Save normalized intensities QC plots
+        norm_intensities_df = norm_intensities_df.reset_index(drop=True)
+        density = plot_distributions(
+            norm_intensities_df,
+            NORM_INTENSITY,
+            SAMPLE_ID,
+            log2=log_after_norm,
+            width=plot_width,
+            title="Peptidoform intensity distribution after normalization, method: "
+            + nmethod,
+        )
+        plt.show()
+        pdf.savefig(density)
+        box = plot_box_plot(
+            norm_intensities_df,
+            NORM_INTENSITY,
+            SAMPLE_ID,
+            log2=log_after_norm,
+            width=plot_width,
+            title="Peptidoform intensity distribution after normalization, method: "
+            + nmethod,
+            violin=violin,
+        )
+        plt.show()
+        pdf.savefig(box)
+        del norm_intensities_df, strong_proteins
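+
+        # Final pass: apply the low-frequency peptide filter and append each
+        # sample's normalized table to the single output CSV.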
+        print("INFO: Writing normalized intensities into CSV...")
+        if remove_low_frequency_peptides and sample_number > 1:
+            peptides_count = peptides_count.loc[
+                (peptides_count["count"] > 0.20 * sample_number)
+                & (peptides_count["count"] != sample_number - 1)
+            ]
+
+        final_norm_intensities_df = pd.DataFrame()
+        size_record = 0
+        for sample in sample_names:
+            dataset_df = pd.read_csv(f"{temp}/{sample}.csv", sep=",")
+            if remove_low_frequency_peptides and sample_number > 1:
+                # Filter low-frequency peptides, i.e. peptides that occur in
+                # less than 20% of all samples or in only one sample
+                dataset_df = dataset_df.merge(
+                    peptides_count[[PEPTIDE_CANONICAL, PROTEIN_NAME]], how="inner"
+                )
+            size_record += len(dataset_df.index)
+            dataset_df = dataset_df[
+                [PEPTIDE_CANONICAL, PROTEIN_NAME, SAMPLE_ID, NORM_INTENSITY, CONDITION]
+            ]
+            write_mode = "a" if os.path.exists(output) else "w"
+            header = False if os.path.exists(output) else True
+            dataset_df.to_csv(output, index=False, header=header, mode=write_mode)
+            dataset_df.to_csv(f"{temp}/{sample}.csv", sep=",", index=False)
+            if sample in plot_samples:
+                final_norm_intensities_df = pd.concat(
+                    [final_norm_intensities_df, dataset_df]
+                )
+        print(f"Peptides after removing low-frequency peptides: {size_record}")
+        if remove_low_frequency_peptides:
+            del peptides_count
+
+        # TODO: No peptide intensity normalization applied in stream processing.
+        # Save final normalized intensities QC plots
+        final_norm_intensities_df = final_norm_intensities_df.reset_index(drop=True)
+        density = plot_distributions(
+            final_norm_intensities_df,
+            NORM_INTENSITY,
+            SAMPLE_ID,
+            log2=log_after_norm,
+            width=plot_width,
+            title="Normalization at peptide level, method: " + nmethod,
+        )
+        plt.show()
+        pdf.savefig(density)
+        box = plot_box_plot(
+            final_norm_intensities_df,
+            NORM_INTENSITY,
+            SAMPLE_ID,
+            log2=log_after_norm,
+            width=plot_width,
+            title="Normalization at peptide level, method: " + nmethod,
+            violin=violin,
+        )
+        plt.show()
+        pdf.savefig(box)
+        pdf.close()
+
+
+if __name__ == "__main__":
+    peptide_normalization()
diff --git a/build/scripts-3.10/tsne_visualization.py b/build/scripts-3.10/tsne_visualization.py
new file mode 100644
index 0000000..82488fb
--- /dev/null
+++ b/build/scripts-3.10/tsne_visualization.py
@@ -0,0 +1,187 @@
+# import libraries
+import glob
+import math
+
+import click
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import seaborn as sns
+from sklearn.decomposition import PCA
+from sklearn.manifold import TSNE
+
+from ibaq.ibaqpy_commons import (IBAQ_LOG, IBAQ_NORMALIZED, PROTEIN_NAME,
+                                 SAMPLE_ID)
+
+
+# function to compute principal components
+def compute_pca(df, n_components=5) -> pd.DataFrame:
+    """
+    Compute principal components for a given dataframe.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        A dataframe with samples as rows and features as columns.
+    n_components : int
+        Number of principal components to be computed.
+
+    Returns
+    -------
+    df_pca : pd.DataFrame
+        A dataframe with the principal components.
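+
+    Example
+    -------
+    df_pca = compute_pca(df, n_components=5)  # df: samples as rows, features as columns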
+    """
+
+    pca = PCA(n_components=n_components)
+    pca.fit(df)
+    df_pca = pca.transform(df)
+
+    df_pca = pd.DataFrame(
+        df_pca, index=df.index, columns=[f"PC{i}" for i in range(1, n_components + 1)]
+    )
+
+    plt.rcParams["figure.figsize"] = (12, 6)
+
+    fig, ax = plt.subplots()
+    xi = np.arange(1, n_components + 1, step=1)
+    y = np.cumsum(pca.explained_variance_ratio_)
+
+    plt.ylim(0.0, 1.1)
+    plt.plot(xi, y, marker="o", linestyle="--", color="b")
+
+    plt.xlabel("Number of Components")
+    plt.xticks(
+        np.arange(0, n_components, step=1)
+    )  # change from 0-based array index to 1-based human-readable label
+    plt.ylabel("Cumulative explained variance")
+    plt.title("The number of components needed to explain variance")
+
+    plt.axhline(y=0.95, color="r", linestyle="-")
+    plt.text(0.5, 0.85, "95% cut-off threshold", color="red", fontsize=16)
+
+    ax.grid(axis="x")
+    plt.show()
+
+    return df_pca
+
+
+def compute_tsne(df_pca, n_components=2, perplexity=30, learning_rate=200, n_iter=2000):
+    """
+    Compute t-SNE components from PCA components.
+
+    This function applies t-SNE (t-Distributed Stochastic Neighbor Embedding) to the input DataFrame,
+    which is expected to contain PCA components with samples as rows. The output is another DataFrame
+    that contains t-SNE components, also with samples as rows.
+
+    Parameters
+    ----------
+    df_pca : pandas DataFrame
+        Input DataFrame containing PCA components. Rows are samples and columns are PCA components.
+    n_components : int, optional
+        The number of dimensions for the t-SNE components (default is 2).
+    perplexity : float, optional
+        The perplexity parameter for t-SNE, which can influence the balance between maintaining
+        the local and global structure of the data (default is 30).
+    learning_rate : float, optional
+        The learning rate for t-SNE (default is 200).
+    n_iter : int, optional
+        The number of iterations for t-SNE optimization (default is 2000).
+
+    Returns
+    -------
+    df_tsne : pandas DataFrame
+        Output DataFrame containing t-SNE components. Rows are samples and columns are t-SNE components.
+
+    Example
+    -------
+    df_pca = pd.DataFrame(data, columns=['PC1', 'PC2', 'PC3'])
+    df_tsne = compute_tsne(df_pca)
+    """
+
+    tsne = TSNE(
+        n_components=n_components,
+        perplexity=perplexity,
+        learning_rate=learning_rate,
+        n_iter=n_iter,
+    )
+    tsne_results = tsne.fit_transform(np.asarray(df_pca))
+
+    tsne_cols = [f"tSNE{i + 1}" for i in range(n_components)]
+
+    df_tsne = pd.DataFrame(data=tsne_results, columns=tsne_cols)
+    df_tsne.index = df_pca.index
+    return df_tsne
+
+
+def plot_tsne(df, x_col, y_col, hue_col, file_name):
+    fig, ax = plt.subplots(1, 1, figsize=(20, 10))
+    sns.scatterplot(
+        x=x_col, y=y_col, hue=hue_col, data=df, ax=ax, markers=["o", "+", "x"]
+    )
+    ax.set_xlabel(x_col)
+    ax.set_ylabel(y_col)
+    ax.set_title(f"{x_col} vs {y_col} with {hue_col} information")
+    # anchor the legend just outside the upper-right corner of the axes
+    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0, fontsize=8)
+    plt.subplots_adjust(right=0.8)
+    plt.savefig(file_name)
+
+
+@click.command()
+@click.option(
+    "-f", "--folder", help="Folder that contains all the protein files", required=True
+)
+@click.option(
+    "-o",
+    "--pattern",
+    help="Protein file pattern",
+    required=False,
+    default="proteins.tsv",
+)
+def tsne_visualization(folder: str, pattern: str):
+    """
+    Generate t-SNE plots from protein files in a folder.
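+
+    Files are matched as "{folder}/*{pattern}" and are expected to provide
+    the PROTEIN_NAME, SAMPLE_ID and IBAQ_LOG columns.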
+    :param folder: Folder that contains all the protein files
+    :param pattern: Protein file pattern
+    """
+    # get all the files in the folder
+    files = glob.glob(f"{folder}/*{pattern}")
+
+    # read the files into pandas with selected columns
+    # (protein accession, sample ID, reanalysis accession, intensity)
+
+    dfs = []  # list of dataframes
+
+    for f in files:
+        reanalysis = (f.split("/")[-1].split("_")[0]).replace("-proteins.tsv", "")
+        dfs += [
+            pd.read_csv(f, usecols=[PROTEIN_NAME, SAMPLE_ID, IBAQ_LOG], sep=",").assign(
+                reanalysis=reanalysis
+            )
+        ]
+
+    total_proteins = pd.concat(dfs, ignore_index=True)
+
+    normalize_df = pd.pivot_table(
+        total_proteins,
+        index=[SAMPLE_ID, "reanalysis"],
+        columns=PROTEIN_NAME,
+        values=IBAQ_LOG,
+    )
+    normalize_df = normalize_df.fillna(0)
+    df_pca = compute_pca(normalize_df, n_components=30)
+    df_tsne = compute_tsne(df_pca)
+
+    batch = df_tsne.index.get_level_values("reanalysis").tolist()
+    df_tsne["batch"] = batch
+
+    # plot the t-SNE components tSNE1 vs tSNE2 with batch information using seaborn
+    plot_tsne(
+        df_tsne, "tSNE1", "tSNE2", "batch", "5.tsne_plot_with_batch_information.pdf"
+    )
+
+    print(total_proteins.shape)
+
+
+if __name__ == "__main__":
+    tsne_visualization()
diff --git a/dist/ibaqpy-0.0.3-py3.10.egg b/dist/ibaqpy-0.0.3-py3.10.egg
new file mode 100644
index 0000000000000000000000000000000000000000..3df0a855bc8fbcab373de5b14473d99ee707799f
GIT binary patch
literal 100087
[binary egg payload omitted]
z+nZY(>tEv9YahlIKHqINw)A^i(vqK+uP?out?X?J6XGzPIf)BgQd*fU>>4J1 zF@-Kz4{rKyAeLgTd2dQn@+NM4s}SGEI?}m=nyszscZKY8!3Y=In|IPDn4p};2GF+T zf~k#e{76Fgqfy(ho{^dLSHa|=yO1!D0U*3F;!ahc$NiQ^vw94}Wy8zydinghe!6Mj z-uIz?BB}Syw&j#hI|^Yz8Hr{4c9S4W9GQ(O*;L*f))-S_U)3Vfb?w~P2DFYdaCHPF zrEOm6+kr`v4W$zyIAbrdDhcXgnlp*0LC+7Kas1q@dL!tu>SNI z{2E!42HXx|$;i7{UVRSC<{_zzmpDm{sh^esCweg|B&$&@4ylGXsREN7amUKW9EejH zoOZfxaX*iiela$4aL9uo+3+y43rj{LjR=FTfKRxLelv^6FFpkueCVH^!Zm;IwY>+{ zFdP>7Cc14qgqve(vqA!il^<$DvDX^KHEqsfRueYuBR)=0?j>HDIf}^6(FaTkO%+cS zp?t-1#vyu#iFMGc!M44NIPZ^iC3_QJ0z$3D9OJ@a=N`RHHhtL=+XgZezQ`IfI@;?r z0jm>@xvh_n)I*WYf;2Nalp$I{vr+wU+=E$vIgP+ca5c@ex(tbntNcqU0%UGVlq2k( zCtTAZf0P%k7OmBy6%rLPac%Ehcj2t{MYdHKJiIMF@2vjTIxm;{0tD_MQKCZwz_$Kv zGHVOnh@O*+T>x=v(@HK9gFCYKn$6ik6aR5MN}m3~UbEEFOIFAzFy}>R~M{K*&>~KsaD4Pr>s%BOihj$TyAwm(CSr zgg5e++YZd1hmURUNwdwMLI$kQYerz*DlOpw$^8uZ#o^r07ibR@I8k|f9vLNeTuu+0 z;K0AOAH{_eVTE!ekdlAS_IY>5sX3c;Tyo>{MPEZS?>}sH979xZKThSIb5aisA;<+4 zFE~kOo)R<)8;)a_o`6dT^$obE-#TP}8WJ3{OoWV5YB-xnE}CyVTAI@vrNAN9-CW0U z&C{~u#8BG=GhaTsFz;j_5Hqaf09GZ2!kQPrp3#hH4%@{*0iFSv|V~BBDe1DHVeRK1CvbHQ4K6c%) zlxZ#Xab>QW%<(q;YYew3-qL zE4qhS;?3t7$l9i%g??7f*N_##pQfbzPViSiFP-oLaZ`&Q3y^LFl!qmweIs(*Un=w< zdoFoFo0Jo^NuME8k-W1nG{vbdxO1dW8`dNE*=vD#B=khWoK5?Y2IlQWz)%}h=taUP z`+?n^I*G@aCAA0FFfuW#vwQyzC%B=cVEPF$o6YO@;^ie!hIA-)x)Q(c2Ysse+7@}a z4@((e%4q)BmtpE};AEzZd{6V6--osgA9_7t;$23PB7tk5=E));9;<^5uVXPsKK!D_ z3u0P>bjb`_g_eb#GVWBoOmO5p3^?=XwW%<9@{ZM%A}v77@u_XgV8@=&K+umXdy8}mJPq?tvwlrjT^Nx(+)#NZ8C6C?)^1ZK$AHGCXJ3Ova9 z?4>yyF3^|sSD(OqM}dA}-x)+iNfWO?%IY-Nx^XsQe}1~a&~Qk?9>(G7G-e6VrmD1# z8a;;oTgfn1R|8hbYs}U7RwU*ZOENrDt%8UUjbR)HwtXb7-ow-8VNr}gGMph3T2|5; z_{gjcwN%8(xNQbfBUgYUf&Y%z7wc#A93m-t^SGL}H9f*J}KFx($nZ&GKpF*JC?eQL}_9qCl=^!Pw( z9N!_bK|E1b5MWYw9-{-fUY@C^Cii_ggwB+30~p~WDOk$gS}lE%_=eY!WM{dHMq@sB zwu!@}SPaGmtIx|Yc4RA&M0uuEmJ0mX!`cRx4du(hl*;{im)V~XjzETY^qbh+#$S?y zKN!ty%5hs`+R@0YgY+E!L`q-PyGfYMuL3tf-ZVFmyWA_{-9evq!wfZ6j|J-{9Ur7u z(st>DlJFw(m8@Mno4hG^BVgy?ES%WsHOqY4%m)T`gF)|=t3&tD6YOnNjOmac?WFrd z9gHnYP!0^|c#kLNtzS=ty0tLJ!!ItW23)-VYP=aFPc8qR;)$mE0~YZMmCmkU{$a%K zM0gLvUVxM6!Rmt|Lt;u*A|~1Tff`Hr-Vn6H+4ZNr&j$k#zN>>&IdaQT6?WXAh<_{b zCazem4AJ+a-*Cv8XKBYehmVi^FzHa*i!?1-C)bX0JSwEql1j9hQ#Q`0_5}XtPdP}cLHw0)FOeQ zY+DA(`$u#L1~FtqjoE!6hbS#H&ZuiXe)L8Aiux zMU_7FlpKc|E2(!3TfZ$Cx+HIy0)WtuSHQ82BJqchm_J z7~rjroi@8gQ!L~}OkkA))WGp&mQOqy$kYsK{tQT}A2kTqbhMQyTiv}IRO@@7K16rF zRyq5?tgIT)=gqsN_6C9bK(xU0N31iy2d=36JFgdM1%&1hP=uW5E9uaz4fxMI8k;un zCi|v-p_O&cbb<9{v&*Dn<1h}?{c-B3rz(G118PGPakXwWwnJR3*OYu(#-#karaxIj z)Z~XbGOwG0k#^Xv@Bhm1|4V0&eCo)PivR>9f(ryh`|ou2e?iy&rm;`bln3BSyIh^w zabzTlDWTBICeyc!5X5kkj%DXO+I8w%u+?|%Uue#+%0!;liLz;lmmhxSRYWw4v&9AA ztqNdCAswKB)sBI1LvkP2mjH3IfsjQ2340;LPP)0Wy4R!^O6zRNNjd+}c&yO4 z{`vS>J`fhx3DPrtwsLVyiFUZ1sNo&q@Dol$l{{PoU50Py+w~yDUG-afRf02}39f@i zSEc=WG2TMwCdSqe#_O0hlh0QED}_~5?>>f?gr`D-#)!{E{zHY0h0jQS(C;jjV(>1Z zNoUr2C0f=3^Osr@?IF+T5-vxU(MP9P_13a@`^0lF9`D+9+RmBk+q(2$jvb3im(nXdB-7 zUw411mj(GbcyT&_AggH-wJtnh3#h2Wx)zw`_lXUaEoq_w=L|Yz$_!qSLs;Yw1>j(_ zkib{VT@z%51jm<*n-qO3m1&YRuhdn4k_u7m%7`}CkG@KX#!1kussM_f`~1CRxB?>t z*achMU37Y9a-c6cQa<-PcUMo+DtmV(&uQ6l%{z4%X;D69(9GaNn}nx3Y19Pj*ZD#4 zmb`rzxbcuGj=!s#6f6&sNu_U+|BAB z?*?*Baki|aBfI#XQ0u)Na@{vUrBv=By)B4pf(ke*c-TtahyL_^@f3CaX_>^Y5SDRS zv%*>{l|gfl*`cyduqi$PJbws2k!|Q-xs|ujC+iCZ#%i5XtKh3;qeYRd;k-HZ=~wr% z2>7M5Xfjs$qr+@zq!hbUPC9zkEe#VBKOm(xhPt62<-v=DuYv2$d3!oHP33`MB`>3< z9>)A>k?ej|anB$2Di1Y-e;W}J+0BO!&c^KDVxy~@sxO3?C7plJP}Z6yo(RqY$3l=+@GAsu8%g}!s;!>*TKyNePokod=OXEIX$c%HVjS}_R2EwhkjhmX$*Muy~vk5CX) zu|39(H`H*wkH`z}o~9}kd+*!!z;-xXcBq1Dd%?*zxrM|rOG z+n(uRk8a(CgSiW+vu|Zjn(mcLYE+wB-lhWNF=Hl=j>Xl&w@=fg4d1~o%7K<4lEj{? 
zUP2q$#7+pwK`g{D*fIk8uYPU&_sW)ozistFN!iKrw;yhgUy?83w)HJMA4+*2L)HO1 z)6OjBk=$0^vx5P&lmNCJ$b)LzBekm&hjR|SfqfLksSh8D??HLm^arZuS+h>_4oSla zaic4UeNJ6$+1gC{0Z8LigHr;BmP4VNH1HU;&g`Z1>cQvC*hdVS8n?;SaNLpkMDoLq zadUE9@j{YVk3)g;S51xFqeNJ*lEzO^u-Hb7nxe0bkO#C_3|fe(8><*KCfxnigk^cw zUf;X%mW2Pr02*>p#wZF^_89&Q@#3Kg-IPTtNKsg*I#;XxFa}jtWQ=2guh>Z0BO1Wg zDok;!t@l}USbqY;@r274#ElMt4kP1~)^jS&p(7!tlHto9Z% z3iCQmbGff5uHiBKUD0t(O=`w7(mrdRP3NU*Ri?_O|T{;K~ zP{meV#$dZU0h6TRCQ1zvs1UQuSWDoic{KtOF_Fg=nZn|rl@NB@y1UyvHm+ zC`ayb$ep|J4G{$&G%f2Ec$|uPeg|H7!1*c!KID#x*=k4Ci!waebghUN9@6vt{7A&6UD^gN%Nvi-1E^~F}brgo;`y~M0j*mZ% z1s;wQbym+bDdb)?w3E;FTLUbdFDU&KkPeE5tOR26>K?|%7ex$j$kojdKqUBK{X#Ub zmlQ1b8=3{=_KB?x{2lTnDv*@^i|TPw;r2BF+2pu&#wV&1uFTW#WV?BW5K5!LN3m*j z@OL{Z8HDF*AQm+3Pr?uH$Ob4BQSl27Z0#4Ei?ax3=h%}8Adcth2(L3T1BfWzch*Q# zpHGF7FB5>-8Oj&*gT7o9{BaFqz`5%<_GxR_m4sdF*%R;+>$ir}OaG`+2MpG$5dDb| za+RQM5r03=Q7N!iFBYcEF%7ie)cKBR=_E>FZ&lUhUn+~vEZc#UDp31KM zk-zv27qSg-rzZ-$bKahUx&s16Vi0Z!jlScr(RBw8Q!zTQbLL(cy}EJ*rx&|9Ci>6pbrH7R+_@k8xz0LRtUTbtTlFhigad^cur)6(4rlq`0~ z8XERb-Cb+$@58RBY;ZVXYhNT5`;e38Mo7iYq&*KeKdjJP*X+R3BpvjJ`{2uvRA~sP z;)BkHzFX7H@nIN#5OaC(%!~n+9s;W*H+DgJak-zhe%U&xtS4a#0OdKMuwrXI)lQu5^EY#=Q+xlk=QviPOpRb zmDNsq_WpR*IUv1_3m>UE*m{U-N+C@UTjP~!`0mp?cGXR=dsz1&N5wTy7kzJ$DlGgOL-fX{(bk-Eser%+N)L9R;c{tct0JGXx zlikC1Id)S^3{s&_!P!}j(Sb&JUbkYy8E1K!ch%A2m->CgSJ8Xit2Y7Q{9i*+e>vzH z3GrgPLI43F{JF+r{5uETzqK0w=CJ!yUH1FtFfs&04EVFSGMcz_G~H^QNR_D2w@NO? z5e|9jXc`VRa+Y$maB8rK>|a5lBi0bI6QxVi<8fbpJVy!#ln3#+)!c`fDf^O{c675t8>IJl5G!phAVm7wy^1PQmndTOOUf(dhV@)X3 zzN1adys*4y*7SCGp?mku^m=M;jobkz+jjGCArgulhka`PT+3-0PWd){*Hl82-wDc@ zO|``%2+4_<_JV5QQ<mz_E^VXsWV^}G}*o4YiJXJHvd=Zf#QjnIT zS{EO3*NEwW-^!p(TXZpjzY4G@vsNvs>XryUD3-78l4DTewvSp4a00+GP&zVP`i-Kj zFSx~j&+$N-D#kM-d4@j8-{(3QA~a=IwEz~shQI-g?g3+R0LSD?dtML!(g{_AVg2Iu zd@k(R{&T>#laalFOptP*q8k~b;ARNvt~b8_w$u!%GPMs$Xvd z1yk=35^gUfa6a(~QaA+!Rj@G5SYfMIiQU{Op_4!{d43`Zg+aB&``tYLY`YRmoATANrE(VlYQgGK@Xsa1;g`nqozwfEzdTsl&v+IYe=K?(g z_qT941qU3I4Z`opcWed@nP@e;KqUl*q}r%t^uuGIgN{`7W2^YxdOv-~RljV~oc|cMii_dLsRA^zADKL#+Wuf8XF6S-MshpIU zAi^FQeUnyxcX1T)Xnz6>0x(_13wBF$!jzKz_=Wbsp130B-`P^Jjbf#MmBD_PO0)Np zFBf}@#p$^>GtbkyOtl1r1lXaj&y?@)TwYkj8+^wr6qtb>;BG(SqheT0gxt!3V`fTM za(A3q?pLFoT4Se9`L?F2{oOdZ;!CvdE-7WUQ1@$DlyM`%LcJN|TkeGbqJ!1+lL0?Z zV^wX#O*?}L^D*PawW1un(cyR)%-N#bH~}_yM!5?7DzoQen#(t~Mq|5Fjsx<=hcb4( zbdHj$5}tJ)N{BPAi1p_w$zN8_-X`Nh1#mxg4{g1=rhbWR^TgWkEfow6(eQVjVRUh& z55nUr!Mq#I(4_4Y5sx?~!rBh6hCB7^RL`Xb>{h8VB7%^B3GllS>El9tPYSRgNzLOs zqu&q663nFUU@_3iB(9U#qYL|iax4Xi7DcFUthzfH+& zz6*a#gAF&Su$x!1Fu-70xRO`QyDbI|DBJ&@=}F7$>hf&=X}-PP(bnVR`CQO7Uz(ju z169L2bJOsy$8iP-Hzk}wB7{_a`@u1iFezR(k}4%@f@C4AfA&*ux+vJS-hb2v9LW3f zfX!pvzx%6-w|PsRC3WzfA4KjGIjBgxa%$Z=g`GWcl}+sJYmoWJ7=DKY0%HF=d)VLSKL2JXi&E3E*_1%b z#|2t%v*B(;U}yuRr~~JZ zm1_y_Qy$h!J_F)0pEKvG8JUA*G7GZerd_+aTjYIln^f@kKsK2z48XVubEY z6f+%g5*P%uJRw(IRnzi$|3+7d3G@`Ba`QGSbU(H~Xqqb|?=s+#ImweD z@d#r9y%i!LwLPdFgQg#){;;)@+vTJk4O5#{X{CMz7P2)?Agyq^^*wNXG9tVtq_oQb zps{v2~#+cLYHJWm(nDzrD4EnOOa`|kLcQj`T)WFmsYXs3|h5h z+iw6_EzR(lXYb4DR}>XnR5G}wNYe*QhVNdb z&a)Ug?_DjXK=2~Xg(NUImzzr;Tv0P`bYxHXcI=g*I;ARCuhg|PE>aaFBasCyv(EaI za0xq0%`XarqD~L@Rc{`x!h@bSpM@p4T*T8qYRu%o_=UcRyIzp#GAl z_ZZyUJ#=YOQ#%=5wN|YrWaDs5kGm3{-h3VVYd!3{%*`{+oShQy?YCVW7+%UBw$If0 zU4KFR%BOJU?o84@%X_Z`88G zq@GOKq81O^*>_KbD0Z3I17FS+DOgT(3M@z#;teKV%nE7p@xRjC)wQO|#q_9gxcy=^ zrYJXgj!!d9TXij>apH?Hv+af(iWR^ec3;9cjf=-Le188t7n4R&R&6=OEK}gT_pwpq2szp*Su>H z0v=Cp=C)THqEX*_H)m==A!^$7t}$SBa((k{MtPMe<3m@JN1YMYjyxUXfQ-z={CNMW)5Rr~BjX4B%&^Ue<%Q+? 
z_WI@k^OcdQk3W=rt<-($-sVD|yXj!D1~aaf}CLWAVn`3}dxQKgNh3b->*u97F6k$*l~{y&6N!tH1`R7hy+N-Nb~5 zy2+fT>-c5^-B zuri#>;aa0!a;Ady`&EJ(k>`W1f%eM|0y)8k>XT5!av&9m(GhI}G0P4@I3b3rks!o| z2wk9*0lUKuQX?sayie4sOR>w-5z|KnY=(jgj#Y6X5w{Zj-rqcvv|5Sb!xf=KK%Ef~ zZdb;3^LWwqP>&N<@QEXDW1U#a__}F*^h$g9fpRbz)1N?OkDg8|m9VIefwoLDdbVgs z+3j0Qpt{tXQl5U=rjF6u6Yl0%PJVQk89@r;{lbQBk=_|S@cvwJMHM+{q!GAQ`@tgj z^r5-=`o?tfjM!o1q2Y>Qcy)kIOTm%L+G#()n!?si}6+7a!1Gj^UGZ z5Gh*Ey$^uX;1ac1-p$E*2nIc=!DKvisjhk%1hr*H?FGT42tUaNP29t{iyOJ96uBku zsOYXkZ;%9{LrEt-^;3~TfDpp`C}FS_i6Bu6wVOr?JGxHd zljaJwZ-Uy)8F5Ev@N0Fm-B;bpPH!-C=+q^cLuMD_doP`hMU?YUP=Nr^z!xYzLTJ} zZW`?L@r=f>SVOI&dlxr)WUMtn1tm8^vS5uiVw!5DJ}k|ljVU51cBt_yKGHAQsQM^AtLR~= zNt~gn$|SCZn-saW-RotBE1`KQNKXhrII=;ONYxBv;g6(tQv@G!!5{nlbq&(z{S9;b zXXG_Z|L?b4e_sOrn>y6x1?!Br@?&)}V{L71$QYPG`29uaPy7@bw70l2(hwn(OrkUr z8POPy>=xY(#16>s{S08~N((dYLiurMy`n4W{ncsU_2II&N2`p3M)~xIoa_Y(`Hh8y zZ$8o+i%J=T(wT$NanIIP1lb!B4zpuW!T(?!%HxgjpKa4B@N4+yt z<6o-kA6fp}S})vMdP=SROZmoQ71m|WX5>@zYH64aaP7nNsMVGKi>-5t&V&oPbexXa zv2EMv*tTtTY}>YN+qTUwd1Jq^r|0UQnKc)6TUY0-b*j$Z`^lgpul<+nGizOJGbO#) zx@#R?t^N-hU4uwdjCoWlU(HI@UzpO!vE*B`p|)jeSL9v1p?A^fCEcs;P&t1=`=s+# zI=gnVgkd9NRo&X!GP|X|WpBr5R%>e&-dfzUyrtze+r@)h*p!yIab?mbavD z<6yObLwDW$oSN5`$8g>LoQ|joEiJmHcwXSz@OOeomB$vJ0lzUWB~DvHX1vCv)S|{h zdc5|i6m2bXN$S${TLlMDcK>%xKD94Oo@JBm z!$=tN!U);<@r%aC<2J4t@HLy<}Xi+#??UNx0&4t^(T52;LRXeA5TK@X*$)@p^8P3C>e|-A- zR(6Ww7Z}gMZvbGt!+a|Z$KmfC5gcP2=6(Ac8h>BCxXjf*RxhD|=9AR_nXvdp`>AGm zboAL;AQ$albo}P}Ew1{U%(o7_f%eEFjL^xK~ovc5caUr2nkQ(Q@X2R~flQ!G7?=s~_3h*d6ju}Txe4K)Q7gH84H>aB`{CodZd2tQq zyq>?&$}N08r8sC%W1^BEyWiIZ;H?3tR;cV7k1WTWveRrZ(-X&qim%-4&(Ozw_T6 zY;=Y?7t+w!t5iN)qvVy-xI23sS)3(e;%T6lVdzr*_u3t*SGCjRvYgtK&G>ujt?3eQ z)kF(iO^SWl$*ZH|`7SHKdp5+J>Z}~q4dnechXy6Tqh%d{>=fB-ot7V@--zpnBX;n^ zzL%0|oPE=21L=cZTmZ7@{l3fSHo~m#Ok%wpPsGOGemWw@B3l6-DSB5OZuAk`x(lb2 zw^DS6zc9bX7F=(`M2`OnulA^_?wV0KVM_@iIIfFQw+iyRtuhJjA}sdvMQ}W}5$nDh zhc&a`$6gr`uoUBqo%EC3QyWq0=<)ZlE>r~6D-gSd+ha~a1Ihh9H!h#At}azVftUX+ zh`<9-K15f=3SeRxvDD^30Xcs(5hVY%8mwxJusZVAav;#G@l~ulDNf}yl`zn{Vyz@W zX`EDIrf@d7&Z_X$DrhYcE4)0GV(GCfoL*3mE}z0HxEw*IvexmNR)Mw3#e2Tf>8M{` znP-m+VV||4P11eJ?NM=UW`+XUg7QK?dy$&MWvlWGq)2>o_l%3%MIVP~7mg)nEsJU9?LrE;%= z=U#wD5n2mj2(cvucWyOZFlz{wb}&yYe*}PEX2AhEs{7_Hy!fHWqGnm0V6`LWGxcL| z?Tyus@58s0owYJ;j-b{99Zy7C?LBN6jt5y4?#P~%_n=z03)-H(R-inTK}GA&9q%`Ir)NOUr@z^_l>Rp7$In?h6t zSabL#c36Wk$i2G{_9o=_C(@_g7Gbh78Uw!z)(;xtaUx_8d0A5EPs*?UMRc&}H{BSo z&Z(O>-5~JV>{o6T1xVXcdA}XDerYXCZAMLCC?ZT>NPvPEC*d`v1uGI!noD70Of6_N zVx>$d!)RS^APZ?w%Wgxs1tKS4MsJ?`Xgcia^EuKQ=z@5Z+rG5fr4QdLg1<^!FCI+6 zl~U_K*dc)ffu;Wzgu|QBkZ1S~PKFUHX+d%|Tsgi^VF;v{O6Cz|Mp9srPcN$@MBt<5 z8l#Ygi`r1qq-g@ehrHmuy-G6So*}!OJ_wGKaYE<9=GecLgunxvKI&-1G#R&8878>E zi;H(FcAia4ul5{V0?w9EJ%jO$v8J~WG6^?6nG_%nfM6NE4q!xCpa`fG+$~NBF-qP55v=F#P5we&3$HYz z0;9;4XMGy9^-6%pK29ifbZzd~X5Nfsxs@?l#?U&F zV{_mL4bALbx^QY^PD{tSkoPKS;IWAFHRqXuo5m$%Y)gQu@_9?cmX@ts{d8yb7&0S& zxLefvNffkY^Ky)4NKCqV9jbYAaDQW}%^BUoWE7?%JmCx(zRjU%6;JnfsLY1{uE?!7 z(>%3iw)2mlW&H}zcq&K{35WVIZfukOqZDO`m9Y?Nn%He*fh<2%uWAG2;c-Abznhhi z0>e7u;8D>mnMbi@^+y3m89f+EJ;KkjlFv}*_9Ur_)9z!5Em9N$Fi5xbzVeU^R%2=Z zl0=T~DTYYVKsmH~rvsWU^dS9Zm(l_u357%(MuY-M6#~p8WcuLeqRN#RY-S%4+aPiFGL?BQ4i3p0HlF}kFc^{_GJs_xTYor(~eXJAB`7s z)poi;fm`d+g*kwTqBaxq9as>p6Hn-j%D|wA?+TGGtkgEyNhlj#WM;@o10hS5gogGc z>+(lFfdbBuehmJ=;$_i3PY4)|!ENxpPPjVAjCVH5>0TwdlbAaN-U3 znb=jeI!@u0MTo>%-fW_--z$3{t)y*pN^Kp6@Mx`4i~{8WQVuGu7j2R zq;EZf7I~b@Xz}=1P6=DDHPtt2}j1K38}OZUQ5lN6k9V!Q+3g0d&xb;?xH1h)N{$pW$q)EGKUo3dtC zypWhH=hcZs{yw#A$;PiNO;qnQU5Q{4TUYzQSTYk@C=)^~oR7T1W(n~a-*u*qkFd>B z9HRBGs^V2y+c){Y^_jF~($|yR`_{viP*W)ohY=){!d%rZ>>BPSb%U$*SpSWTsqAo7 
zNH5C;>?6D|y$Kht@7L#uVnX1Tp>j1cb&BaEh3Z^2r7D3D;kYGymWp)#*HLGj>9mRS zI?}}K!)+O0o~UxhTw&xIGadO~2tpP3r_Mw`=VXv0jWGb~!o;AvPw^RzG?j(1Am56d z0kZjMR4VX$$S^5~IIv3DcAx|;nL!p=5bQU6r-m^e+G87(8u6Oa=q~xV(*x6K`q>2J z*{j%yBjaK+b6MEW`^OAhU#``1X2vo5V4%Uue!6;7b)d zAhB)udtk(#czc;h?5UDay> z>==%%o3rSk-RY<{A<1J6^*=_E@;J(eyRl}JkA(bEj3$m?sM2%7mo&kCZE%RcqMKys zCYAg_(({2&#uSOA789c*$}=u5t?K5oT(JEkE50P03`a6TVJnfIAr6mn_ysXjG2pG| zz#Vu(%*OjlPwprrD+i4&Rg_b4Lb^Oqj{X^=rZ83<1IlJ}$hwiFM3dt3<`eA@Om_Z@ zZ1P++k!)a4nh;eS`p*I@WfHQnP@3*<08LnB^Qx#IBKXB`AT&y1 zludx_D1WS}w4fOMT;@wEc4{`C(>y?G?>S+zZwG>Ui7ZvlP^&%GSvh6dPugi&nDOCE zp~H$M*%3i;8o%z)tqXT2rh8GYTknRVeb+8q&Sbi-YX#w$qrmj=Ni}1c0MH`AZ6_?R zL-bY2To?xUosS4ZfmpK!<<-p5AvqK4G2*$3I^2l??bcI%u!Ze0Ybt5zH+sYJSQy;Y z2qFVzBt`1*2iN92jNLuh%L5~-&nTfA>gF}RAzLwW1N!k!T@``JiGT-C9m*c#3j6gt zXExtHoS-k1-krMj!8D4X+*L?}jh>$OdDc7>odcRD+*S;*9^3_bI=XHhT2*0U9*zz+ zQnIRFbvGnCc(RdWQmT{X{#hr|3IFXxnoe<9J=SD|plioO2Hj52J6Nw!jlF8xS}3@? zutw#Lo)k+)6_ul1Sk+%Gpm<^u@IYKigcyE=*8W?2_;nCNag>Ct*Sg@0H+8Fy5^S~T zSPY^2EKxii9o{*}1NkXpnjG~hT1&E^a#)wjPpL6!pnB#- zyAD8*wa*{Wlo`_kjI-wOjaW|?VgolT0f;LVfZz6C=36fAw*)s3eMdbP^6g%w|CYVb zn4N04gjoL6MUB|H-R!*9L|zh_=J+KXDfhv+M9Snfr5xZ`bOc@hm&1MDHXCSOHQsSE z>da)5DKp%YDP1rwEFcLj;C)du78~BAPH71aetPU%p(8x~Op~`*98y;d&3zWFXmF2o zO}lpnx2+wti4k9nsIxU7w?dKgXer|Wo7J_---Vs0yLA?XlDqv^Lr>jF9^f|rBt#9h zvACn(j?_@x@m0s1E%b0v39@X?RAk5ZgnaUSgMsw;sR!k*)5~?%+JR|sF<_?83GTj~ zWbOKp*x7q(4-5tvqa0m2$(Jy5fC>9;LZ)9*BxYHisfO54dyAK z(Bhv=(6p9QiuGL`P1`mSadk`Ks5wM>_Y>fj+x#*$9;{+zJWO zjT?PKA0BVTy+&tbI}W2n1Vf}l=wGCf7CwI1tq#>jWxG76i0MSRXi)9?<#bJ_L%xqY zCkyu*oxT-Nzhr#lQQR5=Nh9u8{+6BrcBJk1+N%3_F5M+HT~q?-kx91|*q_(ec&PdR ztJ^ZZq*!rE@lw^J+oPF%cb{n^yM!hr2iGSB)SNK|PoIrNZSMWp&0L$0NplL==ymn= z*}rnwn3QnF?8}(XWb7EC2tw}q9*y^h`2SqQ8u4BCdZAMYgb_#d{qNVZA=sI)H5a#V1UGKt7kkrz1=sV*NnX_t4(|*@n z+J|s-gMo^+57vC)&BD|#&slS9ko^5O3?K*bF|-_hYiy2ko1>JM-WBEWJ6mdW45JWP zT0YUK9XfkF#lfl%)};O3V?QIXpBjz}x@!ISm3%m!&`DBF>Bt@J<9LsXGbp$8O)tTc z899;|f}9RBVjwcd0W0rWsQJilQ3x2f1Xqeeu8Y^ccOD&GCnH`mpp+Whj3gXR_e2%G zPyemG=Zks!gF|63d?8f>mu6phrA@8WDM#{0rCJTFiGJfh26g*W^N^f)#xUlI?4f(g44|TkZmWE!Yi~~@P=kNo%2md=u~y?Dr3 zj2qH6*v&GvV^r{3^W^L5Vs!QM<^Euwpm#k5DMXmJ5!nEF_R)QHb-7Nbz06;o2M;5V zEz97nh@DD4!k$N3oz(E3pm~sQg4!d5_uQ<9NW*d-W4b^F8ak{@3%b@sN|Tw=A?*?< z=5l=V2&*cqiF0-;w%{WQ=4u;GG`3z2oBltqJ*rg<9ZAj7km0XX8+)HW1FDc0pCA8K zw9?%Z;3NyLMqzr~sOxk2#f%YaEUD6}Wk4Elu>cGknkR#Q?ju~(om7Go5yR^s?Ub~= zCK&afoWcCH4 znEe+`vjYd;qSyB@bbi5Rb$!D!I_w#R_NFR_FuJ453A8?9s|!k!7+D8f=hlVMp;T^d z>7QLUvHK)lEyX{unCX_xp-x?1G{+YFkoEHQb^S2THCw#LVdzDXI4kT?&$;j;`RQOV|#7&^e(6We)7OtIS2XL zpik?bW?%O%%X9P{9#|NBTswU=-~DXGkEtt+L}L3Y#xQ91C63z-JP0`xy6voZTRKxYzz)1%8@}UcwvahE zWfoI<7ql6}e*dNBTP|K{@8a!&2ROL68zXQ=vjN+Hu|05jr`LOJrR6=>SbxA(s<{;Z zgg%4J`=AZ7fS5#(y78d_p@|F*{V@;G#+tV;Px<%4x^V_~WgOdki+Sd|yo2_Xdyp(Q z6&&>=4L7DfdR60xI0a)`hd^p=onD5p-cO-$nWqxEQ|ms0{7ihdyGq-ZpUx{(-FB6E z&$B^|ArYJa0E@zB*~Ev<8r}E7Iwk= zhP-h4x&)I_#Yp>kpwarSax+= zPO{6e%)F;}nwsUk!<|XNabD%E-Q<^6AJRWT5_GTL9z%+v!wX+sHvZG|o$p>jdwRY{ zMK#FcETd<);>S!?iNAc<`}p;+YumfPOUg%2J5noBNr8%`%Z=6TdFk>rY-rEz%d>a( z{dpQNjzaa;%0Vz?H|D%YoI5*x&f^2cWMSKZeJ{Vr;&pYWS@8tV_|5L950vLxOTcIT zFQ))VZLChP1MSQ21=kqCafMx0LaG+Mj^l^1)8Ccr72j4hKYza5{H#VUT^S)he?Ilv zW-g#Tpz@F;dsmBETS}EOrjUer%d=Za1zD?s&3e*bM9Qc5PqF2f-I43smJqsLh+I}6 zRNMJcDxHk$1Vptx+q^=5U%3N*iZ}rX3yVR-3$a}f)Ct=U5Ab__^OygTs@`by29`4LL5d$E zrXCGGh`n9mgW=qIs-4Em258``y$nkv8_Y2Q$(s%MJ!+Z1ErM}~S6T%U{P4L7%yBoHLt1o7w zXmM7r_(9kD3(M(P zU0M?y?i|`O05YLR>;K0KzM{6SjKlB>O6kf5Wvy>)+|u-ZY$AZvJ3-?;g(7c1=$hTR zr?At>FUJ6K2O%(zz$*zK>UszGec$yqOMLLERIp7UAW)4Wk@pL5Og4b;QU{rXfwm4N zA026bumGpsna*zwAh3z*WZ#1XkeM1DfGR5@4Kw>j*lx)lkHOWxKNdz 
zAsAzK;JQ$PPjdg4t2gZ8bkMaPaaxiV5ou89Chuh-wWUDvCx2ZaKBsk-w>PS}DWSLx za21l%EZ5`4a@d+YU_(hNv;RXZFj$qOaL}FXU?1pL+r5;|JhBO*;^A(5bF>E|uvdPk zw>QUNrk}2$S%gWqsI`Lz+xf^Rs3Q3F-`^5}lOlnW0cg)zUX(5IZKa?-5>Nx%D9w7s_o@zX%@Q^BTvw{L~ZcBO}Ne88v_{fB!5N zh@yM^Les?VY~-qdiIGf#{Ug@w_Kfz^M5^C)3btgKZ449RkOX`H?r{_ zUt`dyRlybF<1)g$w(5lCFypm{OM6%@IwW#7jrfJ|wP2EsXhTASZ>{Fm=we(*wr+we z$J?Syk5p(}4IXdT+F4rY0^|d}*Fv&!Z&c z_eY%(XpbTeP8l{j^WxT)vrmt&K^Xvck#B4l`^+#=uJ~)=X5K7?)iEN4s;;vGoLa8B zwtRZkj4U5X?x3%CiW!0%|Io;N+Q|ALqNVg`+b%JvA@$S~)I90obbGjReb7wM&`6{p zLhIC;+Ta9nlk6MN{W8ox=)nnqyS1=$pEds#wMb(Y5PK%-v8Oouad9n`5JlQT7~aUy zcK4(lDyY966sqk*A9oQ8wtj_E8@qKM>OHmG5*86l@!DySPcn1jzC_DL)$utq;jq_>w2UCb4D+X z+@DngK7m#AER-7$bHYB}I0X=-Hq}mMgTMywlJ4|JOo9dhv?qZk*oOL&2^Dv&zAQ|v z%*o_sfXu=?+8ad~(#Fi06S1m>>dn^g!GNSCpbQ(Nb3|;QYxIHME6s3?c1AK91B)|! z0;y0V*u=Xbm7=Q^tVY4+3RRB(+(LA0V6l`OvH;yjy{f^u2tIIrh}Da zv>SVr^K}mk8%eW}k}Yhotvy+qyEW`0n`TsyPko3D2XXgW$L}zV5QN*2K*P_ZS9Y`9 zVxRklQ`-#}nk(#KK>+c-`fAW?mpB!S{{O zY<0lsCR_*A6rYE7em?PtS48++Bh=iWR*&XFgAyUhX4XR z)%3>|>{oqw!Jh(uqS;diL!jWAismKOl#mhR#`~c)A>1{h2jXSM@#*7&NZyw~V^bgJ z2FvI?GFqqvl^#bqeY0aw{RfT*Aa+CFAUM*RX?i@~Z-xyYE-K-rhhSqmW(km3YDSBm zQ4i|V7ne_*TyD8HHiJdAmn2F&aOhy%1f48?#UC~R^C>BTWhY_rHfID>V0tUxhzK+Q zMb-UmL^mR#pS;A(`t==7+~Ma9x7T(ND0_}Gwvs8U5lG*9BjJU&v-RhNFuPJ7K2gJ! z2V&zXu+n|W0=|f_`ub-X85X1qE3V$H!gXndjnO7n}hEFAB?40WSSjyY)=F%ca;^r+`+na0|T>4z0;}w2^+e z_JGpnY#Qw`6w1Irlgg1c?3_7Y4Np)#>l89We(6*Y{sYkxy?n6g@WoUxm4@2^&;SY@ zRIW&kV)CvH10~y7Hox2*N+WB0RhCQ_dqny-2ht>lo?eibwpii3nQLO|a9jFp_0v_B zi{kHG{K#=U(;(IRAc_^nj{t>7ipZy^tYw^h__!tTIiBeaQq~V8*?)&bA`~XqUc9lD2k{;UK~0Rdgaj7|!bHd&G{PwL%3Tf`}lQJ=7J*HMMM?;VMWD$uQ4| zV!nPGkcW{{4qaG2kz+)*q(#mTHI|Esl-AdUl%oOO!eWor@(+gnP#1@?R#Em&f?Tw& zm7uhFs%gD!pN;%*z3qn^>XEkul>nXaXYf2{zZ`K;{#~1~ZbYV~dyY~6(f^j%ZrxKB zs=iG}Tz#EH+S ztgig6!rF=spj>5pMWkk*#2QYPw5)l$ZPYBcE&xK+i^QMbTFk%?H&^5qbg@=}vhxNr zSzpNIoqGso1`zPbUWuhP7~aMvD~yp2N937)!>&lPwDQ_PU86w!!}h_>f|iL-pHqV? zApDchc;0=Wu_mf^XGoQeg1}>yRkl6i|D0VH++T>~R7B9~!8MMnK6In{Q_re)vwi-F zmH+uAxK><$Us1LuWpFA?M_yPHNGb;X3+U=GC()xQVR@;lAg-?9z- z*stG36sq7{*}GLAuQqC~y)TKw%Zbl6daJrUxNubpSgck;$bZ{}oXjoNq^}{3_-k2j7wIKhSrt7D%Ra^{4hn z_6D&ZT8}=mP96=vkvm>gWMiE9tBYA206jjmm%R-1T9EPBv>L*KHi+M5b5k}vA~+t zw@^26yNSjl2!Ktp0bHu2N-Mp~{KaEGIn;<7DhcIs`VgH@CPWACpC`r2|3h+qZ*MetC&j=!Gr{>F zSl%SXknv0JErE7TroE!Fv4(M(!5;&CzaxzF$|!49)H*ZNl`Lo(W7QK}AYEJ{JRiI8 z_Hn3Fxh36i6tA|tT8oSEO~=0S#jx-Pdw=Am1Epb|{LW}?6cibq+}do{%=!YoH0+<} z*}QE;xXKP_-8aMCylto>?XUIG{-&HRU{U`j{J%BSIW|1u7yoS63K$?DEdL)L$Nz`u z@c#&?V>M^vaixCW-r9Tf<;BaIC)p-ln+c-cj_uht5h5oc5NGNIB}qw=BxKcS+i7gt zm+xMyrc<)2&xZm+*r4MA)PY1!8$Ti7k~xNnFi7&k(x&0?Q#GLe-&?KQ)kTg!Ti4y~ z>R;KMZgLsYnlV6uXxyz3^V~&W{8k;+-=Vab(kphZGSe-ES=78$RRhUk3OYRNyy9j5 z>M7-02*;JuGj_f*lo;*2h>eW2I|LYRDdAgbjb=b&3-g_9tdYPx z3`Rtm%(cwWPP;Q53P|OhMOCcz%;}mL3f7X4_dSkAJ1&J2MFJ1)gyz{JTdw5_*F?IQ zytstX&&uW$oe(;QOW@a8pWK904ks(`Gyt8uH#c5)6O`7JNAuw6fS9V}Y{-@W{P{hn ziyvbwwgK~xJ-BvK#u%S#HjE2+`|rBt4RTPVEV{ELZ6(AW+yl6By43pBUYWk6GVjUj z#4QA@Cv3Nr+Vy7(r~Htf>oVbX%*7Q1<#cj|g1E@-5>PBw2U{|we%x|)Zk}h#Kiz;Z z=JZ*cFZ)~;PAh^w=)4YB8eGF{Q?gI~b4Hb<_>pJt*n`wRr`N-&Y18kiG}6gHjun{o zKb-6beerdj@M1EPDU4V2Tz8r8GRHF47F#9=GbOOsH&68`^=yT!b8zgY%?lbB^NedB zMt=y_q%we8$xL-ndcgZ|B&+JD27)mpPaGA0e%TLUbVM)$3a8q3>Zo+mvp5l7FN}0? 
z?zGgwOVZfoY-?_=Bc*@N|xZ4gpGep_;c2T!r7yZ(g~ zBBnfRV5;zMk==@lJQUjyt79zDRBtasFD-t=Nu> zrf6Q9DwkFNUNmYREd7nAIuWm@dhx#8hu-b0rF+Wj(bpS%?XCBiIl3D^__)`aIQohe z1UnLLyz=~3%Iy$v-Xlgz>0HHMm z!}gaQsi{LuE*m3;BPS~wLt8|hMsQJwqH1(iVXkT|rdV+aeI@zt=_2zAvsDkJmFNq; zHfzL3W6Z@(MgQ)KgWY|@dv6^joRdHg>sY0qw4`FcGSDX6fUkhnOm5Xqjun(A(Ck{2 zEpRVEHI!4(+SxdAl&y`mHoS;mZYQsf!+~tU{y5N%$r{9Naz+#~T?`|w)RGCJ1TCVk zJMp+41iJ<6(IP^fK0(brUf^@MaF-FvH@*jNC;M^1QMiS2gl#XOLFNeE3*dx?@R;P1L*9b@NLsl8m+DhVU`DhiKEY{JXr2P` zV^bL!*W36GEOpFuA)V$7!Wyv?_l@d^P+kd^G&aozQlgW1VxqGvukC$qD3J88$kR$2 zCcu4fI2h86T^k|62da2?c=)wMMnJ(+{^=<+XpVO>(0Jb69%yl%i?@jTLVkI(<`}4B z*(pVPILx@Af+x_0(0Jn!4;Xy>c{|Mgtiij8f#yH>y>nkm+!;`f+w>y?7Kb#QN-|77o%zCO@!2nntWi?QJ9-GG3$4(}D^Ww@Gs#n#@I+M7NP(#<_Jj)eZ6@^&EdF%HGs5ESGP!UT!Iq8m!Ey)PGA^Lm zP>PvFTxNEg75RL68DjKC+P)$}z=GqzW+8n>#IR@>Fw~`Pc)7=R9q5Z3ZExrZIfB>S zq?HKdn%p@l zXPI&pD!dD}Y^xPjHx_{^a(z^~Qemlv%$%%eif$x=ZU{rZHT^b0n3cul*OC;7Otn(n z%0w)EC?$76hzQK+2XIff5$oV7Q@F4+D~5&64ERCN>17%ub3nL7RBewwV!p=nQ-tw| zN_f=ujqeRHrqz&H+&#MGH#r<#LRXORQih)oIIgR>8y^2>swnaoym|6ScTv0lgI?di zKg)YAToZ3RR7vpnFzx!~WY|0!AyGpvQ(Dbl4ACf*oHwYE!wuTXZncn}929Z7 zMpR8ub8qy}nN^M#s}<=1UOY%cg|T#}ps_NL+d@enG@m>fRsD|+8^zvEM(Z2s`J>9s zrd5NyFzN4XVie#E!Xw7~PD(0**8sGVWf;nn2_WQj+?xAL5;8~A3q`_0D3>vCeulIh z?E9~>-iq`<*~Uu(kIm}IK&5*2fF?C=&qrnr(g=_!O1x8Uh*}XUh1fUXv_h^p&k@Ua z2PyD+$aJzmvG%z{`fzY*Hxl%w;;C8KicLZL=$uFMheG<>*J>XH6^=GtVxw4C#p-ir zVz;#VN4V=9SrH;fCAAZkY(hs{W^;*Ea=G8bW_luq7%vz<^=$mN2iQ0n1)tRj$ulBe z44uLgg1xiki4KpGq>;`oSEf|dwzbR`#EF+~m_VqkL^w36<)998bBCC@4o*xnu^ngI z)eONLvI@m3c|o*`DFF;={w!WWVrb0*HPY$CRUA*wzEWG17Bg)^=rpWB0M3uB7Lkks z^P9m;&0J!+hfmzly%xczw#dyXTN#VM$$vs!3Ul=3@ZjN~ERVW@o;^Keor4wUNgo#+ zd3gLyBjhhk#Zz(M*I(}W)wj2O0WuYSd0DgSl(qB+8TsIb?H|;sh85*4(Bst;mLSAF ztO$eks`cn?hapM^=IK|U1Uh;{btAS$wa^1*%Fifm=w{ZcNFK%4nOKYh6V-6R?gCX( zE2-n4i9_kVpD+y-Pm(URO=R#ToI|pK{w_wWu-q!a(i7pb(NH4}C71BYzry0inN-46 zY2}3FjRFxtA$l_PJK|0mDn!Rj1KoW2>yI)_UluR*$KNT_)1QidUVB%A|VY% zjHdq}zQPcc!~MdD4m-DG2+1+)h{c?XZZfk(4iGo0uuN5jUSg711(A=zGz*60DnI&| zAocApYQnR2xk@#E$BI^Hcw+lNdFB`ZvzxdkmE`J$^blqhGwNO2o>5uqNeiHrJ#guj z!%hoLZDz2-H!5cX0mN<6SOYBIC6yC!3X|wciS6m;8&1dfyM*x$l_tP{5*rBk>Wmge zE3VxFsiZ$xV(QR1(hL3fbC&cu5J9X$Qj--OE#{t60;#S)(6aV5`dvD+6a4-NjgH5$|uSr%9qSk(`Y!Tt|?$^UMH6}WKwLGgdI`Nu3Vwz4;K{)be#*4}m4;Xw80JopJzL;eHC!9yu*5OX7nL)lwUi%WLC zY)rgBYdtG2Q$_O61~KvD$4W+?nBsQ*c(VYeLE1p>2Y-6RIh?8VT9;T$(Yj;bxsjLW z_`$GxF10#^Z&lG(_x9WA==R|0(NvS7ORt$y6=f0L1mQP%%s_wHNNE^+e^LMGYxO23 zZs)dCp;uCCiJ1~}Rd*WwRW{A`)c%H*KGLPi7TZ%_N0FBcLd#Z~9b3AwaDe@?l{y=4 z7t!zfOOZvrfBO$Eeqv}p!%TC##v|##*HMI8UUl8;Xgk_&sEUcMkn^!c!wid6YuaFdb(W zRYOp&M#+(mzP|>b_RI-_Lf;*-0i&?stkHiaW7PNmxkM_b%;XPgHQfwmh0gsp9a|cr zYNUc^EvL-Rq7l=d-2Dwrnxi8xt)2+hg7k@bPE}~^+Db&e+S0Jb58ZGYahd;08AjDr zNDnQVhgN$3C_`!J`+M?%Z&IYtS%Jb3nOwS;*nz!1 zBEhSP+QMDeD%G6m>JzLkDyhAak>R@~g;sp;F6-YvySQ7C&u&|jTr~iUgO`#d-^AHK zk~))xvXOW6i4!dnN+*DC_@M(T2lJVcQa>vwa_O~&j$$v{=s!SBfU?TT1TY(8A=Nzk zY?l|CEkroV(Wq0C{57kFboRi|hziMxe$+s#W#)9e$*3^7jzm3}Dwzkk`JP}aMC&m> ztKwdeC6%e(#pB1cj(iVe^;IfDkIbSm4gJ3dvwOFv6_$mwgcWivlF8 z_pVC6Q)-V*skyFUfYchdrl3WUYC9fZ!OONSecy zZz3KDJT&nzMb&A9+DqS??Z(1@zuwM#mL9(>{d1e4$~c?sS7IAkCc?u8KFmDbASS;B z@V=uBkB~mL)&s2}qGpk_q5sMFLeoYm$Ff|MMX}s5(~-gt!aSq@+y!kmx=qLJGsI8C z3#Cgc-A2lRp?k9Y38I8jFMg8Sg2EPMlyJNak&G+?pIN6{8lu^0oZ}tzza_%50Pq6& zNCE_(S>78ovaa>_opR;V4-`^;5UKr%FSOm*rAgRAxx|Ut&CuB@9muIY)f&@a_+hL1 zq{>QI_p^O^TY>5*fAAIaGHivPn@DkKe)XXsWd7rXW4I|C1a8rFmWrGPij2KK=J!CA8(21)jE#c1wPh#!d#TrU>YJW}m)TEyD3rfDG zw9-4G&zv+fDg7{V%*0_4x#>7TcG=6(&2(OKoF!Rs$AEAX2$GYld(A)oOGB8-5!6as zwj}S~KI$IlSNtp@itd}5ev~QLk(qeglBt!wN`N_SFhV(5P)*=-e0sWrT|=vOm(jIA 
ziF-uzCJuw^!)6MM2K6ua0!P~H_mZI+70>WNQcbL)cP?Lluj6NiU!hliZ!^o})oQkj z1K<*~i0ZJb-j=sLclH9vdofS)O!{GSn0)K7NTBDxmpg`MhFMJ$F?yj)tblBj>J~QH z<=HT*#r|77*$OKsEX@^ranY0aYK9KQpx`jIbBo%(Z(IzUPMWIju*2DhKgJSi-12aX z*N8Q!kQ|Wa6iE6Xw@MHpoyk(0ms{F3Y#mv9-;9m_u{;yL_l}%{BV!}qwTn#3%cg_6 z@bwwp@WCs7ksEIj9}%o43*CNeC(j+hKDnDoAI>MXYh&fUNPu1O1O(!`4G#CgOFS*$ zQu}7ID!n`&XKoq=|edt$sm%#sWU@R-)himrcc@b^=nkd#+mGzQSH$@M+i-2 z2}H@3*7NLd>@~RfXqr1$ac3QN3N(na=u4oa90>~(QW24509KAN{QqL>oMLPX+bvzS z%e!pbwr$(CZQHhO+uCK@wr#8Hbmuhw^u~#0?TCw@NFEZ9E$jv62rCMIM9P|Fh`U}Iqj}$M1vi z^+wr5rR}RYFhT+dh!J|Pbw`KuJKNfD0DuzDTg>96=FF}sZL8XskdfMcdmMY_GHxwB zs42XPEj^{UpJb0cE3%dur0_F^b~XI=^)fl&Q({?WY61rUc(=3=Omv1$(+g<%>QIf; z3jbKp5QIO10QfPF5G3yt(NLL6Q^~a<=|{H4J~F>2FSK>HphOkt@moJUfq9~dj@Tg$ zW8Zlu8sbLO5BQ+#iC}^L?Vf22Fgy$P8RVM#^%R^ncxV+lwVT*q1Cl~RG3Go3xx;5S zj!xtvL~_NbD{()tfjcI;;oOJO3g(LFj3d-BPEp=Bg-w-vM>0e?qL}J`E9EzWNXmQJ z!i!8s_HiVeIAsG!Z2*E@2Vaq_G=A$HMt2PGpw-aRP_qaapk2k921fp4y$bXf?M4tp zi;+&Qrl`_!b>hZ&>`(>Oh#lq<`I72#-5S$m$9#xnIxw$gdi>r#D*A9)#hvQvdjhAY zDINpEYm9r|FJWzDyUn|Gl;%y-nW-7^zNzrs?zuO?9sXk}_kQ|Un@JG%>IN`!u8dab zN}ToT1{!QoFt71D-+2SK%NRR0vwj^*3EI;h|{d@b=gW<7i4j#BM zoIV}mn*IAy_nCA&7!jGAj_wL(MAh5Y&L2n-o+J=5XvT(yhs;g~SeCeHC;M^R{cCQc zPpmEgIs3h`d?pyX1ny5jZzsd#v^fZAdv(|=$ zxE|An>U+ens({C0C6FlPD4xvbb)ito-Os4&9B9+(6~AWstmwLf6~8SGI(*$|04J+J z4*-sOKGQH~i$J)GWDt))wOVU918?I1nHac>Nl^S{Ovpuns^xfS!KM`ApRxL0&KCVY zGxNKLLc7`PbpEDib`BJ`I+-a}U$mMEmhO+7D+=W0Nbj4kI|?UOl9{Z|#xgPI9bs?I zi%=$s*B^?+e1cpp8l?sf1vt9l=}!li^wmB>?J|@mrp(qv%i-m*^i%fG&*O_@X}3+F zALEOX{nJvU;;WbolEjy7#9Kr`+dGh}0Yv0RT((06gPNw2BPSQkN>x`4ZZ4ijhKqBS0yM5wEsC^F&%hvg+2ha%q_l{qluA3is(#j3 zex>k7>)MP8AtK5j7rrB-3Sb@6XeSWk{wC2RE~w(#PoR7_FEqrk*Hh_3t4(noye~4} zQ3%kEJA@7X-8HEbJeD~y6r9GT5gU{miZ2r9z@BG-(wNpV^zwAlHK-#q%oVcPQWcS1 z%iyJxU1EA5rG;WBJ%r;kO{(wGZWaR@Up8n&j-ltL`paR0-_%38Su%Dpomt%1g!6kiI4fEu`S-iTZR$#$d#x}jQ78u1lwMas8(Q-Bk$VrDX%RklHQ%=W|IO{M3 zU4nW2k=TktqfzA%w34gSG%&yVQyMwl^^UQq^;Igb1{p|?5fh(^`&hs!D10F)|E$!C zJb_-Q>jN9saD>Q$VkX=#`MB+?^j$yw9*4a%00ZhEDJpY=-`X6U60aCSWh@<)E|=F^ zC+P<_sL8nGr=)FItaAsnlgXbBHVgC-xAEGJ?mWh%^N;cYKPA&yOg*6o{8hnD=XW0c zx8q#IM^5cDOIL%rQOxJFlIQC_4OxMW{9r*~f`R^_IEfHFyw9$%I!To~fajk_Gcm;B z)sl#vmfF_A)iK0$W9FR|_=}tvwCLmxz0;(2b$VBfR@dpRtL~4|o~>=>o9i3Qxc*&V z?U-X5lI6xoX|kvE1v`*b6A(5kz!XesjY5n_w`&l5l16v`XLo%2Bp?Uv-Q%*NBK2cy z{|CC%OtRs04!uB__crO{EKz{$;^6&vG|;ySX%zTUHL>1S%UL?5g zwCKTnG7>w&=*ww>PREeNBZMA5FtNu`$X9mLqEWgdvth_0Vs_e6rr^57ICFxrltkn4 z$iWmWr9EPn2`ZTR48~7|rJF3 z-GDu=9K9+wXf?FSC)K)V^XK6qMz12`3D%WOvuIY$vkmq z^SW$P8s~m!kGCVW$;?%scugMo{NQhHH1{5uhS;DoF*=jHu=rpf)>F6XWL4F6<~rVs z-GT%Yu={;7(hd<;q6z*9jMH|8Tf|@vfJ_N}Ei?b{u9Dc2RhMXkJJ^ddZzCNKY~v;a zWQ`fab9b?*q_gq705F{~dQoTrqn7?wc8Rs3dR{cWoo%9a#eD`)e3SaMd7rwyRN*aFq*G4J+# zd{rWUrPa1yr=A0=)I5ia<4lQwOg(ko+UK%RED1lreN-I!_G3iiaYl0D-^oKSi*HOA zk|7x;Q;(3x7^e<{)yMghMGY~y{Tp2$Jw=|GA*r(6oBvj@S2YB^x7-lkY^9yZMU*81 zV_WFdm4#`?$DhcGu>-m>CNcc=q=l5pHa33Wjq{pfZ zwP0fZv7VYIc61@4?WIczm@Zj%K8W!gn2~?j*6Z4j%@!F%X>Y&PbN}o_Nn+VhQB6wL zkt+jta#>7t08BR$krrNvau>!jNVLvEr4&Q!S|yK-*BUG)T6MOl3hC!_W-xHS3$DtX zj~$*1eEBwdpSnv2$%qae*?xOk36xjAx_$NMUk@&Bls2noRg9E~SSQ#>=qf2RAeheJ z0F`yFdr7T$n95sywg5h#6)u9a#;ywrH@zgUGqqq^IT6;%u{aoOZpS|;HCUlHt4Or7 zXQ_^J6)T0YWU9vgy+-4%))?62vJkfy2`=(E3b`*E1gpauR+Z_{MYV>^brcC3;$jp6 zSOjCbN&+ZQ7F{OQ))GFU7fJ99ULEHGw7B&e(zo-`y{6y(35u$+i8ep+*6Ct*x(n~Y zi^(w`lacK(h2M5u=3)?TUmJYl`ei zn3A7-eNJQ3LjO4Tb(2nR7EXIJAsd?C7VE)y#r`b|C5j&Drem`I;F^VrZF}dW+CzTz zqYdi5ea5Z!dK)5!cb8-6PDE`IPydei3+^qe-%sqG0if~Z^I-DKkJ#-x{D784TKoud zr2|ujMl)LnXqmtel}AOBO$zS}%gDZ7Rx!rg7zgIZqWx!*aGR8H&j4O~`ceYNH$wz& zd^K}bMWWVxGi0@tcFpvF8nfO(;AU=lo02oqu#Xb3 
znV$S~-!p`!cz5&`i-L*;4;T(0%ghpdq6kOtWOwiPKLJFQ@muUFztciF##f(W3>cH` z1oz5m`iRc3kJb(Ck?RIoZ<|(~_KF@}<7I}W4Jj3MN8h@sRSHz>oE-EopnD09T3K8sQ!nl1irw?h=Q8(t)v3@EVY+zVqIZ{qqYzYgf2=JS z{j_P%n6B?J{!!ZEkE@~*h~>pnIcMBMhk#cOV;pyD!=wf(f5XS9(Pcymj@^GmZsMDd zs{-jbgimR?4-xsUF*Y1un(c%Y!0oItup90;Z&;{$HWFOKg?K@9)PHHj-}?{Nu~&3Y znt)z=_``EJk|KT-lfd&Ylv0`gUcao$)iuBihwvR{fHn*>VD#G(2x(On_Tj@*9@rl+ z_zRmg#xWp3^>CYkzcid|(GGn008L!wx-Tb=2rjHk#gPG zZm^~7KtB5JmZ2$iI6Fzdc(9?Bf^>*;q;{^!%(~dX(n0yBm}P3J&xbi6*v9>QZF>?C zD*MiZ!o|IxKkC>RD_laW3On zptVB!CRJqD=bo#m5Yw7mo*?jadCgI8+_lhDpxll;0rJw5(CA%0tHN$n0A7~jN`;muxYH--lY{T~I8SMP!{L#hb)mh*>fNPynr8p>72Uk<= z0%3Rp>zwx)aSkEBi=a%`RyxwK=})Ca`>d`6jmZGs3Dq*L-V7J3A9~sI&^5tj_t9)% zeD-cbl(>WTf;OpwYb_9?8lR}-{r()7K#kuP`fZ^j2xBeyiXZN0q6SpY5D}Z4Ky6nW z7#60yGcRK$4O&cVu9!U0Gcpfo@S2chx2N%yL>n*g@sC+IgI#0Fg;I!M`ha> z{e>vH=xqWeEw^v8(V15DCcG7&%c#ph|GZCy?)YC1N-Rp%s6+hKql;uvb#2%R@nA71 zkn5LPhhF?=hOFlSuC!eh$gW9cO2rp&_D~p-SiOpmgh#=L?YM?UB9v_f9x|pdfRS-m zYYj8aYPu48SH%YXCz>X*s={SfE{72`*sFP1F%X6FtkO$%x> zo)gX=bvRtzksYA>{;SRr5r2c>Wvrf9!q5z`r~QV%8&I!W>jgJW3vAYqzXGp8xZ=>r z!dhltVX~mqNaK8$n2^yn1KqG}LXZaHONN+m5m^?wbymp2r{mz(^dRM>q%e$bQ+KKq8aq2!mSM>V*!IZqtPTO> zGj2?;P5o#Ql*(u?NfbXDO#hqh0rs1Cx2o(dkv5Vs+EA}Ar{A@$k^Yvf{S@GEQai?x^E;(v`D_3d|k z%mWfTSUCT{T25n^yUFR%JBNFI5uo0tGE_z_>oesC-g%Vpq%wfM-f)C&IqktB%?|~n z6s%B&EQ^x$$cS9#O3g|3AQHZWy{s@7S~1o8cI zm}bl?YEccFVJ-Zej6EARH`xPOQZ_4S_JPMJDh{d<@RlLD@6a~w74c9~vET)pk>tN1 zuiqNXUKY%_&M)c=Vs?z`ZcBx)LJVDCvB|v55!C_?o?Y%(epj~k-Q3iKpy6J;7cgQECPh4&4e36GO^6>?FN3IH=Q*(yEY2VAreT&6HO6^}&g*E|!8i!n9Ru6oX z7`|F$-8`V%C*mk`*&M0>OYkS${O6{39Cpok9=P#3!XcT?V8@&@CgO7U49FiXK4@9=+m|At?gYwz@&-bZ6jvI0k?^?ELUJ$V^F zD4ty--*sP&xjn*D!|{YVM(BItX3^q|fr9(@g4QpEweQr-6j0Oc)4K$*KkA&>NpTF? zyHC;#WeA>678%?a$eu=V;3{H%r30X~7?m|`7^Z-E)nNK_C*W_dh`8&F)ZAl<`a(eM z6Td0|g%MHqWm-j^oT2UA?3ovAnF}E5ZzwAxCyuQD9T)dgY>S%{Sf+-OM5C?joNr*d zC*y;@VLC@duc?)JpRg?yT9-Ur7++LR3h4(`v_*=+b;b^OSc9Ni=1%j?MnAxN9yR1k0QWtcs(X#$8VU@4vhOm z#aJy}vf!^%Tz(zZ!%u;Pzy~F)zc89sm}8_<4@uElHtOt-mpq-taqv=F|8ixlQg}S? 
zONhQGjkVNVi_2vcTWilcI7)2`{Zw*1<%u%r<7@$Y_aL7+iQ%tJG-$FGAyP-;qj4nO zjECD5 z4>83SzS!~5$UW%9QgYJyIF7XD-O`{2#)|5eyR4ht$ASr;^I%s| zm!H1xWc-zRTNo`Bn4166C5}_+p6?@^hB=#o9HbM-@+Vee86MEXV`fotV zz2VoHpxfn6@(|$GMo1GQ=2H%M5b}8fxojJzba zK5E9?Abk10%Da$7*NCD}M&O~w%~Pt9s-DWh)P4Blww4XP@Jz>(EGhlv&u}d7%;u^* zONp$Jz0Cawlj|19o4G~qG@w;U2M~0DDb>5w7C({^E*M8_2^1(9y}eArd_#~*sBdx_ z>+=yVzmSYX)eft5aHEUJcLOFT0juG*Ve#F~+`oiy@9g@GX&m#!G@arg_^qtGXimiv zWej1ymw#3RnjNC-D`KhKSEOl%xgC}Q&{h^ue6CUTSJ*OHz)?3k6^EPpEB~hJW$e(Z zZDlC6!=r*NQ9TOW^ZMIx`#+|OJty3f^eD&jML9e2XlZVVF^WxXl31=XEp`G51(upy z;guSQ;KF9C8(;NAx7XvTl}MRPXt1o#HN#qv#c$P{u+1np1a|jdghH`(CZZF3k;Pwa z8J}z6Te3rO4ngzgI`<@!?@%^~-=(ov{q|eUPN|)h;$o8nMUiVdSYvCS(GI)wC;I-Y#98;@(reGAocV|+PSKhjS?@XY=9Gs zkqadp0fewkS`@5ljUjCn2-&gwYJ2*&suunLC>$cdsn{B*%QnR?%InGB)- ziuPB%sp6Wm$wM04G68_nz#sh%qYFV35N&w`%Se}5+zqD(FuW&xr=9-6JNDl$*e!e+ zy5-qiU_mMiSsnu#BcOb1nSQwrdIu@TvD9ee)5YA`F6FN}w(M^nCIP&H6B=<;XTry> zYOF8^hCV0bpE6ejeR9;EWJ`c@If8ZeO?GTvn|iFp%9+COh4Y#I6#9Efv}SxQEOd=K z8$Gs8b47U4+oN2z;`3 z>F~XesMCidVso|o#R1O)y)3!2i?>HC=PQGF=-y`IA=a6LYbjC;lhj_1a4Q3Xh%1c_ zPCp1WU3u#u&c0@;6bgxAED25o#|c!;uqSUJoWEm`<(HKXZyOr{v#5P38#5QiTXD|* zfq+o&o?xlz;p>Ib8ZI7AmxqIMh@YU zDCAG=f?>y7_0aswTxjrUBhIH)CQO!YvA5CE%aW-o7~Bzi*l%{Y+_KP~$58##3r;+_ z(T*A^xbK}Vkw11_uwFPqlc;$d!6f?nZ>h78;39a( z{hBU?Bs+(%Ydvjz*fKk_Jv~P)4;)^1h(J7B!?FxK@-rm~R9ulu@Vk%ImQSekrfXfh zrVuF++mY!RpBcP1u`b+e0Cd}+@3R1R9#h#YjIhP5S2db*BM}>ciqI`jAO_jEFmDdl zMR88wZxfb=t2wQ7g8V}LFrP{Zh_4_Bl1)|TA6W(63ro?QXU)Wwo)zT{-m27_C#(0@ zVI_dzK9xw&o7fITOlv0Eb18(Kp8P_v*A8(R71nF7Ng>7TypisdS^&%h|zj|7hb z>|#fhNj+knP7SS$+tOUHM>{8NtwNG_SozyMP_}=Fw!5BDThfg`?|9z~ZoYJFD>($+ zvfkK-+If!8Yt!{aBb5hU^lU~#G<(swrcc5$OQX?L{y%{qV4oLwh%aW4 zFO8XXM3SN7 zLZBq~KDHM2Uyo3rODsT8G^D;q+0&!XGVM#~^wv)Kh3%EWOaadjq^68##x^w5Yt-Kl z%fYxiDs>o_oi{WXO*_YcovLnri=JeDCx?7<#}4V0bjg00`tz@=)YaeNY)v%2l8qpdQ0M8Bwim2L}@h zkr9j!5BmqI3`q2VOa9@y8sf$2?Bw*~3;^D#ccX#RT z>BM(;9w5oBY*r2riV}_rSA5UNflZEd0RI=|`(Fq7W1R7j3k3j>@=FW@ba&xql_lRj1So~a$AXzlsU5i`H&H8nxB<-fLtf(C?Qo(B$p zT2QVyVdZhOnTFPv?h_1=l5Baz; zL^%Y&mwmd4{-TSmmqPwsYu@_~F6>9+?nDu&3U@p=G``|0mq4Oj9`Hz@J;_qcurWXV zdGT%-*@JL9+B-uuefG%_9wq^j6+wT4{ty3D7$Kwz{(Yp-x(|7ZMXgv$m0f*NIJWr8 z5Fq&ID%{|HL+mVD&S+431}^XOrrf@YgS=#R-E596bfPP2T*=mK#_93}y28mD0eB{J zX9b)Ofed*&-mrC1>|!?Oq#Ym#CMTFw6mm^NkGvNs6JV0Gxl6T4gW!=8%X()MpjvRQ zz+^RaH;|S-3jeMAs&NZB8Wizw(~(sHIRj@mls$9c;+UxvHN^F#{O7g`ZP1!Nl~oEm z%5iKNu@3|Il8)X(Q6fRUd zW76;v2x}>{H9{doCn1+Hv4_X_<%N;qoZ-y#iOq!u#C`)LN#9_)*UW#CKB6LW&wu+- zgb!~7cCZDLhL5J(pE2-;6cj2rgT-$u)G&}ux+`D)3E}Wm)zv&j92|@75U%b_w6`QE zgQ)X)PDuE~C+F-+RT_XyRG2uU{z**cKShZzRU z&~tpXu)(xbIT;?hNM%&U7SId@Tf;+0Z)|U8uhI${ma(BVRdZFtFp^35OX`*uypL?I z$274&xF$#fjf6k3;zkib$8VeR46W?>RKQe_N0@_9z9Wun)x2DUT)aiY7T~`(O0Jep z3ly(%OE)P*Qcd@uHhQ#wMEAEv!JZwP=n~ox-Lc;7;|h3JF4ojk)^zSG5|XIu3`orH%CDfU4@qvsu7fYN!LZwLuT%pq_>qD3IZQHm#YwXtJb_{#M4!@{U1 z_82*lExtSA7^c{Co>^i~^bRk713j}3&1E*aJ1TZT+2H=ZPn1LreuBC)9Y#jwpJPlG zlj98LIZ?cgU!NcX?c zL`>IN9QzsL||T(9n4NO`!q45C}3TfT(Fiia^(ZAS(h>#VQ2!yw*!NB!H6I} zBv^NN&T6Y?z+7TMjE&;Y5e}u-o#!tzv3i=zeIuKO;u(_)AwInuvz8XTx5E_Rwf|Y13_O zKs3ytK$rfUHI{E#x%0MT=sA)JCF8z13g3(%!*3z>L}sru{;vz|HfGh#*LWYv=5gZO ze-V%i<}rx&y7}=WS#gXB!f3gng6DxQiS7g>86pPswUwlB!z*fNsEW0 zeT200m)RhC_LOUiWV3`-vp#Zw2$f*n=OL#p$zT1rPAi z;F&V`(xMN}OK?eP!1nQ!C_E$aWQ6Vp%9rqp{1WFM%_aS4008}Hd|2+_r~OFFXbEH> z_rgJ`+)_o8@Bl;F<2!mFV(Aicn%Bxou@ppdath%vd$=)T3V+Zc<3s?#1k=_=S%LW1XIiZ!ZW5@hOh#`IZZMh0NfAuo1{#&zbp2JkzCu&`*QS7aVMhTRC(L zley7yK~zj5Wgj8@m|8+=bw`wN0F4r^O<{77vo46iBstKz7JYa?Jk+tAQ zBUE!C^#wY@hMLhz!BB?s1qEMu%_1FXTGaimfBq>&pW%3jl{*CE%gKM?78GuV`UY`; z&o*+O_J6B&xOr`ZMONq;5?!+k(#~9pheJB)-$U?-%UT 
zUrYWu78~0Y#r+$wR-D4Y9bho5F8r)a)FU(}-JAHC#K}5`UsTyMD`@2nA+RPvAWx(#rcp5+R4~qm_i{Lj&gL+CKu0OfuUqxSDkMn`7 zdB5ITnLj%SX^!g@x8;Amv_W`|vxjubRdH0)0R44yvZf*?1A1#KYX6u}puGa{vHAz< zDH{@xI^rp&AGWgBLUq08JU5R!AhW5cF@ue46Z9MAKvIVNfoTEJTSS&SBlJ$?(S^kN z$m=qk9HylpSs2(lIesTNnp-YR9!=f4^30yof7j{f7Obt~aUS1<9Jw98jG(D2Dk{dT&qahgCtH6?lOy zB>j!dv1BMhE>8-iXt8>$^L`p7ouczJS}kJz6gD<^Nxma(VwRmSrZP$EP{ zb4DA2cL6pavfA)$`7YjVv$%V!HMb&`T_bO}Gmp8LhlDr1O?w7?z?L0Y^OUMlFWlR6YJ0qwtPf zv4E?$wxihii0wU-)>L&xDPEn};MYnI*ZGMvoi`gQF0$9skrGd0<@_a@Rc>7kJL1ec z5j!L4*0L@%KjJyXCX*)SXg3>};Bu;uM@vvVkYoilCWP`_}h$^OaBtNZ0Nvn|BG;@r?E?PpTXUU>dEQi^( z4`hHub&GNf`t=`n?W>5iQy&=sVE?z2lJ@^+*ZzZr{|~RTq@iwyDe5r{1sWKLCng_C zt;_i@x>@)k&M3XbSP_qGPBMTA@Xbsz5<3rbceWR5lMxP9aLwpST0y)?GjmG4QfX9) zLfL0jQRf+7)_uybLZQ68tLT2P)hO}F<7kTGXU9P=jXdJbNSVc z1k|?`mDlu#0X6c=((3evh|9LukQY97$Sg{t8trZyA%0Mkq`_@OT&oK?0nRp=XoypI z{x(JP&Lt#De3f-uVM9G3*=f8NP9~x85!gLUCO-5`{2rWJ7^k>&%q3kY~Yam z9o#*GM;sgk+V7v#(^8+$Dp1bh;sQb>I^InYl#%$R_468g1$HfH&zB z@ljkXw?J0#9m*SJ{=^JtzINdR-`hVY&?(~|_!(4ouv5&mphu35-y?xH#JuFAu^8$p zW!ygr-M~i%@2^mVHxBPC9`W3gnRzbnoSZ`1d36q1+F91?nD`S>AI+{zd|D{4b_Z4B zlm3i;?O-X~i*_egVVxpj65AOjqYkNBpMp4bkLam6WR?}V)67RqImQ_W==D%Bcw)|Hh9q2J~CCF%d2Jas4SC3p;FHMxZj*W}RZ#xeR}= z<~8LUgRg^_JDp%H{wh(7+_%QcSxC5#DRX*eryTLobQf{HGpuY-)QL}2dN?EY2UMo%2X{aoeT~t$;W}_Hk9S^&&5Rs$R(O?uqFX)$8vw0p8 znL-cS?^_I*)mK1V5GS^5U-RK>QK4_#F6;KsAYvNqRbf%N&)1TjmKg6Z&IUDlq;8$Vsy!-F(p#ff2x*VzMHV=fU{xu z9I9gptWJ0(v2kYKJeO!&x^<{fIjEx|an)Sxi4PiRmmb$Zq$G#*_G@}t;Wm?P}9h$4MZTW~97OYv+ z=AVo|)iA_)b&H8SDP^DjlcXL9(Z7VR;cd!Yw(9ebkkvDVBa<$(Wq!*7#-;^i!n5g0 z)5)rnqedfVcl28O+XJcxX68k*g}HroTC76Xjv@bpvq_On+#%)wV&u$A5sv8GQQJ({ zC5e@sbquC0J>uy%K>+;PtGTTt656fxd)0{|VI+`HO8?3*FjB0)d8WdFs#(TYcjYO?AP-ciNCy&C@S zLa?iIKmoRtVBtFZav5O|`<3rhm(o@(4aTb)ne3V@W5PWVEX8kW3sRGiy{CC1o6Y3isLvg`ZJd9t^vZ8B8D}c0IkR`z>@{m)m zP2Nc=kb#x`6?4++kVJHR>0~hby}CiIrds4sM}F!Bd&5GWs{1ub?4Z7fa_H$7O#ku#fO)ZfO9wjpO{>!d`-W?tUn@!PIVj0X-O0 z)M%AaR#YS+i~o)xN$NSUF7FB>NDlSCd*ZAksV;E>u#VFbg6oVNLf`cTt`LxnO6!2f z&xcMh@@RpwK>kg+z^kpwq|SW1A1iK7U{N#+ zpQjhmiyj84p3EOO9?H;#>}80(Z}L<7k!S8Cy*Fop0|?LdU8F%*o>#o|#1+^g+{fo9 z8RZRy7zh=QySqGFfhqzUZnze>iTPXnel6SOd(e(~vy=7xesQxiy3^*q5Zyli$bF{8 zb3MM>IC5n(C2gQou!6qYP-!+Xd_i53sqEWpm=IM`g%gC#U)ckz6`bWEp+Kb5|Ga`% z-Ub24>{_)wZ@m$ipJ@kGI-~`%U%<{wSu@J;U7s|E~U-wI>j ze88ybuOwpz$+%DXeYp_QICWDx#V#s$Ns<0LoXAMel69tDc-Pt+5V&wuO$y-SSjgPq zn0N=sLBvv<1te99xVEwAi3!ieIQF@T-eScP|DDqa+dzW8Ld(UtNvLf>Qdt|YqAMcy zK?=w>n@!*#ZyugfcFMqxfO z(AomhnP?5vRO27QaZnMXHFBhOa&rW!Q*?GhYprs8~q!aP2zd z;tNF5Tk%18#5Sc<_OB`EU)8DGI2YwkQc>C4)6er@2ch(EOH|mX0_WBAq(X}9qY!gh zvp+OQ(nyfpgB5=3F-He;iH!OK-jkx$`Jsxja**o=6D>^H3OxcqQ&(yP|1gQcfcCG( z%4V0d@fU2M{szc_@cMgek@Xp6uw-P&NA~jPrJDqjnEqJOKV<;t zX@EQOPi#59*W4jLi77q0gUG2lD8so$GII454pS#VNV`pD<-xMiet+*dKjuBDm`aj{ zyR7Il<=Pc%tFAjD8-a@VG3|CcStVTd;$VXfKNvCg_4zvX9y7~l98qWQ^zM+Q|%O%mC{7tcb7v_BOmk#oM}fa{Dj6v&if z0;6=}CXvwzNfIi<9en4?t|u<=m(!L@=nWh@=qs5;u8W{B;Sp>XvE}t7fvnJ1DElj{ zLZtqw@amOc^iKQ~X?31dDuneobmQq~Fih8W<4>0pt$zt2@ke5#0nrtSbgxVrWN8#z z-Afi{I7m0SzcmrGFE#H(ZI#6y#p^{(_9!OcsJVF6 zb^K4{kS%>3EA_G+Px97lO7p>Z%3Jy14nI{?XZ!s7v4(T85_!rvW%A2rkU6`aMS{St zrRQ~c^uCC(ak&~bRm!p?W$=(PtG15Th8nesYRcwia`MyE{xHR&YJ?3&*Hdxw__zl| zcQ;EdfsD;x2s6fQBO8=h=?F?s5dTVe9VJ3c zU!8tQN;Bqi3O|Y?N%a)p^4{MFIZ)EQF|l*nhTG(y5&*XqCc0fe0DAyWAp6~BwMRX1 zp`dw%%VYB!EZ_rvj32&Z(|n==!3qG5133Kg9NQQD3<)^+e$>6CpGH$=C!5GP>4#Lh za{!80quE}3#M4F$1Mtunf;su8>bLt&$5hY?3QSAKCCUri+9WPWqx4oZS2*<%PLa9Q z(s2PJ=s-BG7OS7vb7c5L2vRXDBYrrN{U4#Na{)Q!!S@NBFIxx`utfdaNK^!u)*~e_ ziiJI_0X?_8KqDy4(5r1eyj_0>l5)R-K0jW_s*@hVEW95d!?2SySXe~IUoa{(d52d7hO0cH}xQQ;7p42ESW-V(-+we{l2x&~hE=rdVYmrm#Ch1&G7~p%+cHF07 
zfmXs#jvelYs|3q+qU*-UDc1P1(J8P16BSs}@(R!i1)d9ANr_d3-(=dooY>dDNE6m0PQEg)2=weXDHrA?v!WkEBz87YkVC7_{w zRxji_ULqLlW$=mY7F^`@R+w7P{ibMt_6I>5`NXl{bc_jcFbU zOg#G|?TC+XZ2cVfeZ;qoUn_%Uz7%9>{MxkN9PIz->@2{d+`2tZcS&~$QqoFycXv0^ z-6h>2-3`*EFqD*}bfa_$5(cqRz1A}M8i9&u z&0e*ib!BDeG+|g5%$u2M42M>gt~8%Z)T%g}J3sfhl+&G*{#~WYD}8+5hE)vWGU49Y zJr$V7Bk8WLpnkF_$godPGTn>rsA2i|9wBheWJaC&1gO=Jp5`|eL1G*Ruco>_3w|@v z^u#lEA?Z&3OlI6cqqmT%zS9J&0EJhi=ezG+uSVP2GO^nXX!dFx?2K*qhfXq-%qk#s zZK^&*x_eK;jEBdGJYUf_HJhR`c}{V0E*WrU1!Fhk_T`G;5lvwrOTTR5hnUCf%*gh^ z$Fkf^cWIbuFAJzD-RYe=G1ms$8YIElKuH+Ald zmHIbO_+gMUxK4Udn*WCQ7|5QD=|{jQ-UX8CnI^8RhH1S14845X1zH^}=sH#w{s zZ)14u<}mljOlpb0ynA_iq8;m_c7GrlvdB@l9y)7KC)5XFBSD3}GcvpG+luBhpgqQRkN($_vg8Oq5x+v*qHxgWR@kAW>MoVFPu6^?aR zzh#JTu6{yRdQ=92$TB068@Ll87(JK>^>qpOqy~iuXLZMv>mbd~Y{tbHmtcyIFBA*K z9xp8azS1dR9U?9S9?i)2?YQSl*p<(dwMPD~C4RS(KJi@vj*GIM5#8O7xx1LJ=5Ey+ zj3bapa;Q%#Y$up8FcNb`tIQuaVnJu$;b{h;ZalC7&$^a{G8D`3KBY$?_O zn=ap_Hl{UnQ;K?qm3NQ6*FH-9uJWXFi&6DL*sC7%2m=^XWIjVK@eRQ1voX@I&btQv z1rXj?JbIGZC%@9|j{xtK+Vcq8%*50Q>p-_eN6WCiLxbKYa}U12`^rx^`15FArDuBD9z@dbm%%;l0ReqFRfy-!6Z|DG0s3 zEEzfa38rD&n_c>+nz*ZvvC=hw7{*R+q1b6sGHO*dhNmt>M4E%?<1^N}hp(QuPT@ha zw;8wt@^T)1!ZvF^BmfF>>`;Pa(+6J4Ke`)!d@JAEb|f0j$mH6WliiH%oRVqYxFiOh zzaF7#&54F&K+GO_Vb~k|*!mG#CCiFP7P2s&R%BOIeB(@|U9`LI0UJ{14L>4!#nH>B zDv9UAQIw3ILv{px8_6!Prf&TA6wUBeZ!y2V3i1yJf1~(^`zlHg32Lew@5t!fW+WyE zs`hW?(QAVQQ=CA8$=TOf<|4Nq<|-gg(9B@MOM1navKzTEahY@H-Sye>JVlFAHfOX0 z$|_kt=k2s(=_ezKIVTbt-@Mr09HpCSwV*sr>l=C|n3jhmp(fC2$V=c(8%<*2&*5gg zcQQjY!Y9D>eTGuCbUfyc^OsB5{h5?DE6gfu@YWR@4+I1__}tF`cE1kS{BRF@@FDH8 zuC5F5fLm_v@V5gK$RKj5N7zr$-6&J3$)i#kh+d0N!_ZBq(&9>kPj{Tm8Tq)gb{&ko zd4UU~fjq#P5g`{PZCmP`lmVe&aIyC8;t2@3y!co^y?p4iv_tujsM!Lpi-O1a!GVca zy=f_Z_uR>tf=8afvLDkb_cX%5z%bIaUGC)|zq5|=tvv>ryI>xr?695SU8@=JMu3<@lYHXNBD$ zw&tnktrxv!L^^@+Bb2EBB7BJw;~e3y5LY|(VwWa=)nz7V?pb#0w!kK6wu3ftj*f5Hf)QqX2y9h_AA(5v~w*w7Y$myAFs&7CZw?8b~3s6mbt&FnAgfc z$HMuoE?>cUqr8V5E-mcwrF>aSi7g$gwsMp7bNA>~{t}q2RPbrb_u+|TWm#0zx(kfv zS+*7YZptmK-dV2krwF0)t5d6*D>QYkgtHErh0E|mjK`1&+xm7{4D_6X927_mOj|m5 zWXqQ}LyZLbk+HE^VxMLrrgEZ@b;YgE8qky5=mE{*03)I%MBN16^rp2>sz`INN$&tC zjYQ5%zKnE`Hx9I}wyIaD?sgKshBWgj%~Y1lLu^Y{n*h9B*O9)R>c1rqpo zzFskEu%A(ut%jcRV~2eB_Bq;qgwJCXSS=fkH?E@~#OUy$g~Kp@t_luWH`SEAot?9; z7ciPOjC z11bZI>f4g=7j;#a9r zyV8+EeU@r(oJM+G_#BpXolth(_BMoUjzrMgv|}~pYmmL|b%KjP?wFUu%dttAPD<#v zn(!92%be5gyBZfH2R)5wjy(g}y(UP}?OJ@Hef|s<`DGi;Ni75!nc3<<6jKV_x{ojb z{(#g#ik;;T-wQCJBt9aR$)dZR_h1p@x%2K_IL*{@yLU|MIA`E+c%}_TrS`-rRzI0{ zu5}Y?-RRB@uP{Z)Q7#NqE)=)trQ1MKK}QL0Bkg%jUGi+-Esf}P@mlyO^~abEw{Td4 z2{|N%wphF8tEh>`C9w#A%aK+*O}F}Uac()2d<-!}r#|Nc`FW}g;uYDdsFi>Xp?bL? 
zm{f!sqzskQ_)~Zb<)sXC>Zl|o-ce@toLj(U68J5A(uBL9Uh9wr=z_MGeE}Z>a_RQ5 zUa3=Y+5}{9*a{4*mhuGPB$;qY)lNd2LJhtxU&S};77o$$LA!M~X#I`6{{lXVzjNLGL^SLy76`j(&_kxBoQAa^IpwO|E{tAv{->ShG z>g&M;y0l^?U3Wt6lxhOyT1|8HkzvUR-Vx@+&}#)6j#vm;5`O}rNtmlP*^~XZa>)|;B>7Q${TD#240BobYiu;`E%=R`RjEY0;E%D5OO(EB zS3gtmYcW(t5B>;*Vk$xL2jysBAR>ppbkW#qSm@CG{w2sQa#d#s7nb`-&9+B`K7mO0 z?IVn`HjMyMdjUxZJ!3zw?$)DIaqMfF18g7V87I7|;p?nDDB}a|4R8CyucG} z%Wua~BybovS5J~}cV{4QLm-zlLLxVHkfeH!diI&1W#hKlCXEh5ZqXJRYP(MJq&#$X zNv&l-W$U-`Y$MC{OL30K9(f~QjV7hE`Ch%z8oDYWO#r03=J|~rLwZM?Fxn5zZ}98who<#MTAv2cCU6gC)0QtQ8Utm*tUbT@vZW@83uTnZO(q7F z8GnKNRH@~Q6g3=xAEhFkREXE$SNe?_PLr8s_hU$V?rB%YET#{tco9Tu+h$Ns{6Yub z+Z|xWq=f+nO)D~5_lmgo$C^nrwRvHl$8GqoQ0UpzP)apfuwfmOa|`=3q>813+D*~4 zvgD_TPMB|=H~4+H2pG(HAFCH)Qt>Viy9*0jpa%bo=n1@eHPk9~hEkt8MQX)&4xYgH zD7QMG#CnHz1p>1(CNP)>N-$d?JhEH^JaXSAd1Lr-#)WNembW{QEj_mcq+B4kjolkODrKtY7UKpa%6q9-Mhd8OP{+@0M7#@?dHh;rK9(j}P$)0XklOgz z80$snJXvjogxOeAw5p$To4f)6(gh|Px5L|=4)BlWX% zR79T&fmC24%+pz1HpAvN9}5~!}<8DV!Y-6gRaokM|a$5H^985RVFPyZ(q(s_&6PWV>G^2 zZ*~`*(ym09nMgS8P1}G_qMB%ihGJ1wEK-&-*qV7g10-!2+kms+(a*=-G2A@GmFU$9R-AxK-MSu6wQkAs`13b*G#+J~hvl1;7PUq9D8`elhE|$lf#?$7zi^ z4|(@>BPR9z1Q~mYd0HE?7UjVUpufhP14qD)uqVG1-di4)Tk|=m5)Azih)5A247DP12Hrw7!^)N5DdRCT66PP(KP3h(9%uk>9A158}8=ZJ6YL zQ`lXi`Z+Gu1j(HgHMx6Ykfy{I>VszwbY*`wxrp*^d}gK!<|?JTvfL|F?P)R!tk*F3 zS7bt6nmDL7&J@J)FT(`VU012hsK=-bhasV*=t~erFM;NI8VQ>Y*&)!NYgCd*Bim$P zkog<(hX{=~Ssx+7R;PNJFD6v-Y#D(-X1b&(&fu!rv)Wiyp0nwR$mCv zJtriUN*7{NwBRC|Ls+}UAb`IK_RORtp#VVJkjF^O-5onK0Rnuwv8UExF5X@zlNJ^7 z&aw{din1S;9X(^i6s z^ljI7=*x7ZUJS98Li^*dW1*x_`Iah`)&OauMrvf5WtgcbEZyvQ3o}F7MN6?1AU{F04e6xghVd>lEn@}sD6^# zO4P=vcx_V~R<-hJ`UT9jr{wZukk~FD0Gq5B$#L$xvM)%bsK}vMCV(Sh01>Js+Rn`c zxhKd|h;o4m#sD%cfgEKUPM(Df29OwyWEe+M@s>OVwwQz=oc$?GHFL}?EYo2j$0n;f zCRq?L{R2mlTM+tiV?Ylgf}QFnm3D2PaV&4=r!X>}da607f__gq4(H5oU+64qrU{;- zsVw4}<@0mVv5E!PcAzY}O!MeI0qly2+`;$eNI9)0Me(l5)!8<6-7EziTA8a0i}{mY zqxQ>UlqH_}Sev{a&f>d?0PTcK@~!h$KiPW+9M5?Cj-i>P;efW%MAwCf!aE513NDd8 z!apBoRZK^+zRbMvB|Fh^hUC;8O*RWpRU{CJ7D`toEQnSQ=h-O=9Tfb*=x7)2E|8dS zsoXOhREN_Z9+(s;JSYEzNr_}(Uy+Ve?F470x#$D2dUs1=#qh3@kYkLYu#7xvkRJR{sSfl6?=il*wvY}44y?PGEkQf~P5usxH^ z6IOrdkP3Z|+z2=0ArCZK&vwZ~ncUTI_<}1HWW%KpjN}&+v)IN@_Clvc%3iYp2^;pv z=-A2|6nFha6fjF!*yiJf83&K>up>htSb$|p&|Rs?ml0t9{W>cXA@S+zz4u$Hp`~Pd z3Mv?t?6#)^&bHTCk13~)?d4SW_k@KcTgNoKL^CYcQP67RQ*~a-IhU5dK=bNi5&9x+ z-?ih;^zL1sjRV#Z3i;l|%E2Q7N8HxCKF-f0Jn5b=vkFwPwMl`H-N;Q%Q|w0WmSg7{ zUg@*W8?gKB$av$7x#jxJ7h`GT*&TYkP0ifrDr~uC9__H8Hq)d zu|PO&YwqNDRj$vN#y}tOo2ubBw~$Y#-Gpl6e<-lbe$6Hs_l&0$G#Enh#0n0k*o>4j z*)e)ppmJ;C$`LJS*f>Fon=jwh(5}@f_B#}A>!UdqNk&y{_OTnEA98sZ*N1^F%P z)%wSmH~wUB%akI;r6E4F#|6u@G>M_T&t6N)#);PfKW~zDT7DdvdP47&RsL8nN8HBGbfuZu2!zC_#8OM4Lj@Ovb$a zu}d;9aa`^m$?LJ^*K|)Y?3a|D*@^1I*ey;0KJSx^%${-)wiU_1n{qK1qiq*1BF*c` z+D9o>vm9}HzSndF?H|zwOsM3C^U!^NG1nq)5IQD-Z(Z>+XL2##9}zbg__X8) z3452#DQIcOMA;$^FACY_-$9G{1!UZOW9|2-4LgD@mPSfh8Vvz`zqdGoPHL4F5>pANwisjfciU5#z&^d^uhXU zoU-YNINIR%;Lp6DAZQ^4Ov&ha*LDGQtcJgoAJng`*6y6QXvvH7rt8IWBggU7&cJ~&VK$P4` z2Zl2n;r-YtcXB=h`+(DuOP&5WB0Z6!gt_VSl+DB;wk{w|Av9{WcHDw^axb6Uf@U~^ zOJGtlBz9{Z9i=pMcSC09bfOKr80r%h%;@%yWWsxFFRu|W%eiFPP-_Xi^vSV3gaqNF za!xY?S{Mybbvb;Ni-5<?c;CgvhM$hMXaiqC#zJy`$AG*Azg{$)3RP>Ok02K9V|xQgC5DY{ z_1M?}`X|?}Dg7jz5}_7Vri?t_h(h?08D0H5z-N4JZap!fFUN(eyRV4&^AVXy?Gl*u z-|c2g*-d6KJ>Ko0l)yS+Iav%}fgEj*4?io^=QooJIH4HG&E39I4+?!Q+56S34K?2^ z0P*&W5}7c6lr0mWHqih|zfMMUKpHq!Hny>q`)+JWs?y!q{PoH5ej>_jwl!H1Rzj z&nLAAOl&|Z*UA>ZEqdsS{KVn`(p3s0Lw+2jXDD z!*F5JAGxSkj5u2bRAB3HQJ_I#=X8>c8P47)^nUN?Jj+&Jv1BL-MEy#{1hSq%woWW1 zJg>M3Q5V1&u5{#la&g83EW@`ZLC`7tfJC`KB^c#Y)| 
z%up`VQBqC90R*ME!86NK7gKjSpH3Ta?uXVTUx!PFmFxL$qe-(OY~1f0KWN#rU+D7q zywo_n6hQ*Y4YNhuG;Ch2pFT%>HF}Vd^L5#M3uOEX+gx28{!l&*f)&OcC>w3~&h>J} zE6lN%$=MNO7W`qDD#I;`jm(=`USB4d)d0BXy7h;TGPbABi))cU`|q#Ju~bbC;PI`D zjJjQHZXgpyl)jtrNgsn z0YX9fL}A3aX)?Z5%5%a?*vss#nBBr4d|)yXFjighY$&QleL;g`c%pXcX>go~+#7B< zHF@Dv=+M-SdIJ!gAjHVH{a!7WN|uF#ZRpNxho|lJ@+F!^SwOBB-rUpd z=F0U>6{X;BW4ZFDl7&7jt(cv=&wG11DdUjzD;;_3=RsfYa3rKhN9*cAeMxipT7p6|LYGWBdZ@EDedA0I?^Y2Qp!Uz6tzau{#%eyO3bK5Cy41bT#j zGU}|EQSlh>Q|N-euq}g?&9<$4%9oIyL3p-f8BE$mwkXt`q?;7u6a%TsqoJu^*Lt-= zpndMB6(%u}bjah)CV6?YPE050(<*?_Un3lN2h&eDxSUOFGg>YEP0Yk{&5-y_NVVTu z!KjVCUkAzaPfd<7t7221U2f>U>EhR;AZyGTdDPt;ngZdqt|@R=)u((xF#8)fJ_LKm zby{6Po#*6fEg$fhHIbYmDu%o-LdXzi*YMCUD^iHrh+I(*nCmWN8Cdp5C{=wJc1rA? znSab3LsV*om~@1*+b*y<@erd9rioXZL1ZhaLU5k_Y#z!yx37Dp7i!gfLtDxh>gc(yAM3P(DNt9-7e3kp$6qJ>yn|OsuZC1U z>Y@{ueB;2XKAk8%-^Bzg-mi?LD;xUbH2KSMr%l_e?oJ-SK5=m z&W>dY9Kq|nv{12b1?q0BijNSE&&^wph={xlg~g+~CPM}CppAbzp3dH|d_iRbs)@6d~$a{W`Ww&CSGy<0nO0tkp*bvy@oe2bp z`~8YFKaWq=gb)yF8W0eVz_0!+fWZ4_;*ye#(h3ra%u2F9o;) zt(Cov%kNNCp@rBmU{fgY1AJiq&jJWs`76ra!Ijb5%8}8<*uwl_YbHj$ko#Z&V332PCxWVyn68Ej{*pc|0}}P!O_sx+}+&vVNXPS zy|oHFIx~pC09EiHlK;p4{sIUb`zyo6)Y;0>)rA@C=j`a_YHoP%2>_0T|JP34)9z4@ zV`GpYAlfJ(AjtkkqXBPG{W(bwX|9gOzkylee^XTlk8%RIkCcCd9f2LR{{b^Ib~Sb} zcXcteGk1RRNA9JEt-I+=mF)pfnHt;=FYcdAKwudAZ-9Srz=8km&8%Fl9PAA(tZc#a z@b^yN3!4RLZ%rQDKt8}j>?^L{U>(gJU9HT_4ecGA?Tl@$yp8YS53u){rAR;aZM{YS zC#VAZoZs*7{Vae$D!t!ef0OCkh82t+f7fB@&L`sI=TPviMj zHH=4KsYrsQg7r6J{{I;N7j@j%?uCvU5Cm37Em$4kSeAblK;TRJAGQAk{8JG3_S3+f zX?d_9LcxN#&(H9)00LRPe%Swa4cs$)n#UR5frmf}tO2mV|15yOH-0}D|Ced^^!L#k zQ2kcO048~Y=LozF`)2_JmIeGE{hVQQuq0hA9n2nzhY&0tdJP;b5wLg|!Q%OQdys;E zQ2sx9xle5Nvw+q8`UmM>C33Ik*|*IL;OT@213zegGQd|rplR(7%>N~T!(VT`d#ZO$ z-1RH)w8?^(Iu!u#}3?tOc|wEgeo zdx8B6*jR{{zyqujG;p);y`g>szvetB?oZvaEkra6d31bN{U3pIZGFyaDeO5)eFTYfUx*a{O?8l zFI7BD?)|e9fIv9}$p5TXzm7ZnM&18~f4>$yOab_dABGM2^E`!>y_Kt>;cwRGhnfF= zS&#m-2J84|9>BlK_P*|ixn_P@XORA|6Gvc9s~vc#Q=x6{n7rPA%g!}{U7(^ zgUBquO!ox;?E0^ DataFrame: return dataset_df -# TODO: Useless funcs -def get_mbr_hit(scan: str): - """ - This function annotates if the peptide is inferred or not by Match between Runs algorithm (1), 0 if the peptide is - identified in the corresponding file. - :param scan: scan value - :return: - """ - return 1 if pd.isna(scan) else 0 - - -def get_run_mztab(ms_run: str, metadata: OrderedDict) -> str: - """ - Convert the ms_run into a reference file for merging with msstats output - :param ms_run: ms_run index in mztab - :param metadata: metadata information in mztab - :return: file name - """ - m = re.search(r"\[([A-Za-z0-9_]+)\]", ms_run) - file_location = metadata["ms_run[" + str(m.group(1)) + "]-location"] - file_location = get_spectrum_prefix(file_location) - return os.path.basename(file_location) - - -def get_scan_mztab(ms_run: str) -> str: - """ - Get the scan number for an mzML spectrum in mzTab. 
-    must be controllerType=0 controllerNumber=1 scan=30121
-    :param ms_run: the original ms_run reference in mzTab
-    :return: the scan index
-    """
-    reference_parts = ms_run.split()
-    return reference_parts[-1]
-
-
-def best_probability_error_bestsearch_engine(probability: float) -> float:
-    """
-    Convert probability to a Best search engine score
-    :param probability: probability
-    :return:
-    """
-    return 1 - probability
-
-
 # Functions needed by Combiner
 def load_sdrf(sdrf_path: str) -> DataFrame:
     """
diff --git a/ibaqpy.egg-info/PKG-INFO b/ibaqpy.egg-info/PKG-INFO
new file mode 100644
index 0000000..990f17d
--- /dev/null
+++ b/ibaqpy.egg-info/PKG-INFO
@@ -0,0 +1,354 @@
+Metadata-Version: 2.1
+Name: ibaqpy
+Version: 0.0.3
+Summary: Python package to compute intensity-based absolute expression values
+Home-page: https://github.com/bigbio/ibaqpy/
+Download-URL: https://github.com/bigbio/ibaqpy/
+Author: Yasset Perez-Riverol
+Author-email: ypriverol@gmail.com
+License: MIT
+Keywords: Proteomics,Label-free,absolute quantification
+Classifier: Development Status :: 4 - Beta
+Classifier: Environment :: Console
+Classifier: Intended Audience :: Science/Research
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Natural Language :: English
+Classifier: Operating System :: MacOS :: MacOS X
+Classifier: Operating System :: POSIX
+Classifier: Operating System :: Unix
+Classifier: Programming Language :: Python
+Classifier: Topic :: Scientific/Engineering
+Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
+Classifier: Topic :: Scientific/Engineering :: Visualization
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: pyopenms
+Requires-Dist: scikit-learn
+Requires-Dist: numpy
+Requires-Dist: click
+Requires-Dist: pandas
+Requires-Dist: matplotlib
+Requires-Dist: qnorm
+Requires-Dist: seaborn
+Requires-Dist: typing_extensions
+
+# ibaqpy
+
+[![Python application](https://github.com/bigbio/ibaqpy/actions/workflows/python-app.yml/badge.svg)](https://github.com/bigbio/ibaqpy/actions/workflows/python-app.yml)
+[![Upload Python Package](https://github.com/bigbio/ibaqpy/actions/workflows/python-publish.yml/badge.svg)](https://github.com/bigbio/ibaqpy/actions/workflows/python-publish.yml)
+[![Codacy Badge](https://app.codacy.com/project/badge/Grade/6a1961c7d57c4225b4891f73d58cac6b)](https://app.codacy.com/gh/bigbio/ibaqpy/dashboard?utm_source=gh&utm_medium=referral&utm_content=&utm_campaign=Badge_grade)
+[![PyPI version](https://badge.fury.io/py/ibaqpy.svg)](https://badge.fury.io/py/ibaqpy)
+![PyPI - Downloads](https://img.shields.io/pypi/dm/ibaqpy)
+
+iBAQ (intensity Based Absolute Quantification) determines the abundance of a protein by dividing the total precursor intensities by the number of theoretically observable peptides of the protein. The TPA (Total Protein Approach) value is determined by summing the peptide intensities of each protein and then dividing by the molecular mass to determine the relative concentration of each protein. By using [ProteomicRuler](https://www.sciencedirect.com/science/article/pii/S1535947620337749), it is possible to calculate the protein copy number and absolute concentration. **ibaqpy** computes IBAQ values, TPA values, copy numbers and concentrations for proteins starting from an msstats input file (or a feature parquet from [quantmsio](https://github.com/bigbio/quantms.io)) and an SDRF experimental design file. In addition, it supports the merging of iBAQ results from multiple datasets and the elimination of outliers and batch effects. This package provides multiple tools:
+
+- `peptide_normalization.py`: Generate the peptides dataframe from an msstats input file and an SDRF experimental design file (or directly from a feature parquet), then normalize the peptides dataframe. It includes multiple steps such as peptidoform normalization, peptidoform to peptide summarization, peptide intensity normalization, and imputation.
+
+- `compute_ibaq.py`: Compute IBAQ values from the output file of the script `peptide_normalization.py`.
+
+- `compute_tpa.py`: Compute TPA values, protein copy numbers and concentrations from the output file of the script `peptide_normalization.py`.
+
+- `datasets_merge.py`: Merge ibaq results from multiple datasets. It consists of three steps: missing value imputation, outlier removal, and batch effect removal.
+
+**NOTE:** In all scripts and result files, *uniprot accession* is used as the protein identifier.
+
+### How to install ibaqpy
+
+Ibaqpy is available on PyPI and can be installed using pip:
+
+```asciidoc
+pip install ibaqpy
+```
+
+You can also install the package from source:
+
+1. Clone the repository:
+
+```asciidoc
+>$ git clone https://github.com/bigbio/ibaqpy
+>$ cd ibaqpy
+```
+
+2. Create the conda environment:
+
+```asciidoc
+>$ mamba env create -f conda-environment.yaml
+```
+
+3. Install ibaqpy:
+
+```asciidoc
+>$ python setup.py install
+```
+
+### Collecting intensity files
+
+Absolute quantification files have been stored at the following URL:
+
+```
+https://ftp.pride.ebi.ac.uk/pub/databases/pride/resources/proteomes/absolute-expression/
+```
+
+Inside each project reanalysis folder, the folder proteomicslfq contains the msstats input file with the structure `{Name of the project}_msstats_in.csv`.
+
+E.g. http://ftp.pride.ebi.ac.uk/pub/databases/pride/resources/proteomes/absolute-expression/PXD003947/proteomicslfq/PXD003947.sdrf_openms_design_msstats_in.csv
+
+### Peptide Normalization - peptide_normalization.py
+
+```asciidoc
+python peptide_normalization.py --msstats PXD003947.sdrf_openms_design_msstats_in.csv --sdrf PXD003947.sdrf.tsv --remove_ids data/contaminants_ids.tsv --remove_decoy_contaminants --remove_low_frequency_peptides --output PXD003947-peptides-norm.csv
+```
+
+The command provides additional flags for skip_normalization, pnormalization, compress, log2, violin and verbose. If you use a feature parquet as input, you can pass the `--sdrf`.
+
+```asciidoc
+Usage: peptide_normalization.py [OPTIONS]
+
+Options:
+  -m, --msstats TEXT               MsStats file import generated by quantms
+  -p, --parquet TEXT               Parquet file import generated by quantmsio
+  -s, --sdrf TEXT                  SDRF file import generated by quantms
+  --stream                         Stream processing normalization
+  --chunksize                      The number of rows of MSstats or parquet
+                                   read using pandas streaming
+  --min_aa INTEGER                 Minimum number of amino acids to filter
+                                   peptides
+  --min_unique INTEGER             Minimum number of unique peptides to filter
+                                   proteins
+  --remove_ids TEXT                Remove specific protein ids from the
+                                   analysis using a file with one id per line
+  --remove_decoy_contaminants      Remove decoy and contaminant proteins from
+                                   the analysis
+  --remove_low_frequency_peptides  Remove peptides that are present in less
+                                   than 20% of the samples
+  --output TEXT                    Peptide intensity file including all other
+                                   properties for normalization
+  --skip_normalization             Skip normalization step
+  --nmethod TEXT                   Normalization method used to normalize
+                                   intensities for all samples (options:
+                                   msstats, quantile, qnorm)
+  --pnormalization                 Normalize the peptide intensities using
+                                   different methods (options: quantile,
+                                   qnorm)
+  --compress                       Read the input peptides file as a
+                                   compressed gzip file
+  --log2                           Transform the peptide intensity values to
+                                   log2 before normalization
+  --violin                         Use violin plots instead of boxplots for
+                                   distribution representations
+  --verbose                        Print additional information about the
+                                   distributions of the intensities, number
+                                   of peptides removed after normalization,
+                                   etc.
+  --qc_report TEXT                 PDF file to store multiple QC images
+  --help                           Show this message and exit.
+```
+
+Peptide normalization starts from the peptides dataframe. The structure of the input contains the following columns:
+
+- ProteinName: Protein name
+- PeptideSequence: Peptide sequence including post-translational modifications `(e.g. .(Acetyl)ASPDWGYDDKN(Deamidated)GPEQWSK)`
+- PrecursorCharge: Precursor charge
+- FragmentIon: Fragment ion
+- ProductCharge: Product charge
+- IsotopeLabelType: Isotope label type
+- Condition: Condition label `(e.g. heart)`
+- BioReplicate: Biological replicate index `(e.g. 1)`
+- Run: Run index `(e.g. 1)`
+- Fraction: Fraction index `(e.g. 1)`
+- Intensity: Peptide intensity
+- Reference: Name of the RAW file containing the peptide intensity `(e.g. Adult_Heart_Gel_Elite_54_f16)`
+- SampleID: Sample ID `(e.g. PXD003947-Sample-3)`
+- StudyID: Study ID `(e.g. PXD003947)`. In most cases the study ID is the same as the ProteomeXchange ID.
+
+#### 1. Removing Contaminants and Decoys
+
+The first step is to remove contaminants and decoys. The script `peptide_normalization.py` provides a parameter `--remove_decoy_contaminants` as a flag to remove all the proteins with the following prefixes: `CONTAMINANT` and `DECOY`. In addition, the user can provide a file with a list of protein accessions that represent contaminant or highly abundant proteins. An example file can be seen in `data/contaminants_ids.txt`.
+
+#### 2. Peptidoform Normalization
+
+A peptidoform is a combination of a `PeptideSequence(Modifications) + Charge + BioReplicate + Fraction`. In the current version of the file, each row corresponds to one peptidoform.
+
+The current version of the tool uses the package [qnorm](https://pypi.org/project/qnorm/) to normalize the intensities for each peptidoform. **qnorm** implements a quantile normalization method. However, the current version of qnorm cannot handle NA values, which leads to even more NA values in the data. We suggest users use the default method 'quantile' instead for now.
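+
+For illustration only (this is not the package's actual implementation, and the sample and peptidoform names are made up), quantile normalization of a wide peptidoform-by-sample matrix with **qnorm** looks like this:
+
+```python
+import pandas as pd
+import qnorm
+
+# Hypothetical intensity matrix: one row per peptidoform, one column per sample.
+intensities = pd.DataFrame(
+    {"Sample-1": [5.0, 2.0, 3.0], "Sample-2": [4.0, 1.0, 4.0], "Sample-3": [3.0, 4.0, 6.0]},
+    index=["PEPTIDEK/2", "SEQUENCER/2", "PROTEINK/3"],
+)
+
+# Force every sample (column) onto the same intensity distribution.
+normalized = qnorm.quantile_normalize(intensities)
+print(normalized)
+```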
+
+#### 3. Peptidoform to Peptide Summarization
+
+For each peptidoform, a canonical peptide sequence without modifications is generated. The intensities of all peptides are grouped by biological replicate and aggregated with `sum`.
+
+Then, the intensities of the peptides across different biological replicates are summarized using the function `median`.
+
+At the end of this step, for each peptide, the corresponding protein and the intensity of the peptide are stored.
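+
+A compact pandas sketch of this summarization (the column names follow the input table above but are illustrative, not the package's actual code):
+
+```python
+import pandas as pd
+
+# Hypothetical normalized peptidoform rows for one peptide of one protein.
+df = pd.DataFrame({
+    "ProteinName":      ["P12345"] * 4,
+    "PeptideCanonical": ["ASPDWGYDDKNGPEQWSK"] * 4,
+    "SampleID":         ["PXD003947-Sample-3"] * 4,
+    "BioReplicate":     [1, 1, 2, 2],
+    "NormIntensity":    [10.0, 5.0, 9.0, 8.0],
+})
+
+# Sum peptidoform intensities within each biological replicate ...
+per_replicate = (
+    df.groupby(["ProteinName", "PeptideCanonical", "SampleID", "BioReplicate"])
+    ["NormIntensity"].sum().reset_index()
+)
+
+# ... then summarize across biological replicates with the median.
+per_peptide = (
+    per_replicate.groupby(["ProteinName", "PeptideCanonical", "SampleID"])
+    ["NormIntensity"].median().reset_index()
+)
+print(per_peptide)  # replicate sums 15.0 and 17.0 -> median 16.0
+```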
+
+#### 4. Peptide Intensity Imputation and Normalization
+
+Before the final two steps (peptide normalization and imputation), the algorithm removes all peptides that are a significant source of missing values: all peptides with more than 80% missing values, and peptides that do not appear in more than 1 sample.
+
+Finally, two extra steps are performed:
+
+- ``peptide intensity imputation``: Imputation is performed using the package [missingpy](https://pypi.org/project/missingpy/). The algorithm uses a Random Forest algorithm to perform the imputation (see the sketch below).
+- ``peptide intensity normalization``: Similar to the normalization of the peptidoform intensities, the peptide intensities are normalized using the package [qnorm](https://pypi.org/project/qnorm/).
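+
+A minimal sketch of the imputation step with missingpy; the matrix below is made up, while the real input is the peptide-by-sample intensity matrix produced above:
+
+```python
+import numpy as np
+from missingpy import MissForest
+
+# Hypothetical peptide x sample matrix with missing intensities.
+matrix = np.array([
+    [10.0, 11.0, np.nan],
+    [4.0, np.nan, 5.0],
+    [7.0, 8.0, 9.0],
+])
+
+# Random-Forest-based imputation of the missing values.
+imputer = MissForest(random_state=42)
+imputed = imputer.fit_transform(matrix)
+print(imputed)
+```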
#### 3. Peptidoform to Peptide Summarization

For each peptidoform, a canonical peptide sequence without modifications is generated. The intensities of all peptidoforms are then summed (`sum`) within each biological replicate.

Then, the intensities of the peptides across different biological replicates are summarized using the function `median`.

At the end of this step, for each peptide, the corresponding protein and the peptide intensity are stored.
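A sketch of this summarization step in pandas (column names follow the input table above; `PeptideCanonical` is a hypothetical column holding the unmodified sequence):

```python
import pandas as pd

def summarize_peptides(df: pd.DataFrame) -> pd.DataFrame:
    """Collapse peptidoforms into one intensity per peptide and sample."""
    # Sum all peptidoform intensities within each biological replicate...
    per_replicate = df.groupby(
        ["ProteinName", "PeptideCanonical", "SampleID", "BioReplicate"]
    )["Intensity"].sum()
    # ...then summarize across replicates with the median.
    return (
        per_replicate.groupby(level=["ProteinName", "PeptideCanonical", "SampleID"])
        .median()
        .reset_index()
    )
```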
#### 4. Peptide Intensity Imputation and Normalization

Before the final two steps (peptide normalization and imputation), the algorithm removes all peptides that are a major source of missing values: peptides with more than 80% missing values and peptides that do not appear in more than one sample.

Finally, two extra steps are performed:

- ``peptide intensity imputation``: Imputation is performed using the package [missingpy](https://pypi.org/project/missingpy/), which uses a Random Forest algorithm.
- ``peptide intensity normalization``: Similar to the normalization of the peptidoform intensities, the peptide intensities are normalized using the package [qnorm](https://pypi.org/project/qnorm/).

### Compute IBAQ - compute_ibaq.py
IBAQ (intensity-based absolute quantification) is an intensity-based method that can be used to estimate the relative abundance of proteins in a sample. The IBAQ value is the total intensity of a protein divided by its number of theoretical peptides.

```asciidoc
python compute_ibaq.py --fasta Homo-sapiens-uniprot-reviewed-contaminants-decoy-202210.fasta --peptides PXD003947-peptides.csv --enzyme "Trypsin" --normalize --output PXD003947-ibaq.tsv
```

The command provides an additional flag to normalize the IBAQ values.

```asciidoc
python compute_ibaq.py --help
Usage: compute_ibaq.py [OPTIONS]

  Compute the IBAQ values for a peptide output file with the format described
  in peptide_normalization.py.

  :param min_aa: Minimum number of amino acids to consider a peptide
  :param max_aa: Maximum number of amino acids to consider a peptide
  :param fasta: Fasta file used to perform the peptide identification
  :param peptides: Peptide intensity file
  :param enzyme: Enzyme used to digest the protein sample
  :param normalize: Use some basic normalization steps
  :param output: Output file containing the ibaq values
  :param verbose: Print additional information about the distributions of the intensities,
  number of peptides removed after normalization, etc.
  :param qc_report: PDF file to store multiple QC images

Options:
  -f, --fasta TEXT     Protein database to compute IBAQ values [required]
  -p, --peptides TEXT  Peptide identifications with intensities following the peptide intensity output [required]
  -e, --enzyme         Enzyme used during the analysis of the dataset (default: Trypsin)
  -n, --normalize      Normalize IBAQ values by using the total IBAQ of the experiment
  --min_aa             Minimum number of amino acids to consider a peptide (default: 7)
  --max_aa             Maximum number of amino acids to consider a peptide (default: 30)
  -o, --output TEXT    Output file containing the ibaq values
  --verbose            Print additional information about the distributions of the intensities,
                       number of peptides removed after normalization, etc.
  --qc_report          PDF file to store multiple QC images (default: "IBAQ-QCprofile.pdf")
  --help               Show this message and exit.
```

#### 1. Perform the Enzymatic Digestion
The current version of this tool uses OpenMS to load the FASTA file and [ProteaseDigestion](https://openms.de/current_doxygen/html/classOpenMS_1_1ProteaseDigestion.html) to digest the protein sequences in silico, finally obtaining the number of theoretical peptides for each protein.
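A minimal sketch of this counting step with pyopenms (the helper name is an assumption; the length range mirrors the `--min_aa`/`--max_aa` defaults above):

```python
import pyopenms as oms

def count_theoretical_peptides(sequence: str, enzyme: str = "Trypsin",
                               min_aa: int = 7, max_aa: int = 30) -> int:
    """Count in-silico digestion products within the allowed length range."""
    digestion = oms.ProteaseDigestion()
    digestion.setEnzyme(enzyme)
    peptides = []  # filled by digest() with AASequence objects
    digestion.digest(oms.AASequence.fromString(sequence), peptides)
    return sum(min_aa <= p.size() <= max_aa for p in peptides)
```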
#### 2. Calculate the IBAQ Value
First, the peptide intensity dataframe is grouped by protein name, sample name and condition, and the protein intensity of each group is summed. Finally, the summed intensity of the protein is divided by its number of theoretical peptides.

If a protein group exists in the peptide intensity dataframe, the intensities of all proteins in the protein group are summed following the steps above and then multiplied by the number of proteins in the protein group.

#### 3. IBAQ Normalization
The ibaq values are normalized using the total ibaq of the sample. The resulting ibaq values are then multiplied by 100,000,000 to obtain ibaq ppb values (PRIDE database normalization), and log10-transformed and shifted by 10 (ProteomicsDB normalization).

### Compute TPA - compute_tpa.py
The total protein approach (TPA) is a label- and standard-free method for absolute protein quantification using large-scale proteomic data. In the current version of the tool, the TPA value is the total intensity of the protein divided by its theoretical molecular mass.

[ProteomicRuler](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4256500/) is a method for protein copy number and concentration estimation that does not require isotope-labeled standards. It uses the mass spectrometry signal of histones as a "proteomic ruler" because it is proportional to the amount of DNA in the sample, which in turn depends on the cell count. Thus, this approach can add an absolute scale to the mass spectrometry readout and allows estimating the copy number of individual proteins in each cell.

```asciidoc
python compute_tpa.py --fasta Homo-sapiens-uniprot-reviewed-contaminants-decoy-202210.fasta --organism 'human' --peptides PXD003947-peptides.csv --ruler --ploidy 2 --cpc 200 --output PXD003947-tpa.tsv --verbose
```

```asciidoc
python compute_tpa.py --help
Usage: compute_tpa.py [OPTIONS]

  Compute the protein copy numbers and concentrations from a peptide output file
  with the format described in peptide_normalization.py.

  :param fasta: Fasta file used to perform the peptide identification
  :param peptides: Peptide intensity file
  :param organism: Organism source of the data
  :param ruler: Whether to compute protein copy numbers, weights and concentrations
  :param ploidy: Ploidy number
  :param cpc: Cellular protein concentration (g/L)
  :param output: Output file containing the TPA values, protein copy numbers and concentrations
  :param verbose: Print additional information about the distributions of the intensities,
  number of peptides removed after normalization, etc.
  :param qc_report: PDF file to store multiple QC images

Options:
  -f, --fasta TEXT     Protein database to compute IBAQ values [required]
  -p, --peptides TEXT  Peptide identifications with intensities following the peptide intensity output [required]
  -m, --organism       Organism source of the data
  -r, --ruler          Calculate protein copy numbers and concentrations according to ProteomicRuler
  -n, --ploidy         Ploidy number (default: 2)
  -c, --cpc            Cellular protein concentration (g/L) (default: 200)
  -o, --output TEXT    Output file containing the TPA values, protein copy numbers and concentrations
  --verbose            Print additional information about the distributions of the intensities,
                       number of peptides removed after normalization, etc.
  --qc_report          PDF file to store multiple QC images (default: "TPA-QCprofile.pdf")
  --help               Show this message and exit.
```

#### 1. Calculate the TPA Value
The OpenMS tool is used to calculate the theoretical molecular mass of each protein. Analogous to the IBAQ calculation, the TPA value of a protein group is the sum of its intensities divided by the sum of the theoretical molecular masses.

#### 2. Calculate the Cellular Protein Copy Number and Concentration
The protein copy number calculation follows this formula:
```
protein copies per cell = protein MS-signal * (avogadro / molecular mass) * (DNA mass / histone MS-signal)
```
For the cellular protein copy number calculation, the UniProt accessions of the histones are first obtained for the species and the molecular mass of the DNA is calculated. The dataframe is then grouped by condition, and the copy number, molar amount and mass of the proteins are calculated.

For the protein concentration, the cell volume is first derived from the cellular protein concentration, and the protein mass is then divided by this volume to obtain the intracellular protein concentration.
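The formula above translates directly into code; a small illustrative helper (names and units are assumptions, not the tool's API):

```python
AVOGADRO = 6.02214076e23  # molecules per mole

def copies_per_cell(protein_signal: float, molecular_mass: float,
                    dna_mass: float, histone_signal: float) -> float:
    """Proteomic-ruler estimate: the MS signal of a protein is converted to
    molecules via Avogadro's number over the molecular mass (g/mol), and the
    DNA mass (g) per unit of histone MS signal anchors the scale to one cell."""
    return protein_signal * (AVOGADRO / molecular_mass) * (dna_mass / histone_signal)
```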
### Datasets Merge - datasets_merge.py
There are batch effects in the protein identification and quantification results between different studies, which may be caused by differences in experimental techniques, experimental conditions, data analysis, etc. Here we provide a method to apply batch effect correction: the ibaq data is first imputed, outliers are then removed using `hdbscan`, and batch effect correction is finally applied using `pycombat`.

```asciidoc
python datasets_merge.py datasets_merge --data_folder ../ibaqpy_test/ --output datasets-merge.csv --verbose
```

```asciidoc
python datasets_merge.py --help
Usage: datasets_merge.py [OPTIONS]

  Merge ibaq results from compute_ibaq.py.

  :param data_folder: Data folder containing SDRFs and ibaq CSVs.
  :param output: Output file after batch effect removal.
  :param covariate: Indicators included in covariate consideration when datasets are merged.
  :param organism: Organism to keep in input data.
  :param covariate_to_keep: Keep tissue parts from metadata, e.g. 'LV,RV,LA,RA'.
  :param non_missing_percent_to_keep: Percentage of non-missing values to keep in each group.
  :param n_components: Number of principal components to be computed.
  :param min_cluster: The minimum size of clusters.
  :param min_sample_num: The minimum number of samples in a neighborhood for a point to be considered as a core point.
  :param n_iter: Number of iterations to be performed.
  :param verbose/quiet: Output debug information.

Options:
  -d, --data_folder TEXT    Data folder containing SDRFs and ibaq CSVs. [required]
  -o, --output TEXT         Output file after batch effect removal. [required]
  -c, --covariate TEXT      Indicators included in covariate consideration when datasets are merged.
  --organism TEXT           Organism to keep in input data.
  --covariate_to_keep TEXT  Keep tissue parts from metadata, e.g. 'LV,RV,LA,RA'.
  --non_missing_percent_to_keep FLOAT
                            Percentage of non-missing values to keep in each group.
  --n_components TEXT       Number of principal components to be computed.
  --min_cluster TEXT        The minimum size of clusters.
  --min_sample_num TEXT     The minimum number of samples in a neighborhood for a point to be considered as a core point.
  --n_iter TEXT             Number of iterations to be performed.
  -v, --verbose / -q, --quiet  Output debug information.
  --help                    Show this message and exit.
```

#### 1. Impute Missing Values
Proteins missing in more than 30% of all samples are removed. Users can keep the tissue parts of interest; the data is then converted to an expression matrix (samples as columns, proteins as rows), and missing values are imputed with `KNNImputer`.

#### 2. Remove Outliers
To remove outliers, hierarchical density-based clustering is performed repeatedly using `hdbscan.HDBSCAN`, where outliers are labeled -1 in the PCA plot. For the clustering, the default minimum cluster size and the minimum number of neighbors of a core point are both set to the minimum number of samples across all datasets.

*Users can skip this step and remove outliers manually.*

#### 3. Batch Effect Correction
Batch effect correction is performed with `pycombat`; the batch is set to `datasets` (referring specifically to PXD identifiers) and the covariate should be `tissue_part`.

### Citation

Wang H, Dai C, Pfeuffer J, Sachsenberg T, Sanchez A, Bai M, Perez-Riverol Y. Tissue-based absolute quantification using large-scale TMT and LFQ experiments. Proteomics. 2023 Oct;23(20):e2300188. doi: [10.1002/pmic.202300188](https://analyticalsciencejournals.onlinelibrary.wiley.com/doi/10.1002/pmic.202300188). Epub 2023 Jul 24. PMID: 37488995.

### Credits

- Julianus Pfeuffer
- Yasset Perez-Riverol
- Hong Wang
diff --git a/ibaqpy.egg-info/SOURCES.txt b/ibaqpy.egg-info/SOURCES.txt
new file mode 100644
index 0000000..b2c3106
--- /dev/null
+++ b/ibaqpy.egg-info/SOURCES.txt
@@ -0,0 +1,20 @@
+LICENSE
+README.md
+setup.py
+bin/__init__.py
+bin/compute_ibaq.py
+bin/compute_tpa.py
+bin/datasets_merger.py
+bin/merge_condition_files.py
+bin/normalize_methods.py
+bin/peptide_normalization.py
+bin/tsne_visualization.py
+ibaq/__init__.py
+ibaq/combiner.py
+ibaq/ibaqpy_commons.py
+ibaq/utils.py
+ibaqpy.egg-info/PKG-INFO
+ibaqpy.egg-info/SOURCES.txt
+ibaqpy.egg-info/dependency_links.txt
+ibaqpy.egg-info/requires.txt
+ibaqpy.egg-info/top_level.txt
\ No newline at end of file
diff --git a/ibaqpy.egg-info/dependency_links.txt b/ibaqpy.egg-info/dependency_links.txt
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/ibaqpy.egg-info/dependency_links.txt
@@ -0,0 +1 @@
+
diff --git a/ibaqpy.egg-info/requires.txt b/ibaqpy.egg-info/requires.txt
new file mode 100644
index 0000000..4dfc27e
--- /dev/null
+++ b/ibaqpy.egg-info/requires.txt
@@ -0,0 +1,9 @@
+pyopenms
+scikit-learn
+numpy
+click
+pandas
+matplotlib
+qnorm
+seaborn
+typing_extensions
diff --git a/ibaqpy.egg-info/top_level.txt b/ibaqpy.egg-info/top_level.txt
new file mode 100644
index 0000000..caa62ff
--- /dev/null
+++ b/ibaqpy.egg-info/top_level.txt
@@ -0,0 +1,2 @@
+bin
+ibaq

From ac861ebbcc91e06d0ef86de2c39e754fd9163219 Mon Sep 17 00:00:00 2001
From: zprobot <1727697083@qq.com>
Date: Thu, 28 Mar 2024 16:39:51 +0800
Subject: [PATCH 2/3] update: code

---
 bin/peptide_normalization.py | 659 ++++++++++---------------------
 1 file changed, 148 insertions(+), 511 deletions(-)

diff --git a/bin/peptide_normalization.py b/bin/peptide_normalization.py
index 5cf18d0..bbcbd69 100644
--- a/bin/peptide_normalization.py
+++ b/bin/peptide_normalization.py
@@ -620,379 +620,75 @@ def
peptide_normalization( compression_method = "gzip" if compress else None print("Loading data..") - if not stream: - if parquet is None: - # Read the msstats file - feature_df = pd.read_csv( - msstats, - sep=",", - compression=compression_method, - dtype={CONDITION: "category", ISOTOPE_LABEL_TYPE: "category"}, - ) - - # Read the sdrf file - sdrf_df, label, sample_names, choice = analyse_sdrf( - sdrf, compression_method - ) - print(sdrf_df) - - # Merged the SDRF with the Resulted file - dataset_df = msstats_common_process(feature_df) - dataset_df = merge_sdrf(label, sdrf_df, feature_df) - # Remove the intermediate variables and free the memory - del feature_df, sdrf_df - gc.collect() - else: - dataset_df = pd.read_parquet(parquet,cols=PARQUET_COLUMNS) - label, sample_names, choice = analyse_feature_df(dataset_df) - dataset_df = parquet_common_process(dataset_df, label, choice) - - dataset_df = data_common_process(dataset_df, min_aa) - # Only proteins with unique peptides number greater than min_unique (default: 2) are retained - unique_peptides = set( - dataset_df.groupby(PEPTIDE_CANONICAL) - .filter(lambda x: len(set(x[PROTEIN_NAME])) == 1)[PEPTIDE_CANONICAL] - .tolist() - ) - strong_proteins = set( - dataset_df[dataset_df[PEPTIDE_CANONICAL].isin(unique_peptides)] - .groupby(PROTEIN_NAME) - .filter(lambda x: len(set(x[PEPTIDE_CANONICAL])) >= min_unique)[ - PROTEIN_NAME - ] - .tolist() - ) - dataset_df = dataset_df[dataset_df[PROTEIN_NAME].isin(strong_proteins)] - - print(f"Number of unique peptides: {len(unique_peptides)}") - print(f"Number of strong proteins: {len(strong_proteins)}") - - print("Logarithmic if specified..") - dataset_df = dataset_df.rename(columns={INTENSITY: NORM_INTENSITY}) - if log2: - dataset_df[NORM_INTENSITY] = np.log2(dataset_df[NORM_INTENSITY]) - - # Print the distribution of the original peptide intensities from quantms analysis - if verbose: - sample_names = set(dataset_df[SAMPLE_ID]) - plot_width = len(sample_names) * 0.5 + 10 - pdf = PdfPages(qc_report) - density = plot_distributions( - dataset_df, - NORM_INTENSITY, - SAMPLE_ID, - log2=not log2, - width=plot_width, - title="Original peptidoform intensity distribution (no normalization)", - ) - #plt.show() - pdf.savefig(density) - """ - box = plot_box_plot( - dataset_df, - NORM_INTENSITY, - SAMPLE_ID, - log2=not log2, - width=plot_width, - title="Original peptidoform intensity distribution (no normalization)", - violin=violin, - ) - plt.show() - pdf.savefig(box) - """ - - # Remove high abundant and contaminants proteins and the outliers - if remove_ids is not None: - print("Remove proteins from file...") - dataset_df = remove_protein_by_ids(dataset_df, remove_ids) - if remove_decoy_contaminants: - print("Remove decoy and contaminants...") - dataset_df = remove_contaminants_entrapments_decoys(dataset_df) - - print_dataset_size(dataset_df, "Peptides after contaminants removal: ", verbose) - print("Normalize intensities.. 
") - # dataset_df = dataset_df.dropna(how="any") - if not skip_normalization: - dataset_df = intensity_normalization( - dataset_df, - field=NORM_INTENSITY, - class_field=SAMPLE_ID, - scaling_method=nmethod, - ) - if verbose: - density = plot_distributions( - dataset_df, - NORM_INTENSITY, - SAMPLE_ID, - #log2=log_after_norm, - width=plot_width, - title="Peptidoform intensity distribution after normalization, method: " - + nmethod, - ) - #plt.show() - pdf.savefig(density) - """ - box = plot_box_plot( - dataset_df, - NORM_INTENSITY, - SAMPLE_ID, - log2=log_after_norm, - width=plot_width, - title="Peptidoform intensity distribution after normalization, method: " - + nmethod, - violin=violin, - ) - plt.show() - pdf.savefig(box) - """ - print("Number of peptides after normalization: " + str(len(dataset_df.index))) - print("Select the best peptidoform across fractions...") - dataset_df = get_peptidoform_normalize_intensities(dataset_df) - print( - "Number of peptides after peptidofrom selection: " - + str(len(dataset_df.index)) + if parquet is None: + # Read the msstats file + feature_df = pd.read_csv( + msstats, + sep=",", + compression=compression_method, + dtype={CONDITION: "category", ISOTOPE_LABEL_TYPE: "category"}, ) - print("Sum all peptidoforms per Sample...") - dataset_df = sum_peptidoform_intensities(dataset_df) - print("Number of peptides after selection: " + str(len(dataset_df.index))) - - print("Average all peptidoforms per Peptide/Sample...") - dataset_df = average_peptide_intensities(dataset_df) - print("Number of peptides after average: " + str(len(dataset_df.index))) - if verbose: - density = plot_distributions( - dataset_df, - NORM_INTENSITY, - SAMPLE_ID, - log2=log_after_norm, - width=plot_width, - title="Peptide intensity distribution method: " + nmethod, - ) - plt.show() - pdf.savefig(density) - box = plot_box_plot( - dataset_df, - NORM_INTENSITY, - SAMPLE_ID, - log2=log_after_norm, - width=plot_width, - title="Peptide intensity distribution method: " + nmethod, - violin=violin, - ) - plt.show() - pdf.savefig(box) - - if remove_low_frequency_peptides and len(sample_names) > 1: - print(dataset_df) - dataset_df = remove_low_frequency_peptides_(dataset_df, 0.20) - print_dataset_size( - dataset_df, "Peptides after remove low frequency peptides: ", verbose - ) - # Perform imputation using Random Forest in Peptide Intensities - # TODO: Check if this is necessary (Probably we can do some research if imputation at peptide level is necessary - # if impute: - # dataset_df = impute_peptide_intensities(dataset_df, field=NORM_INTENSITY, class_field=SAMPLE_ID) - - if pnormalization: - print("Normalize at Peptide level...") - dataset_df = peptide_intensity_normalization( - dataset_df, - field=NORM_INTENSITY, - class_field=SAMPLE_ID, - scaling_method=nmethod, - ) - - if verbose: - density = plot_distributions( - dataset_df, - NORM_INTENSITY, - SAMPLE_ID, - log2=log_after_norm, - width=plot_width, - title="Normalization at peptide level method: " + nmethod, - ) - plt.show() - pdf.savefig(density) - box = plot_box_plot( - dataset_df, - NORM_INTENSITY, - SAMPLE_ID, - log2=log_after_norm, - width=plot_width, - title="Normalization at peptide level method: " + nmethod, - violin=violin, - ) - plt.show() - pdf.savefig(box) - pdf.close() - - print("Save the normalized peptide intensities...") - dataset_df.to_csv(output, index=False, sep=",") + # Read the sdrf file + sdrf_df, label, sample_names, choice = analyse_sdrf( + sdrf, compression_method + ) + print(sdrf_df) + + # Merged the SDRF with the 
Resulted file + dataset_df = msstats_common_process(feature_df) + dataset_df = merge_sdrf(label, sdrf_df, feature_df) + # Remove the intermediate variables and free the memory + del feature_df, sdrf_df + gc.collect() else: - if parquet is None: - sdrf_df, label, sample_names, choice = analyse_sdrf( - sdrf, compression_method - ) - msstats_chunks = pd.read_csv( - msstats, - sep=",", - compression=compression_method, - dtype={CONDITION: "category", ISOTOPE_LABEL_TYPE: "category"}, - chunksize=chunksize, - ) - else: - label, sample_names, choice = analyse_feature_parquet( - parquet, batch_size=chunksize - ) - msstats_chunks = read_large_parquet(parquet, batch_size=chunksize) - sample_number = len(sample_names) - - # TODO: Stream processing to obtain strong proteins with more than 2 uniqe peptides - temp = f"Temp-{str(uuid.uuid4())}/" - os.mkdir(temp) - print(f"INFO: Writing files into {temp}...") - unique_peptides = {} - group_intensities = {} - quantile = {} - print("INFO: First iteration to get unique peptides and strong proteins...") - for msstats_df in msstats_chunks: - if parquet is None: - msstats_df = msstats_common_process(msstats_df) - msstats_df = merge_sdrf(label, sdrf_df, msstats_df) - else: - msstats_df = parquet_common_process(msstats_df, label, choice) - result_df = data_common_process(msstats_df, min_aa) - - # Write CSVs by Sample ID - for sample in sample_names: - file_name = f"{temp}/{sample}.csv" - write_mode = "a" if os.path.exists(file_name) else "w" - header = False if os.path.exists(file_name) else True - result_df[result_df[SAMPLE_ID] == sample].to_csv( - file_name, index=False, header=header, mode=write_mode - ) - unique_df = result_df.groupby([PEPTIDE_CANONICAL]).filter( - lambda x: len(set(x[PROTEIN_NAME])) == 1 - )[[PEPTIDE_CANONICAL, PROTEIN_NAME]] - unique_dict = dict( - zip(unique_df[PEPTIDE_CANONICAL], unique_df[PROTEIN_NAME]) - ) - for i in unique_dict.keys(): - if i in unique_peptides.keys() and unique_dict[i] != unique_peptides[i]: - unique_peptides.pop(i) - else: - unique_peptides[i] = unique_dict[i] - - proteins_list = list(unique_peptides.values()) - count_dict = { - element: proteins_list.count(element) for element in set(proteins_list) - } - strong_proteins = [ - element for element in count_dict if count_dict[element] >= min_unique + dataset_df = pd.read_parquet(parquet,columns=PARQUET_COLUMNS) + label, sample_names, choice = analyse_feature_df(dataset_df) + dataset_df = parquet_common_process(dataset_df, label, choice) + + dataset_df = data_common_process(dataset_df, min_aa) + # Only proteins with unique peptides number greater than min_unique (default: 2) are retained + unique_peptides = set( + dataset_df.groupby(PEPTIDE_CANONICAL) + .filter(lambda x: len(set(x[PROTEIN_NAME])) == 1)[PEPTIDE_CANONICAL] + .tolist() + ) + strong_proteins = set( + dataset_df[dataset_df[PEPTIDE_CANONICAL].isin(unique_peptides)] + .groupby(PROTEIN_NAME) + .filter(lambda x: len(set(x[PEPTIDE_CANONICAL])) >= min_unique)[ + PROTEIN_NAME ] - del proteins_list, count_dict - print(f"Number of unique peptides: {len(list(unique_peptides.keys()))}") - print(f"Number of strong proteins: {len(strong_proteins)}") + .tolist() + ) + dataset_df = dataset_df[dataset_df[PROTEIN_NAME].isin(strong_proteins)] - # TODO: Filter proteins with less unique peptides than min_unique (default: 2) - plot_samples = random.sample(sample_names, min(len(sample_names), 20)) - plot_width = 10 + len(plot_samples) * 0.5 + print(f"Number of unique peptides: {len(unique_peptides)}") + print(f"Number of strong 
proteins: {len(strong_proteins)}") + + print("Logarithmic if specified..") + dataset_df = dataset_df.rename(columns={INTENSITY: NORM_INTENSITY}) + if log2: + dataset_df[NORM_INTENSITY] = np.log2(dataset_df[NORM_INTENSITY]) + + # Print the distribution of the original peptide intensities from quantms analysis + """ + if verbose: + sample_names = set(dataset_df[SAMPLE_ID]) + plot_width = len(sample_names) * 0.5 + 10 pdf = PdfPages(qc_report) - original_intensities_df = pd.DataFrame() - - print("INFO: Second iteration to filter data and prepare normalization...") - print("Logarithmic if specified..") - norm_record = [0] * 2 - for sample in sample_names: - msstats_df = pd.read_csv(f"{temp}/{sample}.csv", sep=",") - msstats_df = msstats_df[msstats_df[PROTEIN_NAME].isin(strong_proteins)] - # Remove high abundant and contaminants proteins and the outliers - if remove_ids is not None: - msstats_df = remove_protein_by_ids(msstats_df, remove_ids) - if remove_decoy_contaminants: - msstats_df = remove_contaminants_entrapments_decoys(msstats_df) - norm_record[0] += len(msstats_df) - msstats_df = msstats_df.rename(columns={INTENSITY: NORM_INTENSITY}) - if log2: - msstats_df[NORM_INTENSITY] = np.log2(msstats_df[NORM_INTENSITY]) - if sample in plot_samples: - original_intensities_df = pd.concat( - [original_intensities_df, msstats_df] - ) - if not skip_normalization: - if nmethod == "msstats": - if label in ["TMT", "ITRAQ"]: - g = msstats_df.groupby(["Run", "Channel"]) - else: - g = msstats_df.groupby(["Run", "Fraction"]) - for name, group in g: - group_intensity = group[NORM_INTENSITY].tolist() - if name not in group_intensities: - group_intensities[name] = group_intensity - else: - group_intensities.update( - { - name: group_intensities[NORM_INTENSITY] - + group_intensity - } - ) - elif nmethod == "quantile": - msstats_df = ( - msstats_df.groupby( - [ - PEPTIDE_SEQUENCE, - PEPTIDE_CANONICAL, - PEPTIDE_CHARGE, - FRACTION, - RUN, - BIOREPLICATE, - PROTEIN_NAME, - STUDY_ID, - CONDITION, - ] - )[NORM_INTENSITY] - .agg(np.nanmean) - .reset_index() - ) - rank = msstats_df[NORM_INTENSITY].rank(method="average") - dic = dict(zip(rank, msstats_df[NORM_INTENSITY])) - if len(quantile) == 0: - quantile = {k: (v, 1) for k, v in dic.items()} - else: - # update = min(len(quantile), len(dic)) - intersec = set(quantile.keys()) & set(dic.keys()) - update = set(dic.keys()) - set(quantile.keys()) - quantile.update( - { - i: (quantile[i][0] + dic[i], quantile[i][1] + 1) - for i in intersec - } - ) - if len(update) > 0: - quantile.update({k: (dic[k], 1) for k in update}) - msstats_df[SAMPLE_ID] = sample - else: - exit("Stream process only supports msstats and quantile methods!") - msstats_df.to_csv(f"{temp}/{sample}.csv", index=False, sep=",") - norm_record[1] += len(msstats_df) - if not skip_normalization and nmethod == "quantile": - quantile = {k: v[0] / v[1] for k, v in quantile.items()} - print(f"Peptides after contaminants removal: {norm_record[0]}") - print(f"Number of peptides after normalization: {norm_record[1]}") - # Save original intensities QC plots - original_intensities_df = original_intensities_df.reset_index(drop=True) density = plot_distributions( - original_intensities_df, + dataset_df, NORM_INTENSITY, SAMPLE_ID, log2=not log2, width=plot_width, title="Original peptidoform intensity distribution (no normalization)", ) + #plt.show() pdf.savefig(density) box = plot_box_plot( - original_intensities_df, + dataset_df, NORM_INTENSITY, SAMPLE_ID, log2=not log2, @@ -1001,180 +697,116 @@ def peptide_normalization( 
violin=violin, ) plt.show() - pdf.savefig(box) - del original_intensities_df - - # TODO: Peptide intensity normalization - peptides_count = pd.DataFrame( - columns=[PROTEIN_NAME, PEPTIDE_CANONICAL, "count"] + pdf.savefig(box) + """ + + # Remove high abundant and contaminants proteins and the outliers + if remove_ids is not None: + print("Remove proteins from file...") + dataset_df = remove_protein_by_ids(dataset_df, remove_ids) + if remove_decoy_contaminants: + print("Remove decoy and contaminants...") + dataset_df = remove_contaminants_entrapments_decoys(dataset_df) + + print_dataset_size(dataset_df, "Peptides after contaminants removal: ", verbose) + print("Normalize intensities.. ") + # dataset_df = dataset_df.dropna(how="any") + if not skip_normalization: + dataset_df = intensity_normalization( + dataset_df, + field=NORM_INTENSITY, + class_field=SAMPLE_ID, + scaling_method=nmethod, ) - norm_intensities_df = pd.DataFrame() - if not skip_normalization and nmethod == "msstats": - # For ISO normalization - if label in ["TMT", "ITRAQ"]: - median_baseline = np.nanmedian( - list(set(sum(group_intensities.values(), []))) - ) - group_intensities = { - key: np.nanmedian(list(values)) - for key, values in group_intensities.items() - } - else: - fractions = [i[1] for i in group_intensities.keys()] - fraction_median = {} - for fraction in fractions: - fraction_keys = [ - i for i in group_intensities.keys() if i[1] == fraction - ] - fraction_intensities = [] - for key in fraction_keys: - fraction_intensities.extend(group_intensities[key]) - fraction_median[fraction] = np.nanmedian(fraction_intensities) - group_intensities = { - key: np.nanmedian(values) - for key, values in group_intensities.items() - } - print("INFO: Third iteration to normalize and counting peptides frequency...") - size_record = [0] * 3 - - def normalization( - dataset_df, label, sample, skip_normalization, nmethod, record - ): - if not skip_normalization: - field = NORM_INTENSITY - if nmethod == "msstats": - # For ISO normalization - if label in ["TMT", "ITRAQ"]: - dataset_df.loc[:, NORM_INTENSITY] = dataset_df.apply( - lambda x: x[field] - - group_intensities[(x["Run"], x["Channel"])] - + median_baseline, - axis=1, - ) - else: - dataset_df.loc[:, NORM_INTENSITY] = dataset_df.apply( - lambda x: x[field] - - group_intensities[(x["Run"], x["Fraction"])] - + np.nanmedian( - [ - group_intensities[i] - for i in group_intensities.keys() - if i[1] == x["Fraction"] - ] - ), - axis=1, - ) - elif nmethod == "quantile": - rank = dataset_df[NORM_INTENSITY].rank(method="average") - ref_dict = dict(zip(rank, dataset_df[NORM_INTENSITY])) - ref_dict = {v: quantile[k] for k, v in ref_dict.items()} - dataset_df.loc[:, NORM_INTENSITY] = dataset_df.apply( - lambda x: ref_dict.get(x[NORM_INTENSITY], np.nan), - axis=1, - ) - dataset_df = dataset_df.drop_duplicates() - dataset_df = dataset_df[dataset_df[NORM_INTENSITY].notna()] - dataset_df = get_peptidoform_normalize_intensities(dataset_df) - record[0] += len(dataset_df.index) - dataset_df = sum_peptidoform_intensities(dataset_df) - record[1] += len(dataset_df.index) - dataset_df = average_peptide_intensities(dataset_df) - record[2] += len(dataset_df.index) - - return dataset_df, record - - for sample in sample_names: - dataset_df = pd.read_csv(f"{temp}/{sample}.csv", sep=",") - if len(dataset_df) != 0: - norm_df, size_record = normalization( - dataset_df, label, sample, skip_normalization, nmethod, size_record - ) - else: - continue - sample_peptides = norm_df[PEPTIDE_CANONICAL].unique().tolist() - 
if remove_low_frequency_peptides and sample_number > 1: - sample_peptides = norm_df[ - [PROTEIN_NAME, PEPTIDE_CANONICAL] - ].drop_duplicates() - sample_peptides["count"] = 1 - peptides_count = ( - pd.concat([peptides_count, sample_peptides]) - .groupby([PROTEIN_NAME, PEPTIDE_CANONICAL]) - .agg(sum) - .reset_index() - ) - norm_df.to_csv(f"{temp}/{sample}.csv", sep=",", index=False) - if sample in plot_samples: - norm_intensities_df = pd.concat([norm_intensities_df, norm_df]) - del group_intensities, quantile - print(f"Number of peptides after peptidofrom selection: {size_record[0]}") - print(f"Number of peptides after selection: {size_record[1]}") - print(f"Number of peptides after average: {size_record[2]}") - # Save normalized intensities QC plots - norm_intensities_df = norm_intensities_df.reset_index(drop=True) + if verbose: + """ density = plot_distributions( - norm_intensities_df, + dataset_df, + NORM_INTENSITY, + SAMPLE_ID, + #log2=log_after_norm, + width=plot_width, + title="Peptidoform intensity distribution after normalization, method: " + + nmethod, + ) + #plt.show() + pdf.savefig(density) + box = plot_box_plot( + dataset_df, NORM_INTENSITY, SAMPLE_ID, log2=log_after_norm, width=plot_width, title="Peptidoform intensity distribution after normalization, method: " + nmethod, + violin=violin, + ) + plt.show() + pdf.savefig(box) + """ + print("Number of peptides after normalization: " + str(len(dataset_df.index))) + print("Select the best peptidoform across fractions...") + dataset_df = get_peptidoform_normalize_intensities(dataset_df) + print( + "Number of peptides after peptidofrom selection: " + + str(len(dataset_df.index)) + ) + + print("Sum all peptidoforms per Sample...") + dataset_df = sum_peptidoform_intensities(dataset_df) + print("Number of peptides after selection: " + str(len(dataset_df.index))) + + print("Average all peptidoforms per Peptide/Sample...") + dataset_df = average_peptide_intensities(dataset_df) + print("Number of peptides after average: " + str(len(dataset_df.index))) + """ + if verbose: + density = plot_distributions( + dataset_df, + NORM_INTENSITY, + SAMPLE_ID, + log2=log_after_norm, + width=plot_width, + title="Peptide intensity distribution method: " + nmethod, ) plt.show() pdf.savefig(density) box = plot_box_plot( - norm_intensities_df, + dataset_df, NORM_INTENSITY, SAMPLE_ID, log2=log_after_norm, width=plot_width, - title="Peptidoform intensity distribution after normalization, method: " - + nmethod, + title="Peptide intensity distribution method: " + nmethod, violin=violin, ) plt.show() pdf.savefig(box) - del norm_intensities_df, strong_proteins - - print("INFO: Writing normalized intensities into CSV...") - if remove_low_frequency_peptides and sample_number > 1: - peptides_count = peptides_count.loc[ - (peptides_count["count"] > 0.20 * sample_number) - & (peptides_count["count"] != sample_number - 1) - ] + """ - final_norm_intensities_df = pd.DataFrame() - size_record = 0 - for sample in sample_names: - dataset_df = pd.read_csv(f"{temp}/{sample}.csv", sep=",") - if remove_low_frequency_peptides and sample_number > 1: - # Filter low-frequency peptides, which indicate whether the peptide occurs less than 20% in all samples or - # only in one sample - dataset_df = dataset_df.merge( - peptides_count[[PEPTIDE_CANONICAL, PROTEIN_NAME]], how="inner" - ) - size_record += len(dataset_df.index) - dataset_df = dataset_df[ - [PEPTIDE_CANONICAL, PROTEIN_NAME, SAMPLE_ID, NORM_INTENSITY, CONDITION] - ] - write_mode = "a" if os.path.exists(output) else "w" - header 
= False if os.path.exists(output) else True - dataset_df.to_csv(output, index=False, header=header, mode=write_mode) - dataset_df.to_csv(f"{temp}/{sample}.csv", sep=",", index=False) - if sample in plot_samples: - final_norm_intensities_df = pd.concat( - [final_norm_intensities_df, dataset_df] - ) - print(f"Peptides after remove low frequency peptides: {size_record}") - if remove_low_frequency_peptides: - del peptides_count - - # TODO: No peptides intensity normalization applied in stream processing. - # Save final normalized intensities QC plots - final_norm_intensities_df = final_norm_intensities_df.reset_index(drop=True) + if remove_low_frequency_peptides and len(sample_names) > 1: + print(dataset_df) + dataset_df = remove_low_frequency_peptides_(dataset_df, 0.20) + print_dataset_size( + dataset_df, "Peptides after remove low frequency peptides: ", verbose + ) + # Perform imputation using Random Forest in Peptide Intensities + # TODO: Check if this is necessary (Probably we can do some research if imputation at peptide level is necessary + # if impute: + # dataset_df = impute_peptide_intensities(dataset_df, field=NORM_INTENSITY, class_field=SAMPLE_ID) + + if pnormalization: + print("Normalize at Peptide level...") + dataset_df = peptide_intensity_normalization( + dataset_df, + field=NORM_INTENSITY, + class_field=SAMPLE_ID, + scaling_method=nmethod, + ) + """ + if verbose: density = plot_distributions( - final_norm_intensities_df, + dataset_df, NORM_INTENSITY, SAMPLE_ID, log2=log_after_norm, @@ -1184,7 +816,7 @@ def normalization( plt.show() pdf.savefig(density) box = plot_box_plot( - final_norm_intensities_df, + dataset_df, NORM_INTENSITY, SAMPLE_ID, log2=log_after_norm, @@ -1195,6 +827,11 @@ def normalization( plt.show() pdf.savefig(box) pdf.close() + """ + + print("Save the normalized peptide intensities...") + dataset_df.to_csv(output, index=False, sep=",") + if __name__ == "__main__": From 4d24860a36203bf6ea1348b8e9d6cc424748cd14 Mon Sep 17 00:00:00 2001 From: zprobot <1727697083@qq.com> Date: Thu, 4 Apr 2024 23:34:22 +0800 Subject: [PATCH 3/3] update: ibaq --- .gitignore | 2 + .../normalize_methods.cpython-310.pyc | Bin 2375 -> 3110 bytes bin/__pycache__/parquet.cpython-310.pyc | Bin 0 -> 2141 bytes bin/normalize_methods.py | 116 +-- bin/parquet.py | 46 ++ bin/peptide_normalization.py | 347 +++------ build/lib/bin/normalize_methods.py | 114 ++- build/lib/bin/peptide_normalization.py | 736 +++--------------- build/lib/ibaq/ibaqpy_commons.py | 53 +- build/scripts-3.10/peptide_normalization.py | 736 +++--------------- dist/ibaqpy-0.0.3-py3.10.egg | Bin 100087 -> 92892 bytes ibaq/ibaqpy_commons.py | 53 +- ibaqpy.egg-info/SOURCES.txt | 1 + 13 files changed, 539 insertions(+), 1665 deletions(-) create mode 100644 bin/__pycache__/parquet.cpython-310.pyc create mode 100644 bin/parquet.py diff --git a/.gitignore b/.gitignore index 82e8311..7d2e1d0 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,5 @@ res.csv venv /venv/ /compute-all.sh +/build +/dist \ No newline at end of file diff --git a/bin/__pycache__/normalize_methods.cpython-310.pyc b/bin/__pycache__/normalize_methods.cpython-310.pyc index 872356d544de50ef337491335ecfc1c7c88accd8..6530eb561d8e74fc9085a5440547a7b4881d4e3e 100644 GIT binary patch literal 3110 zcma)8&2JM&6yKTs@Y;@p6A0uBl+sdIRZ6K;N=s9|nj+;AspUWwf~<{a6E@jh+p}u~ zXf0JrEA_(v07w3tp6A*g$~iYK^!H|M$94l^SM%P?o0&K7d!|*ZRSbN|SGC;-Q-<+3 z4MsnX!ELnkPXJ*E78zSiW4>hq^T>+rEt?s_6xQ#?mLqK80GC8bxWKL`iwbaARK*l< zMbyMJa8;ZVb>Jy6BW8hX;RLRoR|l$iv_U=JR_FGGVrWm_YLp-8%Sep 
zdQ8oBx_**{QP9YwpQO7j88=Ou?0?Y7D53p`mi_~f896&-Ju^22JFsLshYkzIOZLj> zS-G`tJbKi#b7P+u^xQt+!rbkzcxhsllUccQ$n@@bVkIo}2dCA&QfB9++{yW&xy^fS zubeyJz1pi}PF}(JmGPZ8&0H|@$^pM^2upLQ=GENIE1;H_j3wj4>(`9V%}zZx?qeP7 zV{g7;&{etj{kRXK*)y-CYI{M}kU={Nn|>B3r<;VGZlKJl z)l^Q_io!I*AV}0KEljZ6kHY6c!wd1Ea7IRxQfJQiw@wFl6p~&0I>o>MH?%w%%JKXU* z?PuFNVX}QJPH|+q+J2^L$5b$~u{XT{;4+s@pEq?rco=gZAj90y7EFbkTG z6PPzz8QS@gXDQwm%9VjH8qM^Htl|Xe0(ho81*9x8sGK1(>0f{a>QOMCMN1ukX{cf` zO&g1P5}%qD|0X&)w;=ohmsLg?Ksf-BLF1s!fd=IYnlM2*4+RHfiYIGRS#jVex=-O9E7O0n zH>voinEwpzXchorse^P8u%bg(()@c>m!D%+UIrMG>Jy@#D3!cIQzVo00FZ})n3Ra5 z`I1(AL4Zd8JD-f9qih$QG1FiCu<7@?u!OzGg(FHYxUQ@xJHncDgy&8=Vh?BI`}bDA z^VcoqL}3#666si@Vj|nE$j`!7;@KlUqs0Dz<0ehSI+F&v1O_+I(ovUPId&P%kC}`j z*F@7MYon91eqvUN#5~0H)=$gRMOyYLl$Oh&ATj$CM6tXCiaDgH+v%GnS&vVo?rSx&-SM{O*j8JN{ zZJ6MyUT_iSV5=E z9T@(IiU{hP@zQvOQiIfHcuHuqyk>j+<%Cn@=$1%K0%THqX4JH~isTtDfqZs14>c%HByiP2J4E2btIQYgwc^TBg zo;`h&%+m;Os+`Sh3m4|#2sp$SsW5Q&ZS*Hi^DStU$tl%!mz`mfC}Y~tjE$!Ah$5k4!-Q#$)Y;*~1#dKi~AYDm7bCH%?MzhlvQDqA1)24|#_G=_Stt zsPej>`S+wB2dJs~(i*N(eEr^}*xzAKgqG5a+F=xn`gSI|cUsv=H^u{nCvD}l{Y3bw zK4ohzJ%~`LlGQdI2(r}-(lku=3Nh$iqHlM7@97&rmtjeTMF+VKMIAnpTrSB?+FX*3 s9T8E7AH5)yyB4=ZHwtc&H`5gW7gomWh`Tec?UvoD{%2jQ_^0gu0D$^lasU7T literal 2375 zcma)7OK%)S5T5RN?t0f=zY@OygA$2>#J~ll2oVETL?mS4x*4G6Cg3&?#SW`)>5Hh@?a;sE&qh+QErkWC;?g?K=o z0&y#(0puAFuS7%>_K$-`X*CQn1zK9$%1Wa!Q%)L*7}`w3k#Y`_d@oD_7e+UYv%ozD zUAc^u86U~ODH#mRmqn($3$EV$gU@fOY4X?2z+{-j$C2uko@+>(+}0+PyVa1{$-dpR z@-jP#pf;{IfCpK@_#$R!-+-w(DP?G{4r?>_zJ z>z&?|++vIhLc@hrdJd^JNmG97yDd`_ZnMvN$HxB?^XT^2`^XbmB>jckX>=9Z) zw{(HsV+%|Yc5Axl4=_sU(g`236HaAVpD;f`0X{^*9rH00CMdu>JTt3&W;(nxuH0Up z4AZPocASZ5G%$wMg^q#K-MD8vF@~!gmR2(Y4_2hbhi&*jlEz<##<XndJnJi`A7&yf zwAWyw|uu`oZXHq)$OpB5B)N oV46d!tjhZ`&BZW@zM`vLtV4K)hf}1V$GsKLn*3~N$Mag=zo4VNuK)l5 diff --git a/bin/__pycache__/parquet.cpython-310.pyc b/bin/__pycache__/parquet.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..437f052223435eace52d90b7bb8815f88bcd2011 GIT binary patch literal 2141 zcmZ`)O>Y}F5GA=QX)Vi+k_JthCIHbMV&!0@SHUpi`lB$As%UJafa+keT5|18rCpii zqRB=s+So@69+}TB;IgYqyr7AIgOM ziGz#Hfx!dl`W_fT1nrXc_?Nb6F=lP{1rba*XGA!stnCQ8M;h)M@F%Sh?ZxK8-~n`v zMiM-u%>BA7&)jxdEDKLoV7wrzXQj3$7Derhv=_t@+;YWMc4S~O zC2g%AsN+l;zaN;RkiqHGhTkRVo6z+-7(<@YAsf&URX2GBmb2ztL9YGZ|(^+H#btUSRCi9XPs`nw>OD#I{nYYB1|+*^Lx(`I@L@`1$M3 zR`X!@^Np_$;!befe}2%3lY_~ztNrKJ^Zhu9jqigdh^~RD(JC!dk5>P!J9&MA`s&op zR&>zoccs3BJd%wE6v%%9AAm6#QoxuQXFyL2oKM(*AqOYUzyV5G{zsKh+Wt|jd6Xrg ziPMC4(?DpB=qcGxmEn;}d&PJR5Ep?7I)MgsQ^f;*sM4&@(}?%9ud+n1&dzPZOM@QA z$tK_6U4Z_A-`q_}kXF*<`?EV|Yx%dcPYf3D1!jQtBBnV%mz=&ol1lP1P<{}GQfqwI zZi~M-x#ZX=sRcmGI{LV)7V)cOGCf3iEYd=?4RP%55fEdd=JTp z{6b%mR}@m13U=lhDjdVkf&FuG>THoyiVD~wPimk5G6WTj=!rXU25!V26H1P62p4ow z%3sbf8uJIzn1nHA2;%8Tg+$`K<{cR)hrF*+1X>Cnm_jUozvL$F$rT}^AnO|b+5N`+ z=;yKQiYE63Xj#R5RGvRu4JkZp+OV#mx<7!fZ-Y66_dcnf&~rL~zjJbf{LDTgzp`Vt zH_p0(7)VF9df(FcHB&cXSHa{3%BpLq$1v^n*9#=^n%g8_nNkxQsr*T{t*&*`FzD)a z_?T`|7-Os9+G|K+mc&pMr~bC2g5*$Irz;cnA^3sP3N>LMZBNO_$8)R;U)p+|z=TJ> zh{n-<3_hw*RqbL7cVJqhoY;?EiDQMJHR}G8Uw>b~?*Qn)#!Wl{0i2eH5flh02mqo0 z0Ef;9%5cNUn=>A!i8;?*Aq8Ye90mrM?n~ee2%Kkpet%Mzw|4jTcUw@P-$}zu^%}}; z{cR7(Jj_aU4b62hwro;_tHBBo+lv#h;d(uW3}GoCQ;55_2-f#))%Sa;$RP22Tl4+nEa*;F9Bq`UgTHk#oPt>E#EDT@bBn7lgfou` ym4)GGOlprTvt_S-rM9SW9`wVMRJg1vXmIj1): + samples = df['SampleID'].unique() + for sample in samples: + runs = df.loc[df['SampleID']==sample,'Run'].unique().tolist() + if(len(runs)>1): + sample_df = df.loc[df['SampleID']==sample,:] + map,base = get_normalize_args(sample_df,runs,method) + for run in runs: + run = str(run) + run_intensity = 
df.loc[(df['SampleID']==sample)&(df['Run']==run),'NormIntensity'] + df.loc[(df['SampleID']==sample)&(df['Run']==run),'NormIntensity'] = run_intensity / (map[run] / base) + return df + else: + return df + +def get_replicate(sdrf_path): + sdrf = pd.read_csv(sdrf_path,sep="\t") + reps = len(sdrf["comment[technical replicate]"].unique()) + return reps + +def get_normalize_args(df,runs,method): + match method: + case 'mean': + return normalize_mean(df,runs) + case 'median': + return normalize_median(df,runs) + case 'iqr': + return normalize_q(df,runs) + +def normalize_mean(df,runs): + map = {} + total = 0 + for run in runs: + run = str(run) + run_m = df.loc[df['Run']==run,'NormIntensity'].mean() + map[run] = run_m + total += run_m + avg = total / len(runs) + return map,avg + +def normalize_median(df,runs): + map = {} + total = 0 + for run in runs: + run = str(run) + run_m = df.loc[df['Run']==run,'NormIntensity'].median() + map[run] = run_m + total += run_m + med = total / len(runs) + return map,med + +def normalize_q(df,runs): + map = {} + total = 0 + for run in runs: + run = str(run) + run_m = df.loc[df['Run']==run,'NormIntensity'].quantile([0.75,0.25],interpolation='linear').mean() + map[run] = run_m + total += run_m + q = total / len(runs) + return map,q + def normalize(df,method): match method: case 'mean': @@ -16,70 +78,34 @@ def normalize(df,method): return global_normalize(df) case 'max_min': return max_min_mormalize(df) - case 'z_score': - return z_score_normalize(df) - case 'iqr': - return iqr_normalize(df) - case 'robust': - return robust_normalize(df) - case 'vsn': - return vsn_normalize(df) - case 'quantile': - return quantile_normalize(df) case _: return -1 # mean def mean_normalize(df): - return df / df.mean(axis=0) + return df / df.mean() # median def median_normalize(df): - return df / df.median(axis=0) + return df / df.median() #max def max_normalize(df): - return df / df.max(axis=0) + return df / df.max() #global def global_normalize(df): - return df / df.sum(axis=0) + return df / df.sum() #max-min def max_min_mormalize(df): - min = df.min(axis=0) - return (df - min) / (df.max(axis=0) - min) - -#z-score -def z_score_normalize(df): - return (df - df.mean(axis=0)) / df.var(axis=0) - -#IQR -def iqr_normalize(df): - Q = df.quantile([0.75,0.25],interpolation='linear',axis=0) - IQR = Q.loc[0.75,:] - Q.loc[0.25,:] - return (df - df.median(axis=0)) / IQR - -#rubust -def robust_normalize(df): - index = df.index - columns = df.columns - df = robust_scale(df, axis=0) - df = pd.DataFrame(df,columns=columns,index=index) - return df - -#vsn -def vsn_normalize(df): - index = df.index - columns = df.columns - df = power_transform(df, method='box-cox') - df = pd.DataFrame(df,columns=columns,index=index) - return df + min = df.min() + return (df - min) / (df.max() - min) #quantile def quantile_normalize(df): index = df.index columns = df.columns - DF = quantile_transform(df,axis=0) + DF = quantile_transform(df) df = pd.DataFrame(df,columns=columns,index=index) return df \ No newline at end of file diff --git a/bin/parquet.py b/bin/parquet.py new file mode 100644 index 0000000..b461e96 --- /dev/null +++ b/bin/parquet.py @@ -0,0 +1,46 @@ +import re +import os +import duckdb +class Feature: + + def __init__(self, parquet_path: str): + if os.path.exists(parquet_path): + self.parquet_db = duckdb.connect() + self.parquet_db = self.parquet_db.execute( + "CREATE VIEW parquet_db AS SELECT * FROM parquet_scan('{}')".format(parquet_path)) + else: + raise FileNotFoundError(f'the file {parquet_path} does 
not exist.') + + def get_report_from_database(self, samples: list): + """ + This function loads the report from the duckdb database for a group of ms_runs. + :param runs: A list of ms_runs + :return: The report + """ + database = self.parquet_db.sql( + """ + select * from parquet_db + where sample_accession IN {} + """.format(tuple(samples)) + ) + report = database.df() + return report + + def iter_samples(self,file_num:int=20): + """ + :params file_num: The number of files being processed at the same time(default 10) + :yield: _description_ + """ + samples = self.get_unique_samples() + ref_list = [samples[i:i+file_num] for i in range(0,len(samples), file_num)] + for refs in ref_list: + batch_df = self.get_report_from_database(refs) + yield refs,batch_df + + def get_unique_samples(self): + """ + return: A list of deduplicated peptides. + """ + unique_peps = self.parquet_db.sql(f"SELECT DISTINCT sample_accession FROM parquet_db").df() + + return unique_peps['sample_accession'].tolist() \ No newline at end of file diff --git a/bin/peptide_normalization.py b/bin/peptide_normalization.py index bbcbd69..90f980c 100644 --- a/bin/peptide_normalization.py +++ b/bin/peptide_normalization.py @@ -11,7 +11,8 @@ from matplotlib.backends.backend_pdf import PdfPages from pandas import DataFrame import pyarrow.parquet as pq -from normalize_methods import normalize +from parquet import Feature +from normalize_methods import normalize,normalize_run from ibaq.ibaqpy_commons import ( BIOREPLICATE, @@ -71,6 +72,7 @@ def recover_df(df): sample:NORM_INTENSITY },inplace=True) out = pd.concat([out,samples_df]) + out.reset_index(inplace=True,drop=True) return out def analyse_sdrf(sdrf_path: str, compression: bool) -> tuple: @@ -280,7 +282,7 @@ def map_canonical_seq(data_df: pd.DataFrame) -> (pd.DataFrame, dict): data_df.apply(lambda x: len(x[PEPTIDE_CANONICAL]) >= min_aa, axis=1) ] data_df[PROTEIN_NAME] = data_df[PROTEIN_NAME].apply(parse_uniprot_accession) - data_df[STUDY_ID] = data_df[SAMPLE_ID].apply(get_study_accession) + #data_df[STUDY_ID] = data_df[SAMPLE_ID].apply(get_study_accession) if FRACTION not in data_df.columns: data_df[FRACTION] = 1 data_df = data_df[ @@ -290,19 +292,16 @@ def map_canonical_seq(data_df: pd.DataFrame) -> (pd.DataFrame, dict): PEPTIDE_CANONICAL, PEPTIDE_CHARGE, INTENSITY, - REFERENCE, CONDITION, RUN, BIOREPLICATE, FRACTION, - FRAGMENT_ION, ISOTOPE_LABEL_TYPE, - STUDY_ID, SAMPLE_ID, ] ] data_df[CONDITION] = pd.Categorical(data_df[CONDITION]) - data_df[STUDY_ID] = pd.Categorical(data_df[STUDY_ID]) + #data_df[STUDY_ID] = pd.Categorical(data_df[STUDY_ID]) data_df[SAMPLE_ID] = pd.Categorical(data_df[SAMPLE_ID]) return data_df @@ -389,7 +388,8 @@ def remove_low_frequency_peptides_( :param percentage_samples: percentage of samples :return: """ - + #= dataset_df[[SAMPLE_ID, CONDITION]].drop_duplicates(subset=[SAMPLE_ID]) + c_map = dataset_df[[SAMPLE_ID, CONDITION]].drop_duplicates(subset=[SAMPLE_ID]).set_index(SAMPLE_ID).to_dict()[CONDITION] normalize_df = pd.pivot_table( dataset_df, index=[PEPTIDE_CANONICAL, PROTEIN_NAME], @@ -398,6 +398,8 @@ def remove_low_frequency_peptides_( aggfunc={NORM_INTENSITY: np.nanmean}, observed=True, ) + del dataset_df + gc.collect() # Count the number of null values in each row null_count = normalize_df.isnull().sum(axis=1) # Find the rows that have null values above the threshold @@ -413,14 +415,10 @@ def remove_low_frequency_peptides_( ] normalize_df = recover_df(normalize_df) # recover condition column - normalize_df = normalize_df.merge( - 
dataset_df[[SAMPLE_ID, CONDITION]].drop_duplicates(subset=[SAMPLE_ID]), - on=SAMPLE_ID, - how="left", - ) + normalize_df.loc[:,CONDITION] = normalize_df[SAMPLE_ID].map(c_map) # Remove rows with null values in NORMALIZE_INTENSITY - normalize_df = normalize_df[normalize_df[NORM_INTENSITY].notna()] + normalize_df.dropna(subset=[NORM_INTENSITY], inplace=True) print(normalize_df.head()) return normalize_df @@ -450,52 +448,6 @@ def peptide_intensity_normalization( normalize_df = recover_df(normalize_df) return normalize_df - -def impute_peptide_intensities(dataset_df, field, class_field): - """ - Impute the missing values using different methods. - :param dataset_df: dataframe with the data - :param field: field to impute - :param class_field: field to use as class - :return: - """ - normalize_df = pd.DataFrame() - # group by condition to detect missing values - for c, g in dataset_df.groupby(CONDITION): - # pivot to have one col per sample - group_normalize_df = pd.pivot_table( - g, - index=[PEPTIDE_CANONICAL, PROTEIN_NAME, CONDITION], - columns=class_field, - values=field, - aggfunc={field: np.nanmean}, - observed=True, - ) - - # no missing values group -> only one sample - if len(group_normalize_df.columns) < 2: - group_normalize_df = group_normalize_df.reset_index() - group_normalize_df = group_normalize_df.melt( - id_vars=[PEPTIDE_CANONICAL, PROTEIN_NAME, CONDITION] - ) - group_normalize_df.rename(columns={"value": NORM_INTENSITY}, inplace=True) - normalize_df = normalize_df.append(group_normalize_df, ignore_index=True) - # else: - # # print ("nothing") - # # Impute the missing values - # # imputer = MissForest(max_iter=5) - # # imputed_data = imputer.fit_transform(group_normalize_df) - # # group_normalize_df = pd.DataFrame(imputed_data, columns=group_normalize_df.columns, - # # index=group_normalize_df.index) - # # # Melt the dataframe - # # group_normalize_df = group_normalize_df.reset_index() - # # group_normalize_df = group_normalize_df.melt(id_vars=[PEPTIDE_CANONICAL, PROTEIN_NAME, CONDITION]) - # # group_normalize_df.rename(columns={'value': NORM_INTENSITY}, inplace=True) - # # normalize_df = normalize_df.append(group_normalize_df, ignore_index=True) - - return normalize_df - - @click.command() @click.option( "-m", "--msstats", help="MsStats file import generated by quantms", default=None @@ -558,11 +510,6 @@ def impute_peptide_intensities(dataset_df, field, class_field): help="Read the input peptides file in compress gzip file", is_flag=True, ) -@click.option( - "--log2", - help="Transform to log2 the peptide intensity values before normalization", - is_flag=True, -) @click.option( "--violin", help="Use violin plot instead of boxplot for distribution representations", @@ -595,7 +542,6 @@ def peptide_normalization( nmethod: str, pnormalization: bool, compress: bool, - log2: bool, violin: bool, verbose: bool, qc_report: str, @@ -619,7 +565,7 @@ def peptide_normalization( pd.set_option("display.max_columns", None) compression_method = "gzip" if compress else None print("Loading data..") - + ''' if parquet is None: # Read the msstats file feature_df = pd.read_csv( @@ -642,196 +588,87 @@ def peptide_normalization( del feature_df, sdrf_df gc.collect() else: - dataset_df = pd.read_parquet(parquet,columns=PARQUET_COLUMNS) - label, sample_names, choice = analyse_feature_df(dataset_df) - dataset_df = parquet_common_process(dataset_df, label, choice) - - dataset_df = data_common_process(dataset_df, min_aa) - # Only proteins with unique peptides number greater than min_unique (default: 2) are 
retained - unique_peptides = set( - dataset_df.groupby(PEPTIDE_CANONICAL) - .filter(lambda x: len(set(x[PROTEIN_NAME])) == 1)[PEPTIDE_CANONICAL] - .tolist() - ) - strong_proteins = set( - dataset_df[dataset_df[PEPTIDE_CANONICAL].isin(unique_peptides)] - .groupby(PROTEIN_NAME) - .filter(lambda x: len(set(x[PEPTIDE_CANONICAL])) >= min_unique)[ - PROTEIN_NAME - ] - .tolist() - ) - dataset_df = dataset_df[dataset_df[PROTEIN_NAME].isin(strong_proteins)] - - print(f"Number of unique peptides: {len(unique_peptides)}") - print(f"Number of strong proteins: {len(strong_proteins)}") - - print("Logarithmic if specified..") - dataset_df = dataset_df.rename(columns={INTENSITY: NORM_INTENSITY}) - if log2: - dataset_df[NORM_INTENSITY] = np.log2(dataset_df[NORM_INTENSITY]) - - # Print the distribution of the original peptide intensities from quantms analysis - """ - if verbose: - sample_names = set(dataset_df[SAMPLE_ID]) - plot_width = len(sample_names) * 0.5 + 10 - pdf = PdfPages(qc_report) - density = plot_distributions( - dataset_df, - NORM_INTENSITY, - SAMPLE_ID, - log2=not log2, - width=plot_width, - title="Original peptidoform intensity distribution (no normalization)", - ) - #plt.show() - pdf.savefig(density) - box = plot_box_plot( - dataset_df, - NORM_INTENSITY, - SAMPLE_ID, - log2=not log2, - width=plot_width, - title="Original peptidoform intensity distribution (no normalization)", - violin=violin, - ) - plt.show() - pdf.savefig(box) - """ - - # Remove high abundant and contaminants proteins and the outliers - if remove_ids is not None: - print("Remove proteins from file...") - dataset_df = remove_protein_by_ids(dataset_df, remove_ids) - if remove_decoy_contaminants: - print("Remove decoy and contaminants...") - dataset_df = remove_contaminants_entrapments_decoys(dataset_df) - - print_dataset_size(dataset_df, "Peptides after contaminants removal: ", verbose) - print("Normalize intensities.. 
") - # dataset_df = dataset_df.dropna(how="any") - if not skip_normalization: - dataset_df = intensity_normalization( - dataset_df, - field=NORM_INTENSITY, - class_field=SAMPLE_ID, - scaling_method=nmethod, - ) - if verbose: - """ - density = plot_distributions( - dataset_df, - NORM_INTENSITY, - SAMPLE_ID, - #log2=log_after_norm, - width=plot_width, - title="Peptidoform intensity distribution after normalization, method: " - + nmethod, - ) - #plt.show() - pdf.savefig(density) - box = plot_box_plot( - dataset_df, - NORM_INTENSITY, - SAMPLE_ID, - log2=log_after_norm, - width=plot_width, - title="Peptidoform intensity distribution after normalization, method: " - + nmethod, - violin=violin, - ) - plt.show() - pdf.savefig(box) - """ - print("Number of peptides after normalization: " + str(len(dataset_df.index))) - print("Select the best peptidoform across fractions...") - dataset_df = get_peptidoform_normalize_intensities(dataset_df) - print( - "Number of peptides after peptidofrom selection: " - + str(len(dataset_df.index)) - ) + ''' + F = Feature(parquet) + #dataset_df = pd.read_parquet(parquet,columns=PARQUET_COLUMNS) + header= False + for samples,df in F.iter_samples(): + for sample in samples: + dataset_df = df[df['sample_accession']==sample].copy() + dataset_df = dataset_df[PARQUET_COLUMNS] + label, sample_names, choice = analyse_feature_df(dataset_df) + dataset_df = parquet_common_process(dataset_df, label, choice) + dataset_df = data_common_process(dataset_df, min_aa) + # Only proteins with unique peptides number greater than min_unique (default: 2) are retained + unique_peptides = set( + dataset_df.groupby(PEPTIDE_CANONICAL) + .filter(lambda x: len(set(x[PROTEIN_NAME])) == 1)[PEPTIDE_CANONICAL] + .tolist() + ) + strong_proteins = set( + dataset_df[dataset_df[PEPTIDE_CANONICAL].isin(unique_peptides)] + .groupby(PROTEIN_NAME) + .filter(lambda x: len(set(x[PEPTIDE_CANONICAL])) >= min_unique)[ + PROTEIN_NAME + ] + .tolist() + ) + dataset_df = dataset_df[dataset_df[PROTEIN_NAME].isin(strong_proteins)] + + print(f"Number of unique peptides: {len(unique_peptides)}") + print(f"Number of strong proteins: {len(strong_proteins)}") + + print("Logarithmic if specified..") + dataset_df.rename(columns={INTENSITY: NORM_INTENSITY},inplace=True) + """ + if log2: + dataset_df[NORM_INTENSITY] = np.log2(dataset_df[NORM_INTENSITY]) + """ + + # Remove high abundant and contaminants proteins and the outliers + if remove_ids is not None: + print("Remove proteins from file...") + dataset_df = remove_protein_by_ids(dataset_df, remove_ids) + if remove_decoy_contaminants: + print("Remove decoy and contaminants...") + dataset_df = remove_contaminants_entrapments_decoys(dataset_df) + + print_dataset_size(dataset_df, "Peptides after contaminants removal: ", verbose) + print("Normalize intensities.. 
") + dataset_df.loc[:,NORM_INTENSITY] = np.log(dataset_df[NORM_INTENSITY]) + dataset_df = normalize_run(dataset_df,sdrf,nmethod) + dataset_df.loc[:,NORM_INTENSITY] = normalize(dataset_df[NORM_INTENSITY],'max_min') + print("Number of peptides after normalization: " + str(len(dataset_df.index))) + print("Select the best peptidoform across fractions...") + dataset_df = get_peptidoform_normalize_intensities(dataset_df) + print( + "Number of peptides after peptidofrom selection: " + + str(len(dataset_df.index)) + ) - print("Sum all peptidoforms per Sample...") - dataset_df = sum_peptidoform_intensities(dataset_df) - print("Number of peptides after selection: " + str(len(dataset_df.index))) - - print("Average all peptidoforms per Peptide/Sample...") - dataset_df = average_peptide_intensities(dataset_df) - print("Number of peptides after average: " + str(len(dataset_df.index))) - """ - if verbose: - density = plot_distributions( - dataset_df, - NORM_INTENSITY, - SAMPLE_ID, - log2=log_after_norm, - width=plot_width, - title="Peptide intensity distribution method: " + nmethod, - ) - plt.show() - pdf.savefig(density) - box = plot_box_plot( - dataset_df, - NORM_INTENSITY, - SAMPLE_ID, - log2=log_after_norm, - width=plot_width, - title="Peptide intensity distribution method: " + nmethod, - violin=violin, - ) - plt.show() - pdf.savefig(box) - """ - - if remove_low_frequency_peptides and len(sample_names) > 1: - print(dataset_df) - dataset_df = remove_low_frequency_peptides_(dataset_df, 0.20) - print_dataset_size( - dataset_df, "Peptides after remove low frequency peptides: ", verbose - ) - # Perform imputation using Random Forest in Peptide Intensities - # TODO: Check if this is necessary (Probably we can do some research if imputation at peptide level is necessary - # if impute: - # dataset_df = impute_peptide_intensities(dataset_df, field=NORM_INTENSITY, class_field=SAMPLE_ID) - - if pnormalization: - print("Normalize at Peptide level...") - dataset_df = peptide_intensity_normalization( - dataset_df, - field=NORM_INTENSITY, - class_field=SAMPLE_ID, - scaling_method=nmethod, - ) - """ - if verbose: - density = plot_distributions( - dataset_df, - NORM_INTENSITY, - SAMPLE_ID, - log2=log_after_norm, - width=plot_width, - title="Normalization at peptide level method: " + nmethod, - ) - plt.show() - pdf.savefig(density) - box = plot_box_plot( - dataset_df, - NORM_INTENSITY, - SAMPLE_ID, - log2=log_after_norm, - width=plot_width, - title="Normalization at peptide level method: " + nmethod, - violin=violin, - ) - plt.show() - pdf.savefig(box) - pdf.close() - """ - - print("Save the normalized peptide intensities...") - dataset_df.to_csv(output, index=False, sep=",") - + print("Sum all peptidoforms per Sample...") + dataset_df = sum_peptidoform_intensities(dataset_df) + print("Number of peptides after selection: " + str(len(dataset_df.index))) + + print("Average all peptidoforms per Peptide/Sample...") + dataset_df = average_peptide_intensities(dataset_df) + print("Number of peptides after average: " + str(len(dataset_df.index))) + + if remove_low_frequency_peptides and len(sample_names) > 1: + print(dataset_df) + dataset_df = remove_low_frequency_peptides_(dataset_df, 0.20) + print_dataset_size( + dataset_df, "Peptides after remove low frequency peptides: ", verbose + ) + + + print("Save the normalized peptide intensities...") + if header: + dataset_df.to_csv(output, index=False,header=False,mode='a+') + else: + dataset_df.to_csv(output, index=False) + header = True + if __name__ == "__main__": diff --git 
a/build/lib/bin/normalize_methods.py b/build/lib/bin/normalize_methods.py index fb3bd0e..1a8bd76 100644 --- a/build/lib/bin/normalize_methods.py +++ b/build/lib/bin/normalize_methods.py @@ -1,9 +1,71 @@ import numpy as np import pandas as pd -from sklearn.preprocessing import robust_scale -from sklearn.preprocessing import power_transform from sklearn.preprocessing import quantile_transform +def normalize_run(df,sdrf_path,method): + reps = get_replicate(sdrf_path) + if(reps>1): + samples = df['SampleID'].unique() + for sample in samples: + runs = df.loc[df['SampleID']==sample,'Run'].unique().tolist() + if(len(runs)>1): + sample_df = df.loc[df['SampleID']==sample,:] + map,base = get_normalize_args(sample_df,runs,method) + for run in runs: + run = str(run) + run_intensity = df.loc[(df['SampleID']==sample)&(df['Run']==run),'NormIntensity'] + df.loc[(df['SampleID']==sample)&(df['Run']==run),'NormIntensity'] = run_intensity / (map[run] / base) + return df + else: + return df + +def get_replicate(sdrf_path): + sdrf = pd.read_csv(sdrf_path,sep="\t") + reps = len(sdrf["comment[technical replicate]"].unique()) + return reps + +def get_normalize_args(df,runs,method): + match method: + case 'mean': + return normalize_mean(df,runs) + case 'median': + return normalize_median(df,runs) + case 'quantile': + return normalize_q(df,runs) + +def normalize_mean(df,runs): + map = {} + total = 0 + for run in runs: + run = str(run) + run_m = df.loc[df['Run']==run,'NormIntensity'].mean() + map[run] = run_m + total += run_m + avg = total / len(runs) + return map,avg + +def normalize_median(df,runs): + map = {} + total = 0 + for run in runs: + run = str(run) + run_m = df.loc[df['Run']==run,'NormIntensity'].median() + map[run] = run_m + total += run_m + med = total / len(runs) + return map,med + +def normalize_q(df,runs): + map = {} + total = 0 + for run in runs: + run = str(run) + run_m = df.loc[df['Run']==run,'NormIntensity'].quantile([0.75,0.25],interpolation='linear').mean() + map[run] = run_m + total += run_m + q = total / len(runs) + return map,q + def normalize(df,method): match method: case 'mean': @@ -16,14 +78,6 @@ def normalize(df,method): return global_normalize(df) case 'max_min': return max_min_mormalize(df) - case 'z_score': - return z_score_normalize(df) - case 'iqr': - return iqr_normalize(df) - case 'robust': - return robust_normalize(df) - case 'vsn': - return vsn_normalize(df) case 'quantile': return quantile_normalize(df) case _: @@ -31,55 +85,29 @@ def normalize(df,method): # mean def mean_normalize(df): - return df / df.mean(axis=0) + return df / df.mean() # median def median_normalize(df): - return df / df.median(axis=0) + return df / df.median() #max def max_normalize(df): - return df / df.max(axis=0) + return df / df.max() #global def global_normalize(df): - return df / df.sum(axis=0) + return df / df.sum() #max-min def max_min_mormalize(df): - min = df.min(axis=0) - return (df - min) / (df.max(axis=0) - min) - -#z-score -def z_score_normalize(df): - return (df - df.mean(axis=0)) / df.var(axis=0) - -#IQR -def iqr_normalize(df): - Q = df.quantile([0.75,0.25],interpolation='linear',axis=0) - IQR = Q.loc[0.75,:] - Q.loc[0.25,:] - return (df - df.median(axis=0)) / IQR - -#rubust -def robust_normalize(df): - index = df.index - columns = df.columns - df = robust_scale(df, axis=0) - df = pd.DataFrame(df,columns=columns,index=index) - return df - -#vsn -def vsn_normalize(df): - index = df.index - columns = df.columns - df = power_transform(df, method='box-cox') - df = 
pd.DataFrame(df,columns=columns,index=index)
-    return df
+    min = df.min()
+    return (df - min) / (df.max() - min)
 
 #quantile
 def quantile_normalize(df):
     index = df.index
     columns = df.columns
-    DF = quantile_transform(df,axis=0)
+    df = quantile_transform(df)
     df = pd.DataFrame(df,columns=columns,index=index)
     return df
\ No newline at end of file
diff --git a/build/lib/bin/peptide_normalization.py b/build/lib/bin/peptide_normalization.py
index 28c10cb..0801333 100644
--- a/build/lib/bin/peptide_normalization.py
+++ b/build/lib/bin/peptide_normalization.py
@@ -11,7 +11,8 @@
 from matplotlib.backends.backend_pdf import PdfPages
 from pandas import DataFrame
 import pyarrow.parquet as pq
-from normalize_methods import normalize
+from parquet import Feature
+from normalize_methods import normalize,normalize_run
 
 from ibaq.ibaqpy_commons import (
     BIOREPLICATE,
@@ -71,6 +72,7 @@ def recover_df(df):
             sample:NORM_INTENSITY
         },inplace=True)
         out = pd.concat([out,samples_df])
+    out.reset_index(inplace=True,drop=True)
     return out
 
 def analyse_sdrf(sdrf_path: str, compression: bool) -> tuple:
@@ -280,7 +282,7 @@ def map_canonical_seq(data_df: pd.DataFrame) -> (pd.DataFrame, dict):
         data_df.apply(lambda x: len(x[PEPTIDE_CANONICAL]) >= min_aa, axis=1)
     ]
     data_df[PROTEIN_NAME] = data_df[PROTEIN_NAME].apply(parse_uniprot_accession)
-    data_df[STUDY_ID] = data_df[SAMPLE_ID].apply(get_study_accession)
+    #data_df[STUDY_ID] = data_df[SAMPLE_ID].apply(get_study_accession)
     if FRACTION not in data_df.columns:
         data_df[FRACTION] = 1
     data_df = data_df[
@@ -290,19 +292,16 @@ def map_canonical_seq(data_df: pd.DataFrame) -> (pd.DataFrame, dict):
             PEPTIDE_CANONICAL,
             PEPTIDE_CHARGE,
             INTENSITY,
-            REFERENCE,
             CONDITION,
             RUN,
             BIOREPLICATE,
             FRACTION,
-            FRAGMENT_ION,
             ISOTOPE_LABEL_TYPE,
-            STUDY_ID,
             SAMPLE_ID,
         ]
     ]
     data_df[CONDITION] = pd.Categorical(data_df[CONDITION])
-    data_df[STUDY_ID] = pd.Categorical(data_df[STUDY_ID])
+    #data_df[STUDY_ID] = pd.Categorical(data_df[STUDY_ID])
     data_df[SAMPLE_ID] = pd.Categorical(data_df[SAMPLE_ID])
     return data_df
 
@@ -389,7 +388,8 @@ def remove_low_frequency_peptides_(
     :param percentage_samples: percentage of samples
     :return:
     """
-
+    #= dataset_df[[SAMPLE_ID, CONDITION]].drop_duplicates(subset=[SAMPLE_ID])
+    c_map = dataset_df[[SAMPLE_ID, CONDITION]].drop_duplicates(subset=[SAMPLE_ID]).set_index(SAMPLE_ID).to_dict()[CONDITION]
     normalize_df = pd.pivot_table(
         dataset_df,
         index=[PEPTIDE_CANONICAL, PROTEIN_NAME],
@@ -398,6 +398,8 @@
         aggfunc={NORM_INTENSITY: np.nanmean},
         observed=True,
     )
+    del dataset_df
+    gc.collect()
     # Count the number of null values in each row
     null_count = normalize_df.isnull().sum(axis=1)
     # Find the rows that have null values above the threshold
@@ -411,21 +413,12 @@ def remove_low_frequency_peptides_(
     normalize_df = normalize_df[
         normalize_df.notnull().sum(axis=1) != 1
     ]
-    """
-    normalize_df = normalize_df.reset_index()
-    normalize_df = normalize_df.melt(id_vars=[PEPTIDE_CANONICAL, PROTEIN_NAME])
-    normalize_df.rename(columns={"value": NORM_INTENSITY}, inplace=True)
-    """
     normalize_df = recover_df(normalize_df)
     # recover condition column
-    normalize_df = normalize_df.merge(
-        dataset_df[[SAMPLE_ID, CONDITION]].drop_duplicates(subset=[SAMPLE_ID]),
-        on=SAMPLE_ID,
-        how="left",
-    )
+    normalize_df.loc[:,CONDITION] = normalize_df[SAMPLE_ID].map(c_map)
     # Remove rows with null values in NORMALIZE_INTENSITY
-    normalize_df = normalize_df[normalize_df[NORM_INTENSITY].notna()]
+    normalize_df.dropna(subset=[NORM_INTENSITY], inplace=True)
     print(normalize_df.head())
return normalize_df @@ -453,62 +446,8 @@ def peptide_intensity_normalization( ) # need nomalize? normalize_df = recover_df(normalize_df) - """ - normalize_df = normalize_df.reset_index() - normalize_df = normalize_df.melt( - id_vars=[PEPTIDE_CANONICAL, PROTEIN_NAME, CONDITION] - ) - normalize_df.rename(columns={"value": NORM_INTENSITY}, inplace=True) - normalize_df = normalize_df[normalize_df[NORM_INTENSITY].notna()] - """ - return normalize_df - - -def impute_peptide_intensities(dataset_df, field, class_field): - """ - Impute the missing values using different methods. - :param dataset_df: dataframe with the data - :param field: field to impute - :param class_field: field to use as class - :return: - """ - normalize_df = pd.DataFrame() - # group by condition to detect missing values - for c, g in dataset_df.groupby(CONDITION): - # pivot to have one col per sample - group_normalize_df = pd.pivot_table( - g, - index=[PEPTIDE_CANONICAL, PROTEIN_NAME, CONDITION], - columns=class_field, - values=field, - aggfunc={field: np.nanmean}, - observed=True, - ) - - # no missing values group -> only one sample - if len(group_normalize_df.columns) < 2: - group_normalize_df = group_normalize_df.reset_index() - group_normalize_df = group_normalize_df.melt( - id_vars=[PEPTIDE_CANONICAL, PROTEIN_NAME, CONDITION] - ) - group_normalize_df.rename(columns={"value": NORM_INTENSITY}, inplace=True) - normalize_df = normalize_df.append(group_normalize_df, ignore_index=True) - # else: - # # print ("nothing") - # # Impute the missing values - # # imputer = MissForest(max_iter=5) - # # imputed_data = imputer.fit_transform(group_normalize_df) - # # group_normalize_df = pd.DataFrame(imputed_data, columns=group_normalize_df.columns, - # # index=group_normalize_df.index) - # # # Melt the dataframe - # # group_normalize_df = group_normalize_df.reset_index() - # # group_normalize_df = group_normalize_df.melt(id_vars=[PEPTIDE_CANONICAL, PROTEIN_NAME, CONDITION]) - # # group_normalize_df.rename(columns={'value': NORM_INTENSITY}, inplace=True) - # # normalize_df = normalize_df.append(group_normalize_df, ignore_index=True) - return normalize_df - @click.command() @click.option( "-m", "--msstats", help="MsStats file import generated by quantms", default=None @@ -632,582 +571,119 @@ def peptide_normalization( pd.set_option("display.max_columns", None) compression_method = "gzip" if compress else None print("Loading data..") + ''' + if parquet is None: + # Read the msstats file + feature_df = pd.read_csv( + msstats, + sep=",", + compression=compression_method, + dtype={CONDITION: "category", ISOTOPE_LABEL_TYPE: "category"}, + ) - if not stream: - if parquet is None: - # Read the msstats file - feature_df = pd.read_csv( - msstats, - sep=",", - compression=compression_method, - dtype={CONDITION: "category", ISOTOPE_LABEL_TYPE: "category"}, - ) - - # Read the sdrf file - sdrf_df, label, sample_names, choice = analyse_sdrf( - sdrf, compression_method - ) - print(sdrf_df) - - # Merged the SDRF with the Resulted file - dataset_df = msstats_common_process(feature_df) - dataset_df = merge_sdrf(label, sdrf_df, feature_df) - # Remove the intermediate variables and free the memory - del feature_df, sdrf_df - gc.collect() - else: - dataset_df = pd.read_parquet(parquet)[PARQUET_COLUMNS] + # Read the sdrf file + sdrf_df, label, sample_names, choice = analyse_sdrf( + sdrf, compression_method + ) + print(sdrf_df) + + # Merged the SDRF with the Resulted file + dataset_df = msstats_common_process(feature_df) + dataset_df = merge_sdrf(label, 
sdrf_df, feature_df) + # Remove the intermediate variables and free the memory + del feature_df, sdrf_df + gc.collect() + else: + ''' + F = Feature(parquet) + #dataset_df = pd.read_parquet(parquet,columns=PARQUET_COLUMNS) + header= False + for samples,df in F.iter_samples(): + for sample in samples: + dataset_df = df[df['sample_accession']==sample].copy() + dataset_df = dataset_df[PARQUET_COLUMNS] label, sample_names, choice = analyse_feature_df(dataset_df) dataset_df = parquet_common_process(dataset_df, label, choice) - - dataset_df = data_common_process(dataset_df, min_aa) - # Only proteins with unique peptides number greater than min_unique (default: 2) are retained - unique_peptides = set( - dataset_df.groupby(PEPTIDE_CANONICAL) - .filter(lambda x: len(set(x[PROTEIN_NAME])) == 1)[PEPTIDE_CANONICAL] - .tolist() - ) - strong_proteins = set( - dataset_df[dataset_df[PEPTIDE_CANONICAL].isin(unique_peptides)] - .groupby(PROTEIN_NAME) - .filter(lambda x: len(set(x[PEPTIDE_CANONICAL])) >= min_unique)[ - PROTEIN_NAME - ] - .tolist() - ) - dataset_df = dataset_df[dataset_df[PROTEIN_NAME].isin(strong_proteins)] - - print(f"Number of unique peptides: {len(unique_peptides)}") - print(f"Number of strong proteins: {len(strong_proteins)}") - - print("Logarithmic if specified..") - dataset_df = dataset_df.rename(columns={INTENSITY: NORM_INTENSITY}) - if log2: - dataset_df[NORM_INTENSITY] = np.log2(dataset_df[NORM_INTENSITY]) - - # Print the distribution of the original peptide intensities from quantms analysis - if verbose: - sample_names = set(dataset_df[SAMPLE_ID]) - plot_width = len(sample_names) * 0.5 + 10 - pdf = PdfPages(qc_report) - density = plot_distributions( - dataset_df, - NORM_INTENSITY, - SAMPLE_ID, - log2=not log2, - width=plot_width, - title="Original peptidoform intensity distribution (no normalization)", + dataset_df = data_common_process(dataset_df, min_aa) + # Only proteins with unique peptides number greater than min_unique (default: 2) are retained + unique_peptides = set( + dataset_df.groupby(PEPTIDE_CANONICAL) + .filter(lambda x: len(set(x[PROTEIN_NAME])) == 1)[PEPTIDE_CANONICAL] + .tolist() ) - #plt.show() - pdf.savefig(density) - """ - box = plot_box_plot( - dataset_df, - NORM_INTENSITY, - SAMPLE_ID, - log2=not log2, - width=plot_width, - title="Original peptidoform intensity distribution (no normalization)", - violin=violin, + strong_proteins = set( + dataset_df[dataset_df[PEPTIDE_CANONICAL].isin(unique_peptides)] + .groupby(PROTEIN_NAME) + .filter(lambda x: len(set(x[PEPTIDE_CANONICAL])) >= min_unique)[ + PROTEIN_NAME + ] + .tolist() ) - plt.show() - pdf.savefig(box) - """ + dataset_df = dataset_df[dataset_df[PROTEIN_NAME].isin(strong_proteins)] - # Remove high abundant and contaminants proteins and the outliers - if remove_ids is not None: - print("Remove proteins from file...") - dataset_df = remove_protein_by_ids(dataset_df, remove_ids) - if remove_decoy_contaminants: - print("Remove decoy and contaminants...") - dataset_df = remove_contaminants_entrapments_decoys(dataset_df) - - print_dataset_size(dataset_df, "Peptides after contaminants removal: ", verbose) - print("Normalize intensities.. 
") - # dataset_df = dataset_df.dropna(how="any") - if not skip_normalization: - dataset_df = intensity_normalization( - dataset_df, - field=NORM_INTENSITY, - class_field=SAMPLE_ID, - scaling_method=nmethod, - ) - if verbose: - density = plot_distributions( - dataset_df, - NORM_INTENSITY, - SAMPLE_ID, - #log2=log_after_norm, - width=plot_width, - title="Peptidoform intensity distribution after normalization, method: " - + nmethod, - ) - #plt.show() - pdf.savefig(density) - """ - box = plot_box_plot( - dataset_df, - NORM_INTENSITY, - SAMPLE_ID, - log2=log_after_norm, - width=plot_width, - title="Peptidoform intensity distribution after normalization, method: " - + nmethod, - violin=violin, - ) - plt.show() - pdf.savefig(box) - """ - print("Number of peptides after normalization: " + str(len(dataset_df.index))) - print("Select the best peptidoform across fractions...") - dataset_df = get_peptidoform_normalize_intensities(dataset_df) - print( - "Number of peptides after peptidofrom selection: " - + str(len(dataset_df.index)) - ) + print(f"Number of unique peptides: {len(unique_peptides)}") + print(f"Number of strong proteins: {len(strong_proteins)}") - print("Sum all peptidoforms per Sample...") - dataset_df = sum_peptidoform_intensities(dataset_df) - print("Number of peptides after selection: " + str(len(dataset_df.index))) - - print("Average all peptidoforms per Peptide/Sample...") - dataset_df = average_peptide_intensities(dataset_df) - print("Number of peptides after average: " + str(len(dataset_df.index))) - if verbose: - density = plot_distributions( - dataset_df, - NORM_INTENSITY, - SAMPLE_ID, - log2=log_after_norm, - width=plot_width, - title="Peptide intensity distribution method: " + nmethod, - ) - plt.show() - pdf.savefig(density) - box = plot_box_plot( - dataset_df, - NORM_INTENSITY, - SAMPLE_ID, - log2=log_after_norm, - width=plot_width, - title="Peptide intensity distribution method: " + nmethod, - violin=violin, - ) - plt.show() - pdf.savefig(box) - - if remove_low_frequency_peptides and len(sample_names) > 1: - print(dataset_df) - dataset_df = remove_low_frequency_peptides_(dataset_df, 0.20) - print_dataset_size( - dataset_df, "Peptides after remove low frequency peptides: ", verbose - ) - # Perform imputation using Random Forest in Peptide Intensities - # TODO: Check if this is necessary (Probably we can do some research if imputation at peptide level is necessary - # if impute: - # dataset_df = impute_peptide_intensities(dataset_df, field=NORM_INTENSITY, class_field=SAMPLE_ID) - - if pnormalization: - print("Normalize at Peptide level...") - dataset_df = peptide_intensity_normalization( - dataset_df, - field=NORM_INTENSITY, - class_field=SAMPLE_ID, - scaling_method=nmethod, - ) - - if verbose: - density = plot_distributions( - dataset_df, - NORM_INTENSITY, - SAMPLE_ID, - log2=log_after_norm, - width=plot_width, - title="Normalization at peptide level method: " + nmethod, - ) - plt.show() - pdf.savefig(density) - box = plot_box_plot( - dataset_df, - NORM_INTENSITY, - SAMPLE_ID, - log2=log_after_norm, - width=plot_width, - title="Normalization at peptide level method: " + nmethod, - violin=violin, - ) - plt.show() - pdf.savefig(box) - pdf.close() + print("Logarithmic if specified..") + dataset_df.rename(columns={INTENSITY: NORM_INTENSITY},inplace=True) + """ + if log2: + dataset_df[NORM_INTENSITY] = np.log2(dataset_df[NORM_INTENSITY]) + """ - print("Save the normalized peptide intensities...") - dataset_df.to_csv(output, index=False, sep=",") - else: - if parquet is None: - 
sdrf_df, label, sample_names, choice = analyse_sdrf( - sdrf, compression_method - ) - msstats_chunks = pd.read_csv( - msstats, - sep=",", - compression=compression_method, - dtype={CONDITION: "category", ISOTOPE_LABEL_TYPE: "category"}, - chunksize=chunksize, - ) - else: - label, sample_names, choice = analyse_feature_parquet( - parquet, batch_size=chunksize - ) - msstats_chunks = read_large_parquet(parquet, batch_size=chunksize) - sample_number = len(sample_names) - - # TODO: Stream processing to obtain strong proteins with more than 2 uniqe peptides - temp = f"Temp-{str(uuid.uuid4())}/" - os.mkdir(temp) - print(f"INFO: Writing files into {temp}...") - unique_peptides = {} - group_intensities = {} - quantile = {} - print("INFO: First iteration to get unique peptides and strong proteins...") - for msstats_df in msstats_chunks: - if parquet is None: - msstats_df = msstats_common_process(msstats_df) - msstats_df = merge_sdrf(label, sdrf_df, msstats_df) - else: - msstats_df = parquet_common_process(msstats_df, label, choice) - result_df = data_common_process(msstats_df, min_aa) - - # Write CSVs by Sample ID - for sample in sample_names: - file_name = f"{temp}/{sample}.csv" - write_mode = "a" if os.path.exists(file_name) else "w" - header = False if os.path.exists(file_name) else True - result_df[result_df[SAMPLE_ID] == sample].to_csv( - file_name, index=False, header=header, mode=write_mode - ) - unique_df = result_df.groupby([PEPTIDE_CANONICAL]).filter( - lambda x: len(set(x[PROTEIN_NAME])) == 1 - )[[PEPTIDE_CANONICAL, PROTEIN_NAME]] - unique_dict = dict( - zip(unique_df[PEPTIDE_CANONICAL], unique_df[PROTEIN_NAME]) - ) - for i in unique_dict.keys(): - if i in unique_peptides.keys() and unique_dict[i] != unique_peptides[i]: - unique_peptides.pop(i) - else: - unique_peptides[i] = unique_dict[i] - - proteins_list = list(unique_peptides.values()) - count_dict = { - element: proteins_list.count(element) for element in set(proteins_list) - } - strong_proteins = [ - element for element in count_dict if count_dict[element] >= min_unique - ] - del proteins_list, count_dict - print(f"Number of unique peptides: {len(list(unique_peptides.keys()))}") - print(f"Number of strong proteins: {len(strong_proteins)}") - - # TODO: Filter proteins with less unique peptides than min_unique (default: 2) - plot_samples = random.sample(sample_names, min(len(sample_names), 20)) - plot_width = 10 + len(plot_samples) * 0.5 - pdf = PdfPages(qc_report) - original_intensities_df = pd.DataFrame() - - print("INFO: Second iteration to filter data and prepare normalization...") - print("Logarithmic if specified..") - norm_record = [0] * 2 - for sample in sample_names: - msstats_df = pd.read_csv(f"{temp}/{sample}.csv", sep=",") - msstats_df = msstats_df[msstats_df[PROTEIN_NAME].isin(strong_proteins)] # Remove high abundant and contaminants proteins and the outliers if remove_ids is not None: - msstats_df = remove_protein_by_ids(msstats_df, remove_ids) + print("Remove proteins from file...") + dataset_df = remove_protein_by_ids(dataset_df, remove_ids) if remove_decoy_contaminants: - msstats_df = remove_contaminants_entrapments_decoys(msstats_df) - norm_record[0] += len(msstats_df) - msstats_df = msstats_df.rename(columns={INTENSITY: NORM_INTENSITY}) - if log2: - msstats_df[NORM_INTENSITY] = np.log2(msstats_df[NORM_INTENSITY]) - if sample in plot_samples: - original_intensities_df = pd.concat( - [original_intensities_df, msstats_df] - ) + print("Remove decoy and contaminants...") + dataset_df = 
remove_contaminants_entrapments_decoys(dataset_df) + + print_dataset_size(dataset_df, "Peptides after contaminants removal: ", verbose) + print("Normalize intensities.. ") + dataset_df.loc[:,NORM_INTENSITY] = np.log(dataset_df[NORM_INTENSITY]) + dataset_df.loc[:,NORM_INTENSITY] = normalize(dataset_df[NORM_INTENSITY],'max_min') + dataset_df = normalize_run(dataset_df,sdrf,nmethod) + """ if not skip_normalization: - if nmethod == "msstats": - if label in ["TMT", "ITRAQ"]: - g = msstats_df.groupby(["Run", "Channel"]) - else: - g = msstats_df.groupby(["Run", "Fraction"]) - for name, group in g: - group_intensity = group[NORM_INTENSITY].tolist() - if name not in group_intensities: - group_intensities[name] = group_intensity - else: - group_intensities.update( - { - name: group_intensities[NORM_INTENSITY] - + group_intensity - } - ) - elif nmethod == "quantile": - msstats_df = ( - msstats_df.groupby( - [ - PEPTIDE_SEQUENCE, - PEPTIDE_CANONICAL, - PEPTIDE_CHARGE, - FRACTION, - RUN, - BIOREPLICATE, - PROTEIN_NAME, - STUDY_ID, - CONDITION, - ] - )[NORM_INTENSITY] - .agg(np.nanmean) - .reset_index() - ) - rank = msstats_df[NORM_INTENSITY].rank(method="average") - dic = dict(zip(rank, msstats_df[NORM_INTENSITY])) - if len(quantile) == 0: - quantile = {k: (v, 1) for k, v in dic.items()} - else: - # update = min(len(quantile), len(dic)) - intersec = set(quantile.keys()) & set(dic.keys()) - update = set(dic.keys()) - set(quantile.keys()) - quantile.update( - { - i: (quantile[i][0] + dic[i], quantile[i][1] + 1) - for i in intersec - } - ) - if len(update) > 0: - quantile.update({k: (dic[k], 1) for k in update}) - msstats_df[SAMPLE_ID] = sample - else: - exit("Stream process only supports msstats and quantile methods!") - msstats_df.to_csv(f"{temp}/{sample}.csv", index=False, sep=",") - norm_record[1] += len(msstats_df) - if not skip_normalization and nmethod == "quantile": - quantile = {k: v[0] / v[1] for k, v in quantile.items()} - print(f"Peptides after contaminants removal: {norm_record[0]}") - print(f"Number of peptides after normalization: {norm_record[1]}") - # Save original intensities QC plots - original_intensities_df = original_intensities_df.reset_index(drop=True) - density = plot_distributions( - original_intensities_df, - NORM_INTENSITY, - SAMPLE_ID, - log2=not log2, - width=plot_width, - title="Original peptidoform intensity distribution (no normalization)", - ) - pdf.savefig(density) - box = plot_box_plot( - original_intensities_df, - NORM_INTENSITY, - SAMPLE_ID, - log2=not log2, - width=plot_width, - title="Original peptidoform intensity distribution (no normalization)", - violin=violin, - ) - plt.show() - pdf.savefig(box) - del original_intensities_df - - # TODO: Peptide intensity normalization - peptides_count = pd.DataFrame( - columns=[PROTEIN_NAME, PEPTIDE_CANONICAL, "count"] - ) - norm_intensities_df = pd.DataFrame() - if not skip_normalization and nmethod == "msstats": - # For ISO normalization - if label in ["TMT", "ITRAQ"]: - median_baseline = np.nanmedian( - list(set(sum(group_intensities.values(), []))) - ) - group_intensities = { - key: np.nanmedian(list(values)) - for key, values in group_intensities.items() - } - else: - fractions = [i[1] for i in group_intensities.keys()] - fraction_median = {} - for fraction in fractions: - fraction_keys = [ - i for i in group_intensities.keys() if i[1] == fraction - ] - fraction_intensities = [] - for key in fraction_keys: - fraction_intensities.extend(group_intensities[key]) - fraction_median[fraction] = np.nanmedian(fraction_intensities) 
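For context, the `-` block above removes the old stream-mode "msstats" normalization, which centered every (Run, Channel) or (Run, Fraction) group on a shared median before comparing samples. A minimal, self-contained sketch of that median-centering idea for the label-free case follows; `median_center` is an illustrative helper written for this note, not code from the patch, and it assumes log-scale intensities in a `NormIntensity` column.

import pandas as pd

def median_center(df: pd.DataFrame, value: str = "NormIntensity") -> pd.DataFrame:
    # Subtract each (Run, Fraction) group's median, then add back the
    # fraction-wide median so fractions remain comparable across runs.
    run_median = df.groupby(["Run", "Fraction"])[value].transform("median")
    fraction_median = df.groupby("Fraction")[value].transform("median")
    centered = df.copy()
    centered[value] = df[value] - run_median + fraction_median
    return centered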
- group_intensities = { - key: np.nanmedian(values) - for key, values in group_intensities.items() - } - print("INFO: Third iteration to normalize and counting peptides frequency...") - size_record = [0] * 3 - - def normalization( - dataset_df, label, sample, skip_normalization, nmethod, record - ): - if not skip_normalization: - field = NORM_INTENSITY - if nmethod == "msstats": - # For ISO normalization - if label in ["TMT", "ITRAQ"]: - dataset_df.loc[:, NORM_INTENSITY] = dataset_df.apply( - lambda x: x[field] - - group_intensities[(x["Run"], x["Channel"])] - + median_baseline, - axis=1, - ) - else: - dataset_df.loc[:, NORM_INTENSITY] = dataset_df.apply( - lambda x: x[field] - - group_intensities[(x["Run"], x["Fraction"])] - + np.nanmedian( - [ - group_intensities[i] - for i in group_intensities.keys() - if i[1] == x["Fraction"] - ] - ), - axis=1, - ) - elif nmethod == "quantile": - rank = dataset_df[NORM_INTENSITY].rank(method="average") - ref_dict = dict(zip(rank, dataset_df[NORM_INTENSITY])) - ref_dict = {v: quantile[k] for k, v in ref_dict.items()} - dataset_df.loc[:, NORM_INTENSITY] = dataset_df.apply( - lambda x: ref_dict.get(x[NORM_INTENSITY], np.nan), - axis=1, - ) - dataset_df = dataset_df.drop_duplicates() - dataset_df = dataset_df[dataset_df[NORM_INTENSITY].notna()] + dataset_df = intensity_normalization( + dataset_df, + field=NORM_INTENSITY, + class_field=SAMPLE_ID, + scaling_method=nmethod, + ) + """ + print("Number of peptides after normalization: " + str(len(dataset_df.index))) + print("Select the best peptidoform across fractions...") dataset_df = get_peptidoform_normalize_intensities(dataset_df) - record[0] += len(dataset_df.index) + print( + "Number of peptides after peptidofrom selection: " + + str(len(dataset_df.index)) + ) + + print("Sum all peptidoforms per Sample...") dataset_df = sum_peptidoform_intensities(dataset_df) - record[1] += len(dataset_df.index) - dataset_df = average_peptide_intensities(dataset_df) - record[2] += len(dataset_df.index) + print("Number of peptides after selection: " + str(len(dataset_df.index))) - return dataset_df, record + print("Average all peptidoforms per Peptide/Sample...") + dataset_df = average_peptide_intensities(dataset_df) + print("Number of peptides after average: " + str(len(dataset_df.index))) - for sample in sample_names: - dataset_df = pd.read_csv(f"{temp}/{sample}.csv", sep=",") - if len(dataset_df) != 0: - norm_df, size_record = normalization( - dataset_df, label, sample, skip_normalization, nmethod, size_record + if remove_low_frequency_peptides and len(sample_names) > 1: + print(dataset_df) + dataset_df = remove_low_frequency_peptides_(dataset_df, 0.20) + print_dataset_size( + dataset_df, "Peptides after remove low frequency peptides: ", verbose ) - else: - continue - sample_peptides = norm_df[PEPTIDE_CANONICAL].unique().tolist() - if remove_low_frequency_peptides and sample_number > 1: - sample_peptides = norm_df[ - [PROTEIN_NAME, PEPTIDE_CANONICAL] - ].drop_duplicates() - sample_peptides["count"] = 1 - peptides_count = ( - pd.concat([peptides_count, sample_peptides]) - .groupby([PROTEIN_NAME, PEPTIDE_CANONICAL]) - .agg(sum) - .reset_index() - ) - norm_df.to_csv(f"{temp}/{sample}.csv", sep=",", index=False) - if sample in plot_samples: - norm_intensities_df = pd.concat([norm_intensities_df, norm_df]) - del group_intensities, quantile - print(f"Number of peptides after peptidofrom selection: {size_record[0]}") - print(f"Number of peptides after selection: {size_record[1]}") - print(f"Number of peptides after average: 
{size_record[2]}") - # Save normalized intensities QC plots - norm_intensities_df = norm_intensities_df.reset_index(drop=True) - density = plot_distributions( - norm_intensities_df, - NORM_INTENSITY, - SAMPLE_ID, - log2=log_after_norm, - width=plot_width, - title="Peptidoform intensity distribution after normalization, method: " - + nmethod, - ) - plt.show() - pdf.savefig(density) - box = plot_box_plot( - norm_intensities_df, - NORM_INTENSITY, - SAMPLE_ID, - log2=log_after_norm, - width=plot_width, - title="Peptidoform intensity distribution after normalization, method: " - + nmethod, - violin=violin, - ) - plt.show() - pdf.savefig(box) - del norm_intensities_df, strong_proteins - - print("INFO: Writing normalized intensities into CSV...") - if remove_low_frequency_peptides and sample_number > 1: - peptides_count = peptides_count.loc[ - (peptides_count["count"] > 0.20 * sample_number) - & (peptides_count["count"] != sample_number - 1) - ] - final_norm_intensities_df = pd.DataFrame() - size_record = 0 - for sample in sample_names: - dataset_df = pd.read_csv(f"{temp}/{sample}.csv", sep=",") - if remove_low_frequency_peptides and sample_number > 1: - # Filter low-frequency peptides, which indicate whether the peptide occurs less than 20% in all samples or - # only in one sample - dataset_df = dataset_df.merge( - peptides_count[[PEPTIDE_CANONICAL, PROTEIN_NAME]], how="inner" - ) - size_record += len(dataset_df.index) - dataset_df = dataset_df[ - [PEPTIDE_CANONICAL, PROTEIN_NAME, SAMPLE_ID, NORM_INTENSITY, CONDITION] - ] - write_mode = "a" if os.path.exists(output) else "w" - header = False if os.path.exists(output) else True - dataset_df.to_csv(output, index=False, header=header, mode=write_mode) - dataset_df.to_csv(f"{temp}/{sample}.csv", sep=",", index=False) - if sample in plot_samples: - final_norm_intensities_df = pd.concat( - [final_norm_intensities_df, dataset_df] - ) - print(f"Peptides after remove low frequency peptides: {size_record}") - if remove_low_frequency_peptides: - del peptides_count - - # TODO: No peptides intensity normalization applied in stream processing. 
- # Save final normalized intensities QC plots - final_norm_intensities_df = final_norm_intensities_df.reset_index(drop=True) - density = plot_distributions( - final_norm_intensities_df, - NORM_INTENSITY, - SAMPLE_ID, - log2=log_after_norm, - width=plot_width, - title="Normalization at peptide level method: " + nmethod, - ) - plt.show() - pdf.savefig(density) - box = plot_box_plot( - final_norm_intensities_df, - NORM_INTENSITY, - SAMPLE_ID, - log2=log_after_norm, - width=plot_width, - title="Normalization at peptide level method: " + nmethod, - violin=violin, - ) - plt.show() - pdf.savefig(box) - pdf.close() + + print("Save the normalized peptide intensities...") + if header: + dataset_df.to_csv(output, index=False,header=False,mode='a+') + else: + dataset_df.to_csv(output, index=False) + header = True + if __name__ == "__main__": diff --git a/build/lib/ibaq/ibaqpy_commons.py b/build/lib/ibaq/ibaqpy_commons.py index 1d72e1f..daf5cd1 100644 --- a/build/lib/ibaq/ibaqpy_commons.py +++ b/build/lib/ibaq/ibaqpy_commons.py @@ -14,7 +14,6 @@ "peptidoform", "sequence", "charge", - "fragment_ion", "isotope_label_type", "channel", "condition", @@ -22,10 +21,8 @@ "run", "fraction", "intensity", - "reference_file_name", "sample_accession", ] - PROTEIN_NAME = "ProteinName" PEPTIDE_SEQUENCE = "PeptideSequence" PEPTIDE_CANONICAL = "PeptideCanonical" @@ -302,19 +299,15 @@ def sum_peptidoform_intensities(dataset: DataFrame) -> DataFrame: :param dataset: Dataframe to be analyzed :return: dataframe with the intensities """ - dataset = dataset[dataset[NORM_INTENSITY].notna()] - normalize_df = dataset.groupby( - [PEPTIDE_CANONICAL, SAMPLE_ID, BIOREPLICATE, CONDITION], observed=True - )[NORM_INTENSITY].sum() - normalize_df = normalize_df.reset_index() - normalize_df = pd.merge( - normalize_df, - dataset[[PROTEIN_NAME, PEPTIDE_CANONICAL, SAMPLE_ID, BIOREPLICATE, CONDITION]], - how="left", - on=[PEPTIDE_CANONICAL, SAMPLE_ID, BIOREPLICATE, CONDITION], - ) - normalize_df.drop_duplicates(inplace=True) - return normalize_df + dataset.dropna(subset=[NORM_INTENSITY], inplace=True) + dataset.drop(['PeptideSequence','PrecursorCharge'],axis=1,inplace=True) + dataset = dataset[[PROTEIN_NAME, PEPTIDE_CANONICAL, SAMPLE_ID, BIOREPLICATE, CONDITION,NORM_INTENSITY]] + dataset.loc[:,'NormIntensity'] = dataset.groupby( + [PEPTIDE_CANONICAL, SAMPLE_ID, BIOREPLICATE, CONDITION],observed=True + )[NORM_INTENSITY].transform('sum') + dataset = dataset.drop_duplicates() + dataset.reset_index(inplace=True,drop=True) + return dataset def parse_uniprot_accession(uniprot_id: str) -> str: @@ -365,21 +358,22 @@ def get_peptidoform_normalize_intensities( :param higher_intensity: select based on normalize intensity, if false based on best scored peptide :return: """ - dataset = dataset[dataset[NORM_INTENSITY].notna()] + dataset.dropna(subset=[NORM_INTENSITY], inplace=True) if higher_intensity: dataset = dataset.loc[ dataset.groupby( [PEPTIDE_SEQUENCE, PEPTIDE_CHARGE, SAMPLE_ID, CONDITION, BIOREPLICATE], observed=True, )[NORM_INTENSITY].idxmax() - ].reset_index(drop=True) + ] else: dataset = dataset.loc[ dataset.groupby( [PEPTIDE_SEQUENCE, PEPTIDE_CHARGE, SAMPLE_ID, CONDITION, BIOREPLICATE], observed=True, )[SEARCH_ENGINE].idxmax() - ].reset_index(drop=True) + ] + dataset.reset_index(drop=True,inplace=True) return dataset @@ -389,18 +383,15 @@ def average_peptide_intensities(dataset: DataFrame) -> DataFrame: :param dataset: Dataframe containing all the peptidoforms :return: New dataframe """ - dataset_df = dataset.groupby( - [PEPTIDE_CANONICAL, 
SAMPLE_ID, CONDITION], observed=True - )[NORM_INTENSITY].median() - dataset_df = dataset_df.reset_index() - dataset_df = pd.merge( - dataset_df, - dataset[[PROTEIN_NAME, PEPTIDE_CANONICAL, SAMPLE_ID, CONDITION]], - how="left", - on=[PEPTIDE_CANONICAL, SAMPLE_ID, CONDITION], - ) - dataset_df.drop_duplicates(inplace=True) - return dataset_df + dataset.dropna(subset=[NORM_INTENSITY], inplace=True) + dataset.drop(['BioReplicate'],axis=1,inplace=True) + dataset.loc[:,'NormIntensity'] = dataset.groupby( + [PEPTIDE_CANONICAL, SAMPLE_ID, CONDITION],observed=True + )[NORM_INTENSITY].transform('median') + dataset = dataset.drop_duplicates() + dataset.reset_index(inplace=True,drop=True) + + return dataset # Functions needed by Combiner diff --git a/build/scripts-3.10/peptide_normalization.py b/build/scripts-3.10/peptide_normalization.py index 9deb750..0056e84 100644 --- a/build/scripts-3.10/peptide_normalization.py +++ b/build/scripts-3.10/peptide_normalization.py @@ -11,7 +11,8 @@ from matplotlib.backends.backend_pdf import PdfPages from pandas import DataFrame import pyarrow.parquet as pq -from normalize_methods import normalize +from parquet import Feature +from normalize_methods import normalize,normalize_run from ibaq.ibaqpy_commons import ( BIOREPLICATE, @@ -71,6 +72,7 @@ def recover_df(df): sample:NORM_INTENSITY },inplace=True) out = pd.concat([out,samples_df]) + out.reset_index(inplace=True,drop=True) return out def analyse_sdrf(sdrf_path: str, compression: bool) -> tuple: @@ -280,7 +282,7 @@ def map_canonical_seq(data_df: pd.DataFrame) -> (pd.DataFrame, dict): data_df.apply(lambda x: len(x[PEPTIDE_CANONICAL]) >= min_aa, axis=1) ] data_df[PROTEIN_NAME] = data_df[PROTEIN_NAME].apply(parse_uniprot_accession) - data_df[STUDY_ID] = data_df[SAMPLE_ID].apply(get_study_accession) + #data_df[STUDY_ID] = data_df[SAMPLE_ID].apply(get_study_accession) if FRACTION not in data_df.columns: data_df[FRACTION] = 1 data_df = data_df[ @@ -290,19 +292,16 @@ def map_canonical_seq(data_df: pd.DataFrame) -> (pd.DataFrame, dict): PEPTIDE_CANONICAL, PEPTIDE_CHARGE, INTENSITY, - REFERENCE, CONDITION, RUN, BIOREPLICATE, FRACTION, - FRAGMENT_ION, ISOTOPE_LABEL_TYPE, - STUDY_ID, SAMPLE_ID, ] ] data_df[CONDITION] = pd.Categorical(data_df[CONDITION]) - data_df[STUDY_ID] = pd.Categorical(data_df[STUDY_ID]) + #data_df[STUDY_ID] = pd.Categorical(data_df[STUDY_ID]) data_df[SAMPLE_ID] = pd.Categorical(data_df[SAMPLE_ID]) return data_df @@ -389,7 +388,8 @@ def remove_low_frequency_peptides_( :param percentage_samples: percentage of samples :return: """ - + #= dataset_df[[SAMPLE_ID, CONDITION]].drop_duplicates(subset=[SAMPLE_ID]) + c_map = dataset_df[[SAMPLE_ID, CONDITION]].drop_duplicates(subset=[SAMPLE_ID]).set_index(SAMPLE_ID).to_dict()[CONDITION] normalize_df = pd.pivot_table( dataset_df, index=[PEPTIDE_CANONICAL, PROTEIN_NAME], @@ -398,6 +398,8 @@ def remove_low_frequency_peptides_( aggfunc={NORM_INTENSITY: np.nanmean}, observed=True, ) + del dataset_df + gc.collect() # Count the number of null values in each row null_count = normalize_df.isnull().sum(axis=1) # Find the rows that have null values above the threshold @@ -411,21 +413,12 @@ def remove_low_frequency_peptides_( normalize_df = normalize_df[ normalize_df.notnull().sum(axis=1) != 1 ] - """ - normalize_df = normalize_df.reset_index() - normalize_df = normalize_df.melt(id_vars=[PEPTIDE_CANONICAL, PROTEIN_NAME]) - normalize_df.rename(columns={"value": NORM_INTENSITY}, inplace=True) - """ normalize_df = recover_df(normalize_df) # recover condition column - normalize_df 
= normalize_df.merge( - dataset_df[[SAMPLE_ID, CONDITION]].drop_duplicates(subset=[SAMPLE_ID]), - on=SAMPLE_ID, - how="left", - ) + normalize_df.loc[:,CONDITION] = normalize_df[SAMPLE_ID].map(c_map) # Remove rows with null values in NORMALIZE_INTENSITY - normalize_df = normalize_df[normalize_df[NORM_INTENSITY].notna()] + normalize_df.dropna(subset=[NORM_INTENSITY], inplace=True) print(normalize_df.head()) return normalize_df @@ -453,62 +446,8 @@ def peptide_intensity_normalization( ) # need nomalize? normalize_df = recover_df(normalize_df) - """ - normalize_df = normalize_df.reset_index() - normalize_df = normalize_df.melt( - id_vars=[PEPTIDE_CANONICAL, PROTEIN_NAME, CONDITION] - ) - normalize_df.rename(columns={"value": NORM_INTENSITY}, inplace=True) - normalize_df = normalize_df[normalize_df[NORM_INTENSITY].notna()] - """ - return normalize_df - - -def impute_peptide_intensities(dataset_df, field, class_field): - """ - Impute the missing values using different methods. - :param dataset_df: dataframe with the data - :param field: field to impute - :param class_field: field to use as class - :return: - """ - normalize_df = pd.DataFrame() - # group by condition to detect missing values - for c, g in dataset_df.groupby(CONDITION): - # pivot to have one col per sample - group_normalize_df = pd.pivot_table( - g, - index=[PEPTIDE_CANONICAL, PROTEIN_NAME, CONDITION], - columns=class_field, - values=field, - aggfunc={field: np.nanmean}, - observed=True, - ) - - # no missing values group -> only one sample - if len(group_normalize_df.columns) < 2: - group_normalize_df = group_normalize_df.reset_index() - group_normalize_df = group_normalize_df.melt( - id_vars=[PEPTIDE_CANONICAL, PROTEIN_NAME, CONDITION] - ) - group_normalize_df.rename(columns={"value": NORM_INTENSITY}, inplace=True) - normalize_df = normalize_df.append(group_normalize_df, ignore_index=True) - # else: - # # print ("nothing") - # # Impute the missing values - # # imputer = MissForest(max_iter=5) - # # imputed_data = imputer.fit_transform(group_normalize_df) - # # group_normalize_df = pd.DataFrame(imputed_data, columns=group_normalize_df.columns, - # # index=group_normalize_df.index) - # # # Melt the dataframe - # # group_normalize_df = group_normalize_df.reset_index() - # # group_normalize_df = group_normalize_df.melt(id_vars=[PEPTIDE_CANONICAL, PROTEIN_NAME, CONDITION]) - # # group_normalize_df.rename(columns={'value': NORM_INTENSITY}, inplace=True) - # # normalize_df = normalize_df.append(group_normalize_df, ignore_index=True) - return normalize_df - @click.command() @click.option( "-m", "--msstats", help="MsStats file import generated by quantms", default=None @@ -632,582 +571,119 @@ def peptide_normalization( pd.set_option("display.max_columns", None) compression_method = "gzip" if compress else None print("Loading data..") + ''' + if parquet is None: + # Read the msstats file + feature_df = pd.read_csv( + msstats, + sep=",", + compression=compression_method, + dtype={CONDITION: "category", ISOTOPE_LABEL_TYPE: "category"}, + ) - if not stream: - if parquet is None: - # Read the msstats file - feature_df = pd.read_csv( - msstats, - sep=",", - compression=compression_method, - dtype={CONDITION: "category", ISOTOPE_LABEL_TYPE: "category"}, - ) - - # Read the sdrf file - sdrf_df, label, sample_names, choice = analyse_sdrf( - sdrf, compression_method - ) - print(sdrf_df) - - # Merged the SDRF with the Resulted file - dataset_df = msstats_common_process(feature_df) - dataset_df = merge_sdrf(label, sdrf_df, feature_df) - # Remove the 
intermediate variables and free the memory - del feature_df, sdrf_df - gc.collect() - else: - dataset_df = pd.read_parquet(parquet)[PARQUET_COLUMNS] + # Read the sdrf file + sdrf_df, label, sample_names, choice = analyse_sdrf( + sdrf, compression_method + ) + print(sdrf_df) + + # Merged the SDRF with the Resulted file + dataset_df = msstats_common_process(feature_df) + dataset_df = merge_sdrf(label, sdrf_df, feature_df) + # Remove the intermediate variables and free the memory + del feature_df, sdrf_df + gc.collect() + else: + ''' + F = Feature(parquet) + #dataset_df = pd.read_parquet(parquet,columns=PARQUET_COLUMNS) + header= False + for samples,df in F.iter_samples(): + for sample in samples: + dataset_df = df[df['sample_accession']==sample].copy() + dataset_df = dataset_df[PARQUET_COLUMNS] label, sample_names, choice = analyse_feature_df(dataset_df) dataset_df = parquet_common_process(dataset_df, label, choice) - - dataset_df = data_common_process(dataset_df, min_aa) - # Only proteins with unique peptides number greater than min_unique (default: 2) are retained - unique_peptides = set( - dataset_df.groupby(PEPTIDE_CANONICAL) - .filter(lambda x: len(set(x[PROTEIN_NAME])) == 1)[PEPTIDE_CANONICAL] - .tolist() - ) - strong_proteins = set( - dataset_df[dataset_df[PEPTIDE_CANONICAL].isin(unique_peptides)] - .groupby(PROTEIN_NAME) - .filter(lambda x: len(set(x[PEPTIDE_CANONICAL])) >= min_unique)[ - PROTEIN_NAME - ] - .tolist() - ) - dataset_df = dataset_df[dataset_df[PROTEIN_NAME].isin(strong_proteins)] - - print(f"Number of unique peptides: {len(unique_peptides)}") - print(f"Number of strong proteins: {len(strong_proteins)}") - - print("Logarithmic if specified..") - dataset_df = dataset_df.rename(columns={INTENSITY: NORM_INTENSITY}) - if log2: - dataset_df[NORM_INTENSITY] = np.log2(dataset_df[NORM_INTENSITY]) - - # Print the distribution of the original peptide intensities from quantms analysis - if verbose: - sample_names = set(dataset_df[SAMPLE_ID]) - plot_width = len(sample_names) * 0.5 + 10 - pdf = PdfPages(qc_report) - density = plot_distributions( - dataset_df, - NORM_INTENSITY, - SAMPLE_ID, - log2=not log2, - width=plot_width, - title="Original peptidoform intensity distribution (no normalization)", + dataset_df = data_common_process(dataset_df, min_aa) + # Only proteins with unique peptides number greater than min_unique (default: 2) are retained + unique_peptides = set( + dataset_df.groupby(PEPTIDE_CANONICAL) + .filter(lambda x: len(set(x[PROTEIN_NAME])) == 1)[PEPTIDE_CANONICAL] + .tolist() ) - #plt.show() - pdf.savefig(density) - """ - box = plot_box_plot( - dataset_df, - NORM_INTENSITY, - SAMPLE_ID, - log2=not log2, - width=plot_width, - title="Original peptidoform intensity distribution (no normalization)", - violin=violin, + strong_proteins = set( + dataset_df[dataset_df[PEPTIDE_CANONICAL].isin(unique_peptides)] + .groupby(PROTEIN_NAME) + .filter(lambda x: len(set(x[PEPTIDE_CANONICAL])) >= min_unique)[ + PROTEIN_NAME + ] + .tolist() ) - plt.show() - pdf.savefig(box) - """ + dataset_df = dataset_df[dataset_df[PROTEIN_NAME].isin(strong_proteins)] - # Remove high abundant and contaminants proteins and the outliers - if remove_ids is not None: - print("Remove proteins from file...") - dataset_df = remove_protein_by_ids(dataset_df, remove_ids) - if remove_decoy_contaminants: - print("Remove decoy and contaminants...") - dataset_df = remove_contaminants_entrapments_decoys(dataset_df) - - print_dataset_size(dataset_df, "Peptides after contaminants removal: ", verbose) - 
print("Normalize intensities.. ") - # dataset_df = dataset_df.dropna(how="any") - if not skip_normalization: - dataset_df = intensity_normalization( - dataset_df, - field=NORM_INTENSITY, - class_field=SAMPLE_ID, - scaling_method=nmethod, - ) - if verbose: - density = plot_distributions( - dataset_df, - NORM_INTENSITY, - SAMPLE_ID, - #log2=log_after_norm, - width=plot_width, - title="Peptidoform intensity distribution after normalization, method: " - + nmethod, - ) - #plt.show() - pdf.savefig(density) - """ - box = plot_box_plot( - dataset_df, - NORM_INTENSITY, - SAMPLE_ID, - log2=log_after_norm, - width=plot_width, - title="Peptidoform intensity distribution after normalization, method: " - + nmethod, - violin=violin, - ) - plt.show() - pdf.savefig(box) - """ - print("Number of peptides after normalization: " + str(len(dataset_df.index))) - print("Select the best peptidoform across fractions...") - dataset_df = get_peptidoform_normalize_intensities(dataset_df) - print( - "Number of peptides after peptidofrom selection: " - + str(len(dataset_df.index)) - ) + print(f"Number of unique peptides: {len(unique_peptides)}") + print(f"Number of strong proteins: {len(strong_proteins)}") - print("Sum all peptidoforms per Sample...") - dataset_df = sum_peptidoform_intensities(dataset_df) - print("Number of peptides after selection: " + str(len(dataset_df.index))) - - print("Average all peptidoforms per Peptide/Sample...") - dataset_df = average_peptide_intensities(dataset_df) - print("Number of peptides after average: " + str(len(dataset_df.index))) - if verbose: - density = plot_distributions( - dataset_df, - NORM_INTENSITY, - SAMPLE_ID, - log2=log_after_norm, - width=plot_width, - title="Peptide intensity distribution method: " + nmethod, - ) - plt.show() - pdf.savefig(density) - box = plot_box_plot( - dataset_df, - NORM_INTENSITY, - SAMPLE_ID, - log2=log_after_norm, - width=plot_width, - title="Peptide intensity distribution method: " + nmethod, - violin=violin, - ) - plt.show() - pdf.savefig(box) - - if remove_low_frequency_peptides and len(sample_names) > 1: - print(dataset_df) - dataset_df = remove_low_frequency_peptides_(dataset_df, 0.20) - print_dataset_size( - dataset_df, "Peptides after remove low frequency peptides: ", verbose - ) - # Perform imputation using Random Forest in Peptide Intensities - # TODO: Check if this is necessary (Probably we can do some research if imputation at peptide level is necessary - # if impute: - # dataset_df = impute_peptide_intensities(dataset_df, field=NORM_INTENSITY, class_field=SAMPLE_ID) - - if pnormalization: - print("Normalize at Peptide level...") - dataset_df = peptide_intensity_normalization( - dataset_df, - field=NORM_INTENSITY, - class_field=SAMPLE_ID, - scaling_method=nmethod, - ) - - if verbose: - density = plot_distributions( - dataset_df, - NORM_INTENSITY, - SAMPLE_ID, - log2=log_after_norm, - width=plot_width, - title="Normalization at peptide level method: " + nmethod, - ) - plt.show() - pdf.savefig(density) - box = plot_box_plot( - dataset_df, - NORM_INTENSITY, - SAMPLE_ID, - log2=log_after_norm, - width=plot_width, - title="Normalization at peptide level method: " + nmethod, - violin=violin, - ) - plt.show() - pdf.savefig(box) - pdf.close() + print("Logarithmic if specified..") + dataset_df.rename(columns={INTENSITY: NORM_INTENSITY},inplace=True) + """ + if log2: + dataset_df[NORM_INTENSITY] = np.log2(dataset_df[NORM_INTENSITY]) + """ - print("Save the normalized peptide intensities...") - dataset_df.to_csv(output, index=False, sep=",") - 
else: - if parquet is None: - sdrf_df, label, sample_names, choice = analyse_sdrf( - sdrf, compression_method - ) - msstats_chunks = pd.read_csv( - msstats, - sep=",", - compression=compression_method, - dtype={CONDITION: "category", ISOTOPE_LABEL_TYPE: "category"}, - chunksize=chunksize, - ) - else: - label, sample_names, choice = analyse_feature_parquet( - parquet, batch_size=chunksize - ) - msstats_chunks = read_large_parquet(parquet, batch_size=chunksize) - sample_number = len(sample_names) - - # TODO: Stream processing to obtain strong proteins with more than 2 uniqe peptides - temp = f"Temp-{str(uuid.uuid4())}/" - os.mkdir(temp) - print(f"INFO: Writing files into {temp}...") - unique_peptides = {} - group_intensities = {} - quantile = {} - print("INFO: First iteration to get unique peptides and strong proteins...") - for msstats_df in msstats_chunks: - if parquet is None: - msstats_df = msstats_common_process(msstats_df) - msstats_df = merge_sdrf(label, sdrf_df, msstats_df) - else: - msstats_df = parquet_common_process(msstats_df, label, choice) - result_df = data_common_process(msstats_df, min_aa) - - # Write CSVs by Sample ID - for sample in sample_names: - file_name = f"{temp}/{sample}.csv" - write_mode = "a" if os.path.exists(file_name) else "w" - header = False if os.path.exists(file_name) else True - result_df[result_df[SAMPLE_ID] == sample].to_csv( - file_name, index=False, header=header, mode=write_mode - ) - unique_df = result_df.groupby([PEPTIDE_CANONICAL]).filter( - lambda x: len(set(x[PROTEIN_NAME])) == 1 - )[[PEPTIDE_CANONICAL, PROTEIN_NAME]] - unique_dict = dict( - zip(unique_df[PEPTIDE_CANONICAL], unique_df[PROTEIN_NAME]) - ) - for i in unique_dict.keys(): - if i in unique_peptides.keys() and unique_dict[i] != unique_peptides[i]: - unique_peptides.pop(i) - else: - unique_peptides[i] = unique_dict[i] - - proteins_list = list(unique_peptides.values()) - count_dict = { - element: proteins_list.count(element) for element in set(proteins_list) - } - strong_proteins = [ - element for element in count_dict if count_dict[element] >= min_unique - ] - del proteins_list, count_dict - print(f"Number of unique peptides: {len(list(unique_peptides.keys()))}") - print(f"Number of strong proteins: {len(strong_proteins)}") - - # TODO: Filter proteins with less unique peptides than min_unique (default: 2) - plot_samples = random.sample(sample_names, min(len(sample_names), 20)) - plot_width = 10 + len(plot_samples) * 0.5 - pdf = PdfPages(qc_report) - original_intensities_df = pd.DataFrame() - - print("INFO: Second iteration to filter data and prepare normalization...") - print("Logarithmic if specified..") - norm_record = [0] * 2 - for sample in sample_names: - msstats_df = pd.read_csv(f"{temp}/{sample}.csv", sep=",") - msstats_df = msstats_df[msstats_df[PROTEIN_NAME].isin(strong_proteins)] # Remove high abundant and contaminants proteins and the outliers if remove_ids is not None: - msstats_df = remove_protein_by_ids(msstats_df, remove_ids) + print("Remove proteins from file...") + dataset_df = remove_protein_by_ids(dataset_df, remove_ids) if remove_decoy_contaminants: - msstats_df = remove_contaminants_entrapments_decoys(msstats_df) - norm_record[0] += len(msstats_df) - msstats_df = msstats_df.rename(columns={INTENSITY: NORM_INTENSITY}) - if log2: - msstats_df[NORM_INTENSITY] = np.log2(msstats_df[NORM_INTENSITY]) - if sample in plot_samples: - original_intensities_df = pd.concat( - [original_intensities_df, msstats_df] - ) + print("Remove decoy and contaminants...") + dataset_df = 
remove_contaminants_entrapments_decoys(dataset_df) + + print_dataset_size(dataset_df, "Peptides after contaminants removal: ", verbose) + print("Normalize intensities.. ") + dataset_df.loc[:,NORM_INTENSITY] = np.log(dataset_df[NORM_INTENSITY]) + dataset_df.loc[:,NORM_INTENSITY] = normalize(dataset_df[NORM_INTENSITY],'max_min') + dataset_df = normalize_run(dataset_df,sdrf,nmethod) + """ if not skip_normalization: - if nmethod == "msstats": - if label in ["TMT", "ITRAQ"]: - g = msstats_df.groupby(["Run", "Channel"]) - else: - g = msstats_df.groupby(["Run", "Fraction"]) - for name, group in g: - group_intensity = group[NORM_INTENSITY].tolist() - if name not in group_intensities: - group_intensities[name] = group_intensity - else: - group_intensities.update( - { - name: group_intensities[NORM_INTENSITY] - + group_intensity - } - ) - elif nmethod == "quantile": - msstats_df = ( - msstats_df.groupby( - [ - PEPTIDE_SEQUENCE, - PEPTIDE_CANONICAL, - PEPTIDE_CHARGE, - FRACTION, - RUN, - BIOREPLICATE, - PROTEIN_NAME, - STUDY_ID, - CONDITION, - ] - )[NORM_INTENSITY] - .agg(np.nanmean) - .reset_index() - ) - rank = msstats_df[NORM_INTENSITY].rank(method="average") - dic = dict(zip(rank, msstats_df[NORM_INTENSITY])) - if len(quantile) == 0: - quantile = {k: (v, 1) for k, v in dic.items()} - else: - # update = min(len(quantile), len(dic)) - intersec = set(quantile.keys()) & set(dic.keys()) - update = set(dic.keys()) - set(quantile.keys()) - quantile.update( - { - i: (quantile[i][0] + dic[i], quantile[i][1] + 1) - for i in intersec - } - ) - if len(update) > 0: - quantile.update({k: (dic[k], 1) for k in update}) - msstats_df[SAMPLE_ID] = sample - else: - exit("Stream process only supports msstats and quantile methods!") - msstats_df.to_csv(f"{temp}/{sample}.csv", index=False, sep=",") - norm_record[1] += len(msstats_df) - if not skip_normalization and nmethod == "quantile": - quantile = {k: v[0] / v[1] for k, v in quantile.items()} - print(f"Peptides after contaminants removal: {norm_record[0]}") - print(f"Number of peptides after normalization: {norm_record[1]}") - # Save original intensities QC plots - original_intensities_df = original_intensities_df.reset_index(drop=True) - density = plot_distributions( - original_intensities_df, - NORM_INTENSITY, - SAMPLE_ID, - log2=not log2, - width=plot_width, - title="Original peptidoform intensity distribution (no normalization)", - ) - pdf.savefig(density) - box = plot_box_plot( - original_intensities_df, - NORM_INTENSITY, - SAMPLE_ID, - log2=not log2, - width=plot_width, - title="Original peptidoform intensity distribution (no normalization)", - violin=violin, - ) - plt.show() - pdf.savefig(box) - del original_intensities_df - - # TODO: Peptide intensity normalization - peptides_count = pd.DataFrame( - columns=[PROTEIN_NAME, PEPTIDE_CANONICAL, "count"] - ) - norm_intensities_df = pd.DataFrame() - if not skip_normalization and nmethod == "msstats": - # For ISO normalization - if label in ["TMT", "ITRAQ"]: - median_baseline = np.nanmedian( - list(set(sum(group_intensities.values(), []))) - ) - group_intensities = { - key: np.nanmedian(list(values)) - for key, values in group_intensities.items() - } - else: - fractions = [i[1] for i in group_intensities.keys()] - fraction_median = {} - for fraction in fractions: - fraction_keys = [ - i for i in group_intensities.keys() if i[1] == fraction - ] - fraction_intensities = [] - for key in fraction_keys: - fraction_intensities.extend(group_intensities[key]) - fraction_median[fraction] = np.nanmedian(fraction_intensities) 
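The `+` lines earlier in this hunk replace the configurable normalization branch with a fixed chain: a natural-log transform, max-min rescaling via normalize(..., 'max_min'), and a run-level correction from normalize_run. A minimal sketch of the log plus max-min step, assuming strictly positive raw intensities; `log_max_min` is an illustrative name.

import numpy as np
import pandas as pd

def log_max_min(intensity: pd.Series) -> pd.Series:
    # Natural log first (requires intensity > 0), then rescale to [0, 1].
    logged = np.log(intensity)
    low, high = logged.min(), logged.max()
    return (logged - low) / (high - low)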
-            group_intensities = {
-                key: np.nanmedian(values)
-                for key, values in group_intensities.items()
-            }
-    print("INFO: Third iteration to normalize and counting peptides frequency...")
-    size_record = [0] * 3
-
-    def normalization(
-        dataset_df, label, sample, skip_normalization, nmethod, record
-    ):
-        if not skip_normalization:
-            field = NORM_INTENSITY
-            if nmethod == "msstats":
-                # For ISO normalization
-                if label in ["TMT", "ITRAQ"]:
-                    dataset_df.loc[:, NORM_INTENSITY] = dataset_df.apply(
-                        lambda x: x[field]
-                        - group_intensities[(x["Run"], x["Channel"])]
-                        + median_baseline,
-                        axis=1,
-                    )
-                else:
-                    dataset_df.loc[:, NORM_INTENSITY] = dataset_df.apply(
-                        lambda x: x[field]
-                        - group_intensities[(x["Run"], x["Fraction"])]
-                        + np.nanmedian(
-                            [
-                                group_intensities[i]
-                                for i in group_intensities.keys()
-                                if i[1] == x["Fraction"]
-                            ]
-                        ),
-                        axis=1,
-                    )
-            elif nmethod == "quantile":
-                rank = dataset_df[NORM_INTENSITY].rank(method="average")
-                ref_dict = dict(zip(rank, dataset_df[NORM_INTENSITY]))
-                ref_dict = {v: quantile[k] for k, v in ref_dict.items()}
-                dataset_df.loc[:, NORM_INTENSITY] = dataset_df.apply(
-                    lambda x: ref_dict.get(x[NORM_INTENSITY], np.nan),
-                    axis=1,
-                )
-        dataset_df = dataset_df.drop_duplicates()
-        dataset_df = dataset_df[dataset_df[NORM_INTENSITY].notna()]
+        dataset_df = intensity_normalization(
+            dataset_df,
+            field=NORM_INTENSITY,
+            class_field=SAMPLE_ID,
+            scaling_method=nmethod,
+        )
+    """
+    print("Number of peptides after normalization: " + str(len(dataset_df.index)))
+    print("Select the best peptidoform across fractions...")
     dataset_df = get_peptidoform_normalize_intensities(dataset_df)
-        record[0] += len(dataset_df.index)
+    print(
+        "Number of peptides after peptidoform selection: "
+        + str(len(dataset_df.index))
+    )
+
+    print("Sum all peptidoforms per Sample...")
     dataset_df = sum_peptidoform_intensities(dataset_df)
-        record[1] += len(dataset_df.index)
-        dataset_df = average_peptide_intensities(dataset_df)
-        record[2] += len(dataset_df.index)
+    print("Number of peptides after selection: " + str(len(dataset_df.index)))
-        return dataset_df, record
+    print("Average all peptidoforms per Peptide/Sample...")
+    dataset_df = average_peptide_intensities(dataset_df)
+    print("Number of peptides after average: " + str(len(dataset_df.index)))
-    for sample in sample_names:
-        dataset_df = pd.read_csv(f"{temp}/{sample}.csv", sep=",")
-        if len(dataset_df) != 0:
-            norm_df, size_record = normalization(
-                dataset_df, label, sample, skip_normalization, nmethod, size_record
+    if remove_low_frequency_peptides and len(sample_names) > 1:
+        print(dataset_df)
+        dataset_df = remove_low_frequency_peptides_(dataset_df, 0.20)
+        print_dataset_size(
+            dataset_df, "Peptides after removing low-frequency peptides: ", verbose
         )
-        else:
-            continue
-        sample_peptides = norm_df[PEPTIDE_CANONICAL].unique().tolist()
-        if remove_low_frequency_peptides and sample_number > 1:
-            sample_peptides = norm_df[
-                [PROTEIN_NAME, PEPTIDE_CANONICAL]
-            ].drop_duplicates()
-            sample_peptides["count"] = 1
-            peptides_count = (
-                pd.concat([peptides_count, sample_peptides])
-                .groupby([PROTEIN_NAME, PEPTIDE_CANONICAL])
-                .agg(sum)
-                .reset_index()
-            )
-        norm_df.to_csv(f"{temp}/{sample}.csv", sep=",", index=False)
-        if sample in plot_samples:
-            norm_intensities_df = pd.concat([norm_intensities_df, norm_df])
-    del group_intensities, quantile
-    print(f"Number of peptides after peptidofrom selection: {size_record[0]}")
-    print(f"Number of peptides after selection: {size_record[1]}")
-    print(f"Number of peptides after average: {size_record[2]}")
{size_record[2]}") - # Save normalized intensities QC plots - norm_intensities_df = norm_intensities_df.reset_index(drop=True) - density = plot_distributions( - norm_intensities_df, - NORM_INTENSITY, - SAMPLE_ID, - log2=log_after_norm, - width=plot_width, - title="Peptidoform intensity distribution after normalization, method: " - + nmethod, - ) - plt.show() - pdf.savefig(density) - box = plot_box_plot( - norm_intensities_df, - NORM_INTENSITY, - SAMPLE_ID, - log2=log_after_norm, - width=plot_width, - title="Peptidoform intensity distribution after normalization, method: " - + nmethod, - violin=violin, - ) - plt.show() - pdf.savefig(box) - del norm_intensities_df, strong_proteins - - print("INFO: Writing normalized intensities into CSV...") - if remove_low_frequency_peptides and sample_number > 1: - peptides_count = peptides_count.loc[ - (peptides_count["count"] > 0.20 * sample_number) - & (peptides_count["count"] != sample_number - 1) - ] - final_norm_intensities_df = pd.DataFrame() - size_record = 0 - for sample in sample_names: - dataset_df = pd.read_csv(f"{temp}/{sample}.csv", sep=",") - if remove_low_frequency_peptides and sample_number > 1: - # Filter low-frequency peptides, which indicate whether the peptide occurs less than 20% in all samples or - # only in one sample - dataset_df = dataset_df.merge( - peptides_count[[PEPTIDE_CANONICAL, PROTEIN_NAME]], how="inner" - ) - size_record += len(dataset_df.index) - dataset_df = dataset_df[ - [PEPTIDE_CANONICAL, PROTEIN_NAME, SAMPLE_ID, NORM_INTENSITY, CONDITION] - ] - write_mode = "a" if os.path.exists(output) else "w" - header = False if os.path.exists(output) else True - dataset_df.to_csv(output, index=False, header=header, mode=write_mode) - dataset_df.to_csv(f"{temp}/{sample}.csv", sep=",", index=False) - if sample in plot_samples: - final_norm_intensities_df = pd.concat( - [final_norm_intensities_df, dataset_df] - ) - print(f"Peptides after remove low frequency peptides: {size_record}") - if remove_low_frequency_peptides: - del peptides_count - - # TODO: No peptides intensity normalization applied in stream processing. - # Save final normalized intensities QC plots - final_norm_intensities_df = final_norm_intensities_df.reset_index(drop=True) - density = plot_distributions( - final_norm_intensities_df, - NORM_INTENSITY, - SAMPLE_ID, - log2=log_after_norm, - width=plot_width, - title="Normalization at peptide level method: " + nmethod, - ) - plt.show() - pdf.savefig(density) - box = plot_box_plot( - final_norm_intensities_df, - NORM_INTENSITY, - SAMPLE_ID, - log2=log_after_norm, - width=plot_width, - title="Normalization at peptide level method: " + nmethod, - violin=violin, - ) - plt.show() - pdf.savefig(box) - pdf.close() + + print("Save the normalized peptide intensities...") + if header: + dataset_df.to_csv(output, index=False,header=False,mode='a+') + else: + dataset_df.to_csv(output, index=False) + header = True + if __name__ == "__main__": diff --git a/dist/ibaqpy-0.0.3-py3.10.egg b/dist/ibaqpy-0.0.3-py3.10.egg index 3df0a855bc8fbcab373de5b14473d99ee707799f..cbcbbbc7eea3f24ba2b9dc4d720ec79bf3d8d037 100644 GIT binary patch delta 35015 zcmbT-Ra6*Fx-Wd(-5r9v2PZ&~;O=e#g1bZG?(XjH?(QDko!}N6PTqHB&+OS}pNsFi z=vB44uc}vfRrmAz*Fz-K`cFtic`0xR3=j|y7?8Up|60T(JjDAXfAs^-$NRNEXM5sn z@CbnDS?fM#1ktS*rUaW5b&gysW`%3Zz#+#)G%6Vak%xN=-a<70Wbp@rwKlgc>xC+X zpF;fnK*?$|Q1Wl2LH&Hn-5}N=%F? 
diff --git a/dist/ibaqpy-0.0.3-py3.10.egg b/dist/ibaqpy-0.0.3-py3.10.egg
index 3df0a855bc8fbcab373de5b14473d99ee707799f..cbcbbbc7eea3f24ba2b9dc4d720ec79bf3d8d037 100644
GIT binary patch (binary deltas of 35015 and 42136 bytes omitted)
zT)A;(M`(&*B*mp#0xv?wjZ&3ovx6W#e^tuLBQp?|1Wvf?f8)6 zraCTJ$x`5D$IL?q5iYaW65evm&mTa;PiS)nRcFGl)_rpfs026gDnV3}vO-tib>#eZ zl-|Z*^y>;<%~4Lko9`)-ZcFJgw?Ha7!$>iG&fH(Xg6wneMblPp(i=91({Iaq*Cg0t zzqU->?dW?HX)>n|v!?x84zm+7wNI8@f9H1+9S4)OvMSSMXG@<`Vh!)p5;QV4T+(IN z)=6JJN=v&MgY~n_=zMS@Yr85+8y@yWYi}2*rg`Ipj%LfdYU%VmLFEH0y^AaC4OFjU z%1N9KjEgJHFck5nVyd4DMi~)YfoH5qOAa>fR${f~FJlQ}*;dj^2bXoGe4s^U+!&HS z;H*7MgBJt5=lQtSg8(i7zb7v14Z4@63esUy>PI_vrxZ^&CCK37J2v@r1ybdJSDU!h z!@17C%rX^m@w$mOX5n=<{O(RMi7G!KTn7L!%Z=qkCpE!d9ho_5J13XPA-%TOIl)u;=XQlfZrSt;iaH4%2QZM;4;AA z(klm&YFfQ_BgPNTXx1nxW@jOoPbdRdK5IU{{8+pn@)N zkQhegZU=2P6+F-)YWE`!4uKn9?tNs$`2!do2P<53{dw-EcvzNK#sr)POM>EX;JriB z6A<;>Y=9OnDeab9h1D^MwTAM3xHB2*cvhd}b~a-mtJ2%_a6f%iYZD4n+XzuLoxy*HAX`zALlqn*pm&Z*c}<<#|I zIRaRGOkvEHUE%0d`P_PEd`zY3*SvOL5-{>D$$($Qymft`Oxe%t%L>@%KkgpUjR58c=N+vGtCC7B1PH7(b{|Py~;{cU4z#V@_5+8q>o1 zu0ZmF#B0H+-E|gxWREqxaO8_+x$c~Mny$R|AHlb=M)*`_JbfLJaSZ1RAI{aJPO-8Z z%tSGfJ>?d?qQ#;GC@I&pg%ucQ!OGHX0b~YFHc$raB%w>rfk2srnn{t1506ekO+4@N zl`%?^7QY8g;+P*tVG-N&R~}~F+c~;Sn0yHp&dfAJ#;*LRHlIu0sydrHKSxs{=t;@a zR-v9~MB-n+hD%W@(K~yf{bJ2BVsCN^^eb){~% zF%|@K^(!S0#nNnWVmO6fD1n7I;1GNWIG=8YGu35#b%Fc>RE6kQab$u=hL zx?TB^n<3Bi!bmj^&5LKV+9Q>>EL5scQ15*`Nne*2E(#eLE5F-l+GsSO-wUBF@{@(h zSyQ;Rj{hQF_6xlBi{}!)@gt?;bu+G{=A_VN^42I5-8q@(2T96b(yJN#?A#t zqSIMlb*-xme<`(hv*4++2o3wRi>1tXlx}Jv6cQqicR>F)u|#ZDZS^g&E5!*IhmVD_ z>;R0XKvVIqTi@TgsS(Y@u#xW%r@3dA^2;J+T`SWQ?F0*Y{lPCVB+t4GakxV|kcK2o z#oqK?zi{k&EN{Mk3sJtM>*Z>!SZ_4cs+{&zWed$`)BDbIlL4+DZ0T#CpbZyfpKT3@ z={O9ACEyY&NI*8D{oJEgy;&4GD2L&)++l(v=kw1HWP7Au3`eZoY+Fb2u~efz0yfI; z+c*L_suwS)YfjPUqDCAW4^iJFRj#M5CBuap2UCAJL^i~HMW)Q7Fi$(UmG6liLGQnv zekj_yPsb@|KD$3p+9s6e9(dR3zx5`jfrJ$cDnP%gjEN)viEP6qocKJVHxz3m1YTgCc!hllE`g zEPj~>DEYqk0TQ9xV|^9gJzC*RnOmLPCauP$H=c>gRjKo=hva<+2e34leZ$47Y!cmu zL7Ao@{Sdez&5Jc_K88+*`T1uF&G_xZtiGs?SZ8C-Btqr4om4r6Jzv(QOE0X$3k73F z`c&$>CD)QQTU-P7MWU{5acW9aF=FWPO}^Idv#{zsU!x)TsC|xW=Hd}f2$5IRto&?p z2Opt%RO$VC zCKbcZ;og^%-Aw43mTA?nEd2z!5v5@-h>dPc!5@8N(i=)+kB42swJMo~AwjGg-C3E? 
zFjL_e<7sfjgWmBVf`(Im24+=H{x$M}mVG;HSIocR#SOvKL*Rj`C5gr({&8YRU?k$e z3* zixR*5IYX;aG9C*T{A+DdQ>iSlisYZUTZpa+ z@HG~~(5-A^1kNI)p=%)dCP4l__ZCfn}Gb1+pN0r${-Ypn$U6Bqiyx(Y(n`ZYq-n)yGC z0RS3<6Co<*Ky4&C!<2hfCdimMA}O{Nevcft?^;7C=!X zZXZaK1yB;HHUM&O0c4^2TZA0|Ao;6`(x4QuI~l|T(HQ#wPhtE2z>UG^q5sj_=Eo)d zKR@Ey7#vXy!LtG~{uvTsE5wjkR6}&FfC7IUN&h&ozd$})0bl)b9RB0j`A4CPuPDcf&l>Tzrw=iAk;QMu|KH) z@_hmT{C}ZPmmsq?K&d~;$!&q;Pn1!C5FuNj(US;5ASA;UDEwp$Va(eCK~#UOVdDV+ zp8u68gAzh$2b2`yV+R84%*|XlJ>2aa+&JF*s3`%F2myo$Fe>1mp8BTc356g^*@*M+ z$9*6Mk+266K38&()NkAqbZN06N5egbn~)`2i87{{d{RW4QnT diff --git a/ibaq/ibaqpy_commons.py b/ibaq/ibaqpy_commons.py index 1d72e1f..daf5cd1 100644 --- a/ibaq/ibaqpy_commons.py +++ b/ibaq/ibaqpy_commons.py @@ -14,7 +14,6 @@ "peptidoform", "sequence", "charge", - "fragment_ion", "isotope_label_type", "channel", "condition", @@ -22,10 +21,8 @@ "run", "fraction", "intensity", - "reference_file_name", "sample_accession", ] - PROTEIN_NAME = "ProteinName" PEPTIDE_SEQUENCE = "PeptideSequence" PEPTIDE_CANONICAL = "PeptideCanonical" @@ -302,19 +299,15 @@ def sum_peptidoform_intensities(dataset: DataFrame) -> DataFrame: :param dataset: Dataframe to be analyzed :return: dataframe with the intensities """ - dataset = dataset[dataset[NORM_INTENSITY].notna()] - normalize_df = dataset.groupby( - [PEPTIDE_CANONICAL, SAMPLE_ID, BIOREPLICATE, CONDITION], observed=True - )[NORM_INTENSITY].sum() - normalize_df = normalize_df.reset_index() - normalize_df = pd.merge( - normalize_df, - dataset[[PROTEIN_NAME, PEPTIDE_CANONICAL, SAMPLE_ID, BIOREPLICATE, CONDITION]], - how="left", - on=[PEPTIDE_CANONICAL, SAMPLE_ID, BIOREPLICATE, CONDITION], - ) - normalize_df.drop_duplicates(inplace=True) - return normalize_df + dataset.dropna(subset=[NORM_INTENSITY], inplace=True) + dataset.drop(['PeptideSequence','PrecursorCharge'],axis=1,inplace=True) + dataset = dataset[[PROTEIN_NAME, PEPTIDE_CANONICAL, SAMPLE_ID, BIOREPLICATE, CONDITION,NORM_INTENSITY]] + dataset.loc[:,'NormIntensity'] = dataset.groupby( + [PEPTIDE_CANONICAL, SAMPLE_ID, BIOREPLICATE, CONDITION],observed=True + )[NORM_INTENSITY].transform('sum') + dataset = dataset.drop_duplicates() + dataset.reset_index(inplace=True,drop=True) + return dataset def parse_uniprot_accession(uniprot_id: str) -> str: @@ -365,21 +358,22 @@ def get_peptidoform_normalize_intensities( :param higher_intensity: select based on normalize intensity, if false based on best scored peptide :return: """ - dataset = dataset[dataset[NORM_INTENSITY].notna()] + dataset.dropna(subset=[NORM_INTENSITY], inplace=True) if higher_intensity: dataset = dataset.loc[ dataset.groupby( [PEPTIDE_SEQUENCE, PEPTIDE_CHARGE, SAMPLE_ID, CONDITION, BIOREPLICATE], observed=True, )[NORM_INTENSITY].idxmax() - ].reset_index(drop=True) + ] else: dataset = dataset.loc[ dataset.groupby( [PEPTIDE_SEQUENCE, PEPTIDE_CHARGE, SAMPLE_ID, CONDITION, BIOREPLICATE], observed=True, )[SEARCH_ENGINE].idxmax() - ].reset_index(drop=True) + ] + dataset.reset_index(drop=True,inplace=True) return dataset @@ -389,18 +383,15 @@ def average_peptide_intensities(dataset: DataFrame) -> DataFrame: :param dataset: Dataframe containing all the peptidoforms :return: New dataframe """ - dataset_df = dataset.groupby( - [PEPTIDE_CANONICAL, SAMPLE_ID, CONDITION], observed=True - )[NORM_INTENSITY].median() - dataset_df = dataset_df.reset_index() - dataset_df = pd.merge( - dataset_df, - dataset[[PROTEIN_NAME, PEPTIDE_CANONICAL, SAMPLE_ID, CONDITION]], - how="left", - on=[PEPTIDE_CANONICAL, SAMPLE_ID, CONDITION], - ) - dataset_df.drop_duplicates(inplace=True) - 
return dataset_df + dataset.dropna(subset=[NORM_INTENSITY], inplace=True) + dataset.drop(['BioReplicate'],axis=1,inplace=True) + dataset.loc[:,'NormIntensity'] = dataset.groupby( + [PEPTIDE_CANONICAL, SAMPLE_ID, CONDITION],observed=True + )[NORM_INTENSITY].transform('median') + dataset = dataset.drop_duplicates() + dataset.reset_index(inplace=True,drop=True) + + return dataset # Functions needed by Combiner diff --git a/ibaqpy.egg-info/SOURCES.txt b/ibaqpy.egg-info/SOURCES.txt index b2c3106..1338c34 100644 --- a/ibaqpy.egg-info/SOURCES.txt +++ b/ibaqpy.egg-info/SOURCES.txt @@ -7,6 +7,7 @@ bin/compute_tpa.py bin/datasets_merger.py bin/merge_condition_files.py bin/normalize_methods.py +bin/parquet.py bin/peptide_normalization.py bin/tsne_visualization.py ibaq/__init__.py
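
A note on the pattern used in the rewritten helpers above: replacing the
groupby()/merge() round trip with groupby().transform() broadcasts each
group's aggregate back onto every row of the frame, so a single
drop_duplicates() recovers the reduced table without a join. A minimal,
self-contained sketch of the idea (the column names and values below are
illustrative only, not the module's constants):

    import pandas as pd

    df = pd.DataFrame(
        {
            "PeptideCanonical": ["AAK", "AAK", "CCR"],  # hypothetical peptides
            "SampleID": ["S1", "S1", "S1"],
            "NormIntensity": [1.0, 2.0, 5.0],
        }
    )

    # transform("sum") keeps the frame's shape, so no merge is needed afterwards.
    df["NormIntensity"] = df.groupby(
        ["PeptideCanonical", "SampleID"], observed=True
    )["NormIntensity"].transform("sum")

    # Rows within a group become identical and collapse to one, e.g. AAK/S1 -> 3.0.
    result = df.drop_duplicates().reset_index(drop=True)
    print(result)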