diff --git "a/pages/1_\360\237\223\201_Data_Preparation.py" "b/pages/1_\360\237\223\201_Data_Preparation.py" index afbe859..a3e7a1f 100644 --- "a/pages/1_\360\237\223\201_Data_Preparation.py" +++ "b/pages/1_\360\237\223\201_Data_Preparation.py" @@ -32,22 +32,25 @@ if file_origin == "Small example dataset for testing": ft, md = load_example() - elif file_origin == "GNPS(2) task ID" or file_origin == "Example dataset from publication" or file_origin == "GNPS2 classical molecular networking (CMN)": + elif file_origin in ["GNPS(2) task ID", "Example dataset from publication", "GNPS2 classical molecular networking (CMN)"]: st.warning("💡 This tool only supports task ID from GNPS1 and 2 not from Quickstart GNPS1.") if file_origin == "Example dataset from publication": task_id_default = "b661d12ba88745639664988329c1363e" # 63e8b3da08df41fe95031e4710e0476b disabled = True + cmn_flag = False elif file_origin == "GNPS2 classical molecular networking (CMN)": task_id_default = "" # 2a65f90094654235a4c8d337fdca11e1 disabled = False + cmn_flag = True else: task_id_default = "" disabled = False + cmn_flag = False task_id = st.text_input("GNPS task ID", task_id_default, disabled=disabled) _, c2, _ = st.columns(3) if c2.button("Load files from GNPS", type="primary", disabled=len(task_id) == 0, use_container_width=True): - st.session_state["ft_gnps"], st.session_state["md_gnps"] = load_from_gnps(task_id, cmn=True) + st.session_state["ft_gnps"], st.session_state["md_gnps"] = load_from_gnps(task_id, cmn= cmn_flag) if not st.session_state["ft_gnps"].empty and st.session_state["md_gnps"].empty: st.warning("Meta data is empty. Please upload one.") @@ -202,6 +205,8 @@ show_table(ft, "blank-features-removed") st.session_state['blank_removal_done'] = True + else: + st.session_state['blank_removal_done'] = False if not ft.empty: cutoff_LOD = get_cutoff_LOD(ft) @@ -224,6 +229,8 @@ st.warning(f"Can't impute with random values between 1 and lowest value, which is {cutoff_LOD} (rounded).") st.session_state['imputation_done'] = True + else: + st.session_state['imputation_done'] = False with tabs[2]: normalization_method = st.radio("data normalization method", ["None", diff --git a/pages/5_Random_Forest.py b/pages/5_Random_Forest.py index 6e1c9aa..5ec7cd1 100644 --- a/pages/5_Random_Forest.py +++ b/pages/5_Random_Forest.py @@ -28,16 +28,19 @@ random_seed = 123 if use_random_seed else None if c2.button("Run supervised learning", type="primary"): - df_oob, df_important_features, log, class_report, label_mapping, test_confusion_df, train_confusion_df, test_accuracy, train_accuracy = run_random_forest(st.session_state.rf_attribute, st.session_state.rf_n_trees, random_seed) - st.session_state['df_oob'] = df_oob - st.session_state['df_important_features'] = df_important_features - st.session_state['log'] = log - st.session_state['class_report'] = class_report - st.session_state['label_mapping'] = label_mapping - st.session_state['test_confusion_df'] = test_confusion_df - st.session_state['train_confusion_df'] = train_confusion_df - st.session_state['test_accuracy'] = test_accuracy - st.session_state['train_accuracy'] = train_accuracy + try: + df_oob, df_important_features, log, class_report, label_mapping, test_confusion_df, train_confusion_df, test_accuracy, train_accuracy = run_random_forest(st.session_state.rf_attribute, st.session_state.rf_n_trees, random_seed) + st.session_state['df_oob'] = df_oob + st.session_state['df_important_features'] = df_important_features + st.session_state['log'] = log + 
diff --git a/pages/5_Random_Forest.py b/pages/5_Random_Forest.py
index 6e1c9aa..5ec7cd1 100644
--- a/pages/5_Random_Forest.py
+++ b/pages/5_Random_Forest.py
@@ -28,16 +28,19 @@
 random_seed = 123 if use_random_seed else None
 
 if c2.button("Run supervised learning", type="primary"):
-    df_oob, df_important_features, log, class_report, label_mapping, test_confusion_df, train_confusion_df, test_accuracy, train_accuracy = run_random_forest(st.session_state.rf_attribute, st.session_state.rf_n_trees, random_seed)
-    st.session_state['df_oob'] = df_oob
-    st.session_state['df_important_features'] = df_important_features
-    st.session_state['log'] = log
-    st.session_state['class_report'] = class_report
-    st.session_state['label_mapping'] = label_mapping
-    st.session_state['test_confusion_df'] = test_confusion_df
-    st.session_state['train_confusion_df'] = train_confusion_df
-    st.session_state['test_accuracy'] = test_accuracy
-    st.session_state['train_accuracy'] = train_accuracy
+    try:
+        df_oob, df_important_features, log, class_report, label_mapping, test_confusion_df, train_confusion_df, test_accuracy, train_accuracy = run_random_forest(st.session_state.rf_attribute, st.session_state.rf_n_trees, random_seed)
+        st.session_state['df_oob'] = df_oob
+        st.session_state['df_important_features'] = df_important_features
+        st.session_state['log'] = log
+        st.session_state['class_report'] = class_report
+        st.session_state['label_mapping'] = label_mapping
+        st.session_state['test_confusion_df'] = test_confusion_df
+        st.session_state['train_confusion_df'] = train_confusion_df
+        st.session_state['test_accuracy'] = test_accuracy
+        st.session_state['train_accuracy'] = train_accuracy
+    except Exception as e:
+        st.error(f"Failed to run model due to: {str(e)}")
 
 if 'df_important_features' in st.session_state and not st.session_state.df_important_features.empty:
     tabs = st.tabs(["📈 Analyze optimum number of trees",
diff --git a/src/anova.py b/src/anova.py
index 196430f..5cbcc29 100644
--- a/src/anova.py
+++ b/src/anova.py
@@ -80,6 +80,8 @@ def get_anova_plot(anova):
         yaxis_title="-log(p)",
         showlegend=False
     )
+    fig.update_yaxes(title_standoff=10)
+
     # fig.update_yaxes(title_font_size=20)
     # fig.update_xaxes(title_font_size=20)
diff --git a/src/fileselection.py b/src/fileselection.py
index 9fb456e..b7644ba 100644
--- a/src/fileselection.py
+++ b/src/fileselection.py
@@ -55,34 +55,48 @@ def load_example():
 
 @st.cache_data
 def load_from_gnps(task_id, cmn=False):
+    try: # GNPS2 will run here
         ft = workflow_fbmn.get_quantification_dataframe(task_id, gnps2=True)
         md = workflow_fbmn.get_metadata_dataframe(task_id, gnps2=True).set_index("filename")
         an = taskresult.get_gnps2_task_resultfile_dataframe(task_id, "nf_output/library/merged_results_with_gnps.tsv")[["#Scan#", "Compound_Name"]].set_index("#Scan#")
-    except urllib.error.HTTPError: # GNPS1 task IDs can not be retrieved and throw HTTP Error 500
+    except urllib.error.HTTPError as e:
+        print(f"HTTP Error encountered: {e}") # GNPS1 task IDs can not be retrieved and throw HTTP Error 500
         if cmn:
             ft_url = f"https://gnps2.org/resultfile?task={task_id}&file=nf_output/clustering/featuretable_reformatted_precursorintensity.csv"
             md_url = f"https://gnps2.org/resultfile?task={task_id}&file=nf_output/metadata/merged_metadata.tsv"
+
+            ft = pd.read_csv(ft_url)
+            try:
+                md = pd.read_csv(md_url, sep = "\t", index_col="filename")
+            except pd.errors.EmptyDataError:
+                md = pd.DataFrame()
+
         else:
             ft_url = f"https://proteomics2.ucsd.edu/ProteoSAFe/DownloadResultFile?task={task_id}&file=quantification_table_reformatted/&block=main"
             md_url = f"https://proteomics2.ucsd.edu/ProteoSAFe/DownloadResultFile?task={task_id}&file=metadata_merged/&block=main"
-        ft = pd.read_csv(ft_url)
-        try:
-            md = pd.read_csv(md_url, sep = "\t", index_col="filename")
-        except pd.errors.EmptyDataError:
-            md = pd.DataFrame()
-        if not cmn:
             an_url = f"https://proteomics2.ucsd.edu/ProteoSAFe/DownloadResultFile?task={task_id}&file=DB_result/&block=main"
+
+            ft = pd.read_csv(ft_url)
+            md = pd.read_csv(md_url, sep="\t", index_col="filename")
             an = pd.read_csv(an_url, sep = "\t")[["#Scan#", "Compound_Name"]].set_index("#Scan#")
+
+            if md.empty: # Handle empty metadata
+                md = pd.DataFrame()
+
     if cmn:
         ft.index = ft["row ID"].astype(str)
         ft = ft.drop(columns=["row m/z", "row retention time", "row ID"])
+
     else:
-        index_with_mz_RT = pd.Index(ft.apply(lambda x: f'{x["row ID"]}_{round(x["row m/z"], 4)}_{round(x["row retention time"], 2)}', axis=1))
+        index_with_mz_RT = ft.apply(lambda x: f'{x["row ID"]}_{round(x["row m/z"], 4)}_{round(x["row retention time"], 2)}', axis=1)
         ft.index = index_with_mz_RT
-        st.session_state["df_gnps_annotations"].index = index_with_mz_RT
-        st.session_state["df_gnps_annotations"]["GNPS annotation"] = ft["row ID"].apply(lambda x: an.loc[x, "Compound_Name"] if x in an.index else pd.NA)
-        st.session_state["df_gnps_annotations"].dropna(inplace=True)
+        if 'df_gnps_annotations' in st.session_state:
+            st.session_state["df_gnps_annotations"].index = index_with_mz_RT
+            st.session_state["df_gnps_annotations"]["GNPS annotation"] = ft["row ID"].apply(lambda x: an.loc[x, "Compound_Name"] if x in an.index else pd.NA)
+            st.session_state["df_gnps_annotations"].dropna(inplace=True)
+
+    ft.index.name = 'metabolite'
     return ft, md
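The empty-metadata handling in load_from_gnps can be exercised on its own; a minimal sketch assuming only pandas, where read_gnps_metadata is a hypothetical stand-in for the inline try/except above, not a function in src/fileselection.py:

import io
import pandas as pd

def read_gnps_metadata(source):
    """Return the metadata table, or an empty DataFrame if the file has no content."""
    try:
        return pd.read_csv(source, sep="\t", index_col="filename")
    except pd.errors.EmptyDataError:
        return pd.DataFrame()

# An empty result file yields an empty frame, which the Data Preparation page
# then reports with "Meta data is empty. Please upload one."
assert read_gnps_metadata(io.StringIO("")).empty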
diff --git a/src/pca.py b/src/pca.py
index d80fae3..0cca8a4 100644
--- a/src/pca.py
+++ b/src/pca.py
@@ -1,18 +1,18 @@
 import streamlit as st
 from sklearn.decomposition import PCA
-from sklearn.preprocessing import StandardScaler
 import pandas as pd
 import plotly.express as px
 import numpy as np
 
+
 @st.cache_data
 def get_pca_df(scaled, n=5):
     # calculating Principal components
     pca = PCA(n_components=n)
     pca_df = pd.DataFrame(
         data=pca.fit_transform(scaled), columns=[f"PC{x}" for x in range(1, n + 1)]
     )
     pca_df.index = scaled.index
     return pca.explained_variance_ratio_, pca_df
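With the unused StandardScaler import dropped, get_pca_df is a plain PCA fit on the already scaled table. A minimal standalone sketch of the same computation without the Streamlit caching; pca_scores and the toy table are illustrative only:

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA

def pca_scores(scaled, n=5):
    # Fit PCA on the scaled feature table and return the explained variance
    # ratios plus a DataFrame of PC scores indexed like the input samples.
    pca = PCA(n_components=n)
    scores = pd.DataFrame(
        pca.fit_transform(scaled),
        columns=[f"PC{x}" for x in range(1, n + 1)],
        index=scaled.index,
    )
    return pca.explained_variance_ratio_, scores

rng = np.random.default_rng(0)
scaled = pd.DataFrame(rng.normal(size=(10, 6)), index=[f"sample{i}" for i in range(10)])
variance_ratio, scores = pca_scores(scaled, n=3)
print(scores.shape, variance_ratio.sum())  # (10, 3) and the fraction of variance kept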
diff --git a/src/randomforest.py b/src/randomforest.py
index 15eeb02..c284128 100644
--- a/src/randomforest.py
+++ b/src/randomforest.py
@@ -3,12 +3,11 @@
 import numpy as np
 import plotly.express as px
 from sklearn.preprocessing import OrdinalEncoder
-from sklearn.model_selection import train_test_split
+from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.utils import class_weight
 from sklearn.metrics import classification_report
 from sklearn.metrics import confusion_matrix, accuracy_score
-from io import StringIO
 
 @st.cache_data
 def run_random_forest(attribute, n_trees, random_seed=None):
@@ -36,13 +35,16 @@
     # Extract the feature intensities as np 2D array
     features = np.array(st.session_state.data)
 
+    # Determine the smallest class size and the smallest test fraction that still allows a stratified split
+    unique, counts = np.unique(labels, return_counts=True)
+    min_class_count = min(counts)
+    min_test_size = float(len(unique)) / len(labels)
 
-    # Split the data into training and test sets
-    train_features, test_features, train_labels, test_labels = train_test_split(features,
-                                                                                labels,
-                                                                                test_size=0.25,
-                                                                                random_state= random_seed,
-                                                                                stratify=labels)
+    # Adjust the test size to the larger of the calculated min_test_size and the default 0.25
+    adjusted_test_size = max(min_test_size, 0.25)
+
+    train_features, test_features, train_labels, test_labels = train_test_split(
+        features, labels, test_size=adjusted_test_size, random_state=random_seed, stratify=labels)
 
     # Collecting info about feature and label shapes for logging
     log += f"Training Features Shape: {train_features.shape}\n"
@@ -61,7 +63,7 @@
         weights[w] = sklearn_weights[i]
 
     # Set up the random forest classifier with n_trees trees, balanced class weights, and a random state to make it reproducible
-    rf = RandomForestClassifier(n_estimators=n_trees, class_weight='balanced', random_state=random_seed)
+    rf = RandomForestClassifier(n_estimators=n_trees, class_weight=weights, random_state=random_seed)
 
     # Fit the classifier to the training set
     rf.fit(train_features, train_labels)
@@ -106,9 +108,10 @@
     df_important_features = pd.DataFrame(rf.feature_importances_, index=st.session_state.data.columns).sort_values(by=0, ascending=False)
     df_important_features.columns = ["importance"]
-    
+
     return df_oob, df_important_features, log, class_report, label_mapping, test_confusion_df, train_confusion_df, test_accuracy, train_accuracy
 
+
 def get_oob_fig(df):
     return px.line(df, x="n trees", y="error rate", title="out-of-bag (OOB) error")
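The split-size and class-weight changes can be seen end to end on a toy dataset; a minimal sketch with made-up data, where the variable names mirror run_random_forest but nothing below is the app's code:

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight

# Toy, imbalanced example: 40 samples, 3 classes.
rng = np.random.default_rng(123)
features = rng.normal(size=(40, 8))
labels = np.array([0] * 25 + [1] * 10 + [2] * 5)

# Keep the test split large enough for at least one sample per class (n_classes / n_samples).
unique, counts = np.unique(labels, return_counts=True)
min_test_size = float(len(unique)) / len(labels)   # 3 / 40 = 0.075 here
adjusted_test_size = max(min_test_size, 0.25)      # so the 0.25 default still applies

train_features, test_features, train_labels, test_labels = train_test_split(
    features, labels, test_size=adjusted_test_size, random_state=123, stratify=labels)

# Explicit per-class weights, computed once from the training labels and passed
# as a dict instead of the 'balanced' shortcut.
sklearn_weights = class_weight.compute_class_weight("balanced", classes=unique, y=train_labels)
weights = dict(zip(unique, sklearn_weights))

rf = RandomForestClassifier(n_estimators=100, class_weight=weights, random_state=123)
rf.fit(train_features, train_labels)
print(round(rf.score(test_features, test_labels), 2), weights)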