Merge pull request #20 from abzer005/Abzer-branch
Fixed issues
axelwalter authored Apr 16, 2024
2 parents 4f0edae + 877a43c commit 9a264ab
Showing 6 changed files with 70 additions and 34 deletions.
11 changes: 9 additions & 2 deletions pages/1_📁_Data_Preparation.py
@@ -32,22 +32,25 @@
if file_origin == "Small example dataset for testing":
ft, md = load_example()

elif file_origin == "GNPS(2) task ID" or file_origin == "Example dataset from publication" or file_origin == "GNPS2 classical molecular networking (CMN)":
elif file_origin in ["GNPS(2) task ID", "Example dataset from publication", "GNPS2 classical molecular networking (CMN)"]:
st.warning("💡 This tool only supports task ID from GNPS1 and 2 not from Quickstart GNPS1.")
if file_origin == "Example dataset from publication":
task_id_default = "b661d12ba88745639664988329c1363e" # 63e8b3da08df41fe95031e4710e0476b
disabled = True
cmn_flag = False
elif file_origin == "GNPS2 classical molecular networking (CMN)":
task_id_default = "" # 2a65f90094654235a4c8d337fdca11e1
disabled = False
cmn_flag = True
else:
task_id_default = ""
disabled = False
cmn_flag = False
task_id = st.text_input("GNPS task ID", task_id_default, disabled=disabled)
_, c2, _ = st.columns(3)

if c2.button("Load files from GNPS", type="primary", disabled=len(task_id) == 0, use_container_width=True):
st.session_state["ft_gnps"], st.session_state["md_gnps"] = load_from_gnps(task_id, cmn=True)
st.session_state["ft_gnps"], st.session_state["md_gnps"] = load_from_gnps(task_id, cmn= cmn_flag)

if not st.session_state["ft_gnps"].empty and st.session_state["md_gnps"].empty:
st.warning("Meta data is empty. Please upload one.")
@@ -202,6 +205,8 @@
show_table(ft, "blank-features-removed")

st.session_state['blank_removal_done'] = True
else:
st.session_state['blank_removal_done'] = False

if not ft.empty:
cutoff_LOD = get_cutoff_LOD(ft)
@@ -224,6 +229,8 @@
st.warning(f"Can't impute with random values between 1 and lowest value, which is {cutoff_LOD} (rounded).")

st.session_state['imputation_done'] = True
else:
st.session_state['imputation_done'] = False

with tabs[2]:
normalization_method = st.radio("data normalization method", ["None",
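The new else branches close a small state-tracking gap: when a user unticks a preprocessing step, its "done" flag is now explicitly reset instead of keeping the value from an earlier run. A minimal Streamlit sketch of that pattern (the checkbox label is illustrative; only the flag name comes from the diff):

```python
import streamlit as st

# Each optional preprocessing step records in session state whether it actually
# ran, and resets its flag when the user switches the step off, so downstream
# pages never act on stale results.
remove_blanks = st.checkbox("Remove blank features")

if remove_blanks:
    # ... blank removal on the feature table would happen here ...
    st.session_state["blank_removal_done"] = True
else:
    st.session_state["blank_removal_done"] = False
```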
23 changes: 13 additions & 10 deletions pages/5_Random_Forest.py
@@ -28,16 +28,19 @@
random_seed = 123 if use_random_seed else None

if c2.button("Run supervised learning", type="primary"):
df_oob, df_important_features, log, class_report, label_mapping, test_confusion_df, train_confusion_df, test_accuracy, train_accuracy = run_random_forest(st.session_state.rf_attribute, st.session_state.rf_n_trees, random_seed)
st.session_state['df_oob'] = df_oob
st.session_state['df_important_features'] = df_important_features
st.session_state['log'] = log
st.session_state['class_report'] = class_report
st.session_state['label_mapping'] = label_mapping
st.session_state['test_confusion_df'] = test_confusion_df
st.session_state['train_confusion_df'] = train_confusion_df
st.session_state['test_accuracy'] = test_accuracy
st.session_state['train_accuracy'] = train_accuracy
try:
df_oob, df_important_features, log, class_report, label_mapping, test_confusion_df, train_confusion_df, test_accuracy, train_accuracy = run_random_forest(st.session_state.rf_attribute, st.session_state.rf_n_trees, random_seed)
st.session_state['df_oob'] = df_oob
st.session_state['df_important_features'] = df_important_features
st.session_state['log'] = log
st.session_state['class_report'] = class_report
st.session_state['label_mapping'] = label_mapping
st.session_state['test_confusion_df'] = test_confusion_df
st.session_state['train_confusion_df'] = train_confusion_df
st.session_state['test_accuracy'] = test_accuracy
st.session_state['train_accuracy'] = train_accuracy
except Exception as e:
st.error(f"Failed to run model due to: {str(e)}")

if 'df_important_features' in st.session_state and not st.session_state.df_important_features.empty:
tabs = st.tabs(["📈 Analyze optimum number of trees",
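Wrapping the training call in try/except turns a hard crash (for example, a stratification error on a class with too few samples) into a readable message on the page. A generic sketch of the pattern, with a placeholder train_model() standing in for run_random_forest:

```python
import streamlit as st

def train_model():
    # Stand-in for the real training routine; training can fail, e.g. when a
    # class has too few samples for a stratified train/test split.
    raise ValueError("The least populated class in y has only 1 member")

if st.button("Run supervised learning", type="primary"):
    try:
        # Persist the result in session state so it survives Streamlit reruns.
        st.session_state["model_result"] = train_model()
    except Exception as e:
        # Report the failure in the UI instead of letting the page crash.
        st.error(f"Failed to run model due to: {str(e)}")
```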
2 changes: 2 additions & 0 deletions src/anova.py
@@ -80,6 +80,8 @@ def get_anova_plot(anova):
yaxis_title="-log(p)",
showlegend=False
)
fig.update_yaxes(title_standoff=10)

# fig.update_yaxes(title_font_size=20)
# fig.update_xaxes(title_font_size=20)

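For reference, title_standoff sets the distance in pixels between an axis title and its tick labels. A tiny Plotly sketch with dummy data (not the app's ANOVA results):

```python
import numpy as np
import plotly.express as px

# Dummy -log(p) scatter standing in for the ANOVA plot.
p_values = np.array([0.04, 0.2, 0.001, 0.5, 0.03])
fig = px.scatter(x=np.arange(len(p_values)), y=-np.log(p_values))
fig.update_layout(xaxis_title="feature", yaxis_title="-log(p)", showlegend=False)
fig.update_yaxes(title_standoff=10)  # fixed 10 px gap between y-axis title and ticks
fig.show()
```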
36 changes: 25 additions & 11 deletions src/fileselection.py
@@ -55,34 +55,48 @@ def load_example():

@st.cache_data
def load_from_gnps(task_id, cmn=False):

try: # GNPS2 will run here
ft = workflow_fbmn.get_quantification_dataframe(task_id, gnps2=True)
md = workflow_fbmn.get_metadata_dataframe(task_id, gnps2=True).set_index("filename")
an = taskresult.get_gnps2_task_resultfile_dataframe(task_id, "nf_output/library/merged_results_with_gnps.tsv")[["#Scan#", "Compound_Name"]].set_index("#Scan#")
except urllib.error.HTTPError: # GNPS1 task IDs can not be retrieved and throw HTTP Error 500
except urllib.error.HTTPError as e:
print(f"HTTP Error encountered: {e}") # GNPS1 task IDs can not be retrieved and throw HTTP Error 500
if cmn:
ft_url = f"https://gnps2.org/resultfile?task={task_id}&file=nf_output/clustering/featuretable_reformatted_precursorintensity.csv"
md_url = f"https://gnps2.org/resultfile?task={task_id}&file=nf_output/metadata/merged_metadata.tsv"

ft = pd.read_csv(ft_url)
try:
md = pd.read_csv(md_url, sep = "\t", index_col="filename")
except pd.errors.EmptyDataError:
md = pd.DataFrame()

else:
ft_url = f"https://proteomics2.ucsd.edu/ProteoSAFe/DownloadResultFile?task={task_id}&file=quantification_table_reformatted/&block=main"
md_url = f"https://proteomics2.ucsd.edu/ProteoSAFe/DownloadResultFile?task={task_id}&file=metadata_merged/&block=main"
ft = pd.read_csv(ft_url)
try:
md = pd.read_csv(md_url, sep = "\t", index_col="filename")
except pd.errors.EmptyDataError:
md = pd.DataFrame()
if not cmn:
an_url = f"https://proteomics2.ucsd.edu/ProteoSAFe/DownloadResultFile?task={task_id}&file=DB_result/&block=main"

ft = pd.read_csv(ft_url)
md = pd.read_csv(md_url, sep="\t", index_col="filename")
an = pd.read_csv(an_url, sep = "\t")[["#Scan#", "Compound_Name"]].set_index("#Scan#")

if md.empty: # Handle empty metadata
md = pd.DataFrame()

if cmn:
ft.index = ft["row ID"].astype(str)
ft = ft.drop(columns=["row m/z", "row retention time", "row ID"])

else:
index_with_mz_RT = pd.Index(ft.apply(lambda x: f'{x["row ID"]}_{round(x["row m/z"], 4)}_{round(x["row retention time"], 2)}', axis=1))
index_with_mz_RT = ft.apply(lambda x: f'{x["row ID"]}_{round(x["row m/z"], 4)}_{round(x["row retention time"], 2)}', axis=1)
ft.index = index_with_mz_RT
st.session_state["df_gnps_annotations"].index = index_with_mz_RT
st.session_state["df_gnps_annotations"]["GNPS annotation"] = ft["row ID"].apply(lambda x: an.loc[x, "Compound_Name"] if x in an.index else pd.NA)
st.session_state["df_gnps_annotations"].dropna(inplace=True)
if 'df_gnps_annotations' in st.session_state:
st.session_state["df_gnps_annotations"].index = index_with_mz_RT
st.session_state["df_gnps_annotations"]["GNPS annotation"] = ft["row ID"].apply(lambda x: an.loc[x, "Compound_Name"] if x in an.index else pd.NA)
st.session_state["df_gnps_annotations"].dropna(inplace=True)

ft.index.name = 'metabolite'
return ft, md


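In the non-CMN branch the combined m/z / retention-time index is now built as a plain Series (the previous pd.Index wrapper was redundant), and df_gnps_annotations is only touched when it already exists in session state. A self-contained sketch of how that index is constructed, using a toy feature table in the FBMN quantification layout assumed by the loader:

```python
import pandas as pd

# Toy feature table with the columns the loader expects from GNPS/FBMN.
ft = pd.DataFrame({
    "row ID": [1, 2],
    "row m/z": [301.1407, 455.2901],
    "row retention time": [3.45, 7.89],
    "sample1.mzML Peak area": [1.2e6, 3.4e5],
})

# Build a readable index "<row ID>_<m/z>_<RT>", used to line the feature table
# up with the GNPS annotation table.
index_with_mz_RT = ft.apply(
    lambda x: f'{x["row ID"]}_{round(x["row m/z"], 4)}_{round(x["row retention time"], 2)}',
    axis=1,
)
ft.index = index_with_mz_RT
print(ft.index.tolist())  # ['1_301.1407_3.45', '2_455.2901_7.89']
```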
9 changes: 8 additions & 1 deletion src/pca.py
@@ -1,18 +1,25 @@
import streamlit as st
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import pandas as pd
import plotly.express as px
import numpy as np


@st.cache_data
def get_pca_df(scaled, n=5):
<<<<<<< HEAD
# calculating Principal components
pca = PCA(n_components=n)
pca_df = pd.DataFrame(
data=pca.fit_transform(scaled), columns=[f"PC{x}" for x in range(1, n + 1)]
=======

# calculating Principal components
pca = PCA(n_components=n)
pca_df = pd.DataFrame(
data=pca.fit_transform(scaled),
columns=[f"PC{x}" for x in range(1, n + 1)]
>>>>>>> efdd76467755ddb96598832b0740cb7149a9cefb
)
pca_df.index = scaled.index
return pca.explained_variance_ratio_, pca_df
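Both sides of the merge conflict shown above build the same PCA table and differ only in line breaks. A standalone usage sketch of that computation on toy data (the example data and scaling step are illustrative, not the app's pipeline):

```python
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Toy intensity matrix: 6 samples x 4 metabolite features.
rng = np.random.default_rng(0)
data = pd.DataFrame(rng.lognormal(size=(6, 4)),
                    index=[f"sample{i}" for i in range(1, 7)],
                    columns=[f"m{i}" for i in range(1, 5)])

# Scale, then compute principal components the same way get_pca_df does
# (minus the Streamlit cache decorator).
scaled = pd.DataFrame(StandardScaler().fit_transform(data),
                      index=data.index, columns=data.columns)
n = 3
pca = PCA(n_components=n)
pca_df = pd.DataFrame(pca.fit_transform(scaled),
                      columns=[f"PC{x}" for x in range(1, n + 1)])
pca_df.index = scaled.index
print(pca.explained_variance_ratio_.round(3))
print(pca_df)
```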
23 changes: 13 additions & 10 deletions src/randomforest.py
@@ -3,12 +3,11 @@
import numpy as np
import plotly.express as px
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import class_weight
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, accuracy_score
from io import StringIO

@st.cache_data
def run_random_forest(attribute, n_trees, random_seed=None):
@@ -36,13 +35,16 @@ def run_random_forest(attribute, n_trees, random_seed=None):
# Extract the feature intensities as np 2D array
features = np.array(st.session_state.data)

# Determine the smallest class size and adjust test_size accordingly
unique, counts = np.unique(labels, return_counts=True)
min_class_count = min(counts)
min_test_size = float(len(unique)) / len(labels)

# Split the data into training and test sets
train_features, test_features, train_labels, test_labels = train_test_split(features,
labels,
test_size=0.25,
random_state= random_seed,
stratify=labels)
# Adjust test size to be larger of the calculated min_test_size or the initial_test_size
adjusted_test_size = max(min_test_size, 0.25)

train_features, test_features, train_labels, test_labels = train_test_split(
features, labels, test_size= adjusted_test_size, random_state=random_seed, stratify=labels)

# Collecting info about feature and label shapes for logging
log += f"Training Features Shape: {train_features.shape}\n"
@@ -61,7 +63,7 @@ def run_random_forest(attribute, n_trees, random_seed=None):
weights[w] = sklearn_weights[i]

# Set up the random forest classifier with n_trees trees, balanced class weights, and a random state to make it reproducible
rf = RandomForestClassifier(n_estimators=n_trees, class_weight='balanced', random_state=random_seed)
rf = RandomForestClassifier(n_estimators=n_trees, class_weight= weights, random_state=random_seed)

# Fit the classifier to the training set
rf.fit(train_features, train_labels)
@@ -106,9 +108,10 @@ def run_random_forest(attribute, n_trees, random_seed=None):
df_important_features = pd.DataFrame(rf.feature_importances_,
index=st.session_state.data.columns).sort_values(by=0, ascending=False)
df_important_features.columns = ["importance"]

return df_oob, df_important_features, log, class_report, label_mapping, test_confusion_df, train_confusion_df, test_accuracy, train_accuracy


def get_oob_fig(df):
return px.line(df, x="n trees", y="error rate", title="out-of-bag (OOB) error")

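The key change in run_random_forest is the test-split size: a stratified split needs at least as many test samples as there are classes, so the code raises test_size to n_classes / n_samples whenever the default 25 % would be too small. A small self-contained sketch of that adjustment (toy labels, not the app's metadata):

```python
import numpy as np
from sklearn.model_selection import train_test_split

# Toy labels: 5 classes over 16 samples. A 25 % test split would hold only
# 4 samples, fewer than the 5 classes, which stratified splitting rejects.
labels = np.array(["A"] * 4 + ["B"] * 4 + ["C"] * 4 + ["D"] * 2 + ["E"] * 2)
features = np.arange(len(labels), dtype=float).reshape(-1, 1)

n_classes = len(np.unique(labels))
min_test_size = n_classes / len(labels)        # 5 / 16 = 0.3125
adjusted_test_size = max(min_test_size, 0.25)  # enlarge the split only when needed

train_X, test_X, train_y, test_y = train_test_split(
    features, labels, test_size=adjusted_test_size,
    random_state=123, stratify=labels)
print(adjusted_test_size, len(test_y))  # 0.3125 -> 5 test samples (>= 5 classes)
```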
