"""Join image- and segmentation-model suitability scores onto CoastSeg outputs.

The classifier result CSVs (``image_classification_results.csv`` and
``segmentation_classification_results.csv``) key each score by image path,
and the capture date is embedded in the image filename as a
``YYYY-MM-DD-HH-MM-SS`` prefix.  These helpers parse that date out and merge
the scores onto either the transect time-series CSV or the
extracted-shorelines GeoJSON produced by CoastSeg.
"""

import os

import pandas as pd


def drop_columns_if_exist(df, columns):
    """Drop *columns* from ``df`` in place, silently skipping missing names.

    Parameters:
        df (pd.DataFrame): DataFrame to drop columns from (mutated in place).
        columns (list): Column names to drop if present.

    Returns:
        pd.DataFrame: The same DataFrame, returned for chaining.
    """
    present = [col for col in columns if col in df.columns]
    if present:
        df.drop(columns=present, inplace=True)
    return df


def _load_score_table(csv_path):
    """Load a classifier-results CSV and keep only the merge-relevant columns.

    Parses the capture date out of each ``im_paths`` filename (the
    ``YYYY-MM-DD-HH-MM-SS`` prefix before the first underscore) into a UTC
    ``dates`` column, then returns ``dates`` and ``model_scores``, plus
    ``threshold`` when the CSV provides one.
    """
    scores = pd.read_csv(csv_path)
    scores['dates'] = scores['im_paths'].apply(
        lambda p: pd.to_datetime(
            os.path.basename(p).split('_')[0],
            utc=True,
            format='%Y-%m-%d-%H-%M-%S',
        )
    )
    keep = ['dates', 'model_scores']
    if 'threshold' in scores.columns:
        keep.append('threshold')
    return scores[keep]


def join_model_scores_to_time_series(transect_time_series_merged_path,
                                     good_bad_csv=None,
                                     good_bad_seg_csv=None):
    """Join model scores onto transect time-series data by image date.

    Parameters:
        transect_time_series_merged_path (str): Path to the (raw or tidally
            corrected) ``transect_time_series_merged.csv``; overwritten in
            place.
        good_bad_csv (str, optional): Path to the image suitability CSV.
        good_bad_seg_csv (str, optional): Path to the seg filter CSV.

    Returns:
        str: ``transect_time_series_merged_path``, now containing
        ``classifier_*`` / ``segmentation_*`` score columns.
    """
    time_series = pd.read_csv(transect_time_series_merged_path)
    time_series['dates'] = pd.to_datetime(time_series['dates'], utc=True)

    if good_bad_csv:
        # Replace scores from a previous run instead of duplicating them.
        drop_columns_if_exist(
            time_series, ['classifier_model_score', 'classifier_threshold'])
        time_series = time_series.merge(
            _load_score_table(good_bad_csv),
            on='dates',
            how='left',  # keep rows that have no matching score
            suffixes=('_ts', '_image'),
        )
        # Remove stray index columns left behind by earlier index-ful saves.
        time_series.drop(
            columns=[col for col in time_series if 'Unnamed:' in col],
            errors='ignore',
            inplace=True,
        )
        # rename() ignores absent keys, so 'threshold' is safely optional.
        time_series.rename(
            columns={'model_scores': 'classifier_model_score',
                     'threshold': 'classifier_threshold'},
            inplace=True,
        )

    if good_bad_seg_csv:
        drop_columns_if_exist(
            time_series,
            ['segmentation_model_score', 'segmentation_threshold'])
        time_series = time_series.merge(
            _load_score_table(good_bad_seg_csv),
            on='dates',
            how='left',
            suffixes=('', '_seg'),
        )
        time_series.rename(
            columns={'model_scores': 'segmentation_model_score',
                     'threshold': 'segmentation_threshold'},
            inplace=True,
        )

    # index=False: writing the index would resurrect the 'Unnamed: 0'
    # columns on the next read/join cycle.
    time_series.to_csv(transect_time_series_merged_path, index=False)
    return transect_time_series_merged_path


def join_model_scores_to_shorelines(shorelines_path,
                                    good_bad_csv=None,
                                    good_bad_seg_csv=None):
    """Join model scores onto extracted shoreline points/lines by image date.

    Parameters:
        shorelines_path (str): Path to ``extracted_shorelines_points.geojson``
            or ``extracted_shorelines_lines.geojson``; overwritten in place.
        good_bad_csv (str, optional): Path to the image suitability CSV.
        good_bad_seg_csv (str, optional): Path to the seg filter CSV.

    Returns:
        str: ``shorelines_path`` with score columns joined.
    """
    # Local import: only this function needs geopandas, so the rest of the
    # module stays importable where it is not installed.
    import geopandas as gpd

    shorelines_gdf = gpd.read_file(shorelines_path)
    shorelines_gdf['date'] = pd.to_datetime(shorelines_gdf['date'], utc=True)

    if good_bad_csv:
        # Replace scores from a previous run instead of duplicating them.
        drop_columns_if_exist(
            shorelines_gdf, ['classifier_model_score', 'classifier_threshold'])
        shorelines_gdf = shorelines_gdf.merge(
            _load_score_table(good_bad_csv),
            left_on='date',
            right_on='dates',
            how='left',  # match the time-series join: keep unscored shorelines
            suffixes=('', '_image'),
        )
        # Rename model_scores to classifier_model_score (threshold optional).
        shorelines_gdf.rename(
            columns={'model_scores': 'classifier_model_score',
                     'threshold': 'classifier_threshold'},
            inplace=True,
        )

    if good_bad_seg_csv:
        drop_columns_if_exist(
            shorelines_gdf,
            ['segmentation_model_score', 'segmentation_threshold'])
        shorelines_gdf = shorelines_gdf.merge(
            _load_score_table(good_bad_seg_csv),
            left_on='date',
            right_on='dates',
            how='left',
            suffixes=('', '_seg'),
        )
        shorelines_gdf.rename(
            columns={'model_scores': 'segmentation_model_score',
                     'threshold': 'segmentation_threshold'},
            inplace=True,
        )

    # Drop merge-key leftovers and any duplicated columns before saving.
    drop_columns_if_exist(shorelines_gdf, ['dates', 'dates_seg'])
    shorelines_gdf = shorelines_gdf.loc[:, ~shorelines_gdf.columns.duplicated()]
    shorelines_gdf.to_file(shorelines_path)
    return shorelines_path


# Example #1: join scores to extracted shorelines
# join_model_scores_to_shorelines(
#     r"C:\...\sessions\sample_session_demo1\extracted_shorelines_points.geojson",
#     good_bad_csv=r"C:\...\jpg_files\preprocessed\RGB\image_classification_results.csv",
#     good_bad_seg_csv=r"C:\...\sessions\sample_session_demo1\segmentation_classification_results.csv",
# )

# Example #2: join scores to the transect time series
# join_model_scores_to_time_series(
#     r"C:\...\sessions\sample_session_demo1\raw_transect_time_series_merged.csv",
#     good_bad_csv=r"C:\...\jpg_files\preprocessed\RGB\image_classification_results.csv",
#     good_bad_seg_csv=r"C:\...\sessions\sample_session_demo1\segmentation_classification_results.csv",
# )