issue #234 added support for PCT linetype
bikegeek committed Nov 22, 2023
1 parent 2ea203e commit 408152d
Showing 1 changed file with 178 additions and 100 deletions.
278 changes: 178 additions & 100 deletions METreformat/write_stat_ascii.py
@@ -17,7 +17,7 @@

# pylint:disable=no-member
# constants exist in constants.py

import gc
import sys
import os
import logging
@@ -123,13 +123,7 @@ def write_stat_ascii(self, stat_data: pd.DataFrame, parms: dict) -> pd.DataFrame
# stat_bcl/bcu, stat_ncl/ncu columns. Other plots, like the histogram plots (rank, relative, probability)
# and ROC diagrams, require specific formatting.

# ToDo Determine if we need to separate the linetypes based on formatting
wide_to_long_format = [cn.FHO, cn.CNT, cn.VCNT, cn.CTC, cn.CTS, cn.MCTS, cn.SL1L2, cn.ECNT]
roc_format = [cn.PCT]
hist_format = [cn.RHIST, cn.PHIST, cn.RELP]

working_df = stat_data.copy(deep=True)
stat_data.to_csv('/Users/minnawin/RRFS/feature_234_RRFS_linetypes/METdataio/METreformat/output/pct_prc_pjc_pstd_raw.csv')
linetype_requested = parms['line_type']
if linetype_requested in supported_linetypes:
working_df = working_df.loc[working_df['line_type'] == linetype_requested]
@@ -158,7 +152,6 @@
# METdbLoad read_data_files module.
# stat_data = stat_data.fillna('NA')
working_df = working_df.fillna('NA')
working_df.to_csv('/Users/minnawin/RRFS/feature_234_rrfs_linetypes/METdataio/METreformat/output/raw_cnt_only.csv')
begin_reformat = time.perf_counter()
reformatted_df = self.process_by_stat_linetype(linetype_requested, working_df)
end_reformat = time.perf_counter()
@@ -250,8 +243,8 @@ def process_by_stat_linetype(self, linetype: str, stat_data: pd.DataFrame):
def process_pct(self, stat_data: pd.DataFrame) -> pd.DataFrame:
"""
Retrieve the PCT linetype data (Contingency count for probabilistic data) and reshape it
(wide to long format) to enable METplotpy to ingest the data and generate plots.
Take into account that this linetype consists of a variable number
(wide to long format) to enable METplotpy to ingest the data and generate ROC diagram plots.
Take into account that this line type consists of a variable number
of fields/columns that appear after the N_THRESH column (i.e. THRESH_i, OY_i, ON_i).
Arguments:
@@ -261,12 +254,9 @@ def process_pct(self, stat_data: pd.DataFrame) -> pd.DataFrame:
linetype_data: the input dataframe reformatted into long format
"""
linetype: str = cn.PCT


# Determine how many columns are between the TOTAL column and the fields/columns of variable
# number (i.e. THRESH_i, OY_i, ON_i).
name_of_col_after_total:str = cn.LINE_VAR_COUNTER[linetype]
# Determine the columns for this line type
linetype: str = cn.PCT

# Number of columns after the N_THRESH column (i.e. THRESH_i, RANK_i, BIN_i, etc.)
num_repeating_col_labels: int = int(cn.LINE_VAR_REPEATS[linetype])
@@ -276,22 +266,38 @@ def process_pct(self, stat_data: pd.DataFrame) -> pd.DataFrame:

# Add 1 for THRESH_N, the last threshold value column
total_number_variable_columns = num_thresh * num_repeating_col_labels + 1
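# e.g., with num_thresh = 5 and the three repeating PCT labels (THRESH_i, OY_i, ON_i),
# this gives 5 * 3 + 1 = 16 variable columns, the +1 being the final THRESH_N column.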
print(f"{total_number_variable_columns} total number of PCT-specific columns ")

# Add 1 for the TOTAL column to get the total number of columns for this data
# Add 1 for the TOTAL column to get the total number of columns for this line type
total_number_relevant_columns = cn.NUM_STATIC_PCT_COLS + total_number_variable_columns + 1

#
# Subset the input dataframe to include only the PCT columns and label the remaining
# "unlabelled" (i.e. labelled with numbers after data is read in by METdbLoad)
# columns/headers.
#

# Subset the dataframe to only the PCT line type rows
# stat_data_copy: pd.DataFrame = stat_data.copy(deep=True)
# Do not assume that the input data contains only the PCT lines.
stat_data_copy = stat_data.loc[stat_data['line_type'] == cn.PCT]
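# e.g., a run that also wrote PSTD, PRC, and PJC lines yields a mixed dataframe here;
# only the PCT rows are carried forward for reshaping.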

# Get a list of names of the columns that correspond to the PCT linetype for this data
all_columns = stat_data.columns
only_relevant_columns = stat_data.columns.tolist()[0:total_number_relevant_columns]
only_relevant_columns = stat_data_copy.columns.tolist()[0:total_number_relevant_columns]

# Subset the input dataframe to include only the PCT columns and label the remaining
# unlabelled (numbered) columns/headers.
filtered_df = stat_data[only_relevant_columns]
filtered_df = stat_data_copy[only_relevant_columns]
headers = filtered_df.columns

# Identify the common headers to be used in indexing the dataframe.
common_list = headers[0:cn.NUM_STATIC_PCT_COLS].to_list()
common_list[cn.NUM_STATIC_PCT_COLS - 1] = 'total'
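# common_list now holds the static PCT headers (e.g. version, model, ..., line_type)
# with the last entry set to 'total'; these serve as the id columns for the melts below.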

working_df = filtered_df.copy(deep=True)

# Replace the numbered labels with the TOTAL and N_THRESH labels
# Remove the stat_data_copy dataframe, it is no longer needed.
del stat_data_copy
gc.collect()

# Replace the first two numbered labels (following the LINETYPE column) with the TOTAL and N_THRESH labels
working_df.rename(columns={'0':'total', cn.LINE_VAR_COUNTER[cn.PCT]:'n_thresh'}, inplace=True)
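# e.g., assuming cn.LINE_VAR_COUNTER[cn.PCT] is '1', the numbered column '0' becomes
# 'total' and column '1' becomes 'n_thresh'.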

# Relabel the remaining numbered column headers
@@ -302,22 +308,122 @@ def process_pct(self, stat_data: pd.DataFrame) -> pd.DataFrame:
working_df.rename(columns={str(last_column_name):thresh_n_col_name}, inplace=True)

# Relabel the repeating columns (THRESH_i, OY_i, ON_i)
idx = 1
# column names are numbered '1','2','3',...,etc. Give them descriptive labels: thresh_1, oy_1, on_1, etc.
ith_value_label = []
column_name_value = int(cn.LINE_VAR_COUNTER[cn.PCT]) + 1
for i in range(int(cn.LINE_VAR_COUNTER[cn.PCT]), int(num_thresh) +1):

for i in range(int(cn.LINE_VAR_COUNTER[cn.PCT]), int(num_thresh) + 1):
for column in cn.LC_PCT_VARIABLE_HEADERS:
column_name = str(column_name_value)
column_label = "{label}_{idx}".format(label=column, idx=i)
working_df.rename(columns={column_name:column_label}, inplace=True)
column_name_value += 1

# Now reformat the columns so all thresh_1, thresh_2, ..., etc values go under the thresh_i column,
# the oy_1, oy_2, ..., etc. values go under the oy_i column, and the on_1, on_2, ...,etc. values
# go under the on_i column. The nth threshold goes under the i_value column (ie threshold 1,..., n).
# Add a list used to facilitate creating the value_i column when reformatting.
ith_value_label.append("{label}_{idx}".format(label="value", idx=i))
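# e.g., assuming the loop starts at 1 and num_thresh = 2, the numbered headers become
# thresh_1, oy_1, on_1, thresh_2, oy_2, on_2, and ith_value_label is ['value_1', 'value_2'].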

# Create a dataframe consisting only of the value_1, ..., value_n values and their corresponding index values
# and concat to the working_df.
num_rows = working_df.shape[0]

# Create a dictionary of values corresponding to each value_1, value_2, etc 'key'
value_i_dict = {}

# Create a dataframe with the same number of rows
# as the working_df dataframe to enable concatenation.

for label in ith_value_label:
values_list = []
match = re.match(r'(value_)(\d+)', label)
ith_value = int(match.group(2))

for i in range(1, num_rows + 1):
values_list.append(ith_value)

value_i_dict[label] = values_list

return working_df
# Create the dataframe of the value_i values.
value_df = pd.DataFrame.from_dict(value_i_dict)
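# e.g., with two input rows and num_thresh = 2, value_df has columns value_1 and value_2
# holding [1, 1] and [2, 2]: one constant threshold index per column.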

# Reindex working_df to match the value_df index. This ensures correct concatenation of
# the working_df with the value_df
working_df_reindexed = working_df.reset_index(drop=False)
working_df_reindexed = pd.concat([working_df_reindexed, value_df], axis=1)

# Clean up working_df dataframe, it is no longer needed
del working_df
del value_df
gc.collect()

# Now reformat the columns so all thresh_1, thresh_2, ..., etc values go under the thresh_i column,
# the oy_1, oy_2, ..., etc. values go under the oy_i column, and the on_1, on_2, ...,etc. values
# go under the on_i column. The corresponding threshold level/number/index for the thresh_i, oy_i and on_i
# columns goes under the i_value column (ie threshold 1,..., n).

# Work on a copy of the working_df to avoid working on a fragmented dataframe (i.e. avoid the
# PerformanceWarning).
working_copy_df = working_df_reindexed.copy(deep=True)

# Clean up
del working_df_reindexed
gc.collect()

thresh_cols = []
oy_cols = []
on_cols = []
i_value = []
working_headers = working_copy_df.columns.to_list()
remaining_columns = working_headers[cn.NUM_STATIC_PCT_COLS:]
for cur in remaining_columns:
match_thresh = re.match(r'(thresh_)(\d+)', cur)
match_oy = re.match(r'(oy_)(\d+)', cur)
match_on = re.match(r'(on_)(\d+)', cur)
match_val = re.match(r'(value_)(\d+)', cur)
if match_thresh:
thresh_cols.append(cur)
elif match_oy:
oy_cols.append(cur)
elif match_on:
on_cols.append(cur)
elif match_val:
i_value.append(cur)

# The last threshold value, thresh_n, isn't used; remove it from the list
thresh_cols = thresh_cols[:-1]

# Now apply melt to get the thresh_i, oy_i, on_i, and i_value columns;
# include the n_thresh column for indexing.
common_list.append('n_thresh')
df_thresh = working_copy_df.melt(id_vars=common_list, value_vars=thresh_cols, var_name='thresh', value_name='thresh_i')
df_oy = working_copy_df.melt(id_vars=common_list, value_vars=oy_cols, var_name='oy', value_name='oy_i')
df_on = working_copy_df.melt(id_vars=common_list, value_vars=on_cols, var_name='on', value_name='on_i')
df_values = working_copy_df.melt(id_vars=common_list, value_vars=i_value, var_name='values', value_name='i_value')

# Drop the unused var_names in the melted dataframes
df_thresh.drop('thresh', axis=1, inplace=True)
df_oy.drop('oy', axis=1, inplace=True)
df_on.drop('on', axis=1, inplace=True)
df_values.drop('values', axis=1, inplace=True)

# Reindex to use the common columns before concatenating the melted dataframes to avoid duplication of
# common columns.
df_thresh_reindex = df_thresh.set_index(common_list, drop=True, append=False, inplace=False)
df_oy_reindex = df_oy.set_index(common_list, drop=True, append=False, inplace=False)
df_on_reindex = df_on.set_index(common_list, drop=True, append=False, inplace=False)
df_values_reindex = df_values.set_index(common_list, drop=True, append=False, inplace=False)
reformatted_df = pd.concat([df_thresh_reindex, df_oy_reindex, df_on_reindex, df_values_reindex], axis=1)
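# e.g., a single input PCT row with num_thresh = 2 now occupies two long-format rows:
# (thresh_1, oy_1, on_1, i_value=1) and (thresh_2, oy_2, on_2, i_value=2).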

# clean up
del working_copy_df
gc.collect()

# reset the index so all columns are same level
reformatted_df = reformatted_df.reset_index(drop=False)

# Convert the n_thresh values to integers
reformatted_df = reformatted_df.astype({"n_thresh":np.int16})

return reformatted_df
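
# Illustrative call (variable names are hypothetical), assuming stat_data was read in by
# the METdbLoad read_data_files module:
#   pct_long = self.process_pct(stat_data)
#   roc_cols = pct_long[['thresh_i', 'oy_i', 'on_i', 'i_value']]  # columns a ROC diagram consumes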


def process_fho(self, stat_data: pd.DataFrame) -> pd.DataFrame:
@@ -327,7 +433,7 @@ def process_fho(self, stat_data: pd.DataFrame) -> pd.DataFrame:
stat_name, stat_value, stat_bcl, stat_bcu, stat_ncu, and stat_ncl
Arguments:
@param stat_data: The dataframe containing all the original data from
@param stat_data: The dataframe containing the data from
the MET .stat file.
Returns:
@@ -404,7 +510,7 @@ def process_cnt(self, stat_data: pd.DataFrame) -> pd.DataFrame:
specifically for the CNT line type data.
Arguments:
@param stat_data: the dataframe containing all the data from the MET .stat
@param stat_data: the dataframe containing data from the MET .stat
file.
Returns:
@@ -496,7 +602,7 @@ def process_vcnt(self, stat_data: pd.DataFrame) -> pd.DataFrame:
specifically for the VCNT line type data.
Arguments:
@param stat_data: the dataframe containing all the data from the MET .stat
@param stat_data: the dataframe containing all the VCNT data from the MET .stat
file.
Returns:
@@ -642,74 +748,6 @@ def process_ctc(self, stat_data: pd.DataFrame) -> pd.DataFrame:

return linetype_data

# def process_mctc(self, stat_data: pd.DataFrame) -> pd.DataFrame:
# """
# Reshape the data from the original MET output file (stat_data)
# into new statistics columns:
# stat_name, stat_value specifically for the MCTC (multi-category
# contingency table counts) line type
# data.
#
# Arguments:
# @param stat_data: the dataframe containing all the data from the MET .stat
# file.
#
# Returns:
# linetype_data: the reshaped pandas dataframe with statistics data
# reorganized into the stat_name and
# stat_value, stat_ncl, stat_ncu, stat_bcl,
# and stat_bcu columns.
#
# """
# # !!!!!!!!
# # TODO
# # Need to correctly implement this to support the variable number or
# # Fi_Oj columns
# # !!!!!!!!
#
# # Relevant columns for the MCTC line type
# linetype: str = cn.MCTC
# end = cn.NUM_STAT_MCTC_COLS + 1
# mctc_columns_to_use: List[str] =\
# np.arange(0, end).tolist()
#
# # Subset original dataframe to one containing only the MCTC data
# mctc_df: pd.DataFrame = stat_data[stat_data['line_type'] == linetype].iloc[:,
# mctc_columns_to_use]
#
# # Add the stat columns header names for the MCTC line type
# mctc_columns: List[str] = cn.MCTC_HEADERS
# mctc_df.columns: List[str] = mctc_columns
#
# # Create another index column to preserve the index values from the stat_data
# # dataframe (ie the dataframe
# # containing the original data from the MET output file).
# idx = list(mctc_df.index)
#
# # Work on a copy of the mctc_df dataframe to avoid a possible PerformanceWarning
# # message due to a fragmented dataframe.
# mctc_df_copy = mctc_df.copy()
# mctc_df_copy.insert(loc=0, column='Idx', value=idx)
#
# # Now apply melt to get the stat_name and stat_values from the statistics
#
# # Columns we don't want to stack (i.e. treat these columns as a multi index)
# id_vars_list = ['Idx'] + cn.LC_COMMON_STAT_HEADER + ['total']
# linetype_data = mctc_df_copy.melt(id_vars=id_vars_list,
# value_vars=cn.MCTC_STATISTICS_HEADERS,
# var_name='stat_name',
# value_name='stat_value').sort_values('Idx')
#
# # MCTC line type doesn't have the ncl, ncu, bcl and bcu stat values set these
# # to NA
# na_column: List[str] = ['NA' for _ in range(0, linetype_data.shape[0])]
#
# linetype_data['stat_ncl']: pd.Series = na_column
# linetype_data['stat_ncu']: pd.Series = na_column
# linetype_data['stat_bcl']: pd.Series = na_column
# linetype_data['stat_bcu']: pd.Series = na_column
#
# return linetype_data

def process_cts(self, stat_data: pd.DataFrame) -> pd.DataFrame:
"""
@@ -1168,6 +1206,12 @@ def main():
except yaml.YAMLError as exc:
print(exc)

# Check that the config file has all the necessary settings and values
config_file_ok = config_file_complete(parms)
if not config_file_ok:
raise ValueError("Configuration file is missing one or more required settings and/or values.")
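
# An illustrative YAML config (paths are hypothetical) that satisfies this check:
#   output_dir: /path/to/output
#   output_filename: pct_reformatted.txt
#   input_data_dir: /path/to/met/stat/files
#   log_directory: /path/to/logs
#   log_filename: reformat.log
#   log_level: INFO
#   line_type: PCT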

# Replacing the need for an XML specification file, pass in the XMLLoadFile and
# ReadDataFile parameters
rdf_obj: ReadDataFiles = ReadDataFiles()
@@ -1200,6 +1244,40 @@
# stat_lines_obj.write_stat_ascii(file_df, parms, logger)
stat_lines_obj.write_stat_ascii(file_df, parms)

def config_file_complete(parms):
"""
Determines whether the config file contains all the necessary settings and values.
Input:
parms: the dictionary of settings read from the YAML config file
Returns:
True if all expected settings are found, False otherwise.
"""

# Check for the log directory, log filename, log level, line type, output_dir, output_filename, and input_data_dir
expected_settings = ['output_dir', 'output_filename', 'input_data_dir', 'log_directory', 'log_filename', 'log_level',
'line_type']
actual_keys = []
for k, v in parms.items():
actual_keys.append(k)
if v is None:
msg = "ERROR: Missing the value for the " + k + " setting."
logger.error(msg)
return False

for expected in expected_settings:
if expected not in actual_keys:
msg = "ERROR: The " + expected + " setting is missing in the YAML config file."
logger.error(msg)
return False

return True



if __name__ == "__main__":

