issue #234 added support for PCT linetype
bikegeek committed Nov 22, 2023
1 parent 2ea203e commit 408152d
Showing 1 changed file with 178 additions and 100 deletions.
278 changes: 178 additions & 100 deletions METreformat/write_stat_ascii.py
@@ -17,7 +17,7 @@

# pylint:disable=no-member
# constants exist in constants.py

import gc
import sys
import os
import logging
@@ -123,13 +123,7 @@ def write_stat_ascii(self, stat_data: pd.DataFrame, parms: dict) -> pd.DataFrame
# stat_bcl/bcu, stat_ncl/ncu columns. Other plots, like the histogram plots (rank, relative, probability)
# and ROC diagrams, require specific formatting.

# ToDo Determine if we need to separate the linetypes based on formatting
wide_to_long_format = [cn.FHO, cn.CNT, cn.VCNT, cn.CTC, cn.CTS, cn.MCTS, cn.SL1L2, cn.ECNT]
roc_format = [cn.PCT]
hist_format = [cn.RHIST, cn.PHIST, cn.RELP]

working_df = stat_data.copy(deep=True)
stat_data.to_csv('/Users/minnawin/RRFS/feature_234_RRFS_linetypes/METdataio/METreformat/output/pct_prc_pjc_pstd_raw.csv')
linetype_requested = parms['line_type']
if linetype_requested in supported_linetypes:
working_df = working_df.loc[working_df['line_type'] == linetype_requested]
@@ -158,7 +152,6 @@
# METdbLoad read_data_files module.
# stat_data = stat_data.fillna('NA')
working_df = working_df.fillna('NA')
working_df.to_csv('/Users/minnawin/RRFS/feature_234_rrfs_linetypes/METdataio/METreformat/output/raw_cnt_only.csv')
begin_reformat = time.perf_counter()
reformatted_df = self.process_by_stat_linetype(linetype_requested, working_df)
end_reformat = time.perf_counter()
@@ -250,8 +243,8 @@ def process_by_stat_linetype(self, linetype: str, stat_data: pd.DataFrame):
def process_pct(self, stat_data: pd.DataFrame) -> pd.DataFrame:
"""
Retrieve the PCT linetype data (Contingency count for probabilistic data) and reshape it
(wide to long format) to enable METplotpy to ingest the data and generate plots.
Take into account that this linetype consists of a variable number
(wide to long format) to enable METplotpy to ingest the data and generate ROC diagram plots.
Take into account that this line type consists of a variable number
of fields/columns that appear after the N_THRESH column (i.e. THRESH_i, OY_i, ON_i).
Arguments:
@@ -261,12 +254,9 @@ def process_pct(self, stat_data: pd.DataFrame) -> pd.DataFrame:
linetype_data: the input dataframe reformatted into long format
"""
linetype: str = cn.PCT


# Determine how many columns are between the TOTAL column and the fields/columns of variable
# number (i.e. THRESH_i, OY_i, ON_i).
name_of_col_after_total:str = cn.LINE_VAR_COUNTER[linetype]
# Determine the columns for this line type
linetype: str = cn.PCT

# Number of columns after the N_THRESH column (i.e. THRESH_i, RANK_i, BIN_i, etc.)
num_repeating_col_labels: int = int(cn.LINE_VAR_REPEATS[linetype])
@@ -276,22 +266,38 @@ def process_pct(self, stat_data: pd.DataFrame) -> pd.DataFrame:

# Add 1 for THRESH_N, the last threshold value column
total_number_variable_columns = num_thresh * num_repeating_col_labels + 1
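# e.g., with num_thresh = 5 and the three repeating PCT labels (THRESH_i, OY_i, ON_i),
# this gives 5 * 3 + 1 = 16 variable columns, the +1 being the final THRESH_N column.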
print(f"{total_number_variable_columns} total number of PCT-specific columns ")

# Add 1 for the TOTAL column to get the total number of columns for this data
# Add 1 for the TOTAL column to get the total number of columns for this line type
total_number_relevant_columns = cn.NUM_STATIC_PCT_COLS + total_number_variable_columns + 1

#
# Subset the input dataframe to include only the PCT columns and label the remaining
# "unlabelled" (i.e. labelled with numbers after data is read in by METdbLoad)
# columns/headers.
#

# Subset the dataframe to only the PCT line type rows
# stat_data_copy: pd.DataFrame = stat_data.copy(deep=True)
# Do not assume that the input data contains only the PCT lines.
stat_data_copy = stat_data.loc[stat_data['line_type'] == cn.PCT]
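# e.g., a run that also wrote PSTD, PRC, and PJC lines yields a mixed dataframe here;
# only the PCT rows are carried forward for reshaping.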

# Get a list of names of the columns that correspond to the PCT linetype for this data
all_columns = stat_data.columns
only_relevant_columns = stat_data.columns.tolist()[0:total_number_relevant_columns]
only_relevant_columns = stat_data_copy.columns.tolist()[0:total_number_relevant_columns]

# Subset the input dataframe to include only the PCT columns and label the remaining
# unlabelled (numbered) columns/headers.
filtered_df = stat_data[only_relevant_columns]
filtered_df = stat_data_copy[only_relevant_columns]
headers = filtered_df.columns

# Identify the common headers to be used in indexing the dataframe.
common_list = headers[0:cn.NUM_STATIC_PCT_COLS].to_list()
common_list[cn.NUM_STATIC_PCT_COLS - 1] = 'total'
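# common_list now holds the static PCT headers (e.g. version, model, ..., line_type)
# with the last entry set to 'total'; these serve as the id columns for the melts below.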

working_df = filtered_df.copy(deep=True)

# Replace the numbered labels with the TOTAL and N_THRESH labels
# Remove the stat_data_copy dataframe, it is no longer needed.
del stat_data_copy
gc.collect()

# Replace the first two numbered labels (following the LINETYPE column) with the TOTAL and N_THRESH labels
working_df.rename(columns={'0':'total', cn.LINE_VAR_COUNTER[cn.PCT]:'n_thresh'}, inplace=True)
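# e.g., assuming cn.LINE_VAR_COUNTER[cn.PCT] is '1', the numbered column '0' becomes
# 'total' and column '1' becomes 'n_thresh'.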

# Relabel the remaining numbered column headers
@@ -302,22 +308,122 @@ def process_pct(self, stat_data: pd.DataFrame) -> pd.DataFrame:
working_df.rename(columns={str(last_column_name):thresh_n_col_name}, inplace=True)

# Relabel the repeating columns (THRESH_i, OY_i, ON_i)
idx = 1
# column names are numbered '1','2','3',...,etc. Give them descriptive labels: thresh_1, oy_1, on_1, etc.
ith_value_label = []
column_name_value = int(cn.LINE_VAR_COUNTER[cn.PCT]) + 1
for i in range(int(cn.LINE_VAR_COUNTER[cn.PCT]), int(num_thresh) +1):

for i in range(int(cn.LINE_VAR_COUNTER[cn.PCT]), int(num_thresh) + 1):
for column in cn.LC_PCT_VARIABLE_HEADERS:
column_name = str(column_name_value)
column_label = "{label}_{idx}".format(label=column, idx=i)
working_df.rename(columns={column_name:column_label}, inplace=True)
column_name_value += 1

# Now reformat the columns so all thresh_1, thresh_2, ..., etc values go under the thresh_i column,
# the oy_1, oy_2, ..., etc. values go under the oy_i column, and the on_1, on_2, ...,etc. values
# go under the on_i column. The nth threshold goes under the i_value column (ie threshold 1,..., n).
# Add a list used to facilitate creating the value_i column when reformatting.
ith_value_label.append("{label}_{idx}".format(label="value", idx=i))
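# e.g., assuming the loop starts at 1 and num_thresh = 2, the numbered headers become
# thresh_1, oy_1, on_1, thresh_2, oy_2, on_2, and ith_value_label is ['value_1', 'value_2'].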

# Create a dataframe consisting only of the value_1, ..., value_n values and their corresponding index values
# and concat to the working_df.
num_rows = working_df.shape[0]

# Create a dictionary of values corresponding to each value_1, value_2, etc 'key'
value_i_dict = {}

# Create a dataframe with the same number of rows
# as the working_df dataframe to enable concatenation.

for label in ith_value_label:
values_list = []
match = re.match(r'(value_)(\d+)', label)
ith_value = int(match.group(2))

for i in range(1, num_rows + 1):
values_list.append(ith_value)

value_i_dict[label] = values_list

return working_df
# Create the dataframe of the value_i values.
value_df = pd.DataFrame.from_dict(value_i_dict)
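# e.g., with two input rows and num_thresh = 2, value_df has columns value_1 and value_2
# holding [1, 1] and [2, 2]: one constant threshold index per column.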

# Reindex working_df to match the value_df index. This ensures correct concatenation of
# the working_df with the value_df
working_df_reindexed = working_df.reset_index(drop=False)
working_df_reindexed = pd.concat([working_df_reindexed, value_df], axis=1)

# Clean up working_df dataframe, it is no longer needed
del working_df
del value_df
gc.collect()

# Now reformat the columns so all thresh_1, thresh_2, ..., etc values go under the thresh_i column,
# the oy_1, oy_2, ..., etc. values go under the oy_i column, and the on_1, on_2, ...,etc. values
# go under the on_i column. The corresponding threshold level/number/index for the thresh_i, oy_i and on_i
# columns goes under the i_value column (ie threshold 1,..., n).

# Work on a copy of the working_df to avoid working on a fragmented dataframe (i.e. avoid the
# PerformanceWarning).
working_copy_df = working_df_reindexed.copy(deep=True)

# Clean up
del working_df_reindexed
gc.collect()

thresh_cols = []
oy_cols = []
on_cols = []
i_value = []
working_headers = working_copy_df.columns.to_list()
remaining_columns = working_headers[cn.NUM_STATIC_PCT_COLS:]
for cur in remaining_columns:
match_thresh = re.match(r'(thresh_)(\d+)', cur)
match_oy = re.match(r'(oy_)(\d+)', cur)
match_on = re.match(r'(on_)(\d+)', cur)
match_val = re.match(r'(value_)(\d+)', cur)
if match_thresh:
thresh_cols.append(cur)
elif match_oy:
oy_cols.append(cur)
elif match_on:
on_cols.append(cur)
elif match_val:
i_value.append(cur)

# The last threshold value, thresh_n, isn't used; remove it from the list
thresh_cols = thresh_cols[:-1]

# Now apply melt to get the thresh_i, oy_i, on_i, and i_value columns;
# include the n_thresh column for indexing.
common_list.append('n_thresh')
df_thresh = working_copy_df.melt(id_vars=common_list, value_vars=thresh_cols, var_name='thresh', value_name='thresh_i')
df_oy = working_copy_df.melt(id_vars=common_list, value_vars=oy_cols, var_name='oy', value_name='oy_i')
df_on = working_copy_df.melt(id_vars=common_list, value_vars=on_cols, var_name='on', value_name='on_i')
df_values = working_copy_df.melt(id_vars=common_list, value_vars=i_value, var_name='values', value_name='i_value')

# Drop the unused var_names in the melted dataframes
df_thresh.drop('thresh', axis=1, inplace=True)
df_oy.drop('oy', axis=1, inplace=True)
df_on.drop('on', axis=1, inplace=True)
df_values.drop('values', axis=1, inplace=True)

# Reindex to use the common columns before concatenating the melted dataframes to avoid duplication of
# common columns.
df_thresh_reindex = df_thresh.set_index(common_list, drop=True, append=False, inplace=False)
df_oy_reindex = df_oy.set_index(common_list, drop=True, append=False, inplace=False)
df_on_reindex = df_on.set_index(common_list, drop=True, append=False, inplace=False)
df_values_reindex = df_values.set_index(common_list, drop=True, append=False, inplace=False)
reformatted_df = pd.concat([df_thresh_reindex, df_oy_reindex, df_on_reindex, df_values_reindex], axis=1)
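# e.g., a single input PCT row with num_thresh = 2 now occupies two long-format rows:
# (thresh_1, oy_1, on_1, i_value=1) and (thresh_2, oy_2, on_2, i_value=2).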

# clean up
del working_copy_df
gc.collect()

# reset the index so all columns are same level
reformatted_df = reformatted_df.reset_index(drop=False)

# Convert the n_thresh values to integers
reformatted_df = reformatted_df.astype({"n_thresh":np.int16})

return reformatted_df
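
# Illustrative call (variable names are hypothetical), assuming stat_data was read in by
# the METdbLoad read_data_files module:
#   pct_long = self.process_pct(stat_data)
#   roc_cols = pct_long[['thresh_i', 'oy_i', 'on_i', 'i_value']]  # columns a ROC diagram consumes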


def process_fho(self, stat_data: pd.DataFrame) -> pd.DataFrame:
@@ -327,7 +433,7 @@ def process_fho(self, stat_data: pd.DataFrame) -> pd.DataFrame:
stat_name, stat_value, stat_bcl, stat_bcu, stat_ncu, and stat_ncl
Arguments:
@param stat_data: The dataframe containing all the original data from
@param stat_data: The dataframe containing the data from
the MET .stat file.
Returns:
@@ -404,7 +510,7 @@ def process_cnt(self, stat_data: pd.DataFrame) -> pd.DataFrame:
specifically for the CNT line type data.
Arguments:
@param stat_data: the dataframe containing all the data from the MET .stat
@param stat_data: the dataframe containing data from the MET .stat
file.
Returns:
@@ -496,7 +602,7 @@ def process_vcnt(self, stat_data: pd.DataFrame) -> pd.DataFrame:
specifically for the VCNT line type data.
Arguments:
@param stat_data: the dataframe containing all the data from the MET .stat
@param stat_data: the dataframe containing all the VCNT data from the MET .stat
file.
Returns:
@@ -642,74 +748,6 @@ def process_ctc(self, stat_data: pd.DataFrame) -> pd.DataFrame:

return linetype_data

# def process_mctc(self, stat_data: pd.DataFrame) -> pd.DataFrame:
# """
# Reshape the data from the original MET output file (stat_data)
# into new statistics columns:
# stat_name, stat_value specifically for the MCTC (multi-category
# contingency table counts) line type
# data.
#
# Arguments:
# @param stat_data: the dataframe containing all the data from the MET .stat
# file.
#
# Returns:
# linetype_data: the reshaped pandas dataframe with statistics data
# reorganized into the stat_name and
# stat_value, stat_ncl, stat_ncu, stat_bcl,
# and stat_bcu columns.
#
# """
# # !!!!!!!!
# # TODO
# # Need to correctly implement this to support the variable number or
# # Fi_Oj columns
# # !!!!!!!!
#
# # Relevant columns for the MCTC line type
# linetype: str = cn.MCTC
# end = cn.NUM_STAT_MCTC_COLS + 1
# mctc_columns_to_use: List[str] =\
# np.arange(0, end).tolist()
#
# # Subset original dataframe to one containing only the MCTC data
# mctc_df: pd.DataFrame = stat_data[stat_data['line_type'] == linetype].iloc[:,
# mctc_columns_to_use]
#
# # Add the stat columns header names for the MCTC line type
# mctc_columns: List[str] = cn.MCTC_HEADERS
# mctc_df.columns: List[str] = mctc_columns
#
# # Create another index column to preserve the index values from the stat_data
# # dataframe (ie the dataframe
# # containing the original data from the MET output file).
# idx = list(mctc_df.index)
#
# # Work on a copy of the mctc_df dataframe to avoid a possible PerformanceWarning
# # message due to a fragmented dataframe.
# mctc_df_copy = mctc_df.copy()
# mctc_df_copy.insert(loc=0, column='Idx', value=idx)
#
# # Now apply melt to get the stat_name and stat_values from the statistics
#
# # Columns we don't want to stack (i.e. treat these columns as a multi index)
# id_vars_list = ['Idx'] + cn.LC_COMMON_STAT_HEADER + ['total']
# linetype_data = mctc_df_copy.melt(id_vars=id_vars_list,
# value_vars=cn.MCTC_STATISTICS_HEADERS,
# var_name='stat_name',
# value_name='stat_value').sort_values('Idx')
#
# # MCTC line type doesn't have the ncl, ncu, bcl and bcu stat values set these
# # to NA
# na_column: List[str] = ['NA' for _ in range(0, linetype_data.shape[0])]
#
# linetype_data['stat_ncl']: pd.Series = na_column
# linetype_data['stat_ncu']: pd.Series = na_column
# linetype_data['stat_bcl']: pd.Series = na_column
# linetype_data['stat_bcu']: pd.Series = na_column
#
# return linetype_data

def process_cts(self, stat_data: pd.DataFrame) -> pd.DataFrame:
"""
@@ -1168,6 +1206,12 @@ def main():
except yaml.YAMLError as exc:
print(exc)

# Check that the config file has all the necessary settings and values
config_file_ok = config_file_complete(parms)
if not config_file_ok:
raise ValueError("Configuration file is missing one or more required settings and/or values.")
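
# An illustrative YAML config (paths are hypothetical) that satisfies this check:
#   output_dir: /path/to/output
#   output_filename: pct_reformatted.txt
#   input_data_dir: /path/to/met/stat/files
#   log_directory: /path/to/logs
#   log_filename: reformat.log
#   log_level: INFO
#   line_type: PCT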

# Replacing the need for an XML specification file, pass in the XMLLoadFile and
# ReadDataFile parameters
rdf_obj: ReadDataFiles = ReadDataFiles()
@@ -1200,6 +1244,40 @@
# stat_lines_obj.write_stat_ascii(file_df, parms, logger)
stat_lines_obj.write_stat_ascii(file_df, parms)

def config_file_complete(parms):
"""
Determines whether the config file contains all the necessary settings and values.
Input:
parms: the dictionary of settings read from the YAML config file
Returns:
True if all expected settings are found, False otherwise.
"""

# Check for the log directory, log filename, log level, line type, output_dir, output_filename, and input_data_dir
expected_settings = ['output_dir', 'output_filename', 'input_data_dir', 'log_directory', 'log_filename', 'log_level',
'line_type']
actual_keys = []
for k, v in parms.items():
actual_keys.append(k)
if v is None:
msg = "ERROR: Missing the value for the " + k + " setting."
logger.error(msg)
return False

for expected in expected_settings:
if expected not in actual_keys:
msg = "ERROR: The " + expected + " setting is missing in the YAML config file."
logger.error(msg)
return False

return True



if __name__ == "__main__":

