Skip to content

Commit

Permalink
added post-transformer capability, internal col list extension
Browse files Browse the repository at this point in the history
  • Loading branch information
AmandaBirmingham committed Jun 26, 2024
1 parent b8fb46e commit 07e5d4b
Show file tree
Hide file tree
Showing 4 changed files with 114 additions and 80 deletions.
7 changes: 5 additions & 2 deletions qiimp/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
from qiimp.src.util import HOSTTYPE_SHORTHAND_KEY, SAMPLETYPE_SHORTHAND_KEY, \
SAMPLE_TYPE_KEY, QC_NOTE_KEY, LEAVE_BLANK_VAL, DO_NOT_USE_VAL, \
NOT_PROVIDED_VAL, HOST_SUBJECT_ID_KEY, SAMPLE_NAME_KEY, \
COLLECTION_TIMESTAMP_KEY, \
COLLECTION_TIMESTAMP_KEY, METADATA_TRANSFORMERS_KEY, SOURCES_KEY, \
FUNCTION_KEY, PRE_TRANSFORMERS_KEY, POST_TRANSFORMERS_KEY, \
extract_config_dict, deepcopy_dict, load_df_with_best_fit_encoding
from qiimp.src.metadata_extender import \
write_extended_metadata, write_extended_metadata_from_df
Expand All @@ -12,7 +13,9 @@
"SAMPLE_TYPE_KEY", "QC_NOTE_KEY", "LEAVE_BLANK_VAL",
"DO_NOT_USE_VAL", "NOT_PROVIDED_VAL",
"HOST_SUBJECT_ID_KEY", "SAMPLE_NAME_KEY",
"COLLECTION_TIMESTAMP_KEY",
"COLLECTION_TIMESTAMP_KEY", "METADATA_TRANSFORMERS_KEY",
"SOURCES_KEY", "FUNCTION_KEY", "PRE_TRANSFORMERS_KEY",
"POST_TRANSFORMERS_KEY",
"extract_config_dict",
"deepcopy_dict", "load_df_with_best_fit_encoding",
"merge_sample_and_subject_metadata",
Expand Down
96 changes: 21 additions & 75 deletions qiimp/src/metadata_extender.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,14 @@
from datetime import datetime
from qiimp.src.util import extract_config_dict, extract_stds_config, \
deepcopy_dict, validate_required_columns_exist, get_extension, \
load_df_with_best_fit_encoding, \
load_df_with_best_fit_encoding, update_metadata_df_field, \
HOSTTYPE_SHORTHAND_KEY, SAMPLETYPE_SHORTHAND_KEY, \
QC_NOTE_KEY, METADATA_FIELDS_KEY, HOST_TYPE_SPECIFIC_METADATA_KEY, \
SAMPLE_TYPE_SPECIFIC_METADATA_KEY, SAMPLE_TYPE_KEY, QIITA_SAMPLE_TYPE, \
DEFAULT_KEY, REQUIRED_KEY, ALIAS_KEY, BASE_TYPE_KEY, \
LEAVE_BLANK_VAL, SAMPLE_NAME_KEY, \
ALLOWED_KEY, TYPE_KEY, LEAVE_REQUIREDS_BLANK_KEY
ALLOWED_KEY, TYPE_KEY, LEAVE_REQUIREDS_BLANK_KEY, \
PRE_TRANSFORMERS_KEY, POST_TRANSFORMERS_KEY
from qiimp.src.metadata_configurator import combine_stds_and_study_config, \
flatten_nested_stds_dict, update_wip_metadata_dict
from qiimp.src.metadata_validator import validate_metadata_df, \
Expand Down Expand Up @@ -76,7 +77,10 @@ def write_extended_metadata(
def write_extended_metadata_from_df(
raw_metadata_df, study_specific_config_dict, out_dir, out_name_base,
study_specific_transformers_dict=None, sep="\t",
suppress_empty_fails=False):
suppress_empty_fails=False, internal_col_names=None):

if internal_col_names is None:
internal_col_names = INTERNAL_COL_KEYS

validate_required_columns_exist(
raw_metadata_df, REQUIRED_RAW_METADATA_FIELDS,
Expand All @@ -102,7 +106,7 @@ def write_extended_metadata_from_df(
study_specific_config_dict)

_output_to_df(metadata_df, out_dir, out_name_base,
INTERNAL_COL_KEYS, remove_internals=True, sep=sep,
internal_col_names, remove_internals=True, sep=sep,
suppress_empty_fails=suppress_empty_fails)
output_validation_msgs(validation_msgs, out_dir, out_name_base, sep=",",
suppress_empty_fails=suppress_empty_fails)
Expand All @@ -112,79 +116,21 @@ def write_extended_metadata_from_df(
def _populate_metadata_df(
raw_metadata_df, transformer_funcs_dict, main_config_dict):
metadata_df = raw_metadata_df.copy()
_update_metadata_df_field(metadata_df, QC_NOTE_KEY, LEAVE_BLANK_VAL)
update_metadata_df_field(metadata_df, QC_NOTE_KEY, LEAVE_BLANK_VAL)

metadata_df = transform_pre_metadata(
metadata_df, transformer_funcs_dict, main_config_dict)
metadata_df = transformers.transform_metadata(
metadata_df, transformer_funcs_dict, main_config_dict,
PRE_TRANSFORMERS_KEY)

# first, add the metadata for the host types
metadata_df, validation_msgs = _generate_metadata_for_host_types(
metadata_df, main_config_dict)

return metadata_df, validation_msgs

metadata_df = transformers.transform_metadata(
metadata_df, transformer_funcs_dict, main_config_dict,
POST_TRANSFORMERS_KEY)

def transform_pre_metadata(
        pre_metadata_df, transformer_funcs_dict, config_dict):
    """Apply the configured pre-population transformers to a metadata df.

    Parameters
    ----------
    pre_metadata_df : pandas.DataFrame
        Metadata to transform; it is updated in place and also returned.
    transformer_funcs_dict : dict or None
        Study-specific transformer functions keyed by function name.
        These are consulted before the attributes of `transformers`.
    config_dict : dict
        Configuration; transformers are read from its
        "metadata_transformers" -> "pre_population" entry, if present.

    Returns
    -------
    pandas.DataFrame
        The same object passed in as `pre_metadata_df`.

    Raises
    ------
    ValueError
        If a configured function name is found neither in
        `transformer_funcs_dict` nor on `transformers`.
    """
    if transformer_funcs_dict is None:
        transformer_funcs_dict = {}

    metadata_transformers = config_dict.get("metadata_transformers", None)
    if not metadata_transformers:
        return pre_metadata_df

    # The transformers section may legitimately lack a "pre_population"
    # entry; treat a missing or empty entry as "nothing to do" instead of
    # failing on None.items().
    pre_transformers = metadata_transformers.get("pre_population", None)
    if not pre_transformers:
        return pre_metadata_df

    for curr_target_field, curr_transformer_dict in pre_transformers.items():
        curr_source_field = curr_transformer_dict["sources"]
        curr_func_name = curr_transformer_dict["function"]

        try:
            # study-specific (input) transformers take precedence
            curr_func = transformer_funcs_dict[curr_func_name]
        except KeyError:
            try:
                # fall back to the qiimp transformers
                curr_func = getattr(transformers, curr_func_name)
            except AttributeError:
                raise ValueError(
                    f"Unable to find transformer '{curr_func_name}'")

        # apply the function named curr_func_name to the column of the
        # metadata_df named curr_source_field to fill curr_target_field;
        # values already present in the target field are kept
        _update_metadata_df_field(pre_metadata_df, curr_target_field,
                                  curr_func, curr_source_field,
                                  overwrite_non_nans=False)

    return pre_metadata_df


def _update_metadata_df_field(
metadata_df, field_name, field_val_or_func,
source_fields=None, overwrite_non_nans=True):

# Note: function doesn't return anything. Work is done in-place on the
# metadata_df passed in.

if source_fields:
if overwrite_non_nans or (field_name not in metadata_df.columns):
metadata_df[field_name] = \
metadata_df.apply(
lambda row: field_val_or_func(row, source_fields),
axis=1)
else:
# TODO: not yet tested; from StackOverflow
metadata_df.loc[metadata_df[field_name].isnull(), field_name] = \
metadata_df.apply(
lambda row: field_val_or_func(row, source_fields),
axis=1)
# endif overwrite_non_nans for function call
else:
if overwrite_non_nans or (field_name not in metadata_df.columns):
metadata_df[field_name] = field_val_or_func
else:
metadata_df[field_name] = \
metadata_df[field_name].fillna(field_val_or_func)
# metadata_df[field_name].fillna(field_val_or_func, inplace=True)
# endif overwrite_non_nans for constant value
# endif using a function/a constant value
return metadata_df, validation_msgs


def _generate_metadata_for_host_types(
Expand Down Expand Up @@ -232,7 +178,7 @@ def _generate_metadata_for_host_type(
validation_msgs = []
known_host_shorthands = config[HOST_TYPE_SPECIFIC_METADATA_KEY].keys()
if curr_host_type not in known_host_shorthands:
_update_metadata_df_field(
update_metadata_df_field(
host_type_df, QC_NOTE_KEY, "invalid host_type")
# host_type_df[QC_NOTE_KEY] = "invalid host_type"
concatted_df = host_type_df
Expand Down Expand Up @@ -283,14 +229,14 @@ def _update_metadata_from_dict(metadata_df, metadata_fields_dict):
for curr_field_name, curr_field_vals_dict in metadata_fields_dict.items():
if DEFAULT_KEY in curr_field_vals_dict:
curr_default_val = curr_field_vals_dict[DEFAULT_KEY]
_update_metadata_df_field(
update_metadata_df_field(
output_df, curr_field_name, curr_default_val,
overwrite_non_nans=False)
# output_df[curr_field_name] = curr_default_val
elif REQUIRED_KEY in curr_field_vals_dict:
curr_required_val = curr_field_vals_dict[REQUIRED_KEY]
if curr_required_val and curr_field_name not in output_df:
_update_metadata_df_field(
update_metadata_df_field(
output_df, curr_field_name, REQ_PLACEHOLDER)
# note that if the field is (a) required, (b) does not have a
# default value, and (c) IS already in the metadata, it will
Expand All @@ -315,7 +261,7 @@ def _generate_metadata_for_sample_type_in_host(
validation_msgs = []
known_sample_types = host_sample_types_dict.keys()
if sample_type not in known_sample_types:
_update_metadata_df_field(
update_metadata_df_field(
sample_type_df, QC_NOTE_KEY, "invalid sample_type")
# sample_type_df[QC_NOTE_KEY] = "invalid sample_type"
else:
Expand Down Expand Up @@ -370,7 +316,7 @@ def _construct_sample_type_metadata_dict(
# get the base's sample type dict and add this sample type's
# info on top of it
base_sample_dict = host_sample_types_dict[sample_type_base]
if base_sample_dict.keys().to_list() != [METADATA_FIELDS_KEY]:
if list(base_sample_dict.keys()) != [METADATA_FIELDS_KEY]:
raise ValueError(f"Base sample type '{sample_type_base}' "
f"must only have metadata fields")
sample_type_specific_dict_metadata = update_wip_metadata_dict(
Expand Down
53 changes: 51 additions & 2 deletions qiimp/src/metadata_transformers.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,44 @@
import pandas
from dateutil import parser


from qiimp.src.util import METADATA_TRANSFORMERS_KEY, \
SOURCES_KEY, FUNCTION_KEY, \
update_metadata_df_field


# transformer runner function
def transform_metadata(
        metadata_df, transformer_funcs_dict, config_dict, stage_key):
    """Run the transformers configured for one stage on a metadata df.

    Parameters
    ----------
    metadata_df : pandas.DataFrame
        Metadata to transform; it is updated in place and also returned.
    transformer_funcs_dict : dict or None
        Study-specific transformer functions keyed by function name.
        These take precedence over the functions defined in this module.
    config_dict : dict
        Configuration; transformers are read from its
        METADATA_TRANSFORMERS_KEY section, if present.
    stage_key : str
        Key, within the transformers section, of the stage to run.

    Returns
    -------
    pandas.DataFrame
        The same object passed in as `metadata_df`.

    Raises
    ------
    ValueError
        If a configured function name is found neither in
        `transformer_funcs_dict` nor among this module's callables.
    """
    if transformer_funcs_dict is None:
        transformer_funcs_dict = {}

    metadata_transformers = config_dict.get(METADATA_TRANSFORMERS_KEY, None)
    if not metadata_transformers:
        return metadata_df

    # A config may define transformers for only some stages; a missing or
    # empty stage means there is nothing to do (not None.items()).
    stage_transformers = metadata_transformers.get(stage_key, None)
    if not stage_transformers:
        return metadata_df

    for curr_target_field, curr_transformer_dict in \
            stage_transformers.items():
        curr_source_field = curr_transformer_dict[SOURCES_KEY]
        curr_func_name = curr_transformer_dict[FUNCTION_KEY]

        try:
            # study-specific (input) transformers take precedence
            curr_func = transformer_funcs_dict[curr_func_name]
        except KeyError:
            # Fall back to the transformer functions defined in this
            # module.  They must be looked up in the module namespace, not
            # on the stage's config dict.
            curr_func = globals().get(curr_func_name)
            if not callable(curr_func):
                raise ValueError(
                    f"Unable to find transformer '{curr_func_name}'")

        # apply the function named curr_func_name to the column of the
        # metadata_df named curr_source_field to fill curr_target_field;
        # values already present in the target field are kept
        update_metadata_df_field(metadata_df, curr_target_field,
                                 curr_func, curr_source_field,
                                 overwrite_non_nans=False)

    return metadata_df


# individual transformer functions
def pass_through(row, source_fields):
    """Return the value of the row's single source field, unchanged.

    Delegates to `_get_one_source_field`, which raises a ValueError
    unless `source_fields` holds exactly one field name.
    """
    source_val = _get_one_source_field(row, source_fields, "pass_through")
    return source_val

Expand Down Expand Up @@ -65,7 +102,19 @@ def format_a_datetime(row, source_fields):
return formatted_x


# helper functions
def _get_one_source_field(row, source_fields, func_name):
if len(source_fields) != 1:
raise ValueError(f"{func_name} requires exactly one source field")
return row[source_fields[0]]


def _format_field_val(row, source_fields, field_type, format_string):
    """Format the row's single source value if it is of `field_type`.

    The value is fetched via `_get_one_source_field`, so `source_fields`
    must hold exactly one field name.  Values that are not instances of
    `field_type` are returned untouched.
    """
    raw_val = _get_one_source_field(row, source_fields, "format_field_val")

    # field type should be something like float or int
    if not isinstance(raw_val, field_type):
        return raw_val

    # format string should be something like '{0:g}' or '{0:.2f}'
    return format_string.format(raw_val)
38 changes: 37 additions & 1 deletion qiimp/src/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,18 @@
SAMPLE_TYPE_KEY = "sample_type"
QIITA_SAMPLE_TYPE = "qiita_sample_type"
SAMPLE_TYPE_SPECIFIC_METADATA_KEY = "sample_type_specific_metadata"
METADATA_TRANSFORMERS_KEY = "metadata_transformers"
PRE_TRANSFORMERS_KEY = "pre_transformers"
POST_TRANSFORMERS_KEY = "post_transformers"
ALIAS_KEY = "alias"
BASE_TYPE_KEY = "base_type"
DEFAULT_KEY = "default"
REQUIRED_KEY = "required"
ALLOWED_KEY = "allowed"
ANYOF_KEY = "anyof"
TYPE_KEY = "type"
SOURCES_KEY = "sources"
FUNCTION_KEY = "function"
LEAVE_REQUIREDS_BLANK_KEY = "leave_requireds_blank"

# internal code keys
Expand Down Expand Up @@ -121,4 +126,35 @@ def validate_required_columns_exist(


def get_extension(sep):
    """Return the file extension to use for a given field separator.

    Returns "csv" when `sep` is a comma and "txt" for any other separator.
    """
    return "csv" if sep == "," else "txt"


def update_metadata_df_field(
        metadata_df, field_name, field_val_or_func,
        source_fields=None, overwrite_non_nans=True):
    """Set or fill one column of `metadata_df`, in place.

    Parameters
    ----------
    metadata_df : pandas.DataFrame
        Dataframe to modify; the work is done in place.
    field_name : str
        Name of the column to create or update.
    field_val_or_func : object or callable
        If `source_fields` is truthy, a callable invoked as
        ``field_val_or_func(row, source_fields)`` for each row to be
        filled; otherwise a constant value to put in the column.
    source_fields : list or None
        Passed through, unchanged, to `field_val_or_func`.
    overwrite_non_nans : bool
        If True, or if the column does not exist yet, the whole column is
        (re)written.  If False and the column exists, only its null
        entries are filled.

    Returns
    -------
    None
    """
    # An existing column is only partially filled when the caller asked us
    # not to overwrite; in every other case the whole column is written.
    fill_nulls_only = \
        (not overwrite_non_nans) and (field_name in metadata_df.columns)

    if source_fields:
        def _calc_val(row):
            return field_val_or_func(row, source_fields)

        if not fill_nulls_only:
            metadata_df[field_name] = metadata_df.apply(_calc_val, axis=1)
            return

        # Run the function only on the rows that actually need a value, so
        # it is never evaluated (and can never fail) for rows whose existing
        # value is being kept.  The guard avoids applying to an empty frame.
        needs_fill_mask = metadata_df[field_name].isnull()
        if needs_fill_mask.any():
            metadata_df.loc[needs_fill_mask, field_name] = \
                metadata_df.loc[needs_fill_mask].apply(_calc_val, axis=1)
    elif fill_nulls_only:
        metadata_df[field_name] = \
            metadata_df[field_name].fillna(field_val_or_func)
    else:
        metadata_df[field_name] = field_val_or_func

0 comments on commit 07e5d4b

Please sign in to comment.