Skip to content

Commit

Permalink
update train test split tools
Browse files Browse the repository at this point in the history
  • Loading branch information
qiagu committed Dec 13, 2019
1 parent eacd4c3 commit d6c6607
Show file tree
Hide file tree
Showing 4 changed files with 358 additions and 107 deletions.
5 changes: 1 addition & 4 deletions galaxy_ml/tools/fitted_model_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,6 @@
from galaxy_ml.utils import get_scoring, load_model, read_columns


N_JOBS = int(__import__('os').environ.get('GALAXY_SLOTS', 1))


def _get_X_y(params, infile1, infile2):
""" read from inputs and output X and y
Expand Down Expand Up @@ -123,7 +120,7 @@ def main(inputs, infile_estimator, outfile_eval,
if hasattr(main_est, 'config') and hasattr(main_est, 'load_weights'):
if not infile_weights or infile_weights == 'None':
raise ValueError("The selected model skeleton asks for weights, "
"but dataset for weights wan not selected!")
"but no dataset for weights was provided!")
main_est.load_weights(infile_weights)

# handle scorer, convert to scorer dict
Expand Down
18 changes: 15 additions & 3 deletions galaxy_ml/tools/fitted_model_eval.xml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
<tool id="fitted_model_eval" name="Evaluate a Fitted Model" version="@VERSION@">
<tool id="sklearn_fitted_model_eval" name="Evaluate a Fitted Model" version="@VERSION@">
<description>using a new batch of labeled data</description>
<macros>
<import>main_macros.xml</import>
Expand Down Expand Up @@ -58,14 +58,26 @@
<![CDATA[
**What it does**
Given a fitted estimator and a labeled dataset, this tool outpus the performances of the fitted estimator on the labeled dataset with the selected scorers.
Given a fitted estimator and a labeled dataset, this tool outputs the performances of the fitted estimator on the labeled dataset with selected scorers.
For estimator, this tool supports fitted sklearn estimators (pickled) and trained deep learning models (model skeleton + weights). For input datasets, it supports as follows.
For the estimator, this tool supports fitted sklearn estimators (pickled) and trained deep learning models (model skeleton + weights). For input datasets, it supports the following:
- tabular
- sparse
**Output**
A tabular file containing performance scores,
e.g.:
======== ======== =========
accuracy f1_macro precision
======== ======== =========
0.8613 0.6759 0.7928
======== ======== =========
]]>
</help>
<expand macro="sklearn_citation">
Expand Down
138 changes: 102 additions & 36 deletions galaxy_ml/tools/train_test_split.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,83 @@
import warnings

from galaxy_ml.model_validations import train_test_split
from galaxy_ml.model_validations import OrderedKFold
from galaxy_ml.utils import get_cv, read_columns


def _get_single_cv_split(params, array, infile_labels=None,
                         infile_groups=None):
    """ output (train, test) subset from a cv splitter

    Parameters
    ----------
    params : dict
        Galaxy tool inputs
    array : pandas DataFrame object
        The target dataset to split
    infile_labels : str
        File path to dataset containing target values
    infile_groups : str
        File path to dataset containing group values

    Returns
    -------
    tuple of (pandas DataFrame, pandas DataFrame)
        The (train, test) row subsets of `array` for the requested split.

    Raises
    ------
    ValueError
        If `nth_split` is less than 1 or exceeds the splitter's total
        number of splits.
    """
    y = None
    groups = None

    # 1-based index of the fold the user wants back
    nth_split = params['mode_selection']['nth_split']
    # A value below 1 would silently fall through the selection below and
    # return the wrong (last) split, so fail fast instead.
    if nth_split < 1:
        raise ValueError("`nth_split` must be >= 1, but got "
                         "{}!".format(nth_split))

    # read groups
    if infile_groups:
        header = 'infer' if (params['mode_selection']['cv_selector']
                             ['groups_selector']['header_g']) else None
        column_option = (params['mode_selection']['cv_selector']
                         ['groups_selector']['column_selector_options_g']
                         ['selected_column_selector_option_g'])
        # only these column-selection modes carry an explicit column spec
        if column_option in ['by_index_number', 'all_but_by_index_number',
                             'by_header_name', 'all_but_by_header_name']:
            c = (params['mode_selection']['cv_selector']['groups_selector']
                 ['column_selector_options_g']['col_g'])
        else:
            c = None

        groups = read_columns(infile_groups, c=c, c_option=column_option,
                              sep='\t', header=header, parse_dates=True)
        groups = groups.ravel()

        # get_cv expects the resolved group array in place of the selector
        params['mode_selection']['cv_selector']['groups_selector'] = groups

    # read labels
    if infile_labels:
        target_input = (params['mode_selection']
                        ['cv_selector'].pop('target_input'))
        header = 'infer' if target_input['header1'] else None
        # Galaxy column numbers are 1-based; pandas iloc is 0-based
        col_index = target_input['col'][0] - 1
        df = pd.read_csv(infile_labels, sep='\t', header=header,
                         parse_dates=True)
        y = df.iloc[:, col_index].values

    # construct the cv splitter object
    splitter, groups = get_cv(params['mode_selection']['cv_selector'])

    total_n_splits = splitter.get_n_splits(array.values, y=y, groups=groups)
    if nth_split > total_n_splits:
        raise ValueError("Total number of splits is {}, but got `nth_split` "
                         "= {}".format(total_n_splits, nth_split))

    # advance the splitter to the requested 1-based split
    for i, (train_index, test_index) in enumerate(
            splitter.split(array.values, y=y, groups=groups), start=1):
        if i == nth_split:
            break

    train = array.iloc[train_index, :]
    test = array.iloc[test_index, :]

    return train, test


def main(inputs, infile_array, outfile_train, outfile_test,
infile_labels=None):
infile_labels=None, infile_groups=None):
"""
Parameter
---------
Expand All @@ -21,6 +93,9 @@ def main(inputs, infile_array, outfile_train, outfile_test,
infile_labels : str
File path to dataset containing labels
infile_groups : str
File path to dataset containing groups
outfile_train : str
File path to dataset containing train split
Expand All @@ -32,45 +107,35 @@ def main(inputs, infile_array, outfile_train, outfile_test,
with open(inputs, 'r') as param_handler:
params = json.load(param_handler)

input_header = 'header' in params['infile_info']

input_header = params['header0']
header = 'infer' if input_header else None
array = pd.read_csv(infile_array, sep='\t', header=header,
parse_dates=True)

options = params['options']
shuffle_selection = options.pop('shuffle_selection')
options['shuffle'] = shuffle_selection['shuffle']
if infile_labels:
header = 'infer' if 'header' in shuffle_selection['infile_info']\
else None
col_index = shuffle_selection['col'][0] - 1
df = pd.read_csv(infile_labels, sep='\t', header=header,
parse_dates=True)
labels = df.iloc[:, col_index].values
options['labels'] = labels

if shuffle_selection['shuffle'] == 'ordered_target':
test_size = options['test_size']
if test_size < 1.0:
if test_size > 0.5:
raise ValueError("Ordered Target Split only supports "
"test proportion 0 - 0.5!")
n_splits = round(1 / test_size)
else:
n_samples = array.shape[0]
n_splits = round(n_samples / test_size)
# train test split
if params['mode_selection']['selected_mode'] == 'train_test_split':
options = params['mode_selection']['options']
shuffle_selection = options.pop('shuffle_selection')
options['shuffle'] = shuffle_selection['shuffle']
if infile_labels:
header = 'infer' if shuffle_selection['header1'] else None
col_index = shuffle_selection['col'][0] - 1
df = pd.read_csv(infile_labels, sep='\t', header=header,
parse_dates=True)
labels = df.iloc[:, col_index].values
options['labels'] = labels

splitter = OrderedKFold(n_splits=n_splits, shuffle=True,
random_state=options['random_state'])
train_index, test_index = next(splitter.split(array.values, labels))
train, test = array.iloc[train_index, :], array.iloc[test_index, :]
else:
train, test = train_test_split(array, **options)

print(("Input shape:", array.shape))
print(("Train shape:", train.shape))
print(("Test shape:", test.shape))
# cv splitter
else:
train, test = _get_single_cv_split(params, array,
infile_labels=infile_labels,
infile_groups=infile_groups)

print("Input shape: %s" % repr(array.shape))
print("Train shape: %s" % repr(train.shape))
print("Test shape: %s" % repr(test.shape))
train.to_csv(outfile_train, sep='\t', header=input_header, index=False)
test.to_csv(outfile_test, sep='\t', header=input_header, index=False)

Expand All @@ -79,10 +144,11 @@ def main(inputs, infile_array, outfile_train, outfile_test,
aparser = argparse.ArgumentParser()
aparser.add_argument("-i", "--inputs", dest="inputs", required=True)
aparser.add_argument("-X", "--infile_array", dest="infile_array")
aparser.add_argument("-g", "--infile_labels", dest="infile_labels")
aparser.add_argument("-y", "--infile_labels", dest="infile_labels")
aparser.add_argument("-g", "--infile_groups", dest="infile_groups")
aparser.add_argument("-o", "--outfile_train", dest="outfile_train")
aparser.add_argument("-t", "--outfile_test", dest="outfile_test")
args = aparser.parse_args()

main(args.inputs, args.infile_array, args.outfile_train,
args.outfile_test, args.infile_labels)
args.outfile_test, args.infile_labels, args.infile_groups)
Loading

0 comments on commit d6c6607

Please sign in to comment.