Skip to content

Commit

Permalink
update train test split tools
Browse files Browse the repository at this point in the history
  • Loading branch information
qiagu committed Dec 13, 2019
1 parent eacd4c3 commit d6c6607
Show file tree
Hide file tree
Showing 4 changed files with 358 additions and 107 deletions.
5 changes: 1 addition & 4 deletions galaxy_ml/tools/fitted_model_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,6 @@
from galaxy_ml.utils import get_scoring, load_model, read_columns


N_JOBS = int(__import__('os').environ.get('GALAXY_SLOTS', 1))


def _get_X_y(params, infile1, infile2):
""" read from inputs and output X and y
Expand Down Expand Up @@ -123,7 +120,7 @@ def main(inputs, infile_estimator, outfile_eval,
if hasattr(main_est, 'config') and hasattr(main_est, 'load_weights'):
if not infile_weights or infile_weights == 'None':
raise ValueError("The selected model skeleton asks for weights, "
"but dataset for weights wan not selected!")
"but no dataset for weights was provided!")
main_est.load_weights(infile_weights)

# handle scorer, convert to scorer dict
Expand Down
18 changes: 15 additions & 3 deletions galaxy_ml/tools/fitted_model_eval.xml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
<tool id="fitted_model_eval" name="Evaluate a Fitted Model" version="@VERSION@">
<tool id="sklearn_fitted_model_eval" name="Evaluate a Fitted Model" version="@VERSION@">
<description>using a new batch of labeled data</description>
<macros>
<import>main_macros.xml</import>
Expand Down Expand Up @@ -58,14 +58,26 @@
<![CDATA[
**What it does**
Given a fitted estimator and a labeled dataset, this tool outpus the performances of the fitted estimator on the labeled dataset with the selected scorers.
Given a fitted estimator and a labeled dataset, this tool outputs the performances of the fitted estimator on the labeled dataset with selected scorers.
For estimator, this tool supports fitted sklearn estimators (pickled) and trained deep learning models (model skeleton + weights). For input datasets, it supports as follows.
For the estimator, this tool supports fitted sklearn estimators (pickled) and trained deep learning models (model skeleton + weights). For input datasets, it supports the following:
- tabular
- sparse
**Output**
A tabular file containing performance scores,
e.g.:
======== ======== =========
accuracy f1_macro precision
======== ======== =========
0.8613 0.6759 0.7928
======== ======== =========
]]>
</help>
<expand macro="sklearn_citation">
Expand Down
138 changes: 102 additions & 36 deletions galaxy_ml/tools/train_test_split.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,83 @@
import warnings

from galaxy_ml.model_validations import train_test_split
from galaxy_ml.model_validations import OrderedKFold
from galaxy_ml.utils import get_cv, read_columns


def _get_single_cv_split(params, array, infile_labels=None,
                         infile_groups=None):
    """ output (train, test) subset from a cv splitter

    Parameters
    ----------
    params : dict
        Galaxy tool inputs
    array : pandas DataFrame object
        The target dataset to split
    infile_labels : str
        File path to dataset containing target values
    infile_groups : str
        File path to dataset containing group values

    Returns
    -------
    tuple of (pandas DataFrame, pandas DataFrame)
        The (train, test) row subsets of `array` for the requested split.

    Raises
    ------
    ValueError
        If `nth_split` is less than 1 or exceeds the splitter's total
        number of splits.
    """
    y = None
    groups = None

    # 1-based index of the fold the user wants back
    nth_split = params['mode_selection']['nth_split']
    # A value below 1 would silently fall through the selection below and
    # return the wrong (last) split, so fail fast instead.
    if nth_split < 1:
        raise ValueError("`nth_split` must be >= 1, but got "
                         "{}!".format(nth_split))

    # read groups
    if infile_groups:
        header = 'infer' if (params['mode_selection']['cv_selector']
                             ['groups_selector']['header_g']) else None
        column_option = (params['mode_selection']['cv_selector']
                         ['groups_selector']['column_selector_options_g']
                         ['selected_column_selector_option_g'])
        # only these column-selection modes carry an explicit column spec
        if column_option in ['by_index_number', 'all_but_by_index_number',
                             'by_header_name', 'all_but_by_header_name']:
            c = (params['mode_selection']['cv_selector']['groups_selector']
                 ['column_selector_options_g']['col_g'])
        else:
            c = None

        groups = read_columns(infile_groups, c=c, c_option=column_option,
                              sep='\t', header=header, parse_dates=True)
        groups = groups.ravel()

        # get_cv expects the resolved group array in place of the selector
        params['mode_selection']['cv_selector']['groups_selector'] = groups

    # read labels
    if infile_labels:
        target_input = (params['mode_selection']
                        ['cv_selector'].pop('target_input'))
        header = 'infer' if target_input['header1'] else None
        # Galaxy column numbers are 1-based; pandas iloc is 0-based
        col_index = target_input['col'][0] - 1
        df = pd.read_csv(infile_labels, sep='\t', header=header,
                         parse_dates=True)
        y = df.iloc[:, col_index].values

    # construct the cv splitter object
    splitter, groups = get_cv(params['mode_selection']['cv_selector'])

    total_n_splits = splitter.get_n_splits(array.values, y=y, groups=groups)
    if nth_split > total_n_splits:
        raise ValueError("Total number of splits is {}, but got `nth_split` "
                         "= {}".format(total_n_splits, nth_split))

    # advance the splitter to the requested 1-based split
    for i, (train_index, test_index) in enumerate(
            splitter.split(array.values, y=y, groups=groups), start=1):
        if i == nth_split:
            break

    train = array.iloc[train_index, :]
    test = array.iloc[test_index, :]

    return train, test


def main(inputs, infile_array, outfile_train, outfile_test,
infile_labels=None):
infile_labels=None, infile_groups=None):
"""
Parameter
---------
Expand All @@ -21,6 +93,9 @@ def main(inputs, infile_array, outfile_train, outfile_test,
infile_labels : str
File path to dataset containing labels
infile_groups : str
File path to dataset containing groups
outfile_train : str
File path to dataset containing train split
Expand All @@ -32,45 +107,35 @@ def main(inputs, infile_array, outfile_train, outfile_test,
with open(inputs, 'r') as param_handler:
params = json.load(param_handler)

input_header = 'header' in params['infile_info']

input_header = params['header0']
header = 'infer' if input_header else None
array = pd.read_csv(infile_array, sep='\t', header=header,
parse_dates=True)

options = params['options']
shuffle_selection = options.pop('shuffle_selection')
options['shuffle'] = shuffle_selection['shuffle']
if infile_labels:
header = 'infer' if 'header' in shuffle_selection['infile_info']\
else None
col_index = shuffle_selection['col'][0] - 1
df = pd.read_csv(infile_labels, sep='\t', header=header,
parse_dates=True)
labels = df.iloc[:, col_index].values
options['labels'] = labels

if shuffle_selection['shuffle'] == 'ordered_target':
test_size = options['test_size']
if test_size < 1.0:
if test_size > 0.5:
raise ValueError("Ordered Target Split only supports "
"test proportion 0 - 0.5!")
n_splits = round(1 / test_size)
else:
n_samples = array.shape[0]
n_splits = round(n_samples / test_size)
# train test split
if params['mode_selection']['selected_mode'] == 'train_test_split':
options = params['mode_selection']['options']
shuffle_selection = options.pop('shuffle_selection')
options['shuffle'] = shuffle_selection['shuffle']
if infile_labels:
header = 'infer' if shuffle_selection['header1'] else None
col_index = shuffle_selection['col'][0] - 1
df = pd.read_csv(infile_labels, sep='\t', header=header,
parse_dates=True)
labels = df.iloc[:, col_index].values
options['labels'] = labels

splitter = OrderedKFold(n_splits=n_splits, shuffle=True,
random_state=options['random_state'])
train_index, test_index = next(splitter.split(array.values, labels))
train, test = array.iloc[train_index, :], array.iloc[test_index, :]
else:
train, test = train_test_split(array, **options)

print(("Input shape:", array.shape))
print(("Train shape:", train.shape))
print(("Test shape:", test.shape))
# cv splitter
else:
train, test = _get_single_cv_split(params, array,
infile_labels=infile_labels,
infile_groups=infile_groups)

print("Input shape: %s" % repr(array.shape))
print("Train shape: %s" % repr(train.shape))
print("Test shape: %s" % repr(test.shape))
train.to_csv(outfile_train, sep='\t', header=input_header, index=False)
test.to_csv(outfile_test, sep='\t', header=input_header, index=False)

Expand All @@ -79,10 +144,11 @@ def main(inputs, infile_array, outfile_train, outfile_test,
aparser = argparse.ArgumentParser()
aparser.add_argument("-i", "--inputs", dest="inputs", required=True)
aparser.add_argument("-X", "--infile_array", dest="infile_array")
aparser.add_argument("-g", "--infile_labels", dest="infile_labels")
aparser.add_argument("-y", "--infile_labels", dest="infile_labels")
aparser.add_argument("-g", "--infile_groups", dest="infile_groups")
aparser.add_argument("-o", "--outfile_train", dest="outfile_train")
aparser.add_argument("-t", "--outfile_test", dest="outfile_test")
args = aparser.parse_args()

main(args.inputs, args.infile_array, args.outfile_train,
args.outfile_test, args.infile_labels)
args.outfile_test, args.infile_labels, args.infile_groups)
Loading

0 comments on commit d6c6607

Please sign in to comment.