From 52d59950bdce522d70647b9180eb3072ac3a02d1 Mon Sep 17 00:00:00 2001 From: Alexey Volkov Date: Thu, 9 Apr 2020 23:15:47 -0700 Subject: [PATCH] Components - Add model URL to AutoML - Create model/dataset for tables (#3486) * Re-generated the components * Components - Add model URL to AutoML - Create model for tables Fixes https://github.com/kubeflow/pipelines/issues/3246 * Added dataset URL to the AutoML - Create dataset for tables component --- .../create_dataset_for_tables/component.py | 20 ++- .../create_dataset_for_tables/component.yaml | 155 +++++++++--------- .../create_model_for_tables/component.py | 22 ++- .../create_model_for_tables/component.yaml | 145 ++++++++-------- 4 files changed, 174 insertions(+), 168 deletions(-) diff --git a/components/gcp/automl/create_dataset_for_tables/component.py b/components/gcp/automl/create_dataset_for_tables/component.py index 644fd647509..9239e780b15 100644 --- a/components/gcp/automl/create_dataset_for_tables/component.py +++ b/components/gcp/automl/create_dataset_for_tables/component.py @@ -24,13 +24,9 @@ def automl_create_dataset_for_tables( retry=None, #=google.api_core.gapic_v1.method.DEFAULT, timeout: float = None, #=google.api_core.gapic_v1.method.DEFAULT, metadata: dict = None, -) -> NamedTuple('Outputs', [('dataset_path', str), ('create_time', str), ('dataset_id', str)]): +) -> NamedTuple('Outputs', [('dataset_path', str), ('create_time', str), ('dataset_id', str), ('dataset_url', 'URI')]): '''automl_create_dataset_for_tables creates an empty Dataset for AutoML tables ''' - import sys - import subprocess - subprocess.run([sys.executable, '-m', 'pip', 'install', 'google-cloud-automl==0.4.0', '--quiet', '--no-warn-script-location'], env={'PIP_DISABLE_PIP_VERSION_CHECK': '1'}, check=True) - import google from google.cloud import automl client = automl.AutoMlClient() @@ -50,9 +46,19 @@ def automl_create_dataset_for_tables( ) print(dataset) dataset_id = dataset.name.rsplit('/', 1)[-1] - return (dataset.name, dataset.create_time, dataset_id) + dataset_url = 'https://console.cloud.google.com/automl-tables/locations/{region}/datasets/{dataset_id}/schemav2?project={project_id}'.format( + project_id=gcp_project_id, + region=gcp_region, + dataset_id=dataset_id, + ) + return (dataset.name, dataset.create_time, dataset_id, dataset_url) if __name__ == '__main__': import kfp - kfp.components.func_to_container_op(automl_create_dataset_for_tables, output_component_file='component.yaml', base_image='python:3.7') + kfp.components.func_to_container_op( + automl_create_dataset_for_tables, + output_component_file='component.yaml', + base_image='python:3.7', + packages_to_install=['google-cloud-automl==0.4.0'] + ) diff --git a/components/gcp/automl/create_dataset_for_tables/component.yaml b/components/gcp/automl/create_dataset_for_tables/component.yaml index 4dfdeddfdeb..74257db9fdd 100644 --- a/components/gcp/automl/create_dataset_for_tables/component.yaml +++ b/components/gcp/automl/create_dataset_for_tables/component.yaml @@ -1,61 +1,46 @@ name: Automl create dataset for tables -description: | - automl_create_dataset_for_tables creates an empty Dataset for AutoML tables +description: automl_create_dataset_for_tables creates an empty Dataset for AutoML + tables inputs: -- name: gcp_project_id - type: String -- name: gcp_region - type: String -- name: display_name - type: String -- name: description - type: String - optional: true -- name: tables_dataset_metadata - type: JsonObject - default: '{}' - optional: true -- name: retry - optional: true -- name: timeout - type: Float - optional: true -- name: metadata - type: JsonObject - optional: true +- {name: gcp_project_id, type: String} +- {name: gcp_region, type: String} +- {name: display_name, type: String} +- {name: description, type: String, optional: true} +- {name: tables_dataset_metadata, type: JsonObject, default: '{}', optional: true} +- {name: retry, optional: true} +- {name: timeout, type: Float, optional: true} +- {name: metadata, type: JsonObject, optional: true} outputs: -- name: dataset_path - type: String -- name: create_time - type: String -- name: dataset_id - type: String +- {name: dataset_path, type: String} +- {name: create_time, type: String} +- {name: dataset_id, type: String} +- {name: dataset_url, type: URI} implementation: container: image: python:3.7 command: + - sh + - -c + - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location + 'google-cloud-automl==0.4.0' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip + install --quiet --no-warn-script-location 'google-cloud-automl==0.4.0' --user) + && "$0" "$@" - python3 - -u - -c - | - from typing import NamedTuple - def automl_create_dataset_for_tables( - gcp_project_id: str, - gcp_region: str, - display_name: str, - description: str = None, - tables_dataset_metadata: dict = {}, + gcp_project_id , + gcp_region , + display_name , + description = None, + tables_dataset_metadata = {}, retry=None, #=google.api_core.gapic_v1.method.DEFAULT, - timeout: float = None, #=google.api_core.gapic_v1.method.DEFAULT, - metadata: dict = None, - ) -> NamedTuple('Outputs', [('dataset_path', str), ('create_time', str), ('dataset_id', str)]): + timeout = None, #=google.api_core.gapic_v1.method.DEFAULT, + metadata = None, + ) : '''automl_create_dataset_for_tables creates an empty Dataset for AutoML tables ''' - import sys - import subprocess - subprocess.run([sys.executable, '-m', 'pip', 'install', 'google-cloud-automl==0.4.0', '--quiet', '--no-warn-script-location'], env={'PIP_DISABLE_PIP_VERSION_CHECK': '1'}, check=True) - import google from google.cloud import automl client = automl.AutoMlClient() @@ -75,28 +60,42 @@ implementation: ) print(dataset) dataset_id = dataset.name.rsplit('/', 1)[-1] - return (dataset.name, dataset.create_time, dataset_id) + dataset_url = 'https://console.cloud.google.com/automl-tables/locations/{region}/datasets/{dataset_id}/schemav2?project={project_id}'.format( + project_id=gcp_project_id, + region=gcp_region, + dataset_id=dataset_id, + ) + return (dataset.name, dataset.create_time, dataset_id, dataset_url) import json + def _serialize_str(str_value: str) -> str: + if not isinstance(str_value, str): + raise TypeError('Value "{}" has type "{}" instead of str.'.format(str(str_value), str(type(str_value)))) + return str_value + import argparse - _missing_arg = object() - _parser = argparse.ArgumentParser(prog='Automl create dataset for tables', description='automl_create_dataset_for_tables creates an empty Dataset for AutoML tables\n') - _parser.add_argument("--gcp-project-id", dest="gcp_project_id", type=str, required=True, default=_missing_arg) - _parser.add_argument("--gcp-region", dest="gcp_region", type=str, required=True, default=_missing_arg) - _parser.add_argument("--display-name", dest="display_name", type=str, required=True, default=_missing_arg) - _parser.add_argument("--description", dest="description", type=str, required=False, default=_missing_arg) - _parser.add_argument("--tables-dataset-metadata", dest="tables_dataset_metadata", type=json.loads, required=False, default=_missing_arg) - _parser.add_argument("--retry", dest="retry", type=str, required=False, default=_missing_arg) - _parser.add_argument("--timeout", dest="timeout", type=float, required=False, default=_missing_arg) - _parser.add_argument("--metadata", dest="metadata", type=json.loads, required=False, default=_missing_arg) - _parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=3) - _parsed_args = {k: v for k, v in vars(_parser.parse_args()).items() if v is not _missing_arg} + _parser = argparse.ArgumentParser(prog='Automl create dataset for tables', description='automl_create_dataset_for_tables creates an empty Dataset for AutoML tables') + _parser.add_argument("--gcp-project-id", dest="gcp_project_id", type=str, required=True, default=argparse.SUPPRESS) + _parser.add_argument("--gcp-region", dest="gcp_region", type=str, required=True, default=argparse.SUPPRESS) + _parser.add_argument("--display-name", dest="display_name", type=str, required=True, default=argparse.SUPPRESS) + _parser.add_argument("--description", dest="description", type=str, required=False, default=argparse.SUPPRESS) + _parser.add_argument("--tables-dataset-metadata", dest="tables_dataset_metadata", type=json.loads, required=False, default=argparse.SUPPRESS) + _parser.add_argument("--retry", dest="retry", type=str, required=False, default=argparse.SUPPRESS) + _parser.add_argument("--timeout", dest="timeout", type=float, required=False, default=argparse.SUPPRESS) + _parser.add_argument("--metadata", dest="metadata", type=json.loads, required=False, default=argparse.SUPPRESS) + _parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=4) + _parsed_args = vars(_parser.parse_args()) _output_files = _parsed_args.pop("_output_paths", []) _outputs = automl_create_dataset_for_tables(**_parsed_args) - if not hasattr(_outputs, '__getitem__') or isinstance(_outputs, str): - _outputs = [_outputs] + _output_serializers = [ + _serialize_str, + _serialize_str, + _serialize_str, + str, + + ] import os for idx, output_file in enumerate(_output_files): @@ -105,45 +104,41 @@ implementation: except OSError: pass with open(output_file, 'w') as f: - f.write(str(_outputs[idx])) + f.write(_output_serializers[idx](_outputs[idx])) args: - --gcp-project-id - - inputValue: gcp_project_id + - {inputValue: gcp_project_id} - --gcp-region - - inputValue: gcp_region + - {inputValue: gcp_region} - --display-name - - inputValue: display_name + - {inputValue: display_name} - if: - cond: - isPresent: description + cond: {isPresent: description} then: - --description - - inputValue: description + - {inputValue: description} - if: - cond: - isPresent: tables_dataset_metadata + cond: {isPresent: tables_dataset_metadata} then: - --tables-dataset-metadata - - inputValue: tables_dataset_metadata + - {inputValue: tables_dataset_metadata} - if: - cond: - isPresent: retry + cond: {isPresent: retry} then: - --retry - - inputValue: retry + - {inputValue: retry} - if: - cond: - isPresent: timeout + cond: {isPresent: timeout} then: - --timeout - - inputValue: timeout + - {inputValue: timeout} - if: - cond: - isPresent: metadata + cond: {isPresent: metadata} then: - --metadata - - inputValue: metadata + - {inputValue: metadata} - '----output-paths' - - outputPath: dataset_path - - outputPath: create_time - - outputPath: dataset_id + - {outputPath: dataset_path} + - {outputPath: create_time} + - {outputPath: dataset_id} + - {outputPath: dataset_url} diff --git a/components/gcp/automl/create_model_for_tables/component.py b/components/gcp/automl/create_model_for_tables/component.py index 21b126cff38..205a4a064c0 100644 --- a/components/gcp/automl/create_model_for_tables/component.py +++ b/components/gcp/automl/create_model_for_tables/component.py @@ -24,11 +24,7 @@ def automl_create_model_for_tables( input_feature_column_paths: list = None, optimization_objective: str = 'MAXIMIZE_AU_PRC', train_budget_milli_node_hours: int = 1000, -) -> NamedTuple('Outputs', [('model_path', str), ('model_id', str)]): - import sys - import subprocess - subprocess.run([sys.executable, '-m', 'pip', 'install', 'google-cloud-automl==0.4.0', '--quiet', '--no-warn-script-location'], env={'PIP_DISABLE_PIP_VERSION_CHECK': '1'}, check=True) - +) -> NamedTuple('Outputs', [('model_path', str), ('model_id', str), ('model_page_url', 'URI'),]): from google.cloud import automl client = automl.AutoMlClient() @@ -50,9 +46,21 @@ def automl_create_model_for_tables( print(result) model_name = result.name model_id = model_name.rsplit('/', 1)[-1] - return (model_name, model_id) + model_url = 'https://console.cloud.google.com/automl-tables/locations/{region}/datasets/{dataset_id};modelId={model_id};task=basic/train?project={project_id}'.format( + project_id=gcp_project_id, + region=gcp_region, + dataset_id=dataset_id, + model_id=model_id, + ) + + return (model_name, model_id, model_url) if __name__ == '__main__': import kfp - kfp.components.func_to_container_op(automl_create_model_for_tables, output_component_file='component.yaml', base_image='python:3.7') + kfp.components.func_to_container_op( + automl_create_model_for_tables, + output_component_file='component.yaml', + base_image='python:3.7', + packages_to_install=['google-cloud-automl==0.4.0'] + ) diff --git a/components/gcp/automl/create_model_for_tables/component.yaml b/components/gcp/automl/create_model_for_tables/component.yaml index 86ad38730bd..8b909eb1ee8 100644 --- a/components/gcp/automl/create_model_for_tables/component.yaml +++ b/components/gcp/automl/create_model_for_tables/component.yaml @@ -1,56 +1,41 @@ name: Automl create model for tables inputs: -- name: gcp_project_id - type: String -- name: gcp_region - type: String -- name: display_name - type: String -- name: dataset_id - type: String -- name: target_column_path - type: String - optional: true -- name: input_feature_column_paths - type: JsonArray - optional: true -- name: optimization_objective - type: String - default: MAXIMIZE_AU_PRC - optional: true -- name: train_budget_milli_node_hours - type: Integer - default: '1000' - optional: true +- {name: gcp_project_id, type: String} +- {name: gcp_region, type: String} +- {name: display_name, type: String} +- {name: dataset_id, type: String} +- {name: target_column_path, type: String, optional: true} +- {name: input_feature_column_paths, type: JsonArray, optional: true} +- {name: optimization_objective, type: String, default: MAXIMIZE_AU_PRC, optional: true} +- {name: train_budget_milli_node_hours, type: Integer, default: '1000', optional: true} outputs: -- name: model_path - type: String -- name: model_id - type: String +- {name: model_path, type: String} +- {name: model_id, type: String} +- {name: model_page_url, type: URI} implementation: container: image: python:3.7 command: + - sh + - -c + - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location + 'google-cloud-automl==0.4.0' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip + install --quiet --no-warn-script-location 'google-cloud-automl==0.4.0' --user) + && "$0" "$@" - python3 - -u - -c - | - from typing import NamedTuple - def automl_create_model_for_tables( - gcp_project_id: str, - gcp_region: str, - display_name: str, - dataset_id: str, - target_column_path: str = None, - input_feature_column_paths: list = None, - optimization_objective: str = 'MAXIMIZE_AU_PRC', - train_budget_milli_node_hours: int = 1000, - ) -> NamedTuple('Outputs', [('model_path', str), ('model_id', str)]): - import sys - import subprocess - subprocess.run([sys.executable, '-m', 'pip', 'install', 'google-cloud-automl==0.4.0', '--quiet', '--no-warn-script-location'], env={'PIP_DISABLE_PIP_VERSION_CHECK': '1'}, check=True) - + gcp_project_id , + gcp_region , + display_name , + dataset_id , + target_column_path = None, + input_feature_column_paths = None, + optimization_objective = 'MAXIMIZE_AU_PRC', + train_budget_milli_node_hours = 1000, + ) : from google.cloud import automl client = automl.AutoMlClient() @@ -72,28 +57,43 @@ implementation: print(result) model_name = result.name model_id = model_name.rsplit('/', 1)[-1] - return (model_name, model_id) + model_url = 'https://console.cloud.google.com/automl-tables/locations/{region}/datasets/{dataset_id};modelId={model_id};task=basic/train?project={project_id}'.format( + project_id=gcp_project_id, + region=gcp_region, + dataset_id=dataset_id, + model_id=model_id, + ) + + return (model_name, model_id, model_url) + + def _serialize_str(str_value: str) -> str: + if not isinstance(str_value, str): + raise TypeError('Value "{}" has type "{}" instead of str.'.format(str(str_value), str(type(str_value)))) + return str_value import json import argparse - _missing_arg = object() _parser = argparse.ArgumentParser(prog='Automl create model for tables', description='') - _parser.add_argument("--gcp-project-id", dest="gcp_project_id", type=str, required=True, default=_missing_arg) - _parser.add_argument("--gcp-region", dest="gcp_region", type=str, required=True, default=_missing_arg) - _parser.add_argument("--display-name", dest="display_name", type=str, required=True, default=_missing_arg) - _parser.add_argument("--dataset-id", dest="dataset_id", type=str, required=True, default=_missing_arg) - _parser.add_argument("--target-column-path", dest="target_column_path", type=str, required=False, default=_missing_arg) - _parser.add_argument("--input-feature-column-paths", dest="input_feature_column_paths", type=json.loads, required=False, default=_missing_arg) - _parser.add_argument("--optimization-objective", dest="optimization_objective", type=str, required=False, default=_missing_arg) - _parser.add_argument("--train-budget-milli-node-hours", dest="train_budget_milli_node_hours", type=int, required=False, default=_missing_arg) - _parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=2) - _parsed_args = {k: v for k, v in vars(_parser.parse_args()).items() if v is not _missing_arg} + _parser.add_argument("--gcp-project-id", dest="gcp_project_id", type=str, required=True, default=argparse.SUPPRESS) + _parser.add_argument("--gcp-region", dest="gcp_region", type=str, required=True, default=argparse.SUPPRESS) + _parser.add_argument("--display-name", dest="display_name", type=str, required=True, default=argparse.SUPPRESS) + _parser.add_argument("--dataset-id", dest="dataset_id", type=str, required=True, default=argparse.SUPPRESS) + _parser.add_argument("--target-column-path", dest="target_column_path", type=str, required=False, default=argparse.SUPPRESS) + _parser.add_argument("--input-feature-column-paths", dest="input_feature_column_paths", type=json.loads, required=False, default=argparse.SUPPRESS) + _parser.add_argument("--optimization-objective", dest="optimization_objective", type=str, required=False, default=argparse.SUPPRESS) + _parser.add_argument("--train-budget-milli-node-hours", dest="train_budget_milli_node_hours", type=int, required=False, default=argparse.SUPPRESS) + _parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=3) + _parsed_args = vars(_parser.parse_args()) _output_files = _parsed_args.pop("_output_paths", []) _outputs = automl_create_model_for_tables(**_parsed_args) - if not hasattr(_outputs, '__getitem__') or isinstance(_outputs, str): - _outputs = [_outputs] + _output_serializers = [ + _serialize_str, + _serialize_str, + str, + + ] import os for idx, output_file in enumerate(_output_files): @@ -102,40 +102,37 @@ implementation: except OSError: pass with open(output_file, 'w') as f: - f.write(str(_outputs[idx])) + f.write(_output_serializers[idx](_outputs[idx])) args: - --gcp-project-id - - inputValue: gcp_project_id + - {inputValue: gcp_project_id} - --gcp-region - - inputValue: gcp_region + - {inputValue: gcp_region} - --display-name - - inputValue: display_name + - {inputValue: display_name} - --dataset-id - - inputValue: dataset_id + - {inputValue: dataset_id} - if: - cond: - isPresent: target_column_path + cond: {isPresent: target_column_path} then: - --target-column-path - - inputValue: target_column_path + - {inputValue: target_column_path} - if: - cond: - isPresent: input_feature_column_paths + cond: {isPresent: input_feature_column_paths} then: - --input-feature-column-paths - - inputValue: input_feature_column_paths + - {inputValue: input_feature_column_paths} - if: - cond: - isPresent: optimization_objective + cond: {isPresent: optimization_objective} then: - --optimization-objective - - inputValue: optimization_objective + - {inputValue: optimization_objective} - if: - cond: - isPresent: train_budget_milli_node_hours + cond: {isPresent: train_budget_milli_node_hours} then: - --train-budget-milli-node-hours - - inputValue: train_budget_milli_node_hours + - {inputValue: train_budget_milli_node_hours} - '----output-paths' - - outputPath: model_path - - outputPath: model_id + - {outputPath: model_path} + - {outputPath: model_id} + - {outputPath: model_page_url}