From 91d50f654b87743dc232fc8d4322451c70fbe7b1 Mon Sep 17 00:00:00 2001
From: hongye-sun <43763191+hongye-sun@users.noreply.github.com>
Date: Fri, 1 Mar 2019 10:35:47 -0800
Subject: [PATCH] GCPcomponents yaml spec (#887)

* add component yaml for GCP components

* Add bigquery component yaml

* Fix typo and set default instead of optional setting.
---
Review notes (this section sits after the `---` separator and is not part
of the commit message):
- ml_engine/deploy: default of replace_existing_version corrected from
  'Fasle' to 'False'.
- ml_engine/deploy: args now reference the declared inputs model_id and
  version_id instead of the undeclared model_short_name and
  version_short_name. The flag names follow the flag == input-name
  convention used by every other argument in these specs; TODO confirm
  against the signature of kfp_component.google.ml_engine deploy.
- ml_engine/deploy: python_version was declared as an input but never
  forwarded in args; it is now passed (hunk grows from 53 to 54 lines).
- Typos in description strings fixed; every file now ends with a newline.
- The blob ids on the `index` lines are those of the original commit and
  no longer match the edited file contents.

 components/gcp/bigquery/query/component.yaml  | 42 ++++++++++++++
 .../gcp/dataflow/launch_python/component.yaml | 44 +++++++++++++++
 .../dataflow/launch_template/component.yaml   | 44 +++++++++++++++
 .../ml_engine/batch_predict/component.yaml    | 50 +++++++++++++++++
 .../gcp/ml_engine/deploy/component.yaml       | 54 ++++++++++++++++++
 components/gcp/ml_engine/train/component.yaml | 56 +++++++++++++++++++
 6 files changed, 290 insertions(+)
 create mode 100644 components/gcp/bigquery/query/component.yaml
 create mode 100644 components/gcp/dataflow/launch_python/component.yaml
 create mode 100644 components/gcp/dataflow/launch_template/component.yaml
 create mode 100644 components/gcp/ml_engine/batch_predict/component.yaml
 create mode 100644 components/gcp/ml_engine/deploy/component.yaml
 create mode 100644 components/gcp/ml_engine/train/component.yaml

diff --git a/components/gcp/bigquery/query/component.yaml b/components/gcp/bigquery/query/component.yaml
new file mode 100644
index 00000000000..6e8e7b5b042
--- /dev/null
+++ b/components/gcp/bigquery/query/component.yaml
@@ -0,0 +1,42 @@
+# Copyright 2018 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: Bigquery - Query
+description: |
+  Submit a query to Bigquery service and write outputs to a GCS blob.
+inputs:
+  - {name: query, description: 'The query used by Bigquery service to fetch the results.'}
+  - {name: project_id, description: 'The project to execute the query job.'}
+  - {name: dataset_id, description: 'The ID of the persistent dataset to keep the results of the query.'}
+  - {name: table_id, description: 'The ID of the table to keep the results of the query. If absent, the operation will generate a random id for the table.', default: ''}
+  - {name: output_gcs_path, description: 'The GCS blob path to dump the query results to.', default: ''}
+  - {name: job_config, description: 'The full config spec for the query job.', default: ''}
+outputs:
+  - {name: output_gcs_path, description: 'The GCS blob path to dump the query results to.'}
+implementation:
+  container:
+    image: gcr.io/ml-pipeline-dogfood/ml-pipeline-gcp:latest
+    args: [
+      kfp_component.google.bigquery, query,
+      --query, {inputValue: query},
+      --project_id, {inputValue: project_id},
+      --dataset_id, {inputValue: dataset_id},
+      --table_id, {inputValue: table_id},
+      --output_gcs_path, {inputValue: output_gcs_path},
+      --job_config, {inputValue: job_config}
+    ]
+    env:
+      KFP_POD_NAME: "{{pod.name}}"
+    fileOutputs:
+      output_gcs_path: /tmp/kfp/output/bigquery/query-output-path.txt
diff --git a/components/gcp/dataflow/launch_python/component.yaml b/components/gcp/dataflow/launch_python/component.yaml
new file mode 100644
index 00000000000..9b8b5b54cc6
--- /dev/null
+++ b/components/gcp/dataflow/launch_python/component.yaml
@@ -0,0 +1,44 @@
+# Copyright 2018 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: Launch Python
+description: |
+  Launch a self-executing beam python file.
+inputs:
+  - {name: python_file_path, description: 'The gcs or local path to the python file to run.'}
+  - {name: project_id, description: 'The ID of the parent project.'}
+  - {name: requirements_file_path, description: 'Optional, the gcs or local path to the pip requirements file', default: ''}
+  - {name: location, description: 'The regional endpoint to which to direct the request.', default: ''}
+  - {name: job_name_prefix, description: 'Optional. The prefix of the generated job name. If not provided, the method will generate a random name.', default: ''}
+  - {name: args, description: 'The list of args to pass to the python file.', default: '[]'}
+  - {name: wait_interval, default: '30', description: 'Optional wait interval between calls to get job status. Defaults to 30.'}
+outputs:
+  - {name: job_id, description: 'The id of the created dataflow job.'}
+implementation:
+  container:
+    image: gcr.io/ml-pipeline-dogfood/ml-pipeline-gcp:latest
+    args: [
+      kfp_component.google.dataflow, launch_python,
+      --python_file_path, {inputValue: python_file_path},
+      --project_id, {inputValue: project_id},
+      --requirements_file_path, {inputValue: requirements_file_path},
+      --location, {inputValue: location},
+      --job_name_prefix, {inputValue: job_name_prefix},
+      --args, {inputValue: args},
+      --wait_interval, {inputValue: wait_interval}
+    ]
+    env:
+      KFP_POD_NAME: "{{pod.name}}"
+    fileOutputs:
+      job_id: /tmp/kfp/output/dataflow/job_id.txt
diff --git a/components/gcp/dataflow/launch_template/component.yaml b/components/gcp/dataflow/launch_template/component.yaml
new file mode 100644
index 00000000000..b5dfa42a1e7
--- /dev/null
+++ b/components/gcp/dataflow/launch_template/component.yaml
@@ -0,0 +1,44 @@
+# Copyright 2018 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: Launch Dataflow Template
+description: |
+  Launches a dataflow job from template.
+inputs:
+  - {name: project_id, description: 'Required. The ID of the Cloud Platform project that the job belongs to.'}
+  - {name: gcs_path, description: 'Required. A Cloud Storage path to the template from which to create the job. Must be valid Cloud Storage URL, beginning with `gs://`.'}
+  - {name: launch_parameters, description: 'Parameters to provide to the template being launched. Schema defined in https://cloud.google.com/dataflow/docs/reference/rest/v1b3/LaunchTemplateParameters. `jobName` will be replaced by generated name.'}
+  - {name: location, description: 'The regional endpoint to which to direct the request.', default: ''}
+  - {name: job_name_prefix, description: 'Optional. The prefix of the generated job name. If not provided, the method will generate a random name.', default: ''}
+  - {name: validate_only, description: 'If true, the request is validated but not actually executed. Defaults to false.', default: 'False'}
+  - {name: wait_interval, description: 'Optional wait interval between calls to get job status. Defaults to 30.', default: '30'}
+outputs:
+  - {name: job_id, description: 'The ID of the created dataflow job.'}
+implementation:
+  container:
+    image: gcr.io/ml-pipeline-dogfood/ml-pipeline-gcp:latest
+    args: [
+      kfp_component.google.dataflow, launch_template,
+      --project_id, {inputValue: project_id},
+      --gcs_path, {inputValue: gcs_path},
+      --launch_parameters, {inputValue: launch_parameters},
+      --location, {inputValue: location},
+      --job_name_prefix, {inputValue: job_name_prefix},
+      --validate_only, {inputValue: validate_only},
+      --wait_interval, {inputValue: wait_interval},
+    ]
+    env:
+      KFP_POD_NAME: "{{pod.name}}"
+    fileOutputs:
+      job_id: /tmp/kfp/output/dataflow/job_id.txt
diff --git a/components/gcp/ml_engine/batch_predict/component.yaml b/components/gcp/ml_engine/batch_predict/component.yaml
new file mode 100644
index 00000000000..facc19706ea
--- /dev/null
+++ b/components/gcp/ml_engine/batch_predict/component.yaml
@@ -0,0 +1,50 @@
+# Copyright 2018 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: Batch predict against a model with Cloud ML Engine
+description: |
+  Creates a MLEngine batch prediction job.
+inputs:
+  - {name: project_id, description: 'Required. The ID of the parent project of the job.'}
+  - {name: model_path, description: 'The path to the model. It can be either: `projects/[PROJECT_ID]/models/[MODEL_ID]` or `projects/[PROJECT_ID]/models/[MODEL_ID]/versions/[VERSION_ID]` or a GCS path of a model file.'}
+  - {name: input_paths, description: 'Required. The Google Cloud Storage location of the input data files. May contain wildcards.'}
+  - {name: input_data_format, description: 'Required. The format of the input data files. See https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#DataFormat.'}
+  - {name: output_path, description: 'Required. The output Google Cloud Storage location.'}
+  - {name: region, description: 'Required. The Google Compute Engine region to run the prediction job in.'}
+  - {name: output_data_format, description: 'Optional. Format of the output data files, defaults to JSON.', default: ''}
+  - {name: prediction_input, description: 'Input parameters to create a prediction job.', default: ''}
+  - {name: job_id_prefix, description: 'The prefix of the generated job id.', default: ''}
+  - {name: wait_interval, description: 'Optional wait interval between calls to get job status. Defaults to 30.', default: '30'}
+outputs:
+  - {name: job_id, description: 'The ID of the created job.'}
+implementation:
+  container:
+    image: gcr.io/ml-pipeline-dogfood/ml-pipeline-gcp:latest
+    args: [
+      kfp_component.google.ml_engine, batch_predict,
+      --project_id, {inputValue: project_id},
+      --model_path, {inputValue: model_path},
+      --input_paths, {inputValue: input_paths},
+      --input_data_format, {inputValue: input_data_format},
+      --output_path, {inputValue: output_path},
+      --region, {inputValue: region},
+      --output_data_format, {inputValue: output_data_format},
+      --prediction_input, {inputValue: prediction_input},
+      --job_id_prefix, {inputValue: job_id_prefix},
+      --wait_interval, {inputValue: wait_interval}
+    ]
+    env:
+      KFP_POD_NAME: "{{pod.name}}"
+    fileOutputs:
+      job_id: /tmp/kfp/output/ml_engine/job_id.txt
diff --git a/components/gcp/ml_engine/deploy/component.yaml b/components/gcp/ml_engine/deploy/component.yaml
new file mode 100644
index 00000000000..e57181ec1cb
--- /dev/null
+++ b/components/gcp/ml_engine/deploy/component.yaml
@@ -0,0 +1,54 @@
+# Copyright 2018 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: Deploy a model to Cloud ML Engine
+description: |
+  Creates a Cloud Machine Learning version and optionally a model if it does not exist.
+inputs:
+  - {name: model_uri, description: 'Required, the GCS URI which contains a model file. Commonly used TF model search paths (export/exporter) will be used if they exist.'}
+  - {name: project_id, description: 'Required, the ID of the parent project.'}
+  - {name: model_id, description: 'Optional, the user provided name of the model.', default: ''}
+  - {name: version_id, description: 'Optional, the user provided name of the version. If it is not provided, the operation uses a random name.', default: ''}
+  - {name: runtime_version, description: 'Optional, the Cloud ML Engine runtime version to use for this deployment. If not set, Cloud ML Engine uses the default stable version, 1.0.', default: ''}
+  - {name: python_version, description: 'Optional, the version of Python used in prediction. If not set, the default version is `2.7`. Python `3.5` is available when runtimeVersion is set to `1.4` and above. Python `2.7` works with all supported runtime versions.', default: ''}
+  - {name: version, description: 'Optional, the payload of the new version.', default: ''}
+  - {name: replace_existing_version, description: 'Boolean flag indicates whether to replace existing version in case of conflict.', default: 'False'}
+  - {name: set_default, description: 'Boolean flag indicates whether to set the new version as default version in the model.', default: 'False'}
+  - {name: wait_interval, description: 'The interval to wait for a long running operation.', default: '30'}
+outputs:
+  - {name: model_uri, description: 'The URI of the model.'}
+  - {name: model_name, description: 'The name of the deployed model.'}
+  - {name: version_name, description: 'The name of the deployed version.'}
+implementation:
+  container:
+    image: gcr.io/ml-pipeline-dogfood/ml-pipeline-gcp:latest
+    args: [
+      kfp_component.google.ml_engine, deploy,
+      --model_uri, {inputValue: model_uri},
+      --project_id, {inputValue: project_id},
+      --model_id, {inputValue: model_id},
+      --version_id, {inputValue: version_id},
+      --runtime_version, {inputValue: runtime_version},
+      --python_version, {inputValue: python_version},
+      --version, {inputValue: version},
+      --replace_existing_version, {inputValue: replace_existing_version},
+      --set_default, {inputValue: set_default},
+      --wait_interval, {inputValue: wait_interval},
+    ]
+    env:
+      KFP_POD_NAME: "{{pod.name}}"
+    fileOutputs:
+      model_uri: /tmp/kfp/output/ml_engine/model_uri.txt
+      model_name: /tmp/kfp/output/ml_engine/model_name.txt
+      version_name: /tmp/kfp/output/ml_engine/version_name.txt
diff --git a/components/gcp/ml_engine/train/component.yaml b/components/gcp/ml_engine/train/component.yaml
new file mode 100644
index 00000000000..db465d7bea9
--- /dev/null
+++ b/components/gcp/ml_engine/train/component.yaml
@@ -0,0 +1,56 @@
+# Copyright 2018 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: Train a model with Cloud ML Engine
+description: |
+  Submits a Cloud Machine Learning training job.
+inputs:
+  - {name: project_id, description: 'Required. The ID of the parent project of the job.'}
+  - {name: python_module, description: 'The Python module name to run after installing the packages.', default: ''}
+  - {name: package_uris, description: 'The Google Cloud Storage location of the packages with the training program and any additional dependencies. The maximum number of package URIs is 100.', default: ''}
+  - {name: region, description: 'The Google Compute Engine region to run the training job in.', default: ''}
+  - {name: args, description: 'Command line arguments to pass to the program.', default: ''}
+  - {name: job_dir, description: 'A Google Cloud Storage path in which to store training outputs and other data needed for training. This path is passed to your TensorFlow program as the `--job-dir` command-line argument. The benefit of specifying this field is that Cloud ML validates the path for use in training.', default: ''}
+  - {name: python_version, description: 'The version of Python used in training. If not set, the default version is `2.7`. Python `3.5` is available when runtimeVersion is set to `1.4` and above.', default: ''}
+  - {name: runtime_version, description: 'The Cloud ML Engine runtime version to use for training. If not set, Cloud ML Engine uses the default stable version, 1.0.', default: ''}
+  - {name: master_image_uri, description: 'The Docker image to run on the master replica. This image must be in Container Registry.', default: ''}
+  - {name: worker_image_uri, description: 'The Docker image to run on the worker replica. This image must be in Container Registry.', default: ''}
+  - {name: training_input, description: 'Input parameters to create a training job.', default: ''}
+  - {name: job_id_prefix, description: 'The prefix of the generated job id.', default: ''}
+  - {name: wait_interval, description: 'Optional wait interval between calls to get job status. Defaults to 30.', default: '30'}
+outputs:
+  - {name: job_id, description: 'The ID of the created job.'}
+implementation:
+  container:
+    image: gcr.io/ml-pipeline-dogfood/ml-pipeline-gcp:latest
+    args: [
+      kfp_component.google.ml_engine, train,
+      --project_id, {inputValue: project_id},
+      --python_module, {inputValue: python_module},
+      --package_uris, {inputValue: package_uris},
+      --region, {inputValue: region},
+      --args, {inputValue: args},
+      --job_dir, {inputValue: job_dir},
+      --python_version, {inputValue: python_version},
+      --runtime_version, {inputValue: runtime_version},
+      --master_image_uri, {inputValue: master_image_uri},
+      --worker_image_uri, {inputValue: worker_image_uri},
+      --training_input, {inputValue: training_input},
+      --job_id_prefix, {inputValue: job_id_prefix},
+      --wait_interval, {inputValue: wait_interval}
+    ]
+    env:
+      KFP_POD_NAME: "{{pod.name}}"
+    fileOutputs:
+      job_id: /tmp/kfp/output/ml_engine/job_id.txt