From 91d50f654b87743dc232fc8d4322451c70fbe7b1 Mon Sep 17 00:00:00 2001
From: hongye-sun <43763191+hongye-sun@users.noreply.github.com>
Date: Fri, 1 Mar 2019 10:35:47 -0800
Subject: [PATCH] GCP components yaml spec (#887)

* add component yaml for GCP components

* Add bigquery component yaml

* Fix typo and set default instead of optional setting.
---
 components/gcp/bigquery/query/component.yaml  | 42 ++++++++++++++
 .../gcp/dataflow/launch_python/component.yaml | 44 +++++++++++++++
 .../dataflow/launch_template/component.yaml   | 44 +++++++++++++++
 .../ml_engine/batch_predict/component.yaml    | 50 +++++++++++++++++
 .../gcp/ml_engine/deploy/component.yaml       | 53 ++++++++++++++++++
 components/gcp/ml_engine/train/component.yaml | 56 +++++++++++++++++++
 6 files changed, 289 insertions(+)
 create mode 100644 components/gcp/bigquery/query/component.yaml
 create mode 100644 components/gcp/dataflow/launch_python/component.yaml
 create mode 100644 components/gcp/dataflow/launch_template/component.yaml
 create mode 100644 components/gcp/ml_engine/batch_predict/component.yaml
 create mode 100644 components/gcp/ml_engine/deploy/component.yaml
 create mode 100644 components/gcp/ml_engine/train/component.yaml

diff --git a/components/gcp/bigquery/query/component.yaml b/components/gcp/bigquery/query/component.yaml
new file mode 100644
index 00000000000..6e8e7b5b042
--- /dev/null
+++ b/components/gcp/bigquery/query/component.yaml
@@ -0,0 +1,42 @@
+# Copyright 2018 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: Bigquery - Query
+description: |
+  Submit a query to Bigquery service and write outputs to a GCS blob.
+inputs:  # query, project_id and dataset_id declare no default; every other input defaults to ''.
+  - {name: query, description: 'The query used by Bigquery service to fetch the results.'}
+  - {name: project_id, description: 'The project to execute the query job.' }
+  - {name: dataset_id, description: 'The ID of the persistent dataset to keep the results of the query.'}
+  - {name: table_id, description: 'The ID of the table to keep the results of the query. If absent, the operation will generate a random id for the table.', default: '' }
+  - {name: output_gcs_path, description: 'The GCS blob path to dump the query results to.', default: '' }
+  - {name: job_config, description: 'The full config spec for the query job.', default: '' }
+outputs:  # output_gcs_path is declared both as an input above and as an output here.
+  - {name: output_gcs_path, description: 'The GCS blob path to dump the query results to.'}
+implementation:
+  container:
+    image: gcr.io/ml-pipeline-dogfood/ml-pipeline-gcp:latest
+    args: [  # Each --flag is followed by an {inputValue: ...} placeholder naming the input of the same name.
+      kfp_component.google.bigquery, query,
+      --query, {inputValue: query},
+      --project_id, {inputValue: project_id},
+      --dataset_id, {inputValue: dataset_id},
+      --table_id, {inputValue: table_id},
+      --output_gcs_path, {inputValue: output_gcs_path},
+      --job_config, {inputValue: job_config}
+    ]
+    env:  # NOTE(review): "{{pod.name}}" looks like a workflow-engine template variable; confirm where it is expanded.
+      KFP_POD_NAME: "{{pod.name}}"
+    fileOutputs:  # Keys match the names under `outputs`; values are file paths.
+      output_gcs_path: /tmp/kfp/output/bigquery/query-output-path.txt
\ No newline at end of file
diff --git a/components/gcp/dataflow/launch_python/component.yaml b/components/gcp/dataflow/launch_python/component.yaml
new file mode 100644
index 00000000000..9b8b5b54cc6
--- /dev/null
+++ b/components/gcp/dataflow/launch_python/component.yaml
@@ -0,0 +1,44 @@
+# Copyright 2018 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: Launch Python
+description: |
+  Launch a self-executing beam python file.
+inputs:  # python_file_path and project_id declare no default; every other input has one.
+  - {name: python_file_path, description: 'The gcs or local path to the python file to run.'}
+  - {name: project_id, description: 'The ID of the parent project.' }
+  - {name: requirements_file_path, description: 'Optional, the gcs or local path to the pip requirements file', default: '' }
+  - {name: location, description: 'The regional endpoint to which to direct the request.', default: '' }
+  - {name: job_name_prefix, description: 'Optional. The prefix of the generated job name. If not provided, the method will generate a random name.', default: '' }
+  - {name: args, description: 'The list of args to pass to the python file.', default: '[]' }
+  - {name: wait_interval, default: '30', description: 'Optional wait interval between calls to get job status. Defaults to 30.' }
+outputs:
+  - {name: job_id, description: 'The id of the created dataflow job.'}
+implementation:
+  container:
+    image: gcr.io/ml-pipeline-dogfood/ml-pipeline-gcp:latest
+    args: [  # Each --flag is followed by an {inputValue: ...} placeholder naming the input of the same name.
+      kfp_component.google.dataflow, launch_python,
+      --python_file_path, {inputValue: python_file_path},
+      --project_id, {inputValue: project_id},
+      --requirements_file_path, {inputValue: requirements_file_path},
+      --location, {inputValue: location},
+      --job_name_prefix, {inputValue: job_name_prefix},
+      --args, {inputValue: args},
+      --wait_interval, {inputValue: wait_interval}
+    ]
+    env:  # NOTE(review): "{{pod.name}}" looks like a workflow-engine template variable; confirm where it is expanded.
+      KFP_POD_NAME: "{{pod.name}}"
+    fileOutputs:  # Keys match the names under `outputs`; values are file paths.
+      job_id: /tmp/kfp/output/dataflow/job_id.txt
\ No newline at end of file
diff --git a/components/gcp/dataflow/launch_template/component.yaml b/components/gcp/dataflow/launch_template/component.yaml
new file mode 100644
index 00000000000..b5dfa42a1e7
--- /dev/null
+++ b/components/gcp/dataflow/launch_template/component.yaml
@@ -0,0 +1,44 @@
+# Copyright 2018 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: Launch Dataflow Template
+description: |
+  Launches a dataflow job from template.
+inputs:  # project_id, gcs_path and launch_parameters declare no default; every other input has one.
+  - {name: project_id, description: 'Required. The ID of the Cloud Platform project that the job belongs to.'}
+  - {name: gcs_path, description: 'Required. A Cloud Storage path to the template from which to create the job. Must be valid Cloud Storage URL, beginning with `gs://`.' }
+  - {name: launch_parameters, description: 'Parameters to provide to the template being launched. Schema defined in https://cloud.google.com/dataflow/docs/reference/rest/v1b3/LaunchTemplateParameters. `jobName` will be replaced by generated name.' }
+  - {name: location, description: 'The regional endpoint to which to direct the request.', default: '' }
+  - {name: job_name_prefix, description: 'Optional. The prefix of the generated job name. If not provided, the method will generate a random name.', default: '' }
+  - {name: validate_only, description: 'If true, the request is validated but not actually executed. Defaults to false.', default: 'False' }
+  - {name: wait_interval, description: 'Optional wait interval between calls to get job status. Defaults to 30.', default: '30'}
+outputs:
+  - {name: job_id, description: 'The ID of the created dataflow job.'}
+implementation:
+  container:
+    image: gcr.io/ml-pipeline-dogfood/ml-pipeline-gcp:latest
+    args: [  # Each --flag is followed by an {inputValue: ...} placeholder naming the input of the same name.
+      kfp_component.google.dataflow, launch_template,
+      --project_id, {inputValue: project_id},
+      --gcs_path, {inputValue: gcs_path},
+      --launch_parameters, {inputValue: launch_parameters},
+      --location, {inputValue: location},
+      --job_name_prefix, {inputValue: job_name_prefix},
+      --validate_only, {inputValue: validate_only},
+      --wait_interval, {inputValue: wait_interval}
+    ]
+    env:  # NOTE(review): "{{pod.name}}" looks like a workflow-engine template variable; confirm where it is expanded.
+      KFP_POD_NAME: "{{pod.name}}"
+    fileOutputs:  # Keys match the names under `outputs`; values are file paths.
+      job_id: /tmp/kfp/output/dataflow/job_id.txt
\ No newline at end of file
diff --git a/components/gcp/ml_engine/batch_predict/component.yaml b/components/gcp/ml_engine/batch_predict/component.yaml
new file mode 100644
index 00000000000..facc19706ea
--- /dev/null
+++ b/components/gcp/ml_engine/batch_predict/component.yaml
@@ -0,0 +1,50 @@
+# Copyright 2018 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: Batch predict against a model with Cloud ML Engine
+description: |
+  Creates a MLEngine batch prediction job.
+inputs:  # The first six inputs (project_id .. region) declare no default; the remaining four have one.
+  - {name: project_id, description: 'Required. The ID of the parent project of the job.'}
+  - {name: model_path, description: 'The path to the model. It can be either: `projects/[PROJECT_ID]/models/[MODEL_ID]` or `projects/[PROJECT_ID]/models/[MODEL_ID]/versions/[VERSION_ID]` or a GCS path of a model file.' }
+  - {name: input_paths, description: 'Required. The Google Cloud Storage location of the input data files. May contain wildcards.' }
+  - {name: input_data_format, description: 'Required. The format of the input data files. See https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#DataFormat.' }
+  - {name: output_path, description: 'Required. The output Google Cloud Storage location.' }
+  - {name: region, description: 'Required. The Google Compute Engine region to run the prediction job in.' }
+  - {name: output_data_format, description: 'Optional. Format of the output data files, defaults to JSON.', default: ''}
+  - {name: prediction_input, description: 'Input parameters to create a prediction job.', default: ''}
+  - {name: job_id_prefix, description: 'The prefix of the generated job id.', default: ''}
+  - {name: wait_interval, description: 'Optional wait interval between calls to get job status. Defaults to 30.', default: '30'}
+outputs:
+  - {name: job_id, description: 'The ID of the created job.'}
+implementation:
+  container:
+    image: gcr.io/ml-pipeline-dogfood/ml-pipeline-gcp:latest
+    args: [  # Each --flag is followed by an {inputValue: ...} placeholder naming the input of the same name.
+      kfp_component.google.ml_engine, batch_predict,
+      --project_id, {inputValue: project_id},
+      --model_path, {inputValue: model_path},
+      --input_paths, {inputValue: input_paths},
+      --input_data_format, {inputValue: input_data_format},
+      --output_path, {inputValue: output_path},
+      --region, {inputValue: region},
+      --output_data_format, {inputValue: output_data_format},
+      --prediction_input, {inputValue: prediction_input},
+      --job_id_prefix, {inputValue: job_id_prefix},
+      --wait_interval, {inputValue: wait_interval}
+    ]
+    env:  # NOTE(review): "{{pod.name}}" looks like a workflow-engine template variable; confirm where it is expanded.
+      KFP_POD_NAME: "{{pod.name}}"
+    fileOutputs:  # Keys match the names under `outputs`; values are file paths.
+      job_id: /tmp/kfp/output/ml_engine/job_id.txt
\ No newline at end of file
diff --git a/components/gcp/ml_engine/deploy/component.yaml b/components/gcp/ml_engine/deploy/component.yaml
new file mode 100644
index 00000000000..e57181ec1cb
--- /dev/null
+++ b/components/gcp/ml_engine/deploy/component.yaml
@@ -0,0 +1,53 @@
+# Copyright 2018 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: Deploy a model to Cloud ML Engine
+description: |
+  Creates a Cloud Machine Learning version and optionally a model if it does not exist.
+inputs:  # model_uri and project_id declare no default; every other input has one.
+  - {name: model_uri,                 description: 'Required, the GCS URI which contains a model file. The commonly used TF model search path (export/exporter) will be used if it exists.'}
+  - {name: project_id,                description: 'Required, the ID of the parent project.'}
+  - {name: model_id,                  description: 'Optional, the user provided name of the model.', default: '' }
+  - {name: version_id,                description: 'Optional, the user provided name of the version. If it is not provided, the operation uses a random name.', default: '' }
+  - {name: runtime_version,           description: 'Optional, the Cloud ML Engine runtime version to use for this deployment. If not set, Cloud ML Engine uses the default stable version, 1.0.', default: '' }
+  - {name: python_version,            description: 'Optional, the version of Python used in prediction. If not set, the default version is `2.7`. Python `3.5` is available when runtimeVersion is set to `1.4` and above. Python `2.7` works with all supported runtime versions.', default: '' }
+  - {name: version,                   description: 'Optional, the payload of the new version.', default: '' }
+  - {name: replace_existing_version,  description: 'Boolean flag indicates whether to replace existing version in case of conflict.', default: 'False' }
+  - {name: set_default,               description: 'Boolean flag indicates whether to set the new version as default version in the model.', default: 'False'}
+  - {name: wait_interval,             description: 'The interval to wait for a long running operation.', default: '30'}
+outputs:
+  - {name: model_uri,     description: 'The URI of the model.'}
+  - {name: model_name,    description: 'The name of the deployed model.'}
+  - {name: version_name,  description: 'The name of the deployed version.'}
+implementation:
+  container:
+    image: gcr.io/ml-pipeline-dogfood/ml-pipeline-gcp:latest
+    args: [  # Every {inputValue: ...} placeholder must name an input declared above; each flag carries that same name.
+      kfp_component.google.ml_engine, deploy,
+      --model_uri, {inputValue: model_uri},
+      --project_id, {inputValue: project_id},
+      --model_id, {inputValue: model_id},
+      --version_id, {inputValue: version_id},
+      --runtime_version, {inputValue: runtime_version}, --python_version, {inputValue: python_version},
+      --version, {inputValue: version},
+      --replace_existing_version, {inputValue: replace_existing_version},
+      --set_default, {inputValue: set_default},
+      --wait_interval, {inputValue: wait_interval}
+    ]
+    env:  # NOTE(review): "{{pod.name}}" looks like a workflow-engine template variable; confirm where it is expanded.
+      KFP_POD_NAME: "{{pod.name}}"
+    fileOutputs:  # Keys match the names under `outputs`; values are file paths.
+      model_uri: /tmp/kfp/output/ml_engine/model_uri.txt
+      model_name: /tmp/kfp/output/ml_engine/model_name.txt
+      version_name: /tmp/kfp/output/ml_engine/version_name.txt
\ No newline at end of file
diff --git a/components/gcp/ml_engine/train/component.yaml b/components/gcp/ml_engine/train/component.yaml
new file mode 100644
index 00000000000..db465d7bea9
--- /dev/null
+++ b/components/gcp/ml_engine/train/component.yaml
@@ -0,0 +1,56 @@
+# Copyright 2018 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: Train a model with Cloud ML Engine
+description: |
+  Submits a Cloud Machine Learning training job.
+inputs:  # Only project_id declares no default; every other input has one.
+  - {name: project_id,        description: 'Required. The ID of the parent project of the job.'}
+  - {name: python_module,     description: 'The Python module name to run after installing the packages.', default: ''}
+  - {name: package_uris,      description: 'The Google Cloud Storage location of the packages with the training program and any additional dependencies. The maximum number of package URIs is 100.', default: ''}
+  - {name: region,            description: 'The Google Compute Engine region to run the training job in.', default: ''}
+  - {name: args,              description: 'Command line arguments to pass to the program.', default: ''}
+  - {name: job_dir,           description: 'A Google Cloud Storage path in which to store training outputs and other data needed for training. This path is passed to your TensorFlow program as the `--job-dir` command-line argument. The benefit of specifying this field is that Cloud ML validates the path for use in training.', default: ''}
+  - {name: python_version,    description: 'The version of Python used in training. If not set, the default version is `2.7`. Python `3.5` is available when runtimeVersion is set to `1.4` and above.', default: ''}
+  - {name: runtime_version,   description: 'The Cloud ML Engine runtime version to use for training. If not set, Cloud ML Engine uses the default stable version, 1.0. ', default: ''}
+  - {name: master_image_uri,  description: 'The Docker image to run on the master replica. This image must be in Container Registry.', default: ''}
+  - {name: worker_image_uri,  description: 'The Docker image to run on the worker replica. This image must be in Container Registry.', default: ''}
+  - {name: training_input,    description: 'Input parameters to create a training job.', default: ''}
+  - {name: job_id_prefix,     description: 'The prefix of the generated job id.', default: ''}
+  - {name: wait_interval,     description: 'Optional wait interval between calls to get job status. Defaults to 30.', default: '30'}
+outputs:
+  - {name: job_id,            description: 'The ID of the created job.'}
+implementation:
+  container:
+    image: gcr.io/ml-pipeline-dogfood/ml-pipeline-gcp:latest
+    args: [  # Each --flag is followed by an {inputValue: ...} placeholder naming the input of the same name.
+      kfp_component.google.ml_engine, train,
+      --project_id, {inputValue: project_id},
+      --python_module, {inputValue: python_module},
+      --package_uris, {inputValue: package_uris},
+      --region, {inputValue: region},
+      --args, {inputValue: args},
+      --job_dir, {inputValue: job_dir},
+      --python_version, {inputValue: python_version},
+      --runtime_version, {inputValue: runtime_version},
+      --master_image_uri, {inputValue: master_image_uri},
+      --worker_image_uri, {inputValue: worker_image_uri},
+      --training_input, {inputValue: training_input},
+      --job_id_prefix, {inputValue: job_id_prefix},
+      --wait_interval, {inputValue: wait_interval}
+    ]
+    env:  # NOTE(review): "{{pod.name}}" looks like a workflow-engine template variable; confirm where it is expanded.
+      KFP_POD_NAME: "{{pod.name}}"
+    fileOutputs:  # Keys match the names under `outputs`; values are file paths.
+      job_id: /tmp/kfp/output/ml_engine/job_id.txt
\ No newline at end of file