From dac47011a35056d6fc3aba274f972b5dc975d620 Mon Sep 17 00:00:00 2001 From: Hongye Sun Date: Thu, 6 Dec 2018 13:45:48 -0800 Subject: [PATCH 1/2] support tpu settings in dsl --- sdk/python/kfp/compiler/compiler.py | 4 ++ sdk/python/kfp/dsl/_container_op.py | 48 ++++++++++++++++--- sdk/python/tests/compiler/testdata/basic.py | 1 + sdk/python/tests/compiler/testdata/basic.yaml | 4 ++ 4 files changed, 50 insertions(+), 7 deletions(-) diff --git a/sdk/python/kfp/compiler/compiler.py b/sdk/python/kfp/compiler/compiler.py index 20f3d0a0056..40884ddce41 100644 --- a/sdk/python/kfp/compiler/compiler.py +++ b/sdk/python/kfp/compiler/compiler.py @@ -143,6 +143,10 @@ def _op_to_template(self, op): template['container']['env'] = list(map(self._convert_k8s_obj_to_dic, op.env_variables)) if op.volume_mounts: template['container']['volumeMounts'] = list(map(self._convert_k8s_obj_to_dic, op.volume_mounts)) + + if op.pod_annotations: + template['metadata'] = {'annotations': op.pod_annotations} + return template def _get_groups_for_ops(self, root_group): diff --git a/sdk/python/kfp/dsl/_container_op.py b/sdk/python/kfp/dsl/_container_op.py index de29767fe49..fc18320bae0 100644 --- a/sdk/python/kfp/dsl/_container_op.py +++ b/sdk/python/kfp/dsl/_container_op.py @@ -59,6 +59,7 @@ def __init__(self, name: str, image: str, command: str=None, arguments: str=None self.volumes = [] self.volume_mounts = [] self.env_variables = [] + self.pod_annotations = {} matches = [] if arguments: @@ -127,16 +128,16 @@ def _validate_cpu_string(self, cpu_string): raise ValueError('Invalid cpu string. Should be float or integer, or integer followed ' 'by "m".') - def _validate_gpu_string(self, gpu_string): - "Validate a given string is valid for gpu limit." + def _validate_positive_number(self, str_value, param_name): + "Validate a given string is in positive integer format." try: - gpu_value = int(gpu_string) + int_value = int(str_value) except ValueError: - raise ValueError('Invalid gpu string. Should be integer.') + raise ValueError('Invalid {}. Should be integer.'.format(param_name)) - if gpu_value <= 0: - raise ValueError('gpu must be positive integer.') + if int_value <= 0: + raise ValueError('{} must be positive integer.'.format(param_name)) def add_resource_limit(self, resource_name, value): """Add the resource limit of the container. @@ -212,7 +213,7 @@ def set_gpu_limit(self, gpu, vendor = "nvidia"): are: 'nvidia' (default), and 'amd'. """ - self._validate_gpu_string(gpu) + self._validate_positive_number(gpu, 'gpu') if vendor != 'nvidia' and vendor != 'amd': raise ValueError('vendor can only be nvidia or amd.') @@ -268,5 +269,38 @@ def add_node_selector_constraint(self, label_name, value): self.node_selector[label_name] = value return self + def add_pod_annotation(self, name: str, value: str): + """Adds a pod's metadata annotation. + + Args: + name: The name of the annotation. + value: The value of the annotation. + """ + + self.pod_annotations[name] = value + return self + + + def set_tpu(self, tpu_cores: str, tpu_resource: str, tf_version: str): + """Sets TPU spec in the container op. + + Args: + tpu_cores: Required. The number of cores of TPU resource. + For example, the value can be '8', '32', '128', etc. + Check more details at: https://cloud.google.com/tpu/docs/kubernetes-engine-setup#pod-spec. + tpu_resource: Required. The resource name of the TPU resource. + For example, the value can be 'v2', 'preemptible-v1', 'v3' or 'preemptible-v3'. + Check more details at: https://cloud.google.com/tpu/docs/kubernetes-engine-setup#pod-spec. + tf_version: Required. The TensorFlow version that the TPU nodes use. + For example, the value can be '1.12', '1.11', '1.9' or '1.8'. + Check more details at: https://cloud.google.com/tpu/docs/supported-versions. + """ + + self._validate_positive_number(tpu_cores, 'tpu_cores') + self.add_pod_annotation('tf-version.cloud-tpus.google.com', tf_version) + self.add_resource_limit('cloud-tpus.google.com/{}'.format(tpu_resource), tpu_cores) + return self + + def __repr__(self): return str({self.__class__.__name__: self.__dict__}) diff --git a/sdk/python/tests/compiler/testdata/basic.py b/sdk/python/tests/compiler/testdata/basic.py index 85f12d206a4..3329c74325a 100644 --- a/sdk/python/tests/compiler/testdata/basic.py +++ b/sdk/python/tests/compiler/testdata/basic.py @@ -87,3 +87,4 @@ def save_most_frequent_word(message: str, outputpath: str): saver.set_cpu_limit('0.5') saver.set_gpu_limit('2') saver.add_node_selector_constraint('cloud.google.com/gke-accelerator', 'nvidia-tesla-k80') + saver.set_tpu(tpu_cores = '8', tpu_resource = 'v2', tf_version = '1.12') diff --git a/sdk/python/tests/compiler/testdata/basic.yaml b/sdk/python/tests/compiler/testdata/basic.yaml index f9da8393bba..4298bf1e751 100644 --- a/sdk/python/tests/compiler/testdata/basic.yaml +++ b/sdk/python/tests/compiler/testdata/basic.yaml @@ -140,12 +140,16 @@ spec: image: google/cloud-sdk resources: limits: + cloud-tpus.google.com/v2: "8" cpu: "0.5" nvidia.com/gpu: "2" inputs: parameters: - name: get-frequent-word - name: outputpath + metadata: + annotations: + tf-version.cloud-tpus.google.com: "1.12" name: save nodeSelector: cloud.google.com/gke-accelerator: nvidia-tesla-k80 From d70722a4b37f86f919bebd1fbfe6ec626d873ee9 Mon Sep 17 00:00:00 2001 From: Hongye Sun Date: Thu, 6 Dec 2018 15:30:52 -0800 Subject: [PATCH 2/2] fix issues from review comment --- sdk/python/kfp/compiler/compiler.py | 8 ++++++-- sdk/python/kfp/dsl/_container_op.py | 22 ++++++--------------- sdk/python/kfp/gcp.py | 22 +++++++++++++++++++++ sdk/python/tests/compiler/testdata/basic.py | 3 ++- 4 files changed, 36 insertions(+), 19 deletions(-) diff --git a/sdk/python/kfp/compiler/compiler.py b/sdk/python/kfp/compiler/compiler.py index 40884ddce41..03f70b9e2b5 100644 --- a/sdk/python/kfp/compiler/compiler.py +++ b/sdk/python/kfp/compiler/compiler.py @@ -144,8 +144,12 @@ def _op_to_template(self, op): if op.volume_mounts: template['container']['volumeMounts'] = list(map(self._convert_k8s_obj_to_dic, op.volume_mounts)) - if op.pod_annotations: - template['metadata'] = {'annotations': op.pod_annotations} + if op.pod_annotations or op.pod_labels: + template['metadata'] = {} + if op.pod_annotations: + template['metadata']['annotations'] = op.pod_annotations + if op.pod_labels: + template['metadata']['labels'] = op.pod_labels return template diff --git a/sdk/python/kfp/dsl/_container_op.py b/sdk/python/kfp/dsl/_container_op.py index fc18320bae0..e1f5d7042c6 100644 --- a/sdk/python/kfp/dsl/_container_op.py +++ b/sdk/python/kfp/dsl/_container_op.py @@ -60,6 +60,7 @@ def __init__(self, name: str, image: str, command: str=None, arguments: str=None self.volume_mounts = [] self.env_variables = [] self.pod_annotations = {} + self.pod_labels = {} matches = [] if arguments: @@ -280,27 +281,16 @@ def add_pod_annotation(self, name: str, value: str): self.pod_annotations[name] = value return self - - def set_tpu(self, tpu_cores: str, tpu_resource: str, tf_version: str): - """Sets TPU spec in the container op. + def add_pod_label(self, name: str, value: str): + """Adds a pod's metadata label. Args: - tpu_cores: Required. The number of cores of TPU resource. - For example, the value can be '8', '32', '128', etc. - Check more details at: https://cloud.google.com/tpu/docs/kubernetes-engine-setup#pod-spec. - tpu_resource: Required. The resource name of the TPU resource. - For example, the value can be 'v2', 'preemptible-v1', 'v3' or 'preemptible-v3'. - Check more details at: https://cloud.google.com/tpu/docs/kubernetes-engine-setup#pod-spec. - tf_version: Required. The TensorFlow version that the TPU nodes use. - For example, the value can be '1.12', '1.11', '1.9' or '1.8'. - Check more details at: https://cloud.google.com/tpu/docs/supported-versions. + name: The name of the label. + value: The value of the label. """ - self._validate_positive_number(tpu_cores, 'tpu_cores') - self.add_pod_annotation('tf-version.cloud-tpus.google.com', tf_version) - self.add_resource_limit('cloud-tpus.google.com/{}'.format(tpu_resource), tpu_cores) + self.pod_labels[name] = value return self - def __repr__(self): return str({self.__class__.__name__: self.__dict__}) diff --git a/sdk/python/kfp/gcp.py b/sdk/python/kfp/gcp.py index e271d26e8d7..9b346d4b56b 100644 --- a/sdk/python/kfp/gcp.py +++ b/sdk/python/kfp/gcp.py @@ -56,3 +56,25 @@ def _use_gcp_secret(task): ) return _use_gcp_secret + +def use_tpu(tpu_cores: int, tpu_resource: str, tf_version: str): + """An operator that configures GCP TPU spec in a container op. + + Args: + tpu_cores: Required. The number of cores of TPU resource. + For example, the value can be '8', '32', '128', etc. + Check more details at: https://cloud.google.com/tpu/docs/kubernetes-engine-setup#pod-spec. + tpu_resource: Required. The resource name of the TPU resource. + For example, the value can be 'v2', 'preemptible-v1', 'v3' or 'preemptible-v3'. + Check more details at: https://cloud.google.com/tpu/docs/kubernetes-engine-setup#pod-spec. + tf_version: Required. The TensorFlow version that the TPU nodes use. + For example, the value can be '1.12', '1.11', '1.9' or '1.8'. + Check more details at: https://cloud.google.com/tpu/docs/supported-versions. + """ + + def _set_tpu_spec(task): + task.add_pod_annotation('tf-version.cloud-tpus.google.com', tf_version) + task.add_resource_limit('cloud-tpus.google.com/{}'.format(tpu_resource), str(tpu_cores)) + return task + + return _set_tpu_spec diff --git a/sdk/python/tests/compiler/testdata/basic.py b/sdk/python/tests/compiler/testdata/basic.py index 3329c74325a..a81cf3a4d1f 100644 --- a/sdk/python/tests/compiler/testdata/basic.py +++ b/sdk/python/tests/compiler/testdata/basic.py @@ -14,6 +14,7 @@ import kfp.dsl as dsl +import kfp.gcp as gcp class GetFrequentWordOp(dsl.ContainerOp): @@ -87,4 +88,4 @@ def save_most_frequent_word(message: str, outputpath: str): saver.set_cpu_limit('0.5') saver.set_gpu_limit('2') saver.add_node_selector_constraint('cloud.google.com/gke-accelerator', 'nvidia-tesla-k80') - saver.set_tpu(tpu_cores = '8', tpu_resource = 'v2', tf_version = '1.12') + saver.apply(gcp.use_tpu(tpu_cores = 8, tpu_resource = 'v2', tf_version = '1.12'))