From 31deca7b01c45864822ddde0af0c0e04e3cbb2ae Mon Sep 17 00:00:00 2001 From: Alexey Volkov Date: Tue, 12 May 2020 18:24:26 -0700 Subject: [PATCH] SDK - Components - Calculate component hash digest (#3726) * SDK - Components - Calculate component hash digest The digest is calculated when loading the component from URL, file or text. Slightly refactored component loading - streams are no longer used, only bytes. TODO: Calculate the digest if missing TODO: Report possible digest conflicts * Updated the test graph component * Using the actual digest in the test --- sdk/python/kfp/components/_components.py | 42 ++++++++++--------- .../tests/components/test_components.py | 12 ++++++ ...tockout_prediction_pipeline.component.yaml | 5 +++ 3 files changed, 40 insertions(+), 19 deletions(-) diff --git a/sdk/python/kfp/components/_components.py b/sdk/python/kfp/components/_components.py index fa91b9c93721..35df17281296 100644 --- a/sdk/python/kfp/components/_components.py +++ b/sdk/python/kfp/components/_components.py @@ -129,7 +129,7 @@ def _fix_component_uri(uri: str) -> str: def _load_component_spec_from_file(path) -> ComponentSpec: with open(path, 'rb') as component_stream: - return _load_component_spec_from_yaml_or_zip_stream(component_stream) + return _load_component_spec_from_yaml_or_zip_bytes(component_stream.read()) def _load_component_spec_from_url(url: str): @@ -148,33 +148,30 @@ def _load_component_spec_from_url(url: str): def _load_component_spec_from_yaml_or_zip_bytes(data: bytes): - import io - component_stream = io.BytesIO(data) - return _load_component_spec_from_yaml_or_zip_stream(component_stream) - - -def _load_component_spec_from_yaml_or_zip_stream(stream) -> ComponentSpec: - '''Loads component spec from a stream. + '''Loads component spec from binary data. - The stream can be YAML or a zip file with a component.yaml file inside. + The data can be a YAML file or a zip file with a component.yaml file inside. 
''' import zipfile - stream.seek(0) + import io + stream = io.BytesIO(data) if zipfile.is_zipfile(stream): stream.seek(0) with zipfile.ZipFile(stream) as zip_obj: - with zip_obj.open(_COMPONENT_FILE_NAME_IN_ARCHIVE) as component_stream: - return _load_component_spec_from_component_text( - text_or_file=component_stream, - ) - else: - stream.seek(0) - return _load_component_spec_from_component_text(stream) + data = zip_obj.read(_COMPONENT_FILE_NAME_IN_ARCHIVE) + return _load_component_spec_from_component_text(data) -def _load_component_spec_from_component_text(text_or_file) -> ComponentSpec: - component_dict = load_yaml(text_or_file) +def _load_component_spec_from_component_text(text) -> ComponentSpec: + component_dict = load_yaml(text) component_spec = ComponentSpec.from_dict(component_dict) + + # Calculating hash digest for the component + import hashlib + data = text if isinstance(text, bytes) else text.encode('utf-8') + digest = hashlib.sha256(data).hexdigest() + component_spec._digest = digest + return component_spec @@ -287,6 +284,13 @@ def _create_task_factory_from_component_spec(component_spec:ComponentSpec, compo component_ref = ComponentReference(spec=component_spec, url=component_filename) else: component_ref.spec = component_spec + + digest = getattr(component_spec, '_digest', None) + # TODO: Calculate the digest if missing + if digest: + # TODO: Report possible digest conflicts + component_ref.digest = digest + def create_task_from_component_and_arguments(pythonic_arguments): arguments = { diff --git a/sdk/python/tests/components/test_components.py b/sdk/python/tests/components/test_components.py index 76f642d303b2..783ef3192358 100644 --- a/sdk/python/tests/components/test_components.py +++ b/sdk/python/tests/components/test_components.py @@ -87,6 +87,18 @@ def test_loading_minimal_component(self): self.assertEqual(task_factory1.component_spec.implementation.container.image, component_dict['implementation']['container']['image']) + def 
test_digest_of_loaded_component(self): + component_text = textwrap.dedent('''\ + implementation: + container: + image: busybox + ''' + ) + task_factory1 = comp.load_component_from_text(component_text) + task1 = task_factory1() + + self.assertEqual(task1.component_ref.digest, '1ede211233e869581d098673962c2c1e8c1e4cebb7cf5d7332c2f73cb4900823') + def test_accessing_component_spec_from_task_factory(self): component_text = '''\ implementation: diff --git a/sdk/python/tests/components/test_data/retail_product_stockout_prediction_pipeline.component.yaml b/sdk/python/tests/components/test_data/retail_product_stockout_prediction_pipeline.component.yaml index b9afcd09ed87..286d46125f15 100644 --- a/sdk/python/tests/components/test_data/retail_product_stockout_prediction_pipeline.component.yaml +++ b/sdk/python/tests/components/test_data/retail_product_stockout_prediction_pipeline.component.yaml @@ -38,6 +38,7 @@ implementation: tasks: Automl create dataset for tables: componentRef: + digest: 98381958ba8b0d2b83a23a78f482f08b48e665409820b3a6254bccdbcf206df3 url: https://raw.githubusercontent.com/kubeflow/pipelines/b3179d86b239a08bf4884b50dbf3a9151da96d66/components/gcp/automl/create_dataset_for_tables/component.yaml arguments: gcp_project_id: @@ -51,6 +52,7 @@ implementation: inputName: dataset_display_name Automl import data from bigquery: componentRef: + digest: a965621525a9081a8c7d4c12806bf4359a03b9842a7d3e891ab5b48422dbe527 url: https://raw.githubusercontent.com/kubeflow/pipelines/b3179d86b239a08bf4884b50dbf3a9151da96d66/components/gcp/automl/import_data_from_bigquery/component.yaml arguments: dataset_path: @@ -63,6 +65,7 @@ implementation: inputName: dataset_bq_input_uri Automl split dataset table column names: componentRef: + digest: a77ef9ecb87e543290a02b3fa933bcd5e67947a12f6d011fdd37bf38c1b26ade url: https://raw.githubusercontent.com/kubeflow/pipelines/b3179d86b239a08bf4884b50dbf3a9151da96d66/components/gcp/automl/split_dataset_table_column_names/component.yaml 
arguments: dataset_path: @@ -76,6 +79,7 @@ implementation: table_index: '0' Automl create model for tables: componentRef: + digest: e52ee882685380988ee2f4de6beacdcd0d2ab21d37bef45c4e16a20a224d374e url: https://raw.githubusercontent.com/kubeflow/pipelines/b3179d86b239a08bf4884b50dbf3a9151da96d66/components/gcp/automl/create_model_for_tables/component.yaml arguments: gcp_project_id: @@ -108,6 +112,7 @@ implementation: inputName: train_budget_milli_node_hours Automl prediction service batch predict: componentRef: + digest: 908ea1855f5aa3d35f60145f0f15007ea437b35b0a1be2fd1d0db5a76221cad1 url: https://raw.githubusercontent.com/kubeflow/pipelines/b3179d86b239a08bf4884b50dbf3a9151da96d66/components/gcp/automl/prediction_service_batch_predict/component.yaml arguments: model_path: