[CDF-24141] 👻 Credentials hash (#1468)

* refactor: added flag * build; enable flag * feat: implemented workflow trigger hash storing * tests: updated tests * feat; hash workflow trigger * tests: update approval client * feat: store hash transformations * fix: introduced bug * fix: first passing * tests: extend test to detect changes * tests: extend test * tests: added failing test * feat: hash function schedule * refactor: cleanup * docs * tests: update outdated * fix: typos
cognitedata · Feb 19, 2025 · e4c7e70 · e4c7e70
1 parent 8e45d1f
commit e4c7e70
Show file tree

Hide file tree

Showing 12 changed files with 306 additions and 94 deletions.
diff --git a/CHANGELOG.cdf-tk.md b/CHANGELOG.cdf-tk.md
@@ -23,6 +23,9 @@ Changes are grouped as follows:
 - [alpha feature] When the flag `strict-validation` is set to `true` in the `cdf.toml` file, the Toolkit will
   no longer use its own authentication as a fallback when deploying WorkflowTriggers and FunctionSchedules in projects
   where `validation-type` is set to anything other than `dev`. This will be the default behavior from version `0.5.0`.
+- [alpha feature] When the flag `credentials-hash` is set to `true` in the `cdf.toml` file, the Toolkit will hash
+  the credentials of Transformations (if present), FunctionSchedules, and WorkflowTriggers before deploying them. This
+  will be used to detect if the credentials have been changed when running `cdf deploy`.
 
 ### Improved
 

diff --git a/cdf.toml b/cdf.toml
@@ -9,6 +9,9 @@ module-repeat = true
 dump-extended = true
 populate = true
 strict-validation = true
+# Setting this to true will change all snapshots for WorkflowTriggers/FunctionSchedules.
+# For simplicity (to avoid keeping track of two sets of snapshots), we keep this set to false in development.
+credentials-hash = false
 
 [plugins]
 run = true

diff --git a/cognite_toolkit/_cdf_tk/feature_flags.py b/cognite_toolkit/_cdf_tk/feature_flags.py
@@ -32,6 +32,11 @@ class Flags(Enum):
         "visible": True,
         "description": "For Workflow/Transformations/Function do not fallback to Toolkit credentials when validation-type != 'dev'",
     }
+    CREDENTIALS_HASH: ClassVar[dict[str, Any]] = {  # type: ignore[misc]
+        "visible": True,
+        "description": "Stores a hash of the credentials of Workflow/Transformation/Function in the resources such that"
+        " the resource is updated when the credentials change",
+    }
 
     def is_enabled(self) -> bool:
         return FeatureFlag.is_enabled(self)

diff --git a/cognite_toolkit/_cdf_tk/loaders/_resource_loaders/function_loaders.py b/cognite_toolkit/_cdf_tk/loaders/_resource_loaders/function_loaders.py
@@ -5,7 +5,6 @@
 from pathlib import Path
 from typing import Any, cast, final
 
-from cognite.client.credentials import OAuthClientCredentials
 from cognite.client.data_classes import (
     ClientCredentials,
     Function,
@@ -34,7 +33,6 @@
 from cognite_toolkit._cdf_tk.exceptions import (
     ResourceCreationError,
     ToolkitRequiredValueError,
-    ToolkitTypeError,
 )
 from cognite_toolkit._cdf_tk.feature_flags import Flags
 from cognite_toolkit._cdf_tk.loaders._base_loaders import ResourceLoader
@@ -44,6 +42,7 @@
     calculate_secure_hash,
     calculate_str_or_file_hash,
 )
+from cognite_toolkit._cdf_tk.utils.cdf import read_auth
 
 from .auth_loaders import GroupAllScopedLoader
 from .data_organization_loaders import DataSetsLoader
@@ -351,6 +350,9 @@ class FunctionScheduleLoader(
     parent_resource = frozenset({FunctionLoader})
     support_update = False
 
+    _hash_key = "cdf-auth"
+    _description_character_limit = 500
+
     def __init__(self, client: ToolkitClient, build_path: Path | None, console: Console | None):
         super().__init__(client, build_path, console)
         self.authentication_by_id: dict[FunctionScheduleID, ClientCredentials] = {}
@@ -401,32 +403,34 @@ def get_dependent_items(cls, item: dict) -> Iterable[tuple[type[ResourceLoader],
         if "functionExternalId" in item:
             yield FunctionLoader, item["functionExternalId"]
 
-    def load_resource(self, resource: dict[str, Any], is_dry_run: bool = False) -> FunctionScheduleWrite:
-        identifier = self.get_id(resource)
-        auth = resource.pop("authentication", None)
-        if auth is None:
-            if (self.client.config.is_strict_validation and Flags.STRICT_VALIDATION.is_enabled()) or not isinstance(
-                self.client.config.credentials, OAuthClientCredentials
-            ):
-                raise ToolkitRequiredValueError(f"Authentication is missing for schedule {identifier!r}.")
-            else:
-                HighSeverityWarning(
-                    f"Authentication is missing for schedule {identifier!r}. Falling back to the Toolkit credentials"
-                ).print_warning(console=self.console)
-            credentials = ClientCredentials(
-                self.client.config.credentials.client_id, self.client.config.credentials.client_secret
-            )
-        elif not isinstance(auth, dict):
-            raise ToolkitTypeError(f"Authentication must be a dictionary for schedule {identifier!r}")
-        elif "clientId" not in auth or "clientSecret" not in auth:
-            raise ToolkitRequiredValueError(
-                f"Authentication must contain clientId and clientSecret for schedule {identifier!r}"
-            )
-        else:
-            credentials = ClientCredentials(auth["clientId"], auth["clientSecret"])
-        self.authentication_by_id[identifier] = credentials
+    def load_resource_file(
+        self, filepath: Path, environment_variables: dict[str, str | None] | None = None
+    ) -> list[dict[str, Any]]:
+        resources = super().load_resource_file(filepath, environment_variables)
+        # We need to do the auth hash calculation here, as the output of the load_resource_file
+        # is used to compare with the CDF resource.
+        for resource in resources:
+            identifier = self.get_id(resource)
+            credentials = read_auth(identifier, resource, self.client, "function schedule", self.console)
+            self.authentication_by_id[identifier] = credentials
+            if Flags.CREDENTIALS_HASH.is_enabled():
+                auth_hash = calculate_secure_hash(credentials.dump(camel_case=True), shorten=True)
+                extra_str = f" {self._hash_key}: {auth_hash}"
+                if "description" not in resource:
+                    resource["description"] = extra_str[1:]
+                elif len(resource["description"]) + len(extra_str) < self._description_character_limit:
+                    resource["description"] += f"{extra_str}"
+                else:
+                    LowSeverityWarning(
+                        f"Description is too long for schedule {identifier!r}. Truncating..."
+                    ).print_warning(console=self.console)
+                    truncation = self._description_character_limit - len(extra_str) - 3
+                    resource["description"] = f"{resource['description'][:truncation]}...{extra_str}"
+        return resources
 
+    def load_resource(self, resource: dict[str, Any], is_dry_run: bool = False) -> FunctionScheduleWrite:
         if "functionId" in resource:
+            identifier = self.get_id(resource)
             LowSeverityWarning(f"FunctionId will be ignored in the schedule {identifier!r}").print_warning(
                 console=self.console
             )

diff --git a/cognite_toolkit/_cdf_tk/loaders/_resource_loaders/transformation_loaders.py b/cognite_toolkit/_cdf_tk/loaders/_resource_loaders/transformation_loaders.py
@@ -70,8 +70,10 @@
     ToolkitTypeError,
     ToolkitYAMLFormatError,
 )
+from cognite_toolkit._cdf_tk.feature_flags import Flags
 from cognite_toolkit._cdf_tk.loaders._base_loaders import ResourceLoader
 from cognite_toolkit._cdf_tk.utils import (
+    calculate_secure_hash,
     in_dict,
     load_yaml_inject_variables,
     quote_int_value_by_key_in_yaml,
@@ -114,6 +116,7 @@ class TransformationLoader(
         }
     )
     _doc_url = "Transformations/operation/createTransformations"
+    _hash_key = "-- cdf-auth"
 
     @property
     def display_name(self) -> str:
@@ -225,6 +228,22 @@ def load_resource_file(
                 )
             elif query_file:
                 item["query"] = safe_read(query_file)
+
+            if Flags.CREDENTIALS_HASH.is_enabled():
+                auth_dict: dict[str, Any] = {}
+                for key in [
+                    "authentication",
+                    "sourceOidcCredentials",
+                    "destinationOidcCredentials",
+                    "sourceNonce",
+                    "destinationNonce",
+                ]:
+                    if key in item:
+                        auth_dict[key] = item[key]
+                if auth_dict:
+                    auth_hash = calculate_secure_hash(auth_dict, shorten=True)
+                    if "query" in item:
+                        item["query"] = f"{self._hash_key}: {auth_hash}\n{item['query']}"
         return raw_list
 
     def load_resource(self, resource: dict[str, Any], is_dry_run: bool = False) -> TransformationWrite:
@@ -274,9 +293,11 @@ def dump_resource(self, resource: Transformation, local: dict[str, Any] | None =
         local = local or {}
         if data_set_id := dumped.pop("dataSetId", None):
             dumped["dataSetExternalId"] = self.client.lookup.data_sets.external_id(data_set_id)
+        if "isPublic" in dumped and "isPublic" not in local:
+            # Default set from server side.
+            dumped.pop("isPublic")
         if "authentication" in local:
-            # Todo: Need a way to detect changes in credentials instead of just assuming
-            #    that the credentials are always the same.
+            # The hash added to the beginning of the query detects the change in the authentication
             dumped["authentication"] = local["authentication"]
         return dumped
 

diff --git a/cognite_toolkit/_cdf_tk/loaders/_resource_loaders/workflow_loaders.py b/cognite_toolkit/_cdf_tk/loaders/_resource_loaders/workflow_loaders.py
@@ -19,7 +19,6 @@
 from pathlib import Path
 from typing import Any, final
 
-from cognite.client.credentials import OAuthClientCredentials
 from cognite.client.data_classes import (
     ClientCredentials,
     Workflow,
@@ -49,17 +48,16 @@
 from cognite_toolkit._cdf_tk.client import ToolkitClient
 from cognite_toolkit._cdf_tk.exceptions import (
     ToolkitRequiredValueError,
-    ToolkitTypeError,
 )
 from cognite_toolkit._cdf_tk.feature_flags import Flags
 from cognite_toolkit._cdf_tk.loaders._base_loaders import ResourceLoader
 from cognite_toolkit._cdf_tk.tk_warnings import (
-    HighSeverityWarning,
     LowSeverityWarning,
     MissingReferencedWarning,
     ToolkitWarning,
 )
-from cognite_toolkit._cdf_tk.utils import humanize_collection, to_directory_compatible
+from cognite_toolkit._cdf_tk.utils import calculate_secure_hash, humanize_collection, to_directory_compatible
+from cognite_toolkit._cdf_tk.utils.cdf import read_auth
 from cognite_toolkit._cdf_tk.utils.diff_list import diff_list_hashable, diff_list_identifiable
 
 from .auth_loaders import GroupAllScopedLoader
@@ -451,6 +449,9 @@ class WorkflowTriggerLoader(
 
     _doc_url = "Workflow-triggers/operation/CreateOrUpdateTriggers"
 
+    class _MetadataKey:
+        secret_hash = "cognite-toolkit-auth-hash"
+
     def __init__(self, client: ToolkitClient, build_dir: Path | None, console: Console | None = None):
         super().__init__(client, build_dir, console)
         self._authentication_by_id: dict[str, ClientCredentials] = {}
@@ -584,43 +585,38 @@ def get_dependent_items(cls, item: dict) -> Iterable[tuple[type[ResourceLoader],
             if "workflowVersion" in item:
                 yield WorkflowVersionLoader, WorkflowVersionId(item["workflowExternalId"], item["workflowVersion"])
 
+    def load_resource_file(
+        self, filepath: Path, environment_variables: dict[str, str | None] | None = None
+    ) -> list[dict[str, Any]]:
+        resources = super().load_resource_file(filepath, environment_variables)
+        # We need to do the auth hash calculation here, as the output of load_resource_file
+        # is used to compare with the CDF resource.
+        for resource in resources:
+            identifier = self.get_id(resource)
+            credentials = read_auth(identifier, resource, self.client, "workflow trigger", self.console)
+            self._authentication_by_id[identifier] = credentials
+            if Flags.CREDENTIALS_HASH.is_enabled():
+                # Always store the hash, also when the trigger already defines its own metadata;
+                # otherwise a change of credentials on such a trigger would go undetected.
+                metadata = resource.setdefault("metadata", {})
+                metadata[self._MetadataKey.secret_hash] = calculate_secure_hash(
+                    credentials.dump(camel_case=True), shorten=True
+                )
+        return resources
+
     def load_resource(self, resource: dict[str, Any], is_dry_run: bool = False) -> WorkflowTriggerUpsert:
         if isinstance(resource.get("data"), dict):
             resource["data"] = json.dumps(resource["data"])
-
-        identifier = self.get_id(resource)
-        auth = resource.pop("authentication", None)
-        if auth is None:
-            if (self.client.config.is_strict_validation and Flags.STRICT_VALIDATION.is_enabled()) or not isinstance(
-                self.client.config.credentials, OAuthClientCredentials
-            ):
-                raise ToolkitRequiredValueError(f"Authentication is missing for workflow trigger {identifier!r}.")
-            else:
-                HighSeverityWarning(
-                    f"Authentication is missing for workflow trigger {identifier!r}. Falling back to the Toolkit credentials"
-                ).print_warning(console=self.console)
-            credentials = ClientCredentials(
-                self.client.config.credentials.client_id, self.client.config.credentials.client_secret
-            )
-        elif not isinstance(auth, dict):
-            raise ToolkitTypeError(f"Authentication must be a dictionary for workflow trigger {identifier!r}")
-        elif "clientId" not in auth or "clientSecret" not in auth:
-            raise ToolkitRequiredValueError(
-                f"Authentication must contain clientId and clientSecret for workflow trigger {identifier!r}"
-            )
-        else:
-            credentials = ClientCredentials(auth["clientId"], auth["clientSecret"])
-
-        self._authentication_by_id[self.get_id(resource)] = credentials
         return WorkflowTriggerUpsert._load(resource)
 
     def dump_resource(self, resource: WorkflowTrigger, local: dict[str, Any] | None = None) -> dict[str, Any]:
         dumped = resource.as_write().dump()
         local = local or {}
         if isinstance(dumped.get("data"), str) and isinstance(local.get("data"), dict):
             dumped["data"] = json.loads(dumped["data"])
+
         if "authentication" in local:
-            # Note that change in the authentication will not be detected, and thus,
-            # will require a forced redeployment.
+            # Changes in auth will be detected by the hash. We need to do this to ensure
+            # that the pull command works.
             dumped["authentication"] = local["authentication"]
         return dumped
diff --git a/cognite_toolkit/_cdf_tk/utils/cdf.py b/cognite_toolkit/_cdf_tk/utils/cdf.py
@@ -1,13 +1,25 @@
-from collections.abc import Iterator
+from collections.abc import Hashable, Iterator
 from typing import Any, Literal, overload
 
+from cognite.client.credentials import OAuthClientCredentials
+from cognite.client.data_classes import (
+    ClientCredentials,
+)
 from cognite.client.data_classes.data_modeling import Edge, Node, ViewId
 from cognite.client.data_classes.filters import SpaceFilter
 from cognite.client.exceptions import CogniteAPIError
 from rich.console import Console
 
 from cognite_toolkit._cdf_tk.client import ToolkitClient
-from cognite_toolkit._cdf_tk.tk_warnings import MediumSeverityWarning
+from cognite_toolkit._cdf_tk.exceptions import (
+    ToolkitRequiredValueError,
+    ToolkitTypeError,
+)
+from cognite_toolkit._cdf_tk.feature_flags import Flags
+from cognite_toolkit._cdf_tk.tk_warnings import (
+    HighSeverityWarning,
+    MediumSeverityWarning,
+)
 
 
 @overload
@@ -76,3 +88,32 @@ def iterate_instances(
         if next_cursor is None:
             break
         body["cursor"] = next_cursor
+
+
+def read_auth(
+    identifier: Hashable,
+    resource: dict[str, Any],
+    client: ToolkitClient,
+    resource_name: str,
+    console: Console | None = None,
+) -> ClientCredentials:
+    auth = resource.get("authentication")
+    if auth is None:
+        if (client.config.is_strict_validation and Flags.STRICT_VALIDATION.is_enabled()) or not isinstance(
+            client.config.credentials, OAuthClientCredentials
+        ):
+            raise ToolkitRequiredValueError(f"Authentication is missing for {resource_name} {identifier!r}.")
+        else:
+            HighSeverityWarning(
+                f"Authentication is missing for {resource_name} {identifier!r}. Falling back to the Toolkit credentials"
+            ).print_warning(console=console)
+        credentials = ClientCredentials(client.config.credentials.client_id, client.config.credentials.client_secret)
+    elif not isinstance(auth, dict):
+        raise ToolkitTypeError(f"Authentication must be a dictionary for {resource_name} {identifier!r}")
+    elif "clientId" not in auth or "clientSecret" not in auth:
+        raise ToolkitRequiredValueError(
+            f"Authentication must contain clientId and clientSecret for {resource_name} {identifier!r}"
+        )
+    else:
+        credentials = ClientCredentials(auth["clientId"], auth["clientSecret"])
+    return credentials
diff --git a/cognite_toolkit/_cdf_tk/utils/hashing.py b/cognite_toolkit/_cdf_tk/utils/hashing.py
@@ -34,11 +34,14 @@ def calculate_directory_hash(
     return calculated
 
 
-def calculate_secure_hash(item: dict[str, Any]) -> str:
+def calculate_secure_hash(item: dict[str, Any], shorten: bool = False) -> str:
     """Calculate a secure hash of a dictionary"""
     sha256_hash = hashlib.sha512(usedforsecurity=True)
     sha256_hash.update(json.dumps(item, sort_keys=True).encode("utf-8"))
-    return sha256_hash.hexdigest()
+    calculated_hash = sha256_hash.hexdigest()
+    if shorten:
+        return calculated_hash[:8]
+    return calculated_hash
 
 
 def calculate_str_or_file_hash(content: str | Path, shorten: bool = False) -> str:

diff --git a/tests/test_unit/approval_client/client.py b/tests/test_unit/approval_client/client.py
@@ -138,10 +138,15 @@ class ApprovalToolkitClient:
     def __init__(self, mock_client: ToolkitClientMock):
         self._return_verify_resources = False
         self.mock_client = mock_client
+        credentials = MagicMock(spec=OAuthClientCredentials)
+        credentials.client_id = "toolkit-client-id"
+        credentials.client_secret = "toolkit-client-secret"
+        credentials.token_url = "https://toolkit.auth.com/oauth/token"
+        credentials.scopes = ["ttps://pytest-field.cognitedata.com/.default"]
         self.mock_client.config = ToolkitClientConfig(
             client_name=CLIENT_NAME,
             project="pytest-project",
-            credentials=MagicMock(spec=OAuthClientCredentials),
+            credentials=credentials,
             is_strict_validation=False,
         )
         # This is used to simulate the existing resources in CDF