
fix(profiling): default to libdatadog if autoinject/SSI is enabled (#8822)

In injected configurations, the Profiler's bundled dependencies can override
the dependencies of the underlying application. This is especially
problematic for the `protobuf` library.

This PR changes two things:
* Introduces a new "required" option for the libdatadog (libdd)
collector/exporter. Normally, if the user merely enables libdatadog, the
profiler makes a best effort to use it, but falls back to the old
collector/exporter if needed. When libdatadog is required, profiling is
disabled if libdatadog cannot be used.
* Has lib-injection set libdatadog to "required".
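
The enabled/required semantics described above can be sketched roughly as follows (note: `select_exporter` and its parameters are hypothetical illustrations, not the actual ddtrace API):

```python
# Hypothetical sketch of the "enabled" vs. "required" semantics; these names
# do not correspond to the actual ddtrace API.
def select_exporter(libdd_enabled, libdd_required, libdd_available):
    """Return the exporter to use, or None to disable profiling."""
    if libdd_enabled and libdd_available:
        return "libdatadog"  # native exporter; protobuf is never imported
    if libdd_required:
        return None  # required but unusable: disable profiling outright
    return "legacy"  # best-effort fallback to the pure-Python exporter
```

In injected (SSI) environments, lib-injection marks libdatadog as required, so a failure to load it disables profiling instead of falling back to the protobuf-based exporter.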

## Checklist

- [X] Change(s) are motivated and described in the PR description
- [X] Testing strategy is described if automated tests are not included
in the PR
- [X] Risks are described (performance impact, potential for breakage,
maintainability)
- [X] Change is maintainable (easy to change, telemetry, documentation)
- [X] [Library release note
guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html)
are followed or label `changelog/no-changelog` is set
- [X] Documentation is included (in-code, generated user docs, [public
corp docs](https://github.com/DataDog/documentation/))
- [X] Backport labels are set (if
[applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting))
- [x] If this PR changes the public interface, I've notified
`@DataDog/apm-tees`.
- [X] If change touches code that signs or publishes builds or packages,
or handles credentials of any kind, I've requested a review from
`@DataDog/security-design-and-guidance`.

## Reviewer Checklist

- [x] Title is accurate
- [x] All changes are related to the pull request's stated goal
- [x] Description motivates each change
- [x] Avoids breaking
[API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces)
changes
- [x] Testing strategy adequately addresses listed risks
- [x] Change is maintainable (easy to change, telemetry, documentation)
- [x] Release note makes sense to a user of the library
- [x] Author has acknowledged and discussed the performance implications
of this PR as reported in the benchmarks PR comment
- [x] Backport labels are set in a manner that is consistent with the
[release branch maintenance
policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)

---------

Co-authored-by: sanchda <sanchda@users.noreply.github.com>
Co-authored-by: Gabriele N. Tornetta <P403n1x87@users.noreply.github.com>
Co-authored-by: Emmett Butler <723615+emmettbutler@users.noreply.github.com>
4 people authored Apr 2, 2024
1 parent 228590b commit 1c1f54a
Showing 8 changed files with 147 additions and 66 deletions.
3 changes: 0 additions & 3 deletions ddtrace/profiling/collector/memalloc.py
Original file line number Diff line number Diff line change
@@ -71,9 +71,6 @@ def _start_service(self):
if _memalloc is None:
raise collector.CollectorUnavailable

if self._export_libdd_enabled and not ddup.is_available:
self._export_libdd_enabled = False

try:
_memalloc.start(self.max_nframe, self._max_events, self.heap_sample_size)
except RuntimeError:
19 changes: 1 addition & 18 deletions ddtrace/profiling/collector/stack.pyx
@@ -487,24 +487,7 @@ class StackCollector(collector.PeriodicCollector):

# If libdd is enabled, propagate the configuration
if config.export.libdd_enabled:
if not ddup.is_available:
# We don't report on this, since it's already been reported in profiler.py
set_use_libdd(False)

# If the user had also set stack.v2.enabled, then we need to disable that as well.
if self._stack_collector_v2_enabled:
self._stack_collector_v2_enabled = False
LOG.error("Stack v2 was requested, but the libdd collector could not be enabled. Falling back to the v1 stack sampler.")
else:
set_use_libdd(True)

# If stack v2 is requested, verify it is loaded properly and that libdd has been enabled.
if self._stack_collector_v2_enabled:
if not stack_v2.is_available:
self._stack_collector_v2_enabled = False
LOG.error("Stack v2 was requested, but it could not be enabled. Check debug logs for more information.")
if not use_libdd:
self._stack_collector_v2_enabled = False
set_use_libdd(True)

# If at the end of things, stack v2 is still enabled, then start the native thread running the v2 sampler
if self._stack_collector_v2_enabled:
102 changes: 62 additions & 40 deletions ddtrace/profiling/profiler.py
@@ -128,20 +128,23 @@ class _ProfilerInstance(service.Service):
init=False, factory=lambda: os.environ.get("AWS_LAMBDA_FUNCTION_NAME"), type=Optional[str]
)
_export_libdd_enabled = attr.ib(type=bool, default=config.export.libdd_enabled)
_export_libdd_required = attr.ib(type=bool, default=config.export.libdd_required)

ENDPOINT_TEMPLATE = "https://intake.profile.{}"

def _build_default_exporters(self):
# type: (...) -> List[exporter.Exporter]
_OUTPUT_PPROF = config.output_pprof
if _OUTPUT_PPROF:
# DEV: Import this only if needed to avoid importing protobuf
# unnecessarily
from ddtrace.profiling.exporter import file

return [
file.PprofFileExporter(prefix=_OUTPUT_PPROF),
]
if not self._export_libdd_enabled:
# If libdatadog support is enabled, we can skip this part
_OUTPUT_PPROF = config.output_pprof
if _OUTPUT_PPROF:
# DEV: Import this only if needed to avoid importing protobuf
# unnecessarily
from ddtrace.profiling.exporter import file

return [
file.PprofFileExporter(prefix=_OUTPUT_PPROF),
]

if self.url is not None:
endpoint = self.url
@@ -168,13 +171,11 @@ def _build_default_exporters(self):
if self._lambda_function_name is not None:
self.tags.update({"functionname": self._lambda_function_name})

# It's possible to fail to load the libdatadog collector, so we check. Any other consumers
# of libdd can do their own check, so we just log.
if self._export_libdd_enabled and not ddup.is_available:
LOG.error("Failed to load the libdd collector, falling back to the legacy collector")
self._export_libdd_enabled = False
elif self._export_libdd_enabled:
LOG.debug("Using the libdd collector")
# Did the user request the libdd collector? Better log it.
if self._export_libdd_enabled:
LOG.debug("The libdd collector is enabled")
if self._export_libdd_required:
LOG.debug("The libdd collector is required")

# Build the list of enabled Profiling features and send along as a tag
configured_features = []
@@ -194,6 +195,8 @@
configured_features.append("exp_dd")
else:
configured_features.append("exp_py")
if self._export_libdd_required:
configured_features.append("req_dd")
configured_features.append("CAP" + str(config.capture_pct))
configured_features.append("MAXF" + str(config.max_frames))
self.tags.update({"profiler_config": "_".join(configured_features)})
@@ -202,34 +205,53 @@
if self.endpoint_collection_enabled:
endpoint_call_counter_span_processor.enable()

# If libdd is enabled, then
# * If initialization fails, disable the libdd collector and fall back to the legacy exporter
# * If initialization fails and libdd is required, disable everything and return (error)
if self._export_libdd_enabled:
ddup.init(
env=self.env,
service=self.service,
version=self.version,
tags=self.tags, # type: ignore
max_nframes=config.max_frames,
url=endpoint,
)
else:
# DEV: Import this only if needed to avoid importing protobuf
# unnecessarily
from ddtrace.profiling.exporter import http

return [
http.PprofHTTPExporter(
service=self.service,
try:
ddup.init(
env=self.env,
tags=self.tags,
service=self.service,
version=self.version,
api_key=self.api_key,
endpoint=endpoint,
endpoint_path=endpoint_path,
enable_code_provenance=self.enable_code_provenance,
endpoint_call_counter_span_processor=endpoint_call_counter_span_processor,
tags=self.tags, # type: ignore
max_nframes=config.max_frames,
url=endpoint,
)
]
return []
return []
except Exception as e:
LOG.error("Failed to initialize libdd collector (%s), falling back to the legacy collector", e)
self._export_libdd_enabled = False
config.export.libdd_enabled = False

# If we're here and libdd was required, then there's nothing else to do. We don't have a
# collector.
if self._export_libdd_required:
LOG.error("libdd collector is required but could not be initialized. Disabling profiling.")
config.enabled = False
config.export.libdd_required = False
config.lock.enabled = False
config.memory.enabled = False
config.stack.enabled = False
return []

# DEV: Import this only if needed to avoid importing protobuf
# unnecessarily
from ddtrace.profiling.exporter import http

return [
http.PprofHTTPExporter(
service=self.service,
env=self.env,
tags=self.tags,
version=self.version,
api_key=self.api_key,
endpoint=endpoint,
endpoint_path=endpoint_path,
enable_code_provenance=self.enable_code_provenance,
endpoint_call_counter_span_processor=endpoint_call_counter_span_processor,
)
]

def __attrs_post_init__(self):
# type: (...) -> None
2 changes: 0 additions & 2 deletions ddtrace/profiling/scheduler.py
@@ -33,8 +33,6 @@ def __attrs_post_init__(self):
def _start_service(self):
# type: (...) -> None
"""Start the scheduler."""
if self._export_libdd_enabled and not ddup.is_available:
self._export_libdd_enabled = False
LOG.debug("Starting scheduler")
super(Scheduler, self)._start_service()
self._last_export = compat.time_ns()
62 changes: 59 additions & 3 deletions ddtrace/settings/profiling.py
@@ -37,6 +37,37 @@ def _derive_default_heap_sample_size(heap_config, default_heap_sample_size=1024
return int(max(math.ceil(total_mem / max_samples), default_heap_sample_size))


def _check_for_ddup_available():
ddup_is_available = False
try:
from ddtrace.internal.datadog.profiling import ddup

ddup_is_available = ddup.is_available
except Exception:
pass # nosec
return ddup_is_available


def _check_for_stack_v2_available():
stack_v2_is_available = False
if not _check_for_ddup_available():
return False

try:
from ddtrace.internal.datadog.profiling import stack_v2

stack_v2_is_available = stack_v2.is_available
except Exception:
pass # nosec
return stack_v2_is_available


# We don't check for the availability of the ddup module when determining whether libdd is _required_,
# since it's up to the application code to determine what happens in that failure case.
def _is_libdd_required(config):
return config.stack.v2.enabled or config._libdd_required


class ProfilingConfig(En):
__prefix__ = "dd.profiling"

@@ -162,14 +193,16 @@ class Stack(En):
class V2(En):
__item__ = __prefix__ = "v2"

enabled = En.v(
_enabled = En.v(
bool,
"enabled",
default=False,
help_type="Boolean",
help="Whether to enable the v2 stack profiler. Also enables the libdatadog collector.",
)

enabled = En.d(bool, lambda c: _check_for_stack_v2_available() and c._enabled)

class Lock(En):
__item__ = __prefix__ = "lock"

@@ -223,13 +256,36 @@ class Heap(En):
class Export(En):
__item__ = __prefix__ = "export"

libdd_enabled = En.v(
_libdd_required = En.v(
bool,
"libdd_required",
default=False,
help_type="Boolean",
help="Requires the native exporter to be enabled",
)

libdd_required = En.d(
bool,
_is_libdd_required,
)

_libdd_enabled = En.v(
bool,
"libdd_enabled",
default=False,
help_type="Boolean",
help="Enables collection and export using the experimental exporter",
help="Enables collection and export using a native exporter. Can fallback to the pure-Python exporter.",
)

libdd_enabled = En.d(
bool, lambda c: (_is_libdd_required(c) or c._libdd_enabled) and _check_for_ddup_available()
)

Export.include(Stack, namespace="stack")


config = ProfilingConfig()

if config.export.libdd_required and not config.export.libdd_enabled:
logger.warning("The native exporter is required, but not enabled. Disabling profiling.")
config.enabled = False
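
Taken together, the derived settings above resolve roughly like the following plain-Python paraphrase (the real code uses envier `En.d` derivations; `resolve` and its parameter names are illustrative, and the separate ddup/stack_v2 availability checks are collapsed into one flag):

```python
# Illustrative paraphrase of the derived settings in ddtrace/settings/profiling.py.
# These names are hypothetical; the real implementation uses envier's En.d.
def resolve(stack_v2_requested, libdd_required_env, libdd_enabled_env, native_available):
    # Stack v2 only turns on when the native profiling modules import cleanly.
    stack_v2_enabled = native_available and stack_v2_requested
    # Stack v2 implies that the native (libdd) exporter is required.
    libdd_required = stack_v2_enabled or libdd_required_env
    # Either "required" or "enabled" requests libdd, but only if it loaded.
    libdd_enabled = (libdd_required or libdd_enabled_env) and native_available
    # Required-but-unusable means profiling shuts itself off entirely.
    profiling_disabled = libdd_required and not libdd_enabled
    return stack_v2_enabled, libdd_required, libdd_enabled, profiling_disabled
```

For example, with `libdd_required_env=True` and the native module unavailable, `resolve` reports profiling as disabled rather than falling back to the protobuf-based exporter, which is the injected-environment behavior this PR targets.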
15 changes: 15 additions & 0 deletions lib-injection/dl_wheels.py
Expand Up @@ -19,6 +19,8 @@
import argparse
import itertools
import os
from pathlib import Path
import shutil
import subprocess
import sys

@@ -122,3 +124,16 @@
)
# Remove the wheel as it has been unpacked
os.remove(wheel_file)

sitepackages_root = Path(dl_dir) / f"site-packages-ddtrace-py{python_version}-{platform}"
directories_to_remove = [
sitepackages_root / "google" / "protobuf",
sitepackages_root / "google" / "_upb",
]
directories_to_remove.extend(sitepackages_root.glob("protobuf-*")) # dist-info directories

for directory in directories_to_remove:
try:
shutil.rmtree(directory)
except Exception:
pass
3 changes: 3 additions & 0 deletions lib-injection/sitecustomize.py
@@ -77,6 +77,9 @@ def _inject():
_log("failed to load ddtrace module: %s" % e, level="error")
return
else:
# In injected environments, the profiler needs to know that it is only allowed to use the native exporter
os.environ["DD_PROFILING_EXPORT_LIBDD_REQUIRED"] = "true"

# This import has the same effect as ddtrace-run for the current process (auto-instrument all libraries).
import ddtrace.bootstrap.sitecustomize

@@ -0,0 +1,7 @@
---
fixes:
- |
Profiling: This fix resolves an issue where the profiler was forcing protobuf to load in injected environments,
causing crashes in configurations which relied on older protobuf versions. The profiler will now detect
when injection is used and try loading with the native exporter. If that fails, it will self-disable
rather than loading protobuf.
