
Add notes on how to use a custom collector with OTel-Arrow support #42

Merged · 12 commits · Oct 5, 2023
14 changes: 14 additions & 0 deletions README.md
@@ -9,3 +9,17 @@ This is the repository for recommended [Helm](https://helm.sh/) charts for runni
* [otel-cloud-stack](https://github.com/lightstep/prometheus-k8s-opentelemetry-collector/tree/main/charts/otel-cloud-stack) - **Recommended** chart for sending Kubernetes metrics to ServiceNow Cloud Observability using OpenTelemetry-native metric collection and the OpenTelemetry Operator.
* [kube-otel-stack](https://github.com/lightstep/prometheus-k8s-opentelemetry-collector/tree/main/charts/kube-otel-stack) - Drop in replacement for [kube-prometheus-stack](https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack), which uses the same configuration for scraping Prometheus exporters and forwarding metrics to Lightstep using the OpenTelemetry Operator. Use this chart if you are looking to compare Kubernetes monitoring in Prometheus with Kubernetes monitoring using ServiceNow Cloud Observability.

## Arrow Usage

> [!NOTE]
> Arrow usage is in beta; please use it at your own risk. Reach out if you have any issues.

In order to use an Arrow trace collector, you will need to build your own custom image. We have supplied a collector builder config below. Once an image is available, simply apply your desired Helm chart with both the values.yaml AND the arrow.yaml in the respective chart. Make sure to replace the image in arrow.yaml with your custom-built image.
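
For reference, the only value that must change in arrow.yaml is the collector image; a minimal sketch is below, with a placeholder image name standing in for your custom build. Both files can be passed to Helm with repeated `-f` flags (for example, `-f values.yaml -f arrow.yaml`).

```yaml
# Sketch of the arrow.yaml image override. The image reference is a
# placeholder; substitute the custom collector image you built with the
# OTel-Arrow components included.
tracesCollector:
  image: "registry.example.com/otelarrow-collector:latest"
```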

## Build configurations

Some of the features available in these charts are optional because
they rely on components that have not yet been released in the
OpenTelemetry Contrib Collector. Specifically, making use of the new
OTel-Arrow protocol requires building a custom collector at this
time. See the [recommended custom collector build configuration](./gateway-build.yaml).
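
For orientation, the linked file follows the OpenTelemetry Collector Builder (`ocb`) manifest format. The sketch below shows its general shape only; the OTel-Arrow module paths and versions are assumptions, so copy the authoritative `gomod` entries from [gateway-build.yaml](./gateway-build.yaml).

```yaml
# Illustrative builder manifest. The OTel-Arrow module paths and the
# v0.x.x versions are placeholders/assumptions; use the entries from
# gateway-build.yaml in this repository.
dist:
  name: otelarrow-collector
  description: Custom collector build with OTel-Arrow support
  output_path: ./bin

receivers:
  # Standard OTLP receiver from the core collector.
  - gomod: go.opentelemetry.io/collector/receiver/otlpreceiver v0.83.0
  # OTel-Arrow receiver (assumed module path).
  - gomod: github.com/open-telemetry/otel-arrow/collector/receiver/otelarrowreceiver v0.x.x

processors:
  - gomod: go.opentelemetry.io/collector/processor/batchprocessor v0.83.0

exporters:
  # OTel-Arrow exporter (assumed module path).
  - gomod: github.com/open-telemetry/otel-arrow/collector/exporter/otelarrowexporter v0.x.x
```

After building the binary with `ocb`, package it into a container image and push it to a registry your cluster can pull from; that image is what `tracesCollector.image` in arrow.yaml should reference.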
2 changes: 1 addition & 1 deletion charts/kube-otel-stack/Chart.yaml
@@ -2,7 +2,7 @@ apiVersion: v2
name: kube-otel-stack
description: Chart for sending Kubernetes metrics to Lightstep using the OpenTelemetry Operator.
type: application
version: 0.3.1
version: 0.3.2
appVersion: 0.83.0
dependencies:
# cert manager must be manually installed because it has CRDs
52 changes: 52 additions & 0 deletions charts/kube-otel-stack/arrow.yaml
@@ -0,0 +1,52 @@
# This is a BETA feature; please use it at your own risk.
# OTel-Arrow notes: using OTel-Arrow requires an image with the
# OTel-Arrow components built in. The collector-contrib image
# does not include these components yet, so a custom image will be
# needed. See https://github.com/lightstep/otel-collector-charts/blob/main/gateway-build.yaml
tracesCollector:
image: "your-image-with-arrow-here"
resources:
# OTel-Arrow notes: to use OTel-Arrow in a gateway configuration,
# we recommend the following adjustments:
#
# OTel-Arrow gateways with this resource configuration have been
# exercised at rates above 20,000 spans per second in our internal
# production setup, for reference.
limits:
cpu: 2000m
memory: 8Gi
requests:
cpu: 1500m
memory: 6Gi
config:
receivers:
otelarrow:
protocols:
grpc:
endpoint: "0.0.0.0:4317"
http:
endpoint: "0.0.0.0:4318"
exporters:
otelarrow:
# OTel-Arrow notes: To use OTel-Arrow during early-access
# specifically requires the following endpoint. This endpoint
# supports both OTLP and OTel-Arrow.
endpoint: spaningest.lightstep.com:443

# OTel-Arrow notes: these settings are specific to OTel-Arrow and
# apply to the "otelarrow" exporter configured above.
arrow:
# This prevents the OTel-Arrow exporter from falling back to
# standard OTLP in case of misconfiguration.
disable_downgrade: true

# We recommend a small number of streams, since they consume
# substantial resources. More than one stream is recommended
# to help balance load.
num_streams: 2

# A stream lifetime limit is required to avoid spurious
# disconnect error messages in the collector logs.
max_stream_lifetime: 4m30s
81 changes: 77 additions & 4 deletions charts/kube-otel-stack/values.yaml
@@ -47,6 +47,7 @@ tracesCollector:
enabled: false
name: traces
clusterName: ""

image: otel/opentelemetry-collector-contrib:0.83.0
mode: deployment
replicas: 1
@@ -69,15 +70,45 @@ tracesCollector:
protocols:
grpc:
endpoint: "0.0.0.0:4317"
http:
endpoint: "0.0.0.0:4318"
processors:
# We recommend use of the batch processor. We recommend the settings
# below for traces.
#
# Note: We are aware of ongoing efforts within OpenTelemetry to
# configure batching in the exporter, where it is possible to
# configure batch size limits in terms of bytes, instead of items.
# We will update these recommendations when batching by size is
# available.
batch:
# In this example, the processor will wait to accumulate at least
# 1000 spans for up to 1 second, then flush the batch. In cases
# where the arriving data is already batched, such that combining
# the pending batch with the arriving data would exceed 1500
# items, then 1500 items will be sent by splitting the data.
#
# Note: the batch processor has a side-effect of returning success
# to the producer before waiting for the consumer to respond.
# This is an appropriate default in most cases; it means that SDKs
# sending to the gateway will not see or report errors.
#
# The batch processor responds to "back-pressure" from the
# exporter, meaning it is never directly responsible for dropping
# spans. Note that our currently recommended exporter settings do
# not apply back-pressure to the batch processor. As a result,
# this collector configuration will drop data when the ServiceNow
# service is (intentionally or accidentally) refusing data,
# instead of applying pressure backward, as discussed in the
# `exporters` section.
send_batch_size: 1000
send_batch_max_size: 1500
timeout: 1s
resourcedetection/env:
detectors: [env]
timeout: 2s
override: false
k8sattributes:
passthrough: false
pod_association:
@@ -116,6 +147,48 @@ tracesCollector:
headers:
"lightstep-access-token": "${LS_TOKEN}"

# Queue settings are required. It does not make sense to use
# the exporter without a queue, because the queue provides the
# "num_consumers" limit configured in this section (i.e., we
# require a queue in order to limit the number of concurrent
# exports).
#
# Note that the queue settings are applied in unit-terms
# produced by the batch processor, so a number like 100 means
# the queue has support for 100 pre-batched items. With up to
# 1500 spans each (from the batch processor), this
# configuration allows 150,000 spans to occupy memory.
queue_settings:
enabled: true
num_consumers: 4
queue_size: 100

# Retry settings are optional.
#
# Note that while retries are attempted, this component will
# begin to drop arriving data if the queue is not large
# enough.
retry_on_failure:
# We recommend disabling retries, since while the export is
# blocked it is likely that arriving spans will drop; otherwise,
# collectors will need substantial additional memory to survive
# transient failures. Nevertheless, we recommend a limited retry
# policy to gracefully handle occasional failures, paired with a
# modest queue size.
#
# Note there is a persistent storage option inherited from a
# common collector component. When persistent storage is
# configured, the default retry configuration is sensible.
#
# For more details on retry and queue settings, please refer to
# https://github.com/open-telemetry/opentelemetry-collector/blob/main/exporter/exporterhelper/README.md
enabled: true
max_elapsed_time: 60s

# While we typically expect latency under one second, we
# recommend a longer timeout than the default.
timeout: 30s

service:
pipelines:
traces:
2 changes: 1 addition & 1 deletion charts/otel-cloud-stack/Chart.yaml
@@ -15,7 +15,7 @@ type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: "0.2.3"
version: "0.2.4"

# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
52 changes: 52 additions & 0 deletions charts/otel-cloud-stack/arrow.yaml
@@ -0,0 +1,52 @@
# This is a BETA feature; please use it at your own risk.
# OTel-Arrow notes: using OTel-Arrow requires an image with the
# OTel-Arrow components built in. The collector-contrib image
# does not include these components yet, so a custom image will be
# needed. See https://github.com/lightstep/otel-collector-charts/blob/main/gateway-build.yaml
tracesCollector:
image: "your-image-with-arrow-here"
resources:
# OTel-Arrow notes: to use OTel-Arrow in a gateway configuration,
# we recommend the following adjustments:
#
# OTel-Arrow gateways with this resource configuration have been
# exercised at rates above 20,000 spans per second in our internal
# production setup, for reference.
limits:
cpu: 2000m
memory: 8Gi
requests:
cpu: 1500m
memory: 6Gi
config:
receivers:
otelarrow:
protocols:
grpc:
endpoint: "0.0.0.0:4317"
http:
endpoint: "0.0.0.0:4318"
exporters:
otelarrow:
# OTel-Arrow notes: To use OTel-Arrow during early-access
# specifically requires the following endpoint. This endpoint
# supports both OTLP and OTel-Arrow.
endpoint: spaningest.lightstep.com:443

# OTel-Arrow notes: these settings are specific to OTel-Arrow and
# apply to the "otelarrow" exporter configured above.
arrow:
# This prevents the OTel-Arrow exporter from falling back to
# standard OTLP in case of misconfiguration.
disable_downgrade: true

# We recommend a small number of streams, since they consume
# substantial resources. More than one stream is recommended
# to help balance load.
num_streams: 2

# A stream lifetime limit is required to avoid spurious
# disconnect error messages in the collector logs.
max_stream_lifetime: 4m30s
66 changes: 65 additions & 1 deletion charts/otel-cloud-stack/values.yaml
@@ -60,10 +60,16 @@ daemonCollector:
scrape_configs_file: "daemon_scrape_configs.yaml"
config:
receivers:
# OTel-Arrow notes: for this collector to receive OTel-Arrow in
# addition to standard forms of OTLP, use an image with the
# OTel-Arrow receiver component and replace "otlp" below with
# "otelarrow".
otlp:
protocols:
grpc:
endpoint: 0.0.0.0:4317
http:
endpoint: 0.0.0.0:4318
kubeletstats:
collection_interval: "15s"
auth_type: "serviceAccount"
@@ -208,10 +214,38 @@ daemonCollector:
- container.image.tag
- container.image.name
- k8s.cluster.uid
# We recommend use of the batch processor. We recommend the settings
# below for traces.
#
# Note: We are aware of ongoing efforts within OpenTelemetry to
# configure batching in the exporter, where it is possible to
# configure batch size limits in terms of bytes, instead of items.
# We will update these recommendations when batching by size is
# available.
batch:
# In this example, the processor will wait to accumulate at least
# 1000 spans for up to 1 second, then flush the batch. In cases
# where the arriving data is already batched, such that combining
# the pending batch with the arriving data would exceed 1500
# items, then 1500 items will be sent by splitting the data.
#
# Note: the batch processor has a side-effect of returning success
# to the producer before waiting for the consumer to respond.
# This is an appropriate default in most cases; it means that SDKs
# sending to the gateway will not see or report errors.
#
# The batch processor responds to "back-pressure" from the
# exporter, meaning it is never directly responsible for dropping
# spans. Note that our currently recommended exporter settings do
# not apply back-pressure to the batch processor. As a result,
# this collector configuration will drop data when the ServiceNow
# service is (intentionally or accidentally) refusing data,
# instead of applying pressure backward, as discussed in the
# `exporters` section.
send_batch_size: 1000
send_batch_max_size: 1500
timeout: 1s
exporters:
logging:
verbosity: detailed
@@ -221,6 +255,36 @@ daemonCollector:
endpoint: ingest.lightstep.com:443
headers:
"lightstep-access-token": "${LS_TOKEN}"
queue_settings:
enabled: true
num_consumers: 4
queue_size: 100

# Retry settings are optional.
#
# Note that while retries are attempted, this component will
# begin to drop arriving data if the queue is not large
# enough.
retry_on_failure:
# We recommend disabling retries, since while the export is
# blocked it is likely that arriving spans will drop; otherwise,
# collectors will need substantial additional memory to survive
# transient failures. Nevertheless, we recommend a limited retry
# policy to gracefully handle occasional failures, paired with a
# modest queue size.
#
# Note there is a persistent storage option inherited from a
# common collector component. When persistent storage is
# configured, the default retry configuration is sensible.
#
# For more details on retry and queue settings, please refer to
# https://github.com/open-telemetry/opentelemetry-collector/blob/main/exporter/exporterhelper/README.md
enabled: true
max_elapsed_time: 60s

# While we typically expect latency under one second, we
# recommend a longer timeout than the default.
timeout: 30s
service:
pipelines:
metrics: