Remove the histogram metrics by default (#253)

* remove the histogram metrics by default Signed-off-by: Daxin Wang <daxinwang@harmonycloud.cn> * fix log level Signed-off-by: Daxin Wang <daxinwang@harmonycloud.cn>
KindlingProject · Jun 16, 2022 · 82416b4 · 82416b4
1 parent 560bb94
commit 82416b4
Show file tree

Hide file tree

Showing 4 changed files with 28 additions and 12 deletions.
diff --git a/collector/consumer/exporter/otelexporter/consume.go b/collector/consumer/exporter/otelexporter/consume.go
@@ -11,6 +11,7 @@ import (
 	"go.opentelemetry.io/otel/metric"
 	apitrace "go.opentelemetry.io/otel/trace"
 	"go.uber.org/zap"
+	"go.uber.org/zap/zapcore"
 )
 
 func (e *OtelExporter) Consume(dataGroup *model.DataGroup) error {
@@ -88,9 +89,11 @@ func (e *OtelExporter) exportMetric(result *adapter.AdaptedResult) {
 		} else if ok && metric.DataType() == model.IntMetricType {
 			measurements = append(measurements, e.instrumentFactory.getInstrument(metric.Name, metricKind).Measurement(metric.GetInt().Value))
 		} else if metric.DataType() == model.HistogramMetricType {
-			e.telemetry.Logger.Error("Failed to exporter Metric: can not use otlp-exporter to export histogram Data", zap.String("MetricName", metric.Name))
+			e.telemetry.Logger.Warn("Failed to exporter Metric: can not use otlp-exporter to export histogram Data", zap.String("MetricName", metric.Name))
 		} else {
-			e.telemetry.Logger.Warn("Undefined metricKind for this Metric", zap.String("MetricName", metric.Name), zap.String("MetricType", reflect.TypeOf(metric).String()))
+			if ce := e.telemetry.Logger.Check(zapcore.DebugLevel, "Undefined metricKind for this Metric"); ce != nil {
+				ce.Write(zap.String("MetricName", metric.Name), zap.String("MetricType", reflect.TypeOf(metric).String()))
+			}
 		}
 	}
 	if len(measurements) > 0 {

diff --git a/collector/docker/kindling-collector-config.yml b/collector/docker/kindling-collector-config.yml
@@ -122,12 +122,10 @@ exporters:
     metric_aggregation_map:
       kindling_entity_request_total: counter
       kindling_entity_request_duration_nanoseconds_total: counter
-      kindling_entity_request_average_duration_nanoseconds: histogram
       kindling_entity_request_send_bytes_total: counter
       kindling_entity_request_receive_bytes_total: counter
       kindling_topology_request_total: counter
       kindling_topology_request_duration_nanoseconds_total: counter
-      kindling_topology_request_average_duration_nanoseconds: histogram
       kindling_topology_request_request_bytes_total: counter
       kindling_topology_request_response_bytes_total: counter
       kindling_trace_request_duration_nanoseconds: gauge

diff --git a/deploy/agent/kindling-collector-config.yml b/deploy/agent/kindling-collector-config.yml
@@ -122,12 +122,10 @@ exporters:
     metric_aggregation_map:
       kindling_entity_request_total: counter
       kindling_entity_request_duration_nanoseconds_total: counter
-      kindling_entity_request_average_duration_nanoseconds: histogram
       kindling_entity_request_send_bytes_total: counter
       kindling_entity_request_receive_bytes_total: counter
       kindling_topology_request_total: counter
       kindling_topology_request_duration_nanoseconds_total: counter
-      kindling_topology_request_average_duration_nanoseconds: histogram
       kindling_topology_request_request_bytes_total: counter
       kindling_topology_request_response_bytes_total: counter
       kindling_trace_request_duration_nanoseconds: gauge

diff --git a/docs/prometheus_metrics.md b/docs/prometheus_metrics.md
@@ -8,9 +8,9 @@ Service metrics are generated from the server-side events, which are used to sho
 | `kindling_entity_request_duration_nanoseconds_total` | Counter | Total duration of requests |
 | `kindling_entity_request_send_bytes_total` | Counter | Total size of payload sent |
 | `kindling_entity_request_receive_bytes_total` | Counter | Total size of payload received |
-| `kindling_entity_request_average_duration_nanoseconds_count`  | Histogram | Count of average duration of requests |
-| `kindling_entity_request_average_duration_nanoseconds_sum` | Histogram | Sum of average duration of requests |
-| `kindling_entity_request_average_duration_nanoseconds_bucket` | Histogram | Histogram buckets of average duration of requests |
+| `kindling_entity_request_average_duration_nanoseconds_count` | Histogram | Count of average duration of requests <br> **Disabled by default. See Note 3 for how to enable it.**|
+| `kindling_entity_request_average_duration_nanoseconds_sum` | Histogram | Sum of average duration of requests <br> **Disabled by default. See Note 3 for how to enable it.**|
+| `kindling_entity_request_average_duration_nanoseconds_bucket` | Histogram | Histogram buckets of average duration of requests <br> **Disabled by default. See Note 3 for how to enable it.**|
 ### Labels List
 | **Label Name** | **Example** | **Notes** |
 | --- | --- | --- |
@@ -70,6 +70,15 @@ Service metrics are generated from the server-side events, which are used to sho
 
 - For other cases, the `request_content` and `response_content` are both empty.
 
+**Note 3**: The histogram metric `kindling_entity_request_average_duration_nanoseconds_*` is disabled by default because it can be high-cardinality. To enable it, add the following line to the `exporters.otelexporter.metric_aggregation_map` section of the configuration file:
+```yaml
+exporters:
+  otelexporter:
+    metric_aggregation_map:
+      # add the following line
+      kindling_entity_request_average_duration_nanoseconds: histogram 
+```
+
 ## Topology Metrics
 
 Topology metrics are typically generated from the client-side events, which are used to show the service dependencies map, so the metrics are called "topology". Some timeseries may be generated from the server-side events, which contain a non-empty label `dst_container_id`. These timeseries are generated only when the source IP is not the pod's IP inside the Kubernetes cluster, which is useful when there is no agent installed on the client side.
@@ -81,9 +90,9 @@ Topology metrics are typically generated from the client-side events, which are
 | `kindling_topology_request_duration_nanoseconds_total` | Counter |  Total duration of requests |
 | `kindling_topology_request_request_bytes_total` | Counter | Total size of payload sent |
 | `kindling_topology_request_response_bytes_total` | Counter | Total size of payload received |
-| `kindling_topology_request_average_duration_nanoseconds_count` | Histogram | Count of average duration of requests |
-| `kindling_topology_request_average_duration_nanoseconds_sum` | Histogram | Sum of average duration of requests  |
-| `kindling_topology_request_average_duration_nanoseconds_bucket` | Histogram | Histogram buckets of average duration of requests |
+| `kindling_topology_request_average_duration_nanoseconds_count` | Histogram | Count of average duration of requests<br> **Disabled by default. See Note 3 for how to enable it.** |
+| `kindling_topology_request_average_duration_nanoseconds_sum` | Histogram | Sum of average duration of requests<br> **Disabled by default. See Note 3 for how to enable it.** |
+| `kindling_topology_request_average_duration_nanoseconds_bucket` | Histogram | Histogram buckets of average duration of requests<br> **Disabled by default. See Note 3 for how to enable it.** |
 
 ### Labels List
 | **Label Name** | **Example** | **Notes** |
@@ -125,6 +134,14 @@ These two terms are composed of two parts.
 - **DUBBO**: 'Error Code' of Dubbo request.
 - **others**: empty temporarily
 
+**Note 3**: The histogram metric `kindling_topology_request_average_duration_nanoseconds_*` is disabled by default because it can be high-cardinality. To enable it, add the following line to the `exporters.otelexporter.metric_aggregation_map` section of the configuration file:
+```yaml
+exporters:
+  otelexporter:
+    metric_aggregation_map:
+      # add the following line
+      kindling_topology_request_average_duration_nanoseconds: histogram 
+```
 ## Trace As Metric
 We defined some rules to determine whether a request is abnormal. For an abnormal request, the detailed request information is considered useful for debugging or profiling. We call this kind of data "trace". It is not good practice to store such data in Prometheus because some labels are high-cardinality, so we picked a subset of the original labels to generate a new kind of metric, which is called "Trace As Metric". The following table shows which labels this metric contains.