Skip to content

Commit

Permalink
Add the description of self observability metrics (#383)
Browse files Browse the repository at this point in the history
Signed-off-by: Daxin Wang <daxinwang@harmonycloud.cn>
  • Loading branch information
dxsup authored Dec 6, 2022
1 parent 47be8af commit 615b2c2
Show file tree
Hide file tree
Showing 7 changed files with 181 additions and 60 deletions.
5 changes: 3 additions & 2 deletions collector/pkg/component/analyzer/network/metric.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,9 @@ func newSelfMetrics(meterProvider metric.MeterProvider, na *NetworkAnalyzer) {
func(ctx context.Context, result metric.Int64ObserverResult) {
result.Observe(na.tcpMessagePairSize, attribute.String("type", "tcp"))
result.Observe(na.udpMessagePairSize, attribute.String("type", "udp"))
})
netanalyzerParsedRequestTotal = metric.Must(meterProvider.Meter("kindling")).NewInt64Counter(netanalyzerParsedRequestMetric)
}, metric.WithDescription("The size of the message pairs stored in the map"))
netanalyzerParsedRequestTotal = metric.Must(meterProvider.Meter("kindling")).NewInt64Counter(netanalyzerParsedRequestMetric,
metric.WithDescription("The count of traces that the agent has processed"))
// Suppress warnings of unused variables
_ = netanalyzerMessagePairSizeInstrument
})
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,9 @@ import (
"context"
"sync"

"github.com/Kindling-project/kindling/collector/pkg/component/analyzer/tcpconnectanalyzer/internal"
"go.opentelemetry.io/otel/metric"

"github.com/Kindling-project/kindling/collector/pkg/component/analyzer/tcpconnectanalyzer/internal"
)

var once sync.Once
Expand All @@ -18,6 +19,6 @@ func newSelfMetrics(meterProvider metric.MeterProvider, monitor *internal.Connec
meter.NewInt64GaugeObserver(mapSizeMetric,
func(ctx context.Context, result metric.Int64ObserverResult) {
result.Observe(int64(monitor.GetMapSize()))
})
}, metric.WithDescription("The current number of the connections stored in the map."))
})
}
Original file line number Diff line number Diff line change
@@ -1,32 +1,20 @@
package otelexporter

import (
"context"
"sync"

"go.opentelemetry.io/otel/metric"
)

var otelexporterMetricgroupsReceivedTotal = "kindling_telemetry_otelexporter_metricgroups_received_total"
var otelexporterCardinalitySize = "kindling_telemetry_otelexporter_cardinality_size"

var once sync.Once

var labelsSet map[labelKey]bool
var labelsSetMutex sync.RWMutex

var dataGroupReceiverCounter metric.Int64Counter
var metricExportedCardinalitySize metric.Int64UpDownCounterObserver

func newSelfMetrics(meterProvider metric.MeterProvider) {
once.Do(func() {
dataGroupReceiverCounter = metric.Must(meterProvider.Meter("kindling")).NewInt64Counter(otelexporterMetricgroupsReceivedTotal)
metricExportedCardinalitySize = metric.Must(meterProvider.Meter("kindling")).NewInt64UpDownCounterObserver(
otelexporterCardinalitySize, func(ctx context.Context, result metric.Int64ObserverResult) {
labelsSetMutex.Lock()
defer labelsSetMutex.Unlock()
result.Observe(int64(len(labelsSet)))
})
labelsSet = make(map[labelKey]bool, 0)
dataGroupReceiverCounter = metric.Must(meterProvider.Meter("kindling")).NewInt64Counter(
otelexporterMetricgroupsReceivedTotal, metric.WithDescription("The total count of the data received by otelexporter"))
})
}
7 changes: 4 additions & 3 deletions collector/pkg/component/receiver/cgoreceiver/self_metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@ import (
"sync"
"sync/atomic"

"github.com/Kindling-project/kindling/collector/pkg/model/constnames"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/metric"

"github.com/Kindling-project/kindling/collector/pkg/model/constnames"
)

var once sync.Once
Expand All @@ -26,11 +27,11 @@ func newSelfMetrics(meterProvider metric.MeterProvider, receiver *CgoReceiver) {
for name, value := range receiver.stats.getStats() {
result.Observe(value, attribute.String("name", name))
}
})
}, metric.WithDescription("The total number of the events received by cgoreceiver"))
meter.NewInt64GaugeObserver(channelSizeMetric,
func(ctx context.Context, result metric.Int64ObserverResult) {
result.Observe(int64(len(receiver.eventChannel)))
})
}, metric.WithDescription("The current number of events contained in the channel. The maximum size is 300,000."))
})
}

Expand Down
65 changes: 33 additions & 32 deletions collector/pkg/component/receiver/cgoreceiver/self_metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,14 @@ import (
"testing"
"time"

"github.com/Kindling-project/kindling/collector/pkg/model/constnames"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/exporters/stdout/stdoutmetric"
"go.opentelemetry.io/otel/metric"
controller "go.opentelemetry.io/otel/sdk/metric/controller/basic"
otelprocessor "go.opentelemetry.io/otel/sdk/metric/processor/basic"
selector "go.opentelemetry.io/otel/sdk/metric/selector/simple"

"github.com/Kindling-project/kindling/collector/pkg/model/constnames"
)

func runTest(counter eventCounter, workerNum int, loopNum int) {
Expand Down Expand Up @@ -70,22 +71,22 @@ func TestCounterIntCombination(t *testing.T) {

func TestCounterRwAtomicMap(t *testing.T) {
counter := newDynamicStats([]SubEvent{
{"net", "syscall_exit-writev"},
{"net", "syscall_exit-readv"},
{"net", "syscall_exit-write"},
{"net", "syscall_exit-read"},
{"net", "syscall_exit-sendto"},
{"net", "syscall_exit-recvfrom"},
{"net", "syscall_exit-sendmsg"},
{"net", "syscall_exit-recvmsg"},
{"net", "grpc_uprobe"},
{"", "kprobe-tcp_close"},
{"", "kprobe-tcp_rcv_established"},
{"", "kprobe-tcp_drop"},
{"", "kprobe-tcp_retransmit_skb"},
{"", "syscall_exit-connect"},
{"", "kretprobe-tcp_connect"},
{"", "kprobe-tcp_set_state"},
{Category: "net", Name: "syscall_exit-writev"},
{Category: "net", Name: "syscall_exit-readv"},
{Category: "net", Name: "syscall_exit-write"},
{Category: "net", Name: "syscall_exit-read"},
{Category: "net", Name: "syscall_exit-sendto"},
{Category: "net", Name: "syscall_exit-recvfrom"},
{Category: "net", Name: "syscall_exit-sendmsg"},
{Category: "net", Name: "syscall_exit-recvmsg"},
{Category: "net", Name: "grpc_uprobe"},
{Name: "kprobe-tcp_close"},
{Name: "kprobe-tcp_rcv_established"},
{Name: "kprobe-tcp_drop"},
{Name: "kprobe-tcp_retransmit_skb"},
{Name: "syscall_exit-connect"},
{Name: "kretprobe-tcp_connect"},
{Name: "kprobe-tcp_set_state"},
})
assertTest(t, counter, 5, 100000)
}
Expand Down Expand Up @@ -128,21 +129,21 @@ func BenchmarkCounterOtelCounter(b *testing.B) {

func BenchmarkCounterRwAtomicMap(b *testing.B) {
counter := newDynamicStats([]SubEvent{
{"net", "syscall_exit-writev"},
{"net", "syscall_exit-readv"},
{"net", "syscall_exit-write"},
{"net", "syscall_exit-read"},
{"net", "syscall_exit-sendto"},
{"net", "syscall_exit-recvfrom"},
{"net", "syscall_exit-sendmsg"},
{"net", "syscall_exit-recvmsg"},
{"", "kprobe-tcp_close"},
{"", "kprobe-tcp_rcv_established"},
{"", "kprobe-tcp_drop"},
{"", "kprobe-tcp_retransmit_skb"},
{"", "syscall_exit-connect"},
{"", "kretprobe-tcp_connect"},
{"", "kprobe-tcp_set_state"},
{Category: "net", Name: "syscall_exit-writev"},
{Category: "net", Name: "syscall_exit-readv"},
{Category: "net", Name: "syscall_exit-write"},
{Category: "net", Name: "syscall_exit-read"},
{Category: "net", Name: "syscall_exit-sendto"},
{Category: "net", Name: "syscall_exit-recvfrom"},
{Category: "net", Name: "syscall_exit-sendmsg"},
{Category: "net", Name: "syscall_exit-recvmsg"},
{Name: "kprobe-tcp_close"},
{Name: "kprobe-tcp_rcv_established"},
{Name: "kprobe-tcp_drop"},
{Name: "kprobe-tcp_retransmit_skb"},
{Name: "syscall_exit-connect"},
{Name: "kretprobe-tcp_connect"},
{Name: "kprobe-tcp_set_state"},
})
initOtelCounterObserver(counter)
b.ResetTimer()
Expand Down
14 changes: 7 additions & 7 deletions collector/pkg/metadata/conntracker/self_metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ var (
)

// To avoid getting statistics multiple times, we cache the result in a global variable.
// This can be done because the observation functions are executed in the order they were
// This can be done because the observation functions are executed in the same order as they were
// registered. See go.opentelemetry.io/otel/internal/metric/async/AsyncInstrumentState.runners.
var conntrackerStaticStates map[string]int64

Expand All @@ -40,33 +40,33 @@ func newSelfMetrics(meterProvider metric.MeterProvider, conntracker Conntracker)
conntrackerStaticStates = conntracker.GetStats()
result.Observe(conntrackerStaticStates["state_size"], attribute.String("type", "general"))
result.Observe(conntrackerStaticStates["orphan_size"], attribute.String("type", "orphan"))
})
}, metric.WithDescription("The current number of the conntrack records stored in the map"))
cacheMaxSizeInstrument = meter.NewInt64GaugeObserver(cacheMaxSizeMetric,
func(ctx context.Context, result metric.Int64ObserverResult) {
result.Observe(conntrackerStaticStates["cache_max_size"])
})
}, metric.WithDescription("The maximum size of the cache map"))
operationTimesInstrument = meter.NewInt64CounterObserver(operationTimesTotal,
func(ctx context.Context, result metric.Int64ObserverResult) {
result.Observe(conntrackerStaticStates["registers_total"], attribute.String("op", "add"))
result.Observe(conntrackerStaticStates["registers_dropped"], attribute.String("op", "drop"))
result.Observe(conntrackerStaticStates["unregisters_total"], attribute.String("op", "remove"))
result.Observe(conntrackerStaticStates["gets_total"], attribute.String("op", "get"))
result.Observe(conntrackerStaticStates["evicts_total"], attribute.String("op", "evict"))
})
}, metric.WithDescription("The total operation times the conntracker does to the cache map"))
errorsTotalInstrument = meter.NewInt64CounterObserver(errorsTotal,
func(ctx context.Context, result metric.Int64ObserverResult) {
result.Observe(conntrackerStaticStates["enobufs"], attribute.String("type", "enobuf"))
result.Observe(conntrackerStaticStates["read_errors"], attribute.String("type", "read_errors"))
result.Observe(conntrackerStaticStates["msg_errors"], attribute.String("type", "msg_errors"))
})
}, metric.WithDescription("The total count of errors the conntracker encounters"))
samplingRateInstrument = meter.NewInt64GaugeObserver(samplingRate,
func(ctx context.Context, result metric.Int64ObserverResult) {
result.Observe(conntrackerStaticStates["sampling_pct"])
})
}, metric.WithDescription("The sampling rate of the conntracker module"))
throttlesTotalInstrument = meter.NewInt64CounterObserver(throttlesTotal,
func(ctx context.Context, result metric.Int64ObserverResult) {
result.Observe(conntrackerStaticStates["throttles"])
})
}, metric.WithDescription("The total count of the records being throttled due to the high load"))
// Suppress warnings of unused variables
_ = cacheSizeInstrument
_ = cacheMaxSizeInstrument
Expand Down
129 changes: 129 additions & 0 deletions docs/self_metrics.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
# Self Observability Metrics Description
## cgoreceiver
### kindling_telemetry_cgoreceiver_events_total
- Description: The total number of the events received by cgoreceiver.
- Metric Type: counter
- Unit: count
- Labels: Additional labels in addition to [the common ones](#common-labels).


| **Label Name** | **Description** | **Example** |
|----------------|------------------------|-------------|
| name | The name of the event. | write |


### kindling_telemetry_cgoreceiver_channel_size
- Description: The current number of events contained in the channel. Cgoreceiver uses a channel to receive events from `cgo`. This channel is able to accommodate a maximum size of 300,000 events. No events can be received if the channel is full.
- Metric Type: gauge
- Unit: count
- Labels: No other labels except [the common ones](#common-labels).

## networkanalyzer
### kindling_telemetry_netanalyer_messagepair_size
- Description: The size of the message pairs stored in the map. Message pairs are the intermediate data structure of "traces". This metric is used to identify how many "traces" have not finished yet.
- Metric Type: gauge
- Unit: count
- Labels: Additional labels in addition to [the common ones](#common-labels).


| **Label Name** | **Description** | **Example** |
|----------------|-----------------------------------------------|-------------|
| type | The type of the message pair. `tcp` or `udp`. | tcp |

### kindling_telemetry_netanalyer_parsedrequest_total
- Description: The count of traces that the agent has processed.
- Metric Type: counter
- Unit: count
- Labels: Additional labels in addition to [the common ones](#common-labels).


| **Label Name** | **Description** | **Example** |
|----------------|-------------------------------|-------------|
| protocol | The protocol of the requests. | http |


## tcpconnectanalyzer
### kindling_telemetry_tcpconnectanalyzer_map_size
- Description: The current number of the connections stored in the map. This map accommodates the events related to the metric "TCP connect".
- Metric Type: gauge
- Unit: count
- Labels: No other labels except [the common ones](#common-labels).


## conntracker
### kindling_telemetry_conntracker_cache_size
- Description: The current number of the conntrack records stored in the map.
- Metric Type: gauge
- Unit: count
- Labels: Additional labels in addition to [the common ones](#common-labels).


| **Label Name** | **Description** | **Example** |
|----------------|------------------------------------------------|-------------|
| type | The type of the records. `general` or `orphan` | general |


### kindling_telemetry_conntracker_cache_max_size
- Description: The maximum size of the cache map. The default value is 130,000. It can be configured in the configuration file.
- Metric Type: gauge
- Unit: count
- Labels: No other labels except [the common ones](#common-labels).

### kindling_telemetry_conntracker_operation_times_total
- Description: The total operation times the conntracker does to the cache map. This metric can reflect the load of the conntracker module.
- Metric Type: counter
- Unit: count
- Labels: Additional labels in addition to [the common ones](#common-labels).


| **Label Name** | **Description** | **Example** |
|----------------|---------------------------------------------------------------------------|-------------|
| op | The operation names. Could be `add`, `drop`, `remove`, `get`, or `evict`. | add |


### kindling_telemetry_conntracker_errors_total
- Description: The total count of errors the conntracker encounters. This metric can reflect the load of the conntracker module. In most cases, the error type is `enobuf`, which means the conntracker generates too many records and there is no buffer space left to receive them.
- Metric Type: counter
- Unit: count
- Labels: Additional labels in addition to [the common ones](#common-labels).

| **Label Name** | **Description** | **Example** |
|----------------|---------------------------------------------------------------------|-------------|
| type | The error types. Could be `enobuf`, `read_errors`, or `msg_errors`. | enobuf |


### kindling_telemetry_conntracker_sampling_rate
- Description: The sampling rate of the conntracker module. This rate may be automatically decreased if the load is too high.
- Metric Type: gauge
- Unit: percent
- Labels: No other labels except [the common ones](#common-labels).


### kindling_telemetry_conntracker_throttles_total
- Description: The total count of the records being throttled due to the high load.
- Metric Type: counter
- Unit: count
- Labels: No other labels except [the common ones](#common-labels).


## otelexporter
### kindling_telemetry_otelexporter_metricgroups_received_total
- Description: The total count of the data received by `otelexporter`.
- Metric Type: counter
- Unit: count
- Labels: Additional labels in addition to [the common ones](#common-labels).

| **Label Name** | **Description** | **Example** |
|----------------|------------------------------|---------------------------------|
| name | The name of the `DataGroup`. | single_net_request_metric_group |


### kindling_telemetry_otelexporter_cardinality_size
- Deprecated.

## Common labels
| **Label Name** | **Description** | **Example** |
|----------------------|--------------------------------------------------------------------|------------------|
| service.instance.id | The name of the host where the agent is located. | worker-149 |
| service.name | The cluster name which is composed of "kindling" and "cluster ID". | kindling-abcd123 |
| instrumentation.name | A constant "kindling". | kindling |

0 comments on commit 615b2c2

Please sign in to comment.