-
Notifications
You must be signed in to change notification settings - Fork 380
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Initialize metrics with labels #2162
Changes from all commits
269a1cc
af88716
d2724f8
187592b
fea557c
623c564
75ab0f9
e53dc6c
93bcbfc
decdb63
fbacf4b
a7918f1
2e6e219
546bf72
6e55729
d90fc84
efe5cd7
07580f3
3f0f378
4262ed8
37739a5
d2f5d01
20cfea2
4f29532
5cdc9b4
415e68e
f5f9629
b24e3f1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,32 +5,62 @@ package errormetrics | |
|
||
import ( | ||
"fmt" | ||
"strings" | ||
|
||
"github.com/cilium/tetragon/pkg/api/ops" | ||
"github.com/cilium/tetragon/pkg/metrics/consts" | ||
"github.com/pkg/errors" | ||
"github.com/prometheus/client_golang/prometheus" | ||
) | ||
|
||
type ErrorType string | ||
type ErrorType int | ||
|
||
var ( | ||
const ( | ||
// Process not found on get() call. | ||
ProcessCacheMissOnGet ErrorType = "process_cache_miss_on_get" | ||
ProcessCacheMissOnGet ErrorType = iota | ||
// Process evicted from the cache. | ||
ProcessCacheEvicted ErrorType = "process_cache_evicted" | ||
ProcessCacheEvicted | ||
// Process not found on remove() call. | ||
ProcessCacheMissOnRemove ErrorType = "process_cache_miss_on_remove" | ||
ProcessCacheMissOnRemove | ||
// Tid and Pid mismatch that could affect BPF and user space caching logic | ||
ProcessPidTidMismatch ErrorType = "process_pid_tid_mismatch" | ||
ProcessPidTidMismatch | ||
// An event is missing process info. | ||
EventMissingProcessInfo ErrorType = "event_missing_process_info" | ||
EventMissingProcessInfo | ||
// An error occurred in an event handler. | ||
HandlerError ErrorType = "handler_error" | ||
HandlerError | ||
// An event finalizer on Process failed | ||
EventFinalizeProcessInfoFailed ErrorType = "event_finalize_process_info_failed" | ||
EventFinalizeProcessInfoFailed | ||
) | ||
|
||
var errorTypeLabelValues = map[ErrorType]string{ | ||
ProcessCacheMissOnGet: "process_cache_miss_on_get", | ||
ProcessCacheEvicted: "process_cache_evicted", | ||
ProcessCacheMissOnRemove: "process_cache_miss_on_remove", | ||
ProcessPidTidMismatch: "process_pid_tid_mismatch", | ||
EventMissingProcessInfo: "event_missing_process_info", | ||
HandlerError: "handler_error", | ||
EventFinalizeProcessInfoFailed: "event_finalize_process_info_failed", | ||
} | ||
|
||
func (e ErrorType) String() string { | ||
return errorTypeLabelValues[e] | ||
} | ||
Comment on lines +43 to +45
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Should we have a fallback here for missing types to avoid panics? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It won't panic, in case |
||
|
||
type EventHandlerError int | ||
|
||
// TODO: Recognize different errors returned by individual handlers | ||
const ( | ||
HandlePerfUnknownOp EventHandlerError = iota | ||
HandlePerfHandlerError | ||
) | ||
|
||
var eventHandlerErrorLabelValues = map[EventHandlerError]string{ | ||
HandlePerfUnknownOp: "unknown_opcode", | ||
HandlePerfHandlerError: "event_handler_failed", | ||
} | ||
|
||
func (e EventHandlerError) String() string { | ||
return eventHandlerErrorLabelValues[e] | ||
} | ||
|
||
var ( | ||
ErrorTotal = prometheus.NewCounterVec(prometheus.CounterOpts{ | ||
Namespace: consts.MetricsNamespace, | ||
|
@@ -50,24 +80,44 @@ var ( | |
func InitMetrics(registry *prometheus.Registry) { | ||
registry.MustRegister(ErrorTotal) | ||
registry.MustRegister(HandlerErrors) | ||
|
||
// Initialize metrics with labels | ||
for er := range errorTypeLabelValues { | ||
GetErrorTotal(er).Add(0) | ||
} | ||
for opcode := range ops.OpCodeStrings { | ||
if opcode != ops.MsgOpUndef && opcode != ops.MsgOpTest { | ||
GetHandlerErrors(opcode, HandlePerfHandlerError).Add(0) | ||
} | ||
} | ||
// NB: We initialize only ops.MsgOpUndef here, but unknown_opcode can occur for any opcode | ||
// that is not explicitly handled. | ||
GetHandlerErrors(ops.MsgOpUndef, HandlePerfUnknownOp).Add(0) | ||
|
||
// NOTES: | ||
// * op, msg_op, opcode - standardize on a label (+ add human-readable label) | ||
// * error, error_type, type - standardize on a label | ||
// * Delete errors_total{type="handler_error"} - it duplicates handler_errors_total | ||
// * Consider further splitting errors_total | ||
// * Rename handler_errors_total to event_handler_errors_total? | ||
} | ||
|
||
// Get a new handle on an ErrorTotal metric for an ErrorType | ||
func GetErrorTotal(t ErrorType) prometheus.Counter { | ||
return ErrorTotal.WithLabelValues(string(t)) | ||
func GetErrorTotal(er ErrorType) prometheus.Counter { | ||
return ErrorTotal.WithLabelValues(er.String()) | ||
} | ||
|
||
// Increment an ErrorTotal for an ErrorType | ||
func ErrorTotalInc(t ErrorType) { | ||
GetErrorTotal(t).Inc() | ||
func ErrorTotalInc(er ErrorType) { | ||
GetErrorTotal(er).Inc() | ||
} | ||
|
||
// Get a new handle on the HandlerErrors metric | ||
func GetHandlerErrors(opcode int, err error) prometheus.Counter { | ||
return HandlerErrors.WithLabelValues(fmt.Sprint(opcode), strings.ReplaceAll(fmt.Sprintf("%T", errors.Cause(err)), "*", "")) | ||
func GetHandlerErrors(opcode ops.OpCode, er EventHandlerError) prometheus.Counter { | ||
return HandlerErrors.WithLabelValues(fmt.Sprint(int32(opcode)), er.String()) | ||
} | ||
|
||
// Increment the HandlerErrors metric | ||
func HandlerErrorsInc(opcode int, err error) { | ||
GetHandlerErrors(opcode, err).Inc() | ||
func HandlerErrorsInc(opcode ops.OpCode, er EventHandlerError) { | ||
GetHandlerErrors(opcode, er).Inc() | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Does it make sense to link GH issues here?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah, I think so; I'm just not 100% sure which of these points are worth implementing and which are not. Many of these are technically breaking changes, but many of the metrics are probably used only by Tetragon developers, so 🤷‍♀️ I need one more pass over the metrics and maybe another pair of eyes, then I'll open issues for the improvements.