Skip to content

Commit

Permalink
add command labels in tcp connect metrics and span attributes (#260)
Browse files Browse the repository at this point in the history
* feat: add command info in tcp_connect metrics

Signed-off-by: niejiangang <niejiangang@harmonycloud.cn>

* feat: add pid and command info in trace

Signed-off-by: niejiangang <niejiangang@harmonycloud.cn>

* doc: add comm description in doc

Signed-off-by: niejiangang <niejiangang@harmonycloud.cn>

* doc: update config

Signed-off-by: niejiangang <niejiangang@harmonycloud.cn>

* fix: add command info in connectionStats

Signed-off-by: niejiangang <niejiangang@harmonycloud.cn>

* fix: add comm in aggregation Selectors

Signed-off-by: niejiangang <niejiangang@harmonycloud.cn>
  • Loading branch information
NeJan2020 authored Jun 23, 2022
1 parent bd2a843 commit f8cd235
Show file tree
Hide file tree
Showing 13 changed files with 39 additions and 4 deletions.
3 changes: 3 additions & 0 deletions collector/analyzer/network/network_analyzer.go
Original file line number Diff line number Diff line change
Expand Up @@ -489,6 +489,7 @@ func (na *NetworkAnalyzer) getConnectFailRecords(mps *messagePairs) []*model.Dat
ret.UpdateAddIntMetric(constvalues.ConnectTime, int64(mps.connects.getDuration()))
ret.UpdateAddIntMetric(constvalues.RequestTotalTime, int64(mps.connects.getDuration()))
ret.Labels.UpdateAddIntValue(constlabels.Pid, int64(evt.GetPid()))
ret.Labels.UpdateAddStringValue(constlabels.Comm, evt.GetComm())
ret.Labels.UpdateAddStringValue(constlabels.SrcIp, evt.GetSip())
ret.Labels.UpdateAddStringValue(constlabels.DstIp, evt.GetDip())
ret.Labels.UpdateAddIntValue(constlabels.SrcPort, int64(evt.GetSport()))
Expand All @@ -515,6 +516,7 @@ func (na *NetworkAnalyzer) getRecords(mps *messagePairs, protocol string, attrib
ret := na.dataGroupPool.Get()
labels := ret.Labels
labels.UpdateAddIntValue(constlabels.Pid, int64(evt.GetPid()))
labels.UpdateAddStringValue(constlabels.Comm, evt.GetComm())
labels.UpdateAddStringValue(constlabels.SrcIp, evt.GetSip())
labels.UpdateAddStringValue(constlabels.DstIp, evt.GetDip())
labels.UpdateAddIntValue(constlabels.SrcPort, int64(evt.GetSport()))
Expand Down Expand Up @@ -563,6 +565,7 @@ func (na *NetworkAnalyzer) getRecordWithSinglePair(mps *messagePairs, mp *messag
ret := na.dataGroupPool.Get()
labels := ret.Labels
labels.UpdateAddIntValue(constlabels.Pid, int64(evt.GetPid()))
labels.UpdateAddStringValue(constlabels.Comm, evt.GetComm())
labels.UpdateAddStringValue(constlabels.SrcIp, evt.GetSip())
labels.UpdateAddStringValue(constlabels.DstIp, evt.GetDip())
labels.UpdateAddIntValue(constlabels.SrcPort, int64(evt.GetSport()))
Expand Down
5 changes: 4 additions & 1 deletion collector/analyzer/tcpconnectanalyzer/analyzer.go
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,10 @@ func (a *TcpConnectAnalyzer) generateLabels(connectStats *internal.ConnectionSta
labels := model.NewAttributeMap()
// The connect events always come from the client-side
labels.AddBoolValue(constlabels.IsServer, false)
labels.AddIntValue(constlabels.Pid, int64(connectStats.Pid))
if a.config.NeedProcessInfo {
labels.AddIntValue(constlabels.Pid, int64(connectStats.Pid))
labels.AddStringValue(constlabels.Comm, connectStats.Comm)
}
labels.AddStringValue(constlabels.ContainerId, connectStats.ContainerId)
labels.AddIntValue(constlabels.Errno, int64(connectStats.Code))
if connectStats.StateMachine.GetCurrentState() == internal.Success {
Expand Down
6 changes: 4 additions & 2 deletions collector/analyzer/tcpconnectanalyzer/config.go
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
package tcpconnectanalyzer

// Config holds the settings of the tcpconnectanalyzer. Each field is bound to
// a configuration key through its mapstructure tag.
//
// NOTE(review): ChannelSize and WaitEventSecond are listed twice below;
// presumably this view interleaves the removed and the added lines of the
// diff and the real file declares each field once. Confirm against the file.
type Config struct {
ChannelSize int `mapstructure:"channel_size"`
WaitEventSecond int `mapstructure:"wait_event_second"`
ChannelSize int `mapstructure:"channel_size"`
WaitEventSecond int `mapstructure:"wait_event_second"`
// NeedProcessInfo controls whether the pid and comm labels are added to
// the labels generated for tcp connect metrics.
NeedProcessInfo bool `mapstructure:"need_process_info"`
}

// NewDefaultConfig returns a Config populated with the default settings:
// ChannelSize 2000, WaitEventSecond 10, and NeedProcessInfo disabled.
func NewDefaultConfig() *Config {
	cfg := new(Config)
	cfg.ChannelSize = 2000
	cfg.WaitEventSecond = 10
	// Process labels (pid and comm) are opt-in.
	cfg.NeedProcessInfo = false
	return cfg
}
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ func (c *ConnectMonitor) ReadInConnectExitSyscall(event *model.KindlingEvent) (*
// "connect_exit" comes to analyzer after "tcp_connect"
connStats.EndTimestamp = event.Timestamp
connStats.Pid = event.GetPid()
connStats.Comm = event.GetComm()
connStats.ContainerId = event.GetContainerId()
var eventType EventType
if retValueInt == 0 {
Expand Down Expand Up @@ -97,6 +98,7 @@ func (c *ConnectMonitor) ReadSendRequestSyscall(event *model.KindlingEvent) (*Co
return nil, nil
}
connStats.Pid = event.GetPid()
connStats.Comm = event.GetComm()
connStats.ContainerId = event.GetContainerId()
return connStats.StateMachine.ReceiveEvent(sendRequestSyscall, c.connMap)
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ const (

type ConnectionStats struct {
Pid uint32
Comm string
ContainerId string
ConnKey ConnKey
StateMachine *StateMachine
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ func TestCallback(t *testing.T) {
statesResource := createStatesResource()
connStats := &ConnectionStats{
Pid: 0,
Comm: "test",
ConnKey: connKey,
InitialTimestamp: 0,
EndTimestamp: 0,
Expand Down
2 changes: 2 additions & 0 deletions collector/consumer/exporter/tools/adapter/net_dict.go
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,8 @@ var SpanDicList = []dictionary{
//{constlabels.DstPod, constlabels.DstPod, String},
{constlabels.SrcNode, constlabels.SrcNode, String},
{constlabels.SrcPod, constlabels.SrcPod, String},
{constlabels.Pid, constlabels.Pid, Int64},
{constlabels.Comm, constlabels.Comm, String},
}

var topologyMetricDicList = []dictionary{
Expand Down
2 changes: 2 additions & 0 deletions collector/consumer/processor/aggregateprocessor/processor.go
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,7 @@ func (p *AggregateProcessor) Consume(dataGroup *model.DataGroup) error {
func newNetRequestLabelSelectors() *aggregator.LabelSelectors {
return aggregator.NewLabelSelectors(
aggregator.LabelSelector{Name: constlabels.Pid, VType: aggregator.IntType},
aggregator.LabelSelector{Name: constlabels.Comm, VType: aggregator.StringType},
aggregator.LabelSelector{Name: constlabels.Protocol, VType: aggregator.StringType},
aggregator.LabelSelector{Name: constlabels.IsServer, VType: aggregator.BooleanType},
aggregator.LabelSelector{Name: constlabels.ContainerId, VType: aggregator.StringType},
Expand Down Expand Up @@ -208,6 +209,7 @@ func newTcpLabelSelectors() *aggregator.LabelSelectors {
func newTcpConnectLabelSelectors() *aggregator.LabelSelectors {
return aggregator.NewLabelSelectors(
aggregator.LabelSelector{Name: constlabels.Pid, VType: aggregator.IntType},
aggregator.LabelSelector{Name: constlabels.Comm, VType: aggregator.StringType},
aggregator.LabelSelector{Name: constlabels.SrcNode, VType: aggregator.StringType},
aggregator.LabelSelector{Name: constlabels.SrcNodeIp, VType: aggregator.StringType},
aggregator.LabelSelector{Name: constlabels.SrcNamespace, VType: aggregator.StringType},
Expand Down
2 changes: 2 additions & 0 deletions collector/docker/kindling-collector-config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ analyzers:
tcpconnectanalyzer:
channel_size: 10000
wait_event_second: 10
# Whether to add the pid and command info to the labels of the tcp connect metrics
need_process_info: false
tcpmetricanalyzer:
networkanalyzer:
connect_timeout: 100
Expand Down
1 change: 1 addition & 0 deletions collector/model/constlabels/const.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ const (
)

const (
Comm = "comm"
Pid = "pid"
Protocol = "protocol"
IsError = "is_error"
Expand Down
12 changes: 12 additions & 0 deletions collector/model/kindling_event_helper.go
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,18 @@ func (x *KindlingEvent) GetPid() uint32 {
return threadInfo.Pid
}

// GetComm returns the Comm value stored in the thread info of the event's
// context. It returns an empty string when the event has no context or the
// context has no thread info.
func (x *KindlingEvent) GetComm() string {
	if eventCtx := x.GetCtx(); eventCtx != nil {
		if info := eventCtx.GetThreadInfo(); info != nil {
			return info.Comm
		}
	}
	return ""
}

func (x *KindlingEvent) GetContainerId() string {
ctx := x.GetCtx()
if ctx == nil {
Expand Down
2 changes: 2 additions & 0 deletions deploy/agent/kindling-collector-config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ analyzers:
tcpconnectanalyzer:
channel_size: 10000
wait_event_second: 10
# Whether to add the pid and command info to the labels of the tcp connect metrics
need_process_info: false
tcpmetricanalyzer:
networkanalyzer:
connect_timeout: 100
Expand Down
4 changes: 3 additions & 1 deletion docs/prometheus_metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,8 @@ We made some rules for considering whether a request is abnormal. For the abnorm
### Labels List
| **Label Name** | **Example** | **Notes** |
| --- | --- | --- |
| `pid` | 1024 | The client's process ID |
| `pid` | 1024 | The client's process ID|
| `comm` | java | The client's process command|
| `src_node` | slave-node1 | Which node the source pod is on |
| `src_namespace` | default | Namespace of the source pod |
| `src_workload_kind` | deployment | Workload kind of the source pod |
Expand Down Expand Up @@ -254,6 +255,7 @@ We made some rules for considering whether a request is abnormal. For the abnorm

**Note 2**: The field `errno` is not `0` only if the TCP socket is blocking and an error occurred. There are multiple possible values it could contain. See the `ERRORS` section of the [connect(2) manual](https://man7.org/linux/man-pages/man2/connect.2.html) for more details.

**Note 3**: The fields `pid` and `comm` are not present when `need_process_info` is set to `false` (the default), which reduces the load on Prometheus.

## PromQL Example
Here are some examples of how to use these metrics in Prometheus, which can help you understand them faster.
Expand Down

0 comments on commit f8cd235

Please sign in to comment.