Skip to content

Commit

Permalink
New metric p4_cmds_paused_errors to count errors due to pause thresho…
Browse files Browse the repository at this point in the history
…lds being exceeded
  • Loading branch information
rcowham committed Aug 15, 2024
1 parent 0cf5a8f commit 07b4866
Show file tree
Hide file tree
Showing 4 changed files with 70 additions and 5 deletions.
11 changes: 6 additions & 5 deletions cmd/log2sql/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ func writeHeader(f io.Writer) {
activeThreadsMax int NULL, -- Active threads (max in last 10 secs)
pausedThreads int NULL, -- Paused threads
pausedThreadsMax int NULL, -- Paused threads (max in last 10 secs)
pausedErrorCount int NULL, -- Commands exited in error due to pause thresholds being exceeded
pauseRateCPU int NULL, -- Pause rate CPU (percentage 0-100)
pauseRateMem int NULL, -- Pause rate Mem (percentage 0-100)
cpuPressureState int NULL, -- CPU pressure (0 low, 1 med, 2 high)
Expand Down Expand Up @@ -158,10 +159,10 @@ func getProcessStatement() string {
func getEventsStatement() string {
return `INSERT INTO events
(lineNumber, eventTime,
activeThreads, activeThreadsMax, pausedThreads, pausedThreadsMax,
activeThreads, activeThreadsMax, pausedThreads, pausedThreadsMax, pausedErrorCount,
pauseRateCPU, pauseRateMem,
cpuPressureState, memPressureState)
VALUES (?,?,?,?,?,?,?,?,?,?)`
VALUES (?,?,?,?,?,?,?,?,?,?,?)`
}

func getTableUseStatement() string {
Expand Down Expand Up @@ -226,7 +227,7 @@ func preparedInsert(logger *logrus.Logger, stmtProcess, stmtTableuse *sqlite3.St
func preparedInsertServerEvents(logger *logrus.Logger, stmtEvents *sqlite3.Stmt, evt *p4dlog.ServerEvent) int64 {
rows := 1
err := stmtEvents.Exec(
evt.LineNo, dateStr(evt.EventTime), evt.ActiveThreads, evt.ActiveThreadsMax, evt.PausedThreads, evt.PausedThreadsMax,
evt.LineNo, dateStr(evt.EventTime), evt.ActiveThreads, evt.ActiveThreadsMax, evt.PausedThreads, evt.PausedThreadsMax, evt.PausedErrorCount,
evt.PauseRateCPU, evt.PauseRateMem, evt.CPUPressureState, evt.MemPressureState)
if err != nil {
logger.Errorf("Events insert: %v lineNo %d, %s",
Expand All @@ -237,8 +238,8 @@ func preparedInsertServerEvents(logger *logrus.Logger, stmtEvents *sqlite3.Stmt,

func writeSQLServerEvents(f io.Writer, evt *p4dlog.ServerEvent) int64 {
rows := 1
fmt.Fprintf(f, `INSERT INTO events VALUES (%d,"%s",%d,%d,%d,%d,%d,%d,%d,%d);`+"\n",
evt.LineNo, dateStr(evt.EventTime), evt.ActiveThreads, evt.ActiveThreadsMax, evt.PausedThreads, evt.PausedThreadsMax,
fmt.Fprintf(f, `INSERT INTO events VALUES (%d,"%s",%d,%d,%d,%d,%d,%d,%d,%d,%d);`+"\n",
evt.LineNo, dateStr(evt.EventTime), evt.ActiveThreads, evt.ActiveThreadsMax, evt.PausedThreads, evt.PausedThreadsMax, evt.PausedErrorCount,
evt.PauseRateCPU, evt.PauseRateMem, evt.CPUPressureState, evt.MemPressureState)
return int64(rows)
}
Expand Down
3 changes: 3 additions & 0 deletions metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ type P4DMetrics struct {
cmdsRunningMax int64
cmdsPaused int64 // Server Events
cmdsPausedMax int64 // Server Events
cmdsPausedErrorCount int64 // ditto
pauseRateCPU int64 // ditto
pauseRateMem int64 // ditto
cpuPressureState int64 // ditto
Expand Down Expand Up @@ -301,6 +302,7 @@ func (p4m *P4DMetrics) getCumulativeMetrics() string {
p4m.outputMetric(metrics, "p4_cmds_running_max", "The max number of running commands at any one time since last metric", "gauge", fmt.Sprintf("%d", p4m.cmdsRunningMax), fixedLabels)
p4m.outputMetric(metrics, "p4_cmds_paused", "The number of (resource pressure) paused commands at any one time", "gauge", fmt.Sprintf("%d", p4m.cmdsPaused), fixedLabels)
p4m.outputMetric(metrics, "p4_cmds_paused_max", "The max number of (resource pressure) paused commands since last metric", "gauge", fmt.Sprintf("%d", p4m.cmdsPausedMax), fixedLabels)
p4m.outputMetric(metrics, "p4_cmds_paused_errors", "The number of commands exited with error due to resource pressure thresholds being exceeded", "counter", fmt.Sprintf("%d", p4m.cmdsPausedErrorCount), fixedLabels)
p4m.outputMetric(metrics, "p4_pause_rate_cpu", "The (resource pressure) pause rate for CPU", "gauge", fmt.Sprintf("%d", p4m.pauseRateCPU), fixedLabels)
p4m.outputMetric(metrics, "p4_pause_rate_mem", "The (resource pressure) pause rate for Mem", "gauge", fmt.Sprintf("%d", p4m.pauseRateMem), fixedLabels)
p4m.outputMetric(metrics, "p4_pause_state_cpu", "The (resource pressure) pause state for CPU (0-2)", "gauge", fmt.Sprintf("%d", p4m.cpuPressureState), fixedLabels)
Expand Down Expand Up @@ -518,6 +520,7 @@ func (p4m *P4DMetrics) publishSvrEvent(evt p4dlog.ServerEvent) {
p4m.cmdsRunningMax = evt.ActiveThreadsMax
p4m.cmdsPaused = evt.PausedThreads
p4m.cmdsPausedMax = evt.PausedThreadsMax
p4m.cmdsPausedErrorCount = evt.PausedErrorCount
p4m.pauseRateCPU = evt.PauseRateCPU
p4m.pauseRateMem = evt.PauseRateMem
p4m.cpuPressureState = evt.CPUPressureState
Expand Down
55 changes: 55 additions & 0 deletions metrics/metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -847,3 +847,58 @@ p4_pause_state_mem{serverid="myserverid"} 1
p4_prom_svr_events_processed{serverid="myserverid"} 3`, -1)
compareOutput(t, expected, output)
}

func TestServerEventsPauseErrors(t *testing.T) {
cfg := &Config{
ServerID: "myserverid",
UpdateInterval: 10 * time.Millisecond,
OutputCmdsByUser: true}
input := `
Perforce server info:
2024/06/19 12:25:31 pid 1056864 perforce@ip-10-0-0-106 127.0.0.1 [p4/2024.1.TEST-TEST_ONLY/LINUX26X86_64/2611120] 'user-fstat -Ob //...'
Perforce server error:
Date 2024/06/19 12:25:31:
Pid 1056860
Operation: user-fstat
Operation 'user-fstat' failed.
Too many commands paused; terminated.
Perforce server info:
2024/06/19 12:25:39 pid 1056864 completed 8.39s 598+67us 304+0io 0+0net 68864k 0pf
Perforce server info:
2024/06/19 12:25:31 pid 1056864 perforce@ip-10-0-0-106 127.0.0.1 [p4/2024.1.TEST-TEST_ONLY/LINUX26X86_64/2611120] 'user-fstat -Ob //...'
--- exited on fatal server error
--- lapse 8.39s
--- usage 598+67us 304+0io 0+0net 68864k 0pf
--- memory cmd/proc 74mb/74mb
--- rpc msgs/size in+out 2+84225/0mb+45mb himarks 795416/795272 snd/rcv 5.64s/.002s
2024/06/19 12:25:39 004246895 pid 1056103: Server under resource pressure. Pause rate CPU 59%, mem 20%, CPU pressure 2, mem pressure 1
`
historical := false
output := basicTest(cfg, input, historical)

expected := eol.Split(`p4_cmd_counter{serverid="myserverid",cmd="user-fstat"} 1
p4_cmd_cpu_system_cumulative_seconds{serverid="myserverid",cmd="user-fstat"} 0.067
p4_cmd_cpu_user_cumulative_seconds{serverid="myserverid",cmd="user-fstat"} 0.598
p4_cmd_cumulative_seconds{serverid="myserverid",cmd="user-fstat"} 8.390
p4_cmd_error_counter{serverid="myserverid",cmd="user-fstat"} 1
p4_cmd_mem_mb{serverid="myserverid"} 74
p4_cmd_mem_peak_mb{serverid="myserverid"} 74
p4_cmd_program_counter{serverid="myserverid",program="p4/2024.1.TEST-TEST_ONLY/LINUX26X86_64/2611120"} 1
p4_cmd_program_cumulative_seconds{serverid="myserverid",program="p4/2024.1.TEST-TEST_ONLY/LINUX26X86_64/2611120"} 8.390
p4_cmd_running{serverid="myserverid"} 1
p4_cmd_user_counter{serverid="myserverid",user="perforce"} 1
p4_cmd_user_cumulative_seconds{serverid="myserverid",user="perforce"} 8.390
p4_cmds_paused_errors{serverid="myserverid"} 1
p4_cmds_running{serverid="myserverid"} 1
p4_pause_rate_cpu{serverid="myserverid"} 59
p4_pause_rate_mem{serverid="myserverid"} 20
p4_pause_state_cpu{serverid="myserverid"} 2
p4_pause_state_mem{serverid="myserverid"} 1
p4_prom_cmds_processed{serverid="myserverid"} 1
p4_prom_log_lines_read{serverid="myserverid"} 23
p4_prom_svr_events_processed{serverid="myserverid"} 1`, -1)
compareOutput(t, expected, output)
}
6 changes: 6 additions & 0 deletions p4dlog.go
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,7 @@ type ServerEvent struct {
ActiveThreadsMax int64 `json:"activeThreadsMax"`
PausedThreads int64 `json:"pausedThreads"`
PausedThreadsMax int64 `json:"pausedThreadsMax"`
PausedErrorCount int64 `json:"pausedErrorCount"`
PauseRateCPU int64 `json:"pauseRateCPU"` // Percentage 1-100
PauseRateMem int64 `json:"pauseRateMem"` // Percentage 1-100
CPUPressureState int64 `json:"cpuPressureState"` // 0-2
Expand Down Expand Up @@ -584,6 +585,7 @@ func (s *ServerEvent) MarshalJSON() ([]byte, error) {
ActiveThreadsMax int64 `json:"activeThreadsMax"`
PausedThreads int64 `json:"pausedThreads"`
PausedThreadsMax int64 `json:"pausedThreadsMax"`
PausedErrorCount int64 `json:"pausedErrorCount"`
PauseRateCPU int64 `json:"pauseRateCPU"` // Percentage 1-100
PauseRateMem int64 `json:"pauseRateMem"` // Percentage 1-100
CPUPressureState int64 `json:"cpuPressureState"` // 0-2
Expand All @@ -595,6 +597,7 @@ func (s *ServerEvent) MarshalJSON() ([]byte, error) {
ActiveThreadsMax: s.ActiveThreadsMax,
PausedThreads: s.PausedThreads,
PausedThreadsMax: s.PausedThreadsMax,
PausedErrorCount: s.PausedErrorCount,
PauseRateCPU: s.PauseRateCPU,
PauseRateMem: s.PauseRateMem,
CPUPressureState: s.CPUPressureState,
Expand Down Expand Up @@ -1091,6 +1094,7 @@ type P4dFileParser struct {
cmdsRunningMax int64 // Max No of currently running threads
cmdsPaused int64 // No of paused threads
cmdsPausedMax int64 // Max no of paused threads
cmdsPausedErrorCount int64 // Count of commands paused due to resource pressure errors
pauseRateCPU int64 // Resource pressure
pauseRateMem int64 // ditto
cpuPressureState int64 // ditto
Expand Down Expand Up @@ -1365,6 +1369,7 @@ func (fp *P4dFileParser) processTrackRecords(cmd *Command, lines []string) {
if strings.HasPrefix(line, trackFatalError) {
cmd.CmdError = true
hasTrackInfo = true
fp.cmdsPausedErrorCount += 1
continue
}
if strings.HasPrefix(line, trackDB) {
Expand Down Expand Up @@ -1712,6 +1717,7 @@ func (fp *P4dFileParser) outputSvrEvent(timeStr string, lineNo int64) {
ActiveThreadsMax: fp.cmdsRunningMax,
PausedThreads: fp.cmdsPaused,
PausedThreadsMax: fp.cmdsPausedMax,
PausedErrorCount: fp.cmdsPausedErrorCount,
PauseRateCPU: fp.pauseRateCPU,
PauseRateMem: fp.pauseRateMem,
CPUPressureState: fp.cpuPressureState,
Expand Down

0 comments on commit 07b4866

Please sign in to comment.