Skip to content
This repository has been archived by the owner on Jun 6, 2024. It is now read-only.

Commit

Permalink
fix error aggregator bug
Browse files Browse the repository at this point in the history
  • Loading branch information
Binyang2014 committed Sep 17, 2019
1 parent 6f4fc57 commit 1998650
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 24 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package main
import (
"os"
"strconv"
"time"

"github.com/microsoft/runtime/pkg/aggregator"
"github.com/microsoft/runtime/pkg/logger"
Expand Down Expand Up @@ -50,6 +51,7 @@ func main() {
}

log.Info("start to generate the exit summary")
start := time.Now()
a, err := aggregator.NewErrorAggregator(&logFiles, log)
if err != nil {
panic("fatal: create log aggregator: " + err.Error())
Expand All @@ -74,7 +76,8 @@ func main() {
if err != nil {
panic("fatal: dumping summary info: " + err.Error())
}
log.Info("finish generating the exit summary")
elapsed := time.Since(start)
log.Info("finish generating the exit summary, time consumed:", elapsed)

os.Exit(exitInfo.Exitcode)
}
Original file line number Diff line number Diff line change
Expand Up @@ -7,40 +7,25 @@
- patterns:
- exitCode: 1
userLogRegex: exec\(compile\(getattr\(token[a-z]*, 'open', open\)\(__file__\)
reason: Failed to open tocken file
solution:
- Please check the tocken file path
containerExitCode: 10

- patterns:
- exitCode: 1
userLogRegex: failed with error code 1 in /tmp/[a-z]*
- exitCode: 1
userLogRegex: package not find [a-z]*
reason: Some package not install property
solution:
- Please check the pip install command
containerExitCode: 12

- patterns:
- exitCode: 1
userLogRegex: connect tensorboard failed
platformLogRegex: Failed to start tensorboard
reason: Tensorboad not startup correctly
solution:
- Please try again or connect admin
containerExitCode: 15

- patterns:
- exitCode: 137
reason: User program terminated by SIGKILL
solution:
- Please check the log and retry again
containerExitCode: 137

- patterns:
- exitCode: 143
reason: User program terminated by SIGTERM
solution:
- Please check the log and retry again
containerExitCode: 143
Original file line number Diff line number Diff line change
Expand Up @@ -348,10 +348,26 @@ func (a *ErrorAggregator) getTailContentFromFile(f *os.File, maxTailSize int64)
return content, err
}

func (a *ErrorAggregator) truncateLog(logConent *string, truncateSize int) (*string, int) {
func (a *ErrorAggregator) truncateLog(logConent *string, truncateSize int, matchString *string) (*string, int) {
if logConent == nil {
return nil, 0
}

logSize := len(*logConent)
matchBeginPos := -1
if matchString != nil {
matchBeginPos = strings.Index(*logConent, *matchString)
}

if logSize > truncateSize {
truncatedLog := (*logConent)[truncateSize:]
if matchString == nil || matchBeginPos == -1 || matchBeginPos > truncateSize {
truncatedLog := (*logConent)[truncateSize:]
return &truncatedLog, logSize - len(truncatedLog)
}
// try to keep as much of the matched string as possible
truncatedLog := (*logConent)[matchBeginPos:]
remainTruncateSize := truncateSize - matchBeginPos
truncatedLog = truncatedLog[:len(truncatedLog)-remainTruncateSize]
return &truncatedLog, logSize - len(truncatedLog)
}
return nil, logSize
Expand All @@ -372,7 +388,7 @@ func (a *ErrorAggregator) truncateExitSummary(runtimeExitInfo *RuntimeExitInfo)

if runtimeExitInfo.ErrorLogs != nil {
// truncate runtime log first
truncatedRuntimeLog, trucatedSize := a.truncateLog(runtimeExitInfo.ErrorLogs.Platform, remainTruncateSize)
truncatedRuntimeLog, trucatedSize := a.truncateLog(runtimeExitInfo.ErrorLogs.Platform, remainTruncateSize, runtimeExitInfo.MatchedPlatformLogString)
runtimeExitInfo.ErrorLogs.Platform = truncatedRuntimeLog
remainTruncateSize = remainTruncateSize - trucatedSize
if remainTruncateSize <= 0 {
Expand All @@ -381,7 +397,7 @@ func (a *ErrorAggregator) truncateExitSummary(runtimeExitInfo *RuntimeExitInfo)
}

// truncate the user log
truncatedUserLog, trucatedSize := a.truncateLog(runtimeExitInfo.ErrorLogs.User, remainTruncateSize)
truncatedUserLog, trucatedSize := a.truncateLog(runtimeExitInfo.ErrorLogs.User, remainTruncateSize, runtimeExitInfo.MatchedUserLogString)
runtimeExitInfo.ErrorLogs.User = truncatedUserLog
remainTruncateSize = remainTruncateSize - trucatedSize

Expand Down Expand Up @@ -440,17 +456,20 @@ func NewErrorAggregator(l *LogFiles, logger *logger.Logger) (*ErrorAggregator, e
return nil, errors.New("logger not provide")
}

const exitInfoBeginTag = "[PAI_RUNTIME_ERROR_START]"
const exitInfoEndTag = "[PAI_RUNTIME_ERROR_END]"

a := ErrorAggregator{
logFiles: l,
logger: logger,
maxAggregateLogSize: 4096,
maxAggregateLogSize: 4096 - len(exitInfoBeginTag) - len(exitInfoEndTag),
maxMatchLogLen: 2048,
maxUserLogLines: 20,
maxUserLogLines: 15,
maxRuntimeLogLines: 10,
defaulExitCode: 255,
maxSearchLogSize: 10 * 1024 * 1024, // 10MB
aggExitInfoBegin: "[PAI_RUNTIME_ERROR_START]",
aggExitInfoEnd: "[PAI_RUNTIME_ERROR_END]",
aggExitInfoBegin: exitInfoBeginTag,
aggExitInfoEnd: exitInfoEndTag,
}
return &a, nil
}

0 comments on commit 1998650

Please sign in to comment.