diff --git a/src/kube-runtime/GOPATH/src/github.com/microsoft/runtime/cmd/exithandler/main.go b/src/kube-runtime/GOPATH/src/github.com/microsoft/runtime/cmd/exithandler/main.go index cb852eb27d1..c7f238101a2 100644 --- a/src/kube-runtime/GOPATH/src/github.com/microsoft/runtime/cmd/exithandler/main.go +++ b/src/kube-runtime/GOPATH/src/github.com/microsoft/runtime/cmd/exithandler/main.go @@ -3,6 +3,7 @@ package main import ( "os" "strconv" + "time" "github.com/microsoft/runtime/pkg/aggregator" "github.com/microsoft/runtime/pkg/logger" @@ -50,6 +51,7 @@ func main() { } log.Info("start to generate the exit summary") + start := time.Now() a, err := aggregator.NewErrorAggregator(&logFiles, log) if err != nil { panic("fatal: create log aggregator: " + err.Error()) @@ -74,7 +76,8 @@ func main() { if err != nil { panic("fatal: dumping summary info: " + err.Error()) } - log.Info("finish generating the exit summary") + elapsed := time.Since(start) + log.Info("finish generating the exit summary, time consumed:", elapsed) os.Exit(exitInfo.Exitcode) } diff --git a/src/kube-runtime/GOPATH/src/github.com/microsoft/runtime/example/config/failurePatterns.yml b/src/kube-runtime/GOPATH/src/github.com/microsoft/runtime/example/config/failurePatterns.yml index 881c4fd9b4c..d2b5cab7a0f 100644 --- a/src/kube-runtime/GOPATH/src/github.com/microsoft/runtime/example/config/failurePatterns.yml +++ b/src/kube-runtime/GOPATH/src/github.com/microsoft/runtime/example/config/failurePatterns.yml @@ -7,9 +7,6 @@ - patterns: - exitCode: 1 userLogRegex: exec\(compile\(getattr\(token[a-z]*, 'open', open\)\(__file__\) - reason: Failed to open tocken file - solution: - - Please check the tocken file path containerExitCode: 10 - patterns: @@ -17,30 +14,18 @@ userLogRegex: failed with error code 1 in /tmp/[a-z]* - exitCode: 1 userLogRegex: package not find [a-z]* - reason: Some package not install property - solution: - - Please check the pip install command containerExitCode: 12 - patterns: - exitCode: 1 userLogRegex: connect tensorboard failed platformLogRegex: Failed to start tensorboard - reason: Tensorboad not startup correctly - solution: - - Please try again or connect admin containerExitCode: 15 - patterns: - exitCode: 137 - reason: User program terminated by SIGKILL - solution: - - Please check the log and retry again containerExitCode: 137 - patterns: - exitCode: 143 - reason: User program terminated by SIGTERM - solution: - - Please check the log and retry again containerExitCode: 143 \ No newline at end of file diff --git a/src/kube-runtime/GOPATH/src/github.com/microsoft/runtime/pkg/aggregator/error_aggregator.go b/src/kube-runtime/GOPATH/src/github.com/microsoft/runtime/pkg/aggregator/error_aggregator.go index 09cdb6f83fe..961b5fe4ba6 100644 --- a/src/kube-runtime/GOPATH/src/github.com/microsoft/runtime/pkg/aggregator/error_aggregator.go +++ b/src/kube-runtime/GOPATH/src/github.com/microsoft/runtime/pkg/aggregator/error_aggregator.go @@ -348,10 +348,26 @@ func (a *ErrorAggregator) getTailContentFromFile(f *os.File, maxTailSize int64) return content, err } -func (a *ErrorAggregator) truncateLog(logConent *string, truncateSize int) (*string, int) { +func (a *ErrorAggregator) truncateLog(logConent *string, truncateSize int, matchString *string) (*string, int) { + if logConent == nil { + return nil, 0 + } + logSize := len(*logConent) + matchBeginPos := -1 + if matchString != nil { + matchBeginPos = strings.Index(*logConent, *matchString) + } + if logSize > truncateSize { - truncatedLog := (*logConent)[truncateSize:] + if matchString == nil || matchBeginPos == -1 || matchBeginPos > truncateSize { + truncatedLog := (*logConent)[truncateSize:] + return &truncatedLog, logSize - len(truncatedLog) + } + // try to keep the match string as much as posible + truncatedLog := (*logConent)[matchBeginPos:] + remainTruncateSize := truncateSize - matchBeginPos + truncatedLog = truncatedLog[:len(truncatedLog)-remainTruncateSize] return &truncatedLog, logSize - len(truncatedLog) } return nil, logSize @@ -372,7 +388,7 @@ func (a *ErrorAggregator) truncateExitSummary(runtimeExitInfo *RuntimeExitInfo) if runtimeExitInfo.ErrorLogs != nil { // truncate runtime log first - truncatedRuntimeLog, trucatedSize := a.truncateLog(runtimeExitInfo.ErrorLogs.Platform, remainTruncateSize) + truncatedRuntimeLog, trucatedSize := a.truncateLog(runtimeExitInfo.ErrorLogs.Platform, remainTruncateSize, runtimeExitInfo.MatchedPlatformLogString) runtimeExitInfo.ErrorLogs.Platform = truncatedRuntimeLog remainTruncateSize = remainTruncateSize - trucatedSize if remainTruncateSize <= 0 { @@ -381,7 +397,7 @@ func (a *ErrorAggregator) truncateExitSummary(runtimeExitInfo *RuntimeExitInfo) } // truncate the user log - truncatedUserLog, trucatedSize := a.truncateLog(runtimeExitInfo.ErrorLogs.User, remainTruncateSize) + truncatedUserLog, trucatedSize := a.truncateLog(runtimeExitInfo.ErrorLogs.User, remainTruncateSize, runtimeExitInfo.MatchedUserLogString) runtimeExitInfo.ErrorLogs.User = truncatedUserLog remainTruncateSize = remainTruncateSize - trucatedSize @@ -440,17 +456,20 @@ func NewErrorAggregator(l *LogFiles, logger *logger.Logger) (*ErrorAggregator, e return nil, errors.New("logger not provide") } + const exitInfoBeginTag = "[PAI_RUNTIME_ERROR_START]" + const exitInfoEndTag = "[PAI_RUNTIME_ERROR_END]" + a := ErrorAggregator{ logFiles: l, logger: logger, - maxAggregateLogSize: 4096, + maxAggregateLogSize: 4096 - len(exitInfoBeginTag) - len(exitInfoEndTag), maxMatchLogLen: 2048, - maxUserLogLines: 20, + maxUserLogLines: 15, maxRuntimeLogLines: 10, defaulExitCode: 255, maxSearchLogSize: 10 * 1024 * 1024, // 10MB - aggExitInfoBegin: "[PAI_RUNTIME_ERROR_START]", - aggExitInfoEnd: "[PAI_RUNTIME_ERROR_END]", + aggExitInfoBegin: exitInfoBeginTag, + aggExitInfoEnd: exitInfoEndTag, } return &a, nil }