From d5d58f66ca7f9c46ec398162cc8e8a9fad1ae4d0 Mon Sep 17 00:00:00 2001 From: avelichk Date: Wed, 11 Aug 2021 21:56:09 +0100 Subject: [PATCH 1/4] Skip error in case of empty Process --- pkg/metricscollector/v1beta1/common/pns.go | 31 +++++++++++++--------- pkg/metricscollector/v1beta1/common/pns.py | 9 ++++--- 2 files changed, 24 insertions(+), 16 deletions(-) diff --git a/pkg/metricscollector/v1beta1/common/pns.go b/pkg/metricscollector/v1beta1/common/pns.go index 7fc19b51e3e..b1330566f05 100644 --- a/pkg/metricscollector/v1beta1/common/pns.go +++ b/pkg/metricscollector/v1beta1/common/pns.go @@ -26,6 +26,7 @@ import ( "time" psutil "github.com/shirou/gopsutil/process" + "k8s.io/klog" ) // WaitPidsOpts is the input options for metrics collector @@ -40,7 +41,7 @@ type WaitPidsOpts struct { func WaitMainProcesses(opts WaitPidsOpts) error { if runtime.GOOS != "linux" { - return fmt.Errorf("Platform '%s' unsupported", runtime.GOOS) + return fmt.Errorf("platform '%s' unsupported", runtime.GOOS) } pids, mainPid, err := GetMainProcesses(opts.CompletedMarkedDirPath) @@ -59,7 +60,7 @@ func GetMainProcesses(completedMarkedDirPath string) (map[int]bool, int, error) mainPid := 0 if err != nil { - return nil, 0, fmt.Errorf("Failed to list processes: %v", err) + return nil, 0, fmt.Errorf("failed to list processes: %v", err) } thisPID := os.Getpid() @@ -67,13 +68,14 @@ func GetMainProcesses(completedMarkedDirPath string) (map[int]bool, int, error) // Create process object from pid proc, err := psutil.NewProcess(pid) if err != nil { - return nil, 0, fmt.Errorf("Failed to create new Process from pid %v, error: %v", pid, err) + klog.Info("Skip Process with pid: %v, error: %v", pid, err) + continue } // Get parent process ppid, err := proc.Ppid() if err != nil { - return nil, 0, fmt.Errorf("Unable to get parent pid for pid: %v, error: %v", ppid, err) + return nil, 0, fmt.Errorf("unable to get parent pid for pid: %v, error: %v", ppid, err) } // Ignore the pause container, our own pid, and non-root processes (parent pid != 0) @@ -84,21 +86,24 @@ func GetMainProcesses(completedMarkedDirPath string) (map[int]bool, int, error) // Read the process command line cmdline, err := proc.Cmdline() if err != nil { - return nil, 0, fmt.Errorf("Unable to get cmdline from pid %v, error: %v", pid, err) + return nil, 0, fmt.Errorf("unable to get cmdline from pid %v, error: %v", pid, err) } - // By default mainPid is the first process. - // Command line contains completed marker for the main pid + // Command line contains completed marker for the main pid. // For example: echo completed > /var/log/katib/$$$$.pid // completedMarkedDirPath is the directory for completed marker, e.g. /var/log/katib - if mainPid == 0 || - strings.Contains(cmdline, fmt.Sprintf("echo %s > %s", TrainingCompleted, completedMarkedDirPath)) { + if strings.Contains(cmdline, fmt.Sprintf("echo %s > %s", TrainingCompleted, completedMarkedDirPath)) { mainPid = int(pid) } pids[int(pid)] = true } + // If mainPid has not been found, return an error. + if mainPid == 0 { + return nil, 0, fmt.Errorf("unable to find main pid from the process list %v", allPids) + } + return pids, mainPid, nil } @@ -132,7 +137,7 @@ func WaitPIDs(pids map[int]bool, mainPid int, opts WaitPidsOpts) error { // Read file with "completed" marker contents, err := ioutil.ReadFile(markFile) if err != nil { - return fmt.Errorf("Training container is failed. Unable to read file %v for pid %v, error: %v", markFile, pid, err) + return fmt.Errorf("training container is failed. Unable to read file %v for pid %v, error: %v", markFile, pid, err) } // Check if file contains "early-stopped" marker // In that case process is not completed @@ -141,7 +146,7 @@ func WaitPIDs(pids map[int]bool, mainPid int, opts WaitPidsOpts) error { } // Check if file contains "completed" marker if strings.TrimSpace(string(contents)) != TrainingCompleted { - return fmt.Errorf("Unable to find marker: %v in file: %v with contents: %v for pid: %v", + return fmt.Errorf("unable to find marker: %v in file: %v with contents: %v for pid: %v", TrainingCompleted, markFile, string(contents), pid) } } @@ -157,7 +162,7 @@ func WaitPIDs(pids map[int]bool, mainPid int, opts WaitPidsOpts) error { } // We should receive only not exist error when we check /proc/ dir } else { - return fmt.Errorf("Fail to check process info: %v", err) + return fmt.Errorf("fail to check process info: %v", err) } } } @@ -167,7 +172,7 @@ func WaitPIDs(pids map[int]bool, mainPid int, opts WaitPidsOpts) error { // After main loop notFinishedPids map should be empty if len(notFinishedPids) != 0 { - return fmt.Errorf("Timed out waiting for pids to complete") + return fmt.Errorf("timed out waiting for pids to complete") } return nil } diff --git a/pkg/metricscollector/v1beta1/common/pns.py b/pkg/metricscollector/v1beta1/common/pns.py index b45942c3f8b..1544691471c 100644 --- a/pkg/metricscollector/v1beta1/common/pns.py +++ b/pkg/metricscollector/v1beta1/common/pns.py @@ -54,15 +54,18 @@ def GetMainProcesses(completed_marked_dir): # Read the process command line, join all cmdlines in one string cmd_lind = " ".join(proc.cmdline()) - # By default main_pid is the first process. - # Command line contains completed marker for the main pid + # Command line contains completed marker for the main pid. # For example: echo completed > /var/log/katib/$$$$.pid # completed_marked_dir is the directory for completed marker, e.g. /var/log/katib - if main_pid == 0 or ("echo {} > {}".format(const.TRAINING_COMPLETED, completed_marked_dir) in cmd_lind): + if "echo {} > {}".format(const.TRAINING_COMPLETED, completed_marked_dir) in cmd_lind: main_pid = pid pids.add(pid) + # If mainPid has not been found, return an error. + if main_pid == 0: + raise Exception("Unable to find main pid from the process list {}".format(pids)) + return pids, main_pid From 38ee088582e8ec8c184b8e995978a1df2596ee6e Mon Sep 17 00:00:00 2001 From: avelichk Date: Wed, 11 Aug 2021 22:20:54 +0100 Subject: [PATCH 2/4] Fix print --- pkg/metricscollector/v1beta1/common/pns.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/metricscollector/v1beta1/common/pns.go b/pkg/metricscollector/v1beta1/common/pns.go index b1330566f05..c9599d26c87 100644 --- a/pkg/metricscollector/v1beta1/common/pns.go +++ b/pkg/metricscollector/v1beta1/common/pns.go @@ -68,7 +68,7 @@ func GetMainProcesses(completedMarkedDirPath string) (map[int]bool, int, error) // Create process object from pid proc, err := psutil.NewProcess(pid) if err != nil { - klog.Info("Skip Process with pid: %v, error: %v", pid, err) + klog.Infof("Skip Process with pid: %v, error: %v", pid, err) continue } From f3fde43aa96dae93ad7cf890fe84aad808c8e701 Mon Sep 17 00:00:00 2001 From: avelichk Date: Wed, 11 Aug 2021 22:38:56 +0100 Subject: [PATCH 3/4] Skip other errors --- pkg/metricscollector/v1beta1/common/pns.go | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pkg/metricscollector/v1beta1/common/pns.go b/pkg/metricscollector/v1beta1/common/pns.go index c9599d26c87..5e0825a8b5c 100644 --- a/pkg/metricscollector/v1beta1/common/pns.go +++ b/pkg/metricscollector/v1beta1/common/pns.go @@ -68,14 +68,15 @@ func GetMainProcesses(completedMarkedDirPath string) (map[int]bool, int, error) // Create process object from pid proc, err := psutil.NewProcess(pid) if err != nil { - klog.Infof("Skip Process with pid: %v, error: %v", pid, err) + klog.Infof("Unable to create new process from pid: %v, error: %v. Continue to next pid", pid, err) continue } // Get parent process ppid, err := proc.Ppid() if err != nil { - return nil, 0, fmt.Errorf("unable to get parent pid for pid: %v, error: %v", ppid, err) + klog.Infof("Unable to get parent process for pid: %v, error: %v. Continue to next pid", pid, err) + continue } // Ignore the pause container, our own pid, and non-root processes (parent pid != 0) @@ -86,7 +87,8 @@ func GetMainProcesses(completedMarkedDirPath string) (map[int]bool, int, error) // Read the process command line cmdline, err := proc.Cmdline() if err != nil { - return nil, 0, fmt.Errorf("unable to get cmdline from pid %v, error: %v", pid, err) + klog.Infof("Unable to get cmdline from pid: %v, error: %v. Continue to next pid", pid, err) + continue } // Command line contains completed marker for the main pid. From d497d4424485dfed07e88f69ec1125e5a790a7f5 Mon Sep 17 00:00:00 2001 From: avelichk Date: Wed, 11 Aug 2021 23:03:08 +0100 Subject: [PATCH 4/4] Set first process as default main --- pkg/metricscollector/v1beta1/common/pns.go | 6 ++++-- pkg/metricscollector/v1beta1/common/pns.py | 5 +++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/pkg/metricscollector/v1beta1/common/pns.go b/pkg/metricscollector/v1beta1/common/pns.go index 5e0825a8b5c..2baebe4aee8 100644 --- a/pkg/metricscollector/v1beta1/common/pns.go +++ b/pkg/metricscollector/v1beta1/common/pns.go @@ -91,10 +91,12 @@ func GetMainProcesses(completedMarkedDirPath string) (map[int]bool, int, error) continue } - // Command line contains completed marker for the main pid. + // By default mainPid is the first process. + // In addition to that, command line contains completed marker for the main pid // For example: echo completed > /var/log/katib/$$$$.pid // completedMarkedDirPath is the directory for completed marker, e.g. /var/log/katib - if strings.Contains(cmdline, fmt.Sprintf("echo %s > %s", TrainingCompleted, completedMarkedDirPath)) { + if mainPid == 0 || + strings.Contains(cmdline, fmt.Sprintf("echo %s > %s", TrainingCompleted, completedMarkedDirPath)) { mainPid = int(pid) } diff --git a/pkg/metricscollector/v1beta1/common/pns.py b/pkg/metricscollector/v1beta1/common/pns.py index 1544691471c..e2621c8162d 100644 --- a/pkg/metricscollector/v1beta1/common/pns.py +++ b/pkg/metricscollector/v1beta1/common/pns.py @@ -54,10 +54,11 @@ def GetMainProcesses(completed_marked_dir): # Read the process command line, join all cmdlines in one string cmd_lind = " ".join(proc.cmdline()) - # Command line contains completed marker for the main pid. + # By default main_pid is the first process. + # In addition to that, command line contains completed marker for the main pid. # For example: echo completed > /var/log/katib/$$$$.pid # completed_marked_dir is the directory for completed marker, e.g. /var/log/katib - if "echo {} > {}".format(const.TRAINING_COMPLETED, completed_marked_dir) in cmd_lind: + if main_pid == 0 or ("echo {} > {}".format(const.TRAINING_COMPLETED, completed_marked_dir) in cmd_lind): main_pid = pid pids.add(pid)