From 0a3ee7b5d63c1b98b5de56925f89fa17cadb00de Mon Sep 17 00:00:00 2001 From: "Jeffrey T. Palmer" Date: Wed, 1 Jul 2020 18:55:20 -0400 Subject: [PATCH 1/2] Update Slurm shredder to ignore non-ended jobs Also changes the Slurm helper script to query for jobs in all states. --- bin/xdmod-slurm-helper | 4 - classes/OpenXdmod/Shredder/Slurm.php | 71 +++++++++-- docs/resource-manager-slurm.md | 2 - .../slurm/input/non-ended-job-state.log | 2 + .../slurm/input/unknown-job-state.log | 2 + .../slurm/output/non-ended-job-state.json | 14 +++ .../slurm/output/unknown-job-state.json | 20 ++++ .../Tests/Shredder/SlurmShredderTest.php | 111 ++++++++++++++++++ 8 files changed, 207 insertions(+), 19 deletions(-) create mode 100644 tests/artifacts/xdmod/unit/shredder/slurm/input/non-ended-job-state.log create mode 100644 tests/artifacts/xdmod/unit/shredder/slurm/input/unknown-job-state.log create mode 100644 tests/artifacts/xdmod/unit/shredder/slurm/output/non-ended-job-state.json create mode 100644 tests/artifacts/xdmod/unit/shredder/slurm/output/unknown-job-state.json diff --git a/bin/xdmod-slurm-helper b/bin/xdmod-slurm-helper index 42dc6f3fb1..f65fb78bb2 100755 --- a/bin/xdmod-slurm-helper +++ b/bin/xdmod-slurm-helper @@ -235,10 +235,6 @@ function getSacctCmdArgs( $args[] = '--format'; $args[] = $format; - $states = implode(',', $shredder->getStates()); - $args[] = '--state'; - $args[] = $states; - if ($startTime !== null) { $args[] = '--starttime'; $args[] = $startTime; diff --git a/classes/OpenXdmod/Shredder/Slurm.php b/classes/OpenXdmod/Shredder/Slurm.php index 3801a43e29..5cb88589bb 100644 --- a/classes/OpenXdmod/Shredder/Slurm.php +++ b/classes/OpenXdmod/Shredder/Slurm.php @@ -153,9 +153,10 @@ class Slurm extends Shredder * The Slurm job states corresponding to jobs that are no longer * running. * - * @var array + * @var string[] */ - protected static $states = array( + private static $endedJobStates = [ + 'BOOT_FAIL', 'CANCELLED', 'COMPLETED', 'FAILED', @@ -165,7 +166,28 @@ class Slurm extends Shredder 'OUT_OF_MEMORY', 'DEADLINE', 'REVOKED' - ); + ]; + + /** + * The Slurm job states corresponding to jobs that have not started or not + * ended. + * + * @var string[] + */ + private static $nonEndedJobStates = [ + 'PENDING', + 'RUNNING', + 'REQUEUED', + 'RESIZING', + 'SUSPENDED' + ]; + + /** + * Any job states that are not currently known to the shredder. + * + * @var string[] + */ + private static $unknownJobStates = []; /** * Time zone used when parsing datetimes. @@ -230,6 +252,39 @@ public function shredLine($line) return; } + // Split the job state because canceled jobs are reported as "CANCELLED + // by ...". + list($jobState) = explode(' ', strtoupper($job['state']), 2); + + if (!in_array($jobState, self::$endedJobStates)) { + if (in_array($jobState, self::$nonEndedJobStates)) { + $this->logger->debug( + sprintf( + 'Skipping job with non-ended state "%s"', + $jobState + ) + ); + return; + } + + // Warn about an unknown job state the first time it is + // encountered. + if (!in_array($jobState, self::$unknownJobStates)) { + $this->logger->warning( + sprintf( + 'Found job with unknown state "%s", ' + . 'all jobs with this state will be ignored', + $jobState + ) + ); + self::$unknownJobStates[] = $jobState; + } + $this->logger->debug( + sprintf('Skipping job with unknown state "%s"', $jobState) + ); + return; + } + $this->logger->debug('Parsed data: ' . json_encode($job)); $node = $this->getFirstNode($job['node_list']); @@ -313,16 +368,6 @@ public function getFieldNames() return self::$fieldNames; } - /** - * Returns the states for completed jobs as named by sacct. - * - * @return array - */ - public function getStates() - { - return self::$states; - } - /** * Return the first node from a nodeset. * diff --git a/docs/resource-manager-slurm.md b/docs/resource-manager-slurm.md index 2aa10a72ab..d916c9f683 100644 --- a/docs/resource-manager-slurm.md +++ b/docs/resource-manager-slurm.md @@ -45,8 +45,6 @@ $ TZ=UTC sacct --clusters *cluster* --allusers \ --format jobid,jobidraw,cluster,partition,account,group,gid,user,uid,\ submit,eligible,start,end,elapsed,exitcode,state,nnodes,ncpus,reqcpus,reqmem,\ reqgres,reqtres,alloctres,timelimit,nodelist,jobname \ - --state CANCELLED,COMPLETED,FAILED,NODE_FAIL,PREEMPTED,TIMEOUT,\ -OUT_OF_MEMORY,DEADLINE,REVOKED \ --starttime 2013-01-01T00:00:00 --endtime 2013-01-01T23:59:59 \ >/tmp/slurm.log diff --git a/tests/artifacts/xdmod/unit/shredder/slurm/input/non-ended-job-state.log b/tests/artifacts/xdmod/unit/shredder/slurm/input/non-ended-job-state.log new file mode 100644 index 0000000000..b95467989b --- /dev/null +++ b/tests/artifacts/xdmod/unit/shredder/slurm/input/non-ended-job-state.log @@ -0,0 +1,2 @@ +3451067|3451067|phillips|focaccia|chaff|chaff|89200691|noror|89200691|2020-06-24T15:06:34|2020-06-24T15:06:34|2020-06-30T19:24:08|2020-06-30T19:24:08|22:32:22|0:0|RUNNING|1|12|12|48000Mn||billing=12,cpu=12,mem=48000M,node=1|billing=12,cpu=12,mem=48000M,node=1|3-00:00:00|cpn-k07-02-01|0065 +3483276|3483276|phillips|black|taifl|taifl|311132|honbu|311132|2020-07-01T15:28:17|2020-07-01T15:28:17|Unknown|2020-07-01T15:28:17|00:00:00|0:0|PENDING|1|40|40|120000Mn||billing=40,cpu=40,mem=120000M,node=1||3-00:00:00|cpn-k07-02-01|21PyNP4a4b diff --git a/tests/artifacts/xdmod/unit/shredder/slurm/input/unknown-job-state.log b/tests/artifacts/xdmod/unit/shredder/slurm/input/unknown-job-state.log new file mode 100644 index 0000000000..62948d7c98 --- /dev/null +++ b/tests/artifacts/xdmod/unit/shredder/slurm/input/unknown-job-state.log @@ -0,0 +1,2 @@ +3451067|3451067|phillips|focaccia|chaff|chaff|89200691|noror|89200691|2020-06-24T15:06:34|2020-06-24T15:06:34|2020-06-30T19:24:08|2020-06-30T19:24:08|22:32:22|0:0|FOO|1|12|12|48000Mn||billing=12,cpu=12,mem=48000M,node=1|billing=12,cpu=12,mem=48000M,node=1|3-00:00:00|cpn-k07-02-01|0065 +3483276|3483276|phillips|black|taifl|taifl|311132|honbu|311132|2020-07-01T15:28:17|2020-07-01T15:28:17|Unknown|2020-07-01T15:28:17|00:00:00|0:0|BAR|1|40|40|120000Mn||billing=40,cpu=40,mem=120000M,node=1||3-00:00:00|cpn-k07-02-01|21PyNP4a4b diff --git a/tests/artifacts/xdmod/unit/shredder/slurm/output/non-ended-job-state.json b/tests/artifacts/xdmod/unit/shredder/slurm/output/non-ended-job-state.json new file mode 100644 index 0000000000..76f1d13165 --- /dev/null +++ b/tests/artifacts/xdmod/unit/shredder/slurm/output/non-ended-job-state.json @@ -0,0 +1,14 @@ +[ + { + "debug": [ + "/^Shredding line/", + "/^Skipping job with non-ended state/" + ] + }, + { + "debug": [ + "/^Shredding line/", + "/^Skipping job with non-ended state/" + ] + } +] diff --git a/tests/artifacts/xdmod/unit/shredder/slurm/output/unknown-job-state.json b/tests/artifacts/xdmod/unit/shredder/slurm/output/unknown-job-state.json new file mode 100644 index 0000000000..a23e0dd779 --- /dev/null +++ b/tests/artifacts/xdmod/unit/shredder/slurm/output/unknown-job-state.json @@ -0,0 +1,20 @@ +[ + { + "warning": [ + "/^Found job with unknown state/" + ], + "debug": [ + "/^Shredding line/", + "/^Skipping job with unknown state/" + ] + }, + { + "warning": [ + "/^Found job with unknown state/" + ], + "debug": [ + "/^Shredding line/", + "/^Skipping job with unknown state/" + ] + } +] diff --git a/tests/unit/lib/OpenXdmod/Tests/Shredder/SlurmShredderTest.php b/tests/unit/lib/OpenXdmod/Tests/Shredder/SlurmShredderTest.php index a9de47de88..42f0a0f1ad 100644 --- a/tests/unit/lib/OpenXdmod/Tests/Shredder/SlurmShredderTest.php +++ b/tests/unit/lib/OpenXdmod/Tests/Shredder/SlurmShredderTest.php @@ -100,6 +100,89 @@ public function testJobGpuGresParsing($line, $gpuCount) $shredder->shredLine($line); } + /** + * Test how job records with non-ended job states are handled. + * + * @dataProvider nonEndedJobStateLogProvider + */ + public function testNonEndedJobStateHandling($line, $messages) + { + $shredder = $this + ->getMockBuilder('\OpenXdmod\Shredder\Slurm') + ->setConstructorArgs([$this->db]) + ->setMethods(['insertRow']) + ->getMock(); + $shredder + ->expects($this->never()) + ->method('insertRow'); + + $logger = $this + ->getMockBuilder('\Log') + ->setMethods(['debug', 'warning']) + ->getMock(); + $logger + ->expects($this->never()) + ->method('warning'); + + // "withConsecutive" requires argument unpacking. + call_user_func_array( + [ + $logger->expects($this->exactly(count($messages['debug']))) + ->method('debug'), + 'withConsecutive' + ], + $this->convertLoggerArgumentsToAssertions($messages['debug']) + ); + + $shredder->setLogger($logger); + $shredder->shredLine($line); + } + + /** + * Test how job records with unknown job states are handled. + * + * @dataProvider unknownJobStateLogProvider + */ + public function testUnknownJobStateHandling($line, $messages) + { + $shredder = $this + ->getMockBuilder('\OpenXdmod\Shredder\Slurm') + ->setConstructorArgs([$this->db]) + ->setMethods(['insertRow']) + ->getMock(); + $shredder + ->expects($this->never()) + ->method('insertRow'); + + $logger = $this + ->getMockBuilder('\Log') + ->setMethods(['debug', 'warning']) + ->getMock(); + + // "withConsecutive" requires argument unpacking. + call_user_func_array( + [ + $logger->expects($this->exactly(count($messages['debug']))) + ->method('debug'), + 'withConsecutive' + ], + $this->convertLoggerArgumentsToAssertions($messages['debug']) + ); + + // "withConsecutive" requires argument unpacking. + call_user_func_array( + [ + $logger->expects($this->exactly(count($messages['warning']))) + ->method('warning'), + 'withConsecutive' + ], + $this->convertLoggerArgumentsToAssertions($messages['warning']) + ); + + $shredder->setLogger($logger); + $shredder->shredLine($line); + } + public function accountingLogProvider() { return $this->getLogFileTestCases('accounting-logs'); @@ -114,4 +197,32 @@ public function accountingLogWithGpuGresProvider() { return $this->getLogFileTestCases('accounting-logs-with-gpu-gres'); } + + public function nonEndedJobStateLogProvider() + { + return $this->getLogFileTestCases('non-ended-job-state'); + } + + public function unknownJobStateLogProvider() + { + return $this->getLogFileTestCases('unknown-job-state'); + } + + /** + * Convert test data to PHPUnit asserts. + * + * Transforms the test used to test log messages. Input is an array of + * strings that are regular expression. + * + * @param string[] $loggerPatterns + * @return array[] + */ + private function convertLoggerArgumentsToAssertions(array $logPatterns) + { + $assertions = []; + foreach ($logPatterns as $pattern) { + $assertions[] = [$this->matchesRegularExpression($pattern)]; + } + return $assertions; + } } From a8fa033901c78160e9191cebc9a06a89de8460c5 Mon Sep 17 00:00:00 2001 From: "Jeffrey T. Palmer" Date: Thu, 2 Jul 2020 07:21:05 -0400 Subject: [PATCH 2/2] Add documentation --- docs/faq.md | 7 +++++++ docs/upgrade.md | 13 ++++++++++--- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/docs/faq.md b/docs/faq.md index 94df30773e..aba571c379 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -178,3 +178,10 @@ sites should use the https:// prefix in the web address. The template Apache configuration file must be edited to specify the path to valid SSL certificates. See the [webserver configuration section](configuration.html#apache-configuration) for details on how to configure the server. + +### Why do I see the warning message "Skipping job with unknown state ..." while shredding Slurm data? + +The Open XDMoD Slurm shredder will accept data for jobs in all states, but +ignore jobs that have not ended. If an unknown job state is encountered this +warning message will be generated. Please notify the Open XDMoD developers +about the unknown state using the [support](support.html) contact information. diff --git a/docs/upgrade.md b/docs/upgrade.md index f5d892e964..fb33dd042a 100644 --- a/docs/upgrade.md +++ b/docs/upgrade.md @@ -120,11 +120,18 @@ realm. Since Open XDMoD 6.5 data from slurm (`ReqGRES`) has been ingested into the database, but not displayed in the portal. These jobs may now be re-ingested and any GPU data will be used. -### Input File Format Changes +### Slurm Input File Format Changes The input file format for Slurm data has changed to include the `AllocTRES` -field. If you are generating Slurm input for the `xdmod-shredder` command then -you will need to make the appropriate changes. Refer to the [Slurm +field. + +The slurm shredder has also been updated to accept jobs in all states and to +ignore jobs that have not ended. Due to this change the `--state` option of +the `sacct` command is no longer recommended. If an unrecognized state is +encountered a warning will be generated. + +**If you are generating Slurm input for the `xdmod-shredder` command then you +will need to make the appropriate changes.** Refer to the [Slurm Notes](resource-manager-slurm.html#input-format) for the example `sacct` command. If you are using the `xdmod-slurm-helper` command then no changes are necessary.