Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update Slurm shredder to ignore non-ended job states #1362

Merged
merged 2 commits into from
Jul 2, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 0 additions & 4 deletions bin/xdmod-slurm-helper
Original file line number Diff line number Diff line change
Expand Up @@ -235,10 +235,6 @@ function getSacctCmdArgs(
$args[] = '--format';
$args[] = $format;

$states = implode(',', $shredder->getStates());
$args[] = '--state';
$args[] = $states;

if ($startTime !== null) {
$args[] = '--starttime';
$args[] = $startTime;
Expand Down
71 changes: 58 additions & 13 deletions classes/OpenXdmod/Shredder/Slurm.php
Original file line number Diff line number Diff line change
Expand Up @@ -153,9 +153,10 @@ class Slurm extends Shredder
* The Slurm job states corresponding to jobs that are no longer
* running.
*
* @var array
* @var string[]
*/
protected static $states = array(
private static $endedJobStates = [
'BOOT_FAIL',
'CANCELLED',
'COMPLETED',
'FAILED',
Expand All @@ -165,7 +166,28 @@ class Slurm extends Shredder
'OUT_OF_MEMORY',
'DEADLINE',
'REVOKED'
);
];

/**
* The Slurm job states corresponding to jobs that have not started or not
* ended.
*
* @var string[]
*/
private static $nonEndedJobStates = [
'PENDING',
'RUNNING',
'REQUEUED',
'RESIZING',
'SUSPENDED'
];

/**
* Any job states that are not currently known to the shredder.
*
* @var string[]
*/
private static $unknownJobStates = [];

/**
* Time zone used when parsing datetimes.
Expand Down Expand Up @@ -230,6 +252,39 @@ public function shredLine($line)
return;
}

// Split the job state because canceled jobs are reported as "CANCELLED
// by ...".
list($jobState) = explode(' ', strtoupper($job['state']), 2);

if (!in_array($jobState, self::$endedJobStates)) {
if (in_array($jobState, self::$nonEndedJobStates)) {
$this->logger->debug(
sprintf(
'Skipping job with non-ended state "%s"',
$jobState
)
);
return;
}

// Warn about an unknown job state the first time it is
// encountered.
if (!in_array($jobState, self::$unknownJobStates)) {
$this->logger->warning(
sprintf(
'Found job with unknown state "%s", '
. 'all jobs with this state will be ignored',
$jobState
)
);
self::$unknownJobStates[] = $jobState;
}
$this->logger->debug(
sprintf('Skipping job with unknown state "%s"', $jobState)
);
return;
}

$this->logger->debug('Parsed data: ' . json_encode($job));

$node = $this->getFirstNode($job['node_list']);
Expand Down Expand Up @@ -313,16 +368,6 @@ public function getFieldNames()
return self::$fieldNames;
}

/**
* Returns the states for completed jobs as named by sacct.
*
* @return array
*/
public function getStates()
{
return self::$states;
}

/**
* Return the first node from a nodeset.
*
Expand Down
7 changes: 7 additions & 0 deletions docs/faq.md
Original file line number Diff line number Diff line change
Expand Up @@ -178,3 +178,10 @@ sites should use the https:// prefix in the web address.
The template Apache configuration file must be edited to specify the path to
valid SSL certificates. See the [webserver configuration section](configuration.html#apache-configuration)
for details on how to configure the server.

### Why do I see the warning message "Found job with unknown state ..." while shredding Slurm data?

The Open XDMoD Slurm shredder will accept data for jobs in all states, but
ignores jobs that have not ended. If an unknown job state is encountered, this
warning message will be generated and all jobs with that state will be ignored.
Please notify the Open XDMoD developers about the unknown state using the
[support](support.html) contact information.
2 changes: 0 additions & 2 deletions docs/resource-manager-slurm.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,6 @@ $ TZ=UTC sacct --clusters *cluster* --allusers \
--format jobid,jobidraw,cluster,partition,account,group,gid,user,uid,\
submit,eligible,start,end,elapsed,exitcode,state,nnodes,ncpus,reqcpus,reqmem,\
reqgres,reqtres,alloctres,timelimit,nodelist,jobname \
--state CANCELLED,COMPLETED,FAILED,NODE_FAIL,PREEMPTED,TIMEOUT,\
OUT_OF_MEMORY,DEADLINE,REVOKED \
--starttime 2013-01-01T00:00:00 --endtime 2013-01-01T23:59:59 \
>/tmp/slurm.log

Expand Down
13 changes: 10 additions & 3 deletions docs/upgrade.md
Original file line number Diff line number Diff line change
Expand Up @@ -120,11 +120,18 @@ realm. Since Open XDMoD 6.5 data from slurm (`ReqGRES`) has been ingested into
the database, but not displayed in the portal. These jobs may now be
re-ingested and any GPU data will be used.

### Input File Format Changes
### Slurm Input File Format Changes

The input file format for Slurm data has changed to include the `AllocTRES`
field. If you are generating Slurm input for the `xdmod-shredder` command then
you will need to make the appropriate changes. Refer to the [Slurm
field.

The Slurm shredder has also been updated to accept jobs in all states and to
ignore jobs that have not ended. Due to this change, the `--state` option of
the `sacct` command is no longer recommended. If an unrecognized state is
encountered, a warning will be generated.

**If you are generating Slurm input for the `xdmod-shredder` command then you
will need to make the appropriate changes.** Refer to the [Slurm
Notes](resource-manager-slurm.html#input-format) for the example `sacct`
command. If you are using the `xdmod-slurm-helper` command then no changes are
necessary.
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
3451067|3451067|phillips|focaccia|chaff|chaff|89200691|noror|89200691|2020-06-24T15:06:34|2020-06-24T15:06:34|2020-06-30T19:24:08|2020-06-30T19:24:08|22:32:22|0:0|RUNNING|1|12|12|48000Mn||billing=12,cpu=12,mem=48000M,node=1|billing=12,cpu=12,mem=48000M,node=1|3-00:00:00|cpn-k07-02-01|0065
3483276|3483276|phillips|black|taifl|taifl|311132|honbu|311132|2020-07-01T15:28:17|2020-07-01T15:28:17|Unknown|2020-07-01T15:28:17|00:00:00|0:0|PENDING|1|40|40|120000Mn||billing=40,cpu=40,mem=120000M,node=1||3-00:00:00|cpn-k07-02-01|21PyNP4a4b
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
3451067|3451067|phillips|focaccia|chaff|chaff|89200691|noror|89200691|2020-06-24T15:06:34|2020-06-24T15:06:34|2020-06-30T19:24:08|2020-06-30T19:24:08|22:32:22|0:0|FOO|1|12|12|48000Mn||billing=12,cpu=12,mem=48000M,node=1|billing=12,cpu=12,mem=48000M,node=1|3-00:00:00|cpn-k07-02-01|0065
3483276|3483276|phillips|black|taifl|taifl|311132|honbu|311132|2020-07-01T15:28:17|2020-07-01T15:28:17|Unknown|2020-07-01T15:28:17|00:00:00|0:0|BAR|1|40|40|120000Mn||billing=40,cpu=40,mem=120000M,node=1||3-00:00:00|cpn-k07-02-01|21PyNP4a4b
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
[
{
"debug": [
"/^Shredding line/",
"/^Skipping job with non-ended state/"
]
},
{
"debug": [
"/^Shredding line/",
"/^Skipping job with non-ended state/"
]
}
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
[
{
"warning": [
"/^Found job with unknown state/"
],
"debug": [
"/^Shredding line/",
"/^Skipping job with unknown state/"
]
},
{
"warning": [
"/^Found job with unknown state/"
],
"debug": [
"/^Shredding line/",
"/^Skipping job with unknown state/"
]
}
]
111 changes: 111 additions & 0 deletions tests/unit/lib/OpenXdmod/Tests/Shredder/SlurmShredderTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,89 @@ public function testJobGpuGresParsing($line, $gpuCount)
$shredder->shredLine($line);
}

/**
 * Verify that a job record in a non-ended state is skipped.
 *
 * The shredder must never insert a row and must never log a warning for
 * such a record. Each debug message must match the corresponding expected
 * pattern, in order.
 *
 * @param mixed $line The record passed to the shredder's "shredLine".
 * @param array $messages Expected log message patterns; the "debug" key
 *     holds the regular expressions for the debug messages.
 *
 * @dataProvider nonEndedJobStateLogProvider
 */
public function testNonEndedJobStateHandling($line, $messages)
{
    $shredderBuilder = $this->getMockBuilder('\OpenXdmod\Shredder\Slurm');
    $shredder = $shredderBuilder
        ->setConstructorArgs([$this->db])
        ->setMethods(['insertRow'])
        ->getMock();
    $shredder->expects($this->never())->method('insertRow');

    $loggerBuilder = $this->getMockBuilder('\Log');
    $logger = $loggerBuilder
        ->setMethods(['debug', 'warning'])
        ->getMock();
    $logger->expects($this->never())->method('warning');

    $debugPatterns = $messages['debug'];
    $debugExpectation = $logger
        ->expects($this->exactly(count($debugPatterns)))
        ->method('debug');

    // "withConsecutive" takes one argument list per expected call, so the
    // assertions have to be unpacked into separate arguments.
    call_user_func_array(
        [$debugExpectation, 'withConsecutive'],
        $this->convertLoggerArgumentsToAssertions($debugPatterns)
    );

    $shredder->setLogger($logger);
    $shredder->shredLine($line);
}

/**
 * Verify that a job record with an unknown state is skipped.
 *
 * The shredder must never insert a row for such a record. The debug and
 * warning messages must each match the corresponding expected pattern, in
 * order.
 *
 * @param mixed $line The record passed to the shredder's "shredLine".
 * @param array $messages Expected log message patterns; the "debug" and
 *     "warning" keys hold the regular expressions for each log level.
 *
 * @dataProvider unknownJobStateLogProvider
 */
public function testUnknownJobStateHandling($line, $messages)
{
    $shredderBuilder = $this->getMockBuilder('\OpenXdmod\Shredder\Slurm');
    $shredder = $shredderBuilder
        ->setConstructorArgs([$this->db])
        ->setMethods(['insertRow'])
        ->getMock();
    $shredder->expects($this->never())->method('insertRow');

    $loggerBuilder = $this->getMockBuilder('\Log');
    $logger = $loggerBuilder
        ->setMethods(['debug', 'warning'])
        ->getMock();

    // "withConsecutive" takes one argument list per expected call, so the
    // assertions have to be unpacked into separate arguments.
    $debugPatterns = $messages['debug'];
    $debugExpectation = $logger
        ->expects($this->exactly(count($debugPatterns)))
        ->method('debug');
    call_user_func_array(
        [$debugExpectation, 'withConsecutive'],
        $this->convertLoggerArgumentsToAssertions($debugPatterns)
    );

    // The warning expectations are registered the same way.
    $warningPatterns = $messages['warning'];
    $warningExpectation = $logger
        ->expects($this->exactly(count($warningPatterns)))
        ->method('warning');
    call_user_func_array(
        [$warningExpectation, 'withConsecutive'],
        $this->convertLoggerArgumentsToAssertions($warningPatterns)
    );

    $shredder->setLogger($logger);
    $shredder->shredLine($line);
}

public function accountingLogProvider()
{
return $this->getLogFileTestCases('accounting-logs');
Expand All @@ -114,4 +197,32 @@ public function accountingLogWithGpuGresProvider()
{
return $this->getLogFileTestCases('accounting-logs-with-gpu-gres');
}

/**
 * Data provider returning the "non-ended-job-state" log file test cases.
 */
public function nonEndedJobStateLogProvider()
{
    $testCases = $this->getLogFileTestCases('non-ended-job-state');

    return $testCases;
}

/**
 * Data provider returning the "unknown-job-state" log file test cases.
 */
public function unknownJobStateLogProvider()
{
    $testCases = $this->getLogFileTestCases('unknown-job-state');

    return $testCases;
}

/**
 * Convert test data to PHPUnit constraints.
 *
 * Transforms the expected log message patterns from the test data into
 * argument lists suitable for "withConsecutive". The input is an array of
 * strings, each of which is a regular expression; every pattern is wrapped
 * in a single-element array holding a "matchesRegularExpression"
 * constraint.
 *
 * @param string[] $logPatterns Regular expressions the log messages must
 *     match, in the order the messages are expected.
 * @return array[] One single-element argument list per pattern, in the
 *     same order as the input.
 */
private function convertLoggerArgumentsToAssertions(array $logPatterns)
{
    $assertions = [];
    foreach ($logPatterns as $pattern) {
        // Each logger call receives exactly one asserted argument: the
        // message.
        $assertions[] = [$this->matchesRegularExpression($pattern)];
    }
    return $assertions;
}
}