Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor job data set analysis fields #256

Merged
merged 1 commit into from
Dec 22, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
155 changes: 52 additions & 103 deletions classes/DataWarehouse/Query/SUPREMM/JobDataset.php
Original file line number Diff line number Diff line change
Expand Up @@ -180,17 +180,45 @@ public function __construct(
*/
private function addFieldByDefinition(array $fieldDef)
{
$tableAlias = $fieldDef['tableAlias'];
$table = null;
if (array_key_exists($tableAlias, $this->tables)) {
$table = $this->tables[$tableAlias];
} elseif (array_key_exists($tableAlias, $this->tableDefs)) {
$table = $this->addTableByDefinition($this->tableDefs[$tableAlias]);
if (array_key_exists('tableAlias', $fieldDef)) {
$tableAlias = $fieldDef['tableAlias'];
if (array_key_exists($tableAlias, $this->tables)) {
$table = $this->tables[$tableAlias];
} elseif (array_key_exists($tableAlias, $this->tableDefs)) {
$table = $this->addTableByDefinition($this->tableDefs[$tableAlias]);
} else {
throw new \Exception(sprintf('Unrecognized table alias "%s"', $tableAlias));
}
}

if (!array_key_exists('alias', $fieldDef)) {
throw new \Exception(sprintf('Missing alias for definition: %s', json_encode($fieldDef)));
}
$alias = $fieldDef['alias'];

if ($table !== null && array_key_exists('column', $fieldDef)) {
$this->addField(new TableField($table, $fieldDef['column'], $alias));
} elseif (array_key_exists('formula', $fieldDef)) {
$this->addField(new FormulaField($fieldDef['formula'], $alias));
} else {
throw new \Exception(sprintf('Unrecognized table alias "%s"', $tableAlias));
throw new \Exception(sprintf(
'Missing tableAlias and column or formula for "%s", definition: %s',
$alias,
json_encode($fieldDef)
));
}

$this->documentation[$alias] = $fieldDef;

if (array_key_exists('withError', $fieldDef)) {
$errorDef = $fieldDef['withError'];
$this->addErrorField(
$errorDef['column'],
$errorDef['tableAlias'],
array_key_exists('name', $errorDef) ? $errorDef['name'] : null
);
}
$this->addField(new TableField($table, $fieldDef['column'], $fieldDef['alias']));
$this->documentation[$fieldDef['alias']] = $fieldDef;
}

/**
Expand Down Expand Up @@ -228,21 +256,25 @@ private function joinTo($othertable, $joinkey, $otherkey, $colalias, $idcol = "i
}

/**
* Add a field and the corresponding error field to the query.
* Add an error field to the query.
*
* @param Field $field The field to add to the query.
* @param string $fieldName The name of the field.
* @param Table $errorTable The error table.
* @param string $errorTableAlias The error table alias.
* @param string $errorName The name of the error field, if null then the field name is
* autogenerated based on the fieldName.
*
* @return null
*/
private function addFieldWithError($field, $fieldName, $errorTable, $errorName = null)
private function addErrorField($fieldName, $errorTableAlias, $errorName = null)
{
static $errorTableIdx = 0;
$errorTable = null;
if (array_key_exists($errorTableAlias, $this->tables)) {
$errorTable = $this->tables[$errorTableAlias];
} elseif (array_key_exists($errorTableAlias, $this->tableDefs)) {
$errorTable = $this->addTableByDefinition($this->tableDefs[$errorTableAlias]);
} else {
throw new \Exception(sprintf('Unrecognized table alias "%s"', $errorTableAlias));
}

$this->addField($field);
static $errorTableIdx = 0;

$errordesc = new Table(
new Schema('modw'),
Expand Down Expand Up @@ -307,98 +339,15 @@ private function addMetricsFields()

private function addAnalyticsFields()
{
$dataTable = $this->getDataTable();
$joberrors = new Table(new Schema('modw_supremm'), 'job_errors', 'je');
$this->addTable($joberrors);

$this->addWhereCondition(
new WhereCondition(
new TableField($dataTable, '_id'),
'=',
new TableField($joberrors, '_id')
)
);

foreach ($this->fieldDefs as $sfield => $sdata) {
// TODO work out a better way to have metrics have multiple
// meta-types (ie cpu user is an analytic as well as a metric).
if ($sfield == "cpu_user") {
$this->addFieldWithError(new TableField($dataTable, $sfield), $sfield, $joberrors);
$this->documentation[$sfield] = $sdata;
$this->addFieldByDefinition($sdata);
$this->addErrorField($sfield, 'je');
} elseif ($sdata['dtype'] == 'analysis') {
$this->addFieldByDefinition($sdata);
}
}
$this->addFieldWithError(
new FormulaField("(1.0 - (1.0 / (1.0 + 1000.0 * jf.catastrophe)))", "homogeneity"),
'catastrophe',
$joberrors,
'homogeneity_error'
);
$this->documentation['homogeneity'] = array(
'name'=> 'Homogeneity',
'units' => 'ratio',
'per' => 'job',
'visibility' => 'public',
'documentation' => 'A measure of how uniform the L1D load rate is over the lifetime of the job.
Jobs with a low homogeneity value (~0) should be investigated to check if there
has been a catastrophic failure during the job',
'batchExport' => true,
'dtype' => 'analysis'
);

$this->addFieldWithError(
new FormulaField('(1.0 - (jf.cpu_user_imbalance/100.0))', 'cpu_user_balance'),
'cpu_user_imbalance',
$joberrors,
'cpu_user_balance_error'
);
$this->documentation['cpu_user_balance'] = array(
'name'=> 'CPU User Balance',
'units' => 'ratio',
'per' => 'job',
'visibility' => 'public',
'documentation' => 'A measure of how uniform the CPU usage is between the cores that the job was
assigned. A value of CPU User Balance near 1 corresponds to a job with evenly
loaded CPUs. A value near 0 corresponds to a job with one or more CPU cores
with much lower utilization that the others.',
'batchExport' => true,
'dtype' => 'analysis'
);

$this->addFieldWithError(
new FormulaField('(1.0 - 1.0/POW(2-jf.max_memory, 5))', 'mem_coefficient'),
'max_memory',
$joberrors,
'mem_coefficient_error'
);
$this->documentation['mem_coefficient'] = array(
'name'=> 'Memory Headroom',
'units' => 'ratio',
'per' => 'job',
'visibility' => 'public',
'documentation' => 'A measure of the peak compute-node memory usage for the job. A value of 0 corresponds
to a job which used all of the available memory and 1 corresponds to a job with low memory usage.
The value is computed as 1 - 1 / (2 - m)^5, where m is the ratio of memory used to memory available for
the compute node that had the highest memory usage.',
'batchExport' => true,
'dtype' => 'analysis'
);

$this->addFieldWithError(
new FormulaField("LEAST(jf.wall_time / jf.requested_wall_time, 1)", "wall_accuracy"),
'requested_wall_time',
$joberrors,
'requested_wall_time_error'
);
$this->documentation['wall_accuracy'] = array(
'name'=> 'Walltime Accuracy',
'units' => 'ratio',
'per' => 'job',
'visibility' => 'public',
'documentation' => 'The ratio of actual wall time to requested wall time. A value near 1 indicates that
the requested wall time close to the actual wall time. A good wall time accuracy improves
system wide scheduling.',
'batchExport' => true,
'dtype' => 'analysis'
);
}
}
67 changes: 37 additions & 30 deletions docs/customization.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,20 +4,20 @@ title: Customization

This document describes some advanced customizations for the Job Performance module.

**The automated upgade scripts do not have any support for preserving
**The automated upgrade scripts do not have any support for preserving
customizations. Any changes made to the underlying Open XDMoD source code
will likely be overwitten when the software us upgraded.**
will likely be overwritten when the software is upgraded.**

## Job Analytics

The Job analytics panel shows selected job performance metrics in color
The job analytics panel shows selected job performance metrics in color-coded
plots across the top of the job tab in the Job Viewer. The value of
each metric in the panel is normalized so a value near 1 means a favourable
value and a value near 0 indicates an unfavourable value.

There are five default analytics. These are the CPU usage,
CPU Balance, Walltime Accuracy, Memory Efficiency and Homogeneity, see Figure 1
below. If the CPU usage metric is unavailable then the analytics toolbar is not displayed.
There are five default analytics: CPU Usage, Homogeneity, CPU Balance,
Memory Efficiency, and Walltime Accuracy (see Figure 1 below). If the CPU Usage
metric is unavailable, then the analytics toolbar is not displayed.
If any of the other metrics are unavailable then an error message is displayed.

<figure>
Expand All @@ -27,12 +27,12 @@ If any of the other metrics are unavailable then an error message is displayed.

A common reason why an analytic is unavailable is that the underlying data was not collected
when the job was running. For example, the homogeneity analytic uses the L1D load count and
CPU clock tick counter hardware counter data. If the hardware counter data was not configured
CPU clock tick counter hardware counter data. If the hardware counter data were not configured
to be collected or the hardware does not support an L1D load counter then the homogeneity
metric will be unavailable. An example of the display in this case is shown in Figure 2.

<figure>
<img src="{{ site.baseurl }}/assets/images/analytics_unavailable.png" alt="Screenshot of showing a performance metric from the analytics toolbar where the performance datum is unavailable. The metric display shows an exclaimation mark icon with the text 'Metric Missing: Not Available On The Compute Nodes" />
<img src="{{ site.baseurl }}/assets/images/analytics_unavailable.png" alt="Screenshot showing a performance metric from the analytics toolbar where the performance datum is unavailable. The metric display shows an exclamation mark icon with the text 'Metric Missing: Not Available On The Compute Nodes'" />
<figcaption>Figure 2. Example analytics metric display when the datum is unavailable.</figcaption>
</figure>

Expand All @@ -44,26 +44,33 @@ hardware support), then the Open XDMoD instance can be customized to never show
**These instructions only apply to Open XDMoD {{ page.sw_version }}. For other
versions please refer to the documentation for that release.**

To remove an analytic, you need to edit `/usr/share/xdmod/classes/DataWarehouse/Query/SUPREMM/JobDataset.php`
and remove the code associated with the analytic. For example, to remove the homogeneity
analytic you would remove (or comment out) lines 330-346. I.e. the function call to `addFieldWithError` and the
update to the documentation object. The lines to remove are shown below.
```php
330 $this->addFieldWithError(
331 new FormulaField("(1.0 - (1.0 / (1.0 + 1000.0 * jf.catastrophe)))", "homogeneity"),
332 'catastrophe',
333 $joberrors,
334 'homogeneity_error'
335 );
336 $this->documentation['homogeneity'] = array(
337 'name'=> 'Homogeneity',
338 'units' => 'ratio',
339 'per' => 'job',
340 'visibility' => 'public',
341 'documentation' => 'A measure of how uniform the L1D load rate is over the lifetime of the job.
342 Jobs with a low homogeneity value (~0) should be investigated to check if there
343 has been a catastrophic failure during the job',
344 'batchExport' => true,
345 'dtype' => 'analysis'
346 );
To remove an analytic, you need to edit `/usr/share/xdmod/etl/js/config/supremm/etl.schema.js`
and remove the code associated with the analytic. For example, to remove the homogeneity
analytic you would remove (or comment out) lines 2716–2732. The lines to remove are shown below.
```js
2716 homogeneity: {
2717 name: 'Homogeneity',
2718 formula: '(1.0 - (1.0 / (1.0 + 1000.0 * jf.catastrophe)))',
2719 withError: {
2720 name: 'homogeneity_error',
2721 column: 'catastrophe',
2722 tableAlias: 'je'
2723 },
2724 unit: 'ratio',
2725 per: 'job',
2726 visibility: 'public',
2727 comments: 'A measure of how uniform the L1D load rate is over the lifetime of the job. ' +
2728 'Jobs with a low homogeneity value (~0) should be investigated to check if there ' +
2729 'has been a catastrophic failure during the job',
2730 batchExport: true,
2731 dtype: 'analysis'
2732 },
```

After editing the file, run:
```
# node /usr/share/xdmod/etl/js/etl.cli.js -i
```

To change the order in which the analytics appear in the toolbar, edit the
`metricOrder` variable in `/usr/share/xdmod/html/gui/js/modules/job_viewer/JobPanel.js`.
Loading