From 107d52ae8f632b5013978eb650b3b0d6513f59ff Mon Sep 17 00:00:00 2001 From: Aaron Weeden Date: Thu, 21 Dec 2023 11:08:38 -0500 Subject: [PATCH] Refactor job data set analysis fields. Primary author was @jtpalmer Co-Authored-By: Jeffrey T. Palmer <328284+jtpalmer@users.noreply.github.com> --- .../Query/SUPREMM/JobDataset.php | 155 ++++++------------ docs/customization.md | 67 ++++---- etl/js/config/supremm/etl.schema.js | 92 ++++++++++- 3 files changed, 179 insertions(+), 135 deletions(-) diff --git a/classes/DataWarehouse/Query/SUPREMM/JobDataset.php b/classes/DataWarehouse/Query/SUPREMM/JobDataset.php index 6fe15342b..acd17ac85 100644 --- a/classes/DataWarehouse/Query/SUPREMM/JobDataset.php +++ b/classes/DataWarehouse/Query/SUPREMM/JobDataset.php @@ -180,17 +180,45 @@ public function __construct( */ private function addFieldByDefinition(array $fieldDef) { - $tableAlias = $fieldDef['tableAlias']; $table = null; - if (array_key_exists($tableAlias, $this->tables)) { - $table = $this->tables[$tableAlias]; - } elseif (array_key_exists($tableAlias, $this->tableDefs)) { - $table = $this->addTableByDefinition($this->tableDefs[$tableAlias]); + if (array_key_exists('tableAlias', $fieldDef)) { + $tableAlias = $fieldDef['tableAlias']; + if (array_key_exists($tableAlias, $this->tables)) { + $table = $this->tables[$tableAlias]; + } elseif (array_key_exists($tableAlias, $this->tableDefs)) { + $table = $this->addTableByDefinition($this->tableDefs[$tableAlias]); + } else { + throw new \Exception(sprintf('Unrecognized table alias "%s"', $tableAlias)); + } + } + + if (!array_key_exists('alias', $fieldDef)) { + throw new \Exception(sprintf('Missing alias for definition: %s', json_encode($fieldDef))); + } + $alias = $fieldDef['alias']; + + if ($table !== null && array_key_exists('column', $fieldDef)) { + $this->addField(new TableField($table, $fieldDef['column'], $alias)); + } elseif (array_key_exists('formula', $fieldDef)) { + $this->addField(new FormulaField($fieldDef['formula'], 
$alias)); } else { - throw new \Exception(sprintf('Unrecognized table alias "%s"', $tableAlias)); + throw new \Exception(sprintf( + 'Missing tableAlias and column or formula for "%s", definition: %s', + $alias, + json_encode($fieldDef) + )); + } + + $this->documentation[$alias] = $fieldDef; + + if (array_key_exists('withError', $fieldDef)) { + $errorDef = $fieldDef['withError']; + $this->addErrorField( + $errorDef['column'], + $errorDef['tableAlias'], + array_key_exists('name', $errorDef) ? $errorDef['name'] : null + ); } - $this->addField(new TableField($table, $fieldDef['column'], $fieldDef['alias'])); - $this->documentation[$fieldDef['alias']] = $fieldDef; } /** @@ -228,21 +256,25 @@ private function joinTo($othertable, $joinkey, $otherkey, $colalias, $idcol = "i } /** - * Add a field and the corresponding error field to the query. + * Add an error field to the query. * - * @param Field $field The field to add to the query. * @param string $fieldName The name of the field. - * @param Table $errorTable The error table. + * @param string $errorTableAlias The error table alias. * @param string $errorName The name of the error field, if null then the field name is * autogenerated based on the fieldName. 
- * - * @return null */ - private function addFieldWithError($field, $fieldName, $errorTable, $errorName = null) + private function addErrorField($fieldName, $errorTableAlias, $errorName = null) { - static $errorTableIdx = 0; + $errorTable = null; + if (array_key_exists($errorTableAlias, $this->tables)) { + $errorTable = $this->tables[$errorTableAlias]; + } elseif (array_key_exists($errorTableAlias, $this->tableDefs)) { + $errorTable = $this->addTableByDefinition($this->tableDefs[$errorTableAlias]); + } else { + throw new \Exception(sprintf('Unrecognized table alias "%s"', $errorTableAlias)); + } - $this->addField($field); + static $errorTableIdx = 0; $errordesc = new Table( new Schema('modw'), @@ -307,98 +339,15 @@ private function addMetricsFields() private function addAnalyticsFields() { - $dataTable = $this->getDataTable(); - $joberrors = new Table(new Schema('modw_supremm'), 'job_errors', 'je'); - $this->addTable($joberrors); - - $this->addWhereCondition( - new WhereCondition( - new TableField($dataTable, '_id'), - '=', - new TableField($joberrors, '_id') - ) - ); - foreach ($this->fieldDefs as $sfield => $sdata) { // TODO work out a better way to have metrics have multiple // meta-types (ie cpu user is an analytic as well as a metric). if ($sfield == "cpu_user") { - $this->addFieldWithError(new TableField($dataTable, $sfield), $sfield, $joberrors); - $this->documentation[$sfield] = $sdata; + $this->addFieldByDefinition($sdata); + $this->addErrorField($sfield, 'je'); + } elseif ($sdata['dtype'] == 'analysis') { + $this->addFieldByDefinition($sdata); } } - $this->addFieldWithError( - new FormulaField("(1.0 - (1.0 / (1.0 + 1000.0 * jf.catastrophe)))", "homogeneity"), - 'catastrophe', - $joberrors, - 'homogeneity_error' - ); - $this->documentation['homogeneity'] = array( - 'name'=> 'Homogeneity', - 'units' => 'ratio', - 'per' => 'job', - 'visibility' => 'public', - 'documentation' => 'A measure of how uniform the L1D load rate is over the lifetime of the job. 
- Jobs with a low homogeneity value (~0) should be investigated to check if there - has been a catastrophic failure during the job', - 'batchExport' => true, - 'dtype' => 'analysis' - ); - - $this->addFieldWithError( - new FormulaField('(1.0 - (jf.cpu_user_imbalance/100.0))', 'cpu_user_balance'), - 'cpu_user_imbalance', - $joberrors, - 'cpu_user_balance_error' - ); - $this->documentation['cpu_user_balance'] = array( - 'name'=> 'CPU User Balance', - 'units' => 'ratio', - 'per' => 'job', - 'visibility' => 'public', - 'documentation' => 'A measure of how uniform the CPU usage is between the cores that the job was - assigned. A value of CPU User Balance near 1 corresponds to a job with evenly - loaded CPUs. A value near 0 corresponds to a job with one or more CPU cores - with much lower utilization that the others.', - 'batchExport' => true, - 'dtype' => 'analysis' - ); - - $this->addFieldWithError( - new FormulaField('(1.0 - 1.0/POW(2-jf.max_memory, 5))', 'mem_coefficient'), - 'max_memory', - $joberrors, - 'mem_coefficient_error' - ); - $this->documentation['mem_coefficient'] = array( - 'name'=> 'Memory Headroom', - 'units' => 'ratio', - 'per' => 'job', - 'visibility' => 'public', - 'documentation' => 'A measure of the peak compute-node memory usage for the job. A value of 0 corresponds - to a job which used all of the available memory and 1 corresponds to a job with low memory usage. 
- The value is computed as 1 - 1 / (2 - m)^5, where m is the ratio of memory used to memory available for - the compute node that had the highest memory usage.', - 'batchExport' => true, - 'dtype' => 'analysis' - ); - - $this->addFieldWithError( - new FormulaField("LEAST(jf.wall_time / jf.requested_wall_time, 1)", "wall_accuracy"), - 'requested_wall_time', - $joberrors, - 'requested_wall_time_error' - ); - $this->documentation['wall_accuracy'] = array( - 'name'=> 'Walltime Accuracy', - 'units' => 'ratio', - 'per' => 'job', - 'visibility' => 'public', - 'documentation' => 'The ratio of actual wall time to requested wall time. A value near 1 indicates that - the requested wall time close to the actual wall time. A good wall time accuracy improves - system wide scheduling.', - 'batchExport' => true, - 'dtype' => 'analysis' - ); } } diff --git a/docs/customization.md b/docs/customization.md index 3661489f8..6ecdd97d6 100644 --- a/docs/customization.md +++ b/docs/customization.md @@ -4,20 +4,20 @@ title: Customization This document describes some advanced customizations for the Job Performance module. -**The automated upgade scripts do not have any support for preserving +**The automated upgrade scripts do not have any support for preserving customizations. Any changes made to the underlying Open XDMoD source code -will likely be overwitten when the software us upgraded.** +will likely be overwritten when the software is upgraded.** ## Job Analytics -The Job analytics panel shows selected job performance metrics in color +The job analytics panel shows selected job performance metrics in color coded plots across the top of the job tab in the Job Viewer. The value of each metric in the panel is normalized so a value near 1 means a favourable value and a value near 0 indicates an unfavourable value. -There are five default analytics. These are the CPU usage, -CPU Balance, Walltime Accuracy, Memory Efficiency and Homogeneity, see Figure 1 -below. 
If the CPU usage metric is unavailable then the analytics toolbar is not displayed. +There are five default analytics. These are the CPU Usage, +Homogeneity, CPU Balance, Memory Efficiency, and Walltime Accuracy (see Figure 1 +below). If the CPU Usage metric is unavailable then the analytics toolbar is not displayed. If any of the other metrics are unavailable then an error message is displayed.
@@ -27,12 +27,12 @@ If any of the other metrics are unavailable then an error message is displayed. A common reason why an analytic is unavailable is that the underlying data was not collected when the job was running. For example, the homogeneity analytic uses the L1D load count and -CPU clock tick counter hardware counter data. If the hardware counter data was not configured +CPU clock tick counter hardware counter data. If the hardware counter data were not configured to be collected or the hardware does not support a L1D load counter then the homogeneity metric will be unavailable. An example of the display in this case is shown in Figure 2.
-Screenshot of showing a performance metric from the analytics toolbar where the performance datum is unavailable. The metric display shows an exclaimation mark icon with the text 'Metric Missing: Not Available On The Compute Nodes +Screenshot showing a performance metric from the analytics toolbar where the performance datum is unavailable. The metric display shows an exclamation mark icon with the text 'Metric Missing: Not Available On The Compute Nodes'
Figure 2. Example analytics metric display when the datum is unavailable.
@@ -44,26 +44,34 @@ hardware support), then the Open XDMoD instance can be customized to never show **These instructions only apply to Open XDMoD {{ page.sw_version }}. For other versions please refer to the documentation for that release.** -To remove an analytic, you need to edit `/usr/share/xdmod/classes/DataWarehouse/Query/SUPREMM/JobDataset.php` -and remove the code associated with the analytic. For example, to remove the homogeneity -analytic you would remove (or comment out) lines 330-346. I.e. the function call to `addFieldWithError` and the -update to the documentation object. The lines to remove are shown below. -```php -330 $this->addFieldWithError( -331 new FormulaField("(1.0 - (1.0 / (1.0 + 1000.0 * jf.catastrophe)))", "homogeneity"), -332 'catastrophe', -333 $joberrors, -334 'homogeneity_error' -335 ); -336 $this->documentation['homogeneity'] = array( -337 'name'=> 'Homogeneity', -338 'units' => 'ratio', -339 'per' => 'job', -340 'visibility' => 'public', -341 'documentation' => 'A measure of how uniform the L1D load rate is over the lifetime of the job. -342 Jobs with a low homogeneity value (~0) should be investigated to check if there -343 has been a catastrophic failure during the job', -344 'batchExport' => true, -345 'dtype' => 'analysis' -346 ); +To remove an analytic, you need to edit `/usr/share/xdmod/etl/js/config/supremm/etl.schema.js` +and remove the code associated with the analytic. For example, to remove the homogeneity +analytic you would remove (or comment out) lines 2716–2733. The lines to remove are shown below. +```js +2716 homogeneity: { +2717 name: 'Homogeneity', +2718 formula: '(1.0 - (1.0 / (1.0 + 1000.0 * jf.catastrophe)))', +2719 withError: { +2720 name: 'homogeneity_error', +2721 column: 'catastrophe', +2722 tableAlias: 'je' +2723 }, +2724 unit: 'ratio', +2725 per: 'job', +2726 visibility: 'public', +2727 comments: 'A measure of how uniform the L1D load rate is over the lifetime of the job. 
' + +2728 'Jobs with a low homogeneity value (~0) should be investigated to check if there ' + +2729 'has been a catastrophic failure during the job', +2730 batchExport: true, +2731 dtype: 'analysis', +2732 group: 'Other' +2733 }, ``` + +After editing the file, run: +``` +# node /usr/share/xdmod/etl/js/etl.cli.js -i +``` + +To change the order in which the analytics appear in the toolbar, edit the +`metricOrder` variable in `/usr/share/xdmod/html/gui/js/modules/job_viewer/JobPanel.js`. diff --git a/etl/js/config/supremm/etl.schema.js b/etl/js/config/supremm/etl.schema.js index 457f7059d..d107b7c03 100644 --- a/etl/js/config/supremm/etl.schema.js +++ b/etl/js/config/supremm/etl.schema.js @@ -1080,7 +1080,7 @@ module.exports = { catastrophe: { unit: "ratio", type: "double", - dtype: "analysis", + dtype: 'ignore', nullable: true, def: null, batchExport: true, @@ -1323,7 +1323,7 @@ module.exports = { cpu_user_imbalance: { unit: "%", type: "double", - dtype: "analysis", + dtype: 'ignore', nullable: true, def: null, batchExport: true, @@ -2681,6 +2681,19 @@ module.exports = { // Include columns from this table in the raw statistics configuration. table: 'modw_supremm.job', + tables: [ + { + schema: 'modw_supremm', + name: 'job_errors', + alias: 'je', + join: { + primaryKey: '_id', + foreignTableAlias: 'jf', + foreignKey: '_id' + } + } + ], + // Fields not already defined as part of the ETL schema. fields: { timezone: { @@ -2698,6 +2711,81 @@ module.exports = { foreignKey: 'resource_id', column: 'timezone' } + }, + // Note that the code below is referenced in docs/customization.md. + homogeneity: { + name: 'Homogeneity', + formula: '(1.0 - (1.0 / (1.0 + 1000.0 * jf.catastrophe)))', + withError: { + name: 'homogeneity_error', + column: 'catastrophe', + tableAlias: 'je' + }, + unit: 'ratio', + per: 'job', + visibility: 'public', + comments: 'A measure of how uniform the L1D load rate is over the lifetime of the job. 
' + + 'Jobs with a low homogeneity value (~0) should be investigated to check if there ' + + 'has been a catastrophic failure during the job', + batchExport: true, + dtype: 'analysis', + group: 'Other' + }, + cpu_user_balance: { + name: 'CPU User Balance', + formula: '(1.0 - (jf.cpu_user_imbalance/100.0))', + withError: { + name: 'cpu_user_balance_error', + column: 'cpu_user_imbalance', + tableAlias: 'je' + }, + unit: 'ratio', + per: 'job', + visibility: 'public', + comments: 'A measure of how uniform the CPU usage is between the cores that the job was ' + + 'assigned. A value of CPU User Balance near 1 corresponds to a job with evenly ' + + 'loaded CPUs. A value near 0 corresponds to a job with one or more CPU cores ' + + 'with much lower utilization that the others.', + batchExport: true, + dtype: 'analysis', + group: 'Other' + }, + mem_coefficient: { + name: 'Memory Headroom', + formula: '(1.0 - 1.0/POW(2-jf.max_memory, 5))', + withError: { + name: 'mem_coefficient_error', + column: 'max_memory', + tableAlias: 'je' + }, + unit: 'ratio', + per: 'job', + visibility: 'public', + comments: 'A measure of the peak compute-node memory usage for the job. A value of 0 corresponds ' + + 'to a job which used all of the available memory and 1 corresponds to a job with low memory usage. ' + + 'The value is computed as 1 - 1 / (2 - m)^5, where m is the ratio of memory used to memory available for ' + + 'the compute node that had the highest memory usage.', + batchExport: true, + dtype: 'analysis', + group: 'Other' + }, + wall_accuracy: { + name: 'Walltime Accuracy', + formula: 'LEAST(jf.wall_time / jf.requested_wall_time, 1)', + withError: { + name: 'requested_wall_time_error', + column: 'requested_wall_time', + tableAlias: 'je' + }, + unit: 'ratio', + per: 'job', + visibility: 'public', + comments: 'The ratio of actual wall time to requested wall time. A value near 1 indicates that ' + + 'the requested wall time close to the actual wall time. 
A good wall time accuracy improves ' + + 'system wide scheduling.', + batchExport: true, + dtype: 'analysis', + group: 'Other' } } }