From 107d52ae8f632b5013978eb650b3b0d6513f59ff Mon Sep 17 00:00:00 2001
From: Aaron Weeden <aaronwee@buffalo.edu>
Date: Thu, 21 Dec 2023 11:08:38 -0500
Subject: [PATCH] Refactor job data set analysis fields.

Primary author was @jtpalmer

Co-Authored-By: Jeffrey T. Palmer <328284+jtpalmer@users.noreply.github.com>
---
 .../Query/SUPREMM/JobDataset.php              | 155 ++++++------------
 docs/customization.md                         |  67 ++++----
 etl/js/config/supremm/etl.schema.js           |  92 ++++++++++-
 3 files changed, 179 insertions(+), 135 deletions(-)

diff --git a/classes/DataWarehouse/Query/SUPREMM/JobDataset.php b/classes/DataWarehouse/Query/SUPREMM/JobDataset.php
index 6fe15342b..acd17ac85 100644
--- a/classes/DataWarehouse/Query/SUPREMM/JobDataset.php
+++ b/classes/DataWarehouse/Query/SUPREMM/JobDataset.php
@@ -180,17 +180,45 @@ public function __construct(
      */
     private function addFieldByDefinition(array $fieldDef)
     {
-        $tableAlias = $fieldDef['tableAlias'];
         $table = null;
-        if (array_key_exists($tableAlias, $this->tables)) {
-            $table = $this->tables[$tableAlias];
-        } elseif (array_key_exists($tableAlias, $this->tableDefs)) {
-            $table = $this->addTableByDefinition($this->tableDefs[$tableAlias]);
+        if (array_key_exists('tableAlias', $fieldDef)) {
+            $tableAlias = $fieldDef['tableAlias'];
+            if (array_key_exists($tableAlias, $this->tables)) {
+                $table = $this->tables[$tableAlias];
+            } elseif (array_key_exists($tableAlias, $this->tableDefs)) {
+                $table = $this->addTableByDefinition($this->tableDefs[$tableAlias]);
+            } else {
+                throw new \Exception(sprintf('Unrecognized table alias "%s"', $tableAlias));
+            }
+        }
+
+        if (!array_key_exists('alias', $fieldDef)) {
+            throw new \Exception(sprintf('Missing alias for definition: %s', json_encode($fieldDef)));
+        }
+        $alias = $fieldDef['alias'];
+
+        if ($table !== null && array_key_exists('column', $fieldDef)) {
+            $this->addField(new TableField($table, $fieldDef['column'], $alias));
+        } elseif (array_key_exists('formula', $fieldDef)) {
+            $this->addField(new FormulaField($fieldDef['formula'], $alias));
         } else {
-            throw new \Exception(sprintf('Unrecognized table alias "%s"', $tableAlias));
+            throw new \Exception(sprintf(
+                'Missing tableAlias and column or formula for "%s", definition: %s',
+                $alias,
+                json_encode($fieldDef)
+            ));
+        }
+
+        $this->documentation[$alias] = $fieldDef;
+
+        if (array_key_exists('withError', $fieldDef)) {
+            $errorDef = $fieldDef['withError'];
+            $this->addErrorField(
+                $errorDef['column'],
+                $errorDef['tableAlias'],
+                array_key_exists('name', $errorDef) ? $errorDef['name'] : null
+            );
         }
-        $this->addField(new TableField($table, $fieldDef['column'], $fieldDef['alias']));
-        $this->documentation[$fieldDef['alias']] = $fieldDef;
     }
 
     /**
@@ -228,21 +256,25 @@ private function joinTo($othertable, $joinkey, $otherkey, $colalias, $idcol = "i
     }
 
     /**
-     * Add a field and the corresponding error field to the query.
+     * Add an error field to the query.
      *
-     * @param Field  $field      The field to add to the query.
      * @param string $fieldName  The name of the field.
-     * @param Table  $errorTable The error table.
+     * @param string $errorTableAlias The error table alias (a key of $this->tables or $this->tableDefs).
      * @param string $errorName  The name of the error field, if null then the field name is
      *                           autogenerated based on the fieldName.
-     *
-     * @return null
      */
-    private function addFieldWithError($field, $fieldName, $errorTable, $errorName = null)
+    private function addErrorField($fieldName, $errorTableAlias, $errorName = null)
     {
-        static $errorTableIdx = 0;
+        $errorTable = null;
+        if (array_key_exists($errorTableAlias, $this->tables)) {
+            $errorTable = $this->tables[$errorTableAlias];
+        } elseif (array_key_exists($errorTableAlias, $this->tableDefs)) {
+            $errorTable = $this->addTableByDefinition($this->tableDefs[$errorTableAlias]);
+        } else {
+            throw new \Exception(sprintf('Unrecognized table alias "%s"', $errorTableAlias));
+        }
 
-        $this->addField($field);
+        static $errorTableIdx = 0;
 
         $errordesc = new Table(
             new Schema('modw'),
@@ -307,98 +339,15 @@ private function addMetricsFields()
 
     private function addAnalyticsFields()
     {
-        $dataTable = $this->getDataTable();
-        $joberrors = new Table(new Schema('modw_supremm'), 'job_errors', 'je');
-        $this->addTable($joberrors);
-
-        $this->addWhereCondition(
-            new WhereCondition(
-                new TableField($dataTable, '_id'),
-                '=',
-                new TableField($joberrors, '_id')
-            )
-        );
-
         foreach ($this->fieldDefs as $sfield => $sdata) {
             // TODO work out a better way to have metrics have multiple
             // meta-types (ie cpu user is an analytic as well as a metric).
             if ($sfield == "cpu_user") {
-                $this->addFieldWithError(new TableField($dataTable, $sfield), $sfield, $joberrors);
-                $this->documentation[$sfield] = $sdata;
+                $this->addFieldByDefinition($sdata);
+                $this->addErrorField($sfield, 'je');
+            } elseif ($sdata['dtype'] == 'analysis') {
+                $this->addFieldByDefinition($sdata);
             }
         }
-        $this->addFieldWithError(
-            new FormulaField("(1.0 - (1.0 / (1.0 + 1000.0 * jf.catastrophe)))", "homogeneity"),
-            'catastrophe',
-            $joberrors,
-            'homogeneity_error'
-        );
-        $this->documentation['homogeneity'] = array(
-            'name'=> 'Homogeneity',
-            'units' => 'ratio',
-            'per' => 'job',
-            'visibility' => 'public',
-            'documentation' => 'A measure of how uniform the L1D load rate is over the lifetime of the job.
-                                Jobs with a low homogeneity value (~0) should be investigated to check if there
-                                has been a catastrophic failure during the job',
-            'batchExport' => true,
-            'dtype' => 'analysis'
-        );
-
-        $this->addFieldWithError(
-            new FormulaField('(1.0 - (jf.cpu_user_imbalance/100.0))', 'cpu_user_balance'),
-            'cpu_user_imbalance',
-            $joberrors,
-            'cpu_user_balance_error'
-        );
-        $this->documentation['cpu_user_balance'] = array(
-            'name'=> 'CPU User Balance',
-            'units' => 'ratio',
-            'per' => 'job',
-            'visibility' => 'public',
-            'documentation' => 'A measure of how uniform the CPU usage is between the cores that the job was
-                                assigned. A value of CPU User Balance near 1 corresponds to a job with evenly
-                                loaded CPUs. A value near 0 corresponds to a job with one or more CPU cores
-                                with much lower utilization that the others.',
-            'batchExport' => true,
-            'dtype' => 'analysis'
-        );
-
-        $this->addFieldWithError(
-            new FormulaField('(1.0 - 1.0/POW(2-jf.max_memory, 5))', 'mem_coefficient'),
-            'max_memory',
-            $joberrors,
-            'mem_coefficient_error'
-        );
-        $this->documentation['mem_coefficient'] = array(
-            'name'=> 'Memory Headroom',
-            'units' => 'ratio',
-            'per' => 'job',
-            'visibility' => 'public',
-            'documentation' => 'A measure of the peak compute-node memory usage for the job. A value of 0 corresponds
-            to a job which used all of the available memory and 1 corresponds to a job with low memory usage.
-            The value is computed as 1 - 1 / (2 - m)^5, where m is the ratio of memory used to memory available for
-            the compute node that had the highest memory usage.',
-            'batchExport' => true,
-            'dtype' => 'analysis'
-        );
-
-        $this->addFieldWithError(
-            new FormulaField("LEAST(jf.wall_time / jf.requested_wall_time, 1)", "wall_accuracy"),
-            'requested_wall_time',
-            $joberrors,
-            'requested_wall_time_error'
-        );
-        $this->documentation['wall_accuracy'] = array(
-            'name'=> 'Walltime Accuracy',
-            'units' => 'ratio',
-            'per' => 'job',
-            'visibility' => 'public',
-            'documentation' => 'The ratio of actual wall time to requested wall time. A value near 1 indicates that
-                               the requested wall time close to the actual wall time. A good wall time accuracy improves
-                               system wide scheduling.',
-            'batchExport' => true,
-            'dtype' => 'analysis'
-        );
     }
 }
diff --git a/docs/customization.md b/docs/customization.md
index 3661489f8..6ecdd97d6 100644
--- a/docs/customization.md
+++ b/docs/customization.md
@@ -4,20 +4,20 @@ title: Customization
 
 This document describes some advanced customizations for the Job Performance module.
 
-**The automated upgade scripts do not have any support for preserving
+**The automated upgrade scripts do not have any support for preserving
 customizations. Any changes made to the underlying Open XDMoD source code
-will likely be overwitten when the software us upgraded.**
+will likely be overwritten when the software is upgraded.**
 
 ## Job Analytics
 
-The Job analytics panel shows selected job performance metrics in color
+The job analytics panel shows selected job performance metrics in color
 coded plots across the top of the job tab in the Job Viewer. The value of
 each metric in the panel is normalized so a value near 1 means a favourable
 value and a value near 0 indicates an unfavourable value.
 
-There are five default analytics. These are the CPU usage,
-CPU Balance, Walltime Accuracy, Memory Efficiency and Homogeneity, see Figure 1
-below. If the CPU usage metric is unavailable then the analytics toolbar is not displayed.
+There are five default analytics: CPU Usage, Homogeneity, CPU Balance,
+Memory Efficiency, and Walltime Accuracy (see Figure 1 below).
+If the CPU Usage metric is unavailable then the analytics toolbar is not displayed.
 If any of the other metrics are unavailable then an error message is displayed.
 
 <figure>
@@ -27,12 +27,12 @@ If any of the other metrics are unavailable then an error message is displayed.
 
 A common reason why an analytic is unavailable is that the underlying data was not collected
 when the job was running. For example, the homogeneity analytic uses the L1D load count and
-CPU clock tick counter hardware counter data. If the hardware counter data was not configured
+CPU clock tick counter hardware counter data. If the hardware counter data were not configured
 to be collected or the hardware does not support a L1D load counter then the homogeneity
 metric will be unavailable. An example of the display in this case is shown in Figure 2.
 
 <figure>
-<img src="{{ site.baseurl }}/assets/images/analytics_unavailable.png" alt="Screenshot of showing a performance metric from the analytics toolbar where the performance datum is unavailable. The metric display shows an exclaimation mark icon with the text 'Metric Missing: Not Available On The Compute Nodes" />
+<img src="{{ site.baseurl }}/assets/images/analytics_unavailable.png" alt="Screenshot showing a performance metric from the analytics toolbar where the performance datum is unavailable. The metric display shows an exclamation mark icon with the text 'Metric Missing: Not Available On The Compute Nodes'" />
 <figcaption>Figure 2. Example analytics metric display when the datum is unavailable.</figcaption>
 </figure>
 
@@ -44,26 +44,33 @@ hardware support), then the Open XDMoD instance can be customized to never show
 **These instructions only apply to Open XDMoD {{ page.sw_version }}. For other
 versions please refer to the documentation for that release.**
 
-To remove an analytic, you need to edit `/usr/share/xdmod/classes/DataWarehouse/Query/SUPREMM/JobDataset.php`
-and remove the code associated with the analytic. For example, to remove the homogeneity
-analytic you would remove (or comment out) lines 330-346. I.e. the function call to `addFieldWithError` and the
-update to the documentation object. The lines to remove are shown below.
-```php
-330         $this->addFieldWithError(
-331             new FormulaField("(1.0 - (1.0 / (1.0 + 1000.0 * jf.catastrophe)))", "homogeneity"),
-332             'catastrophe',
-333             $joberrors,
-334             'homogeneity_error'
-335         );
-336         $this->documentation['homogeneity'] = array(
-337             'name'=> 'Homogeneity',
-338             'units' => 'ratio',
-339             'per' => 'job',
-340             'visibility' => 'public',
-341             'documentation' => 'A measure of how uniform the L1D load rate is over the lifetime of the job.
-342                                 Jobs with a low homogeneity value (~0) should be investigated to check if there
-343                                 has been a catastrophic failure during the job',
-344             'batchExport' => true,
-345             'dtype' => 'analysis'
-346         );
+To remove an analytic, you need to edit `/usr/share/xdmod/etl/js/config/supremm/etl.schema.js` and remove the code associated
+with the analytic. For example, to remove the homogeneity analytic you would remove (or comment out) lines 2716–2733. The lines to remove are shown below.
+```js
+2716             homogeneity: {
+2717                 name: 'Homogeneity',
+2718                 formula: '(1.0 - (1.0 / (1.0 + 1000.0 * jf.catastrophe)))',
+2719                 withError: {
+2720                     name: 'homogeneity_error',
+2721                     column: 'catastrophe',
+2722                     tableAlias: 'je'
+2723                 },
+2724                 unit: 'ratio',
+2725                 per: 'job',
+2726                 visibility: 'public',
+2727                 comments: 'A measure of how uniform the L1D load rate is over the lifetime of the job. '
+2728                     + 'Jobs with a low homogeneity value (~0) should be investigated to check if there '
+2729                     + 'has been a catastrophic failure during the job',
+2730                 batchExport: true,
+2731                 dtype: 'analysis',
+2732                 group: 'Other'
+2733             },
 ```
+
+After editing the file, run:
+```
+# node /usr/share/xdmod/etl/js/etl.cli.js -i
+```
+
+To change the order in which the analytics appear in the toolbar, edit the
+`metricOrder` variable in `/usr/share/xdmod/html/gui/js/modules/job_viewer/JobPanel.js`.
diff --git a/etl/js/config/supremm/etl.schema.js b/etl/js/config/supremm/etl.schema.js
index 457f7059d..d107b7c03 100644
--- a/etl/js/config/supremm/etl.schema.js
+++ b/etl/js/config/supremm/etl.schema.js
@@ -1080,7 +1080,7 @@ module.exports = {
         catastrophe: {
             unit: "ratio",
             type: "double",
-            dtype: "analysis",
+            dtype: 'ignore',
             nullable: true,
             def: null,
             batchExport: true,
@@ -1323,7 +1323,7 @@ module.exports = {
         cpu_user_imbalance: {
             unit: "%",
             type: "double",
-            dtype: "analysis",
+            dtype: 'ignore',
             nullable: true,
             def: null,
             batchExport: true,
@@ -2681,6 +2681,19 @@ module.exports = {
         // Include columns from this table in the raw statistics configuration.
         table: 'modw_supremm.job',
 
+        tables: [
+            {
+                schema: 'modw_supremm',
+                name: 'job_errors',
+                alias: 'je',
+                join: {
+                    primaryKey: '_id',
+                    foreignTableAlias: 'jf',
+                    foreignKey: '_id'
+                }
+            }
+        ],
+
         // Fields not already defined as part of the ETL schema.
         fields: {
             timezone: {
@@ -2698,6 +2711,81 @@ module.exports = {
                     foreignKey: 'resource_id',
                     column: 'timezone'
                 }
+            },
+            // Note: docs/customization.md quotes the homogeneity entry below with its line numbers; update that document if these lines move.
+            homogeneity: {
+                name: 'Homogeneity',
+                formula: '(1.0 - (1.0 / (1.0 + 1000.0 * jf.catastrophe)))',
+                withError: {
+                    name: 'homogeneity_error',
+                    column: 'catastrophe',
+                    tableAlias: 'je'
+                },
+                unit: 'ratio',
+                per: 'job',
+                visibility: 'public',
+                comments: 'A measure of how uniform the L1D load rate is over the lifetime of the job. '
+                    + 'Jobs with a low homogeneity value (~0) should be investigated to check if there '
+                    + 'has been a catastrophic failure during the job',
+                batchExport: true,
+                dtype: 'analysis',
+                group: 'Other'
+            },
+            cpu_user_balance: {
+                name: 'CPU User Balance',
+                formula: '(1.0 - (jf.cpu_user_imbalance/100.0))',
+                withError: {
+                    name: 'cpu_user_balance_error',
+                    column: 'cpu_user_imbalance',
+                    tableAlias: 'je'
+                },
+                unit: 'ratio',
+                per: 'job',
+                visibility: 'public',
+                comments: 'A measure of how uniform the CPU usage is between the cores that the job was '
+                    + 'assigned. A value of CPU User Balance near 1 corresponds to a job with evenly '
+                    + 'loaded CPUs. A value near 0 corresponds to a job with one or more CPU cores '
+                    + 'with much lower utilization that the others.',
+                batchExport: true,
+                dtype: 'analysis',
+                group: 'Other'
+            },
+            mem_coefficient: {
+                name: 'Memory Headroom',
+                formula: '(1.0 - 1.0/POW(2-jf.max_memory, 5))',
+                withError: {
+                    name: 'mem_coefficient_error',
+                    column: 'max_memory',
+                    tableAlias: 'je'
+                },
+                unit: 'ratio',
+                per: 'job',
+                visibility: 'public',
+                comments: 'A measure of the peak compute-node memory usage for the job. A value of 0 corresponds '
+                    + 'to a job which used all of the available memory and 1 corresponds to a job with low memory usage. '
+                    + 'The value is computed as 1 - 1 / (2 - m)^5, where m is the ratio of memory used to memory available for '
+                    + 'the compute node that had the highest memory usage.',
+                batchExport: true,
+                dtype: 'analysis',
+                group: 'Other'
+            },
+            wall_accuracy: {
+                name: 'Walltime Accuracy',
+                formula: 'LEAST(jf.wall_time / jf.requested_wall_time, 1)',
+                withError: {
+                    name: 'requested_wall_time_error',
+                    column: 'requested_wall_time',
+                    tableAlias: 'je'
+                },
+                unit: 'ratio',
+                per: 'job',
+                visibility: 'public',
+                comments: 'The ratio of actual wall time to requested wall time. A value near 1 indicates that '
+                    + 'the requested wall time close to the actual wall time. A good wall time accuracy improves '
+                    + 'system wide scheduling.',
+                batchExport: true,
+                dtype: 'analysis',
+                group: 'Other'
             }
         }
     }