From ffe21373c777b47a326269501d64578ef71b9ecf Mon Sep 17 00:00:00 2001 From: "zrq-github@metagrid.co.uk" Date: Thu, 11 Feb 2021 16:34:57 +0000 Subject: [PATCH 01/27] Removed gateway node --- .../hadoop-yarn/ansible/03-create-masters.yml | 4 +-- .../hadoop-yarn/ansible/04-create-workers.yml | 4 +-- .../hadoop-yarn/ansible/05-config-ssh.yml | 6 ++--- .../hadoop-yarn/ansible/06-config-dns.yml | 26 +++---------------- .../hadoop-yarn/ansible/07-host-keys.yml | 4 +-- .../hadoop-yarn/ansible/08-ping-test.yml | 2 +- .../ansible/25-create-zeppelin.yml | 15 ++++++++++- .../hadoop-yarn/ansible/51-cephfs-mount.yml | 2 +- .../hadoop-yarn/ansible/combined-01.yml | 1 - .../hadoop-yarn/ansible/create-all.yml | 3 +-- experiments/hadoop-yarn/ansible/hosts.yml | 4 --- .../ansible/templates/dns-hosts.j2 | 8 ++---- .../ansible/templates/hadoop-workers.j2 | 21 +++++++++++++++ .../ansible/templates/ssh-ansible.j2 | 19 ++++---------- 14 files changed, 58 insertions(+), 61 deletions(-) diff --git a/experiments/hadoop-yarn/ansible/03-create-masters.yml b/experiments/hadoop-yarn/ansible/03-create-masters.yml index facb7fb0..8c621019 100644 --- a/experiments/hadoop-yarn/ansible/03-create-masters.yml +++ b/experiments/hadoop-yarn/ansible/03-create-masters.yml @@ -35,7 +35,7 @@ register: mastersec - - name: "Add a rule to allow SSH from our gateway" + - name: "Add a rule to allow SSH from zeppelin" os_security_group_rule: cloud: "{{ cloudname }}" state: present @@ -44,7 +44,7 @@ protocol: 'tcp' port_range_min: 22 port_range_max: 22 - remote_group: "{{ security['gateway'] }}" + remote_group: "{{ security['zeppelin'] }}" - name: "Create our masters" os_server: diff --git a/experiments/hadoop-yarn/ansible/04-create-workers.yml b/experiments/hadoop-yarn/ansible/04-create-workers.yml index eeb43bb6..e6dc0cda 100644 --- a/experiments/hadoop-yarn/ansible/04-create-workers.yml +++ b/experiments/hadoop-yarn/ansible/04-create-workers.yml @@ -35,7 +35,7 @@ register: secgroup - - name: "Add a rule to allow ssh from the gateway" + - name: "Add a rule to allow ssh from zeppelin" os_security_group_rule: cloud: "{{ cloudname }}" state: present @@ -44,7 +44,7 @@ protocol: 'tcp' port_range_min: 22 port_range_max: 22 - remote_group: "{{ security['gateway'] }}" + remote_group: "{{ security['zeppelin'] }}" - name: "Create our workers" os_server: diff --git a/experiments/hadoop-yarn/ansible/05-config-ssh.yml b/experiments/hadoop-yarn/ansible/05-config-ssh.yml index 4f60131e..72a42edc 100644 --- a/experiments/hadoop-yarn/ansible/05-config-ssh.yml +++ b/experiments/hadoop-yarn/ansible/05-config-ssh.yml @@ -35,12 +35,12 @@ mode: 'u=rwx,g=rx,o=rx' state: directory - - name: "Discover our gateway nodes" + - name: "Discover our zeppelin node" os_server_info: cloud: "{{ cloudname }}" - server: "{{ deployname }}-gateway" + server: "{{ deployname }}-zeppelin" register: - gatewaynodes + zeppelinnodes - name: "Generate Ansible SSH config" template: diff --git a/experiments/hadoop-yarn/ansible/06-config-dns.yml b/experiments/hadoop-yarn/ansible/06-config-dns.yml index 1e20d587..0ec41589 100644 --- a/experiments/hadoop-yarn/ansible/06-config-dns.yml +++ b/experiments/hadoop-yarn/ansible/06-config-dns.yml @@ -26,12 +26,12 @@ - /tmp/ansible-vars.yml tasks: - - name: "Discover our gateway nodes" + - name: "Discover our Zeppelin node" os_server_info: cloud: "{{ cloudname }}" - server: "{{ deployname }}-gateway*" + server: "{{ deployname }}-zeppelin" register: - gatewaynodes + zeppelinnode - name: "Discover our master nodes" os_server_info: @@ -47,34 +47,16 
@@ register: workernodes - - name: "Discover our Zeppelin nodes" - os_server_info: - cloud: "{{ cloudname }}" - server: "{{ deployname }}-zeppelin" - register: - zeppelinnode - - name: "Generate our DNS hosts file" template: src: 'templates/dns-hosts.j2' dest: "/tmp/aglais-dns-hosts" -- hosts: gateway - gather_facts: false - tasks: - - name: "Deploy [/etc/hosts] to our gateway" - become: true - copy: - src: /tmp/aglais-dns-hosts - dest: /etc/hosts - owner: root - group: root - mode: u=rw,g=r,o=r - hosts: zeppelin gather_facts: false tasks: - - name: "Deploy [/etc/hosts] to our Zeppelin" + - name: "Deploy [/etc/hosts] to our Zeppelin node" become: true copy: src: /tmp/aglais-dns-hosts diff --git a/experiments/hadoop-yarn/ansible/07-host-keys.yml b/experiments/hadoop-yarn/ansible/07-host-keys.yml index ea87e946..fbfc803c 100644 --- a/experiments/hadoop-yarn/ansible/07-host-keys.yml +++ b/experiments/hadoop-yarn/ansible/07-host-keys.yml @@ -22,7 +22,7 @@ # https://everythingshouldbevirtual.com/automation/ansible-ssh-known-host-keys/ # -- hosts: gateway +- hosts: zeppelin gather_facts: false tasks: @@ -50,7 +50,7 @@ dest: "/tmp/aglais-ssh-hosts" -- hosts: gateway:masters:workers:zeppelin +- hosts: masters:workers:zeppelin gather_facts: false tasks: - name: "Deploy the known hosts file to [/etc/ssh/ssh_known_hosts]" diff --git a/experiments/hadoop-yarn/ansible/08-ping-test.yml b/experiments/hadoop-yarn/ansible/08-ping-test.yml index c07607ae..b8e8cd33 100644 --- a/experiments/hadoop-yarn/ansible/08-ping-test.yml +++ b/experiments/hadoop-yarn/ansible/08-ping-test.yml @@ -22,7 +22,7 @@ --- - name: "Ping tests" - hosts: gateway:masters:workers:zeppelin + hosts: zeppelin:masters:workers gather_facts: false tasks: diff --git a/experiments/hadoop-yarn/ansible/25-create-zeppelin.yml b/experiments/hadoop-yarn/ansible/25-create-zeppelin.yml index 9ce10005..5768e05e 100644 --- a/experiments/hadoop-yarn/ansible/25-create-zeppelin.yml +++ b/experiments/hadoop-yarn/ansible/25-create-zeppelin.yml @@ -35,17 +35,30 @@ register: zeppelinsec - - name: "Add a rule to allow ssh from the gateway" + - name: "Add a security rule for IPv4 SSH" os_security_group_rule: cloud: "{{ cloudname }}" state: present security_group: "{{ zeppelinsec.id }}" direction: 'ingress' protocol: 'tcp' + ethertype: 'IPv4' port_range_min: 22 port_range_max: 22 remote_ip_prefix: '0.0.0.0/0' + - name: "Add a security rule for IPv6 SSH" + os_security_group_rule: + cloud: "{{ cloudname }}" + state: present + security_group: "{{ zeppelinsec.id }}" + direction: 'ingress' + protocol: 'tcp' + ethertype: 'IPv6' + port_range_min: 22 + port_range_max: 22 + remote_ip_prefix: '::/0' + - name: "Add a security rule for IPv4 Port 8080" os_security_group_rule: cloud: "{{ cloudname }}" diff --git a/experiments/hadoop-yarn/ansible/51-cephfs-mount.yml b/experiments/hadoop-yarn/ansible/51-cephfs-mount.yml index 17b669f5..6704d885 100644 --- a/experiments/hadoop-yarn/ansible/51-cephfs-mount.yml +++ b/experiments/hadoop-yarn/ansible/51-cephfs-mount.yml @@ -38,7 +38,7 @@ --- - name: "Install and mount a CephFS share" - hosts: gateway:masters:workers:zeppelin + hosts: zeppelin:masters:workers gather_facts: false vars_files: - /tmp/ansible-vars.yml diff --git a/experiments/hadoop-yarn/ansible/combined-01.yml b/experiments/hadoop-yarn/ansible/combined-01.yml index 2015238a..bc7f5fc3 100644 --- a/experiments/hadoop-yarn/ansible/combined-01.yml +++ b/experiments/hadoop-yarn/ansible/combined-01.yml @@ -22,7 +22,6 @@ --- - import_playbook: 01-create-network.yml -- 
import_playbook: 02-create-gateway.yml - import_playbook: 03-create-masters.yml - import_playbook: 04-create-workers.yml - import_playbook: 25-create-zeppelin.yml diff --git a/experiments/hadoop-yarn/ansible/create-all.yml b/experiments/hadoop-yarn/ansible/create-all.yml index b021db8a..9476842c 100644 --- a/experiments/hadoop-yarn/ansible/create-all.yml +++ b/experiments/hadoop-yarn/ansible/create-all.yml @@ -22,10 +22,9 @@ --- - import_playbook: 01-create-keypair.yml - import_playbook: 01-create-network.yml -- import_playbook: 02-create-gateway.yml +- import_playbook: 25-create-zeppelin.yml - import_playbook: 03-create-masters.yml - import_playbook: 04-create-workers.yml -- import_playbook: 25-create-zeppelin.yml - import_playbook: 05-config-ssh.yml - import_playbook: 06-config-dns.yml diff --git a/experiments/hadoop-yarn/ansible/hosts.yml b/experiments/hadoop-yarn/ansible/hosts.yml index 67e1d8ad..150585eb 100644 --- a/experiments/hadoop-yarn/ansible/hosts.yml +++ b/experiments/hadoop-yarn/ansible/hosts.yml @@ -49,10 +49,6 @@ all: ansible_host_key_checking: false hosts: - gateway: - login: 'fedora' - image: 'Fedora-30-1.2' - flavor: 'general.v1.tiny' zeppelin: login: 'fedora' diff --git a/experiments/hadoop-yarn/ansible/templates/dns-hosts.j2 b/experiments/hadoop-yarn/ansible/templates/dns-hosts.j2 index 75aabc4d..b170552d 100644 --- a/experiments/hadoop-yarn/ansible/templates/dns-hosts.j2 +++ b/experiments/hadoop-yarn/ansible/templates/dns-hosts.j2 @@ -26,8 +26,8 @@ 127.0.0.1 localhost localhost.localdomain localhost4 localhost4.localdomain4 ::1 localhost localhost.localdomain localhost6 localhost6.localdomain6 -# Gateway nodes. -{% for node in gatewaynodes.openstack_servers %} +# Zeppelin nodes. +{% for node in zeppelinnode.openstack_servers %} {{ "%-15s" | format(node.private_v4,) }} {{ node.metadata.hostname }} {% endfor %} @@ -41,7 +41,3 @@ {{ "%-15s" | format(node.private_v4,) }} {{ node.metadata.hostname }} {% endfor %} -# Zeppelin nodes. -{% for node in zeppelinnode.openstack_servers %} -{{ "%-15s" | format(node.private_v4,) }} {{ node.metadata.hostname }} -{% endfor %} diff --git a/experiments/hadoop-yarn/ansible/templates/hadoop-workers.j2 b/experiments/hadoop-yarn/ansible/templates/hadoop-workers.j2 index ce03890f..66f637ef 100644 --- a/experiments/hadoop-yarn/ansible/templates/hadoop-workers.j2 +++ b/experiments/hadoop-yarn/ansible/templates/hadoop-workers.j2 @@ -1,3 +1,24 @@ +{# +# +# +# Copyright (c) 2020, ROE (http://www.roe.ac.uk/) +# +# This information is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This information is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
+# +# +#} + {% for worker in groups['workers'] %} {{ worker }} {% endfor %} diff --git a/experiments/hadoop-yarn/ansible/templates/ssh-ansible.j2 b/experiments/hadoop-yarn/ansible/templates/ssh-ansible.j2 index f995ac83..0cbdbe25 100644 --- a/experiments/hadoop-yarn/ansible/templates/ssh-ansible.j2 +++ b/experiments/hadoop-yarn/ansible/templates/ssh-ansible.j2 @@ -28,9 +28,9 @@ ServerAliveInterval 60 ServerAliveCountMax 5 # Primary gateway node. -Host gateway - User {{ hostvars['gateway'].login }} - HostName {{ gatewaynodes.openstack_servers[0].accessIPv4 }} +Host zeppelin + User {{ hostvars['zeppelin'].login }} + HostName {{ zeppelinnodes.openstack_servers[0].accessIPv4 }} ControlPath ~/.ssh/%r@%h:%p ControlMaster auto ControlPersist 5m @@ -39,7 +39,7 @@ Host gateway {% for hostname in groups['masters'] %} Host {{ hostname }} User {{ hostvars[hostname]['login'] }} - ProxyCommand ssh -W %h:%p -l {{ hostvars['gateway'].login }} -F {{ lookup('env','HOME') }}/.ssh/ansible-config gateway + ProxyCommand ssh -W %h:%p -l {{ hostvars['zeppelin'].login }} -F {{ lookup('env','HOME') }}/.ssh/ansible-config zeppelin ControlPath ~/.ssh/%r@%h:%p ControlMaster auto ControlPersist 5m @@ -49,19 +49,10 @@ Host {{ hostname }} {% for hostname in groups['workers'] %} Host {{ hostname }} User {{ hostvars[hostname]['login'] }} - ProxyCommand ssh -W %h:%p -l {{ hostvars['gateway'].login }} -F {{ lookup('env','HOME') }}/.ssh/ansible-config gateway + ProxyCommand ssh -W %h:%p -l {{ hostvars['zeppelin'].login }} -F {{ lookup('env','HOME') }}/.ssh/ansible-config zeppelin ControlPath ~/.ssh/%r@%h:%p ControlMaster auto ControlPersist 5m {% endfor %} - -# Zeppelin node -Host zeppelin - User {{ hostvars['zeppelin']['login'] }} - ProxyCommand ssh -W %h:%p -l {{ hostvars['zeppelin'].login }} -F {{ lookup('env','HOME') }}/.ssh/ansible-config gateway - ControlPath ~/.ssh/%r@%h:%p - ControlMaster auto - ControlPersist 5m - From 5f1cb10a28c9584a468cef4b61cc9a907c1e4198 Mon Sep 17 00:00:00 2001 From: "zrq-github@metagrid.co.uk" Date: Thu, 11 Feb 2021 16:44:53 +0000 Subject: [PATCH 02/27] Fix to catch all the keys created by create-all --- experiments/openstack/bin/delete-all.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/experiments/openstack/bin/delete-all.sh b/experiments/openstack/bin/delete-all.sh index 46dd5282..beb232e9 100755 --- a/experiments/openstack/bin/delete-all.sh +++ b/experiments/openstack/bin/delete-all.sh @@ -354,7 +354,7 @@ --os-cloud "${cloudname:?}" \ keypair list \ --format json \ - | jq -r '.[] | select(.Name | startswith("aglais")) | .Name' + | jq -r '.[] | select(.Name | startswith("'${cloudname:?}'")) | .Name' ) do echo "- Deleting key [${keyname:?}]" From d0906a1a31ac86d582748a52dbc87b2bab133275 Mon Sep 17 00:00:00 2001 From: "zrq-github@metagrid.co.uk" Date: Thu, 11 Feb 2021 16:45:26 +0000 Subject: [PATCH 03/27] Finish notes on cherry picking --- notes/zrq/20210206-01-git-cherry-pick.txt | 32 +++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/notes/zrq/20210206-01-git-cherry-pick.txt b/notes/zrq/20210206-01-git-cherry-pick.txt index 1421114d..00311762 100644 --- a/notes/zrq/20210206-01-git-cherry-pick.txt +++ b/notes/zrq/20210206-01-git-cherry-pick.txt @@ -788,6 +788,38 @@ git add 'notes/zrq/20210206-01-git-cherry-pick.txt' + git commit -m "Added notes on git cherry picking" + git push + + popd + + +# ----------------------------------------------------- +# Create a new working branch .... 
+#[user@desktop] + + prevbranch=${nextbranch:?} + nextbranch=$(date '+%Y%m%d')-zrq-working + + source "${HOME}/aglais.env" + pushd "${AGLAIS_CODE}" + git checkout -b "${nextbranch:?}" + + > Switched to a new branch '20210206-zrq-working' + + + git push --set-upstream 'origin' "${nextbranch:?}" + + > Total 0 (delta 0), reused 0 (delta 0), pack-reused 0 + > remote: + > remote: Create a pull request for '20210206-zrq-working' on GitHub by visiting: + > remote: https://github.com/Zarquan/aglais/pull/new/20210206-zrq-working + > remote: + > To github.com:Zarquan/aglais.git + > * [new branch] 20210206-zrq-working -> 20210206-zrq-working + > Branch '20210206-zrq-working' set up to track remote branch '20210206-zrq-working' from 'origin'. + + popd From 24324016ae2a43b97c63bcd23c2d4bce296805a9 Mon Sep 17 00:00:00 2001 From: "zrq-github@metagrid.co.uk" Date: Thu, 11 Feb 2021 16:58:44 +0000 Subject: [PATCH 04/27] Moved Hadoop, Spark and Zeppelin vars into hosts.yml --- .../hadoop-yarn/ansible/11-install-hadoop.yml | 7 --- .../ansible/12-config-hadoop-core.yml | 7 --- .../ansible/13-config-hdfs-namenode.yml | 6 --- .../ansible/14-config-hdfs-workers.yml | 7 --- .../ansible/16-config-yarn-masters.yml | 7 --- .../ansible/17-config-yarn-workers.yml | 7 --- .../hadoop-yarn/ansible/20-install-spark.yml | 8 --- .../ansible/22-config-spark-master.yml | 9 ---- .../ansible/24-install-pyspark.yml | 6 --- .../ansible/27-install-zeppelin.yml | 7 --- experiments/hadoop-yarn/ansible/hosts.yml | 50 +++++++++++++++++++ 11 files changed, 50 insertions(+), 71 deletions(-) diff --git a/experiments/hadoop-yarn/ansible/11-install-hadoop.yml b/experiments/hadoop-yarn/ansible/11-install-hadoop.yml index 427bbaa0..696d9e88 100644 --- a/experiments/hadoop-yarn/ansible/11-install-hadoop.yml +++ b/experiments/hadoop-yarn/ansible/11-install-hadoop.yml @@ -29,13 +29,6 @@ - name: "Install Hadoop" hosts: masters:workers:zeppelin gather_facts: false - vars: - hdname: "hadoop-3.1.3" - hdbase: "/opt" - hdhome: "/opt/hadoop" - hddata: "/var/local/hadoop" - hdhost: "{{groups['masters'][0]}}" - hduser: "{{hostvars[inventory_hostname].login}}" tasks: diff --git a/experiments/hadoop-yarn/ansible/12-config-hadoop-core.yml b/experiments/hadoop-yarn/ansible/12-config-hadoop-core.yml index bbbe6e71..2c4e50ad 100644 --- a/experiments/hadoop-yarn/ansible/12-config-hadoop-core.yml +++ b/experiments/hadoop-yarn/ansible/12-config-hadoop-core.yml @@ -23,13 +23,6 @@ - name: "Configure Hadoop [core-site.xml]" hosts: masters:workers:zeppelin gather_facts: false - vars: - hdname: "hadoop-3.1.3" - hdbase: "/opt" - hdhome: "/opt/hadoop" - hddata: "/var/local/hadoop" - hdhost: "{{groups['masters'][0]}}" - hduser: "{{hostvars[inventory_hostname].login}}" tasks: diff --git a/experiments/hadoop-yarn/ansible/13-config-hdfs-namenode.yml b/experiments/hadoop-yarn/ansible/13-config-hdfs-namenode.yml index caae6b5b..4b8344c4 100644 --- a/experiments/hadoop-yarn/ansible/13-config-hdfs-namenode.yml +++ b/experiments/hadoop-yarn/ansible/13-config-hdfs-namenode.yml @@ -24,12 +24,6 @@ hosts: master01:zeppelin gather_facts: false vars: - hdname: "hadoop-3.1.3" - hdbase: "/opt" - hdhome: "/opt/hadoop" - hddata: "/var/local/hadoop" - hdhost: "{{groups['masters'][0]}}" - hduser: "{{hostvars[inventory_hostname].login}}" tasks: diff --git a/experiments/hadoop-yarn/ansible/14-config-hdfs-workers.yml b/experiments/hadoop-yarn/ansible/14-config-hdfs-workers.yml index 6df8aafe..fe7341b0 100644 --- a/experiments/hadoop-yarn/ansible/14-config-hdfs-workers.yml +++ 
b/experiments/hadoop-yarn/ansible/14-config-hdfs-workers.yml @@ -23,13 +23,6 @@ - name: "Configure Hadoop workers" hosts: workers:zeppelin gather_facts: false - vars: - hdname: "hadoop-3.1.3" - hdbase: "/opt" - hdhome: "/opt/hadoop" - hddata: "/var/local/hadoop" - hdhost: "{{groups['masters'][0]}}" - hduser: "{{hostvars[inventory_hostname].login}}" tasks: diff --git a/experiments/hadoop-yarn/ansible/16-config-yarn-masters.yml b/experiments/hadoop-yarn/ansible/16-config-yarn-masters.yml index 47a32d7a..688bb1a2 100644 --- a/experiments/hadoop-yarn/ansible/16-config-yarn-masters.yml +++ b/experiments/hadoop-yarn/ansible/16-config-yarn-masters.yml @@ -23,13 +23,6 @@ - name: "Configure YARN masters" hosts: master01:zeppelin gather_facts: false - vars: - hdname: "hadoop-3.1.3" - hdbase: "/opt" - hdhome: "/opt/hadoop" - hddata: "/var/local/hadoop" - hdhost: "master01" - hduser: "{{hostvars[inventory_hostname].login}}" tasks: diff --git a/experiments/hadoop-yarn/ansible/17-config-yarn-workers.yml b/experiments/hadoop-yarn/ansible/17-config-yarn-workers.yml index d39524d4..b6e6a116 100644 --- a/experiments/hadoop-yarn/ansible/17-config-yarn-workers.yml +++ b/experiments/hadoop-yarn/ansible/17-config-yarn-workers.yml @@ -23,13 +23,6 @@ - name: "Configure YARN workers" hosts: workers:zeppelin gather_facts: false - vars: - hdname: "hadoop-3.1.3" - hdbase: "/opt" - hdhome: "/opt/hadoop" - hddata: "/var/local/hadoop" - hdhost: "{{groups['masters'][0]}}" - hduser: "{{hostvars[inventory_hostname].login}}" tasks: diff --git a/experiments/hadoop-yarn/ansible/20-install-spark.yml b/experiments/hadoop-yarn/ansible/20-install-spark.yml index 114a6717..dfefac4a 100644 --- a/experiments/hadoop-yarn/ansible/20-install-spark.yml +++ b/experiments/hadoop-yarn/ansible/20-install-spark.yml @@ -24,14 +24,6 @@ - name: "Install Spark" hosts: master01:zeppelin gather_facts: false - vars: - spname: "spark-2.4.7" - spfull: "spark-2.4.7-bin-hadoop2.7" - spbase: "/opt" - sphome: "/opt/spark" - spdata: "/var/local/spark" - sphost: "{{groups['masters'][0]}}" - spuser: "{{hostvars[inventory_hostname].login}}" tasks: diff --git a/experiments/hadoop-yarn/ansible/22-config-spark-master.yml b/experiments/hadoop-yarn/ansible/22-config-spark-master.yml index aa4ff104..66b452c4 100644 --- a/experiments/hadoop-yarn/ansible/22-config-spark-master.yml +++ b/experiments/hadoop-yarn/ansible/22-config-spark-master.yml @@ -23,15 +23,6 @@ - name: "Configure YARN masters" hosts: master01:zeppelin gather_facts: false - vars: - hdbase: "/opt" - hdhome: "/opt/hadoop" - hdhost: "{{groups['masters'][0]}}" - spbase: "/opt" - sphome: "/opt/spark" - spdata: "/var/local/spark" - sphost: "master01" - spuser: "{{hostvars[inventory_hostname].login}}" tasks: diff --git a/experiments/hadoop-yarn/ansible/24-install-pyspark.yml b/experiments/hadoop-yarn/ansible/24-install-pyspark.yml index 8ca2ef3d..75b330ac 100644 --- a/experiments/hadoop-yarn/ansible/24-install-pyspark.yml +++ b/experiments/hadoop-yarn/ansible/24-install-pyspark.yml @@ -24,12 +24,6 @@ - name: "Install PySpark" hosts: master01:zeppelin gather_facts: false - vars: - spbase: "/opt" - sphome: "/opt/spark" - spdata: "/var/local/spark" - sphost: "{{groups['masters'][0]}}" - spuser: "{{hostvars[inventory_hostname].login}}" tasks: diff --git a/experiments/hadoop-yarn/ansible/27-install-zeppelin.yml b/experiments/hadoop-yarn/ansible/27-install-zeppelin.yml index 2f091f8c..45a5eafa 100644 --- a/experiments/hadoop-yarn/ansible/27-install-zeppelin.yml +++ 
b/experiments/hadoop-yarn/ansible/27-install-zeppelin.yml @@ -24,13 +24,6 @@ hosts: zeppelin gather_facts: yes vars: - zepname: "zeppelin-0.8.2" - sphome: "/opt/spark" - hdhome: "/opt/hadoop" - zepbase: "/home/fedora" - zephome: "/home/fedora/zeppelin-0.8.2-bin-all" - zephost: "zeppelin" - zepuser: "{{hostvars[inventory_hostname].login}}" zeppelinconfig: | diff --git a/experiments/hadoop-yarn/ansible/hosts.yml b/experiments/hadoop-yarn/ansible/hosts.yml index 150585eb..8247e37a 100644 --- a/experiments/hadoop-yarn/ansible/hosts.yml +++ b/experiments/hadoop-yarn/ansible/hosts.yml @@ -48,6 +48,56 @@ all: # https://docs.ansible.com/ansible/latest/user_guide/intro_getting_started.html#host-key-checking ansible_host_key_checking: false + # Hadoop vars + + hdname: "hadoop-3.1.3" + hdbase: "/opt" + hdhome: "/opt/hadoop" + + hdconf: "{{hdhome}}/etc/hadoop" + hdhost: "master01" + hduser: "fedora" + + hddatalink: "/var/hadoop/data" + hddatadest: "/mnt/cinder/vdc/hadoop/data" + + hdlogslink: "/var/hadoop/logs" + hdlogsdest: "/mnt/cinder/vdc/hadoop/logs" + + # HDFS vars + + hdfsconf: "/var/hdfs/conf" + + hdfsmetalink: "/var/hdfs/meta" + hdfsmetadest: "/mnt/cinder/vdc/hdfs/meta" + + hdfslogslink: "/var/hdfs/logs" + hdfslogsdest: "/mnt/cinder/vdc/hdfs/logs" + + hdfsdatalink: "/var/hdfs/data" + hdfsdatadest: "/mnt/cinder/vdc/hdfs/data" + + # Spark vars + spname: "spark-2.4.7" + spfull: "spark-2.4.7-bin-hadoop2.7" + spbase: "/opt" + sphome: "/opt/spark" + sphost: "master01" + spuser: "fedora" + + sptemplink: "/var/spark/temp" + sptempdest: "/mnt/local/vdb/spark/temp" + + # Zeppelin vars + zepname: "zeppelin-0.8.2" + zepbase: "/home/fedora" + zephome: "/home/fedora/zeppelin-0.8.2-bin-all" + zephost: "zeppelin" + zepuser: "fedora" + + #zepdatalink: '/var/zeppelin/data' + #zepdatadest: "/mnt/cinder/vdc/zeppelin/data" + hosts: zeppelin: From 090ba72e62019b21d984e03be9b2ba1115049fcb Mon Sep 17 00:00:00 2001 From: "zrq-github@metagrid.co.uk" Date: Thu, 11 Feb 2021 17:09:41 +0000 Subject: [PATCH 05/27] Fix a problem with Fedora updates --- .../hadoop-yarn/ansible/04-update-fedora.yml | 50 +++++++++++ .../hadoop-yarn/ansible/09-worker-volumes.yml | 2 +- .../hadoop-yarn/ansible/combined-01.yml | 1 + .../hadoop-yarn/ansible/create-all.yml | 2 + notes/zrq/20210208-01-fedora-repo.txt | 86 +++++++++++++++++++ 5 files changed, 140 insertions(+), 1 deletion(-) create mode 100644 experiments/hadoop-yarn/ansible/04-update-fedora.yml create mode 100644 notes/zrq/20210208-01-fedora-repo.txt diff --git a/experiments/hadoop-yarn/ansible/04-update-fedora.yml b/experiments/hadoop-yarn/ansible/04-update-fedora.yml new file mode 100644 index 00000000..d5085c0e --- /dev/null +++ b/experiments/hadoop-yarn/ansible/04-update-fedora.yml @@ -0,0 +1,50 @@ +# +# +# +# Copyright (c) 2020, ROE (http://www.roe.ac.uk/) +# +# This information is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This information is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
+#
+#
+#
+#
+
+# ignore_errors
+# https://docs.ansible.com/ansible/latest/user_guide/playbooks_error_handling.html#ignoring-failed-commands
+
+- name: "DNF update"
+  gather_facts: false
+  hosts: masters:workers:zeppelin
+  vars_files:
+    - /tmp/ansible-vars.yml
+  tasks:
+
+    # This is a noop to force a cache-refresh.
+    - name: "Update the DNF cache"
+      become: true
+      ignore_errors: yes
+      dnf:
+        name: 'kernel'
+        state: present
+        update_cache: yes
+
+
+    - name: "Install monitoring tools"
+      become: true
+      dnf:
+        name:
+          - 'atop'
+          - 'htop'
+        state: present
+
diff --git a/experiments/hadoop-yarn/ansible/09-worker-volumes.yml b/experiments/hadoop-yarn/ansible/09-worker-volumes.yml
index de158418..3f71a848 100644
--- a/experiments/hadoop-yarn/ansible/09-worker-volumes.yml
+++ b/experiments/hadoop-yarn/ansible/09-worker-volumes.yml
@@ -45,7 +45,7 @@
       become: true
       dnf:
         name: btrfs-progs
-        state: latest
+        state: present
 
     - name: "Mount data volumes for {{ inventory_hostname }}"
       include_tasks: tasks/mount-volumes.yml
diff --git a/experiments/hadoop-yarn/ansible/combined-01.yml b/experiments/hadoop-yarn/ansible/combined-01.yml
index bc7f5fc3..35394d1c 100644
--- a/experiments/hadoop-yarn/ansible/combined-01.yml
+++ b/experiments/hadoop-yarn/ansible/combined-01.yml
@@ -31,4 +31,5 @@
 - import_playbook: 07-host-keys.yml
 - import_playbook: 08-ping-test.yml
 
+- import_playbook: 04-update-fedora.yml
 
diff --git a/experiments/hadoop-yarn/ansible/create-all.yml b/experiments/hadoop-yarn/ansible/create-all.yml
index 9476842c..8aae3fb4 100644
--- a/experiments/hadoop-yarn/ansible/create-all.yml
+++ b/experiments/hadoop-yarn/ansible/create-all.yml
@@ -32,6 +32,8 @@
 - import_playbook: 07-host-keys.yml
 - import_playbook: 08-ping-test.yml
 
+- import_playbook: 04-update-fedora.yml
+
 - import_playbook: 09-worker-volumes.yml
 - import_playbook: 26-zeppelin-volumes.yml
 
diff --git a/notes/zrq/20210208-01-fedora-repo.txt b/notes/zrq/20210208-01-fedora-repo.txt
new file mode 100644
index 00000000..efd961f3
--- /dev/null
+++ b/notes/zrq/20210208-01-fedora-repo.txt
@@ -0,0 +1,86 @@
+#
+#
+#
+# Copyright (c) 2021, ROE (http://www.roe.ac.uk/)
+#
+# This information is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This information is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+#
+#
+#zrq-notes-time
+#zrq-notes-indent
+#zrq-notes-crypto
+#zrq-notes-ansible
+#zrq-notes-osformat
+#zrq-notes-zeppelin
+#
+    Quick test to see if we can solve the problems with resolving the Fedora repositories.
+    Try configuring dnf to use the UK mirror service.
+
+        pushd /etc/yum.repos.d/
+        for repo in *.repo
+        do
+            echo "---- ----"
+            echo "Repo [${repo:?}]"
+            sudo sed -i '
+                s/^metalink/#metalink/
+                s/^#baseurl/baseurl/
+                s|http://download.fedoraproject.org|http://www.mirrorservice.org/sites/download.fedora.redhat.com|
+                ' "${repo:?}"
+
+        done
+        popd
+
+    Fails because mirrorservice only has data for 32 and 33?
+    Directories for 30 and 31 are empty.
+ + http://www.mirrorservice.org/sites/download.fedora.redhat.com/pub/fedora/linux/releases/30 - empty + http://www.mirrorservice.org/sites/download.fedora.redhat.com/pub/fedora/linux/releases/31 - empty + + http://www.mirrorservice.org/sites/download.fedora.redhat.com/pub/fedora/linux/releases/32 - OK + http://www.mirrorservice.org/sites/download.fedora.redhat.com/pub/fedora/linux/releases/33 - OK + + Suggests this would work - once we have updated. + Alternative - create our own caching proxy site. + Add this to the infrastructure node. + + + List of mirrors per version + + https://admin.fedoraproject.org/mirrormanager/ + https://admin.fedoraproject.org/mirrormanager/mirrors/Fedora/30/x86_64 + + http://mirrors.dotsrc.org/fedora-buffet/archive/fedora/linux/updates/30/ + + https://ftp-stud.hs-esslingen.de/pub/Mirrors/archive.fedoraproject.org/fedora/linux/updates/30/ + + + + + # + # Issue solved by forcing DNF to flush the metadata cache. + # Using the Ansible DNF plugin to check something is present. + # Picked 'kernel' because it is on every machine. + # + + - name: "Update the DNF cache" + become: true + dnf: + name: 'kernel' + state: present + update_cache: yes + + + + From 287a16406631e35df79841cc0e1fb1de73bbe2ad Mon Sep 17 00:00:00 2001 From: "zrq-github@metagrid.co.uk" Date: Thu, 11 Feb 2021 17:44:22 +0000 Subject: [PATCH 06/27] Volume mounts for temp space --- .../hadoop-yarn/ansible/09-worker-volumes.yml | 3 +- .../hadoop-yarn/ansible/11-install-hadoop.yml | 38 +- .../ansible/13-config-hdfs-namenode.yml | 16 +- .../ansible/14-config-hdfs-workers.yml | 19 +- .../ansible/16-config-yarn-masters.yml | 2 +- .../ansible/17-config-yarn-workers.yml | 2 +- .../hadoop-yarn/ansible/20-install-spark.yml | 2 +- .../ansible/22-config-spark-master.yml | 28 +- .../ansible/22-config-spark-workers.yml | 36 + .../ansible/24-install-pyspark.yml | 2 +- .../hadoop-yarn/ansible/combined-03.yml | 1 + .../hadoop-yarn/ansible/create-all.yml | 1 + experiments/hadoop-yarn/ansible/hosts.yml | 31 +- .../ansible/tasks/create-linked.yml | 61 ++ .../ansible/tasks/create-volumes.yml | 12 +- .../ansible/tasks/mount-volumes.yml | 18 +- notes/zrq/20210208-02-ansible-deploy.txt | 238 ++++++ notes/zrq/20210210-01-ansible-deploy.txt | 467 +++++++++++ notes/zrq/20210211-01-ansible-deploy.txt | 771 ++++++++++++++++++ 19 files changed, 1665 insertions(+), 83 deletions(-) create mode 100644 experiments/hadoop-yarn/ansible/22-config-spark-workers.yml create mode 100644 experiments/hadoop-yarn/ansible/tasks/create-linked.yml create mode 100644 notes/zrq/20210208-02-ansible-deploy.txt create mode 100644 notes/zrq/20210210-01-ansible-deploy.txt create mode 100644 notes/zrq/20210211-01-ansible-deploy.txt diff --git a/experiments/hadoop-yarn/ansible/09-worker-volumes.yml b/experiments/hadoop-yarn/ansible/09-worker-volumes.yml index 3f71a848..584f1a7a 100644 --- a/experiments/hadoop-yarn/ansible/09-worker-volumes.yml +++ b/experiments/hadoop-yarn/ansible/09-worker-volumes.yml @@ -47,10 +47,11 @@ name: btrfs-progs state: present - - name: "Mount data volumes for {{ inventory_hostname }}" + - name: "Call the mount-volumes task" include_tasks: tasks/mount-volumes.yml loop: "{{ hostvars[ inventory_hostname ].discs }}" loop_control: loop_var: disc + when: ((disc.type == 'cinder') or (disc.type == 'local')) diff --git a/experiments/hadoop-yarn/ansible/11-install-hadoop.yml b/experiments/hadoop-yarn/ansible/11-install-hadoop.yml index 696d9e88..2434d200 100644 --- a/experiments/hadoop-yarn/ansible/11-install-hadoop.yml +++ 
b/experiments/hadoop-yarn/ansible/11-install-hadoop.yml @@ -39,32 +39,26 @@ dest: "{{hdbase}}" remote_src: yes - - name: "Create a symbolic link" + - name: "Create a symlink for the Hadoop version" become: true file: src: "{{hdname}}" path: "{{hdhome}}" state: link - - name: "Create '{{hddata}}'" - become: true - file: - path: "{{hddata}}" - mode: 'u=rwx,g=rwxs,o=rx' - state: directory - recurse: yes - owner: "{{hduser}}" - group: "{{hduser}}" + - name: "Create Hadoop data directory" + include_tasks: "tasks/create-linked.yml" + vars: + linkdest: "{{hddatadest}}" + linkpath: "{{hddatalink}}" + linkuser: "{{hduser}}" - - name: "Create [{{hddata}}/logs]" - become: true - file: - path: "{{hddata}}/logs" - mode: 'u=rwx,g=rwxs,o=rx' - state: directory - recurse: yes - owner: "{{hduser}}" - group: "{{hduser}}" + - name: "Create Hadoop logs directory" + include_tasks: "tasks/create-linked.yml" + vars: + linkdest: "{{hdlogsdest}}" + linkpath: "{{hdlogslink}}" + linkuser: "{{hduser}}" # https://hadoop.apache.org/docs/r3.2.1/hadoop-project-dist/hadoop-common/ClusterSetup.html#Configuring_Environment_of_Hadoop_Daemons - name: "Create [/etc/profile.d/hadoop.sh]" @@ -82,8 +76,8 @@ export PATH=${PATH}:{{hdhome}}/bin:{{hdhome}}/sbin #export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:{{hdhome}}/lib/native export HADOOP_HOME={{hdhome}} - export HADOOP_DATA={{hddata}} - export HADOOP_CONF_DIR={{hdhome}}/etc/hadoop - export HADOOP_LOG_DIR=${HADOOP_DATA}/logs + export HADOOP_DATA={{hddatalink}} + export HADOOP_CONF_DIR={{hdconf}} + export HADOOP_LOG_DIR={{hdlogslink}} diff --git a/experiments/hadoop-yarn/ansible/13-config-hdfs-namenode.yml b/experiments/hadoop-yarn/ansible/13-config-hdfs-namenode.yml index 4b8344c4..7bfe899b 100644 --- a/experiments/hadoop-yarn/ansible/13-config-hdfs-namenode.yml +++ b/experiments/hadoop-yarn/ansible/13-config-hdfs-namenode.yml @@ -21,16 +21,24 @@ # - name: "Configure HDFS namenode" - hosts: master01:zeppelin + hosts: master01 gather_facts: false vars: + hdfsimage: "{{hdfsmetalink}}/namenode/fsimage" tasks: - - name: "Create [{{hddata}}/namenode/fsimage]" + - name: "Create HDFS metadata directory" + include_tasks: "tasks/create-linked.yml" + vars: + linkdest: "{{hdfsmetadest}}" + linkpath: "{{hdfsmetalink}}" + linkuser: "{{hduser}}" + + - name: "Create [{{hdfsimage}}]" become: true file: - path: "{{hddata}}/namenode/fsimage" + path: "{{hdfsimage}}" mode: 'u=rwx,g=rwxs,o=rx' state: directory recurse: yes @@ -53,7 +61,7 @@ +--> dfs.namenode.name.dir - {{hddata}}/namenode/fsimage + {{hdfsimage}} dfs.datanode.data.dir - /data-01/hdfs/data + {{hdfsdatalink}} diff --git a/experiments/hadoop-yarn/ansible/16-config-yarn-masters.yml b/experiments/hadoop-yarn/ansible/16-config-yarn-masters.yml index 688bb1a2..e5a9fa76 100644 --- a/experiments/hadoop-yarn/ansible/16-config-yarn-masters.yml +++ b/experiments/hadoop-yarn/ansible/16-config-yarn-masters.yml @@ -21,7 +21,7 @@ # - name: "Configure YARN masters" - hosts: master01:zeppelin + hosts: masters:zeppelin gather_facts: false tasks: diff --git a/experiments/hadoop-yarn/ansible/17-config-yarn-workers.yml b/experiments/hadoop-yarn/ansible/17-config-yarn-workers.yml index b6e6a116..7490cdc4 100644 --- a/experiments/hadoop-yarn/ansible/17-config-yarn-workers.yml +++ b/experiments/hadoop-yarn/ansible/17-config-yarn-workers.yml @@ -21,7 +21,7 @@ # - name: "Configure YARN workers" - hosts: workers:zeppelin + hosts: workers gather_facts: false tasks: diff --git a/experiments/hadoop-yarn/ansible/20-install-spark.yml 
b/experiments/hadoop-yarn/ansible/20-install-spark.yml index dfefac4a..ba9ce98a 100644 --- a/experiments/hadoop-yarn/ansible/20-install-spark.yml +++ b/experiments/hadoop-yarn/ansible/20-install-spark.yml @@ -22,7 +22,7 @@ --- - name: "Install Spark" - hosts: master01:zeppelin + hosts: masters:zeppelin gather_facts: false tasks: diff --git a/experiments/hadoop-yarn/ansible/22-config-spark-master.yml b/experiments/hadoop-yarn/ansible/22-config-spark-master.yml index 66b452c4..d46359d5 100644 --- a/experiments/hadoop-yarn/ansible/22-config-spark-master.yml +++ b/experiments/hadoop-yarn/ansible/22-config-spark-master.yml @@ -20,19 +20,21 @@ # # -- name: "Configure YARN masters" - hosts: master01:zeppelin +- name: "Configure Spark masters" + hosts: zeppelin:masters gather_facts: false tasks: - - name: Creates directory - file: - path: "{{sphome}}/local" - state: directory - owner: "{{spuser}}" - group: "{{spuser}}" - mode: 0775 + # + # The Zeppelin node is acting as our Spark Master. + - name: "Create Spark temp directory" + include_tasks: "tasks/create-linked.yml" + vars: + linkdest: "{{sptempdest}}" + linkpath: "{{sptemplink}}" + linkuser: "{{spuser}}" + # # Documentation # https://spark.apache.org/docs/3.0.0-preview2/running-on-yarn.html#configuration @@ -56,7 +58,7 @@ spark.yarn.am.cores 4 spark.eventLog.enabled true spark.driver.maxResultSize 8192m - spark.local.dir {{sphome}}/local + spark.local.dir {{sptemplink}} spark.master yarn spark.eventLog.enabled true spark.eventLog.dir hdfs://{{hdhost}}:9000/spark-log @@ -94,12 +96,6 @@ spark.yarn.appMasterEnv.YARN_CONF_DIR={{hdhome}}/etc/hadoop spark.yarn.appMasterEnv.HADOOP_CONF_DIR={{hdhome}}/etc/hadoop -# -# TODO Experiment -# Move Spark to master02, add Yarn config. -# {{hdhome}}/etc/hadoop/yarn-site.xml - - # # TODO History server. # https://spark.apache.org/docs/3.0.0-preview2/monitoring.html#viewing-after-the-fact diff --git a/experiments/hadoop-yarn/ansible/22-config-spark-workers.yml b/experiments/hadoop-yarn/ansible/22-config-spark-workers.yml new file mode 100644 index 00000000..7b086f91 --- /dev/null +++ b/experiments/hadoop-yarn/ansible/22-config-spark-workers.yml @@ -0,0 +1,36 @@ +# +# +# +# Copyright (c) 2020, ROE (http://www.roe.ac.uk/) +# +# This information is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This information is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
+# +# +# +# + +- name: "Configure Spark workers" + hosts: workers + gather_facts: false + + tasks: + +# - name: "Create Spark temp directory" +# include_tasks: "tasks/create-linked.yml" +# vars: +# linkdest: "{{sptempdest}}" +# linkpath: "{{sptemplink}}" +# linkuser: "{{spuser}}" + + diff --git a/experiments/hadoop-yarn/ansible/24-install-pyspark.yml b/experiments/hadoop-yarn/ansible/24-install-pyspark.yml index 75b330ac..78573095 100644 --- a/experiments/hadoop-yarn/ansible/24-install-pyspark.yml +++ b/experiments/hadoop-yarn/ansible/24-install-pyspark.yml @@ -22,7 +22,7 @@ --- - name: "Install PySpark" - hosts: master01:zeppelin + hosts: masters:zeppelin gather_facts: false tasks: diff --git a/experiments/hadoop-yarn/ansible/combined-03.yml b/experiments/hadoop-yarn/ansible/combined-03.yml index a2bea86a..ca8f4484 100644 --- a/experiments/hadoop-yarn/ansible/combined-03.yml +++ b/experiments/hadoop-yarn/ansible/combined-03.yml @@ -24,6 +24,7 @@ - import_playbook: 20-install-spark.yml - import_playbook: 21-config-spark-security.yml - import_playbook: 22-config-spark-master.yml +- import_playbook: 22-config-spark-workers.yml - import_playbook: 23-install-python.yml - import_playbook: 24-install-pyspark.yml diff --git a/experiments/hadoop-yarn/ansible/create-all.yml b/experiments/hadoop-yarn/ansible/create-all.yml index 8aae3fb4..67e91c0c 100644 --- a/experiments/hadoop-yarn/ansible/create-all.yml +++ b/experiments/hadoop-yarn/ansible/create-all.yml @@ -54,6 +54,7 @@ - import_playbook: 20-install-spark.yml - import_playbook: 21-config-spark-security.yml - import_playbook: 22-config-spark-master.yml +- import_playbook: 22-config-spark-workers.yml - import_playbook: 23-install-python.yml - import_playbook: 24-install-pyspark.yml diff --git a/experiments/hadoop-yarn/ansible/hosts.yml b/experiments/hadoop-yarn/ansible/hosts.yml index 8247e37a..9035f5f0 100644 --- a/experiments/hadoop-yarn/ansible/hosts.yml +++ b/experiments/hadoop-yarn/ansible/hosts.yml @@ -103,18 +103,23 @@ all: zeppelin: login: 'fedora' image: 'Fedora-30-1.2' - flavor: 'general.v1.small' + flavor: 'general.v1.medium' discs: - - size: 512 - name: data-02 - mntpath: '/data-02' - devpath: '/dev/vdb' + - type: 'local' + format: 'ext4' + mntpath: "/mnt/local/vdb" + devname: 'vdb' + - type: 'cinder' + size: 512 + format: 'btrfs' + mntpath: "/mnt/cinder/vdc" + devname: 'vdc' children: masters: hosts: - master[01:02]: + master[01:01]: vars: login: 'fedora' image: 'Fedora-30-1.2' @@ -128,9 +133,13 @@ all: image: 'Fedora-30-1.2' flavor: 'general.v1.small' discs: - - size: 512 - name: data-01 - mntpath: '/data-01' - devpath: '/dev/vdb' - +# - type: 'local' +# format: 'ext4' +# mntpath: "/mnt/local/vdb" +# devname: 'vdb' + - type: 'cinder' + size: 512 + format: 'btrfs' + mntpath: "/mnt/cinder/vdb" + devname: 'vdb' diff --git a/experiments/hadoop-yarn/ansible/tasks/create-linked.yml b/experiments/hadoop-yarn/ansible/tasks/create-linked.yml new file mode 100644 index 00000000..1c137b86 --- /dev/null +++ b/experiments/hadoop-yarn/ansible/tasks/create-linked.yml @@ -0,0 +1,61 @@ +# +# +# +# Copyright (c) 2020, ROE (http://www.roe.ac.uk/) +# +# This information is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
+# +# This information is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +# +# +# + +- name: "Create destination parent [{{linkdest | dirname}}]" + become: true + file: + path: "{{linkdest | dirname}}" + mode: 'u=rwx,g=rwxs,o=rx' + state: directory + recurse: yes + owner: 'root' + group: 'root' + +- name: "Create link destination [{{linkdest}}]" + become: true + file: + path: "{{linkdest}}" + mode: 'u=rwx,g=rwxs,o=rx' + state: directory + owner: "{{linkuser}}" + group: "{{linkuser}}" + + +- name: "Create link parent [{{linkpath | dirname}}]" + become: true + file: + path: "{{linkpath | dirname}}" + mode: 'u=rwx,g=rwxs,o=rx' + state: directory + recurse: yes + owner: 'root' + group: 'root' + +- name: "Create link [{{linkpath}} -> {{linkdest}}]" + become: true + file: + src: "{{linkdest}}" + path: "{{linkpath}}" + state: link +# owner: 'root' +# group: 'root' + diff --git a/experiments/hadoop-yarn/ansible/tasks/create-volumes.yml b/experiments/hadoop-yarn/ansible/tasks/create-volumes.yml index 6109fc3e..17b52456 100644 --- a/experiments/hadoop-yarn/ansible/tasks/create-volumes.yml +++ b/experiments/hadoop-yarn/ansible/tasks/create-volumes.yml @@ -20,23 +20,25 @@ # # -- name: "Create volumes for [{{ vmname }}]" +- name: "Create Cinder volumes for [{{ vmname }}]" os_volume: cloud: "{{ cloudname }}" state: present size: "{{ item.size }}" - display_name: "{{ deployname }}-{{ vmname }}-{{ item.name }}" + display_name: "{{ deployname }}-{{ vmname }}-{{ item.devname }}" loop: "{{ hostvars[vmname].discs }}" + when: item.type == 'cinder' -- name: "Attach volumes to [{{ vmname }}]" +- name: "Attach Cinder volumes to [{{ vmname }}]" os_server_volume: cloud: "{{ cloudname }}" state: present server: "{{ deployname }}-{{ vmname }}" - volume: "{{ deployname }}-{{ vmname }}-{{ item.name }}" - device: "{{ item.devpath }}" + volume: "{{ deployname }}-{{ vmname }}-{{ item.devname }}" + device: "/dev/{{ item.devname }}" loop: "{{ hostvars[vmname].discs }}" + when: item.type == 'cinder' diff --git a/experiments/hadoop-yarn/ansible/tasks/mount-volumes.yml b/experiments/hadoop-yarn/ansible/tasks/mount-volumes.yml index 512aab23..a6efa442 100644 --- a/experiments/hadoop-yarn/ansible/tasks/mount-volumes.yml +++ b/experiments/hadoop-yarn/ansible/tasks/mount-volumes.yml @@ -20,19 +20,19 @@ # # - -- name: "Create btrfs filesystem on {{disc.devpath}}" +- name: "Create [{{disc.format}}] filesystem on [/dev/{{disc.devname}}]" become: true filesystem: - fstype: btrfs - dev: "{{disc.devpath}}" + fstype: "{{disc.format}}" + dev: "/dev/{{disc.devname}}" + when: (disc.format == 'btrfs') -# TODO Only do this if not already created -- name: "Create the mount point {{disc.mntpath}}" +- name: "Create mount path [{{disc.mntpath}}]" become: true file: path: "{{disc.mntpath}}" state: directory + recurse: yes # TODO Only do this if not already mounted - name: "Create the mount-failed indicator" @@ -44,12 +44,12 @@ # TODO Only do this if not already mounted # TODO Mount using UUID rather than device path. 
-- name: "Mount device {{disc.devpath}} at {{disc.mntpath}}" +- name: "Mount [{{disc.format}}] [/dev/{{disc.devname}}] at [{{disc.mntpath}}]" become: true mount: - src: "{{disc.devpath}}" + src: "/dev/{{disc.devname}}" path: "{{disc.mntpath}}" - fstype: btrfs + fstype: "{{disc.format}}" state: mounted diff --git a/notes/zrq/20210208-02-ansible-deploy.txt b/notes/zrq/20210208-02-ansible-deploy.txt new file mode 100644 index 00000000..5caed874 --- /dev/null +++ b/notes/zrq/20210208-02-ansible-deploy.txt @@ -0,0 +1,238 @@ +# +# +# +# Copyright (c) 2021, ROE (http://www.roe.ac.uk/) +# +# This information is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This information is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +# +# +#zrq-notes-time +#zrq-notes-indent +#zrq-notes-crypto +#zrq-notes-ansible +#zrq-notes-osformat +#zrq-notes-zeppelin +# + + Target: + + Add support for different types of disc ... + + Results: + + Work in progress .... + + +# ----------------------------------------------------- +# Update the Openstack cloud name. +#[user@desktop] + + cloudname=gaia-dev + + sed -i ' + s/^\(AGLAIS_CLOUD\)=.*$/\1='${cloudname:?}'/ + ' "${HOME}/aglais.env" + + +# ----------------------------------------------------- +# Create a container to work with. +# (*) extra volume mount for /common +#[user@desktop] + + source "${HOME:?}/aglais.env" + + podman run \ + --rm \ + --tty \ + --interactive \ + --name ansibler \ + --hostname ansibler \ + --env "SSH_AUTH_SOCK=/mnt/ssh_auth_sock" \ + --volume "${SSH_AUTH_SOCK}:/mnt/ssh_auth_sock:rw,z" \ + --env "cloudname=${AGLAIS_CLOUD:?}" \ + --volume "${HOME:?}/clouds.yaml:/etc/openstack/clouds.yaml:ro,z" \ + --volume "${AGLAIS_CODE:?}/experiments/common:/common:ro,z" \ + --volume "${AGLAIS_CODE:?}/experiments/openstack:/openstack:ro,z" \ + --volume "${AGLAIS_CODE:?}/experiments/hadoop-yarn:/hadoop-yarn:ro,z" \ + atolmis/ansible-client:2020.12.02 \ + bash + + +# ----------------------------------------------------- +# Create our Aglais configuration. +#[root@kubernator] + +cat > '/tmp/aglais-config.yml' << EOF +aglais: + version: 1.0 + spec: + openstack: + cloud: '${cloudname:?}' + +EOF + + +# ----------------------------------------------------- +# Create everything from scratch. +#[root@ansibler] + + time \ + /openstack/bin/delete-all.sh \ + "${cloudname:?}" + + rm -f ~/.ssh/* + + time \ + /hadoop-yarn/bin/create-all.sh + + + > .... + > .... + + +# ----------------------------------------------------- +# Check the data directories. +#[root@ansibler] + + + ssh worker01 \ + ' + date + hostname + echo "----" + echo "/var/spark" + ls -l "/var/spark" + df -h "/var/spark/temp" + + echo "----" + echo "/var/hdfs" + ls -l "/var/hdfs" + df -h "/var/hdfs/data" + + echo "----" + echo "/var/hadoop" + ls -l "/var/hadoop" + df -h "/var/hadoop/data" + ' + + > Tue Feb 9 15:46:45 UTC 2021 + > gaia-dev-20210209-worker01.novalocal + > ---- + > /var/spark + > total 0 + > lrwxrwxrwx. 
+    > Filesystem      Size  Used Avail Use% Mounted on
+    > /dev/vdb         59G   53M   56G   1% /mnt/local/vdb
+    > ----
+    > /var/hdfs
+    > total 0
+    > lrwxrwxrwx. 1 root root 25 Feb  9 15:38 data -> /mnt/cinder/vdc/hdfs/data
+    > Filesystem      Size  Used Avail Use% Mounted on
+    > /dev/vdc        512G   17M  510G   1% /mnt/cinder/vdc
+    > ----
+    > /var/hadoop
+    > total 0
+    > lrwxrwxrwx. 1 root root 27 Feb  9 15:36 data -> /mnt/cinder/vdc/hadoop/data
+    > lrwxrwxrwx. 1 root root 27 Feb  9 15:36 logs -> /mnt/cinder/vdc/hadoop/logs
+    > Filesystem      Size  Used Avail Use% Mounted on
+    > /dev/vdc        512G   17M  510G   1% /mnt/cinder/vdc
+
+
+# -----------------------------------------------------
+# Check the deployment status.
+#[root@ansibler]
+
+    cat '/tmp/aglais-status.yml'
+
+    > aglais:
+    >   spec:
+    >     openstack:
+    >       cloud: gaia-dev
+    >   status:
+    >     deployment:
+    >       type: hadoop-yarn
+    >       name: gaia-dev-20210209
+    >       date: 20210209T194001
+
+    deployname=$(
+        yq read \
+            '/tmp/aglais-status.yml' \
+                'aglais.status.deployment.name'
+        )
+
+    echo "Deployment [${deployname}]"
+
+    > Deployment [gaia-dev-20210209]
+
+
+# -----------------------------------------------------
+# Get the public IP address of our Zeppelin node.
+#[root@ansibler]
+
+    zeppelinid=$(
+        openstack \
+            --os-cloud "${cloudname:?}" \
+            server list \
+            --format json \
+        | jq -r '.[] | select(.Name == "'${deployname:?}'-zeppelin") | .ID'
+        )
+
+    zeppelinip=$(
+        openstack \
+            --os-cloud "${cloudname:?}" \
+            server show \
+            --format json \
+            "${zeppelinid:?}" \
+        | jq -r '.addresses' \
+        | sed '
+            s/[[:space:]]//
+            s/.*=\(.*\)/\1/
+            s/.*,\(.*\)/\1/
+            '
+        )
+
+cat << EOF
+Zeppelin ID [${zeppelinid:?}]
+Zeppelin IP [${zeppelinip:?}]
+EOF
+
+    > Zeppelin ID [a10a9b20-812a-4cab-ae97-efb2ccaddc0f]
+    > Zeppelin IP [128.232.227.229]
+
+
+# -----------------------------------------------------
+# -----------------------------------------------------
+
+    Update our DNS
+
+
+# -----------------------------------------------------
+# -----------------------------------------------------
+# Login to Zeppelin ...
+#[user@desktop]
+
+    firefox --new-window "http://zeppelin.metagrid.xyz:8080/" &
+
+
+# -----------------------------------------------------
+# Run test notebooks ..
+#[user@zeppelin]
+
+
+    Import notebooks from GitHub, clear the output and run all the cells ...
+
+    Good astrometric solutions via ML Random Forrest classifier
+    https://raw.githubusercontent.com/wfau/aglais-notebooks/main/2FRPC4BFS/note.json
+
diff --git a/notes/zrq/20210210-01-ansible-deploy.txt b/notes/zrq/20210210-01-ansible-deploy.txt
new file mode 100644
index 00000000..6cdff1c3
--- /dev/null
+++ b/notes/zrq/20210210-01-ansible-deploy.txt
@@ -0,0 +1,467 @@
+#
+#
+#
+# Copyright (c) 2021, ROE (http://www.roe.ac.uk/)
+#
+# This information is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This information is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+# +# +# +#zrq-notes-time +#zrq-notes-indent +#zrq-notes-crypto +#zrq-notes-ansible +#zrq-notes-osformat +#zrq-notes-zeppelin +# + + Target: + + Get Spark to work with the new configuration. + + Test config: + no gateway + medium zeppelin + 4 medium workers + + Results: + + Work in progress .... + + +# ----------------------------------------------------- +# Update the Openstack cloud name. +#[user@desktop] + + cloudname=gaia-dev + + sed -i ' + s/^\(AGLAIS_CLOUD\)=.*$/\1='${cloudname:?}'/ + ' "${HOME}/aglais.env" + + +# ----------------------------------------------------- +# Create a container to work with. +# (*) extra volume mount for /common +#[user@desktop] + + source "${HOME:?}/aglais.env" + + podman run \ + --rm \ + --tty \ + --interactive \ + --name ansibler \ + --hostname ansibler \ + --env "SSH_AUTH_SOCK=/mnt/ssh_auth_sock" \ + --volume "${SSH_AUTH_SOCK}:/mnt/ssh_auth_sock:rw,z" \ + --env "cloudname=${AGLAIS_CLOUD:?}" \ + --volume "${HOME:?}/clouds.yaml:/etc/openstack/clouds.yaml:ro,z" \ + --volume "${AGLAIS_CODE:?}/experiments/common:/common:ro,z" \ + --volume "${AGLAIS_CODE:?}/experiments/openstack:/openstack:ro,z" \ + --volume "${AGLAIS_CODE:?}/experiments/hadoop-yarn:/hadoop-yarn:ro,z" \ + atolmis/ansible-client:2020.12.02 \ + bash + + +# ----------------------------------------------------- +# Create our Aglais configuration. +#[root@kubernator] + +cat > '/tmp/aglais-config.yml' << EOF +aglais: + version: 1.0 + spec: + openstack: + cloud: '${cloudname:?}' + +EOF + + +# ----------------------------------------------------- +# Create everything from scratch. +#[root@ansibler] + + time \ + /openstack/bin/delete-all.sh \ + "${cloudname:?}" + + rm -f ~/.ssh/* + + time \ + /hadoop-yarn/bin/create-all.sh + + + > .... + > .... + + +# ----------------------------------------------------- +# Check the deployment status. +#[root@ansibler] + + cat '/tmp/aglais-status.yml' + + > aglais: + > spec: + > openstack: + > cloud: gaia-dev + > status: + > deployment: + > type: hadoop-yarn + > name: gaia-dev-20210209 + > date: 20210209T194001 + + +# ----------------------------------------------------- +# Get the public IP address of our Zeppelin node. +#[root@ansibler] + + deployname=$( + yq read \ + '/tmp/aglais-status.yml' \ + 'aglais.status.deployment.name' + ) + + zeppelinid=$( + openstack \ + --os-cloud "${cloudname:?}" \ + server list \ + --format json \ + | jq -r '.[] | select(.Name == "'${deployname:?}'-zeppelin") | .ID' + ) + + zeppelinip=$( + openstack \ + --os-cloud "${cloudname:?}" \ + server show \ + --format json \ + "${zeppelinid:?}" \ + | jq -r '.addresses' \ + | sed ' + s/[[:space:]]// + s/.*=\(.*\)/\1/ + s/.*,\(.*\)/\1/ + ' + ) + +cat << EOF +Zeppelin ID [${zeppelinid:?}] +Zeppelin IP [${zeppelinip:?}] +EOF + + > Zeppelin ID [a10a9b20-812a-4cab-ae97-efb2ccaddc0f] + > Zeppelin IP [128.232.227.229] + + +# ----------------------------------------------------- +# ----------------------------------------------------- + + Update our DNS + + +# ----------------------------------------------------- +# ----------------------------------------------------- +# Login to Zeppelin ... +#[user@desktop] + + firefox --new-window "http://zeppelin.metagrid.xyz:8080/" & + + +# ----------------------------------------------------- +# ----------------------------------------------------- + + + Import notebooks from GitHub, clear the output and run all the cells ... 
+ + Good astrometric solutions via ML Random Forrest classifier + https://raw.githubusercontent.com/wfau/aglais-notebooks/main/2FRPC4BFS/note.json + + +# ----------------------------------------------------- +# ----------------------------------------------------- + + > org.apache.thrift.transport.TTransportException + > at org.apache.thrift.transport.TIOStreamTransport.read(TIOStreamTransport.java:132) + > at org.apache.thrift.transport.TTransport.readAll(TTransport.java:86) + > at org.apache.thrift.protocol.TBinaryProtocol.readAll(TBinaryProtocol.java:429) + > at org.apache.thrift.protocol.TBinaryProtocol.readI32(TBinaryProtocol.java:318) + > at org.apache.thrift.protocol.TBinaryProtocol.readMessageBegin(TBinaryProtocol.java:219) + > at org.apache.thrift.TServiceClient.receiveBase(TServiceClient.java:69) + > at org.apache.zeppelin.interpreter.thrift.RemoteInterpreterService$Client.recv_interpret(RemoteInterpreterService.java:274) + > at org.apache.zeppelin.interpreter.thrift.RemoteInterpreterService$Client.interpret(RemoteInterpreterService.java:258) + > at org.apache.zeppelin.interpreter.remote.RemoteInterpreter$4.call(RemoteInterpreter.java:233) + > at org.apache.zeppelin.interpreter.remote.RemoteInterpreter$4.call(RemoteInterpreter.java:229) + > at org.apache.zeppelin.interpreter.remote.RemoteInterpreterProcess.callRemoteFunction(RemoteInterpreterProcess.java:135) + > at org.apache.zeppelin.interpreter.remote.RemoteInterpreter.interpret(RemoteInterpreter.java:228) + > at org.apache.zeppelin.notebook.Paragraph.jobRun(Paragraph.java:449) + > at org.apache.zeppelin.scheduler.Job.run(Job.java:188) + > at org.apache.zeppelin.scheduler.RemoteScheduler$JobRunner.run(RemoteScheduler.java:315) + > at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) + > at java.util.concurrent.FutureTask.run(FutureTask.java:266) + > at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$201(ScheduledThreadPoolExecutor.java:180) + > at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:293) + > at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) + > at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) + > at java.lang.Thread.run(Thread.java:748) + + +# ----------------------------------------------------- +# Check the Zeppelin logs. +#[user@zeppelin] + + pushd /home/fedora/zeppelin-0.8.2-bin-all/logs + cat zeppelin-interpreter-spark-fedora-gaia-dev-20210210-zeppelin.novalocal.log + + > .... + > .... 
+ > INFO [2021-02-10 12:33:29,264] ({main} RemoteInterpreterServer.java[main]:261) - URL:jar:file:/home/fedora/zeppelin-0.8.2-bin-all/interpreter/spark/spark-interpreter-0.8.2.jar!/org/apache/zeppelin/interpreter/remote/RemoteInterpreterServer.class + > INFO [2021-02-10 12:33:29,284] ({main} RemoteInterpreterServer.java[]:162) - Launching ThriftServer at 10.10.2.210:36095 + > INFO [2021-02-10 12:33:29,286] ({main} RemoteInterpreterServer.java[]:166) - Starting remote interpreter server on port 36095 + > INFO [2021-02-10 12:33:29,287] ({Thread-3} RemoteInterpreterServer.java[run]:203) - Starting remote interpreter server on port 36095 + > INFO [2021-02-10 12:33:29,291] ({Thread-4} RemoteInterpreterUtils.java[registerInterpreter]:165) - callbackHost: 10.10.2.210, callbackPort: 42139, callbackInfo: CallbackInfo(host:10.10.2.210, port:36095) + > INFO [2021-02-10 12:33:29,354] ({pool-1-thread-1} RemoteInterpreterServer.java[createInterpreter]:311) - Instantiate interpreter org.apache.zeppelin.spark.SparkInterpreter + > INFO [2021-02-10 12:33:29,355] ({pool-1-thread-1} RemoteInterpreterServer.java[createInterpreter]:311) - Instantiate interpreter org.apache.zeppelin.spark.SparkSqlInterpreter + > INFO [2021-02-10 12:33:29,360] ({pool-1-thread-1} RemoteInterpreterServer.java[createInterpreter]:311) - Instantiate interpreter org.apache.zeppelin.spark.DepInterpreter + > INFO [2021-02-10 12:33:29,363] ({pool-1-thread-1} RemoteInterpreterServer.java[createInterpreter]:311) - Instantiate interpreter org.apache.zeppelin.spark.PySparkInterpreter + > INFO [2021-02-10 12:33:29,366] ({pool-1-thread-1} RemoteInterpreterServer.java[createInterpreter]:311) - Instantiate interpreter org.apache.zeppelin.spark.IPySparkInterpreter + > INFO [2021-02-10 12:33:29,368] ({pool-1-thread-1} RemoteInterpreterServer.java[createInterpreter]:311) - Instantiate interpreter org.apache.zeppelin.spark.SparkRInterpreter + > WARN [2021-02-10 12:33:29,456] ({pool-1-thread-1} ZeppelinConfiguration.java[create]:117) - Failed to load configuration, proceeding with a default + > INFO [2021-02-10 12:33:29,470] ({pool-1-thread-1} ZeppelinConfiguration.java[create]:129) - Server Host: 127.0.0.1 + > INFO [2021-02-10 12:33:29,470] ({pool-1-thread-1} ZeppelinConfiguration.java[create]:131) - Server Port: 8080 + > INFO [2021-02-10 12:33:29,470] ({pool-1-thread-1} ZeppelinConfiguration.java[create]:135) - Context Path: / + > INFO [2021-02-10 12:33:29,472] ({pool-1-thread-1} ZeppelinConfiguration.java[create]:136) - Zeppelin Version: 0.8.2 + > INFO [2021-02-10 12:33:29,472] ({pool-1-thread-1} SchedulerFactory.java[]:59) - Scheduler Thread Pool Size: 100 + > INFO [2021-02-10 12:33:29,475] ({pool-2-thread-2} SchedulerFactory.java[jobStarted]:114) - Job 20201013-131649_1734629667 started by scheduler interpreter_1097442532 + > INFO [2021-02-10 12:33:29,818] ({pool-2-thread-2} IPythonInterpreter.java[checkIPythonPrerequisite]:200) - IPython prerequisite is met + > INFO [2021-02-10 12:33:29,820] ({pool-2-thread-2} NewSparkInterpreter.java[open]:83) - Using Scala Version: 2.11 + > INFO [2021-02-10 12:33:33,057] ({pool-2-thread-2} Logging.scala[logInfo]:54) - Running Spark version 2.4.7 + > WARN [2021-02-10 12:33:33,104] ({pool-2-thread-2} Logging.scala[logWarning]:66) - Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN). 
+ > INFO [2021-02-10 12:33:33,113] ({pool-2-thread-2} Logging.scala[logInfo]:54) - Submitted application: Zeppelin + > INFO [2021-02-10 12:33:33,162] ({pool-2-thread-2} Logging.scala[logInfo]:54) - Changing view acls to: fedora + > INFO [2021-02-10 12:33:33,163] ({pool-2-thread-2} Logging.scala[logInfo]:54) - Changing modify acls to: fedora + > INFO [2021-02-10 12:33:33,163] ({pool-2-thread-2} Logging.scala[logInfo]:54) - Changing view acls groups to: + > INFO [2021-02-10 12:33:33,163] ({pool-2-thread-2} Logging.scala[logInfo]:54) - Changing modify acls groups to: + > INFO [2021-02-10 12:33:33,163] ({pool-2-thread-2} Logging.scala[logInfo]:54) - SecurityManager: authentication disabled; ui acls disabled; users with view permissions: Set(fedora); groups with view permissions: Set(); users with modify permissions: Set(fedora); groups with modify permissions: Set() + > INFO [2021-02-10 12:33:33,343] ({pool-2-thread-2} Logging.scala[logInfo]:54) - Successfully started service 'sparkDriver' on port 36301. + > INFO [2021-02-10 12:33:33,365] ({pool-2-thread-2} Logging.scala[logInfo]:54) - Registering MapOutputTracker + > INFO [2021-02-10 12:33:33,381] ({pool-2-thread-2} Logging.scala[logInfo]:54) - Registering BlockManagerMaster + > INFO [2021-02-10 12:33:33,383] ({pool-2-thread-2} Logging.scala[logInfo]:54) - Using org.apache.spark.storage.DefaultTopologyMapper for getting topology information + > INFO [2021-02-10 12:33:33,384] ({pool-2-thread-2} Logging.scala[logInfo]:54) - BlockManagerMasterEndpoint up + > ERROR [2021-02-10 12:33:33,394] ({pool-2-thread-2} Logging.scala[logError]:91) - Failed to create local dir in /var/spark/temp. Ignoring this directory. + > java.io.IOException: Failed to create a temp directory (under /var/spark/temp) after 10 attempts! 
+ > at org.apache.spark.util.Utils$.createDirectory(Utils.scala:311) + > at org.apache.spark.storage.DiskBlockManager$$anonfun$createLocalDirs$1.apply(DiskBlockManager.scala:141) + > at org.apache.spark.storage.DiskBlockManager$$anonfun$createLocalDirs$1.apply(DiskBlockManager.scala:139) + > at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241) + > at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241) + > at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33) + > at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186) + > at scala.collection.TraversableLike$class.flatMap(TraversableLike.scala:241) + > at scala.collection.mutable.ArrayOps$ofRef.flatMap(ArrayOps.scala:186) + > at org.apache.spark.storage.DiskBlockManager.createLocalDirs(DiskBlockManager.scala:139) + > at org.apache.spark.storage.DiskBlockManager.(DiskBlockManager.scala:42) + > at org.apache.spark.storage.BlockManager.(BlockManager.scala:143) + > at org.apache.spark.SparkEnv$.create(SparkEnv.scala:349) + > at org.apache.spark.SparkEnv$.createDriverEnv(SparkEnv.scala:175) + > at org.apache.spark.SparkContext.createSparkEnv(SparkContext.scala:257) + > at org.apache.spark.SparkContext.(SparkContext.scala:424) + > at org.apache.spark.SparkContext$.getOrCreate(SparkContext.scala:2520) + > at org.apache.spark.sql.SparkSession$Builder$$anonfun$7.apply(SparkSession.scala:930) + > at org.apache.spark.sql.SparkSession$Builder$$anonfun$7.apply(SparkSession.scala:921) + > at scala.Option.getOrElse(Option.scala:121) + > at org.apache.spark.sql.SparkSession$Builder.getOrCreate(SparkSession.scala:921) + > at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) + > at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) + > at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) + > at java.lang.reflect.Method.invoke(Method.java:498) + > at org.apache.zeppelin.spark.BaseSparkScalaInterpreter.spark2CreateContext(BaseSparkScalaInterpreter.scala:263) + > at org.apache.zeppelin.spark.BaseSparkScalaInterpreter.createSparkContext(BaseSparkScalaInterpreter.scala:182) + > at org.apache.zeppelin.spark.SparkScala211Interpreter.open(SparkScala211Interpreter.scala:90) + > at org.apache.zeppelin.spark.NewSparkInterpreter.open(NewSparkInterpreter.java:102) + > at org.apache.zeppelin.spark.SparkInterpreter.open(SparkInterpreter.java:62) + > at org.apache.zeppelin.interpreter.LazyOpenInterpreter.open(LazyOpenInterpreter.java:69) + > at org.apache.zeppelin.spark.IPySparkInterpreter.getSparkInterpreter(IPySparkInterpreter.java:94) + > at org.apache.zeppelin.spark.IPySparkInterpreter.open(IPySparkInterpreter.java:54) + > at org.apache.zeppelin.spark.PySparkInterpreter.open(PySparkInterpreter.java:129) + > at org.apache.zeppelin.interpreter.LazyOpenInterpreter.open(LazyOpenInterpreter.java:69) + > at org.apache.zeppelin.interpreter.remote.RemoteInterpreterServer$InterpretJob.jobRun(RemoteInterpreterServer.java:616) + > at org.apache.zeppelin.scheduler.Job.run(Job.java:188) + > at org.apache.zeppelin.scheduler.FIFOScheduler$1.run(FIFOScheduler.java:140) + > at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) + > at java.util.concurrent.FutureTask.run(FutureTask.java:266) + > at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$201(ScheduledThreadPoolExecutor.java:180) + > at 
java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:293)
+    >     at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
+    >     at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
+    >     at java.lang.Thread.run(Thread.java:748)
+    > ERROR [2021-02-10 12:33:33,396] ({pool-2-thread-2} Logging.scala[logError]:70) - Failed to create any local dir.
+    > INFO [2021-02-10 12:33:33,399] ({Thread-1} Logging.scala[logInfo]:54) - Shutdown hook called
+    > INFO [2021-02-10 12:33:33,400] ({Thread-1} Logging.scala[logInfo]:54) - Deleting directory /tmp/spark-c78cdeb5-667e-4ad4-bdf4-c8da522abd15
+
+    #
+    # Zeppelin is acting as the Spark master, and is looking for the /var/spark/temp local directory.
+    #
+
+    #
+    # Now that we have removed the gateway node, can we change Zeppelin into a medium node
+    # and add the /var/spark/temp local directory?
+    #
+    # Yes - needed to add the /var/spark/temp local directory to the Zeppelin node to get the notebook to run.
+    #
+
+    #
+    # The Zeppelin node is running a single-threaded Python task at 100% cpu.
+    # Logs on the Zeppelin node show it is sending out tasks to the other nodes ..
+    #
+
+    The 100% active thread is ipython_server
+
+    python /tmp/zeppelin_ipython8675898749474775789/ipython_server.py 43261
+
+
+# -----------------------------------------------------
+# Check the Zeppelin logs.
+#[user@zeppelin]
+
+    pushd /home/fedora
+
+    tail -f zeppelin-0.8.2-bin-all/logs/zeppelin-interpreter-spark-fedora-gaia-dev-20210210-zeppelin.novalocal.log
+
+    > ....
+    > ....
+    > INFO [2021-02-10 18:15:13,368] ({dispatcher-event-loop-4} Logging.scala[logInfo]:54) - Starting task 674.0 in stage 92.0 (TID 318172, worker01, executor 2, partition 674, PROCESS_LOCAL, 8450 bytes)
+    > INFO [2021-02-10 18:15:13,368] ({task-result-getter-2} Logging.scala[logInfo]:54) - Finished task 541.0 in stage 92.0 (TID 318039) in 201169 ms on worker01 (executor 2) (663/5720)
+    > INFO [2021-02-10 18:15:15,880] ({dispatcher-event-loop-11} Logging.scala[logInfo]:54) - Starting task 675.0 in stage 92.0 (TID 318173, worker02, executor 3, partition 675, PROCESS_LOCAL, 8450 bytes)
+    > INFO [2021-02-10 18:15:15,880] ({task-result-getter-1} Logging.scala[logInfo]:54) - Finished task 529.0 in stage 92.0 (TID 318027) in 215511 ms on worker02 (executor 3) (664/5720)
+    > INFO [2021-02-10 18:15:15,884] ({dispatcher-event-loop-1} Logging.scala[logInfo]:54) - Starting task 676.0 in stage 92.0 (TID 318174, worker04, executor 1, partition 676, PROCESS_LOCAL, 8450 bytes)
+    > INFO [2021-02-10 18:15:15,884] ({task-result-getter-0} Logging.scala[logInfo]:54) - Finished task 652.0 in stage 92.0 (TID 318150) in 57495 ms on worker04 (executor 1) (665/5720)
+    > INFO [2021-02-10 18:15:16,153] ({dispatcher-event-loop-2} Logging.scala[logInfo]:54) - Starting task 677.0 in stage 92.0 (TID 318175, worker04, executor 1, partition 677, PROCESS_LOCAL, 8450 bytes)
+    > INFO [2021-02-10 18:15:16,153] ({task-result-getter-3} Logging.scala[logInfo]:54) - Finished task 676.0 in stage 92.0 (TID 318174) in 269 ms on worker04 (executor 1) (666/5720)
+    > ....
+    > ....
+ > INFO [2021-02-10 18:15:42,232] ({dispatcher-event-loop-4} Logging.scala[logInfo]:54) - Starting task 678.0 in stage 92.0 (TID 318176, worker01, executor 2, partition 678, PROCESS_LOCAL, 8450 bytes) + > INFO [2021-02-10 18:15:42,232] ({task-result-getter-2} Logging.scala[logInfo]:54) - Finished task 628.0 in stage 92.0 (TID 318126) in 129777 ms on worker01 (executor 2) (667/5720) + > INFO [2021-02-10 18:15:42,662] ({dispatcher-event-loop-12} Logging.scala[logInfo]:54) - Starting task 679.0 in stage 92.0 (TID 318177, worker01, executor 2, partition 679, PROCESS_LOCAL, 8450 bytes) + > INFO [2021-02-10 18:15:42,662] ({task-result-getter-1} Logging.scala[logInfo]:54) - Finished task 678.0 in stage 92.0 (TID 318176) in 430 ms on worker01 (executor 2) (668/5720) + > INFO [2021-02-10 18:15:51,745] ({dispatcher-event-loop-0} Logging.scala[logInfo]:54) - Starting task 680.0 in stage 92.0 (TID 318178, worker04, executor 1, partition 680, PROCESS_LOCAL, 8450 bytes) + > INFO [2021-02-10 18:15:51,746] ({task-result-getter-0} Logging.scala[logInfo]:54) - Finished task 673.0 in stage 92.0 (TID 318171) in 60263 ms on worker04 (executor 1) (669/5720) + > INFO [2021-02-10 18:15:51,988] ({dispatcher-event-loop-7} Logging.scala[logInfo]:54) - Starting task 681.0 in stage 92.0 (TID 318179, worker04, executor 1, partition 681, PROCESS_LOCAL, 8450 bytes) + > INFO [2021-02-10 18:15:51,988] ({task-result-getter-3} Logging.scala[logInfo]:54) - Finished task 680.0 in stage 92.0 (TID 318178) in 243 ms on worker04 (executor 1) (670/5720) + > INFO [2021-02-10 18:15:52,201] ({dispatcher-event-loop-6} Logging.scala[logInfo]:54) - Starting task 682.0 in stage 92.0 (TID 318180, worker04, executor 1, partition 682, PROCESS_LOCAL, 8450 bytes) + > INFO [2021-02-10 18:15:52,202] ({task-result-getter-2} Logging.scala[logInfo]:54) - Finished task 681.0 in stage 92.0 (TID 318179) in 214 ms on worker04 (executor 1) (671/5720) + > INFO [2021-02-10 18:15:52,388] ({dispatcher-event-loop-9} Logging.scala[logInfo]:54) - Starting task 683.0 in stage 92.0 (TID 318181, worker04, executor 1, partition 683, PROCESS_LOCAL, 8450 bytes) + > INFO [2021-02-10 18:15:52,388] ({task-result-getter-1} Logging.scala[logInfo]:54) - Finished task 682.0 in stage 92.0 (TID 318180) in 187 ms on worker04 (executor 1) (672/5720) + > INFO [2021-02-10 18:15:52,822] ({dispatcher-event-loop-4} Logging.scala[logInfo]:54) - Starting task 684.0 in stage 92.0 (TID 318182, worker04, executor 1, partition 684, PROCESS_LOCAL, 8450 bytes) + > INFO [2021-02-10 18:15:52,822] ({task-result-getter-0} Logging.scala[logInfo]:54) - Finished task 683.0 in stage 92.0 (TID 318181) in 434 ms on worker04 (executor 1) (673/5720) + > INFO [2021-02-10 18:15:53,118] ({dispatcher-event-loop-10} Logging.scala[logInfo]:54) - Starting task 685.0 in stage 92.0 (TID 318183, worker04, executor 1, partition 685, PROCESS_LOCAL, 8450 bytes) + > INFO [2021-02-10 18:15:53,118] ({task-result-getter-3} Logging.scala[logInfo]:54) - Finished task 684.0 in stage 92.0 (TID 318182) in 296 ms on worker04 (executor 1) (674/5720) + > INFO [2021-02-10 18:15:53,312] ({dispatcher-event-loop-0} Logging.scala[logInfo]:54) - Starting task 686.0 in stage 92.0 (TID 318184, worker04, executor 1, partition 686, PROCESS_LOCAL, 8450 bytes) + > INFO [2021-02-10 18:15:53,313] ({task-result-getter-2} Logging.scala[logInfo]:54) - Finished task 685.0 in stage 92.0 (TID 318183) in 194 ms on worker04 (executor 1) (675/5720) + > INFO [2021-02-10 18:15:53,522] ({dispatcher-event-loop-7} Logging.scala[logInfo]:54) - Starting 
task 687.0 in stage 92.0 (TID 318185, worker04, executor 1, partition 687, PROCESS_LOCAL, 8450 bytes)
+    > INFO [2021-02-10 18:15:53,522] ({task-result-getter-1} Logging.scala[logInfo]:54) - Finished task 686.0 in stage 92.0 (TID 318184) in 210 ms on worker04 (executor 1) (676/5720)
+    > INFO [2021-02-10 18:15:54,158] ({dispatcher-event-loop-6} Logging.scala[logInfo]:54) - Starting task 688.0 in stage 92.0 (TID 318186, worker04, executor 1, partition 688, PROCESS_LOCAL, 8450 bytes)
+    > INFO [2021-02-10 18:15:54,159] ({task-result-getter-0} Logging.scala[logInfo]:54) - Finished task 687.0 in stage 92.0 (TID 318185) in 637 ms on worker04 (executor 1) (677/5720)
+    > INFO [2021-02-10 18:15:54,408] ({dispatcher-event-loop-9} Logging.scala[logInfo]:54) - Starting task 689.0 in stage 92.0 (TID 318187, worker04, executor 1, partition 689, PROCESS_LOCAL, 8450 bytes)
+    > INFO [2021-02-10 18:15:54,408] ({task-result-getter-3} Logging.scala[logInfo]:54) - Finished task 688.0 in stage 92.0 (TID 318186) in 250 ms on worker04 (executor 1) (678/5720)
+    > INFO [2021-02-10 18:15:57,157] ({dispatcher-event-loop-7} Logging.scala[logInfo]:54) - Starting task 690.0 in stage 92.0 (TID 318188, worker02, executor 3, partition 690, PROCESS_LOCAL, 8450 bytes)
+    > INFO [2021-02-10 18:15:57,157] ({task-result-getter-2} Logging.scala[logInfo]:54) - Finished task 675.0 in stage 92.0 (TID 318173) in 41277 ms on worker02 (executor 3) (679/5720)
+    > INFO [2021-02-10 18:15:57,825] ({dispatcher-event-loop-13} Logging.scala[logInfo]:54) - Starting task 691.0 in stage 92.0 (TID 318189, worker02, executor 3, partition 691, PROCESS_LOCAL, 8450 bytes)
+    > INFO [2021-02-10 18:15:57,825] ({task-result-getter-1} Logging.scala[logInfo]:54) - Finished task 690.0 in stage 92.0 (TID 318188) in 668 ms on worker02 (executor 3) (680/5720)
+
+    top on the worker nodes shows 90% idle, then peaks of activity
+
+    50%cpu to ceph-fuse
+    10%cpu to java
+
+# -----------------------------------------------------
+# Check the Spark temp on Zeppelin.
+#[user@zeppelin]
+
+    ls -1 /var/spark/temp/
+
+    > blockmgr-ddd69876-f595-49bf-bc89-b4c5d52204a3
+    > spark-668f91a5-2e20-4f67-8fa5-bc68e679243b
+
+
+    du -h -d 1 /var/spark/temp/
+
+    > 220K    /var/spark/temp/spark-668f91a5-2e20-4f67-8fa5-bc68e679243b
+    > 200K    /var/spark/temp/blockmgr-ddd69876-f595-49bf-bc89-b4c5d52204a3
+    > 424K    /var/spark/temp/
+
+
+# -----------------------------------------------------
+# Check the Spark temp on workers.
+#[user@worker01]
+
+    ls -1 /var/spark/temp/
+
+
+
+    du -h -d 1 /var/spark/temp/
+
+    > 4.0K    /var/spark/temp/
+
+
+    #
+    # The Zeppelin node is acting as the Spark master.
+    # The ipython process is a single thread at 100% cpu, the rest is idle.
+    # using <0.5M of spark/temp
+    #
+
+    #
+    # worker nodes are 10-50% ceph, 0-10% java
+    # mostly idle
+    # not using spark/temp
+    #
+
+
+    Initial select query (10%)
+    Took 17 min 48 sec. Last updated by gaiauser at February 10 2021, 6:22:12 PM.
+
+    First graph
+    Took 18 min 11 sec. Last updated by gaiauser at February 10 2021, 6:40:23 PM.
+
+    Good/bad selection
+    Took 42 min 13 sec. Last updated by gaiauser at February 10 2021, 7:22:37 PM.
+
+    RandomForestClassifier
+    Took 2 hrs 59 min 26 sec. Last updated by gaiauser at February 10 2021, 10:22:03 PM.
+
+    Confusion matrix
+    Took 42 min 6 sec. Last updated by gaiauser at February 10 2021, 11:04:09 PM.
+
+    Second graph
+    Took 1 hrs 18 min 18 sec. Last updated by gaiauser at February 11 2021, 12:22:28 AM.
+
+    Histogram
+    Took 19 min 17 sec. Last updated by gaiauser at February 11 2021, 12:41:45 AM.
+
+    Good plot
+    Took 40 min 3 sec. Last updated by gaiauser at February 11 2021, 1:21:48 AM.
+
+    Bad plot
+    Took 40 min 8 sec. Last updated by gaiauser at February 11 2021, 2:01:56 AM.
+
+    Good/bad count
+    Took 40 min 4 sec. Last updated by gaiauser at February 11 2021, 2:42:00 AM.
+
+    Histogram
+    Took 40 min 15 sec. Last updated by gaiauser at February 11 2021, 3:22:15 AM.
+
+    Null count
+    Took 25 min 30 sec. Last updated by gaiauser at February 11 2021, 3:47:45 AM.
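+
+
+    For reference, the 'Failed to create local dir in /var/spark/temp' error above
+    comes from Spark's spark.local.dir setting. A quick way to confirm where it points
+    (an untested sketch - assuming our deployment writes the setting into
+    spark-defaults.conf under /opt/spark):
+
+# -----------------------------------------------------
+# Check the Spark scratch directory setting.
+#[user@zeppelin]
+
+    grep -r "spark.local.dir" /opt/spark/conf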
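+
+
+# -----------------------------------------------------
+# Quick sanity check on what was created.
+# (*) not part of the original run - a hypothetical extra step,
+#     using the same openstack CLI as the steps below.
+#[root@ansibler]
+
+    openstack \
+        --os-cloud "${cloudname:?}" \
+        server list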
+
+
+# -----------------------------------------------------
+# Check the deployment status.
+#[root@ansibler]
+
+    cat '/tmp/aglais-status.yml'
+
+    > ....
+    > ....
+
+
+# -----------------------------------------------------
+# Get the public IP address of our Zeppelin node.
+#[root@ansibler]
+
+    deployname=$(
+        yq read \
+            '/tmp/aglais-status.yml' \
+            'aglais.status.deployment.name'
+        )
+
+    zeppelinid=$(
+        openstack \
+            --os-cloud "${cloudname:?}" \
+            server list \
+            --format json \
+        | jq -r '.[] | select(.Name == "'${deployname:?}'-zeppelin") | .ID'
+        )
+
+    zeppelinip=$(
+        openstack \
+            --os-cloud "${cloudname:?}" \
+            server show \
+            --format json \
+            "${zeppelinid:?}" \
+        | jq -r '.addresses' \
+        | sed '
+            s/[[:space:]]//
+            s/.*=\(.*\)/\1/
+            s/.*,\(.*\)/\1/
+            '
+        )
+
+cat << EOF
+Zeppelin ID [${zeppelinid:?}]
+Zeppelin IP [${zeppelinip:?}]
+EOF
+
+    > Zeppelin ID [e4db55cb-2106-4f24-afe6-e335f98ecca1]
+    > Zeppelin IP [128.232.227.230]
+
+
+# -----------------------------------------------------
+# -----------------------------------------------------
+
+    Update our DNS
+
+
+# -----------------------------------------------------
+# -----------------------------------------------------
+# Login to Zeppelin ...
+#[user@desktop]
+
+    firefox --new-window "http://zeppelin.metagrid.xyz:8080/" &
+
+
+# -----------------------------------------------------
+# -----------------------------------------------------
+
+
+    Import notebooks from GitHub, clear the output and run all the cells ...
+
+    Good astrometric solutions via ML Random Forrest classifier
+    https://raw.githubusercontent.com/wfau/aglais-notebooks/main/2FRPC4BFS/note.json
+
+
+# -----------------------------------------------------
+# -----------------------------------------------------
+
+
+# -----------------------------------------------------
+# Check the Zeppelin logs.
+#[user@zeppelin]
+
+    pushd /home/fedora
+
+    tail -f zeppelin-0.8.2-bin-all/logs/zeppelin-interpreter-spark-fedora-gaia-dev-20210211-zeppelin.novalocal.log
+
+
+    top on the worker nodes shows 90% idle, then peaks of activity
+
+    50%cpu to ceph-fuse
+    10%cpu to java
+
+# -----------------------------------------------------
+# Check the Spark temp on Zeppelin.
+#[root@ansibler]
+
+    ssh zeppelin \
+        '
+        date
+        hostname
+        echo
+        ls -1 /var/spark/temp/
+        echo
+        du -h -d 1 /var/spark/temp/
+        '
+
+    > Thu Feb 11 11:42:45 UTC 2021
+    > gaia-dev-20210211-zeppelin.novalocal
+    >
+    > blockmgr-6e40f938-4cea-4e51-aa7d-7b8e8d957fd9
+    > spark-02a20dcf-c44b-46fd-aba2-84dcd4092b77
+    >
+    > 220K    /var/spark/temp/spark-02a20dcf-c44b-46fd-aba2-84dcd4092b77
+    > 168K    /var/spark/temp/blockmgr-6e40f938-4cea-4e51-aa7d-7b8e8d957fd9
+    > 392K    /var/spark/temp/
+
+
+# -----------------------------------------------------
+# Check the Spark temp on workers.
+#[root@ansibler]
+
+    ssh zeppelin \
+        '
+        ssh worker01 \
+            "
+            date
+            hostname
+            echo
+            ls -1 /var/spark/temp/
+            echo
+            du -h -d 1 /var/spark/temp/
+            "
+        '
+
+    > Thu Feb 11 11:43:12 UTC 2021
+    > gaia-dev-20210211-worker01.novalocal
+    >
+    > ls: cannot access '/var/spark/temp/': No such file or directory
+    >
+    > du: cannot access '/var/spark/temp/': No such file or directory
+
+
+
+# -----------------------------------------------------
+# Check /tmp on worker01.
+#[root@ansibler] + + ssh zeppelin \ + ' + ssh worker01 \ + " + date + hostname + echo + ls -1 /tmp/ + echo + du -h -d 1 /tmp/ + " + ' + + > Thu Feb 11 11:44:10 UTC 2021 + > gaia-dev-20210211-worker01.novalocal + > + > hadoop-fedora + > hadoop-fedora-datanode.pid + > hadoop-fedora-nodemanager.pid + > hsperfdata_fedora + > hsperfdata_root + > jetty-0.0.0.0-8042-node-_-any-222466267334014063.dir + > jetty-localhost-41565-datanode-_-any-8693046579271702220.dir + > systemd-private-25d0add6979849dcaa7ef3260c7db798-chronyd.service-WR7RUG + > systemd-private-25d0add6979849dcaa7ef3260c7db798-dbus-broker.service-YbHwCr + > + > 4.0K /tmp/jetty-localhost-41565-datanode-_-any-8693046579271702220.dir + > du: cannot read directory '/tmp/systemd-private-25d0add6979849dcaa7ef3260c7db798-chronyd.service-WR7RUG': Permission denied + > 4.0K /tmp/systemd-private-25d0add6979849dcaa7ef3260c7db798-chronyd.service-WR7RUG + > 4.0K /tmp/systemd-private-25d0add6979849dcaa7ef3260c7db798-dbus-broker.service-YbHwCr + > 4.0K /tmp/.ICE-unix + > 4.0K /tmp/.X11-unix + > 4.0K /tmp/.Test-unix + > 8.0K /tmp/jetty-0.0.0.0-8042-node-_-any-222466267334014063.dir + > 4.0K /tmp/.font-unix + > du: cannot read directory '/tmp/systemd-private-25d0add6979849dcaa7ef3260c7db798-dbus-broker.service-YbHwCr': Permission denied + > 100K /tmp/hsperfdata_fedora + > 4.0K /tmp/.XIM-unix + > 235M /tmp/hadoop-fedora + > 36K /tmp/hsperfdata_root + > 235M /tmp/ + + +# ----------------------------------------------------- +# Check the Zeppelin machine +#[user@zeppelin] + + ls -1 /home/fedora + + > spark-warehouse + > zeppelin-0.8.2-bin-all + + + ls -1 /home/fedora/spark-warehouse + + > - + + + ls -1 /home/fedora/zeppelin-0.8.2-bin-all + + > bin + > conf + > interpreter + > lib + > LICENSE + > licenses + > local-repo + > logs + > notebook + > NOTICE + > README.md + > run + > webapps + > zeppelin-web-0.8.2.war + + + ls -1 /home/fedora/zeppelin-0.8.2-bin-all/logs + + > zeppelin-fedora-gaia-dev-20210211-zeppelin.novalocal.log + > zeppelin-fedora-gaia-dev-20210211-zeppelin.novalocal.out + > zeppelin-interpreter-md-fedora-gaia-dev-20210211-zeppelin.novalocal.log + > zeppelin-interpreter-spark-fedora-gaia-dev-20210211-zeppelin.novalocal.log + + +# ----------------------------------------------------- +# Check the Zeppelin log +#[user@zeppelin] + + pushd /home/fedora/zeppelin-0.8.2-bin-all/logs + + ls -1 . + + > zeppelin-fedora-gaia-dev-20210211-zeppelin.novalocal.log + > zeppelin-fedora-gaia-dev-20210211-zeppelin.novalocal.out + > zeppelin-interpreter-md-fedora-gaia-dev-20210211-zeppelin.novalocal.log + > zeppelin-interpreter-spark-fedora-gaia-dev-20210211-zeppelin.novalocal.log + + + less zeppelin-fedora-gaia-dev-20210211-zeppelin.novalocal.log + + > INFO [2021-02-11 07:17:07,710] ({main} ZeppelinConfiguration.java[create]:121) - Load configuration from file:/home/fedora/zeppelin-0.8.2-bin-all/conf/zeppelin-site.xml + > INFO [2021-02-11 07:17:07,750] ({main} ZeppelinConfiguration.java[create]:129) - Server Host: 10.10.0.88 + > INFO [2021-02-11 07:17:07,750] ({main} ZeppelinConfiguration.java[create]:131) - Server Port: 8080 + > INFO [2021-02-11 07:17:07,751] ({main} ZeppelinConfiguration.java[create]:135) - Context Path: / + > INFO [2021-02-11 07:17:07,752] ({main} ZeppelinConfiguration.java[create]:136) - Zeppelin Version: 0.8.2 + > .... + > .... 
+ > INFO [2021-02-11 10:31:27,611] ({pool-2-thread-2} SchedulerFactory.java[jobStarted]:114) - Job 20201013-131059_546082898 started by scheduler org.apache.zeppelin.interpreter.remote.RemoteInterpreter-md:shared_proces + > s-shared_session + > INFO [2021-02-11 10:31:27,612] ({pool-2-thread-2} Paragraph.java[jobRun]:381) - Run paragraph [paragraph_id: 20201013-131059_546082898, interpreter: md, note_id: 2FYW1HNED, user: gaiauser] + > INFO [2021-02-11 10:31:27,612] ({pool-2-thread-2} ManagedInterpreterGroup.java[getOrCreateInterpreterProcess]:61) - Create InterpreterProcess for InterpreterGroup: md:shared_process + > INFO [2021-02-11 10:31:27,612] ({pool-2-thread-2} ShellScriptLauncher.java[launch]:48) - Launching Interpreter: md + > INFO [2021-02-11 10:31:27,623] ({pool-2-thread-2} RemoteInterpreterManagedProcess.java[start]:115) - Thrift server for callback will start. Port: 39353 + > INFO [2021-02-11 10:31:27,631] ({pool-2-thread-2} RemoteInterpreterManagedProcess.java[start]:190) - Run interpreter process [/home/fedora/zeppelin-0.8.2-bin-all/bin/interpreter.sh, -d, /home/fedora/zeppelin-0.8.2-b + > in-all/interpreter/md, -c, 10.10.0.88, -p, 39353, -r, :, -l, /home/fedora/zeppelin-0.8.2-bin-all/local-repo/md, -g, md] + > INFO [2021-02-11 10:31:27,890] ({pool-7-thread-1} RemoteInterpreterManagedProcess.java[callback]:123) - RemoteInterpreterServer Registered: CallbackInfo(host:10.10.0.88, port:33847) + > INFO [2021-02-11 10:31:27,925] ({pool-2-thread-2} RemoteInterpreter.java[call]:168) - Create RemoteInterpreter org.apache.zeppelin.markdown.Markdown + > INFO [2021-02-11 10:31:28,006] ({pool-2-thread-2} RemoteInterpreter.java[call]:142) - Open RemoteInterpreter org.apache.zeppelin.markdown.Markdown + > INFO [2021-02-11 10:31:28,006] ({pool-2-thread-2} RemoteInterpreter.java[pushAngularObjectRegistryToRemote]:436) - Push local angular object registry from ZeppelinServer to remote interpreter group md:shared_process + > INFO [2021-02-11 10:31:28,371] ({pool-2-thread-2} NotebookServer.java[afterStatusChange]:2314) - Job 20201013-131059_546082898 is finished successfully, status: FINISHED + > INFO [2021-02-11 10:31:28,438] ({pool-2-thread-2} VFSNotebookRepo.java[save]:196) - Saving note:2FYW1HNED + > INFO [2021-02-11 10:31:28,441] ({pool-2-thread-2} SchedulerFactory.java[jobFinished]:120) - Job 20201013-131059_546082898 finished by scheduler org.apache.zeppelin.interpreter.remote.RemoteInterpreter-md:shared_proc + > ess-shared_session + > INFO [2021-02-11 10:31:28,458] ({qtp1580893732-14} VFSNotebookRepo.java[save]:196) - Saving note:2FYW1HNED + > .... + > .... 
+ > INFO [2021-02-11 10:31:28,462] ({pool-2-thread-3} SchedulerFactory.java[jobStarted]:114) - Job 20201013-131649_1734629667 started by scheduler org.apache.zeppelin.interpreter.remote.RemoteInterpreter-spark:shared_pr + > ocess-shared_session + > INFO [2021-02-11 10:31:28,462] ({pool-2-thread-3} Paragraph.java[jobRun]:381) - Run paragraph [paragraph_id: 20201013-131649_1734629667, interpreter: spark.pyspark, note_id: 2FYW1HNED, user: gaiauser] + > INFO [2021-02-11 10:31:28,462] ({pool-2-thread-3} ManagedInterpreterGroup.java[getOrCreateInterpreterProcess]:61) - Create InterpreterProcess for InterpreterGroup: spark:shared_process + > INFO [2021-02-11 10:31:28,463] ({pool-2-thread-3} ShellScriptLauncher.java[launch]:48) - Launching Interpreter: spark + > INFO [2021-02-11 10:31:28,464] ({pool-2-thread-3} SparkInterpreterLauncher.java[buildEnvFromProperties]:108) - Run Spark under non-secure mode as no keytab and principal is specified + > INFO [2021-02-11 10:31:28,464] ({pool-2-thread-3} RemoteInterpreterManagedProcess.java[start]:115) - Thrift server for callback will start. Port: 39131 + > INFO [2021-02-11 10:31:28,965] ({pool-2-thread-3} RemoteInterpreterManagedProcess.java[start]:190) - Run interpreter process [/home/fedora/zeppelin-0.8.2-bin-all/bin/interpreter.sh, -d, /home/fedora/zeppelin-0.8.2-b + > in-all/interpreter/spark, -c, 10.10.0.88, -p, 39131, -r, :, -l, /home/fedora/zeppelin-0.8.2-bin-all/local-repo/spark, -g, spark] + > INFO [2021-02-11 10:31:30,280] ({pool-9-thread-1} RemoteInterpreterManagedProcess.java[callback]:123) - RemoteInterpreterServer Registered: CallbackInfo(host:10.10.0.88, port:39975) + > INFO [2021-02-11 10:31:30,282] ({pool-2-thread-3} RemoteInterpreter.java[call]:168) - Create RemoteInterpreter org.apache.zeppelin.spark.SparkInterpreter + > INFO [2021-02-11 10:31:30,336] ({pool-2-thread-3} RemoteInterpreter.java[call]:168) - Create RemoteInterpreter org.apache.zeppelin.spark.SparkSqlInterpreter + > INFO [2021-02-11 10:31:30,337] ({pool-2-thread-3} RemoteInterpreter.java[call]:168) - Create RemoteInterpreter org.apache.zeppelin.spark.DepInterpreter + > INFO [2021-02-11 10:31:30,342] ({pool-2-thread-3} RemoteInterpreter.java[call]:168) - Create RemoteInterpreter org.apache.zeppelin.spark.PySparkInterpreter + > INFO [2021-02-11 10:31:30,346] ({pool-2-thread-3} RemoteInterpreter.java[call]:168) - Create RemoteInterpreter org.apache.zeppelin.spark.IPySparkInterpreter + > INFO [2021-02-11 10:31:30,349] ({pool-2-thread-3} RemoteInterpreter.java[call]:168) - Create RemoteInterpreter org.apache.zeppelin.spark.SparkRInterpreter + > INFO [2021-02-11 10:31:30,350] ({pool-2-thread-3} RemoteInterpreter.java[call]:142) - Open RemoteInterpreter org.apache.zeppelin.spark.PySparkInterpreter + > INFO [2021-02-11 10:31:30,350] ({pool-2-thread-3} RemoteInterpreter.java[pushAngularObjectRegistryToRemote]:436) - Push local angular object registry from ZeppelinServer to remote interpreter group spark:shared_proc + > ess + > INFO [2021-02-11 10:32:38,857] ({pool-2-thread-3} NotebookServer.java[afterStatusChange]:2314) - Job 20201013-131649_1734629667 is finished successfully, status: FINISHED + > .... + > .... + > .... + > .... 
+ > INFO [2021-02-11 10:59:05,185] ({pool-2-thread-3} SchedulerFactory.java[jobStarted]:114) - Job 20201013-152110_1282917873 started by scheduler org.apache.zeppelin.interpreter.remote.RemoteInterpreter-spark:shared_process-shared_session + > INFO [2021-02-11 10:59:05,186] ({pool-2-thread-3} Paragraph.java[jobRun]:381) - Run paragraph [paragraph_id: 20201013-152110_1282917873, interpreter: spark.pyspark, note_id: 2FYW1HNED, user: gaiauser] + > WARN [2021-02-11 11:06:34,508] ({pool-2-thread-3} NotebookServer.java[afterStatusChange]:2316) - Job 20201013-152110_1282917873 is finished, status: ERROR, exception: null, result: %text ESC[0;31m---------------------------------------------------------------------------ESC[0m + > ESC[0;31mPy4JJavaErrorESC[0m Traceback (most recent call last) + > ESC[0;32mESC[0m in ESC[0;36mESC[0;34mESC[0m + > ESC[1;32m 6ESC[0m ESC[0;31m# instantiate a trained RF classifier, seeded for repeatability at this stage:ESC[0mESC[0;34mESC[0mESC[0;34mESC[0mESC[0;34mESC[0mESC[0m + > .... + > .... + > ESC[0;32m/opt/spark/python/lib/py4j-0.10.7-src.zip/py4j/protocol.pyESC[0m in ESC[0;36mget_return_valueESC[0;34m(answer, gateway_client, target_id, name)ESC[0m + > ESC[1;32m 326ESC[0m raise Py4JJavaError( + > ESC[1;32m 327ESC[0m ESC[0;34m"An error occurred while calling {0}{1}{2}.\n"ESC[0mESC[0;34m.ESC[0mESC[0;34mESC[0mESC[0;34mESC[0mESC[0m + > ESC[0;32m--> 328ESC[0;31m format(target_id, ".", name), value) + > ESC[0mESC[1;32m 329ESC[0m ESC[0;32melseESC[0mESC[0;34m:ESC[0mESC[0;34mESC[0mESC[0;34mESC[0mESC[0m + > ESC[1;32m 330ESC[0m raise Py4JError( + > + > ESC[0;31mPy4JJavaErrorESC[0m: An error occurred while calling o191.fit. + > : org.apache.spark.SparkException: Job aborted due to stage failure: Task 3226 in stage 35.0 failed 4 times, most recent failure: Lost task 3226.3 in stage 35.0 (TID 122005, worker05, executor 2): java.io.IOException: No space left on device + > at java.io.FileOutputStream.writeBytes(Native Method) + > at java.io.FileOutputStream.write(FileOutputStream.java:326) + > at org.apache.spark.storage.TimeTrackingOutputStream.write(TimeTrackingOutputStream.java:58) + > at java.io.BufferedOutputStream.flushBuffer(BufferedOutputStream.java:82) + > at java.io.BufferedOutputStream.flush(BufferedOutputStream.java:140) + > at net.jpountz.lz4.LZ4BlockOutputStream.finish(LZ4BlockOutputStream.java:260) + > at net.jpountz.lz4.LZ4BlockOutputStream.close(LZ4BlockOutputStream.java:190) + > at java.io.ObjectOutputStream$BlockDataOutputStream.close(ObjectOutputStream.java:1828) + > at java.io.ObjectOutputStream.close(ObjectOutputStream.java:742) + > at org.apache.spark.serializer.JavaSerializationStream.close(JavaSerializer.scala:57) + > at org.apache.spark.storage.DiskBlockObjectWriter.commitAndGet(DiskBlockObjectWriter.scala:173) + > at org.apache.spark.util.collection.ExternalSorter.writePartitionedFile(ExternalSorter.scala:701) + > at org.apache.spark.shuffle.sort.SortShuffleWriter.write(SortShuffleWriter.scala:71) + > at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99) + > at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55) + > at org.apache.spark.scheduler.Task.run(Task.scala:123) + > at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408) + > at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360) + > at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414) + > at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) + > at 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) + > at java.lang.Thread.run(Thread.java:748) + > + > Driver stacktrace: + > at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1925) + > at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1913) + > at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1912) + > at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59) + > at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48) + > at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1912) + > at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:948) + > at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:948) + > at scala.Option.foreach(Option.scala:257) + > at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:948) + > at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2146) + > at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2095) + > at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2084) + > at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49) + > at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:759) + > at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061) + > at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082) + > at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101) + > at org.apache.spark.SparkContext.runJob(SparkContext.scala:2126) + > at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:990) + > at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151) + > at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112) + > at org.apache.spark.rdd.RDD.withScope(RDD.scala:385) + > at org.apache.spark.rdd.RDD.collect(RDD.scala:989) + > at org.apache.spark.rdd.PairRDDFunctions$$anonfun$collectAsMap$1.apply(PairRDDFunctions.scala:743) + > at org.apache.spark.rdd.PairRDDFunctions$$anonfun$collectAsMap$1.apply(PairRDDFunctions.scala:742) + > at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151) + > at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112) + > at org.apache.spark.rdd.RDD.withScope(RDD.scala:385) + > at org.apache.spark.rdd.PairRDDFunctions.collectAsMap(PairRDDFunctions.scala:742) + > at org.apache.spark.ml.tree.impl.RandomForest$.findBestSplits(RandomForest.scala:567) + > at org.apache.spark.ml.tree.impl.RandomForest$.run(RandomForest.scala:201) + > at org.apache.spark.ml.classification.RandomForestClassifier$$anonfun$train$1.apply(RandomForestClassifier.scala:142) + > at org.apache.spark.ml.classification.RandomForestClassifier$$anonfun$train$1.apply(RandomForestClassifier.scala:120) + > at org.apache.spark.ml.util.Instrumentation$$anonfun$11.apply(Instrumentation.scala:185) + > at scala.util.Try$.apply(Try.scala:192) + > at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:185) + > at org.apache.spark.ml.classification.RandomForestClassifier.train(RandomForestClassifier.scala:120) + > at org.apache.spark.ml.classification.RandomForestClassifier.train(RandomForestClassifier.scala:46) + > at 
org.apache.spark.ml.Predictor.fit(Predictor.scala:118) + > at org.apache.spark.ml.Predictor.fit(Predictor.scala:82) + > at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) + > at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) + > at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) + > at java.lang.reflect.Method.invoke(Method.java:498) + > at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244) + > at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357) + > at py4j.Gateway.invoke(Gateway.java:282) + > at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132) + > at py4j.commands.CallCommand.execute(CallCommand.java:79) + > at py4j.GatewayConnection.run(GatewayConnection.java:238) + > at java.lang.Thread.run(Thread.java:748) + > + > Caused by: java.io.IOException: No space left on device + > at java.io.FileOutputStream.writeBytes(Native Method) + > at java.io.FileOutputStream.write(FileOutputStream.java:326) + > at org.apache.spark.storage.TimeTrackingOutputStream.write(TimeTrackingOutputStream.java:58) + > at java.io.BufferedOutputStream.flushBuffer(BufferedOutputStream.java:82) + > at java.io.BufferedOutputStream.flush(BufferedOutputStream.java:140) + > at net.jpountz.lz4.LZ4BlockOutputStream.finish(LZ4BlockOutputStream.java:260) + > at net.jpountz.lz4.LZ4BlockOutputStream.close(LZ4BlockOutputStream.java:190) + > at java.io.ObjectOutputStream$BlockDataOutputStream.close(ObjectOutputStream.java:1828) + > at java.io.ObjectOutputStream.close(ObjectOutputStream.java:742) + > at org.apache.spark.serializer.JavaSerializationStream.close(JavaSerializer.scala:57) + > at org.apache.spark.storage.DiskBlockObjectWriter.commitAndGet(DiskBlockObjectWriter.scala:173) + > at org.apache.spark.util.collection.ExternalSorter.writePartitionedFile(ExternalSorter.scala:701) + > at org.apache.spark.shuffle.sort.SortShuffleWriter.write(SortShuffleWriter.scala:71) + > at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99) + > at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55) + > at org.apache.spark.scheduler.Task.run(Task.scala:123) + > at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408) + > at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360) + > at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414) + > at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) + > at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) + > ... 1 more + > + > INFO [2021-02-11 11:06:34,565] ({pool-2-thread-3} VFSNotebookRepo.java[save]:196) - Saving note:2FYW1HNED + > INFO [2021-02-11 11:06:34,572] ({pool-2-thread-3} SchedulerFactory.java[jobFinished]:120) - Job 20201013-152110_1282917873 finished by scheduler org.apache.zeppelin.interpreter.remote.RemoteInterpreter-spark:shared_process-shared_session + > INFO [2021-02-11 11:30:45,031] ({SessionValidationThread-1} AbstractValidatingSessionManager.java[validateSessions]:275) - Validating all active sessions... + > INFO [2021-02-11 11:30:45,032] ({SessionValidationThread-1} AbstractValidatingSessionManager.java[validateSessions]:308) - Finished session validation. No sessions were stopped. + + + Lots of information in that .. + + - The exception was reported by (TID 122005, worker05, executor 2) + - I think the out of space was on worker05, not the Zeppelin node. 
+ + - The stack trace suggests that RandomForestClassifier understands org.apache.spark.rdd.RDD + - Which means at least part of the RandomForestClassifier training is offloaded to the workers. + + > at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59) + > at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48) + > at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1912) + > .... + > at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:948) + > at scala.Option.foreach(Option.scala:257) + > at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:948) + > .... + > at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2084) + > at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49) + > at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:759) + > at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061) + > .... + > at org.apache.spark.SparkContext.runJob(SparkContext.scala:2126) + > .... + > at org.apache.spark.rdd.RDD.withScope(RDD.scala:385) + > at org.apache.spark.rdd.RDD.collect(RDD.scala:989) + > .... + > at org.apache.spark.ml.tree.impl.RandomForest$.findBestSplits(RandomForest.scala:567) + > .... + > at org.apache.spark.ml.util.Instrumentation$$anonfun$11.apply(Instrumentation.scala:185) + > at scala.util.Try$.apply(Try.scala:192) + > at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:185) + > at org.apache.spark.ml.classification.RandomForestClassifier.train(RandomForestClassifier.scala:120) + > at org.apache.spark.ml.classification.RandomForestClassifier.train(RandomForestClassifier.scala:46) + > at org.apache.spark.ml.Predictor.fit(Predictor.scala:118) + + +# ----------------------------------------------------- +# Check the disc space on zeppelin +#[user@zeppelin] + + ls -1 /var/spark/temp/ + + > blockmgr-6e40f938-4cea-4e51-aa7d-7b8e8d957fd9 + > spark-02a20dcf-c44b-46fd-aba2-84dcd4092b77 + + + du -h -d 1 /var/spark/temp/ + + > 220K /var/spark/temp/spark-02a20dcf-c44b-46fd-aba2-84dcd4092b77 + > 168K /var/spark/temp/blockmgr-6e40f938-4cea-4e51-aa7d-7b8e8d957fd9 + > 392K /var/spark/temp/ + + +# ----------------------------------------------------- +# Check the disc space on worker05 +#[user@zeppelin] + + + ssh worker05 \ + ' + hostname + date + echo + ls -1 /var/spark/temp/ + echo + du -h -d 1 /var/spark/temp/ + ' + + > gaia-dev-20210211-worker05.novalocal + > Thu 11 Feb 12:16:18 UTC 2021 + > + > ls: cannot access '/var/spark/temp/': No such file or directory + > + > du: cannot access '/var/spark/temp/': No such file or directory + + # + # When we changed this back down to a small node we didn't create the spark temp directory. 
+ # + + + ssh worker05 \ + ' + hostname + date + echo + ls -1 /tmp/ + echo + du -h -d 1 /tmp/ + ' + + > gaia-dev-20210211-worker05.novalocal + > Thu 11 Feb 12:16:49 UTC 2021 + > + > hadoop-fedora + > hadoop-fedora-datanode.pid + > hadoop-fedora-nodemanager.pid + > hsperfdata_fedora + > hsperfdata_root + > jetty-0.0.0.0-8042-node-_-any-5267485435957391381.dir + > jetty-localhost-33243-datanode-_-any-3555236917512612600.dir + > systemd-private-e9fcce57f1be40acb5b15c979c850494-chronyd.service-0AFL7F + > systemd-private-e9fcce57f1be40acb5b15c979c850494-dbus-broker.service-ej71vh + > + > 4.0K /tmp/jetty-localhost-33243-datanode-_-any-3555236917512612600.dir + > 4.0K /tmp/.ICE-unix + > 4.0K /tmp/.X11-unix + > 4.0K /tmp/.Test-unix + > du: cannot read directory '/tmp/systemd-private-e9fcce57f1be40acb5b15c979c850494-chronyd.service-0AFL7F': Permission denied + > 4.0K /tmp/systemd-private-e9fcce57f1be40acb5b15c979c850494-chronyd.service-0AFL7F + > 8.0K /tmp/jetty-0.0.0.0-8042-node-_-any-5267485435957391381.dir + > 4.0K /tmp/.font-unix + > 100K /tmp/hsperfdata_fedora + > 4.0K /tmp/.XIM-unix + > 14G /tmp/hadoop-fedora + > 4.0K /tmp/systemd-private-e9fcce57f1be40acb5b15c979c850494-dbus-broker.service-ej71vh + > 36K /tmp/hsperfdata_root + > 14G /tmp/ + > du: cannot read directory '/tmp/systemd-private-e9fcce57f1be40acb5b15c979c850494-dbus-broker.service-ej71vh': Permission denied + + + ssh worker05 \ + ' + hostname + date + echo + ls -1 /tmp/hadoop-fedora + echo + du -h -d 1 /tmp/hadoop-fedora + ' + + > gaia-dev-20210211-worker05.novalocal + > Thu 11 Feb 12:18:19 UTC 2021 + > + > nm-local-dir + > + > 14G /tmp/hadoop-fedora/nm-local-dir + > 14G /tmp/hadoop-fedora + + + ssh worker05 \ + ' + hostname + date + echo + du -h /tmp/hadoop-fedora + ' + + > 4.0K /tmp/hadoop-fedora/nm-local-dir/nmPrivate/application_1613027823151_0001 + > 8.0K /tmp/hadoop-fedora/nm-local-dir/nmPrivate + > .... .... + > 284K /tmp/hadoop-fedora/nm-local-dir/usercache/fedora/filecache/12 + > 592K /tmp/hadoop-fedora/nm-local-dir/usercache/fedora/filecache/11 + > 231M /tmp/hadoop-fedora/nm-local-dir/usercache/fedora/filecache/13/__spark_libs__4343915086399681065.zip + > 231M /tmp/hadoop-fedora/nm-local-dir/usercache/fedora/filecache/13 + > .... .... + > 2.9M /tmp/hadoop-fedora/nm-local-dir/usercache/fedora/filecache/10/sparkr.zip + > 2.9M /tmp/hadoop-fedora/nm-local-dir/usercache/fedora/filecache/10 + > 52K /tmp/hadoop-fedora/nm-local-dir/usercache/fedora/filecache/14 + > 235M /tmp/hadoop-fedora/nm-local-dir/usercache/fedora/filecache + > .... .... 
+    > 51M     /tmp/hadoop-fedora/nm-local-dir/usercache/fedora/appcache/application_1613027823151_0001/blockmgr-690bd150-bff9-4542-8041-9f73d93d19dc/22
+    > 64M     /tmp/hadoop-fedora/nm-local-dir/usercache/fedora/appcache/application_1613027823151_0001/blockmgr-690bd150-bff9-4542-8041-9f73d93d19dc/1c
+    > 66M     /tmp/hadoop-fedora/nm-local-dir/usercache/fedora/appcache/application_1613027823151_0001/blockmgr-690bd150-bff9-4542-8041-9f73d93d19dc/24
+    > 414M    /tmp/hadoop-fedora/nm-local-dir/usercache/fedora/appcache/application_1613027823151_0001/blockmgr-690bd150-bff9-4542-8041-9f73d93d19dc/30
+    > 13G     /tmp/hadoop-fedora/nm-local-dir/usercache/fedora/appcache/application_1613027823151_0001/blockmgr-690bd150-bff9-4542-8041-9f73d93d19dc
+    > 4.0K    /tmp/hadoop-fedora/nm-local-dir/usercache/fedora/appcache/application_1613027823151_0001/filecache
+    > 13G     /tmp/hadoop-fedora/nm-local-dir/usercache/fedora/appcache/application_1613027823151_0001
+    > 13G     /tmp/hadoop-fedora/nm-local-dir/usercache/fedora/appcache
+    > 14G     /tmp/hadoop-fedora/nm-local-dir/usercache/fedora
+    > 14G     /tmp/hadoop-fedora/nm-local-dir/usercache
+    > 4.0K    /tmp/hadoop-fedora/nm-local-dir/filecache
+    > 14G     /tmp/hadoop-fedora/nm-local-dir
+    > 14G     /tmp/hadoop-fedora
+
+    Lots of information in that ..
+
+    - By the time the job gets here it is a Hadoop job, NOT a Spark job.
+    - The temp files are owned by the Hadoop node-manager and the Hadoop block-manager.
+    - To move them to another location we should use the Hadoop temp settings, not the Spark temp settings.
+
+
+# -----------------------------------------------------
+# -----------------------------------------------------
+
+
+    Things we have learned so far.
+
+    Even if we don't create the separate disc mounts, we should still create the spark
+    and hadoop temp directories (see the sketch at the end of this note).
+
+    Move the mount paths from a global setting to a host-specific setting.
+    Always create the same directories:
+
+        /var/spark/temp
+        /var/spark/data
+
+        /var/hadoop/temp
+        /var/hadoop/data
+
+    If this is a medium node, and the host config has mount paths for them,
+    then change some of them into links.
+
+    The master node isn't doing much.
+    Possibly managing the HDFS namenode ?
+    Is it actually managing the Yarn scheduling ?
+    Could all this be done by a tiny VM ?
+
+    The Zeppelin node is running the Spark interpreter.
+    The Spark interpreter is scheduling the Spark jobs.
+    The Spark interpreter aggregates the notebook results.
+
+    The Spark interpreter uses 392K of space in /var/spark/temp.
+    This could still probably be a small node.
+    The main cpu use is the ipython server running one thread at 100%.
+    The rest of the cores are idle most of the time.
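+
+
+# -----------------------------------------------------
+# Create the missing temp directories by hand.
+#
+# Untested sketch, not part of the original run - the real fix belongs
+# in the Ansible playbooks. Assumes the worker01..worker06 host aliases
+# in our SSH config, and passwordless sudo for the fedora user.
+#
+# Note: the 14G cache lands in /tmp/hadoop-fedora because Hadoop's
+# hadoop.tmp.dir defaults to /tmp/hadoop-${user.name}; pointing that
+# (or yarn.nodemanager.local-dirs) at /var/hadoop/temp is the
+# Hadoop-side setting to change.
+#[user@zeppelin]
+
+    for nodename in worker01 worker02 worker03 worker04 worker05 worker06
+    do
+        ssh "${nodename:?}" \
+            '
+            sudo mkdir -p /var/spark/temp /var/spark/data /var/hadoop/temp /var/hadoop/data
+            sudo chown fedora:fedora /var/spark/temp /var/spark/data /var/hadoop/temp /var/hadoop/data
+            '
+    done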
+
+
+
+
+
+
+
+
+
+

From ebdb26d822605b0040491e85dbf49ba2046a23ea Mon Sep 17 00:00:00 2001
From: "zrq-github@metagrid.co.uk"
Date: Sat, 13 Feb 2021 06:32:19 +0000
Subject: [PATCH 07/27] Notes on git branches

---
 notes/zrq/20210211-02-git-branches.txt | 457 +++++++++++++++++++++++++
 1 file changed, 457 insertions(+)
 create mode 100644 notes/zrq/20210211-02-git-branches.txt

diff --git a/notes/zrq/20210211-02-git-branches.txt b/notes/zrq/20210211-02-git-branches.txt
new file mode 100644
index 00000000..022dc660
--- /dev/null
+++ b/notes/zrq/20210211-02-git-branches.txt
@@ -0,0 +1,457 @@
+#
+#
+#
+# Copyright (c) 2021, ROE (http://www.roe.ac.uk/)
+#
+# This information is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This information is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+#
+#
+#zrq-notes-time
+#zrq-notes-indent
+#zrq-notes-crypto
+#zrq-notes-ansible
+#zrq-notes-osformat
+#zrq-notes-zeppelin
+#
+
+    Target:
+
+        Transfer work on the 20210206-zrq-working branch onto smaller task-specific branches.
+        We spent a while adding a mixture of changes to the working branch.
+        Needed to step back and commit the changes as separate task-specific PRs.
+
+    Result:
+
+        Work in progress ...
+
+# -----------------------------------------------------
+# Create a copy of the local working branch.
+#[user@desktop]
+
+    source "${HOME}/aglais.env"
+    pushd "${AGLAIS_HOME}"
+
+    pushd github-zrq
+
+    git add .
+    git commit -m "Adding everything to the working branch"
+    git push
+
+    popd
+
+    cp -a github-zrq github-working
+    mv github-zrq github-backup
+
+
+    popd
+
+# -----------------------------------------------------
+# Update the working copy with merged PRs from upstream.
+#[user@desktop]
+
+    source "${HOME}/aglais.env"
+    pushd "${AGLAIS_HOME}"
+
+    pushd github-working
+
+    git checkout master
+
+    > Switched to branch 'master'
+    > Your branch is up to date with 'origin/master'.
+
+    git pull
+
+    > Already up to date.
+
+    git fetch upstream
+
+    > remote: Enumerating objects: 7, done.
+    > remote: Counting objects: 100% (7/7), done.
+    > remote: Total 26 (delta 7), reused 7 (delta 7), pack-reused 19
+    > Unpacking objects: 100% (26/26), 45.43 KiB | 186.00 KiB/s, done.
+    > From github.com:wfau/aglais
+    >    7f642cd..01c7c74  master     -> upstream/master
+
+
+    git merge upstream/master
+
+    > Updating 7f642cd..01c7c74
+    > Fast-forward
+    >  experiments/hadoop-yarn/ansible/01-create-keypair.yml |  2 +-
+    >  experiments/hadoop-yarn/ansible/02-create-gateway.yml |  2 +-
+    >  ....
+    >  ....
+    >  create mode 100644 notes/zrq/20210205-02-resources.txt
+    >  create mode 100644 notes/zrq/20210206-01-git-cherry-pick.txt
+
+
+    git push
+
+    > Total 0 (delta 0), reused 0 (delta 0), pack-reused 0
+    > To github.com:Zarquan/aglais.git
+    >    01c7c74..f46bc2b  master -> master
+
+
+    git status
+
+    > On branch master
+    > Your branch is up to date with 'origin/master'.
+
+
+    popd
+    popd
+
+
+# -----------------------------------------------------
+# Delete merged branches.
+#[user@desktop]
+
+    source "${HOME}/aglais.env"
+    pushd "${AGLAIS_HOME}"
+
+        pushd github-working
+
+            git branch
+
+            >   20210113-zrq-source-build
+            >   20210125-zrq-format-notes
+            >   20210125-zrq-kubernetes-deploy
+            >   20210127-zrq-error-trap
+            >   20210127-zrq-oauth
+            >   20210127-zrq-working
+            >   20210205-zrq-deployname
+            >   20210205-zrq-error-trap
+            >   20210205-zrq-notes
+            >   20210205-zrq-testing
+            >   20210205-zrq-timeout
+            >   20210206-zrq-working
+            > * master
+
+            git branch -d 20210125-zrq-format-notes
+
+            > Deleted branch 20210125-zrq-format-notes (was fd1e449).
+
+            git branch -d 20210125-zrq-kubernetes-deploy
+
+            > Deleted branch 20210125-zrq-kubernetes-deploy (was 3ab3b55).
+
+            git branch -d 20210127-zrq-error-trap
+
+            > Deleted branch 20210127-zrq-error-trap (was 1b80704).
+
+            git branch -d 20210127-zrq-oauth
+
+            > Deleted branch 20210127-zrq-oauth (was d5af1da).
+
+            git branch -d 20210127-zrq-working
+
+            > warning: deleting branch '20210127-zrq-working' that has been merged to
+            >          'refs/remotes/origin/20210127-zrq-working', but not yet merged to HEAD.
+            >
+            > Deleted branch 20210127-zrq-working (was e12e24c).
+
+            git branch -d 20210205-zrq-deployname
+
+            > Deleted branch 20210205-zrq-deployname (was 64d0f2c).
+
+            git branch -d 20210205-zrq-error-trap
+
+            > Deleted branch 20210205-zrq-error-trap (was 1b80704).
+
+            git branch -d 20210205-zrq-notes
+
+            > Deleted branch 20210205-zrq-notes (was 9c73277).
+
+            git branch -d 20210205-zrq-testing
+
+            > Deleted branch 20210205-zrq-testing (was c148e78).
+
+            git branch -d 20210205-zrq-timeout
+
+            > Deleted branch 20210205-zrq-timeout (was 9c73277).
+
+            git branch
+
+            >   20210113-zrq-source-build
+            >   20210206-zrq-working
+            > * master
+
+        popd
+    popd
+
+
+# -----------------------------------------------------
+# Create a new branch for the gateway changes.
+#[user@desktop]
+
+    source "${HOME}/aglais.env"
+    pushd "${AGLAIS_HOME}"
+
+        pushd github-working
+
+            nextbranch=$(date '+%Y%m%d')-zrq-gateway
+
+            git checkout master
+
+            > Already on 'master'
+            > Your branch is up to date with 'origin/master'.
+
+            git checkout -b "${nextbranch:?}"
+
+            > Switched to a new branch '20210211-zrq-gateway'
+
+            git push --set-upstream origin "${nextbranch:?}"
+
+            > Total 0 (delta 0), reused 0 (delta 0), pack-reused 0
+            > remote:
+            > remote: Create a pull request for '20210211-zrq-gateway' on GitHub by visiting:
+            > remote:      https://github.com/Zarquan/aglais/pull/new/20210211-zrq-gateway
+            > remote:
+            > To github.com:Zarquan/aglais.git
+            >  * [new branch]      20210211-zrq-gateway -> 20210211-zrq-gateway
+            > Branch '20210211-zrq-gateway' set up to track remote branch '20210211-zrq-gateway' from 'origin'.
+
+        popd
+    popd
+
+
+# -----------------------------------------------------
+# Transfer the changes to remove the gateway node.
+#[user@desktop]
+
+    source "${HOME}/aglais.env"
+    pushd "${AGLAIS_HOME}"
+
+        meld github-backup github-working &
+
+        pushd github-working
+
+            meld . &
+
+            git branch
+
+            >   20210113-zrq-source-build
+            >   20210206-zrq-working
+            > * 20210211-zrq-gateway
+            >   master
+
+            git add .
+
+            git commit -m "Removed gateway node"
+
+            > [20210211-zrq-gateway ffe2137] Removed gateway node
+            >  14 files changed, 58 insertions(+), 61 deletions(-)
+
+            git push
+
+            > Enumerating objects: 39, done.
+            > Counting objects: 100% (39/39), done.
+            > Delta compression using up to 4 threads
+            > Compressing objects: 100% (20/20), done.
+            > Writing objects: 100% (20/20), 1.84 KiB | 470.00 KiB/s, done.
+            > Total 20 (delta 17), reused 0 (delta 0), pack-reused 0
+            > remote: Resolving deltas: 100% (17/17), completed with 16 local objects.
+            > To github.com:Zarquan/aglais.git
+            >    f46bc2b..ffe2137  20210211-zrq-gateway -> 20210211-zrq-gateway
+
+        popd
+    popd
+
+
+# -----------------------------------------------------
+# Create a new branch with a fix to delete-all.
+# Note - this branch follows on from the previous branch, carrying forward the changes.
+#[user@desktop]
+
+    source "${HOME}/aglais.env"
+    pushd "${AGLAIS_HOME}"
+
+        pushd github-working
+
+            git status
+
+            > On branch 20210211-zrq-gateway
+            > Your branch is up to date with 'origin/20210211-zrq-gateway'.
+
+            nextbranch=$(date '+%Y%m%d')-zrq-delete-fix
+
+            git checkout -b "${nextbranch:?}"
+
+            git push --set-upstream origin "${nextbranch:?}"
+
+            meld ../github-backup . &
+
+            git status
+
+            git add experiments/openstack/bin/delete-all.sh
+            git commit -m "Fix to catch all the keys created by create-all"
+
+            git add notes/zrq/20210206-01-git-cherry-pick.txt
+            git commit -m "Finish notes on cherry picking"
+
+            git push
+
+        popd
+    popd
+
+
+# -----------------------------------------------------
+# Create a new branch to move the Hadoop and Spark vars into the hosts file.
+# Note - this branch follows on from the previous branch, carrying forward the changes.
+#[user@desktop]
+
+    source "${HOME}/aglais.env"
+    pushd "${AGLAIS_HOME}"
+
+        pushd github-working
+
+            git status
+
+            > On branch 20210211-zrq-delete-fix
+            > Your branch is up to date with 'origin/20210211-zrq-delete-fix'.
+
+            nextbranch=$(date '+%Y%m%d')-zrq-move-vars
+
+            git checkout -b "${nextbranch:?}"
+
+            git push --set-upstream origin "${nextbranch:?}"
+
+            meld ../github-backup . &
+
+            git status
+
+            git add .
+
+            git commit -m "Moved Hadoop, Spark and Zeppelin vars into hosts.yml"
+
+            > [20210211-zrq-move-vars 2432401] Moved Hadoop, Spark and Zeppelin vars into hosts.yml
+            >  11 files changed, 50 insertions(+), 71 deletions(-)
+
+            git push
+
+        popd
+    popd
+
+
+# -----------------------------------------------------
+# Create a new branch to fix the issue with Fedora updates.
+# Note - this branch follows on from the previous branch, carrying forward the changes.
+#[user@desktop]
+
+    source "${HOME}/aglais.env"
+    pushd "${AGLAIS_HOME}"
+
+        pushd github-working
+
+            git status
+
+            > On branch 20210211-zrq-move-vars
+            > Your branch is up to date with 'origin/20210211-zrq-move-vars'.
+
+            nextbranch=$(date '+%Y%m%d')-zrq-fedora-updates
+
+            git checkout -b "${nextbranch:?}"
+
+            git push --set-upstream origin "${nextbranch:?}"
+
+            meld ../github-backup . &
+
+            git status
+
+            git add .
+
+            git commit -m "Fix a problem with Fedora updates"
+
+            git push
+
+        popd
+    popd
+
+
+# -----------------------------------------------------
+# Create a new branch to add misc notes.
+# Note - this branch follows on from the master branch, nothing to carry forward.
+#[user@desktop]
+
+    source "${HOME}/aglais.env"
+    pushd "${AGLAIS_HOME}"
+
+        pushd github-working
+
+            git checkout master
+
+            nextbranch=$(date '+%Y%m%d')-zrq-notes
+
+            git checkout -b "${nextbranch:?}"
+
+            meld ../github-backup . &
+
+            git status
+
+            git add .
+
+            git commit -m "Added new notes"
+
+            git push --set-upstream origin "${nextbranch:?}"
+
+        popd
+    popd
+
+
+# -----------------------------------------------------
+# Create a new branch to add misc notes.
+# Note - this branch follows on from a previous branch, carrying forward the changes.
+#[user@desktop]
+
+    source "${HOME}/aglais.env"
+    pushd "${AGLAIS_HOME}"
+
+        pushd github-working
+
+            git checkout 20210211-zrq-fedora-updates
+
+            nextbranch=$(date '+%Y%m%d')-zrq-volume-mounts
+
+            git checkout -b "${nextbranch:?}"
+
+            meld ../github-backup . &
+
+            git status
+
+            meld . &
+
+            git add .
+
+            git commit -m "Volume mounts for temp space"
+
+            git push --set-upstream origin "${nextbranch:?}"
+
+        popd
+    popd
+
From fb30bea8e69af97888df3451606f784563722b90 Mon Sep 17 00:00:00 2001
From: "zrq-github@metagrid.co.uk" <zrq-github@metagrid.co.uk>
Date: Sat, 13 Feb 2021 06:36:06 +0000
Subject: [PATCH 08/27] Configuring Hadoop and HDFS directories

---
 .../ansible/12-config-hadoop-core.yml         | 13 ++++++++++++
 .../ansible/12-config-ssh-access.yml          |  7 -------
 .../ansible/13-config-hdfs-namenode.yml       | 12 +++++------
 .../ansible/14-config-hdfs-workers.yml        | 11 ++++++++--
 experiments/hadoop-yarn/ansible/hosts.yml     | 20 +++++++++++--------
 .../ansible/tasks/create-linked.yml           |  2 --
 6 files changed, 40 insertions(+), 25 deletions(-)

diff --git a/experiments/hadoop-yarn/ansible/12-config-hadoop-core.yml b/experiments/hadoop-yarn/ansible/12-config-hadoop-core.yml
index 2c4e50ad..aa6e6693 100644
--- a/experiments/hadoop-yarn/ansible/12-config-hadoop-core.yml
+++ b/experiments/hadoop-yarn/ansible/12-config-hadoop-core.yml
@@ -26,6 +26,13 @@

   tasks:

+    - name: "Create Hadoop temp directory"
+      include_tasks: "tasks/create-linked.yml"
+      vars:
+        linkdest: "{{hdtempdest}}"
+        linkpath: "{{hdtemplink}}"
+        linkuser: "{{hduser}}"
+
     # https://hadoop.apache.org/docs/r3.2.1/hadoop-project-dist/hadoop-common/ClusterSetup.html#Configuring_the_Hadoop_Daemons
     # https://hadoop.apache.org/docs/r3.2.1/hadoop-project-dist/hadoop-common/core-default.html
     - name: "Configure [{{hdhome}}/etc/hadoop/core-site.xml]"
@@ -51,4 +58,10 @@
             <value>hdfs://{{hdhost}}:9000</value>
+
+          <property>
+            <name>hadoop.tmp.dir</name>
+            <value>{{hdtemplink}}</value>
+          </property>
+

diff --git a/experiments/hadoop-yarn/ansible/12-config-ssh-access.yml b/experiments/hadoop-yarn/ansible/12-config-ssh-access.yml
index 7fd83bca..20a372f0 100644
--- a/experiments/hadoop-yarn/ansible/12-config-ssh-access.yml
+++ b/experiments/hadoop-yarn/ansible/12-config-ssh-access.yml
@@ -102,13 +102,6 @@
 - name: "Configure Hadoop [workers] on master nodes"
   hosts: masters:zeppelin
   gather_facts: false
-  vars:
-    hdname: "hadoop-3.1.3"
-    hdbase: "/opt"
-    hdhome: "/opt/hadoop"
-    hddata: "/var/local/hadoop"
-    hdhost: "{{groups['masters'][0]}}"
-    hduser: "{{hostvars[inventory_hostname].login}}"

   tasks:

diff --git a/experiments/hadoop-yarn/ansible/13-config-hdfs-namenode.yml b/experiments/hadoop-yarn/ansible/13-config-hdfs-namenode.yml
index 7bfe899b..390d6656 100644
--- a/experiments/hadoop-yarn/ansible/13-config-hdfs-namenode.yml
+++ b/experiments/hadoop-yarn/ansible/13-config-hdfs-namenode.yml
@@ -33,7 +33,7 @@
       vars:
         linkdest: "{{hdfsmetadest}}"
        linkpath: "{{hdfsmetalink}}"
-        linkuser: "{{hduser}}"
+        linkuser: "{{hdfsuser}}"

     - name: "Create [{{hdfsimage}}]"
       become: true
@@ -42,8 +42,8 @@
         mode: 'u=rwx,g=rwxs,o=rx'
         state: directory
         recurse: yes
-        owner: "{{hduser}}"
-        group: "{{hduser}}"
+        owner: "{{hdfsuser}}"
+        group: "{{hdfsuser}}"

     # https://hadoop.apache.org/docs/r3.2.1/hadoop-project-dist/hadoop-common/ClusterSetup.html#Configuring_the_Hadoop_Daemons
     # https://hadoop.apache.org/docs/r3.2.1/hadoop-project-dist/hadoop-hdfs/hdfs-default.xml
     - name: "Configure [{{hdhome}}/etc/hadoop/hdfs-site.xml]"
@@ -56,7 +56,7 @@
       block: |
@@ -87,7 +87,7 @@
@@ -117,7 +117,7 @@
             <name>dfs.client.use.datanode.hostname</name>
             <value>true</value>
-
+
             <name>dfs.datanode.use.datanode.hostname</name>
            <value>true</value>

diff --git a/experiments/hadoop-yarn/ansible/14-config-hdfs-workers.yml b/experiments/hadoop-yarn/ansible/14-config-hdfs-workers.yml
index c3dd25d9..1842c580 100644
--- a/experiments/hadoop-yarn/ansible/14-config-hdfs-workers.yml
+++ b/experiments/hadoop-yarn/ansible/14-config-hdfs-workers.yml
@@ -31,7 +31,14 @@
       vars:
         linkdest: "{{hdfsdatadest}}"
         linkpath: "{{hdfsdatalink}}"
-        linkuser: "{{hduser}}"
+        linkuser: "{{hdfsuser}}"
+
+    - name: "Create HDFS logs directory"
+      include_tasks: "tasks/create-linked.yml"
+      vars:
+        linkdest: "{{hdfslogsdest}}"
+        linkpath: "{{hdfslogslink}}"
+        linkuser: "{{hdfsuser}}"

     # https://hadoop.apache.org/docs/r3.2.1/hadoop-project-dist/hadoop-common/ClusterSetup.html#Configuring_the_Hadoop_Daemons
     - name: "Configure [{{hdhome}}/etc/hadoop/hdfs-site.xml]"
@@ -46,7 +53,7 @@
            | If this is a comma-delimited list of directories, then data will be stored in all named directories, typically on different devices.
            | The directories should be tagged with corresponding storage types ([SSD]/[DISK]/[ARCHIVE]/[RAM_DISK]) for HDFS storage policies.
            | The default storage type will be DISK if the directory does not have a storage type tagged explicitly.
-           | Directories that do not exist will be created if local filesystem permission allows. 
+           | Directories that do not exist will be created if local filesystem permission allows.
            +-->
            <name>dfs.datanode.data.dir</name>

diff --git a/experiments/hadoop-yarn/ansible/hosts.yml b/experiments/hadoop-yarn/ansible/hosts.yml
index 9035f5f0..2c1804ef 100644
--- a/experiments/hadoop-yarn/ansible/hosts.yml
+++ b/experiments/hadoop-yarn/ansible/hosts.yml
@@ -61,12 +61,16 @@ all:
             hddatalink: "/var/hadoop/data"
             hddatadest: "/mnt/cinder/vdc/hadoop/data"

+            hdtemplink: "/var/hadoop/temp"
+            hdtempdest: "/mnt/local/vdb/hadoop/temp"
+
             hdlogslink: "/var/hadoop/logs"
             hdlogsdest: "/mnt/cinder/vdc/hadoop/logs"

             # HDFS vars
             hdfsconf: "/var/hdfs/conf"
+            hdfsuser: "fedora"

             hdfsmetalink: "/var/hdfs/meta"
             hdfsmetadest: "/mnt/cinder/vdc/hdfs/meta"
@@ -127,19 +131,19 @@ all:
     workers:
       hosts:
-        worker[01:06]:
+        worker[01:04]:
       vars:
         login: 'fedora'
         image: 'Fedora-30-1.2'
-        flavor: 'general.v1.small'
+        flavor: 'general.v1.medium'
         discs:
-#          - type: 'local'
-#            format: 'ext4'
-#            mntpath: "/mnt/local/vdb"
-#            devname: 'vdb'
+          - type: 'local'
+            format: 'ext4'
+            mntpath: "/mnt/local/vdb"
+            devname: 'vdb'
           - type: 'cinder'
             size: 512
             format: 'btrfs'
-            mntpath: "/mnt/cinder/vdb"
-            devname: 'vdb'
+            mntpath: "/mnt/cinder/vdc"
+            devname: 'vdc'

diff --git a/experiments/hadoop-yarn/ansible/tasks/create-linked.yml b/experiments/hadoop-yarn/ansible/tasks/create-linked.yml
index 1c137b86..1da47c38 100644
--- a/experiments/hadoop-yarn/ansible/tasks/create-linked.yml
+++ b/experiments/hadoop-yarn/ansible/tasks/create-linked.yml
@@ -26,7 +26,6 @@
         path: "{{linkdest | dirname}}"
         mode: 'u=rwx,g=rwxs,o=rx'
         state: directory
-        recurse: yes
         owner: 'root'
         group: 'root'
@@ -46,7 +45,6 @@
         path: "{{linkpath | dirname}}"
         mode: 'u=rwx,g=rwxs,o=rx'
         state: directory
-        recurse: yes
         owner: 'root'
         group: 'root'

From 7ba885d16c250a7c0191484a042af0cc849a210a Mon Sep 17 00:00:00 2001
From: "zrq-github@metagrid.co.uk" <zrq-github@metagrid.co.uk>
Date: Sat, 13 Feb 2021 06:37:24 +0000
Subject: [PATCH 09/27] Performance optimizations

---
 .../ansible/16-config-yarn-masters.yml        |   37 +-
 .../ansible/17-config-yarn-workers.yml        |   61 +-
 .../ansible/22-config-spark-master.yml        |   32 +-
 notes/zrq/20210211-03-ansible-deploy.txt      | 1016 +++++++++++++++++
 4 files changed, 1095 insertions(+), 51 deletions(-)
 create mode 100644 notes/zrq/20210211-03-ansible-deploy.txt
diff --git a/experiments/hadoop-yarn/ansible/16-config-yarn-masters.yml b/experiments/hadoop-yarn/ansible/16-config-yarn-masters.yml
index e5a9fa76..f2d3c00f 100644
--- a/experiments/hadoop-yarn/ansible/16-config-yarn-masters.yml
+++ b/experiments/hadoop-yarn/ansible/16-config-yarn-masters.yml
@@ -55,7 +55,7 @@
             <name>yarn.resourcemanager.hostname</name>
@@ -64,7 +64,7 @@
             <name>yarn.scheduler.maximum-allocation-mb</name>
-            <value>20000</value>
+            <value>43008</value>
-
-
-
-
-            <name>yarn.scheduler.maximum-allocation-mb</name>
-            <value>20000</value>
-
-
-
-
-            <name>yarn.scheduler.minimum-allocation-mb</name>
-            <value>2000</value>
+            <value>14336</value>
-
            +-->

     #
     # CapacityScheduler config.
@@ -161,7 +154,7 @@

diff --git a/experiments/hadoop-yarn/ansible/17-config-yarn-workers.yml b/experiments/hadoop-yarn/ansible/17-config-yarn-workers.yml
index 7490cdc4..a6330d76 100644
--- a/experiments/hadoop-yarn/ansible/17-config-yarn-workers.yml
+++ b/experiments/hadoop-yarn/ansible/17-config-yarn-workers.yml
@@ -54,30 +54,61 @@
             <name>yarn.resourcemanager.hostname</name>
             <value>{{hdhost}}</value>
-
+
             <name>yarn.scheduler.maximum-allocation-mb</name>
-            <value>20000</value>
+            <value>43008</value>
+
+
+
+            <name>yarn.scheduler.minimum-allocation-mb</name>
+            <value>14336</value>

             <name>yarn.nodemanager.resource.memory-mb</name>
-            <value>20000</value>
+            <value>43008</value>
+
+            <name>yarn.nodemanager.resource.cpu-vcores</name>
+            <value>13</value>
+
+
+
             <name>yarn.scheduler.maximum-allocation-vcores</name>
             <value>48</value>
@@ -89,8 +120,8 @@
             <name>yarn.nodemanager.aux-services</name>
@@ -141,10 +172,10 @@
            | HADOOP_CONF_DIR,
            | HADOOP_HDFS_HOME,
            | HADOOP_YARN_HOME,
-           | HADOOP_MAPRED_HOME, 
+           | HADOOP_MAPRED_HOME,
            | HADOOP_COMMON_HOME,
            | CLASSPATH_PREPEND_DISTCACHE
-           | 
+           |
             <name>yarn.nodemanager.env-whitelist</name>
@@ -159,7 +190,7 @@
             <value>false</value>
            +-->
-
+
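
    A quick sanity check on the numbers in this patch, using only the values
    quoted above (the flavor memory sizes themselves are not shown here):

        yarn.scheduler.minimum-allocation-mb   14336 = 14 x 1024
        yarn.scheduler.maximum-allocation-mb   43008 = 42 x 1024 = 3 x 14336
        yarn.nodemanager.resource.memory-mb    43008

    So each worker offers 42G of memory to Yarn, which is exactly three
    minimum-size 14G containers, and yarn.nodemanager.resource.cpu-vcores
    caps each worker at 13 cores.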