diff --git a/experiments/hadoop-yarn/ansible/03-create-masters.yml b/experiments/hadoop-yarn/ansible/03-create-masters.yml index facb7fb0..8c621019 100644 --- a/experiments/hadoop-yarn/ansible/03-create-masters.yml +++ b/experiments/hadoop-yarn/ansible/03-create-masters.yml @@ -35,7 +35,7 @@ register: mastersec - - name: "Add a rule to allow SSH from our gateway" + - name: "Add a rule to allow SSH from zeppelin" os_security_group_rule: cloud: "{{ cloudname }}" state: present @@ -44,7 +44,7 @@ protocol: 'tcp' port_range_min: 22 port_range_max: 22 - remote_group: "{{ security['gateway'] }}" + remote_group: "{{ security['zeppelin'] }}" - name: "Create our masters" os_server: diff --git a/experiments/hadoop-yarn/ansible/04-create-workers.yml b/experiments/hadoop-yarn/ansible/04-create-workers.yml index eeb43bb6..e6dc0cda 100644 --- a/experiments/hadoop-yarn/ansible/04-create-workers.yml +++ b/experiments/hadoop-yarn/ansible/04-create-workers.yml @@ -35,7 +35,7 @@ register: secgroup - - name: "Add a rule to allow ssh from the gateway" + - name: "Add a rule to allow ssh from zeppelin" os_security_group_rule: cloud: "{{ cloudname }}" state: present @@ -44,7 +44,7 @@ protocol: 'tcp' port_range_min: 22 port_range_max: 22 - remote_group: "{{ security['gateway'] }}" + remote_group: "{{ security['zeppelin'] }}" - name: "Create our workers" os_server: diff --git a/experiments/hadoop-yarn/ansible/04-update-fedora.yml b/experiments/hadoop-yarn/ansible/04-update-fedora.yml new file mode 100644 index 00000000..d5085c0e --- /dev/null +++ b/experiments/hadoop-yarn/ansible/04-update-fedora.yml @@ -0,0 +1,50 @@ +# +# +# +# Copyright (c) 2020, ROE (http://www.roe.ac.uk/) +# +# This information is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
+# +# This information is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +# +# +# + +# ignore_errors +# https://docs.ansible.com/ansible/latest/user_guide/playbooks_error_handling.html#ignoring-failed-commands + +- name: "DNF update" + gather_facts: false + hosts: masters:workers:zeppelin + vars_files: + - /tmp/ansible-vars.yml + tasks: + + # This is a noop to force a cache-refresh. + - name: "Update the DNF cache" + become: true + ignore_errors: yes + dnf: + name: 'kernel' + state: present + update_cache: yes + + + - name: "Install monitoring tools" + become: true + dnf: + name: + - 'atop' + - 'htop' + state: present + diff --git a/experiments/hadoop-yarn/ansible/05-config-ssh.yml b/experiments/hadoop-yarn/ansible/05-config-ssh.yml index 4f60131e..72a42edc 100644 --- a/experiments/hadoop-yarn/ansible/05-config-ssh.yml +++ b/experiments/hadoop-yarn/ansible/05-config-ssh.yml @@ -35,12 +35,12 @@ mode: 'u=rwx,g=rx,o=rx' state: directory - - name: "Discover our gateway nodes" + - name: "Discover our zeppelin node" os_server_info: cloud: "{{ cloudname }}" - server: "{{ deployname }}-gateway" + server: "{{ deployname }}-zeppelin" register: - gatewaynodes + zeppelinnodes - name: "Generate Ansible SSH config" template: diff --git a/experiments/hadoop-yarn/ansible/06-config-dns.yml b/experiments/hadoop-yarn/ansible/06-config-dns.yml index 1e20d587..0ec41589 100644 --- a/experiments/hadoop-yarn/ansible/06-config-dns.yml +++ b/experiments/hadoop-yarn/ansible/06-config-dns.yml @@ -26,12 +26,12 @@ - /tmp/ansible-vars.yml tasks: - - name: "Discover our gateway nodes" + - name: "Discover our Zeppelin node" os_server_info: cloud: "{{ cloudname }}" - server: "{{ deployname }}-gateway*" + 
server: "{{ deployname }}-zeppelin" register: - gatewaynodes + zeppelinnode - name: "Discover our master nodes" os_server_info: @@ -47,34 +47,16 @@ register: workernodes - - name: "Discover our Zeppelin nodes" - os_server_info: - cloud: "{{ cloudname }}" - server: "{{ deployname }}-zeppelin" - register: - zeppelinnode - - name: "Generate our DNS hosts file" template: src: 'templates/dns-hosts.j2' dest: "/tmp/aglais-dns-hosts" -- hosts: gateway - gather_facts: false - tasks: - - name: "Deploy [/etc/hosts] to our gateway" - become: true - copy: - src: /tmp/aglais-dns-hosts - dest: /etc/hosts - owner: root - group: root - mode: u=rw,g=r,o=r - hosts: zeppelin gather_facts: false tasks: - - name: "Deploy [/etc/hosts] to our Zeppelin" + - name: "Deploy [/etc/hosts] to our Zeppelin node" become: true copy: src: /tmp/aglais-dns-hosts diff --git a/experiments/hadoop-yarn/ansible/07-host-keys.yml b/experiments/hadoop-yarn/ansible/07-host-keys.yml index ea87e946..fbfc803c 100644 --- a/experiments/hadoop-yarn/ansible/07-host-keys.yml +++ b/experiments/hadoop-yarn/ansible/07-host-keys.yml @@ -22,7 +22,7 @@ # https://everythingshouldbevirtual.com/automation/ansible-ssh-known-host-keys/ # -- hosts: gateway +- hosts: zeppelin gather_facts: false tasks: @@ -50,7 +50,7 @@ dest: "/tmp/aglais-ssh-hosts" -- hosts: gateway:masters:workers:zeppelin +- hosts: masters:workers:zeppelin gather_facts: false tasks: - name: "Deploy the known hosts file to [/etc/ssh/ssh_known_hosts]" diff --git a/experiments/hadoop-yarn/ansible/08-ping-test.yml b/experiments/hadoop-yarn/ansible/08-ping-test.yml index c07607ae..b8e8cd33 100644 --- a/experiments/hadoop-yarn/ansible/08-ping-test.yml +++ b/experiments/hadoop-yarn/ansible/08-ping-test.yml @@ -22,7 +22,7 @@ --- - name: "Ping tests" - hosts: gateway:masters:workers:zeppelin + hosts: zeppelin:masters:workers gather_facts: false tasks: diff --git a/experiments/hadoop-yarn/ansible/09-worker-volumes.yml 
b/experiments/hadoop-yarn/ansible/09-worker-volumes.yml index de158418..584f1a7a 100644 --- a/experiments/hadoop-yarn/ansible/09-worker-volumes.yml +++ b/experiments/hadoop-yarn/ansible/09-worker-volumes.yml @@ -45,12 +45,13 @@ become: true dnf: name: btrfs-progs - state: latest + state: present - - name: "Mount data volumes for {{ inventory_hostname }}" + - name: "Call the mount-volumes task" include_tasks: tasks/mount-volumes.yml loop: "{{ hostvars[ inventory_hostname ].discs }}" loop_control: loop_var: disc + when: ((disc.type == 'cinder') or (disc.type == 'local')) diff --git a/experiments/hadoop-yarn/ansible/11-install-hadoop.yml b/experiments/hadoop-yarn/ansible/11-install-hadoop.yml index 427bbaa0..2434d200 100644 --- a/experiments/hadoop-yarn/ansible/11-install-hadoop.yml +++ b/experiments/hadoop-yarn/ansible/11-install-hadoop.yml @@ -29,13 +29,6 @@ - name: "Install Hadoop" hosts: masters:workers:zeppelin gather_facts: false - vars: - hdname: "hadoop-3.1.3" - hdbase: "/opt" - hdhome: "/opt/hadoop" - hddata: "/var/local/hadoop" - hdhost: "{{groups['masters'][0]}}" - hduser: "{{hostvars[inventory_hostname].login}}" tasks: @@ -46,32 +39,26 @@ dest: "{{hdbase}}" remote_src: yes - - name: "Create a symbolic link" + - name: "Create a symlink for the Hadoop version" become: true file: src: "{{hdname}}" path: "{{hdhome}}" state: link - - name: "Create '{{hddata}}'" - become: true - file: - path: "{{hddata}}" - mode: 'u=rwx,g=rwxs,o=rx' - state: directory - recurse: yes - owner: "{{hduser}}" - group: "{{hduser}}" + - name: "Create Hadoop data directory" + include_tasks: "tasks/create-linked.yml" + vars: + linkdest: "{{hddatadest}}" + linkpath: "{{hddatalink}}" + linkuser: "{{hduser}}" - - name: "Create [{{hddata}}/logs]" - become: true - file: - path: "{{hddata}}/logs" - mode: 'u=rwx,g=rwxs,o=rx' - state: directory - recurse: yes - owner: "{{hduser}}" - group: "{{hduser}}" + - name: "Create Hadoop logs directory" + include_tasks: "tasks/create-linked.yml" + vars: + 
linkdest: "{{hdlogsdest}}" + linkpath: "{{hdlogslink}}" + linkuser: "{{hduser}}" # https://hadoop.apache.org/docs/r3.2.1/hadoop-project-dist/hadoop-common/ClusterSetup.html#Configuring_Environment_of_Hadoop_Daemons - name: "Create [/etc/profile.d/hadoop.sh]" @@ -89,8 +76,8 @@ export PATH=${PATH}:{{hdhome}}/bin:{{hdhome}}/sbin #export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:{{hdhome}}/lib/native export HADOOP_HOME={{hdhome}} - export HADOOP_DATA={{hddata}} - export HADOOP_CONF_DIR={{hdhome}}/etc/hadoop - export HADOOP_LOG_DIR=${HADOOP_DATA}/logs + export HADOOP_DATA={{hddatalink}} + export HADOOP_CONF_DIR={{hdconf}} + export HADOOP_LOG_DIR={{hdlogslink}} diff --git a/experiments/hadoop-yarn/ansible/12-config-hadoop-core.yml b/experiments/hadoop-yarn/ansible/12-config-hadoop-core.yml index bbbe6e71..aa6e6693 100644 --- a/experiments/hadoop-yarn/ansible/12-config-hadoop-core.yml +++ b/experiments/hadoop-yarn/ansible/12-config-hadoop-core.yml @@ -23,16 +23,16 @@ - name: "Configure Hadoop [core-site.xml]" hosts: masters:workers:zeppelin gather_facts: false - vars: - hdname: "hadoop-3.1.3" - hdbase: "/opt" - hdhome: "/opt/hadoop" - hddata: "/var/local/hadoop" - hdhost: "{{groups['masters'][0]}}" - hduser: "{{hostvars[inventory_hostname].login}}" tasks: + - name: "Create Hadoop temp directory" + include_tasks: "tasks/create-linked.yml" + vars: + linkdest: "{{hdtempdest}}" + linkpath: "{{hdtemplink}}" + linkuser: "{{hduser}}" + # https://hadoop.apache.org/docs/r3.2.1/hadoop-project-dist/hadoop-common/ClusterSetup.html#Configuring_the_Hadoop_Daemons # https://hadoop.apache.org/docs/r3.2.1/hadoop-project-dist/hadoop-common/core-default.html - name: "Configure [{{hdhome}}/etc/hadoop/core-site.xml]" @@ -58,4 +58,10 @@ hdfs://{{hdhost}}:9000 + + hadoop.tmp.dir + {{hdtemplink}} + + + diff --git a/experiments/hadoop-yarn/ansible/12-config-ssh-access.yml b/experiments/hadoop-yarn/ansible/12-config-ssh-access.yml index 7fd83bca..20a372f0 100644 --- 
a/experiments/hadoop-yarn/ansible/12-config-ssh-access.yml +++ b/experiments/hadoop-yarn/ansible/12-config-ssh-access.yml @@ -102,13 +102,6 @@ - name: "Configure Hadoop [workers] on master nodes" hosts: masters:zeppelin gather_facts: false - vars: - hdname: "hadoop-3.1.3" - hdbase: "/opt" - hdhome: "/opt/hadoop" - hddata: "/var/local/hadoop" - hdhost: "{{groups['masters'][0]}}" - hduser: "{{hostvars[inventory_hostname].login}}" tasks: diff --git a/experiments/hadoop-yarn/ansible/13-config-hdfs-namenode.yml b/experiments/hadoop-yarn/ansible/13-config-hdfs-namenode.yml index caae6b5b..390d6656 100644 --- a/experiments/hadoop-yarn/ansible/13-config-hdfs-namenode.yml +++ b/experiments/hadoop-yarn/ansible/13-config-hdfs-namenode.yml @@ -21,27 +21,29 @@ # - name: "Configure HDFS namenode" - hosts: master01:zeppelin + hosts: master01 gather_facts: false vars: - hdname: "hadoop-3.1.3" - hdbase: "/opt" - hdhome: "/opt/hadoop" - hddata: "/var/local/hadoop" - hdhost: "{{groups['masters'][0]}}" - hduser: "{{hostvars[inventory_hostname].login}}" + hdfsimage: "{{hdfsmetalink}}/namenode/fsimage" tasks: - - name: "Create [{{hddata}}/namenode/fsimage]" + - name: "Create HDFS metadata directory" + include_tasks: "tasks/create-linked.yml" + vars: + linkdest: "{{hdfsmetadest}}" + linkpath: "{{hdfsmetalink}}" + linkuser: "{{hdfsuser}}" + + - name: "Create [{{hdfsimage}}]" become: true file: - path: "{{hddata}}/namenode/fsimage" + path: "{{hdfsimage}}" mode: 'u=rwx,g=rwxs,o=rx' state: directory recurse: yes - owner: "{{hduser}}" - group: "{{hduser}}" + owner: "{{hdfsuser}}" + group: "{{hdfsuser}}" # https://hadoop.apache.org/docs/r3.2.1/hadoop-project-dist/hadoop-common/ClusterSetup.html#Configuring_the_Hadoop_Daemons # https://hadoop.apache.org/docs/r3.2.1/hadoop-project-dist/hadoop-hdfs/hdfs-default.xml @@ -54,12 +56,12 @@ block: | dfs.namenode.name.dir - {{hddata}}/namenode/fsimage + {{hdfsimage}} @@ -115,7 +117,7 @@ dfs.client.use.datanode.hostname true - + 
dfs.datanode.use.datanode.hostname true diff --git a/experiments/hadoop-yarn/ansible/14-config-hdfs-workers.yml b/experiments/hadoop-yarn/ansible/14-config-hdfs-workers.yml index 6df8aafe..1842c580 100644 --- a/experiments/hadoop-yarn/ansible/14-config-hdfs-workers.yml +++ b/experiments/hadoop-yarn/ansible/14-config-hdfs-workers.yml @@ -21,27 +21,24 @@ # - name: "Configure Hadoop workers" - hosts: workers:zeppelin + hosts: workers gather_facts: false - vars: - hdname: "hadoop-3.1.3" - hdbase: "/opt" - hdhome: "/opt/hadoop" - hddata: "/var/local/hadoop" - hdhost: "{{groups['masters'][0]}}" - hduser: "{{hostvars[inventory_hostname].login}}" tasks: - # TODO Create from hosts.yml - - name: "Create [/data-01/hdfs/data]" - become: true - file: - path: "/data-01/hdfs/data" - mode: 'u=rwx,g=rwxs,o=rx' - state: directory - owner: "{{hduser}}" - group: "{{hduser}}" + - name: "Create HDFS data directory" + include_tasks: "tasks/create-linked.yml" + vars: + linkdest: "{{hdfsdatadest}}" + linkpath: "{{hdfsdatalink}}" + linkuser: "{{hdfsuser}}" + + - name: "Create HDFS logs directory" + include_tasks: "tasks/create-linked.yml" + vars: + linkdest: "{{hdfslogsdest}}" + linkpath: "{{hdfslogslink}}" + linkuser: "{{hdfsuser}}" # https://hadoop.apache.org/docs/r3.2.1/hadoop-project-dist/hadoop-common/ClusterSetup.html#Configuring_the_Hadoop_Daemons - name: "Configure [{{hdhome}}/etc/hadoop/hdfs-site.xml]" @@ -56,11 +53,11 @@ | If this is a comma-delimited list of directories, then data will be stored in all named directories, typically on different devices. | The directories should be tagged with corresponding storage types ([SSD]/[DISK]/[ARCHIVE]/[RAM_DISK]) for HDFS storage policies. | The default storage type will be DISK if the directory does not have a storage type tagged explicitly. - | Directories that do not exist will be created if local filesystem permission allows. + | Directories that do not exist will be created if local filesystem permission allows. 
+--> dfs.datanode.data.dir - /data-01/hdfs/data + {{hdfsdatalink}} diff --git a/experiments/hadoop-yarn/ansible/16-config-yarn-masters.yml b/experiments/hadoop-yarn/ansible/16-config-yarn-masters.yml index 47a32d7a..f2d3c00f 100644 --- a/experiments/hadoop-yarn/ansible/16-config-yarn-masters.yml +++ b/experiments/hadoop-yarn/ansible/16-config-yarn-masters.yml @@ -21,15 +21,8 @@ # - name: "Configure YARN masters" - hosts: master01:zeppelin + hosts: masters:zeppelin gather_facts: false - vars: - hdname: "hadoop-3.1.3" - hdbase: "/opt" - hdhome: "/opt/hadoop" - hddata: "/var/local/hadoop" - hdhost: "master01" - hduser: "{{hostvars[inventory_hostname].login}}" tasks: @@ -62,7 +55,7 @@ yarn.resourcemanager.hostname @@ -71,7 +64,7 @@ yarn.scheduler.maximum-allocation-mb - 20000 + 43008 - - - - yarn.scheduler.maximum-allocation-mb - 20000 - - - - yarn.scheduler.minimum-allocation-mb - 2000 + 14336 - +--> # # CapacityScheduler config. @@ -168,7 +154,7 @@ diff --git a/experiments/hadoop-yarn/ansible/17-config-yarn-workers.yml b/experiments/hadoop-yarn/ansible/17-config-yarn-workers.yml index d39524d4..a6330d76 100644 --- a/experiments/hadoop-yarn/ansible/17-config-yarn-workers.yml +++ b/experiments/hadoop-yarn/ansible/17-config-yarn-workers.yml @@ -21,15 +21,8 @@ # - name: "Configure YARN workers" - hosts: workers:zeppelin + hosts: workers gather_facts: false - vars: - hdname: "hadoop-3.1.3" - hdbase: "/opt" - hdhome: "/opt/hadoop" - hddata: "/var/local/hadoop" - hdhost: "{{groups['masters'][0]}}" - hduser: "{{hostvars[inventory_hostname].login}}" tasks: @@ -61,30 +54,61 @@ yarn.resourcemanager.hostname {{hdhost}} - + yarn.scheduler.maximum-allocation-mb - 20000 + 43008 + + + + + yarn.scheduler.minimum-allocation-mb + 14336 yarn.nodemanager.resource.memory-mb - 20000 + 43008 + + + yarn.nodemanager.resource.cpu-vcores + 13 + + + yarn.scheduler.maximum-allocation-vcores 48 @@ -96,8 +120,8 @@ yarn.nodemanager.aux-services @@ -148,10 +172,10 @@ | HADOOP_CONF_DIR, | 
HADOOP_HDFS_HOME, | HADOOP_YARN_HOME, - | HADOOP_MAPRED_HOME, + | HADOOP_MAPRED_HOME, | HADOOP_COMMON_HOME, | CLASSPATH_PREPEND_DISTCACHE - | + | yarn.nodemanager.env-whitelist @@ -166,7 +190,7 @@ false +--> - + 328ESC[0;31m format(target_id, ".", name), value) + > ESC[0mESC[1;32m 329ESC[0m ESC[0;32melseESC[0mESC[0;34m:ESC[0mESC[0;34mESC[0mESC[0;34mESC[0mESC[0m + > ESC[1;32m 330ESC[0m raise Py4JError( + > + > ESC[0;31mPy4JJavaErrorESC[0m: An error occurred while calling o191.fit. + > : org.apache.spark.SparkException: Job aborted due to stage failure: Task 3226 in stage 35.0 failed 4 times, most recent failure: Lost task 3226.3 in stage 35.0 (TID 122005, worker05, executor 2): java.io.IOException: No space left on device + > at java.io.FileOutputStream.writeBytes(Native Method) + > at java.io.FileOutputStream.write(FileOutputStream.java:326) + > at org.apache.spark.storage.TimeTrackingOutputStream.write(TimeTrackingOutputStream.java:58) + > at java.io.BufferedOutputStream.flushBuffer(BufferedOutputStream.java:82) + > at java.io.BufferedOutputStream.flush(BufferedOutputStream.java:140) + > at net.jpountz.lz4.LZ4BlockOutputStream.finish(LZ4BlockOutputStream.java:260) + > at net.jpountz.lz4.LZ4BlockOutputStream.close(LZ4BlockOutputStream.java:190) + > at java.io.ObjectOutputStream$BlockDataOutputStream.close(ObjectOutputStream.java:1828) + > at java.io.ObjectOutputStream.close(ObjectOutputStream.java:742) + > at org.apache.spark.serializer.JavaSerializationStream.close(JavaSerializer.scala:57) + > at org.apache.spark.storage.DiskBlockObjectWriter.commitAndGet(DiskBlockObjectWriter.scala:173) + > at org.apache.spark.util.collection.ExternalSorter.writePartitionedFile(ExternalSorter.scala:701) + > at org.apache.spark.shuffle.sort.SortShuffleWriter.write(SortShuffleWriter.scala:71) + > at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99) + > at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55) + > at 
org.apache.spark.scheduler.Task.run(Task.scala:123) + > at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408) + > at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360) + > at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414) + > at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) + > at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) + > at java.lang.Thread.run(Thread.java:748) + > + > Driver stacktrace: + > at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1925) + > at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1913) + > at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1912) + > at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59) + > at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48) + > at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1912) + > at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:948) + > at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:948) + > at scala.Option.foreach(Option.scala:257) + > at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:948) + > at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2146) + > at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2095) + > at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2084) + > at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49) + > at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:759) + > at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061) + > at 
org.apache.spark.SparkContext.runJob(SparkContext.scala:2082) + > at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101) + > at org.apache.spark.SparkContext.runJob(SparkContext.scala:2126) + > at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:990) + > at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151) + > at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112) + > at org.apache.spark.rdd.RDD.withScope(RDD.scala:385) + > at org.apache.spark.rdd.RDD.collect(RDD.scala:989) + > at org.apache.spark.rdd.PairRDDFunctions$$anonfun$collectAsMap$1.apply(PairRDDFunctions.scala:743) + > at org.apache.spark.rdd.PairRDDFunctions$$anonfun$collectAsMap$1.apply(PairRDDFunctions.scala:742) + > at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151) + > at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112) + > at org.apache.spark.rdd.RDD.withScope(RDD.scala:385) + > at org.apache.spark.rdd.PairRDDFunctions.collectAsMap(PairRDDFunctions.scala:742) + > at org.apache.spark.ml.tree.impl.RandomForest$.findBestSplits(RandomForest.scala:567) + > at org.apache.spark.ml.tree.impl.RandomForest$.run(RandomForest.scala:201) + > at org.apache.spark.ml.classification.RandomForestClassifier$$anonfun$train$1.apply(RandomForestClassifier.scala:142) + > at org.apache.spark.ml.classification.RandomForestClassifier$$anonfun$train$1.apply(RandomForestClassifier.scala:120) + > at org.apache.spark.ml.util.Instrumentation$$anonfun$11.apply(Instrumentation.scala:185) + > at scala.util.Try$.apply(Try.scala:192) + > at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:185) + > at org.apache.spark.ml.classification.RandomForestClassifier.train(RandomForestClassifier.scala:120) + > at org.apache.spark.ml.classification.RandomForestClassifier.train(RandomForestClassifier.scala:46) + > at org.apache.spark.ml.Predictor.fit(Predictor.scala:118) + > at 
org.apache.spark.ml.Predictor.fit(Predictor.scala:82) + > at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) + > at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) + > at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) + > at java.lang.reflect.Method.invoke(Method.java:498) + > at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244) + > at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357) + > at py4j.Gateway.invoke(Gateway.java:282) + > at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132) + > at py4j.commands.CallCommand.execute(CallCommand.java:79) + > at py4j.GatewayConnection.run(GatewayConnection.java:238) + > at java.lang.Thread.run(Thread.java:748) + > + > Caused by: java.io.IOException: No space left on device + > at java.io.FileOutputStream.writeBytes(Native Method) + > at java.io.FileOutputStream.write(FileOutputStream.java:326) + > at org.apache.spark.storage.TimeTrackingOutputStream.write(TimeTrackingOutputStream.java:58) + > at java.io.BufferedOutputStream.flushBuffer(BufferedOutputStream.java:82) + > at java.io.BufferedOutputStream.flush(BufferedOutputStream.java:140) + > at net.jpountz.lz4.LZ4BlockOutputStream.finish(LZ4BlockOutputStream.java:260) + > at net.jpountz.lz4.LZ4BlockOutputStream.close(LZ4BlockOutputStream.java:190) + > at java.io.ObjectOutputStream$BlockDataOutputStream.close(ObjectOutputStream.java:1828) + > at java.io.ObjectOutputStream.close(ObjectOutputStream.java:742) + > at org.apache.spark.serializer.JavaSerializationStream.close(JavaSerializer.scala:57) + > at org.apache.spark.storage.DiskBlockObjectWriter.commitAndGet(DiskBlockObjectWriter.scala:173) + > at org.apache.spark.util.collection.ExternalSorter.writePartitionedFile(ExternalSorter.scala:701) + > at org.apache.spark.shuffle.sort.SortShuffleWriter.write(SortShuffleWriter.scala:71) + > at 
org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99) + > at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55) + > at org.apache.spark.scheduler.Task.run(Task.scala:123) + > at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408) + > at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360) + > at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414) + > at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) + > at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) + > ... 1 more + > + > INFO [2021-02-11 11:06:34,565] ({pool-2-thread-3} VFSNotebookRepo.java[save]:196) - Saving note:2FYW1HNED + > INFO [2021-02-11 11:06:34,572] ({pool-2-thread-3} SchedulerFactory.java[jobFinished]:120) - Job 20201013-152110_1282917873 finished by scheduler org.apache.zeppelin.interpreter.remote.RemoteInterpreter-spark:shared_process-shared_session + > INFO [2021-02-11 11:30:45,031] ({SessionValidationThread-1} AbstractValidatingSessionManager.java[validateSessions]:275) - Validating all active sessions... + > INFO [2021-02-11 11:30:45,032] ({SessionValidationThread-1} AbstractValidatingSessionManager.java[validateSessions]:308) - Finished session validation. No sessions were stopped. + + + Lots of information in that .. + + - The exception was reported by (TID 122005, worker05, executor 2) + - I think the out of space was on worker05, not the Zeppelin node. + + - The stack trace suggests that RandomForestClassifier understands org.apache.spark.rdd.RDD + - Which means at least part of the RandomForestClassifier training is offloaded to the workers. + + > at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59) + > at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48) + > at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1912) + > .... 
+ > at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:948) + > at scala.Option.foreach(Option.scala:257) + > at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:948) + > .... + > at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2084) + > at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49) + > at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:759) + > at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061) + > .... + > at org.apache.spark.SparkContext.runJob(SparkContext.scala:2126) + > .... + > at org.apache.spark.rdd.RDD.withScope(RDD.scala:385) + > at org.apache.spark.rdd.RDD.collect(RDD.scala:989) + > .... + > at org.apache.spark.ml.tree.impl.RandomForest$.findBestSplits(RandomForest.scala:567) + > .... + > at org.apache.spark.ml.util.Instrumentation$$anonfun$11.apply(Instrumentation.scala:185) + > at scala.util.Try$.apply(Try.scala:192) + > at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:185) + > at org.apache.spark.ml.classification.RandomForestClassifier.train(RandomForestClassifier.scala:120) + > at org.apache.spark.ml.classification.RandomForestClassifier.train(RandomForestClassifier.scala:46) + > at org.apache.spark.ml.Predictor.fit(Predictor.scala:118) + + +# ----------------------------------------------------- +# Check the disc space on zeppelin +#[user@zeppelin] + + ls -1 /var/spark/temp/ + + > blockmgr-6e40f938-4cea-4e51-aa7d-7b8e8d957fd9 + > spark-02a20dcf-c44b-46fd-aba2-84dcd4092b77 + + + du -h -d 1 /var/spark/temp/ + + > 220K /var/spark/temp/spark-02a20dcf-c44b-46fd-aba2-84dcd4092b77 + > 168K /var/spark/temp/blockmgr-6e40f938-4cea-4e51-aa7d-7b8e8d957fd9 + > 392K /var/spark/temp/ + + +# ----------------------------------------------------- +# Check the disc space on worker05 +#[user@zeppelin] + + + ssh worker05 \ + ' + hostname + date + echo + ls -1 
/var/spark/temp/ + echo + du -h -d 1 /var/spark/temp/ + ' + + > gaia-dev-20210211-worker05.novalocal + > Thu 11 Feb 12:16:18 UTC 2021 + > + > ls: cannot access '/var/spark/temp/': No such file or directory + > + > du: cannot access '/var/spark/temp/': No such file or directory + + # + # When we changed this back down to a small node we didn't create the spark temp directory. + # + + + ssh worker05 \ + ' + hostname + date + echo + ls -1 /tmp/ + echo + du -h -d 1 /tmp/ + ' + + > gaia-dev-20210211-worker05.novalocal + > Thu 11 Feb 12:16:49 UTC 2021 + > + > hadoop-fedora + > hadoop-fedora-datanode.pid + > hadoop-fedora-nodemanager.pid + > hsperfdata_fedora + > hsperfdata_root + > jetty-0.0.0.0-8042-node-_-any-5267485435957391381.dir + > jetty-localhost-33243-datanode-_-any-3555236917512612600.dir + > systemd-private-e9fcce57f1be40acb5b15c979c850494-chronyd.service-0AFL7F + > systemd-private-e9fcce57f1be40acb5b15c979c850494-dbus-broker.service-ej71vh + > + > 4.0K /tmp/jetty-localhost-33243-datanode-_-any-3555236917512612600.dir + > 4.0K /tmp/.ICE-unix + > 4.0K /tmp/.X11-unix + > 4.0K /tmp/.Test-unix + > du: cannot read directory '/tmp/systemd-private-e9fcce57f1be40acb5b15c979c850494-chronyd.service-0AFL7F': Permission denied + > 4.0K /tmp/systemd-private-e9fcce57f1be40acb5b15c979c850494-chronyd.service-0AFL7F + > 8.0K /tmp/jetty-0.0.0.0-8042-node-_-any-5267485435957391381.dir + > 4.0K /tmp/.font-unix + > 100K /tmp/hsperfdata_fedora + > 4.0K /tmp/.XIM-unix + > 14G /tmp/hadoop-fedora + > 4.0K /tmp/systemd-private-e9fcce57f1be40acb5b15c979c850494-dbus-broker.service-ej71vh + > 36K /tmp/hsperfdata_root + > 14G /tmp/ + > du: cannot read directory '/tmp/systemd-private-e9fcce57f1be40acb5b15c979c850494-dbus-broker.service-ej71vh': Permission denied + + + ssh worker05 \ + ' + hostname + date + echo + ls -1 /tmp/hadoop-fedora + echo + du -h -d 1 /tmp/hadoop-fedora + ' + + > gaia-dev-20210211-worker05.novalocal + > Thu 11 Feb 12:18:19 UTC 2021 + > + > nm-local-dir + > + > 14G 
/tmp/hadoop-fedora/nm-local-dir + > 14G /tmp/hadoop-fedora + + + ssh worker05 \ + ' + hostname + date + echo + du -h /tmp/hadoop-fedora + ' + + > 4.0K /tmp/hadoop-fedora/nm-local-dir/nmPrivate/application_1613027823151_0001 + > 8.0K /tmp/hadoop-fedora/nm-local-dir/nmPrivate + > .... .... + > 284K /tmp/hadoop-fedora/nm-local-dir/usercache/fedora/filecache/12 + > 592K /tmp/hadoop-fedora/nm-local-dir/usercache/fedora/filecache/11 + > 231M /tmp/hadoop-fedora/nm-local-dir/usercache/fedora/filecache/13/__spark_libs__4343915086399681065.zip + > 231M /tmp/hadoop-fedora/nm-local-dir/usercache/fedora/filecache/13 + > .... .... + > 2.9M /tmp/hadoop-fedora/nm-local-dir/usercache/fedora/filecache/10/sparkr.zip + > 2.9M /tmp/hadoop-fedora/nm-local-dir/usercache/fedora/filecache/10 + > 52K /tmp/hadoop-fedora/nm-local-dir/usercache/fedora/filecache/14 + > 235M /tmp/hadoop-fedora/nm-local-dir/usercache/fedora/filecache + > .... .... + > 51M /tmp/hadoop-fedora/nm-local-dir/usercache/fedora/appcache/application_1613027823151_0001/blockmgr-690bd150-bff9-4542-8041-9f73d93d19dc/22 + > 64M /tmp/hadoop-fedora/nm-local-dir/usercache/fedora/appcache/application_1613027823151_0001/blockmgr-690bd150-bff9-4542-8041-9f73d93d19dc/1c + > 66M /tmp/hadoop-fedora/nm-local-dir/usercache/fedora/appcache/application_1613027823151_0001/blockmgr-690bd150-bff9-4542-8041-9f73d93d19dc/24 + > 414M /tmp/hadoop-fedora/nm-local-dir/usercache/fedora/appcache/application_1613027823151_0001/blockmgr-690bd150-bff9-4542-8041-9f73d93d19dc/30 + > 13G /tmp/hadoop-fedora/nm-local-dir/usercache/fedora/appcache/application_1613027823151_0001/blockmgr-690bd150-bff9-4542-8041-9f73d93d19dc + > 4.0K /tmp/hadoop-fedora/nm-local-dir/usercache/fedora/appcache/application_1613027823151_0001/filecache + > 13G /tmp/hadoop-fedora/nm-local-dir/usercache/fedora/appcache/application_1613027823151_0001 + > 13G /tmp/hadoop-fedora/nm-local-dir/usercache/fedora/appcache + > 14G /tmp/hadoop-fedora/nm-local-dir/usercache/fedora + > 14G 
/tmp/hadoop-fedora/nm-local-dir/usercache + > 4.0K /tmp/hadoop-fedora/nm-local-dir/filecache + > 14G /tmp/hadoop-fedora/nm-local-dir + > 14G /tmp/hadoop-fedora + + Lots of information in that .. + + - By the time the job gets here it it is a Hadoop job NOT a Spark job. + - The temp files are owned by Hadoop node-manager and Hadoop block-manager. + - To move them to another location we should use the Hadoop temp settings, not the Spark temp settings. + + +# ----------------------------------------------------- +# ----------------------------------------------------- + + + Things we have learned so far. + + Even if we don't create the separate disc mounts, we should still create the spark and hadoop temp directories. + + Move the mount paths from a global setting to a host specific setting. + Always create the same directories + + /var/spark/temp + /var/spark/data + + /var/hadoop/temp + /var/hadoop/data + + If this is a medium node, then change some of them into links. + If the host config has mount paths for them. + + The master node isn't doing much. + Possibly managing the HDFS namenode ? + Is it actually managing the Yarn scheduling ? + Could all this be done by a tiny VM ? + + The Zeppelin node is running the Spark interpreter. + The Spark interpreter is scheduling the Spark jobs. + The Spark interpreter aggregates the notebook results. + + The Spark interpreter uses 392K of space in /var/spark/temp. + This could still probably me a small node. + The main cpu use is the ipython server running one thread at 100%. + The rest of the cores are idle most of the time. 
+ + + + + + + + + + diff --git a/notes/zrq/20210211-02-git-branches.txt b/notes/zrq/20210211-02-git-branches.txt new file mode 100644 index 00000000..022dc660 --- /dev/null +++ b/notes/zrq/20210211-02-git-branches.txt @@ -0,0 +1,457 @@ +# +# +# +# Copyright (c) 2021, ROE (http://www.roe.ac.uk/) +# +# This information is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This information is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +# +# +#zrq-notes-time +#zrq-notes-indent
+#zrq-notes-crypto +#zrq-notes-ansible +#zrq-notes-osformat +#zrq-notes-zeppelin +# + + Target: + + Transfer work on 20210206-zrq-working branch onto smaller task specific branches. + We spent a while adding a mixture of changes to the working branch. + Needed to step back and commit the changes as separate task specific PRs. + + Result: + + Work in progress ... + +# ----------------------------------------------------- +# Create a copy of the local working branch. +#[user@desktop] + + source "${HOME}/aglais.env" + pushd "${AGLAIS_HOME}" + + pushd github-zrq + + git add . + git commit -m "Adding everything to the working branch" + git push + + popd + + cp -a github-zrq github-working + mv github-zrq github-backup + + + popd + +# ----------------------------------------------------- +# Update the working copy with merged PRs from upstream. 
+#[user@desktop] + + source "${HOME}/aglais.env" + pushd "${AGLAIS_HOME}" + + pushd github-working + + git checkout master + + > Switched to branch 'master' + > Your branch is up to date with 'origin/master'. + + git pull + + > Already up to date. + + git fetch upstream + + > remote: Enumerating objects: 7, done. + > remote: Counting objects: 100% (7/7), done. + > remote: Total 26 (delta 7), reused 7 (delta 7), pack-reused 19 + > Unpacking objects: 100% (26/26), 45.43 KiB | 186.00 KiB/s, done. + > From github.com:wfau/aglais + > 7f642cd..01c7c74 master -> upstream/master + + + git merge upstream/master + + > Updating 7f642cd..01c7c74 + > Fast-forward + > experiments/hadoop-yarn/ansible/01-create-keypair.yml | 2 +- + > experiments/hadoop-yarn/ansible/02-create-gateway.yml | 2 +- + > .... + > .... + > create mode 100644 notes/zrq/20210205-02-resources.txt + > create mode 100644 notes/zrq/20210206-01-git-cherry-pick.txt + + + git push + + > Total 0 (delta 0), reused 0 (delta 0), pack-reused 0 + > To github.com:Zarquan/aglais.git + > 01c7c74..f46bc2b master -> master + + + git status + + > On branch master + > Your branch is up to date with 'origin/master'. + + + popd + popd + + +# ----------------------------------------------------- +# Delete merged branches. +#[user@desktop] + + source "${HOME}/aglais.env" + pushd "${AGLAIS_HOME}" + + pushd github-working + + git branch + + > 20210113-zrq-source-build + > 20210125-zrq-format-notes + > 20210125-zrq-kubernetes-deploy + > 20210127-zrq-error-trap + > 20210127-zrq-oauth + > 20210127-zrq-working + > 20210205-zrq-deployname + > 20210205-zrq-error-trap + > 20210205-zrq-notes + > 20210205-zrq-testing + > 20210205-zrq-timeout + > 20210206-zrq-working + > * master + + + git branch -d 20210125-zrq-format-notes + + > Deleted branch 20210125-zrq-format-notes (was fd1e449). + + + git branch -d 20210125-zrq-kubernetes-deploy + + > Deleted branch 20210125-zrq-kubernetes-deploy (was 3ab3b55). 
+ + + git branch -d 20210127-zrq-error-trap + + > Deleted branch 20210127-zrq-error-trap (was 1b80704). + + + git branch -d 20210127-zrq-oauth + + > Deleted branch 20210127-zrq-oauth (was d5af1da). + + + git branch -d 20210127-zrq-working + + > warning: deleting branch '20210127-zrq-working' that has been merged to + > 'refs/remotes/origin/20210127-zrq-working', but not yet merged to HEAD. + > + > Deleted branch 20210127-zrq-working (was e12e24c). + + + git branch -d 20210205-zrq-deployname + + > Deleted branch 20210205-zrq-deployname (was 64d0f2c). + + + git branch -d 20210205-zrq-error-trap + + > Deleted branch 20210205-zrq-error-trap (was 1b80704). + + + git branch -d 20210205-zrq-notes + + > Deleted branch 20210205-zrq-notes (was 9c73277). + + + git branch -d 20210205-zrq-testing + + > Deleted branch 20210205-zrq-testing (was c148e78). + + + git branch -d 20210205-zrq-timeout + + > Deleted branch 20210205-zrq-timeout (was 9c73277). + + + git branch + + > 20210113-zrq-source-build + > 20210206-zrq-working + > * master + + + popd + popd + + +# ----------------------------------------------------- +# Create a new branch for the gateway changes. +#[user@desktop] + + source "${HOME}/aglais.env" + pushd "${AGLAIS_HOME}" + + pushd github-working + + nextbranch=$(date '+%Y%m%d')-zrq-gateway + + git checkout master + + > Already on 'master' + > Your branch is up to date with 'origin/master'. 
+ + + git checkout -b "${nextbranch:?}" + + > Switched to a new branch '20210211-zrq-gateway' + + + git push --set-upstream origin "${nextbranch:?}" + + > Total 0 (delta 0), reused 0 (delta 0), pack-reused 0 + > remote: + > remote: Create a pull request for '20210211-zrq-gateway' on GitHub by visiting: + > remote: https://github.com/Zarquan/aglais/pull/new/20210211-zrq-gateway + > remote: + > To github.com:Zarquan/aglais.git + > * [new branch] 20210211-zrq-gateway -> 20210211-zrq-gateway + > Branch '20210211-zrq-gateway' set up to track remote branch '20210211-zrq-gateway' from 'origin'. + + popd + popd + + +# ----------------------------------------------------- +# Transfer the changes to remove the gateway node. +#[user@desktop] + + source "${HOME}/aglais.env" + pushd "${AGLAIS_HOME}" + + meld github-backup github-working & + + pushd github-working + + meld . & + + git branch + + > 20210113-zrq-source-build + > 20210206-zrq-working + > * 20210211-zrq-gateway + > master + + git add . + + git commit -m "Removed gateway node" + + > [20210211-zrq-gateway ffe2137] Removed gateway node + > 14 files changed, 58 insertions(+), 61 deletions(-) + + git push + + > Enumerating objects: 39, done. + > Counting objects: 100% (39/39), done. + > Delta compression using up to 4 threads + > Compressing objects: 100% (20/20), done. + > Writing objects: 100% (20/20), 1.84 KiB | 470.00 KiB/s, done. + > Total 20 (delta 17), reused 0 (delta 0), pack-reused 0 + > remote: Resolving deltas: 100% (17/17), completed with 16 local objects. + > To github.com:Zarquan/aglais.git + > f46bc2b..ffe2137 20210211-zrq-gateway -> 20210211-zrq-gateway + + popd + popd + + +# ----------------------------------------------------- +# Create a new branch with a fix to delete-all. 
+# Note - this branch follows on from previous branch, carrying forward the changes +#[user@desktop] + + source "${HOME}/aglais.env" + pushd "${AGLAIS_HOME}" + + pushd github-working + + git status + + > On branch 20210211-zrq-gateway + > Your branch is up to date with 'origin/20210211-zrq-gateway'. + + nextbranch=$(date '+%Y%m%d')-zrq-delete-fix + + git checkout -b "${nextbranch:?}" + + git push --set-upstream origin "${nextbranch:?}" + + meld ../github-backup . & + + git status + + git add experiments/openstack/bin/delete-all.sh + git commit -m "Fix to catch all the keys created by create-all" + + git add notes/zrq/20210206-01-git-cherry-pick.txt + git commit -m "Finish notes on cherry picking" + + git push + + popd + popd + + +# ----------------------------------------------------- +# Create a new branch to move hadoop and spark vars into the hosts file. +# Note - this branch follows on from previous branch, carrying forward the changes +#[user@desktop] + + source "${HOME}/aglais.env" + pushd "${AGLAIS_HOME}" + + pushd github-working + + git status + + > On branch 20210211-zrq-delete-fix + > Your branch is up to date with 'origin/20210211-zrq-delete-fix'. + + nextbranch=$(date '+%Y%m%d')-zrq-move-vars + + git checkout -b "${nextbranch:?}" + + git push --set-upstream origin "${nextbranch:?}" + + meld ../github-backup . & + + git status + + git add . + + git commit -m "Moved Hadoop, Spark and Zeppelin vars into hosts.yml" + + > [20210211-zrq-move-vars 2432401] Moved Hadoop, Spark and Zeppelin vars into hosts.yml + > 11 files changed, 50 insertions(+), 71 deletions(-) + + git push + + popd + popd + + +# ----------------------------------------------------- +# Create a new branch to fix the issue with Fedora updates. 
+# Note - this branch follows on from previous branch, carrying forward the changes +#[user@desktop] + + source "${HOME}/aglais.env" + pushd "${AGLAIS_HOME}" + + pushd github-working + + git status + + > On branch 20210211-zrq-move-vars + > Your branch is up to date with 'origin/20210211-zrq-move-vars'. + + nextbranch=$(date '+%Y%m%d')-zrq-fedora-updates + + git checkout -b "${nextbranch:?}" + + git push --set-upstream origin "${nextbranch:?}" + + meld ../github-backup . & + + git status + + git add . + + git commit -m "Fix a problem with Fedora updates" + + git push + + popd + popd + + +# ----------------------------------------------------- +# Create a new branch to add misc notes. +# Note - this branch follows on from the master branch, nothing to carry forward. +#[user@desktop] + + source "${HOME}/aglais.env" + pushd "${AGLAIS_HOME}" + + pushd github-working + + git checkout master + + nextbranch=$(date '+%Y%m%d')-zrq-notes + + git checkout -b "${nextbranch:?}" + + meld ../github-backup . & + + git status + + git add . + + git commit -m "Added new notes" + + git push --set-upstream origin "${nextbranch:?}" + + popd + popd + + +# ----------------------------------------------------- +# Create a new branch to add misc notes. +# Note - this branch follows on from a previous branch, carrying forward the changes +#[user@desktop] + + source "${HOME}/aglais.env" + pushd "${AGLAIS_HOME}" + + pushd github-working + + git checkout 20210211-zrq-fedora-updates + + nextbranch=$(date '+%Y%m%d')-zrq-volume-mounts + + git checkout -b "${nextbranch:?}" + + meld ../github-backup . & + + git status + + meld . & + + git add . 
+ + git commit -m "Volume mounts for temp space" + + git push --set-upstream origin "${nextbranch:?}" + + popd + popd + diff --git a/notes/zrq/20210211-03-ansible-deploy.txt b/notes/zrq/20210211-03-ansible-deploy.txt new file mode 100644 index 00000000..ec3764c9 --- /dev/null +++ b/notes/zrq/20210211-03-ansible-deploy.txt @@ -0,0 +1,237 @@ +# +# +# +# Copyright (c) 2021, ROE (http://www.roe.ac.uk/) +# +# This information is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This information is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +# +# +#zrq-notes-time +#zrq-notes-indent +#zrq-notes-crypto +#zrq-notes-ansible +#zrq-notes-osformat +#zrq-notes-zeppelin +# + + Target: + + Get Spark to work with the new configuration. + + Changes based on information from Stelio's notes. + notes/stv/20210210-Benchmarking-ML-Notebook-01.txt + notes/stv/20210211-ML-Notebook-Benchmarking.txt + + Added hadoop.tmp.dir to the core config. + Added /var/hadoop/temp to the volume mounts. + + Changed to 4 medium workers + + Test config: + 1 small master + 1 medium zeppelin + 4 medium workers + + Variable results caused by problems with the Ceph storage system. + The whole notebook is IO limited, all of the calculations are starved of input data. + Even on a good run, the cpu use is around 1%. + + Multiple disc failures were causing problems with the Ceph system. + John removed broken discs from the array and stayed late to finish rebuilding the array. 
+ After that results were much better, but still starved of data. + + Hadoop and Spark work best with local data. + + The gaia machines sitting in the racks at ROE are a better fit for this type of load. + Spread the data across the workers, don't centralise it in one place. + Either HDFS or another form of local caching. + + Links about file system optimisation + + Best practices for caching in Spark SQL + https://towardsdatascience.com/best-practices-for-caching-in-spark-sql-b22fb0f02d34 + + RADOS (Reliable Autonomic Distributed Object Store) + https://searchstorage.techtarget.com/definition/RADOS-Reliable-Autonomic-Distributed-Object-Store + + CephFS: a new generation storage platform for Australian High Energy Physics + https://indico.cern.ch/event/505613/contributions/2230911/attachments/1345227/2039428/Oral-v5-162.pdf + + CephFS file layouts + https://docs.ceph.com/en/mimic/cephfs/file-layouts/ + + Detecting CPU steal time in guest virtual machines + https://opensource.com/article/20/1/cpu-steal-time + + Results: + + Notebook works with 100% of eDR3 and 500 trees. + Need to experiment with adding more trees. + + +# ----------------------------------------------------- +# Update the Openstack cloud name. +#[user@desktop] + + cloudname=gaia-dev + + sed -i ' + s/^\(AGLAIS_CLOUD\)=.*$/\1='${cloudname:?}'/ + ' "${HOME}/aglais.env" + + +# ----------------------------------------------------- +# Create a container to work with. 
+# (*) extra volume mount for /common +#[user@desktop] + + source "${HOME:?}/aglais.env" + + podman run \ + --rm \ + --tty \ + --interactive \ + --name ansibler \ + --hostname ansibler \ + --env "SSH_AUTH_SOCK=/mnt/ssh_auth_sock" \ + --volume "${SSH_AUTH_SOCK}:/mnt/ssh_auth_sock:rw,z" \ + --env "cloudname=${AGLAIS_CLOUD:?}" \ + --volume "${HOME:?}/clouds.yaml:/etc/openstack/clouds.yaml:ro,z" \ + --volume "${AGLAIS_CODE:?}/experiments/common:/common:ro,z" \ + --volume "${AGLAIS_CODE:?}/experiments/openstack:/openstack:ro,z" \ + --volume "${AGLAIS_CODE:?}/experiments/hadoop-yarn:/hadoop-yarn:ro,z" \ + atolmis/ansible-client:2020.12.02 \ + bash + + +# ----------------------------------------------------- +# Create our Aglais configuration. +#[root@kubernator] + +cat > '/tmp/aglais-config.yml' << EOF +aglais: + version: 1.0 + spec: + openstack: + cloud: '${cloudname:?}' + +EOF + + +# ----------------------------------------------------- +# Create everything from scratch. +#[root@ansibler] + + time \ + /openstack/bin/delete-all.sh \ + "${cloudname:?}" + + rm -f ~/.ssh/* + + time \ + /hadoop-yarn/bin/create-all.sh + + + > real 33m6.197s + > user 8m17.797s + > sys 2m33.633s + + > real 31m27.362s + > user 7m41.976s + > sys 2m27.153s + + > + > real 32m40.876s + > user 8m1.610s + > sys 2m34.779s + + > real 31m42.765s + > user 7m59.668s + > sys 2m30.155s + + + +# ----------------------------------------------------- +# Check the deployment status. +#[root@ansibler] + + cat '/tmp/aglais-status.yml' + + > .... + > .... + + +# ----------------------------------------------------- +# Get the public IP address of our Zeppelin node. 
+#[root@ansibler] + + deployname=$( + yq read \ + '/tmp/aglais-status.yml' \ + 'aglais.status.deployment.name' + ) + + zeppelinid=$( + openstack \ + --os-cloud "${cloudname:?}" \ + server list \ + --format json \ + | jq -r '.[] | select(.Name == "'${deployname:?}'-zeppelin") | .ID' + ) + + zeppelinip=$( + openstack \ + --os-cloud "${cloudname:?}" \ + server show \ + --format json \ + "${zeppelinid:?}" \ + | jq -r '.addresses' \ + | sed ' + s/[[:space:]]// + s/.*=\(.*\)/\1/ + s/.*,\(.*\)/\1/ + ' + ) + +cat << EOF +Zeppelin ID [${zeppelinid:?}] +Zeppelin IP [${zeppelinip:?}] +EOF + + > Zeppelin ID [ecbdba16-f723-4f5f-a5e8-e943f83f95bd] + > Zeppelin IP [128.232.227.228] + + > Zeppelin ID [721e11ed-d1c5-4f7a-81fd-61dd87d4c13d] + > Zeppelin IP [128.232.227.202] + + > Zeppelin ID [31bd4e5e-3ea0-4dd2-a08c-863b61d923ea] + > Zeppelin IP [128.232.227.247] + + +# ----------------------------------------------------- +# ----------------------------------------------------- + + Update our DNS + + +# ----------------------------------------------------- +# ----------------------------------------------------- +# Login to Zeppelin ... +#[user@desktop] + + firefox --new-window "http://zeppelin.metagrid.xyz:8080/" & + + + diff --git a/notes/zrq/20210212-01-speed-tests.txt b/notes/zrq/20210212-01-speed-tests.txt new file mode 100644 index 00000000..ae01345c --- /dev/null +++ b/notes/zrq/20210212-01-speed-tests.txt @@ -0,0 +1,881 @@ +# +# +# +# Copyright (c) 2021, ROE (http://www.roe.ac.uk/) +# +# This information is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This information is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +# +# +#zrq-notes-time +#zrq-notes-indent +#zrq-notes-crypto +#zrq-notes-ansible +#zrq-notes-osformat +#zrq-notes-zeppelin +# + + Target: + + Get Spark to work with the new configuration. + + Changes based on information from Stelio's notes. + notes/stv/20210210-Benchmarking-ML-Notebook-01.txt + notes/stv/20210211-ML-Notebook-Benchmarking.txt + + Added hadoop.tmp.dir to the core config. + Added /var/hadoop/temp to the volume mounts. + + Changed to 4 medium workers + + Test config: + 1 small master + 1 medium zeppelin + 4 medium workers + + Variable results caused by problems with the Ceph storage system. + The whole notebook is IO limited, all of the calculations are starved of input data. + Even on a good run, the cpu use is around 1%. + + Multiple disc failures were causing problems with the Ceph system. + John removed broken discs from the array and stayed late to finish rebuilding the array. + After that results were much better, but still starved of data. + + Hadoop and Spark work best with local data. + + The gaia machines sitting in the racks at ROE are a better fit for this type of load. + Spread the data across the workers, don't centralise it in one place. + Either HDFS or another form of local caching. 
+ + Links about file system optimisation + + Best practices for caching in Spark SQL + https://towardsdatascience.com/best-practices-for-caching-in-spark-sql-b22fb0f02d34 + + RADOS (Reliable Autonomic Distributed Object Store) + https://searchstorage.techtarget.com/definition/RADOS-Reliable-Autonomic-Distributed-Object-Store + + CephFS: a new generation storage platform for Australian High Energy Physics + https://indico.cern.ch/event/505613/contributions/2230911/attachments/1345227/2039428/Oral-v5-162.pdf + + CephFS file layouts + https://docs.ceph.com/en/mimic/cephfs/file-layouts/ + + Detecting CPU steal time in guest virtual machines + https://opensource.com/article/20/1/cpu-steal-time + + Results: + + Notebook works with 100% of eDR3 and 500 trees. + Need to experiment with adding more trees. + + + +# ----------------------------------------------------- +# ----------------------------------------------------- +# Login to Zeppelin ... +#[user@desktop] + + firefox --new-window "http://zeppelin.metagrid.xyz:8080/" & + + +# ----------------------------------------------------- +# ----------------------------------------------------- + + Import notebooks from GitHub, clear the output and run all the cells ... + + Good astrometric solutions via ML Random Forrest classifier + https://raw.githubusercontent.com/wfau/aglais-notebooks/main/2FRPC4BFS/note.json + + +# ----------------------------------------------------- +# ----------------------------------------------------- + +Stelio's test #1a +20210211-ML-Notebook-Benchmarking.txt + + Cinder Volumes for temp storage for Spark & Hadoop + 500 trees + + main select statement + > Took 43 mins + + RandomForestClassifier - 10% data 500 trees + > Took 17 mins + +Stelio's test #1b +20210211-ML-Notebook-Benchmarking.txt + + Cinder Volumes for temp storage for Spark & Hadoop + 5000 trees + + main select statement + > ??? + + RandomForestClassifier - 10% data 5000 trees + > ??? 
+ + notebook took 3 hrs 23 min 28 sec () + +Stelio's test #2 +20210211-ML-Notebook-Benchmarking.txt + + Revert changes to Ansible scripts so that it matches what is currently deployed on zeppelin.aglais.uk + + main select statement + > 28 min 10 sec. + + RandomForestClassifier - (assume 10% data 500 trees, not stated) + > 15 min 28 sec. + +# ----------------------------------------------------- +# ----------------------------------------------------- + +Live deployment #1 + + quick_filter=' AND MOD(random_index, 10) = 0' + quick_plot_filter=' AND MOD(random_index, 25) = 0' + + main select statement + Took 29 min 10 sec. Last updated by gaiauser at February 12 2021, 4:59:01 AM. + + first plot + .... + + good/bad select + .... + + RandomForestClassifier - 10% data 500 trees + Took 12 min 4 sec. Last updated by gaiauser at February 12 2021, 5:11:23 AM. + + +Live deployment #2 + + quick_filter='' + quick_plot_filter=' AND MOD(random_index, 25) = 0' + + main select statement + 1724028 + Took 4 min 16 sec. Last updated by gaiauser at February 12 2021, 6:16:23 AM. + + first plot + Took 14 min 46 sec. Last updated by gaiauser at February 12 2021, 6:31:09 AM. + + good/bad select + Good training data size: 244740 rows + Bad training data size: 244740 rows + Took 23 min 8 sec. Last updated by gaiauser at February 12 2021, 6:54:18 AM. + + RandomForestClassifier - 100% data 500 trees + Started 3 hours ago .... 66% + + + + +# ----------------------------------------------------- +# ----------------------------------------------------- + +How-to: Tune Your Apache Spark Jobs (Part 2) +https://blog.cloudera.com/how-to-tune-your-apache-spark-jobs-part-2/ + + Imagine a cluster with six (4) nodes running NodeManagers, each equipped with 16 (14) cores and 64GB (45) of memory. + The NodeManager capacities, yarn.nodemanager.resource.memory-mb and yarn.nodemanager.resource.cpu-vcores, should probably be set to 63 * 1024 = 64512 (megabytes) and 15 respectively. 
+ + example + yarn.nodemanager.resource.memory-mb 63 * 1024 = 64512 + yarn.nodemanager.resource.cpu-vcores 16 - 1 = 15 + + The NodeManager capacities, yarn.nodemanager.resource.memory-mb and yarn.nodemanager.resource.cpu-vcores, should probably be set to 44 * 1024 = 45056 (megabytes) and 13 respectively. + + aglais + yarn.nodemanager.resource.memory-mb 44 * 1024 = 45056 + yarn.nodemanager.resource.cpu-vcores 14 - 1 = 13 + + We avoid allocating 100% of the resources to YARN containers because the node needs some resources to run the OS and Hadoop daemons. + In this case, we leave a gigabyte and a core for these system processes. Cloudera Manager helps by accounting for these and configuring these YARN properties automatically. + +The likely first impulse would be to use --num-executors 6 --executor-cores 15 --executor-memory 63G. However, this is the wrong approach because: + + 63GB + the executor memory overhead won’t fit within the 63GB capacity of the NodeManagers. + The application master will take up a core on one of the nodes, meaning that there won’t be room for a 15-core executor on that node. + 15 cores per executor can lead to bad HDFS I/O throughput. + +A better option would be to use --num-executors 17 --executor-cores 5 --executor-memory 19G. Why? + + This config results in three executors on all nodes except for the one with the AM, which will have two executors. + --executor-memory was derived as (63/3 executors per node) = 21. 21 * 0.07 = 1.47. 21 – 1.47 ~ 19. 
+ + + example + 6 nodes + 15 cores per node + 63G per node + 3 executors per node + executor-cores 15 / 3 = 5 + num-executors (6*3)-1 = 17 + + executor-memory + 63/3 = 21 + 21 * (1 - 0.07) = 19 + + + aglais + 4 nodes + 13 cores per node + 44G per node + 3 executors per node + executor-cores 13 / 3 = 4 + num-executors (4*3)-1 = 11 + + executor-memory + 44/3 = 14 + 14 * (1 - 0.07) = 13 + + ---- ---- + + yarn.nodemanager.resource.memory-mb 45056 + yarn.nodemanager.resource.cpu-vcores 13 + + executor-cores 4 + num-executors 11 + executor-memory 13 + + + spark-master + spark-defaults.conf + + spark.driver.memory 13g + spark.yarn.am.memory 13g + spark.yarn.am.cores 4 + + spark.executor.memory 13g + spark.executor.cores 4 + spark.executor.instances 11 + + spark.eventLog.enabled true + spark.driver.maxResultSize 8g + + + yarn-masters + yarn-site.xml + yarn.scheduler.maximum-allocation-mb 13312 + yarn.scheduler.minimum-allocation-mb 2048 + 14336 + + + yarn-workers + yarn-site.xml + + yarn.nodemanager.resource.memory-mb ((45-1)*1024) = 45056 + yarn.nodemanager.resource.cpu-vcores 13 + yarn.scheduler.maximum-allocation-vcores 26 + yarn.scheduler.minimum-allocation-vcores 1 +
yarn.nodemanager.resource.memory-mb=43008 + > yarn.app.mapreduce.am.resource.mb=14336 + > yarn.app.mapreduce.am.command-opts=-Xmx11264m + > ***** tez-site.xml ***** + > tez.am.resource.memory.mb=14336 + > tez.am.java.opts=-Xmx11264m + > ***** hive-site.xml ***** + > hive.tez.container.size=14336 + > hive.tez.java.opts=-Xmx11264m + > hive.auto.convert.join.noconditionaltask.size=3758096000 + + +# ----------------------------------------------------- +# ----------------------------------------------------- + +dev deployment #1 + + test #1.1 + + default settings, 10% data, 500 trees + + main select statement + Took 28 min 32 sec. Last updated by gaiauser at February 12 2021, 4:03:51 AM. + + RandomForestClassifier - 10% data 500 trees + Took 14 min 58 sec. Last updated by gaiauser at February 12 2021, 4:19:07 AM. + + # + # Tweaked the Hadoop/Yarn settings .. + # + +dev deployment #2 + + test #2.1 + + default settings, 10% data, 500 trees + + java.lang.IllegalArgumentException: + Required executor memory (13312), overhead (1331 MB), and PySpark memory (0 MB) is above the max threshold (13312 MB) of this cluster! + Please check the values of 'yarn.scheduler.maximum-allocation-mb' and/or 'yarn.nodemanager.resource.memory-mb'. + + # + # Fixed the Hadoop/Yarn settings .. + # + + yarn-masters + yarn-site.xml + yarn.scheduler.maximum-allocation-mb ((45-1)*1024) = 45056 + yarn.scheduler.minimum-allocation-mb 2048 + + +# ----------------------------------------------------- +# ----------------------------------------------------- + +dev deployment #3 + + test #3.1 + + default settings, 10% data, 500 trees + + main select statement + 1724028 + Took 25 min 45 sec. Last updated by gaiauser at February 12 2021, 11:02:10 AM. + + first plot + Took 6 sec. Last updated by gaiauser at February 12 2021, 11:02:16 AM. + + good/bad select - 10% data + Good training data size: 24225 rows + Bad training data size: 24225 rows + Took 10 sec. 
Last updated by gaiauser at February 12 2021, 11:02:26 AM. + + RandomForestClassifier - 10% data 500 trees + Took 14 min 56 sec. Last updated by gaiauser at February 12 2021, 11:17:23 AM. + + Slack chat with Paul Browne, asked him if there were any issues. + Suddenly running much faster - worker has 4 java processes at 96% cpu. + Might be a coincidence, might be something he tweaked ... + I think it was coincidence, I don't think he is online at the moment. + + Good sources plot + Took 35 sec. Last updated by gaiauser at February 12 2021, 11:19:52 AM. + + Bad sources plot + Took 36 sec. Last updated by gaiauser at February 12 2021, 11:20:28 AM. + + Results + No. of good sources: 11180 + No. of bad sources: 13102 + Took 38 sec. Last updated by gaiauser at February 12 2021, 11:21:06 AM. + + +# ----------------------------------------------------- +# ----------------------------------------------------- +# Checking logs on worker04. +#[fedora@gaia-dev-20210212-worker04] + + # worker01,02 and 04 all have a lot of activity. + + ls -al /var/hadoop/logs/ + + > drwxrwsr-x. 1 fedora fedora 582 Feb 12 10:19 . + > drwxrwsr-x. 1 root root 16 Feb 12 10:08 .. + > -rw-rw-r--. 1 fedora fedora 38792 Feb 12 11:19 hadoop-fedora-datanode-gaia-dev-20210212-worker04.novalocal.log + > -rw-rw-r--. 1 fedora fedora 702 Feb 12 10:19 hadoop-fedora-datanode-gaia-dev-20210212-worker04.novalocal.out + > -rw-rw-r--. 1 fedora fedora 37728 Feb 12 11:09 hadoop-fedora-nodemanager-gaia-dev-20210212-worker04.novalocal.log + > -rw-rw-r--. 1 fedora fedora 2218 Feb 12 10:19 hadoop-fedora-nodemanager-gaia-dev-20210212-worker04.novalocal.out + > -rw-rw-r--. 1 fedora fedora 0 Feb 12 10:19 SecurityAuth-fedora.audit + > drwxr-xr-x. 1 fedora fedora 60 Feb 12 11:17 userlogs + + tail -f /var/hadoop/logs/hadoop-fedora-datanode-gaia-dev-20210212-worker04.novalocal.log + + > .... + > .... 
+ > 2021-02-12 11:04:41,833 WARN org.apache.hadoop.hdfs.server.datanode.DataNode: Slow BlockReceiver write data to disk cost:313ms (threshold=300ms), volume=file:/var/hdfs/data/, blockId=1073741833 + > 2021-02-12 11:05:23,204 WARN org.apache.hadoop.hdfs.server.datanode.DataNode: Slow BlockReceiver write data to disk cost:475ms (threshold=300ms), volume=file:/var/hdfs/data/, blockId=1073741833 + > 2021-02-12 11:12:07,737 WARN org.apache.hadoop.hdfs.server.datanode.DataNode: Slow BlockReceiver write data to disk cost:712ms (threshold=300ms), volume=file:/var/hdfs/data/, blockId=1073741833 + > .... + > .... + + + ls -al /var/hdfs/data + + > lrwxrwxrwx. 1 root root 25 Feb 12 10:11 /var/hdfs/data -> /mnt/cinder/vdc/hdfs/data + + + df -h /var/hdfs/data + + > Filesystem Size Used Avail Use% Mounted on + > /dev/vdc 512G 663M 510G 1% /mnt/cinder/vdc + + # + # Writing to the Cinder volume is slower than Hadoop is expecting. + # cost:712ms (threshold=300ms) + # + +# ----------------------------------------------------- +# Checking logs on worker03. +#[fedora@gaia-dev-20210212-worker04] + + # worker03 has much less activity. + + ls -al /var/hadoop/logs/ + + > drwxrwsr-x. 1 fedora fedora 582 Feb 12 10:19 . + > drwxrwsr-x. 1 root root 16 Feb 12 10:08 .. + > -rw-rw-r--. 1 fedora fedora 35013 Feb 12 11:19 hadoop-fedora-datanode-gaia-dev-20210212-worker03.novalocal.log + > -rw-rw-r--. 1 fedora fedora 702 Feb 12 10:19 hadoop-fedora-datanode-gaia-dev-20210212-worker03.novalocal.out + > -rw-rw-r--. 1 fedora fedora 37730 Feb 12 11:09 hadoop-fedora-nodemanager-gaia-dev-20210212-worker03.novalocal.log + > -rw-rw-r--. 1 fedora fedora 2218 Feb 12 10:19 hadoop-fedora-nodemanager-gaia-dev-20210212-worker03.novalocal.out + > -rw-rw-r--. 1 fedora fedora 0 Feb 12 10:19 SecurityAuth-fedora.audit + > drwxr-xr-x. 1 fedora fedora 60 Feb 12 11:17 userlogs + + tail -f /var/hadoop/logs/hadoop-fedora-datanode-gaia-dev-20210212-worker03.novalocal.log + + > .... + > .... 
+ > 2021-02-12 11:18:38,134 WARN org.apache.hadoop.hdfs.server.datanode.DataNode: Slow BlockReceiver write data to disk cost:493ms (threshold=300ms), volume=file:/var/hdfs/data/, blockId=1073741834 + > 2021-02-12 11:19:04,401 WARN org.apache.hadoop.hdfs.server.datanode.DataNode: Slow BlockReceiver write data to disk cost:618ms (threshold=300ms), volume=file:/var/hdfs/data/, blockId=1073741834 + > 2021-02-12 11:19:29,870 WARN org.apache.hadoop.hdfs.server.datanode.DataNode: Slow BlockReceiver write data to disk cost:486ms (threshold=300ms), volume=file:/var/hdfs/data/, blockId=1073741834 + > 2021-02-12 11:19:44,349 INFO org.apache.hadoop.hdfs.server.datanode.DataNode.clienttrace: src: /10.10.3.76:39778, dest: /10.10.0.137:9866, bytes: 134217728, op: HDFS_WRITE, cliID: DFSClient_NONMAPREDUCE_-1806520083_23, offset: 0, srvID: e9c64f4b-966f-468c-af20-b6ae51d502de, blockid: BP-346622070-10.10.3.194-1613125180505:blk_1073741834_1010, duration(ns): 134185201769 + > 2021-02-12 11:19:44,349 INFO org.apache.hadoop.hdfs.server.datanode.DataNode: PacketResponder: BP-346622070-10.10.3.194-1613125180505:blk_1073741834_1010, type=HAS_DOWNSTREAM_IN_PIPELINE, downstreams=1:[10.10.1.46:9866] terminating + > 2021-02-12 11:19:44,364 INFO org.apache.hadoop.hdfs.server.datanode.DataNode: Receiving BP-346622070-10.10.3.194-1613125180505:blk_1073741835_1011 src: /10.10.3.104:47116 dest: /10.10.0.137:9866 + > 2021-02-12 11:19:44,365 INFO org.apache.hadoop.hdfs.protocol.datatransfer.sasl.SaslDataTransferClient: SASL encryption trust check: localHostTrusted = false, remoteHostTrusted = false + > 2021-02-12 11:20:16,593 WARN org.apache.hadoop.hdfs.server.datanode.DataNode: Slow BlockReceiver write data to disk cost:643ms (threshold=300ms), volume=file:/var/hdfs/data/, blockId=1073741835 + > .... + > .... 
+ + +# ----------------------------------------------------- +# ----------------------------------------------------- +# Test run #3.2 tailing Zeppelin log +#[fedora@gaia-dev-20210212-zeppelin] + + ls -al zeppelin-0.8.2-bin-all/logs + + > drwxrwxr-x. 2 fedora fedora 4096 Feb 12 10:34 . + > drwxr-xr-x. 12 fedora fedora 4096 Feb 12 10:20 .. + > -rw-rw-r--. 1 fedora fedora 55109 Feb 12 11:29 zeppelin-fedora-gaia-dev-20210212-zeppelin.novalocal.log + > -rw-rw-r--. 1 fedora fedora 6194 Feb 12 10:34 zeppelin-fedora-gaia-dev-20210212-zeppelin.novalocal.out + > -rw-rw-r--. 1 fedora fedora 2885 Feb 12 11:28 zeppelin-interpreter-md-fedora-gaia-dev-20210212-zeppelin.novalocal.log + > -rw-rw-r--. 1 fedora fedora 122946934 Feb 12 11:31 zeppelin-interpreter-spark-fedora-gaia-dev-20210212-zeppelin.novalocal.log + + + tail -f zeppelin-0.8.2-bin-all/logs/zeppelin-interpreter-spark-fedora-gaia-dev-20210212-zeppelin.novalocal.log + + > .... + > .... + > INFO [2021-02-12 11:28:59,148] ({pool-2-thread-13} SchedulerFactory.java[jobStarted]:114) - Job 20201013-131649_1734629667 started by scheduler interpreter_2016348950 + > .... + > .... 
+ > INFO [2021-02-12 11:37:38,123] ({dispatcher-event-loop-12} Logging.scala[logInfo]:54) - Starting task 2997.0 in stage 92.0 (TID 320495, worker02, executor 2, partition 2997, PROCESS_LOCAL, 8450 bytes) + > INFO [2021-02-12 11:37:38,123] ({task-result-getter-1} Logging.scala[logInfo]:54) - Finished task 2996.0 in stage 92.0 (TID 320494) in 242 ms on worker02 (executor 2) (2986/5720) + > INFO [2021-02-12 11:37:38,312] ({dispatcher-event-loop-4} Logging.scala[logInfo]:54) - Starting task 2998.0 in stage 92.0 (TID 320496, worker02, executor 2, partition 2998, PROCESS_LOCAL, 8450 bytes) + > INFO [2021-02-12 11:37:38,312] ({task-result-getter-2} Logging.scala[logInfo]:54) - Finished task 2997.0 in stage 92.0 (TID 320495) in 189 ms on worker02 (executor 2) (2987/5720) + > INFO [2021-02-12 11:37:38,546] ({dispatcher-event-loop-2} Logging.scala[logInfo]:54) - Starting task 2999.0 in stage 92.0 (TID 320497, worker02, executor 2, partition 2999, PROCESS_LOCAL, 8450 bytes) + > INFO [2021-02-12 11:37:38,547] ({task-result-getter-3} Logging.scala[logInfo]:54) - Finished task 2998.0 in stage 92.0 (TID 320496) in 235 ms on worker02 (executor 2) (2988/5720) + > INFO [2021-02-12 11:37:39,376] ({dispatcher-event-loop-11} Logging.scala[logInfo]:54) - Starting task 3000.0 in stage 92.0 (TID 320498, worker01, executor 1, partition 3000, PROCESS_LOCAL, 8450 bytes) + > INFO [2021-02-12 11:37:39,376] ({task-result-getter-0} Logging.scala[logInfo]:54) - Finished task 2986.0 in stage 92.0 (TID 320484) in 4466 ms on worker01 (executor 1) (2989/5720) + > INFO [2021-02-12 11:37:39,974] ({dispatcher-event-loop-1} Logging.scala[logInfo]:54) - Starting task 3001.0 in stage 92.0 (TID 320499, worker04, executor 3, partition 3001, PROCESS_LOCAL, 8450 bytes) + > INFO [2021-02-12 11:37:39,974] ({task-result-getter-1} Logging.scala[logInfo]:54) - Finished task 2976.0 in stage 92.0 (TID 320474) in 7743 ms on worker04 (executor 3) (2990/5720) + > INFO [2021-02-12 11:37:40,235] 
({dispatcher-event-loop-2} Logging.scala[logInfo]:54) - Starting task 3002.0 in stage 92.0 (TID 320500, worker04, executor 3, partition 3002, PROCESS_LOCAL, 8450 bytes) + > INFO [2021-02-12 11:37:40,235] ({task-result-getter-2} Logging.scala[logInfo]:54) - Finished + > .... + > .... + + # + # Not sending much to worker03 for some reason ? + # + +# ----------------------------------------------------- +# Test run #3.2 disc use on worker02 +#[fedora@gaia-dev-20210212-worker01] + + + ls -al /var/hadoop/ + + > total 8 + > drwxrwsr-x. 2 root root 4096 Feb 12 10:09 . + > drwxr-xr-x. 20 root root 4096 Feb 12 10:11 .. + > lrwxrwxrwx. 1 root root 27 Feb 12 10:08 data -> /mnt/cinder/vdc/hadoop/data + > lrwxrwxrwx. 1 root root 27 Feb 12 10:09 logs -> /mnt/cinder/vdc/hadoop/logs + > lrwxrwxrwx. 1 root root 26 Feb 12 10:09 temp -> /mnt/local/vdb/hadoop/temp + + + du -h -d 2 -L /var/hadoop/ + + > 91M /var/hadoop/logs/userlogs + > 91M /var/hadoop/logs + > 0 /var/hadoop/data + > 293M /var/hadoop/temp/nm-local-dir + > 293M /var/hadoop/temp + > 384M /var/hadoop/ + + + + ls -al /var/hadoop/logs/ + + > drwxrwsr-x. 1 fedora fedora 582 Feb 12 10:19 . + > drwxrwsr-x. 1 root root 16 Feb 12 10:08 .. + > -rw-rw-r--. 1 fedora fedora 39272 Feb 12 11:42 hadoop-fedora-datanode-gaia-dev-20210212-worker02.novalocal.log + > -rw-rw-r--. 1 fedora fedora 702 Feb 12 10:19 hadoop-fedora-datanode-gaia-dev-20210212-worker02.novalocal.out + > -rw-rw-r--. 1 fedora fedora 38788 Feb 12 11:49 hadoop-fedora-nodemanager-gaia-dev-20210212-worker02.novalocal.log + > -rw-rw-r--. 1 fedora fedora 2218 Feb 12 10:19 hadoop-fedora-nodemanager-gaia-dev-20210212-worker02.novalocal.out + > -rw-rw-r--. 1 fedora fedora 0 Feb 12 10:19 SecurityAuth-fedora.audit + > drwxr-xr-x. 
1 fedora fedora 60 Feb 12 11:51 userlogs + + + du -h -d 2 /var/hadoop/logs/ + + > 88M /var/hadoop/logs/userlogs/application_1613125194823_0001 + > 88M /var/hadoop/logs/userlogs + > 89M /var/hadoop/logs/ + + + ls -al /var/hadoop/data/ + + > total 0 + > drwxrwsr-x. 1 fedora fedora 0 Feb 12 10:08 . + > drwxrwsr-x. 1 root root 16 Feb 12 10:08 .. + + + du -h -d 2 /var/hadoop/data/ + + > 0 /var/hadoop/data/ + + + ls -al /var/hadoop/temp/ + + > drwxrwsr-x. 3 fedora fedora 4096 Feb 12 10:19 . + > drwxrwsr-x. 3 root root 4096 Feb 12 10:09 .. + > drwxr-xr-x. 5 fedora fedora 4096 Feb 12 11:55 nm-local-dir + + + du -h -d 2 /var/hadoop/temp/ + + > 292M /var/hadoop/temp/nm-local-dir/usercache + > 4.0K /var/hadoop/temp/nm-local-dir/filecache + > 36K /var/hadoop/temp/nm-local-dir/nmPrivate + > 292M /var/hadoop/temp/nm-local-dir + > 292M /var/hadoop/temp/ + + + ls -al /var/hdfs/ + + > total 8 + > drwxrwsr-x. 2 root root 4096 Feb 12 10:11 . + > drwxr-xr-x. 20 root root 4096 Feb 12 10:11 .. + > lrwxrwxrwx. 1 root root 25 Feb 12 10:11 data -> /mnt/cinder/vdc/hdfs/data + > lrwxrwxrwx. 1 root root 25 Feb 12 10:11 logs -> /mnt/cinder/vdc/hdfs/logs + + + ls -al /var/hdfs/data/ + + > total 4 + > drwx------. 1 fedora fedora 36 Feb 12 10:19 . + > drwxrwsr-x. 1 root root 16 Feb 12 10:11 .. + > drwxrwxr-x. 1 fedora fedora 90 Feb 12 10:19 current + > -rw-rw-r--. 1 fedora fedora 14 Feb 12 10:19 in_use.lock + + + du -h -d 2 /var/hdfs/data/ + + > 928M /var/hdfs/data/current/BP-346622070-10.10.3.194-1613125180505 + > 928M /var/hdfs/data/current + > 928M /var/hdfs/data/ + + + ls -al /var/hdfs/logs/ + + > drwxrwsr-x. 1 fedora fedora 0 Feb 12 10:11 . + > drwxrwsr-x. 1 root root 16 Feb 12 10:11 .. + + + du -h -d 2 /var/hdfs/logs/ + + > 0 /var/hdfs/logs/ + + +# ----------------------------------------------------- +# Test run #3.2 disc use on zeppelin +#[fedora@gaia-dev-20210212-zeppelin] + + ls -al /var/spark/ + + > total 8 + > drwxrwsr-x. 2 root root 4096 Feb 12 10:13 . + > drwxr-xr-x. 
20 root root 4096 Feb 12 10:13 .. + > lrwxrwxrwx. 1 root root 25 Feb 12 10:13 temp -> /mnt/local/vdb/spark/temp + + + ls -al /var/spark/temp/ + + > drwxrwsr-x. 4 fedora fedora 4096 Feb 12 10:35 . + > drwxrwsr-x. 3 root root 4096 Feb 12 10:13 .. + > drwxrwsr-x. 51 fedora fedora 4096 Feb 12 11:29 blockmgr-2c154649-34c5-4eb0-b588-b08b5bceccda + > drwx--S---. 4 fedora fedora 4096 Feb 12 10:35 spark-0f42b2cc-7042-48bd-992f-99d49cabd8d5 + + + du -h -d 2 /var/spark/temp/ + + > 220K /var/spark/temp/spark-0f42b2cc-7042-48bd-992f-99d49cabd8d5 + > 200K /var/spark/temp/blockmgr-2c154649-34c5-4eb0-b588-b08b5bceccda + > 424K /var/spark/temp/ + + +# ----------------------------------------------------- +# ----------------------------------------------------- + +dev deployment #3 + + test #3.2 + + changed settings, 100% data, 500 trees + edit the notebook to remove quick_filter + clear output and run again + + main select statement + 1724028 + Took 12 min 51 sec. Last updated by gaiauser at February 12 2021, 11:42:40 AM. + + first plot + Took 16 min 21 sec. Last updated by gaiauser at February 12 2021, 11:59:01 AM. + + good/bad select - 100% data + Good training data size: 244740 rows + Bad training data size: 244740 rows + Took 19 min 16 sec. Last updated by gaiauser at February 12 2021, 12:18:17 PM. + + RandomForestClassifier - 100% data 500 trees + + Back to slow progress. + Several Java 30%cpu. lots of cephfuse at 2%cpu. + + Reached 66% and then started to go backwards. + Reached 62% and decided to stop it. + Started 2 hours ago. + + Clicked the [Cancel] button - no effect. + Log shows new tasks being issued. + + Keyboard cancel, Ctrl-Alt-C - no effect. + Log shows new tasks being issued. + + Restarted the interpreter - result. + Log shows tasks being cancelled. 
+ + > INFO [2021-02-12 14:37:41,547] ({dispatcher-event-loop-10} Logging.scala[logInfo]:54) - Added rdd_419_2815 in memory on worker02:39835 (size: 16.0 B, free: 6.6 GB) + > INFO [2021-02-12 14:37:41,582] ({dispatcher-event-loop-8} Logging.scala[logInfo]:54) - Starting task 2817.0 in stage 113.0 (TID 383260, worker02, executor 2, partition 2817, PROCESS_LOCAL, 8559 bytes) + > INFO [2021-02-12 14:37:41,582] ({task-result-getter-2} Logging.scala[logInfo]:54) - Finished task 2815.0 in stage 113.0 (TID 383258) in 1246 ms on worker02 (executor 2) (2806/5721) + > INFO [2021-02-12 14:37:52,456] ({dispatcher-event-loop-0} Logging.scala[logInfo]:54) - Added rdd_419_2817 in memory on worker02:39835 (size: 16.0 B, free: 6.6 GB) + > INFO [2021-02-12 14:37:52,487] ({dispatcher-event-loop-2} Logging.scala[logInfo]:54) - Starting task 2818.0 in stage 113.0 (TID 383261, worker02, executor 2, partition 2818, PROCESS_LOCAL, 8559 bytes) + > INFO [2021-02-12 14:37:52,487] ({task-result-getter-1} Logging.scala[logInfo]:54) - Finished task 2817.0 in stage 113.0 (TID 383260) in 10905 ms on worker02 (executor 2) (2807/5721) + > INFO [2021-02-12 14:37:57,681] ({pool-1-thread-3} RemoteInterpreterServer.java[cancel]:681) - cancel org.apache.zeppelin.spark.PySparkInterpreter 20201013-152110_1282917873 + > INFO [2021-02-12 14:37:57,702] ({pool-1-thread-3} Logging.scala[logInfo]:54) - Asked to cancel job group zeppelin-gaiauser-2FX82FMTH-20201013-152110_1282917873 + > INFO [2021-02-12 14:37:57,706] ({dag-scheduler-event-loop} Logging.scala[logInfo]:54) - Cancelling stage 113 + > INFO [2021-02-12 14:37:57,707] ({dag-scheduler-event-loop} Logging.scala[logInfo]:54) - Killing all running tasks in stage 113: Stage cancelled + > INFO [2021-02-12 14:37:57,711] ({dag-scheduler-event-loop} Logging.scala[logInfo]:54) - Stage 113 was cancelled + > INFO [2021-02-12 14:37:57,712] ({dag-scheduler-event-loop} Logging.scala[logInfo]:54) - ShuffleMapStage 113 (mapPartitions at RandomForest.scala:538) failed in 
1388.544 s due to Job 61 cancelled part of cancelled job group zeppelin-gaiauser-2FX82FMTH-20201013-152110_1282917873 + > INFO [2021-02-12 14:37:57,713] ({Thread-39} Logging.scala[logInfo]:54) - Job 61 failed: collectAsMap at RandomForest.scala:567, took 1388.585122 s + > ERROR [2021-02-12 14:37:57,720] ({Thread-39} Logging.scala[logError]:70) - org.apache.spark.SparkException: Job 61 cancelled part of cancelled job group zeppelin-gaiauser-2FX82FMTH-20201013-152110_1282917873 + > at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1925) + > at org.apache.spark.scheduler.DAGScheduler.handleJobCancellation(DAGScheduler.scala:1860) + > at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleJobGroupCancelled$1.apply$mcVI$sp(DAGScheduler.scala:928) + > at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleJobGroupCancelled$1.apply(DAGScheduler.scala:928) + > at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleJobGroupCancelled$1.apply(DAGScheduler.scala:928) + > at scala.collection.mutable.HashSet.foreach(HashSet.scala:78) + > at org.apache.spark.scheduler.DAGScheduler.handleJobGroupCancelled(DAGScheduler.scala:928) + > at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2115) + > at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2095) + > at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2084) + + + +# ----------------------------------------------------- +# ----------------------------------------------------- + +dev deployment #3 + + test #3.3 + + changed settings, 100% data, 500 trees, no cache + + Caching may cause problems for datasets in Parquet files. 
+ https://towardsdatascience.com/best-practices-for-caching-in-spark-sql-b22fb0f02d34 + + edit the notebook to remove quick_filter and caching + + - quick_filter = ' AND MOD(random_index, 10) = 0' + + quick_filter = '' # AND MOD(random_index, 10) = 0' + + - raw_sources_df.cache() + + #raw_sources_df.cache() + + clear output and run again + + main select statement + 1724028 + Took 10 min 39 sec. Last updated by gaiauser at February 12 2021, 2:57:19 PM. + + first plot + Took 19 min 22 sec. Last updated by gaiauser at February 12 2021, 3:16:42 PM. + + good/bad select - 100% data + Good training data size: 244740 rows + Bad training data size: 244740 rows + Took 28 min 3 sec. Last updated by gaiauser at February 12 2021, 3:44:45 PM. + + RandomForestClassifier - 100% data 500 trees + Killed at 80% to allow John to heal the Ceph system. + + +# ----------------------------------------------------- +# ----------------------------------------------------- + +dev deployment #3 + + test #3.4 + + 100% data, 500 trees, no cache + + ml intro + Took 0 sec. Last updated by gaiauser at February 12 2021, 8:18:31 PM. + + temp view + Took 1 min 40 sec. Last updated by gaiauser at February 12 2021, 8:20:11 PM. + + main select statement + 1724028 + Took 1 min 38 sec. Last updated by gaiauser at February 12 2021, 8:21:49 PM. + + ceph-fuse at 80-90% + java at 20-50% + + Hertzsprung-Russell + Took 4 min 39 sec. Last updated by gaiauser at February 12 2021, 8:26:28 PM. + + good/bad select - 100% data + Good training data size: 244740 rows + Bad training data size: 244740 rows + Took 7 min 13 sec. Last updated by gaiauser at February 12 2021, 8:33:41 PM. + + RandomForestClassifier - 100% data 500 trees + Took 1 hrs 16 min 6 sec. Last updated by gaiauser at February 12 2021, 9:49:48 PM. + + Misclassifications for the test set: 0.35 % + Took 18 min 35 sec. Last updated by gaiauser at February 12 2021, 10:08:23 PM. + + Hertzsprung-Russell + Took 54 min 22 sec. 
Last updated by gaiauser at February 12 2021, 11:02:46 PM. + + histogram + Took 14 min 58 sec. Last updated by gaiauser at February 12 2021, 11:17:44 PM. + + Good sources plot + Took 27 min 12 sec. Last updated by gaiauser at February 12 2021, 11:44:56 PM. + + Bad sources plot + Took 27 min 13 sec. Last updated by gaiauser at February 13 2021, 12:12:09 AM. + + No. of good sources: 22254 + No. of bad sources: 26170 + Took 27 min 42 sec. Last updated by gaiauser at February 13 2021, 12:39:51 AM. + + histogram + Took 19 min 10 sec. Last updated by gaiauser at February 13 2021, 12:59:01 AM. + + Nulls + Took 15 min 48 sec. Last updated by gaiauser at February 13 2021, 1:14:49 AM. + + ---- + +dev deployment #3 + + test #3.5 + + repeat of the same + 100% data, 500 trees, no cache + + clear cells and run all + + ml intro + Took 0 sec. Last updated by gaiauser at February 13 2021, 3:19:07 AM. + + temp view + Took 50 sec. Last updated by gaiauser at February 13 2021, 3:19:57 AM. + + main select statement + 1724028 + Took 38 sec. Last updated by gaiauser at February 13 2021, 3:20:35 AM. + + Hertzsprung-Russell + Took 4 min 1 sec. Last updated by gaiauser at February 13 2021, 3:24:36 AM. + + good/bad select - 100% data + Good training data size: 244740 rows + Bad training data size: 244740 rows + Took 7 min 4 sec. Last updated by gaiauser at February 13 2021, 3:31:40 AM. + + RandomForestClassifier - 100% data 500 trees + Took 1 hrs 19 min 39 sec. Last updated by gaiauser at February 13 2021, 4:51:20 AM. + + Misclassifications for the test set: 0.35 % + Took 20 min 13 sec. Last updated by gaiauser at February 13 2021, 5:11:34 AM. + + Hertzsprung-Russell + Took 55 min 7 sec. Last updated by gaiauser at February 13 2021, 6:06:42 AM. + + histogram + Took 14 min 12 sec. Last updated by gaiauser at February 13 2021, 6:20:54 AM. + + Good sources plot + Took 27 min 15 sec. Last updated by gaiauser at February 13 2021, 6:48:09 AM. + + Bad sources plot + Took 27 min 56 sec. 
Last updated by gaiauser at February 13 2021, 7:16:06 AM. + + good/bad count + No. of good sources: 22254 + No. of bad sources: 26170 + Took 27 min 16 sec. Last updated by gaiauser at February 13 2021, 7:43:22 AM. + + histogram + Took 19 min 43 sec. Last updated by gaiauser at February 13 2021, 8:03:05 AM. + + Nulls + Took 15 min 54 sec. Last updated by gaiauser at February 13 2021, 8:18:59 AM. + + TODO + + retry with caching enabled + retry with 1000 and 5000 trees + + diff --git a/notes/zrq/20210213-01-speed-tests.txt b/notes/zrq/20210213-01-speed-tests.txt new file mode 100644 index 00000000..72f3a977 --- /dev/null +++ b/notes/zrq/20210213-01-speed-tests.txt @@ -0,0 +1,118 @@ +# +# +# +# Copyright (c) 2021, ROE (http://www.roe.ac.uk/) +# +# This information is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This information is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +# +# +#zrq-notes-time +#zrq-notes-indent +#zrq-notes-crypto +#zrq-notes-ansible +#zrq-notes-osformat +#zrq-notes-zeppelin +# + + Target: + + Follow on from previous tests. + Enable caching or raw_sources. + + Result: + + Work in progress .. + +# ----------------------------------------------------- +# ----------------------------------------------------- +# Login to Zeppelin ... 
+#[user@desktop] + + firefox --new-window "http://zeppelin.metagrid.xyz:8080/" & + + +# ----------------------------------------------------- +# ----------------------------------------------------- + + Import notebooks from GitHub, clear the output and run all the cells ... + + Good astrometric solutions via ML Random Forrest classifier + https://raw.githubusercontent.com/wfau/aglais-notebooks/main/2FRPC4BFS/note.json + + + +dev deployment #3 + + test #3.5 + + 100% data, 500 trees, cache + + edit the notebook to enable caching + + - #raw_sources_df.cache() + + raw_sources_df.cache() + + clear cells and run all + + ML intro + Took 0 sec. Last updated by gaiauser at February 13 2021, 1:33:46 PM. + + Temp view + Took 50 sec. Last updated by gaiauser at February 13 2021, 1:34:36 PM. + + Main select statement + 1724028 + Took 40 sec. Last updated by gaiauser at February 13 2021, 1:35:16 PM. + + # This MUST be using cached data. + + Hertzsprung-Russell + Took 3 min 42 sec. Last updated by gaiauser at February 13 2021, 1:38:58 PM. + + Good/bad select - 100% data + Good training data size: 244740 rows + Bad training data size: 244740 rows + Took 7 min 8 sec. Last updated by gaiauser at February 13 2021, 1:46:06 PM. + + RandomForestClassifier - 100% data 500 trees + Took 1 hrs 19 min 29 sec. Last updated by gaiauser at February 13 2021, 3:05:36 PM. + + Misclassification fraction + Misclassifications for the test set: 0.35 % + Took 18 min 24 sec. Last updated by gaiauser at February 13 2021, 3:24:00 PM. + + Hertzsprung-Russell + Took 55 min 28 sec. Last updated by gaiauser at February 13 2021, 4:19:28 PM. + + Histogram + Took 14 min 5 sec. Last updated by gaiauser at February 13 2021, 4:33:33 PM. + + Good sources plot + Took 27 min 43 sec. Last updated by gaiauser at February 13 2021, 5:01:16 PM. + + Bad sources plot + Took 27 min 19 sec. Last updated by gaiauser at February 13 2021, 5:28:35 PM. + + Good/bad count + No. of good sources: 22254 + No. 
of bad sources: 26170 + > Took 27 min 25 sec. Last updated by gaiauser at February 13 2021, 5:56:00 PM. + + Histogram + Took 18 min 46 sec. Last updated by gaiauser at February 13 2021, 6:14:46 PM. + + Nulls + Took 14 min 39 sec. Last updated by gaiauser at February 13 2021, 6:29:25 PM. + diff --git a/notes/zrq/20210214-01-timing-logger.txt b/notes/zrq/20210214-01-timing-logger.txt new file mode 100644 index 00000000..98a5883c --- /dev/null +++ b/notes/zrq/20210214-01-timing-logger.txt @@ -0,0 +1,661 @@ +# +# +# +# Copyright (c) 2021, ROE (http://www.roe.ac.uk/) +# +# This information is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This information is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +# +# +#zrq-notes-time +#zrq-notes-indent +#zrq-notes-crypto +#zrq-notes-ansible +#zrq-notes-osformat +#zrq-notes-zeppelin +# + + Target: + + Issue #371 + https://github.com/wfau/aglais/issues/371 + + Create a jq parser to extract the timing information from a Zeppelin notebook. + + Result: + + Initial version works. + Extracts the paragraph id, start and end times, and calculates the elapsed time. + + + # + # REST API documentation. 
+ # https://zeppelin.apache.org/docs/0.7.0/rest-api/rest-notebook.html#export-a-note + + # + # REST API template + # http://[zeppelin-server]:[zeppelin-port]/api/notebook/export/[noteId] + + # + # HTML notebook URL + # http://128.232.227.222:8080/#/notebook/2FX82FMTH + + +# ----------------------------------------------------- +# Use the REST API to get a copy of the notebook. +#[user@desktop] + + curl -v 'http://128.232.227.222:8080/api/notebook/export/2FX82FMTH' + + > * Trying 128.232.227.222:8080... + > * Connected to 128.232.227.222 (128.232.227.222) port 8080 (#0) + > > GET /api/notebook/export/2FX82FMTH HTTP/1.1 + > > Host: 128.232.227.222:8080 + > > User-Agent: curl/7.69.1 + > > Accept: */* + > > + > * Mark bundle as not supporting multiuse + > < HTTP/1.1 302 Found + > < Date: Sunday, February 14, 2021 6:21:46 PM UTC + > < Access-Control-Allow-Credentials: true + > < Access-Control-Allow-Headers: authorization,Content-Type + > < Access-Control-Allow-Methods: POST, GET, OPTIONS, PUT, HEAD, DELETE + > < X-FRAME-OPTIONS: SAMEORIGIN + > < X-XSS-Protection: 1 + > < Set-Cookie: JSESSIONID=c3b28d9a-67d3-403b-8d10-6931e35b7211; Path=/; HttpOnly + > < Location: http://128.232.227.222:8080/api/login;JSESSIONID=c3b28d9a-67d3-403b-8d10-6931e35b7211 + > < Content-Length: 0 + > < Server: Jetty(9.4.14.v20181114) + > < + + +# ----------------------------------------------------- +# Use the REST API to login. 
+# https://community.cloudera.com/t5/Support-Questions/Authentication-with-the-Zeppelin-REST-API/td-p/115170 +#[user@desktop] + + gaiauser=$(secret aglais.zeppelin.gaiauser) + gaiapass=$(secret aglais.zeppelin.gaiapass) + + curl \ + --include \ + --request 'POST' \ + --data "userName=${gaiauser:?}" \ + --data "password=${gaiapass:?}" \ + 'http://128.232.227.222:8080/api/login' + + > HTTP/1.1 200 OK + > Date: Sunday, February 14, 2021 6:29:54 PM UTC + > Access-Control-Allow-Credentials: true + > Access-Control-Allow-Headers: authorization,Content-Type + > Access-Control-Allow-Methods: POST, GET, OPTIONS, PUT, HEAD, DELETE + > X-FRAME-OPTIONS: SAMEORIGIN + > X-XSS-Protection: 1 + > Set-Cookie: rememberMe=deleteMe; Path=/; Max-Age=0; Expires=Sat, 13-Feb-2021 18:29:54 GMT + > Set-Cookie: JSESSIONID=28567db4-9c2c-4b24-afbb-4517b3dd9dbf; Path=/; HttpOnly + > Set-Cookie: JSESSIONID=deleteMe; Path=/; Max-Age=0; Expires=Sat, 13-Feb-2021 18:29:54 GMT + > Set-Cookie: rememberMe=deleteMe; Path=/; Max-Age=0; Expires=Sat, 13-Feb-2021 18:29:54 GMT + > Set-Cookie: JSESSIONID=278c187f-7add-4ee3-a0e8-492a132cadb4; Path=/; HttpOnly + > Set-Cookie: rememberMe=deleteMe; Path=/; Max-Age=0; Expires=Sat, 13-Feb-2021 18:29:54 GMT + > Content-Type: application/json + > Content-Length: 130 + > Server: Jetty(9.4.14.v20181114) + + > { + > "status":"OK", + > "message":"", + > "body": { + > "principal":"gaiauser", + > "ticket":"4342bdd9-0a7b-4f27-a216-bae695a69b22", + > "roles":"[\"role1\"]" + > } + > } + + +# ----------------------------------------------------- +# Use the REST API to login, and save the cookie in a cookie-jar. 
+#[user@desktop] + + curl \ + --request 'POST' \ + --cookie-jar '/tmp/cookies' \ + --data "userName=${gaiauser:?}" \ + --data "password=${gaiapass:?}" \ + 'http://128.232.227.222:8080/api/login' + + > { + > "status":"OK", + > "message":"", + > "body": { + > "principal":"gaiauser", + > "ticket":"4342bdd9-0a7b-4f27-a216-bae695a69b22", + > "roles":"[\"role1\"]" + > } + > } + + + cat '/tmp/cookies' + + > # Netscape HTTP Cookie File + > # https://curl.haxx.se/docs/http-cookies.html + > # This file was generated by libcurl! Edit at your own risk. + > + > #HttpOnly_128.232.227.222 FALSE / FALSE 0 JSESSIONID 405b514d-c195-488b-98a8-86c9f06d65e2 + + +# ----------------------------------------------------- +# Use the cookie in our cookie-jar to authenticate the GET. +#[user@desktop] + + curl \ + --cookie '/tmp/cookies' \ + 'http://128.232.227.222:8080/api/notebook/export/2FX82FMTH' \ + | jq '.' + + + > > | jq '' + > % Total % Received % Xferd Average Speed Time Time Time Current + > Dload Upload Total Spent Left Speed + > 100 761k 0 761k 0 0 1603k 0 --:--:-- --:--:-- --:--:-- 1600k + > { + > "status": "OK", + > "message": "", + > "body": "{\n \"paragraphs\": [.... + > .... + > .... + > .... \"info\": {}\n}" + > } + + +# ----------------------------------------------------- +# GET the note status rather than the content. +#[user@desktop] + + curl \ + --cookie '/tmp/cookies' \ + 'http://128.232.227.222:8080/api/notebook/job/2FX82FMTH' \ + | jq '.' 
+ + > { + > "status": "OK", + > "body": [ + > { + > "progress": "100", + > "started": "Sat Feb 13 13:33:46 UTC 2021", + > "finished": "Sat Feb 13 13:33:46 UTC 2021", + > "id": "20201013-131059_546082898", + > "status": "FINISHED" + > }, + > { + > "progress": "100", + > "started": "Sat Feb 13 13:33:46 UTC 2021", + > "finished": "Sat Feb 13 13:34:36 UTC 2021", + > "id": "20201013-131649_1734629667", + > "status": "FINISHED" + > }, + > { + > "progress": "100", + > "started": "Sat Feb 13 13:34:36 UTC 2021", + > "finished": "Sat Feb 13 13:35:16 UTC 2021", + > "id": "20201013-132418_278702125", + > "status": "FINISHED" + > }, + > { + > "progress": "100", + > "started": "Sat Feb 13 13:35:16 UTC 2021", + > "finished": "Sat Feb 13 13:38:58 UTC 2021", + > "id": "20201120-094650_221463065", + > "status": "FINISHED" + > }, + > { + > "progress": "100", + > "started": "Sat Feb 13 13:38:58 UTC 2021", + > "finished": "Sat Feb 13 13:38:58 UTC 2021", + > "id": "20201120-110502_1704727157", + > "status": "FINISHED" + > }, + > { + > "progress": "100", + > "started": "Sat Feb 13 13:38:58 UTC 2021", + > "finished": "Sat Feb 13 13:46:06 UTC 2021", + > "id": "20201123-105445_95907042", + > "status": "FINISHED" + > }, + > { + > "progress": "100", + > "started": "Sat Feb 13 13:46:06 UTC 2021", + > "finished": "Sat Feb 13 13:46:06 UTC 2021", + > "id": "20201015-161110_18118893", + > "status": "FINISHED" + > }, + > { + > "progress": "100", + > "started": "Sat Feb 13 13:46:07 UTC 2021", + > "finished": "Sat Feb 13 15:05:36 UTC 2021", + > "id": "20201013-152110_1282917873", + > "status": "FINISHED" + > }, + > { + > "progress": "100", + > "started": "Sat Feb 13 15:05:36 UTC 2021", + > "finished": "Sat Feb 13 15:05:36 UTC 2021", + > "id": "20201015-131823_1744793710", + > "status": "FINISHED" + > }, + > { + > "progress": "100", + > "started": "Sat Feb 13 15:05:36 UTC 2021", + > "finished": "Sat Feb 13 15:24:00 UTC 2021", + > "id": "20201016-154755_24366630", + > "status": "FINISHED" + > }, + > { 
+ > "progress": "100", + > "started": "Sat Feb 13 15:24:00 UTC 2021", + > "finished": "Sat Feb 13 15:24:00 UTC 2021", + > "id": "20201123-163421_1811049882", + > "status": "FINISHED" + > }, + > { + > "progress": "100", + > "started": "Sat Feb 13 15:24:00 UTC 2021", + > "finished": "Sat Feb 13 16:19:28 UTC 2021", + > "id": "20201123-162249_1468741293", + > "status": "FINISHED" + > }, + > { + > "progress": "100", + > "started": "Sat Feb 13 16:19:28 UTC 2021", + > "finished": "Sat Feb 13 16:33:33 UTC 2021", + > "id": "20201124-100512_110153564", + > "status": "FINISHED" + > }, + > { + > "progress": "100", + > "started": "Sat Feb 13 16:33:33 UTC 2021", + > "finished": "Sat Feb 13 17:01:16 UTC 2021", + > "id": "20201125-103046_1353183691", + > "status": "FINISHED" + > }, + > { + > "progress": "100", + > "started": "Sat Feb 13 17:01:16 UTC 2021", + > "finished": "Sat Feb 13 17:28:35 UTC 2021", + > "id": "20201125-163312_728555601", + > "status": "FINISHED" + > }, + > { + > "progress": "100", + > "started": "Sat Feb 13 17:28:35 UTC 2021", + > "finished": "Sat Feb 13 17:56:00 UTC 2021", + > "id": "20201125-155131_269531128", + > "status": "FINISHED" + > }, + > { + > "progress": "100", + > "started": "Sat Feb 13 17:56:00 UTC 2021", + > "finished": "Sat Feb 13 18:14:46 UTC 2021", + > "id": "20201124-161145_1933006801", + > "status": "FINISHED" + > }, + > { + > "progress": "100", + > "started": "Sat Feb 13 18:14:46 UTC 2021", + > "finished": "Sat Feb 13 18:29:25 UTC 2021", + > "id": "20201124-171324_1960205489", + > "status": "FINISHED" + > }, + > { + > "progress": "100", + > "id": "20210108-142101_850914828", + > "status": "FINISHED" + > } + > ] + > } + + +# ----------------------------------------------------- +# Is there an easy way of doing the date subraction to get the execution time ? 
+#[user@desktop] + + # GoogleFoo to the rescue + # http://www.fresse.org/dateutils/ + + dnf install dateutils + + started="Sat Feb 13 17:56:00 UTC 2021" + finished="Sat Feb 13 18:29:25 UTC 2021" + + datediff "${started:?}" "${finished:?}" + + > ddiff: Error: reference DATE must be specified + + datediff \ + --input-format '%a %b %-d %H:%M:%S %Z %Y' \ + "${started:?}" "${finished:?}" + + > ddiff: Error: reference DATE must be specified + + + datediff \ + '13 Feb 2021 17:56:00' \ + '13 Feb 2021 18:29:25' + + > ddiff: Error: reference DATE must be specified + + + datediff \ + '2021-02-13 17:56:00' \ + '2021-02-13 18:29:25' + + > 2005s + + + datediff \ + --format '%H:%M:%S' \ + '2021-02-13 17:56:00' \ + '2021-02-13 18:29:25' + + > 0:33:25 + + + datediff \ + --format '%H:%M:%S' \ + --input-format '%Y %b %d %H:%M:%S' \ + '2021 Feb 13 17:56:00' \ + '2021 Feb 13 18:29:25' + + > 0:33:25 + + # + # Need to parse the crappy date format into something useable. + # + + strptime '%a %b $-d %H:%M:%S %Z %Y' "${started:?}" + + > strptime: cannot make sense of `Sat Feb 13 17:56:00 UTC 2021' using the given input formats + + # + # Need to parse the crappy date format into something useable. + # + + echo "${started:?}" | sed ' + s/\([[:alpha:]]*\)[[:space:]]*\([[:alpha:]]*\)[[:space:]]*\([[:digit:]]*\)[[:space:]]*\([[:digit:]]*:[[:digit:]]*:[[:digit:]]*\)[[:space:]]*\([[:alpha:]]*\)[[:space:]]*\([[:digit:]]*\)/\6 \2 \3 \4/ + ' + + > 2021 Feb 13 17:56:00 + + + dateform() + { + local input=${1:?} + echo "${input:?}" | sed ' + s/\([[:alpha:]]*\)[[:space:]]*\([[:alpha:]]*\)[[:space:]]*\([[:digit:]]*\)[[:space:]]*\([[:digit:]]*:[[:digit:]]*:[[:digit:]]*\)[[:space:]]*\([[:alpha:]]*\)[[:space:]]*\([[:digit:]]*\)/\6 \2 \3 \4/ + ' + } + + dateform "${started}" + + > 2021 Feb 13 17:56:00 + + datediff \ + --format '%H:%M:%S' \ + --input-format '%Y %b %d %H:%M:%S' \ + "$(dateform "${started:?}")" \ + "$(dateform "${finished:?}")" + + > 0:33:25 + + # + # Code to generate the date/time is here. 
+ # https://github.com/apache/zeppelin/blob/f3bdd4a1fa0cf19bc1015955d8ade4bc79a8e16f/zeppelin-server/src/main/java/org/apache/zeppelin/rest/message/ParagraphJobStatus.java#L35 + # Looks like a standard java.util.Date.toString() call. + # + # TODO : PR to implement an extra param that formats the dates ? + # + + +# ----------------------------------------------------- +# GET the note status rather than the content. +#[user@desktop] + + curl \ + --cookie '/tmp/cookies' \ + 'http://128.232.227.222:8080/api/notebook/job/2FX82FMTH' \ + | jq '.' + + + > { + > "status": "OK", + > "body": [ + > { + > "progress": "100", + > "started": "Sat Feb 13 13:33:46 UTC 2021", + > "finished": "Sat Feb 13 13:33:46 UTC 2021", + > "id": "20201013-131059_546082898", + > "status": "FINISHED" + > }, + > .... + > .... + > { + > "progress": "100", + > "started": "Sat Feb 13 18:14:46 UTC 2021", + > "finished": "Sat Feb 13 18:29:25 UTC 2021", + > "id": "20201124-171324_1960205489", + > "status": "FINISHED" + > }, + > { + > "progress": "100", + > "id": "20210108-142101_850914828", + > "status": "FINISHED" + > } + > ] + > } + + + # + # For each element in the list .. + # Calculate the elapsed time. + # + + + curl \ + --cookie '/tmp/cookies' \ + 'http://128.232.227.222:8080/api/notebook/job/2FX82FMTH' \ + | jq '.body[]' \ + > status.txt + + # + # First format the start and end times and collect them together on a third line. 
+ # + + sed ' + s/\("started":[[:space:]]*\)"\([[:alpha:]]*\)[[:space:]]*\([[:alpha:]]*\)[[:space:]]*\([[:digit:]]*\)[[:space:]]*\([[:digit:]]*:[[:digit:]]*:[[:digit:]]*\)[[:space:]]*\([[:alpha:]]*\)[[:space:]]*\([[:digit:]]*\)"/\1"\7 \3 \4 \5"/ + s/\("finished":[[:space:]]*\)"\([[:alpha:]]*\)[[:space:]]*\([[:alpha:]]*\)[[:space:]]*\([[:digit:]]*\)[[:space:]]*\([[:digit:]]*:[[:digit:]]*:[[:digit:]]*\)[[:space:]]*\([[:alpha:]]*\)[[:space:]]*\([[:digit:]]*\)"/\1"\7 \3 \4 \5"/ + /"started":/ { + h + s/\([[:space:]]*\)"started":[[:space:]]*\("[^"]*"\).*$/\1\2/ + x + } + /"finished":/ { + H + x + s/[[:space:]]*"finished":[[:space:]]*\("[^"]*"\).*$/ \1/ + s/\([[:space:]]*\)\(.*\)/\1"elapsed": \2/ + x + G + } + ' status.txt + + +1 "started": "2021 Feb 13 13:38:58", +2 "finished": "2021 Feb 13 13:46:06", + +-- + +P "started": "2021 Feb 13 13:38:58", + +H "started": "2021 Feb 13 13:38:58", +P "started": "2021 Feb 13 13:38:58", + +H "started": "2021 Feb 13 13:38:58", +P "2021 Feb 13 13:38:58" + +H "2021 Feb 13 13:38:58" +P "started": "2021 Feb 13 13:38:58", + +-- + +H "2021 Feb 13 13:38:58" +P "finished": "2021 Feb 13 13:46:06", + +H "2021 Feb 13 13:38:58"\n"finished": "2021 Feb 13 13:46:06", +P "finished": "2021 Feb 13 13:46:06", + +H "finished": "2021 Feb 13 13:46:06", +P "2021 Feb 13 13:38:58"\n"finished": "2021 Feb 13 13:46:06", + +H "finished": "2021 Feb 13 13:46:06", +P "2021 Feb 13 13:38:58" "2021 Feb 13 13:46:06" + +H "finished": "2021 Feb 13 13:46:06", +P "elapsed": "2021 Feb 13 13:38:58" "2021 Feb 13 13:46:06" + +H "elapsed": "2021 Feb 13 13:38:58" "2021 Feb 13 13:46:06" +P "finished": "2021 Feb 13 13:46:06", + +P "finished": "2021 Feb 13 13:46:06", + "elapsed": "2021 Feb 13 13:38:58" "2021 Feb 13 13:46:06" + +-- + + > { + > "progress": "100", + > "started": "2021 Feb 13 13:33:46", + > "finished": "2021 Feb 13 13:33:46", + > "elapsed": "2021 Feb 13 13:33:46" "2021 Feb 13 13:33:46" + > "id": "20201013-131059_546082898", + > "status": "FINISHED" + > } + > { + > 
"progress": "100", + > "started": "2021 Feb 13 13:33:46", + > "finished": "2021 Feb 13 13:34:36", + > "elapsed": "2021 Feb 13 13:33:46" "2021 Feb 13 13:34:36" + > "id": "20201013-131649_1734629667", + > "status": "FINISHED" + > } + > .... + > .... + + # + # Add a call to datediff to generate the elapsed time. + # + + sed ' + s/\("started":[[:space:]]*\)"\([[:alpha:]]*\)[[:space:]]*\([[:alpha:]]*\)[[:space:]]*\([[:digit:]]*\)[[:space:]]*\([[:digit:]]*:[[:digit:]]*:[[:digit:]]*\)[[:space:]]*\([[:alpha:]]*\)[[:space:]]*\([[:digit:]]*\)"/\1"\7 \3 \4 \5"/ + s/\("finished":[[:space:]]*\)"\([[:alpha:]]*\)[[:space:]]*\([[:alpha:]]*\)[[:space:]]*\([[:digit:]]*\)[[:space:]]*\([[:digit:]]*:[[:digit:]]*:[[:digit:]]*\)[[:space:]]*\([[:alpha:]]*\)[[:space:]]*\([[:digit:]]*\)"/\1"\7 \3 \4 \5"/ + /"started":/ { + h + s/\([[:space:]]*\)"started":[[:space:]]*\("[^"]*"\).*$/\1\2/ + x + } + /"finished":/ { + H + x + s/[[:space:]]*"finished":[[:space:]]*\("[^"]*"\).*$/ \1/ + s/\([[:space:]]*\)\(.*\)/\1echo "\1\\"elapsed\\": \\"$(datediff --format "%H:%M:%S" --input-format "%Y %b %d %H:%M:%S" \2)\\","/e + x + G + } + ' status.txt + + > { + > "progress": "100", + > "started": "2021 Feb 13 13:33:46", + > "finished": "2021 Feb 13 13:33:46", + > "elapsed": "0:0:0", + > "id": "20201013-131059_546082898", + > "status": "FINISHED" + > } + > { + > "progress": "100", + > "started": "2021 Feb 13 13:33:46", + > "finished": "2021 Feb 13 13:34:36", + > "elapsed": "0:0:50", + > "id": "20201013-131649_1734629667", + > "status": "FINISHED" + > } + > .... + > .... + + # + # Putting it all together. + # All because Zeppelin uses the default Java data format in a JSON response. 
+ # + + curl \ + --silent \ + --cookie '/tmp/cookies' \ + 'http://128.232.227.222:8080/api/notebook/job/2FX82FMTH' \ + | jq '.body' \ + | sed ' + s/\("started":[[:space:]]*\)"\([[:alpha:]]*\)[[:space:]]*\([[:alpha:]]*\)[[:space:]]*\([[:digit:]]*\)[[:space:]]*\([[:digit:]]*:[[:digit:]]*:[[:digit:]]*\)[[:space:]]*\([[:alpha:]]*\)[[:space:]]*\([[:digit:]]*\)"/\1"\7 \3 \4 \5"/ + s/\("finished":[[:space:]]*\)"\([[:alpha:]]*\)[[:space:]]*\([[:alpha:]]*\)[[:space:]]*\([[:digit:]]*\)[[:space:]]*\([[:digit:]]*:[[:digit:]]*:[[:digit:]]*\)[[:space:]]*\([[:alpha:]]*\)[[:space:]]*\([[:digit:]]*\)"/\1"\7 \3 \4 \5"/ + /"started":/ { + h + s/\([[:space:]]*\)"started":[[:space:]]*\("[^"]*"\).*$/\1\2/ + x + } + /"finished":/ { + H + x + s/[[:space:]]*"finished":[[:space:]]*\("[^"]*"\).*$/ \1/ + s/\([[:space:]]*\)\(.*\)/\1echo "\1\\"elapsed\\": \\"$(datediff --format "%H:%M:%S" --input-format "%Y %b %d %H:%M:%S" \2)\\","/e + x + G + } + ' + + # + # Just get the elapsed time. + # + + curl \ + --silent \ + --cookie '/tmp/cookies' \ + 'http://128.232.227.222:8080/api/notebook/job/2FX82FMTH' \ + | jq '.body' \ + | sed ' + s/\("started":[[:space:]]*\)"\([[:alpha:]]*\)[[:space:]]*\([[:alpha:]]*\)[[:space:]]*\([[:digit:]]*\)[[:space:]]*\([[:digit:]]*:[[:digit:]]*:[[:digit:]]*\)[[:space:]]*\([[:alpha:]]*\)[[:space:]]*\([[:digit:]]*\)"/\1"\7 \3 \4 \5"/ + s/\("finished":[[:space:]]*\)"\([[:alpha:]]*\)[[:space:]]*\([[:alpha:]]*\)[[:space:]]*\([[:digit:]]*\)[[:space:]]*\([[:digit:]]*:[[:digit:]]*:[[:digit:]]*\)[[:space:]]*\([[:alpha:]]*\)[[:space:]]*\([[:digit:]]*\)"/\1"\7 \3 \4 \5"/ + /"started":/ { + h + s/\([[:space:]]*\)"started":[[:space:]]*\("[^"]*"\).*$/\1\2/ + x + } + /"finished":/ { + H + x + s/[[:space:]]*"finished":[[:space:]]*\("[^"]*"\).*$/ \1/ + s/\([[:space:]]*\)\(.*\)/\1echo "\1\\"elapsed\\": \\"$(datediff --format "%H:%M:%S" --input-format "%Y %b %d %H:%M:%S" \2)\\","/e + x + G + } + ' \ + | jq -r '.[] | select(.elapsed != null) | .elapsed' + + + + diff --git 
a/notes/zrq/20210215-01-speed-tests.txt b/notes/zrq/20210215-01-speed-tests.txt new file mode 100644 index 00000000..53856c05 --- /dev/null +++ b/notes/zrq/20210215-01-speed-tests.txt @@ -0,0 +1,482 @@ +# +# +# +# Copyright (c) 2021, ROE (http://www.roe.ac.uk/) +# +# This information is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This information is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +# +# +#zrq-notes-time +#zrq-notes-indent +#zrq-notes-crypto +#zrq-notes-ansible +#zrq-notes-osformat +#zrq-notes-zeppelin +# + + Target: + + Follow on from previous tests. + Compare 500,1000 ... trees + + Result: + + Work in progress .. + + +# ----------------------------------------------------- +# ----------------------------------------------------- +# Login to Zeppelin ... +#[user@desktop] + + firefox --new-window "http://zeppelin.metagrid.xyz:8080/" & + + +# ----------------------------------------------------- +# ----------------------------------------------------- + + Import notebooks from GitHub, clear the output and run all the cells ... + + Good astrometric solutions via ML Random Forrest classifier + https://raw.githubusercontent.com/wfau/aglais-notebooks/main/2FRPC4BFS/note.json + +# ----------------------------------------------------- + + dev deployment #3 + + test #3.6 + 100% data, 500 trees, cache + + 8:15 - run all + 13:30 - done + + # + # Get the elapsed time. 
+ # + + curl \ + --silent \ + --cookie '/tmp/cookies' \ + 'http://128.232.227.222:8080/api/notebook/job/2FX82FMTH' \ + | jq '.body' \ + | sed ' + s/\("started":[[:space:]]*\)"\([[:alpha:]]*\)[[:space:]]*\([[:alpha:]]*\)[[:space:]]*\([[:digit:]]*\)[[:space:]]*\([[:digit:]]*:[[:digit:]]*:[[:digit:]]*\)[[:space:]]*\([[:alpha:]]*\)[[:space:]]*\([[:digit:]]*\)"/\1"\7 \3 \4 \5"/ + s/\("finished":[[:space:]]*\)"\([[:alpha:]]*\)[[:space:]]*\([[:alpha:]]*\)[[:space:]]*\([[:digit:]]*\)[[:space:]]*\([[:digit:]]*:[[:digit:]]*:[[:digit:]]*\)[[:space:]]*\([[:alpha:]]*\)[[:space:]]*\([[:digit:]]*\)"/\1"\7 \3 \4 \5"/ + /"started":/ { + h + s/\([[:space:]]*\)"started":[[:space:]]*\("[^"]*"\).*$/\1\2/ + x + } + /"finished":/ { + H + x + s/[[:space:]]*"finished":[[:space:]]*\("[^"]*"\).*$/ \1/ + s/\([[:space:]]*\)\(.*\)/\1echo "\1\\"elapsed\\": \\"$(datediff --format "%H:%M:%S" --input-format "%Y %b %d %H:%M:%S" \2)\\","/e + x + G + } + ' \ + | jq -r '.[] | select(.elapsed != null) | .elapsed' + + + > 0:0:0 + > 0:0:55 + > 0:0:39 + > 0:3:55 + > 0:0:0 + > 0:8:2 + > 0:0:0 + > 1:18:44 + > 0:0:1 + > 0:20:20 + > 0:0:0 + > 1:4:36 + > 0:16:50 + > 0:33:29 + > 0:27:59 + > 0:27:39 + > 0:18:53 + > 0:16:43 + + # + # Manual annotation.. 
+ # + + > 0:0:0 + > 0:0:55 - Astrometric features + > 0:0:39 - Select sources + > 0:3:55 - Hertzsprung-Russell diagram + > 0:0:0 + > 0:8:2 - Selecting training data + > 0:0:0 + > 1:18:44 - Random forest training + > 0:0:1 + > 0:20:20 - Misclassification fraction + > 0:0:0 + > 1:4:36 - Hertzsprung-Russell diagram + > 0:16:50 - Classification probabilities + > 0:33:29 - Good plot + > 0:27:59 - Bad plot + > 0:27:39 - Good/bad count + > 0:18:53 - Error distribution + > 0:16:43 - Null values + + + dev deployment #3 + test #3.7 + modify the code to use cached data + + # cache it for speedy access below (all subsequent samples are derived from this): + - raw_sources_df.cache() + + cached_sources = raw_sources_df.cache() + + # register as SQL-queryable + - raw_sources_df.createOrReplaceTempView('raw_sources') + + cached_sources.createOrReplaceTempView('cached_sources') + + 100% data, 500 trees, cache + + # + # Get the elapsed time. + # + + curl \ + --silent \ + --cookie '/tmp/cookies' \ + 'http://128.232.227.222:8080/api/notebook/job/2FX82FMTH' \ + | jq '.body' \ + | sed ' + s/\("started":[[:space:]]*\)"\([[:alpha:]]*\)[[:space:]]*\([[:alpha:]]*\)[[:space:]]*\([[:digit:]]*\)[[:space:]]*\([[:digit:]]*:[[:digit:]]*:[[:digit:]]*\)[[:space:]]*\([[:alpha:]]*\)[[:space:]]*\([[:digit:]]*\)"/\1"\7 \3 \4 \5"/ + s/\("finished":[[:space:]]*\)"\([[:alpha:]]*\)[[:space:]]*\([[:alpha:]]*\)[[:space:]]*\([[:digit:]]*\)[[:space:]]*\([[:digit:]]*:[[:digit:]]*:[[:digit:]]*\)[[:space:]]*\([[:alpha:]]*\)[[:space:]]*\([[:digit:]]*\)"/\1"\7 \3 \4 \5"/ + /"started":/ { + h + s/\([[:space:]]*\)"started":[[:space:]]*\("[^"]*"\).*$/\1\2/ + x + } + /"finished":/ { + H + x + s/[[:space:]]*"finished":[[:space:]]*\("[^"]*"\).*$/ \1/ + s/\([[:space:]]*\)\(.*\)/\1echo "\1\\"elapsed\\": \\"$(datediff --format "%H:%M:%S" --input-format "%Y %b %d %H:%M:%S" \2)\\","/e + x + G + } + ' \ + | jq -r '.[] | select(.elapsed != null) | .elapsed' + + > 0:0:0 + > 0:0:6 - Astrometric features + > 0:11:20 - Select 
sources + > 0:0:5 - Hertzsprung-Russell diagram + > 0:0:0 + > 0:0:9 - Selecting training data + > 0:0:0 + > 0:18:42 - Random forest training + > 0:0:0 + > 0:0:29 - Misclassification fraction + > 0:0:0 + > 0:1:49 - Hertzsprung-Russell diagram + > 0:0:58 - Classification probabilities + > 0:0:55 - Good plot + > 0:0:54 - Bad plot + > 0:0:56 - Good/bad count + > 0:0:10 - Error distribution + > 0:0:47 - Null values + + + dev deployment #3 + test #3.8 + 100% data, 1000 trees, cache + + checkstatus() + { + curl \ + --silent \ + --cookie '/tmp/cookies' \ + 'http://128.232.227.222:8080/api/notebook/2FX82FMTH' \ + | jq '.' \ + | sed ' + /"dateStarted": null,/d + /"dateStarted":/ { + h + s/\([[:space:]]*\)"dateStarted":[[:space:]]*\("[^"]*"\).*$/\1\2/ + x + } + /"dateFinished": null,/ d + /"dateFinished":/ { + H + x + s/[[:space:]]*"dateFinished":[[:space:]]*\("[^"]*"\).*$/ \1/ + s/\([[:space:]]*\)\(.*\)/\1echo "\1\\"elapsedTime\\": \\"$(datediff --format "%H:%M:%S" --input-format "%b %d, %Y %H:%M:%S %p" \2)\\","/e + x + G + } + ' \ + | jq ' + .body.paragraphs[] | select(.results.code != null) | { + title, + result: .results.code, + time: .elapsedTime, + output: ((.results | select(.msg | length > 0) | .msg[] | select(.type == "TEXT") | .data | split("\n") | map(select(startswith("-")))) // "-") + } + ' + } + + checkstatus + + > { + > "title": "Introduction", + > "result": "SUCCESS", + > "time": "0:0:0", + > "output": "-" + > } + > { + > "title": "Astrometric features ", + > "result": "SUCCESS", + > "time": "0:0:5", + > "output": "-" + > } + > { + > "title": "Select sources", + > "result": "SUCCESS", + > "time": "0:1:27", + > "output": [] + > } + > { + > "title": "Hertzsprung-Russell diagram", + > "result": "SUCCESS", + > "time": "0:3:41", + > "output": [] + > } + > { + > "title": null, + > "result": "SUCCESS", + > "time": "0:0:0", + > "output": "-" + > } + > { + > "title": "Selecting training data", + > "result": "SUCCESS", + > "time": "0:6:34", + > "output": [] + > } + > { 
+ > "title": null, + > "result": "SUCCESS", + > "time": "0:0:0", + > "output": "-" + > } + > { + > "title": "Random forest training", + > "result": "SUCCESS", + > "time": "1:34:11", + > "output": "-" + > } + > { + > "title": "Random forest testing", + > "result": "SUCCESS", + > "time": "0:0:0", + > "output": "-" + > } + > { + > "title": "Misclassification fraction", + > "result": "SUCCESS", + > "time": "0:17:40", + > "output": [ + > "------------------------------" + > ] + > } + > { + > "title": "Feature importance", + > "result": "SUCCESS", + > "time": "0:0:0", + > "output": [] + > } + > { + > "title": "Hertzsprung-Russell diagram", + > "result": "SUCCESS", + > "time": "0:54:46", + > "output": [] + > } + > { + > "title": "Classification probabilities", + > "result": "SUCCESS", + > "time": "0:14:22", + > "output": [] + > } + > { + > "title": "Good sources plot", + > "result": "SUCCESS", + > "time": "0:27:28", + > "output": [] + > } + + + dev deployment #3 + test #3.9 + repeat the same test + 100% data, 1000 trees, cache + minor edits to print statements + + start 18:47 + + checkstatus + + > { + > "title": "Introduction", + > "result": "SUCCESS", + > "time": "0:0:0", + > "output": "-" + > } + > { + > "title": "Astrometric features ", + > "result": "SUCCESS", + > "time": "0:0:5", + > "output": "-" + > } + > { + > "title": "Select sources", + > "result": "SUCCESS", + > "time": "0:1:22", + > "output": [ + > "- Cached rows : 1724028 rows" + > ] + > } + + + + > { + > "title": "Introduction", + > "result": "SUCCESS", + > "time": "0:0:0", + > "output": "-" + > } + > { + > "title": "Astrometric features ", + > "result": "SUCCESS", + > "time": "0:0:5", + > "output": "-" + > } + > { + > "title": "Select sources", + > "result": "SUCCESS", + > "time": "0:1:22", + > "output": [ + > "- Cached rows : 1724028 rows" + > ] + > } + > { + > "title": "Hertzsprung-Russell diagram", + > "result": "SUCCESS", + > "time": "0:4:1", + > "output": [] + > } + > { + > "title": null, + > "result": 
"SUCCESS", + > "time": "0:0:1", + > "output": "-" + > } + > { + > "title": "Selecting training data", + > "result": "SUCCESS", + > "time": "0:6:39", + > "output": [ + > "- Good training data size: 244740 rows", + > "- Bad training data size: 244740 rows" + > ] + > } + > { + > "title": null, + > "result": "SUCCESS", + > "time": "0:18:14", + > "output": [ + > "- Combined training data : 328175 rows" + > ] + > } + > { + > "title": "Random forest training", + > "result": "SUCCESS", + > "time": "1:26:12", + > "output": [ + > "- Classifier : 1000 trees" + > ] + > } + > { + > "title": "Random forest testing", + > "result": "SUCCESS", + > "time": "0:0:0", + > "output": "-" + > } + > { + > "title": "Misclassification fraction", + > "result": "SUCCESS", + > "time": "0:18:7", + > "output": [ + > "- Misclassifications for the test set: 0.35 %" + > ] + > } + > { + > "title": "Feature importance", + > "result": "SUCCESS", + > "time": "0:0:0", + > "output": [] + > } + > { + > "title": "Hertzsprung-Russell diagram", + > "result": "SUCCESS", + > "time": "0:56:23", + > "output": [] + > } + > { + > "title": "Classification probabilities", + > "result": "SUCCESS", + > "time": "0:14:2", + > "output": [] + > } + > { + > "title": "Good sources plot", + > "result": "SUCCESS", + > "time": "0:26:49", + > "output": [] + > } + > { + > "title": "Bad sources plot", + > "result": "SUCCESS", + > "time": "0:26:34", + > "output": [] + > } + > { + > "title": "Good/bad count", + > "result": "SUCCESS", + > "time": "0:27:9", + > "output": [ + > "- Found 22263 good sources", + > "- Found 26161 bad sources" + > ] + > } + > { + > "title": "Parallax over error distribution", + > "result": "SUCCESS", + > "time": "0:16:43", + > "output": [] + > } + > { + > "title": "Null values check", + > "result": "SUCCESS", + > "time": "0:9:39", + > "output": [] + > } + diff --git a/notes/zrq/20210215-02-timing-logger.txt b/notes/zrq/20210215-02-timing-logger.txt new file mode 100644 index 00000000..d08a5bdd --- /dev/null 
+++ b/notes/zrq/20210215-02-timing-logger.txt @@ -0,0 +1,1046 @@ +# +# +# +# Copyright (c) 2021, ROE (http://www.roe.ac.uk/) +# +# This information is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This information is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +# +# +#zrq-notes-time +#zrq-notes-indent +#zrq-notes-crypto +#zrq-notes-ansible +#zrq-notes-osformat +#zrq-notes-zeppelin +# + + Target: + + Issue #371 + https://github.com/wfau/aglais/issues/371 + + Create a jq parser to extract the timing information from a Zeppelin notebook. + + Result: + + Work in progress ... + + + # + # REST API documentation. + # https://zeppelin.apache.org/docs/0.7.0/rest-api/rest-notebook.html#export-a-note + + # + # REST API template + # http://[zeppelin-server]:[zeppelin-port]/api/notebook/export/[noteId] + + # + # HTML notebook URL + # http://128.232.227.222:8080/#/notebook/2FX82FMTH + + +# ----------------------------------------------------- +# Use the REST API to get the notebook contents. +#[user@desktop] + + curl \ + --silent \ + --cookie '/tmp/cookies' \ + 'http://128.232.227.222:8080/api/notebook/2FX82FMTH' \ + | jq '.' + + > .... + > .... + + # + # Select just the text outputs. + # + + curl \ + --silent \ + --cookie '/tmp/cookies' \ + 'http://128.232.227.222:8080/api/notebook/2FX82FMTH' \ + | jq ' + .body.paragraphs[] | .results | select(.code == "SUCCESS") | .msg[] | select(.type == "TEXT") + ' + + > { + > "type": "TEXT", + > "data": "1724028" + > } + > { + > "type": "TEXT", + > "data": "
\n" + > } + > { + > "type": "TEXT", + > "data": "Good training data size: 244740 rows\nBad training data size: 244740 rows\n" + > } + > { + > "type": "TEXT", + > "data": " | 1 2\n------------------------------\n 1 | 80320 553\n 2 | 10 80422\n\nMisclassifications for the test set: 0.35 %\n" + > } + > { + > "type": "TEXT", + > "data": "Relative importance of astrometric features:\n\n parallax_error : 0.238293\n parallax_over_error : 0.090757\n astrometric_sigma_5d_max : 0.216903\n pmra_error : 0.152419\n pmdec_error : 0.135658\n astrometric_excess_noise : 0.078488\nipd_gof_harmonic_amplitude : 0.036072\n ruwe : 0.016268\n visibility_periods_used : 0.007118\n pmdec : 0.007725\n pmra : 0.004116\n ipd_frac_odd_win : 0.001107\n ipd_frac_multi_peak : 0.009796\n astrometric_gof_al : 0.003791\n parallax_pmdec_corr : 0.000393\nastrometric_excess_noise_sig : 0.001098\n" + > } + > { + > "type": "TEXT", + > "data": "
\n" + > } + + + # + # Exclude image outputs. + # + + curl \ + --silent \ + --cookie '/tmp/cookies' \ + 'http://128.232.227.222:8080/api/notebook/2FX82FMTH' \ + | jq ' + .body.paragraphs[] | del(.results | select(.code == "SUCCESS") | .msg[] | select(.type == "IMG")) + ' + + > { + > "text": "%md\n\n# Using ML to define an astrometrically clean sample of stars\n\n Follows Gaia EDR3 performance verification paper DPACP-81 (Smart et al.) in classifying astrometric solutions as good or bad\n via supervised ML. Employs a Random Forrest classifier plus appropriately defined training sets - see\n\n https://arxiv.org/abs/2012.02061\n \n for further details. The work flow implemented here follows closely that described in Section 2, \"GCNS Generation\"\n (GCNS = Gaia Catalogue of Nearby Stars) and is designed to clean up a 100pc (= nearby) sample.\n\n Version employing newer, richer dataframe API in pyspark ML\n \n IMPORTANT NOTE: current deployment has Spark 2.4.7 installed. That specific version's API is documented here:\n \n https://spark.apache.org/docs/2.4.7/ml-classification-regression.html#random-forest-classifier\n \n Beware of following on-line message board and other fora posts for help and examples as they more often than not describe and link to different versions, and the API is evolving all the time.\n \n ", + > "user": "gaiauser", + > "dateUpdated": "Feb 15, 2021 8:15:17 AM", + > "config": { + > "tableHide": false, + > "editorSetting": { + > "language": "markdown", + > "editOnDblClick": true, + > "completionKey": "TAB", + > "completionSupport": false + > }, + > "colWidth": 12, + > "editorMode": "ace/mode/markdown", + > "fontSize": 9, + > "editorHide": false, + > "results": {}, + > "enabled": true + > }, + > "settings": { + > "params": {}, + > "forms": {} + > }, + > "results": { + > "code": "SUCCESS", + > "msg": [ + > { + > "type": "HTML", + > "data": "
\n

Using ML to define an astrometrically clean sample of stars

\n

Follows Gaia EDR3 performance verification paper DPACP-81 (Smart et al.) in classifying astrometric solutions as good or bad
via supervised ML. Employs a Random Forrest classifier plus appropriately defined training sets - see

\n

https://arxiv.org/abs/2012.02061

\n

for further details. The work flow implemented here follows closely that described in Section 2, “GCNS Generation”
(GCNS = Gaia Catalogue of Nearby Stars) and is designed to clean up a 100pc (= nearby) sample.

\n

Version employing newer, richer dataframe API in pyspark ML

\n

IMPORTANT NOTE: current deployment has Spark 2.4.7 installed. That specific version’s API is documented here:

\n

https://spark.apache.org/docs/2.4.7/ml-classification-regression.html#random-forest-classifier

\n

Beware of following on-line message board and other fora posts for help and examples as they more often than not describe and link to different versions, and the API is evolving all the time.

\n
" + > } + > ] + > }, + > "apps": [], + > "jobName": "paragraph_1613126076679_1211627861", + > "id": "20201013-131059_546082898", + > "dateCreated": "Feb 12, 2021 10:34:36 AM", + > "dateStarted": "Feb 15, 2021 8:15:17 AM", + > "dateFinished": "Feb 15, 2021 8:15:17 AM", + > "status": "FINISHED", + > "progressUpdateIntervalMs": 500 + > } + > .... + > .... + > { + > "text": "%spark.pyspark\n\n# where are the NULLs in raw_sources features selection?\nfor feature in astrometric_features: print (spark.sql('SELECT COUNT(*) AS ' + feature + '_nulls FROM raw_sources WHERE ' + feature + ' IS NULL').show())\n# scan_direction_strength_k2 is the culprit!\n \n# alternatively could try:\n#Dict_Null = {col:df.filter(df[col].isNull()).count() for col in df.columns}\n#Dict_Null\n \n", + > "user": "gaiauser", + > "dateUpdated": "Feb 13, 2021 6:14:46 PM", + > "config": { + > "editorSetting": { + > "language": "python", + > "editOnDblClick": false, + > "completionKey": "TAB", + > "completionSupport": true + > }, + > "colWidth": 12, + > "editorMode": "ace/mode/python", + > "fontSize": 9, + > "results": {}, + > "enabled": true + > }, + > "settings": { + > "params": {}, + > "forms": {} + > }, + > "apps": [], + > "jobName": "paragraph_1613126076687_1356332997", + > "id": "20201124-171324_1960205489", + > "dateCreated": "Feb 12, 2021 10:34:36 AM", + > "dateStarted": "Feb 13, 2021 6:14:46 PM", + > "dateFinished": "Feb 13, 2021 6:29:25 PM", + > "status": "FINISHED", + > "errorMessage": "", + > "progressUpdateIntervalMs": 500 + > } + + # + # Select specific fields. + # + + curl \ + --silent \ + --cookie '/tmp/cookies' \ + 'http://128.232.227.222:8080/api/notebook/2FYRDDR17' \ + | jq ' + .body.paragraphs[] | { + title, + status, + dateStarted, + dateFinished, + status: .results.code + } + ' + + > { + > "title": "Paragraph 001", + > "status": "SUCCESS", + > "dateStarted": "Feb 15, 2021 11:51:15 AM", + > "dateFinished": "Feb 15, 2021 11:51:17 AM" + > } + > .... + > .... 
+ + # + # Select text message lines that begin with '-'. + # + + curl \ + --silent \ + --cookie '/tmp/cookies' \ + 'http://128.232.227.222:8080/api/notebook/2FYRDDR17' \ + | jq ' + .body.paragraphs[] | { + title, + status, + dateStarted, + dateFinished, + status: .results.code, + output: (.results | select(.msg | length > 0) | .msg[] | select(.type == "TEXT") | .data | split("\n") | map(select(startswith("-")))) + } + ' + + > { + > "title": "Paragraph 001", + > "status": "SUCCESS", + > "dateStarted": "Feb 15, 2021 11:51:15 AM", + > "dateFinished": "Feb 15, 2021 11:51:17 AM", + > "output": [ + > "-rw-------. 1 fedora fedora 503 Feb 13 03:21 .bash_history", + > "-rw-r--r--. 1 fedora fedora 18 Feb 16 2019 .bash_logout", + > "-rw-r--r--. 1 fedora fedora 141 Feb 16 2019 .bash_profile", + > "-rw-r--r--. 1 fedora fedora 376 Feb 16 2019 .bashrc", + > "-rw-------. 1 fedora fedora 0 Feb 12 10:34 .scala_history" + > ] + > } + + # + # Add the elapsed time calculation. + # + + curl \ + --silent \ + --cookie '/tmp/cookies' \ + 'http://128.232.227.222:8080/api/notebook/2FYRDDR17' \ + | jq '.' \ + | sed ' + s/\("dateStarted":[[:space:]]*\)"\([[:alpha:]]*\)[[:space:]]*\([[:alpha:]]*\)[[:space:]]*\([[:digit:]]*\)[[:space:]]*\([[:digit:]]*:[[:digit:]]*:[[:digit:]]*\)[[:space:]]*\([[:alpha:]]*\)[[:space:]]*\([[:digit:]]*\)"/\1"\7 \3 \4 \5"/ + s/\("dateFinished":[[:space:]]*\)"\([[:alpha:]]*\)[[:space:]]*\([[:alpha:]]*\)[[:space:]]*\([[:digit:]]*\)[[:space:]]*\([[:digit:]]*:[[:digit:]]*:[[:digit:]]*\)[[:space:]]*\([[:alpha:]]*\)[[:space:]]*\([[:digit:]]*\)"/\1"\7 \3 \4 \5"/ + /"started":/ { + h + s/\([[:space:]]*\)"dateStarted":[[:space:]]*\("[^"]*"\).*$/\1\2/ + x + } + /"finished":/ { + H + x + s/[[:space:]]*"dateFinished":[[:space:]]*\("[^"]*"\).*$/ \1/ + s/\([[:space:]]*\)\(.*\)/\1echo "\1\\"elapsedTime\\": \\"$(datediff --format "%H:%M:%S" --input-format "%Y %b %d %H:%M:%S" \2)\\","/e + x + G + } + ' \ + + # + # !!!!! the dates are in a different format ! 
+ # + + # + # Back to square one with the date format .. although they are not as bad as the previous ones. + # Might be able to do it without the sed processing step. + # + + + curl \ + --silent \ + --cookie '/tmp/cookies' \ + 'http://128.232.227.222:8080/api/notebook/2FYRDDR17' \ + | jq '.' \ + | sed ' + /"dateStarted":/ { + h + s/\([[:space:]]*\)"dateStarted":[[:space:]]*\("[^"]*"\).*$/\1\2/ + x + } + /"dateFinished":/ { + H + x + s/[[:space:]]*"dateFinished":[[:space:]]*\("[^"]*"\).*$/ \1/ + s/\([[:space:]]*\)\(.*\)/\1echo "\1\\"elapsedTime\\": \\"$(datediff --format "%H:%M:%S" --input-format "%b %d, %Y %H:%M:%S %p" \2)\\","/e + x + G + } + ' \ + + > { + > "status": "OK", + > "message": "", + > "body": { + > "paragraphs": [ + > { + > "title": "Paragraph 001", + > "text": "%sh\nls -al\n", + > "user": "gaiauser", + > "dateUpdated": "Feb 15, 2021 11:51:15 AM", + > .... + > .... + > "id": "20210215-115033_1362835282", + > "dateCreated": "Feb 15, 2021 11:50:33 AM", + > "dateStarted": "Feb 15, 2021 11:51:15 AM", + > "dateFinished": "Feb 15, 2021 11:51:17 AM", + > "elapsedTime": "0:0:2", + > "status": "FINISHED", + > "progressUpdateIntervalMs": 500 + > }, + > .... + > .... + > "info": {} + > } + > } + + # + # Add the field selection and output parser. 
+ # + + curl \ + --silent \ + --cookie '/tmp/cookies' \ + 'http://128.232.227.222:8080/api/notebook/2FYRDDR17' \ + | jq ' + .body.paragraphs[] | { + title, + status, + dateStarted, + dateFinished, + status: .results.code, + output: (.results | select(.msg | length > 0) | .msg[] | select(.type == "TEXT") | .data | split("\n") | map(select(startswith("-")))) + } + ' \ + | sed ' + /"dateStarted":/ { + h + s/\([[:space:]]*\)"dateStarted":[[:space:]]*\("[^"]*"\).*$/\1\2/ + x + } + /"dateFinished":/ { + H + x + s/[[:space:]]*"dateFinished":[[:space:]]*\("[^"]*"\).*$/ \1/ + s/\([[:space:]]*\)\(.*\)/\1echo "\1\\"elapsedTime\\": \\"$(datediff --format "%H:%M:%S" --input-format "%b %d, %Y %H:%M:%S %p" \2)\\","/e + x + G + } + ' + + > { + > "title": "Paragraph 001", + > "status": "SUCCESS", + > "dateStarted": "Feb 15, 2021 11:51:15 AM", + > "dateFinished": "Feb 15, 2021 11:51:17 AM", + > "elapsedTime": "0:0:2", + > "output": [ + > "-rw-------. 1 fedora fedora 503 Feb 13 03:21 .bash_history", + > "-rw-r--r--. 1 fedora fedora 18 Feb 16 2019 .bash_logout", + > "-rw-r--r--. 1 fedora fedora 141 Feb 16 2019 .bash_profile", + > "-rw-r--r--. 1 fedora fedora 376 Feb 16 2019 .bashrc", + > "-rw-------. 1 fedora fedora 0 Feb 12 10:34 .scala_history" + > ] + > } + + +# ----------------------------------------------------- +# Try it on the real notebook. 
+#[user@desktop] + + curl \ + --silent \ + --cookie '/tmp/cookies' \ + 'http://128.232.227.222:8080/api/notebook/2FX82FMTH' \ + | jq ' + .body.paragraphs[] | { + title, + status, + dateStarted, + dateFinished, + status: .results.code, + output: (.results | select(.msg | length > 0) | .msg[] | select(.type == "TEXT") | .data | split("\n") | map(select(startswith("-")))) + } + ' \ + | sed ' + /"dateStarted":/ { + h + s/\([[:space:]]*\)"dateStarted":[[:space:]]*\("[^"]*"\).*$/\1\2/ + x + } + /"dateFinished":/ { + H + x + s/[[:space:]]*"dateFinished":[[:space:]]*\("[^"]*"\).*$/ \1/ + s/\([[:space:]]*\)\(.*\)/\1echo "\1\\"elapsedTime\\": \\"$(datediff --format "%H:%M:%S" --input-format "%b %d, %Y %H:%M:%S %p" \2)\\","/e + x + G + } + ' + + > { + > "title": "Select sources", + > "status": "SUCCESS", + > "dateStarted": "Feb 15, 2021 2:25:36 PM", + > "dateFinished": "Feb 15, 2021 2:27:03 PM", + > "elapsedTime": "0:1:27", + > "output": [] + > } + > { + > "title": "Hertzsprung-Russell diagram", + > "status": "SUCCESS", + > "dateStarted": "Feb 15, 2021 2:27:03 PM", + > "dateFinished": "Feb 15, 2021 2:30:44 PM", + > "elapsedTime": "0:3:41", + > "output": [] + > } + > { + > "title": "Selecting training data", + > "status": "SUCCESS", + > "dateStarted": "Feb 15, 2021 2:30:44 PM", + > "dateFinished": "Feb 15, 2021 2:37:18 PM", + > "elapsedTime": "0:6:34", + > "output": [] + > } + + # + # Missing some elements .. 
+ # + + curl \ + --silent \ + --cookie '/tmp/cookies' \ + 'http://128.232.227.222:8080/api/notebook/2FX82FMTH' \ + | jq ' + .body.paragraphs[] | { + title, + status, + dateStarted, + dateFinished, + status: .results.code + } + ' \ + | sed ' + /"dateStarted":/ { + h + s/\([[:space:]]*\)"dateStarted":[[:space:]]*\("[^"]*"\).*$/\1\2/ + x + } + /"dateFinished":/ { + H + x + s/[[:space:]]*"dateFinished":[[:space:]]*\("[^"]*"\).*$/ \1/ + s/\([[:space:]]*\)\(.*\)/\1echo "\1\\"elapsedTime\\": \\"$(datediff --format "%H:%M:%S" --input-format "%b %d, %Y %H:%M:%S %p" \2)\\","/e + x + G + } + ' + + # Skips rows if output is null ? + + curl \ + --silent \ + --cookie '/tmp/cookies' \ + 'http://128.232.227.222:8080/api/notebook/2FX82FMTH' \ + | jq ' + .body.paragraphs[] | { + title, + status, + dateStarted, + dateFinished, + status: .results.code, + output: (.results | select(.msg | length > 0) | .msg[] | select(.type == "TEXT")) + } + ' + + > { + > "title": "Select sources", + > "status": "SUCCESS", + > "dateStarted": "Feb 15, 2021 2:25:36 PM", + > "dateFinished": "Feb 15, 2021 2:27:03 PM", + > "output": { + > "type": "TEXT", + > "data": "1724028" + > } + > } + > { + > "title": "Hertzsprung-Russell diagram", + > "status": "SUCCESS", + > "dateStarted": "Feb 15, 2021 2:27:03 PM", + > "dateFinished": "Feb 15, 2021 2:30:44 PM", + > "output": { + > "type": "TEXT", + > "data": "
\n" + > } + > } + > { + > "title": "Selecting training data", + > "status": "SUCCESS", + > "dateStarted": "Feb 15, 2021 2:30:44 PM", + > "dateFinished": "Feb 15, 2021 2:37:18 PM", + > "output": { + > "type": "TEXT", + > "data": "Good training data size: 244740 rows\nBad training data size: 244740 rows\n" + > } + > } + + # + # Add a default value to the output. + # + + curl \ + --silent \ + --cookie '/tmp/cookies' \ + 'http://128.232.227.222:8080/api/notebook/2FX82FMTH' \ + | jq ' + .body.paragraphs[] | { + title, + status, + dateStarted, + dateFinished, + status: .results.code, + output: ((.results | select(.msg | length > 0) | .msg[] | select(.type == "TEXT") ) // "-") + } + ' + + > { + > "title": "Introduction", + > "status": "SUCCESS", + > "dateStarted": "Feb 15, 2021 2:25:31 PM", + > "dateFinished": "Feb 15, 2021 2:25:31 PM", + > "output": "-" + > } + > { + > "title": "Astrometric features ", + > "status": "SUCCESS", + > "dateStarted": "Feb 15, 2021 2:25:31 PM", + > "dateFinished": "Feb 15, 2021 2:25:36 PM", + > "output": "-" + > } + > { + > "title": "Select sources", + > "status": "SUCCESS", + > "dateStarted": "Feb 15, 2021 2:25:36 PM", + > "dateFinished": "Feb 15, 2021 2:27:03 PM", + > "output": { + > "type": "TEXT", + > "data": "1724028" + > } + > } + > { + > "title": "Hertzsprung-Russell diagram", + > "status": "SUCCESS", + > "dateStarted": "Feb 15, 2021 2:27:03 PM", + > "dateFinished": "Feb 15, 2021 2:30:44 PM", + > "output": { + > "type": "TEXT", + > "data": "
\n" + > } + > } + > { + > "title": null, + > "status": "SUCCESS", + > "dateStarted": "Feb 15, 2021 2:30:44 PM", + > "dateFinished": "Feb 15, 2021 2:30:44 PM", + > "output": "-" + > } + > { + > "title": "Selecting training data", + > "status": "SUCCESS", + > "dateStarted": "Feb 15, 2021 2:30:44 PM", + > "dateFinished": "Feb 15, 2021 2:37:18 PM", + > "output": { + > "type": "TEXT", + > "data": "Good training data size: 244740 rows\nBad training data size: 244740 rows\n" + > } + > } + > { + > "title": null, + > "status": "SUCCESS", + > "dateStarted": "Feb 15, 2021 2:37:19 PM", + > "dateFinished": "Feb 15, 2021 2:37:19 PM", + > "output": "-" + > } + > { + > "title": "Random forest training", + > "status": null, + > "dateStarted": "Feb 15, 2021 2:37:19 PM", + > "dateFinished": "Feb 15, 2021 2:16:12 PM", + > "output": "-" + > } + > { + > "title": "Random forest testing", + > "status": null, + > "dateStarted": "Feb 15, 2021 2:16:12 PM", + > "dateFinished": "Feb 15, 2021 2:16:12 PM", + > "output": "-" + > } + > { + > "title": "Misclassification fraction", + > "status": null, + > "dateStarted": "Feb 15, 2021 2:16:12 PM", + > "dateFinished": "Feb 15, 2021 2:16:41 PM", + > "output": "-" + > } + > { + > "title": "Feature importance", + > "status": null, + > "dateStarted": "Feb 15, 2021 2:16:42 PM", + > "dateFinished": "Feb 15, 2021 2:16:42 PM", + > "output": "-" + > } + > { + > "title": "Hertzsprung-Russell diagram", + > "status": null, + > "dateStarted": "Feb 15, 2021 2:16:42 PM", + > "dateFinished": "Feb 15, 2021 2:18:31 PM", + > "output": "-" + > } + > { + > "title": "Classification probabilities", + > "status": null, + > "dateStarted": "Feb 15, 2021 2:18:31 PM", + > "dateFinished": "Feb 15, 2021 2:19:29 PM", + > "output": "-" + > } + > { + > "title": "Good sources plot", + > "status": null, + > "dateStarted": "Feb 15, 2021 2:19:29 PM", + > "dateFinished": "Feb 15, 2021 2:20:24 PM", + > "output": "-" + > } + > { + > "title": "Bad sources plot", + > "status": null, + > 
"dateStarted": "Feb 15, 2021 2:20:25 PM", + > "dateFinished": "Feb 15, 2021 2:21:19 PM", + > "output": "-" + > } + > { + > "title": "Good/bad count", + > "status": null, + > "dateStarted": "Feb 15, 2021 2:21:20 PM", + > "dateFinished": "Feb 15, 2021 2:22:16 PM", + > "output": "-" + > } + > { + > "title": "Parallax over error distribution", + > "status": null, + > "dateStarted": "Feb 15, 2021 2:22:16 PM", + > "dateFinished": "Feb 15, 2021 2:22:26 PM", + > "output": "-" + > } + > { + > "title": "Null values check", + > "status": null, + > "dateStarted": "Feb 15, 2021 2:22:26 PM", + > "dateFinished": "Feb 15, 2021 2:23:13 PM", + > "output": "-" + > } + > { + > "title": null, + > "status": null, + > "dateStarted": null, + > "dateFinished": null, + > "output": "-" + > } + +# ----------------------------------------------------- +# Try it on the real notebook. +#[user@desktop] + + curl \ + --silent \ + --cookie '/tmp/cookies' \ + 'http://128.232.227.222:8080/api/notebook/2FX82FMTH' \ + | jq ' + .body.paragraphs[] | { + title, + status, + dateStarted, + dateFinished, + status: .results.code, + output: ((.results | select(.msg | length > 0) | .msg[] | select(.type == "TEXT") | .data | split("\n") | map(select(startswith("-")))) // "-") + } + ' \ + | sed ' + /"dateStarted":/ { + h + s/\([[:space:]]*\)"dateStarted":[[:space:]]*\("[^"]*"\).*$/\1\2/ + x + } + /"dateFinished":/ { + H + x + s/[[:space:]]*"dateFinished":[[:space:]]*\("[^"]*"\).*$/ \1/ + s/\([[:space:]]*\)\(.*\)/\1echo "\1\\"elapsedTime\\": \\"$(datediff --format "%H:%M:%S" --input-format "%b %d, %Y %H:%M:%S %p" \2)\\","/e + x + G + } + ' + + > { + > "title": "Introduction", + > "status": "SUCCESS", + > "dateStarted": "Feb 15, 2021 2:25:31 PM", + > "dateFinished": "Feb 15, 2021 2:25:31 PM", + > "elapsedTime": "0:0:0", + > "output": "-" + > } + > .... + > .... 
+ > { + > "title": "Null values check", + > "status": null, + > "dateStarted": "Feb 15, 2021 2:22:26 PM", + > "dateFinished": "Feb 15, 2021 2:23:13 PM", + > "elapsedTime": "0:0:47", + > "output": "-" + > } + > { + > "title": null, + > "status": null, + > "dateStarted": null, + > ddiff: Error: reference DATE must be specified + > + > sh: line 2: dateFinished:: command not found + > "dateFinished": null, + > "elapsedTime": "Usage: datediff [OPTION]... DATE/TIME [DATE/TIME]... + + +# ----------------------------------------------------- +# Add a null value check for the dates. +#[user@desktop] + + curl \ + --silent \ + --cookie '/tmp/cookies' \ + 'http://128.232.227.222:8080/api/notebook/2FX82FMTH' \ + | jq ' + .body.paragraphs[] | { + title, + status, + dateStarted, + dateFinished, + status: .results.code, + output: ((.results | select(.msg | length > 0) | .msg[] | select(.type == "TEXT") | .data | split("\n") | map(select(startswith("-")))) // "-") + } + ' \ + | sed ' + /"dateStarted": null,/d + /"dateStarted":/ { + h + s/\([[:space:]]*\)"dateStarted":[[:space:]]*\("[^"]*"\).*$/\1\2/ + x + } + /"dateFinished": null,/ d + /"dateFinished":/ { + H + x + s/[[:space:]]*"dateFinished":[[:space:]]*\("[^"]*"\).*$/ \1/ + s/\([[:space:]]*\)\(.*\)/\1echo "\1\\"elapsedTime\\": \\"$(datediff --format "%H:%M:%S" --input-format "%b %d, %Y %H:%M:%S %p" \2)\\","/e + x + G + } + ' + + > { + > "title": "Introduction", + > "status": "SUCCESS", + > "dateStarted": "Feb 15, 2021 2:25:31 PM", + > "dateFinished": "Feb 15, 2021 2:25:31 PM", + > "elapsedTime": "0:0:0", + > "output": "-" + > } + > { + > "title": "Astrometric features ", + > "status": "SUCCESS", + > "dateStarted": "Feb 15, 2021 2:25:31 PM", + > "dateFinished": "Feb 15, 2021 2:25:36 PM", + > "elapsedTime": "0:0:5", + > "output": "-" + > } + > .... + > .... 
+ > { + > "title": "Null values check", + > "status": null, + > "dateStarted": "Feb 15, 2021 2:22:26 PM", + > "dateFinished": "Feb 15, 2021 2:23:13 PM", + > "elapsedTime": "0:0:47", + > "output": "-" + > } + > { + > "title": null, + > "status": null, + > "output": "-" + > } + + + +# ----------------------------------------------------- +# Skip cells with no result code. +#[user@desktop] + + curl \ + --silent \ + --cookie '/tmp/cookies' \ + 'http://128.232.227.222:8080/api/notebook/2FX82FMTH' \ + | jq ' + .body.paragraphs[] | select(.results.code != null) | { + title, + status, + dateStarted, + dateFinished, + result: .results.code, + output: ((.results | select(.msg | length > 0) | .msg[] | select(.type == "TEXT") | .data | split("\n") | map(select(startswith("-")))) // "-") + } + ' \ + | sed ' + /"dateStarted": null,/d + /"dateStarted":/ { + h + s/\([[:space:]]*\)"dateStarted":[[:space:]]*\("[^"]*"\).*$/\1\2/ + x + } + /"dateFinished": null,/ d + /"dateFinished":/ { + H + x + s/[[:space:]]*"dateFinished":[[:space:]]*\("[^"]*"\).*$/ \1/ + s/\([[:space:]]*\)\(.*\)/\1echo "\1\\"elapsedTime\\": \\"$(datediff --format "%H:%M:%S" --input-format "%b %d, %Y %H:%M:%S %p" \2)\\","/e + x + G + } + ' + + > { + > "title": "Introduction", + > "status": "FINISHED", + > "dateStarted": "Feb 15, 2021 2:25:31 PM", + > "dateFinished": "Feb 15, 2021 2:25:31 PM", + > "elapsedTime": "0:0:0", + > "result": "SUCCESS", + > "output": "-" + > } + > { + > "title": "Astrometric features ", + > "status": "FINISHED", + > "dateStarted": "Feb 15, 2021 2:25:31 PM", + > "dateFinished": "Feb 15, 2021 2:25:36 PM", + > "elapsedTime": "0:0:5", + > "result": "SUCCESS", + > "output": "-" + > } + > { + > "title": "Select sources", + > "status": "FINISHED", + > "dateStarted": "Feb 15, 2021 2:25:36 PM", + > "dateFinished": "Feb 15, 2021 2:27:03 PM", + > "elapsedTime": "0:1:27", + > "result": "SUCCESS", + > "output": [] + > } + > { + > "title": "Hertzsprung-Russell diagram", + > "status": "FINISHED", + > 
"dateStarted": "Feb 15, 2021 2:27:03 PM", + > "dateFinished": "Feb 15, 2021 2:30:44 PM", + > "elapsedTime": "0:3:41", + > "result": "SUCCESS", + > "output": [] + > } + > { + > "title": null, + > "status": "FINISHED", + > "dateStarted": "Feb 15, 2021 2:30:44 PM", + > "dateFinished": "Feb 15, 2021 2:30:44 PM", + > "elapsedTime": "0:0:0", + > "result": "SUCCESS", + > "output": "-" + > } + > { + > "title": "Selecting training data", + > "status": "FINISHED", + > "dateStarted": "Feb 15, 2021 2:30:44 PM", + > "dateFinished": "Feb 15, 2021 2:37:18 PM", + > "elapsedTime": "0:6:34", + > "result": "SUCCESS", + > "output": [] + > } + > { + > "title": null, + > "status": "FINISHED", + > "dateStarted": "Feb 15, 2021 2:37:19 PM", + > "dateFinished": "Feb 15, 2021 2:37:19 PM", + > "elapsedTime": "0:0:0", + > "result": "SUCCESS", + > "output": "-" + > } + > { + > "title": "Random forest training", + > "status": "FINISHED", + > "dateStarted": "Feb 15, 2021 2:37:19 PM", + > "dateFinished": "Feb 15, 2021 4:11:30 PM", + > "elapsedTime": "1:34:11", + > "result": "SUCCESS", + > "output": "-" + > } + > { + > "title": "Random forest testing", + > "status": "FINISHED", + > "dateStarted": "Feb 15, 2021 4:11:30 PM", + > "dateFinished": "Feb 15, 2021 4:11:30 PM", + > "elapsedTime": "0:0:0", + > "result": "SUCCESS", + > "output": "-" + > } + + + + +# ----------------------------------------------------- +# Swap sed and jq, don't include the start and end dates in the output. +#[user@desktop] + + curl \ + --silent \ + --cookie '/tmp/cookies' \ + 'http://128.232.227.222:8080/api/notebook/2FX82FMTH' \ + | jq '.' 
\ + | sed ' + /"dateStarted": null,/d + /"dateStarted":/ { + h + s/\([[:space:]]*\)"dateStarted":[[:space:]]*\("[^"]*"\).*$/\1\2/ + x + } + /"dateFinished": null,/ d + /"dateFinished":/ { + H + x + s/[[:space:]]*"dateFinished":[[:space:]]*\("[^"]*"\).*$/ \1/ + s/\([[:space:]]*\)\(.*\)/\1echo "\1\\"elapsedTime\\": \\"$(datediff --format "%H:%M:%S" --input-format "%b %d, %Y %H:%M:%S %p" \2)\\","/e + x + G + } + ' \ + | jq ' + .body.paragraphs[] | select(.results.code != null) | { + title, + result: .results.code, + time: .elapsedTime, + output: ((.results | select(.msg | length > 0) | .msg[] | select(.type == "TEXT") | .data | split("\n") | map(select(startswith("-")))) // "-") + } + ' + + + > { + > "title": "Introduction", + > "result": "SUCCESS", + > "time": "0:0:0", + > "output": "-" + > } + > { + > "title": "Astrometric features ", + > "result": "SUCCESS", + > "time": "0:0:5", + > "output": "-" + > } + > { + > "title": "Select sources", + > "result": "SUCCESS", + > "time": "0:1:27", + > "output": [] + > } + > { + > "title": "Hertzsprung-Russell diagram", + > "result": "SUCCESS", + > "time": "0:3:41", + > "output": [] + > } + > { + > "title": null, + > "result": "SUCCESS", + > "time": "0:0:0", + > "output": "-" + > } + > { + > "title": "Selecting training data", + > "result": "SUCCESS", + > "time": "0:6:34", + > "output": [] + > } + > { + > "title": null, + > "result": "SUCCESS", + > "time": "0:0:0", + > "output": "-" + > } + > { + > "title": "Random forest training", + > "result": "SUCCESS", + > "time": "1:34:11", + > "output": "-" + > } + > { + > "title": "Random forest testing", + > "result": "SUCCESS", + > "time": "0:0:0", + > "output": "-" + > } + > { + > "title": "Misclassification fraction", + > "result": "SUCCESS", + > "time": "0:17:40", + > "output": [ + > "------------------------------" + > ] + > } + > { + > "title": "Feature importance", + > "result": "SUCCESS", + > "time": "0:0:0", + > "output": [] + > } + diff --git 
a/notes/zrq/20210216-01-timing-logger.txt b/notes/zrq/20210216-01-timing-logger.txt new file mode 100644 index 00000000..c8b3ddda --- /dev/null +++ b/notes/zrq/20210216-01-timing-logger.txt @@ -0,0 +1,113 @@ +# +# +# +# Copyright (c) 2021, ROE (http://www.roe.ac.uk/) +# +# This information is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This information is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +# +# +#zrq-notes-time +#zrq-notes-indent +#zrq-notes-crypto +#zrq-notes-ansible +#zrq-notes-osformat +#zrq-notes-zeppelin +# + + Target: + + Issue #371 + https://github.com/wfau/aglais/issues/371 + + Create a jq parser to extract the timing information from a Zeppelin notebook. + + Result: + + Work in progress ... + + + # + # REST API documentation. + # https://zeppelin.apache.org/docs/0.7.0/rest-api/rest-notebook.html#export-a-note + + # + # REST API template + # http://[zeppelin-server]:[zeppelin-port]/api/notebook/export/[noteId] + + # + # JQ download/install + # https://stedolan.github.io/jq/download/ + + + + zeppelinurl=http://128.232.227.222:8080 + notebookid=2FX82FMTH + + checkstatus() + { + local zeppelinurl=${1:?} + local notebookid=${2:?} + local timingdir=/tmp/aglais/timing + local timingfile=\${timingdir:?}/aglais-notebookid-$(date '+%Y%m%dT%H%M%S').json + + rm -f "${timingfile:?}" + + + curl \ + --silent \ + --cookie "${tempdir:?}/cookies" \ + "${zeppelinurl:?}/api/notebook/${notebookid}" \ + | jq '.' 
\ + > "${timingfile:?}" + + + + + + ## If the file is empty + ## Try login + ## Repeat get + + ## If file is not empty + + + + | sed ' + /"dateStarted": null,/d + /"dateStarted":/ { + h + s/\([[:space:]]*\)"dateStarted":[[:space:]]*\("[^"]*"\).*$/\1\2/ + x + } + /"dateFinished": null,/ d + /"dateFinished":/ { + H + x + s/[[:space:]]*"dateFinished":[[:space:]]*\("[^"]*"\).*$/ \1/ + s/\([[:space:]]*\)\(.*\)/\1echo "\1\\"elapsedTime\\": \\"$(datediff --format "%H:%M:%S" --input-format "%b %d, %Y %H:%M:%S %p" \2)\\","/e + x + G + } + ' \ + | jq ' + .body.paragraphs[] | select(.results.code != null) | { + title, + result: .results.code, + time: .elapsedTime, + output: ((.results | select(.msg | length > 0) | .msg[] | select(.type == "TEXT") | .data | split("\n") | map(select(startswith("-")))) // "-") + } + ' + } + diff --git a/notes/zrq/20210216-02-resources.txt b/notes/zrq/20210216-02-resources.txt new file mode 100644 index 00000000..d388335b --- /dev/null +++ b/notes/zrq/20210216-02-resources.txt @@ -0,0 +1,121 @@ +# +# +# +# Copyright (c) 2021, ROE (http://www.roe.ac.uk/) +# +# This information is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This information is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
+# +# +# +#zrq-notes-time +#zrq-notes-indent +#zrq-notes-crypto +#zrq-notes-ansible +#zrq-notes-osformat +#zrq-notes-zeppelin +# + + + # + # Openstack flavors + # + + + openstack \ + --os-cloud "${cloudname:?}" \ + flavor list + + > +--------------------------------------+-------------------+--------+------+-----------+-------+-----------+ + > | ID | Name | RAM | Disk | Ephemeral | VCPUs | Is Public | + > +--------------------------------------+-------------------+--------+------+-----------+-------+-----------+ + > | 20061eba-9e88-494c-95a3-41ed77721244 | general.v1.small | 22528 | 20 | 0 | 6 | True | + > | 406a17e0-afd0-47d3-a6ad-8b19198bdd97 | general.v1.tiny | 6144 | 12 | 0 | 2 | True | + > | 8a821ef8-20b8-4bbb-990b-91198745e7a7 | general.v1.xlarge | 184320 | 20 | 340 | 28 | True | + > | 996c1c8c-c934-411c-9631-b74eb2829631 | general.v1.medium | 46080 | 20 | 60 | 14 | True | + > | c4c07f5a-260a-4f22-9530-a09a19aa490a | general.v1.large | 92160 | 20 | 160 | 28 | True | + > +--------------------------------------+-------------------+--------+------+-----------+-------+-----------+ + + # + # Physical machines + # + + Our VMs are pinned to four physical hosts. + + From Paul B. via Slack + + cpu-p-633: + local_gb: 880 (G) + memory_mb: 191855 (188G) + vcpus: 110 + + From screen shot from John G. + + cpu ?? + RAM 186G per machine + + + # + # Fitting medium VMs + # + + Local disc + + 60+20 = 80G per VM + + If local disc is 880G, we should be able to fit 11 VMs per host. + 10 VMs per host, over 4 hosts = 40 VMs. + Divide by 3 clouds = 13 medium VMs per cloud. + + Memory + + 45G per VM + + Local memory is 186G, we should be able to fit 4.13 VMs per host. + 4 VMs per host, over 4 hosts = 16 VMs. + Divide by 3 clouds = 5 medium VMs per cloud. + + Local memory is 186G, times four hosts = 744G + Divide by 3 clouds = 248G per cloud. + + Openstack overview says 768GB (per cloud?) + Is the Horizon UI showing a total of 768GB for all 3 clouds. + Or do we have 4 hosts per cloud ? 
+ + CPU + + 14 cores per VM + Don't know how many cores per host ... + + + + + + + + + + + + + + + + + + + + + + + diff --git a/notes/zrq/20210216-03-IRIS-ML-meeting.txt b/notes/zrq/20210216-03-IRIS-ML-meeting.txt new file mode 100644 index 00000000..c95fb46a --- /dev/null +++ b/notes/zrq/20210216-03-IRIS-ML-meeting.txt @@ -0,0 +1,72 @@ +# +# +# +# Copyright (c) 2021, ROE (http://www.roe.ac.uk/) +# +# This information is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This information is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +# +# +#zrq-notes-time +#zrq-notes-indent +#zrq-notes-crypto +#zrq-notes-ansible +#zrq-notes-osformat +#zrq-notes-zeppelin +# + + + ML meeting + + "AI for science" + + https://mlcommons.org/en/ + https://github.com/mlcommons + + ML for Science course is free + Weekly ML seminars + + Jeyan Thiyagalingam + t.jeyan@stfc.ac.uk + + Convolutional Neural Networks (CNN) + + lots of params + needs big data to train + + deep neural networks tend to overfit to training datasets + what works for one survey doesn't work on another survey - even on the same telescope + + networks are sensitive to orientation + rotate the image and the network doesn't recognise it + + Group-equivariant convolutional neural networks help to solve this + + Deep Learning Research for the Cherenkov Telescope Array (CTA) + + .... + + Machine Learning in Particle Physics + + .... + + Dave Morris + Will we start to change the design of instruments to match the needs of ML? 
+ Pete Clarke + Instruments will become smarter, with computing built in + line between instrument and analysis will become blurred + + + + diff --git a/notes/zrq/20210218-01-infra-ops.txt b/notes/zrq/20210218-01-infra-ops.txt new file mode 100644 index 00000000..9055c012 --- /dev/null +++ b/notes/zrq/20210218-01-infra-ops.txt @@ -0,0 +1,159 @@ +# +# +# +# Copyright (c) 2021, ROE (http://www.roe.ac.uk/) +# +# This information is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This information is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +# +# +#zrq-notes-time +#zrq-notes-indent +#zrq-notes-crypto +#zrq-notes-ansible +#zrq-notes-osformat +#zrq-notes-zeppelin +# + + Target: + + Deploy DNSmasq to provide a DNS service for our deployments. + + Solves issue #379 + https://github.com/wfau/aglais/issues/379 + + Result: + + Work in progress ... + + +# ----------------------------------------------------- +# Create a container to work with. +#[user@desktop] + + source "${HOME:?}/aglais.env" + + podman run \ + --rm \ + --tty \ + --interactive \ + --name ansibler \ + --hostname ansibler \ + --env "SSH_AUTH_SOCK=/mnt/ssh_auth_sock" \ + --volume "${SSH_AUTH_SOCK}:/mnt/ssh_auth_sock:rw,z" \ + --volume "${AGLAIS_CODE:?}/experiments/infra-ops:/infra-ops:ro,z" \ + atolmis/ansible-client:2020.12.02 \ + bash + + +# ----------------------------------------------------- +# Test our Ansible scripts ... 
+#[root@ansibler] + + pushd "/infra-ops/ansible" + + ansible-playbook \ + --inventory "hosts.yml" \ + "01-ssh-config.yml" + + > .... + > .... + + + ansible-playbook \ + --inventory "hosts.yml" \ + "02-ping-test.yml" + + > .... + > .... + + popd + + +# ----------------------------------------------------- +# Install the Podman container plugin. +# TODO - Add this to our ansible-client image. +#[root@ansibler] + + ansible-galaxy collection install containers.podman + + > Process install dependency map + > Starting collection install process + > Installing 'containers.podman:1.4.3' to '/root/.ansible/collections/ansible_collections/containers/podman' + + +# ----------------------------------------------------- +# Install the Podman container plugin. +# TODO - Add this to our ansible-client image. +#[root@ansibler] + + pushd "/infra-ops/ansible" + + ansible-playbook \ + --inventory "hosts.yml" \ + "03-apply-roles.yml" + + popd + + > PLAY [Apply roles] ***************************************************************** + > + > TASK [Gathering Facts] ************************************************************* + > ok: [Hizzoria] + > + > TASK [fedora-base : Update the DNF cache] ****************************************** + > changed: [Hizzoria] + > + > TASK [podman-host : Install Podman] ************************************************ + > changed: [Hizzoria] + > + > TASK [dns-server : Create DNSmasq config directory] ******************************** + > changed: [Hizzoria] + > + > TASK [dns-server : Deploy DNSmasq container] *************************************** + > changed: [Hizzoria] + > + > PLAY RECAP ************************************************************************* + > Hizzoria : ok=5 changed=4 unreachable=0 failed=0 skipped=0 rescued=0 ignored=0 + + +# ----------------------------------------------------- +# Check the result +#[root@ansibler] + + ssh Hizzoria + + > Last login: Sun Feb 21 05:21:20 2021 from 81.187.247.196 + + + podman ps -a + + > CONTAINER 
ID IMAGE COMMAND CREATED STATUS PORTS NAMES + > 1c28928f2d80 docker.io/storytel/dnsmasq:latest dnsmasq 12 minutes ago Exited (2) 12 minutes ago dnsmasq + + + podman logs dnsmasq + + > dnsmasq: failed to create listening socket for port 53: Address in use + > + > dnsmasq: failed to create listening socket for port 53: Address in use + > + > dnsmasq: failed to create listening socket for port 53: Address in use + > .... + > .... + + # + # OK - needs tweaking - but the roles thing worked :-) + # + + diff --git a/notes/zrq/20210221-01-infra-ops.txt b/notes/zrq/20210221-01-infra-ops.txt new file mode 100644 index 00000000..23460257 --- /dev/null +++ b/notes/zrq/20210221-01-infra-ops.txt @@ -0,0 +1,1204 @@ +# +# +# +# Copyright (c) 2021, ROE (http://www.roe.ac.uk/) +# +# This information is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This information is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +# +# +#zrq-notes-time +#zrq-notes-indent +#zrq-notes-crypto +#zrq-notes-ansible +#zrq-notes-osformat +#zrq-notes-zeppelin +# + + Target: + + Deploy DNSmasq to provide a DNS service for our deployments. + + Follow on from previous notes: + notes/zrq/20210218-01-infra-ops.txt + + Solves issue #379 + https://github.com/wfau/aglais/issues/379 + + Result: + + Work in progress ... + + +# ----------------------------------------------------- +# Create a container to work with. 
+#[user@desktop] + + source "${HOME:?}/aglais.env" + + podman run \ + --rm \ + --tty \ + --interactive \ + --name ansibler \ + --hostname ansibler \ + --env "SSH_AUTH_SOCK=/mnt/ssh_auth_sock" \ + --volume "${SSH_AUTH_SOCK}:/mnt/ssh_auth_sock:rw,z" \ + --volume "${AGLAIS_CODE:?}/experiments/infra-ops:/infra-ops:ro,z" \ + atolmis/ansible-client:2020.12.02 \ + bash + +# ----------------------------------------------------- +# Install the Podman container plugin. +# TODO - Add this to our ansible-client image. +#[root@ansibler] + + ansible-galaxy collection install containers.podman + + > Process install dependency map + > Starting collection install process + > Installing 'containers.podman:1.4.3' to '/root/.ansible/collections/ansible_collections/containers/podman' + + +# ----------------------------------------------------- +# Run our Ansible scripts ... +#[root@ansibler] + + pushd "/infra-ops/ansible" + + ansible-playbook \ + --inventory "hosts.yml" \ + "01-ssh-config.yml" + + > .... + > .... + + + ansible-playbook \ + --inventory "hosts.yml" \ + "02-ping-test.yml" + + > .... + > .... 
+ + + ansible-playbook \ + --inventory "hosts.yml" \ + "03-apply-roles.yml" + + popd + + > PLAY [Apply roles] ***************************************************************** + > + > TASK [Gathering Facts] ************************************************************* + > ok: [Hizzoria] + > + > TASK [fedora-base : Update the DNF cache] ****************************************** + > changed: [Hizzoria] + > + > TASK [podman-host : Install Podman] ************************************************ + > changed: [Hizzoria] + > + > TASK [dns-server : Create DNSmasq config directory] ******************************** + > changed: [Hizzoria] + > + > TASK [dns-server : Deploy DNSmasq container] *************************************** + > changed: [Hizzoria] + > + > PLAY RECAP ************************************************************************* + > Hizzoria : ok=5 changed=4 unreachable=0 failed=0 skipped=0 rescued=0 ignored=0 + + +# ----------------------------------------------------- +# Check the result +#[root@ansibler] + + ssh Hizzoria + + > Last login: Sun Feb 21 05:21:20 2021 from 81.187.247.196 + + + podman ps -a + + > CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES + > 1c28928f2d80 docker.io/storytel/dnsmasq:latest dnsmasq 12 minutes ago Exited (2) 12 minutes ago dnsmasq + + + podman logs dnsmasq + + > dnsmasq: failed to create listening socket for port 53: Address in use + > + > dnsmasq: failed to create listening socket for port 53: Address in use + > + > dnsmasq: failed to create listening socket for port 53: Address in use + > .... + > .... + + # + # OK - needs tweaking - but the roles thing worked :-) + # + + +# ----------------------------------------------------- +# Check if anyone is listening on port 53. +# https://www.cyberciti.biz/faq/unix-linux-check-if-port-is-in-use-command/ +#[root@Hizzoria] + + dnf info lsof + + > .... + > .... 
+ + + lsof -i -P -n | grep LISTEN + + > systemd-r 444 systemd-resolve 11u IPv4 17957 0t0 TCP *:5355 (LISTEN) + > systemd-r 444 systemd-resolve 13u IPv6 17960 0t0 TCP *:5355 (LISTEN) + > systemd-r 444 systemd-resolve 16u IPv4 17964 0t0 TCP 127.0.0.53:53 (LISTEN) + > sshd 746 root 4u IPv4 23626 0t0 TCP *:22 (LISTEN) + > sshd 746 root 5u IPv6 23635 0t0 TCP *:22 (LISTEN) + + # systemd-resolve is listening on the internal localhost address. + # - which I think is OK + + + # We want dnsmasq to listen on the external (public) IP address. + + # DigitalOcean website lists the IP address as 46.101.32.198 + + +# ----------------------------------------------------- +# List the IP interfaces. +# https://access.redhat.com/sites/default/files/attachments/rh_ip_command_cheatsheet_1214_jcs_print.pdf +#[root@Hizzoria] + + ip addr + + > 1: lo: mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1000 + > link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00 + > inet 127.0.0.1/8 scope host lo + > valid_lft forever preferred_lft forever + > inet6 ::1/128 scope host + > valid_lft forever preferred_lft forever + > 2: eth0: mtu 1500 qdisc fq_codel state UP group default qlen 1000 + > link/ether 86:ab:2d:00:cb:2a brd ff:ff:ff:ff:ff:ff + > altname enp0s3 + > altname ens3 + > inet 46.101.32.198/18 brd 46.101.63.255 scope global noprefixroute eth0 + > valid_lft forever preferred_lft forever + > inet 10.16.0.5/16 brd 10.16.255.255 scope global noprefixroute eth0 + > valid_lft forever preferred_lft forever + > inet6 2a03:b0c0:1:d0::b53:6001/64 scope global noprefixroute + > valid_lft forever preferred_lft forever + > inet6 fe80::84ab:2dff:fe00:cb2a/64 scope link noprefixroute + > valid_lft forever preferred_lft forever + > 3: eth1: mtu 1500 qdisc fq_codel state UP group default qlen 1000 + > link/ether be:ff:fc:81:f0:74 brd ff:ff:ff:ff:ff:ff + > altname enp0s4 + > altname ens4 + > inet 10.106.0.2/20 brd 10.106.15.255 scope global noprefixroute eth1 + > valid_lft forever preferred_lft 
forever + > inet6 fe80::bcff:fcff:fe81:f074/64 scope link + > valid_lft forever preferred_lft forever + + # Interesting, but hard to tell which is the external interface. + + +# ----------------------------------------------------- +# List the IP routes. +# https://access.redhat.com/sites/default/files/attachments/rh_ip_command_cheatsheet_1214_jcs_print.pdf +#[root@Hizzoria] + + ip route + + > default via 46.101.0.1 dev eth0 proto static metric 100 + > 10.16.0.0/16 dev eth0 proto kernel scope link src 10.16.0.5 metric 100 + > 10.106.0.0/20 dev eth1 proto kernel scope link src 10.106.0.2 metric 101 + > 46.101.0.0/18 dev eth0 proto kernel scope link src 46.101.32.198 metric 100 + + # That looks more useful, the default route is linked to the external interface. + # Is that always a valid assumption to make ? + + + ip route list match default + + > default via 46.101.0.1 dev eth0 proto static metric 100 + + +# ----------------------------------------------------- +# Try extract the interface name for the default route. +#[root@Hizzoria] + + ip route + + > default via 46.101.0.1 dev eth0 proto static metric 100 + > 10.16.0.0/16 dev eth0 proto kernel scope link src 10.16.0.5 metric 100 + > 10.106.0.0/20 dev eth1 proto kernel scope link src 10.106.0.2 metric 101 + > 46.101.0.0/18 dev eth0 proto kernel scope link src 46.101.32.198 metric 100 + + + ip route | cut -d ' ' -f 1 + + > default + > 10.16.0.0/16 + > 10.106.0.0/20 + > 46.101.0.0/18 + + + ip route | cut -d ' ' -f 2 + + > via + > dev + > dev + > dev + + + ip route | cut -d ' ' -f 3 + + > 46.101.0.1 + > eth0 + > eth1 + > eth0 + + + ip route | cut -d ' ' -f 4 + + > dev + > proto + > proto + > proto + + + # The default line is different because the destination is several words. + # The first term is 'default via 46.101.0.1' + # Equivalent to : + + > default-via-46.101.0.1 dev eth0 proto .... + > 10.16.0.0/16 dev eth0 proto .... + > 10.106.0.0/20 dev eth1 proto .... + > 46.101.0.0/18 dev eth0 proto .... 
+ + + # We could use `ip` to select just the default route, and use `sed` to match the term we want. + + ip route list match default + + > default via 46.101.0.1 dev eth0 proto static metric 100 + + + ip route list match default \ + | sed ' + s/^.*dev[[:space:]]*\([[:alnum:]]*\)[[:space:]]*proto.*$/\1/ + ' + + > eth0 + + ifname=$( + ip route list match default \ + | sed ' + s/^.*dev[[:space:]]*\([[:alnum:]]*\)[[:space:]]*proto.*$/\1/ + ' + ) + + +# ----------------------------------------------------- +# Add a basic DNSmasq config file with the interface name. +# (initial settings copied fro Esperia) +# https://github.com/wfau/esperia/blob/master/src/ansible/dnsmasq/dnsmasq-esperia.conf +#[root@Hizzoria] + + configdir=/var/aglais/dnsmasq + + cat > "${configdir:?}/aglais.conf" << EOF + +no-hosts +no-resolv +no-daemon +bogus-priv +domain-needed +keep-in-foreground + +log-facility=- + +interface=${ifname:?} +bind-interfaces + +EOF + + +# ----------------------------------------------------- +# Restart the DNSmasq container see if that helps. +#[root@Hizzoria] + + podman restart dnsmasq + + > 04b9f37c57872d54e9ad46f26c1d832cfb2c71f78222463725d69de950375e8e + + + podman ps -a + + > CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES + > 04b9f37c5787 docker.io/storytel/dnsmasq:latest dnsmasq 44 minutes ago Up 3 seconds ago dnsmasq + + + podman logs dnsmasq + + > dnsmasq[1]: started, version 2.78 cachesize 150 + > dnsmasq[1]: compile time options: IPv6 GNU-getopt no-DBus no-i18n no-IDN DHCP DHCPv6 no-Lua TFTP no-conntrack ipset auth no-DNSSEC loop-detect inotify + > dnsmasq[1]: LOUD WARNING: listening on 46.101.32.198 may accept requests via interfaces other than eth0 + > dnsmasq[1]: LOUD WARNING: use --bind-dynamic rather than --bind-interfaces to avoid DNS amplification attacks via these interface(s) + > dnsmasq[1]: warning: no upstream servers configured + > dnsmasq[1]: cleared cache + + # Checks the DNSmasql manual .. 
+ # http://www.thekelleys.org.uk/dnsmasq/docs/dnsmasq-man.html + # We could just use except-interface to preevent it listening on localhost .. + +# ----------------------------------------------------- +# Update the DNSmasq config file and try again. +# http://www.thekelleys.org.uk/dnsmasq/docs/dnsmasq-man.html +#[root@Hizzoria] + + cat > "${configdir:?}/aglais.conf" << EOF + +no-hosts +no-resolv +no-daemon +bogus-priv +domain-needed +keep-in-foreground + +log-facility=- + +except-interface=localhost + +EOF + + + podman restart dnsmasq + + > 04b9f37c57872d54e9ad46f26c1d832cfb2c71f78222463725d69de950375e8e + + + podman logs --follow dnsmasq + + > dnsmasq: failed to create listening socket for port 53: Address in use + > + > dnsmasq: failed to create listening socket for port 53: Address in use + > .... + > .... + + +# ----------------------------------------------------- +# Update the DNSmasq config file and try again. +# http://www.thekelleys.org.uk/dnsmasq/docs/dnsmasq-man.html +#[root@Hizzoria] + + cat > "${configdir:?}/aglais.conf" << EOF + +no-hosts +no-resolv +no-daemon +bogus-priv +domain-needed +keep-in-foreground + +log-facility=- + +except-interface=localhost +bind-interface + +EOF + + + podman restart dnsmasq + + > 04b9f37c57872d54e9ad46f26c1d832cfb2c71f78222463725d69de950375e8e + + + podman logs --follow dnsmasq + + > dnsmasq: bad option at line 12 of /etc/dnsmasq/aglais.conf + + +# ----------------------------------------------------- +# Update the DNSmasq config file and try again. 
+# http://www.thekelleys.org.uk/dnsmasq/docs/dnsmasq-man.html +#[root@Hizzoria] + + cat > "${configdir:?}/aglais.conf" << EOF + +no-hosts +no-resolv +no-daemon +bogus-priv +domain-needed +keep-in-foreground + +log-facility=- + +except-interface=localhost +bind-interfaces + +EOF + + + podman restart dnsmasq + + > 04b9f37c57872d54e9ad46f26c1d832cfb2c71f78222463725d69de950375e8e + + + podman logs --follow dnsmasq + + > dnsmasq[1]: started, version 2.78 cachesize 150 + > dnsmasq[1]: compile time options: IPv6 GNU-getopt no-DBus no-i18n no-IDN DHCP DHCPv6 no-Lua TFTP no-conntrack ipset auth no-DNSSEC loop-detect inotify + > dnsmasq[1]: LOUD WARNING: listening on 46.101.32.198 may accept requests via interfaces other than eth0 + > dnsmasq[1]: LOUD WARNING: use --bind-dynamic rather than --bind-interfaces to avoid DNS amplification attacks via these interface(s) + > dnsmasq[1]: warning: no upstream servers configured + > dnsmasq[1]: cleared cache + + +# ----------------------------------------------------- +# Update the DNSmasq config file and try again. +# http://www.thekelleys.org.uk/dnsmasq/docs/dnsmasq-man.html +#[root@Hizzoria] + + cat > "${configdir:?}/aglais.conf" << EOF + +no-hosts +no-resolv +no-daemon +bogus-priv +domain-needed +keep-in-foreground + +log-facility=- + +except-interface=localhost +bind-dynamic + +EOF + + + podman restart dnsmasq + + > 04b9f37c57872d54e9ad46f26c1d832cfb2c71f78222463725d69de950375e8e + + + podman logs --follow dnsmasq + + > dnsmasq[1]: started, version 2.78 cachesize 150 + > dnsmasq[1]: compile time options: IPv6 GNU-getopt no-DBus no-i18n no-IDN DHCP DHCPv6 no-Lua TFTP no-conntrack ipset auth no-DNSSEC loop-detect inotify + > dnsmasq[1]: warning: no upstream servers configured + > dnsmasq[1]: cleared cache + + +# ----------------------------------------------------- +# To perevent DNSmasq from trying to resolve other addresses. 
+# http://www.thekelleys.org.uk/dnsmasq/docs/dnsmasq-man.html +# https://www.mail-archive.com/dnsmasq-discuss@lists.thekelleys.org.uk/msg14016.html +#[root@Hizzoria] + + # Not conclusive .. + # I think we have what we need. + + # Q - why wrap DNSmasq in a container ? + # A - because that's the way Fedora is going in the future ? + + # It comes installed in Fedora by default. + + dnf info dnsmasq + + > Installed Packages + > Name : dnsmasq + > Version : 2.83 + > Release : 1.fc33 + > Architecture : x86_64 + > Size : 693 k + > Source : dnsmasq-2.83-1.fc33.src.rpm + > Repository : @System + > From repo : updates + > Summary : A lightweight DHCP/caching DNS server + > URL : http://www.thekelleys.org.uk/dnsmasq/ + > License : GPLv2 or GPLv3 + > Description : Dnsmasq is lightweight, easy to configure DNS forwarder and DHCP server. + > : It is designed to provide DNS and, optionally, DHCP, to a small network. + > : It can serve the names of local machines which are not in the global + > : DNS. The DHCP server integrates with the DNS server and allows machines + > : with DHCP-allocated addresses to appear in the DNS with names configured + > : either in each host or in a central configuration file. Dnsmasq supports + > : static and dynamic DHCP leases and BOOTP for network booting of diskless + > : machines. + + + # All we would need to do is start it ... + + systemctl status dnsmasq + + > ● dnsmasq.service - DNS caching server. + > Loaded: loaded (/usr/lib/systemd/system/dnsmasq.service; disabled; vendor preset: disabled) + > Active: inactive (dead) + + # Because we want to have multiple services on the machine + # and we want each of them to be in separate containers + + +# ----------------------------------------------------- +# Setup a second shell to tail the DNSmasq logs. 
+#[user@desktop] + + podman exec -it ansibler bash + + ssh Hizzoria + + podman logs --follow dnsmasq + + > dnsmasq[1]: started, version 2.78 cachesize 150 + > dnsmasq[1]: compile time options: IPv6 GNU-getopt no-DBus no-i18n no-IDN DHCP DHCPv6 no-Lua TFTP no-conntrack ipset auth no-DNSSEC loop-detect inotify + > dnsmasq[1]: warning: no upstream servers configured + > dnsmasq[1]: cleared cache + > .... + > .... + + +# ----------------------------------------------------- +# Add a directory for host files. +#[root@Hizzoria] + + mkdir "${configdir:?}/hosts" + + cat > "${configdir:?}/aglais.conf" << EOF + +no-hosts +no-resolv +no-daemon +bogus-priv +domain-needed +keep-in-foreground + +log-facility=- + +except-interface=localhost +bind-dynamic + +hostsdir=${configdir:?}/hosts + +EOF + + +# ----------------------------------------------------- +# Send DNSmasq a SIGHUP signal to reload the config. +# https://serverfault.com/questions/723292/dnsmasq-doesnt-automatically-reload-when-entry-is-added-to-etc-hosts +# https://serverfault.com/a/934681 +# http://docs.podman.io/en/latest/markdown/podman-kill.1.html +#[root@Hizzoria] + + podman kill --signal SIGHUP dnsmasq + + > 04b9f37c57872d54e9ad46f26c1d832cfb2c71f78222463725d69de950375e8e + + +# ----------------------------------------------------- +# Add a hosts file for gaia-dev. +#[root@Hizzoria] + + cat > "${configdir:?}/gaia-dev.hosts" << EOF + +zeppelin.gaia-dev.aglais.uk,128.232.227.197 + +EOF + + # + # Useful to know - DNSmasq doesn't handle CNAMES to other domains. + # http://lists.thekelleys.org.uk/pipermail/dnsmasq-discuss/2006q1/000583.html + # + + +# ----------------------------------------------------- +# ----------------------------------------------------- +# Send a test query. 
+#[user@desktop] + + server=46.101.32.198 + + host 'zeppelin.gaia-dev.aglais.uk' "${server:?}" + + + > Using domain server: + > Name: 46.101.32.198 + > Address: 46.101.32.198#53 + > Aliases: + > + > Host zeppelin.gaia-dev.aglais.uk not found: 5(REFUSED) + + +# ----------------------------------------------------- +# ----------------------------------------------------- +# Add query logging. +#[root@Hizzoria] + + vi "${configdir:?}/aglais.conf" + + + log-queries + + + podman kill --signal SIGHUP dnsmasq + + > .... + + +# ----------------------------------------------------- +# ----------------------------------------------------- +# Send a test query. +#[user@desktop] + + host 'zeppelin.gaia-dev.aglais.uk' "${server:?}" + + > Using domain server: + > Name: 46.101.32.198 + > Address: 46.101.32.198#53 + > Aliases: + > + > Host zeppelin.gaia-dev.aglais.uk not found: 5(REFUSED) + + + # Is that request refused, or connection refused ? + + +# ----------------------------------------------------- +# Try dig instead. +#[user@desktop] + + dig "@${server:?}" 'zeppelin.gaia-dev.aglais.uk' 'A' + + > ; <<>> DiG 9.11.26-RedHat-9.11.26-2.fc32 <<>> @46.101.32.198 zeppelin.gaia-dev.aglais.uk A + > ; (1 server found) + > ;; global options: +cmd + > ;; Got answer: + > ;; ->>HEADER<<- opcode: QUERY, status: REFUSED, id: 65372 + > ;; flags: qr rd ra ad; QUERY: 1, ANSWER: 0, AUTHORITY: 0, ADDITIONAL: 0 + > + > ;; QUESTION SECTION: + > ;zeppelin.gaia-dev.aglais.uk. IN A + > + > ;; Query time: 15 msec + > ;; SERVER: 46.101.32.198#53(46.101.32.198) + > ;; WHEN: Sun Feb 21 13:06:16 GMT 2021 + > ;; MSG SIZE rcvd: 45 + + + # OK - looks like we got an answer ... that the query was refused. + + +# ----------------------------------------------------- +# ----------------------------------------------------- +# Setup the NS address in our external DNS provider. +# https://admin.lcn.com/ + + .... + .... + + +# ----------------------------------------------------- +# Check the result. 
+#[user@desktop] + + host -a 'infra-ops.aglais.uk' 'ns1.lcn.com' + + > Trying "infra-ops.aglais.uk" + > ;; ->>HEADER<<- opcode: QUERY, status: NOERROR, id: 33136 + > ;; flags: qr rd ra; QUERY: 1, ANSWER: 1, AUTHORITY: 3, ADDITIONAL: 3 + > + > ;; QUESTION SECTION: + > ;infra-ops.aglais.uk. IN ANY + > + > ;; ANSWER SECTION: + > infra-ops.aglais.uk. 600 IN A 46.101.32.198 + > + > ;; AUTHORITY SECTION: + > aglais.uk. 172800 IN NS ns0.lcn.com. + > aglais.uk. 172800 IN NS ns1.lcn.com. + > aglais.uk. 172800 IN NS ns2.lcn.com. + > + > ;; ADDITIONAL SECTION: + > ns1.lcn.com. 12497 IN A 85.233.160.69 + > ns0.lcn.com. 12497 IN A 195.110.124.234 + > ns2.lcn.com. 12497 IN A 91.186.2.8 + > + > Received 162 bytes from 195.194.120.1#53 in 85 ms + + + host -a 'gaia-dev.aglais.uk' 'ns1.lcn.com' + + > Trying "gaia-dev.aglais.uk" + > ;; Truncated, retrying in TCP mode. + > Trying "gaia-dev.aglais.uk" + > Using domain server: + > Name: ns1.lcn.com + > Address: 85.233.160.69#53 + > Aliases: + > + > ;; ->>HEADER<<- opcode: QUERY, status: NOERROR, id: 22311 + > ;; flags: qr rd; QUERY: 1, ANSWER: 0, AUTHORITY: 1, ADDITIONAL: 1 + > + > ;; QUESTION SECTION: + > ;gaia-dev.aglais.uk. IN ANY + > + > ;; AUTHORITY SECTION: + > gaia-dev.aglais.uk. 600 IN NS infra-ops.aglais.uk. + > + > ;; ADDITIONAL SECTION: + > infra-ops.aglais.uk. 600 IN A 46.101.32.198 + > + > Received 76 bytes from 85.233.160.69#53 in 15 ms + + +# ----------------------------------------------------- +# ----------------------------------------------------- +# Make DNSmasq an authoritative server. +#[root@Hizzoria] + + vi "${configdir:?}/aglais.conf" + + + auth-server=ns0.lcn.com + + auth-zone=gaia-dev.aglais.uk + + podman kill --signal SIGHUP dnsmasq + + > .... + +# ----------------------------------------------------- +# Check the result. 
+#[user@desktop] + + host -a 'zeppelin.gaia-dev.aglais.uk' 'infra-ops.aglais.uk' + + > Trying "zeppelin.gaia-dev.aglais.uk" + > Using domain server: + > Name: infra-ops.aglais.uk + > Address: 46.101.32.198#53 + > Aliases: + > + > Host zeppelin.gaia-dev.aglais.uk not found: 5(REFUSED) + > Received 45 bytes from 46.101.32.198#53 in 13 ms + + +# ----------------------------------------------------- +# ----------------------------------------------------- +# Explicitly add the host address to our main config. +#[root@Hizzoria] + + vi "${configdir:?}/aglais.conf" + + + host-record=zeppelin,zeppelin.gaia-dev.aglais.uk,128.232.227.197 + + + podman kill --signal SIGHUP dnsmasq + + > .... + + +# ----------------------------------------------------- +# ----------------------------------------------------- +# Check the result. +#[user@desktop] + + host -a 'zeppelin.gaia-dev.aglais.uk' 'infra-ops.aglais.uk' + + > .... + > .... + > Host zeppelin.gaia-dev.aglais.uk not found: 5(REFUSED) + > Received 45 bytes from 46.101.32.198#53 in 15 ms + + # Still getting REFUSED response. + + +# ----------------------------------------------------- +# ----------------------------------------------------- +# Stop the service ... +#[root@Hizzoria] + + podman stop dnsmasq + + > dnsmasq[1]: exiting on receipt of SIGTERM + + +# ----------------------------------------------------- +# ----------------------------------------------------- +# Check the result. +#[user@desktop] + + host -a 'zeppelin.gaia-dev.aglais.uk' 'infra-ops.aglais.uk' + + > host -a 'zeppelin.gaia-dev.aglais.uk' 'infra-ops.aglais.uk' + > Trying "zeppelin.gaia-dev.aglais.uk" + > ;; connection timed out; no servers could be reached + + # So the REFUSED is coming from DNSmasq itself. + + +# ----------------------------------------------------- +# ----------------------------------------------------- +# Start the service ... 
+#[root@Hizzoria] + + podman start dnsmasq + + > dnsmasq: error at line 17 of /etc/dnsmasq/aglais.conf + + +# ----------------------------------------------------- +# ----------------------------------------------------- + + # Mistake #1 + # Created a hosts directory and added it to the config. + + mkdir "${configdir:?}/hosts" + + # .. but put the hosts file in the top level + + cat > "${configdir:?}/gaia-dev.hosts" << EOF + + # Explicitly adding the host address to the main config worked + + host-record=zeppelin,zeppelin.gaia-dev.aglais.uk,128.232.227.197 + +# ----------------------------------------------------- +# ----------------------------------------------------- + + # Mistake #2 + # Created a hosts directory on the host VM + + mkdir "${configdir:?}/hosts" + + # and used the same path in the config file + + hostsdir=/var/aglais/dnsmasq/hosts + + # but the host VM path is mounted as something else inside the Pod + + volumes: + - "{{dnsmasq.config_path}}:/etc/dnsmasq:ro" + + # So the path _inside_ the Pod should be + + /etc/dnsmasq/hosts + +# ----------------------------------------------------- +# ----------------------------------------------------- + + # Mistake #3 + # The wrong syntax in the hosts file + # using the host-record syntax from DNSmasq config + + zeppelin.gaia-dev.aglais.uk,128.232.227.197 + + # Should be using the hosts file syntax + + 128.232.227.197 zeppelin.gaia-dev.aglais.uk + + +# ----------------------------------------------------- +# ----------------------------------------------------- + + # Mistake #4 + # Sending the SIGHUP signal to the Pod didn't reload the config file. + # Need to explicitly stop and start the Pod. + + +# ----------------------------------------------------- +# ----------------------------------------------------- +# Putting it all together. 
+ + cat "${configdir:?}/aglais.conf" + + > no-hosts + > no-resolv + > no-daemon + > bogus-priv + > domain-needed + > keep-in-foreground + > + > log-queries + > log-facility=- + > + > except-interface=localhost + > bind-dynamic + > + > hostsdir=/etc/dnsmasq/hosts + > + > #auth-server=ns0.lcn.com + > #auth-server=infra-ops.aglais.uk + > auth-zone=gaia-dev.aglais.uk + + + cat "${configdir:?}/hosts/gaia-dev.hosts" + + > # Host addresses for the gaia-dev cloud. + > 128.232.227.197 zeppelin.gaia-dev.aglais.uk + > + +# ----------------------------------------------------- +# Stop and start the Pod ... +#[root@Hizzoria] + + podman stop dnsmasq + + podman start dnsmasq + + +# ----------------------------------------------------- +# Test from an external machine +#[user@trop01] + + host 'zeppelin.gaia-dev.aglais.uk' + + > zeppelin.gaia-dev.aglais.uk has address 128.232.227.197 + + + host -a 'zeppelin.gaia-dev.aglais.uk' + + > Trying "zeppelin.gaia-dev.aglais.uk" + > ;; ->>HEADER<<- opcode: QUERY, status: NOERROR, id: 63506 + > ;; flags: qr rd ra; QUERY: 1, ANSWER: 1, AUTHORITY: 1, ADDITIONAL: 1 + > + > ;; QUESTION SECTION: + > ;zeppelin.gaia-dev.aglais.uk. IN ANY + > + > ;; ANSWER SECTION: + > zeppelin.gaia-dev.aglais.uk. 558 IN A 128.232.227.197 + > + > ;; AUTHORITY SECTION: + > gaia-dev.aglais.uk. 431 IN NS infra-ops.aglais.uk. + > + > ;; ADDITIONAL SECTION: + > infra-ops.aglais.uk. 431 IN A 46.101.32.198 + > + > Received 101 bytes from 195.194.120.2#53 in 13 ms + + + # The NS record for the sub-domain is wrong. + + host -a 'gaia-dev.aglais.uk' + + > Trying "gaia-dev.aglais.uk" + > ;; ->>HEADER<<- opcode: QUERY, status: NOERROR, id: 50199 + > ;; flags: qr rd ra; QUERY: 1, ANSWER: 1, AUTHORITY: 0, ADDITIONAL: 0 + > + > ;; QUESTION SECTION: + > ;gaia-dev.aglais.uk. IN ANY + > + > ;; ANSWER SECTION: + > gaia-dev.aglais.uk. 243 IN NS . + > + > Received 49 bytes from 195.194.120.1#53 in 13 ms + + + host -t NS 'gaia-dev.aglais.uk' + + > gaia-dev.aglais.uk name server . 
+ + + # Explicitly point at our server + + host -a 'gaia-dev.aglais.uk' 'infra-ops.aglais.uk' + + > Trying "gaia-dev.aglais.uk" + > Using domain server: + > Name: infra-ops.aglais.uk + > Address: 46.101.32.198#53 + > Aliases: + > + > ;; ->>HEADER<<- opcode: QUERY, status: NOERROR, id: 12966 + > ;; flags: qr rd ra; QUERY: 1, ANSWER: 0, AUTHORITY: 0, ADDITIONAL: 0 + > + > ;; QUESTION SECTION: + > ;gaia-dev.aglais.uk. IN ANY + > + > Received 36 bytes from 46.101.32.198#53 in 15 ms + +# ----------------------------------------------------- +# ----------------------------------------------------- + + # Lots of caching is getting in the way. + # I think we have the right configuration. + + # We needed to specify the interface for the auth-server entry. + + +# ----------------------------------------------------- +# ----------------------------------------------------- +# List the configuration. +#[root@Hizzoria] + + cat "${configdir:?}/aglais.conf" + + > no-hosts + > no-resolv + > no-daemon + > bogus-priv + > domain-needed + > keep-in-foreground + > + > local-ttl=60 + > + > log-queries + > log-facility=- + > + > except-interface=localhost + > bind-dynamic + > + > hostsdir=/etc/dnsmasq/hosts + > + > auth-server=infra-ops.aglais.uk,eth0 + > auth-zone=gaia-dev.aglais.uk + + + cat "${configdir:?}/hosts/gaia-dev.hosts" + + > # Host addresses for the gaia-dev cloud. + > 128.232.227.197 zeppelin.gaia-dev.aglais.uk + + +# ----------------------------------------------------- +# ----------------------------------------------------- +# Check the results. 
+#[user@desktop] + + # Ask LCN's nameserver + dig '@ns0.lcn.com' 'zeppelin.gaia-dev.aglais.uk' + + > ; <<>> DiG 9.11.26-RedHat-9.11.26-2.fc32 <<>> @ns0.lcn.com zeppelin.gaia-dev.aglais.uk + > ; (1 server found) + > ;; global options: +cmd + > ;; Got answer: + > ;; ->>HEADER<<- opcode: QUERY, status: NOERROR, id: 27658 + > ;; flags: qr rd; QUERY: 1, ANSWER: 0, AUTHORITY: 1, ADDITIONAL: 2 + > ;; WARNING: recursion requested but not available + > + > ;; OPT PSEUDOSECTION: + > ; EDNS: version: 0, flags:; udp: 1232 + > ;; QUESTION SECTION: + > ;zeppelin.gaia-dev.aglais.uk. IN A + > + > ;; AUTHORITY SECTION: + > gaia-dev.aglais.uk. 600 IN NS infra-ops.aglais.uk. + > + > ;; ADDITIONAL SECTION: + > infra-ops.aglais.uk. 600 IN A 46.101.32.198 + > + > ;; Query time: 51 msec + > ;; SERVER: 195.110.124.234#53(195.110.124.234) + > ;; WHEN: Sun Feb 21 16:17:11 GMT 2021 + > ;; MSG SIZE rcvd: 96 + + + # Ask our nameserver + dig '@infra-ops.aglais.uk' 'zeppelin.gaia-dev.aglais.uk' + + > ; <<>> DiG 9.11.26-RedHat-9.11.26-2.fc32 <<>> @infra-ops.aglais.uk zeppelin.gaia-dev.aglais.uk + > ; (1 server found) + > ;; global options: +cmd + > ;; Got answer: + > ;; ->>HEADER<<- opcode: QUERY, status: NOERROR, id: 31016 + > ;; flags: qr aa rd ad; QUERY: 1, ANSWER: 1, AUTHORITY: 1, ADDITIONAL: 1 + > ;; WARNING: recursion requested but not available + > + > ;; OPT PSEUDOSECTION: + > ; EDNS: version: 0, flags:; udp: 4096 + > ;; QUESTION SECTION: + > ;zeppelin.gaia-dev.aglais.uk. IN A + > + > ;; ANSWER SECTION: + > zeppelin.gaia-dev.aglais.uk. 600 IN A 128.232.227.197 + > + > ;; AUTHORITY SECTION: + > gaia-dev.aglais.uk. 600 IN NS infra-ops.aglais.uk. + > + > ;; Query time: 16 msec + > ;; SERVER: 46.101.32.198#53(46.101.32.198) + > ;; WHEN: Sun Feb 21 16:17:51 GMT 2021 + > ;; MSG SIZE rcvd: 123 + + + # That all looks OK, apart from the 600s TTL. + # ... but we can live with that for now. + + # Having to set the interface name in the auth-server entry is a hassle, + # but we work around it. 
+ + # We can use the interface name from the default route entry. + +# ----------------------------------------------------- +# Get the interface name from the default route entry. +#[root@Hizzoria] + + ifname=$( + ip route list match default \ + | sed ' + s/^.*dev[[:space:]]*\([[:alnum:]]*\)[[:space:]]*proto.*$/\1/ + ' + ) + + echo "Interface [${ifname:?}]" + + > Interface [eth0] + + +# ----------------------------------------------------- +# Get the config files. +#[root@Hizzoria] + + ls -1 "${configdir}" + + > aglais.conf + > hosts + + + cat "${configdir}/aglais.conf" + + > no-hosts + > no-resolv + > no-daemon + > bogus-priv + > domain-needed + > keep-in-foreground + > + > local-ttl=60 + > + > log-queries + > log-facility=- + > + > except-interface=localhost + > bind-dynamic + > + > hostsdir=/etc/dnsmasq/hosts + > + > auth-server=infra-ops.aglais.uk,eth0 + > auth-zone=gaia-dev.aglais.uk + + + ls -1 "${configdir}/hosts" + + > gaia-dev.hosts + + + cat "${configdir}/hosts/gaia-dev.hosts" + + > # Host addresses for the gaia-dev cloud. + > 128.232.227.197 zeppelin.gaia-dev.aglais.uk + + + + diff --git a/notes/zrq/20210222-01-infra-ops.txt b/notes/zrq/20210222-01-infra-ops.txt new file mode 100644 index 00000000..61fafea8 --- /dev/null +++ b/notes/zrq/20210222-01-infra-ops.txt @@ -0,0 +1,322 @@ +# +# +# +# Copyright (c) 2021, ROE (http://www.roe.ac.uk/) +# +# This information is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
+# +# +# +#zrq-notes-time +#zrq-notes-indent +#zrq-notes-crypto +#zrq-notes-ansible +#zrq-notes-osformat +#zrq-notes-zeppelin +# + + Target: + + Deploy DNSmasq to provide a DNS service for our deployments. + + Solves issue #379 + https://github.com/wfau/aglais/issues/379 + + Result: + + Success - working DNS service hosted on DigitalOcean. + + +# ----------------------------------------------------- +# Rebuild the DigitalOcean droplet + + DigitalOcean website + .... + + publicip4: '46.101.32.198' + publicip6: '2a03:b0c0:1:d0::b53:6001' + + +# ----------------------------------------------------- +# Create a container to work with. +#[user@desktop] + + source "${HOME:?}/aglais.env" + + podman run \ + --rm \ + --tty \ + --interactive \ + --name infra-ops \ + --hostname infra-ops \ + --env "SSH_AUTH_SOCK=/mnt/ssh_auth_sock" \ + --volume "${SSH_AUTH_SOCK}:/mnt/ssh_auth_sock:rw,z" \ + --volume "${AGLAIS_CODE:?}/experiments/infra-ops:/infra-ops:ro,z" \ + atolmis/ansible-client:2020.12.02 \ + bash + +# ----------------------------------------------------- +# Install the Podman container plugin. +# TODO - Add this to our ansible-client image. +#[root@ansibler] + + ansible-galaxy collection install containers.podman + + > Process install dependency map + > Starting collection install process + > Installing 'containers.podman:1.4.3' to '/root/.ansible/collections/ansible_collections/containers/podman' + + +# ----------------------------------------------------- +# Run our Ansible scripts ... +#[root@ansibler] + + pushd "/infra-ops/ansible" + + ansible-playbook \ + --inventory "hosts.yml" \ + "01-ssh-config.yml" + + ansible-playbook \ + --inventory "hosts.yml" \ + "02-ping-test.yml" + + ansible-playbook \ + --inventory "hosts.yml" \ + "03-dns-server.yml" + + ansible-playbook \ + --inventory "hosts.yml" \ + "04-dns-hosts.yml" + + popd + + + > .... + > .... + + +# ----------------------------------------------------- +# Login to the Droplet and tail the DNSmasq log. 
+#[root@ansibler] + + ssh Hizzoria \ + ' + podman logs --follow dnsmasq + ' + + > dnsmasq: failed to create listening socket for port 53: Address in use + > + > dnsmasq: failed to create listening socket for port 53: Address in use + > + > dnsmasq: failed to create listening socket for port 53: Address in use + > + > dnsmasq: failed to create listening socket for port 53: Address in use + > + > dnsmasq: failed to create listening socket for port 53: Address in use + > dnsmasq[1]: started, version 2.78 cachesize 150 + > dnsmasq[1]: compile time options: IPv6 GNU-getopt no-DBus no-i18n no-IDN DHCP DHCPv6 no-Lua TFTP no-conntrack ipset auth no-DNSSEC loop-detect inotify + > dnsmasq[1]: warning: no upstream servers configured + > dnsmasq[1]: cleared cache + > dnsmasq[1]: inotify, new or changed file /etc/dnsmasq/hosts/gaia-dev.hosts + > dnsmasq[1]: read /etc/dnsmasq/hosts/gaia-dev.hosts - 2 addresses + > dnsmasq[1]: inotify, new or changed file /etc/dnsmasq/hosts/gaia-test.hosts + > dnsmasq[1]: read /etc/dnsmasq/hosts/gaia-test.hosts - 0 addresses + > dnsmasq[1]: inotify, new or changed file /etc/dnsmasq/hosts/gaia-prod.hosts + > dnsmasq[1]: read /etc/dnsmasq/hosts/gaia-prod.hosts - 1 addresses + > .... + > .... + + +# ----------------------------------------------------- +# Test queries direct to our DNS server. +#[user@desktop] + + dig '@infra-ops.aglais.uk' 'zeppelin.gaia-dev.aglais.uk' + + > ; <<>> DiG 9.11.26-RedHat-9.11.26-2.fc32 <<>> @infra-ops.aglais.uk zeppelin.gaia-dev.aglais.uk + > ; (1 server found) + > ;; global options: +cmd + > ;; Got answer: + > ;; ->>HEADER<<- opcode: QUERY, status: NOERROR, id: 39531 + > ;; flags: qr aa rd ad; QUERY: 1, ANSWER: 1, AUTHORITY: 1, ADDITIONAL: 1 + > ;; WARNING: recursion requested but not available + > + > ;; OPT PSEUDOSECTION: + > ; EDNS: version: 0, flags:; udp: 4096 + > ;; QUESTION SECTION: + > ;zeppelin.gaia-dev.aglais.uk. IN A + > + > ;; ANSWER SECTION: + > zeppelin.gaia-dev.aglais.uk. 
300 IN A 128.232.227.197 + > + > ;; AUTHORITY SECTION: + > gaia-dev.aglais.uk. 300 IN NS infra-ops.aglais.uk. + > + > ;; Query time: 15 msec + > ;; SERVER: 46.101.32.198#53(46.101.32.198) + > ;; WHEN: Tue Feb 23 01:43:23 GMT 2021 + > ;; MSG SIZE rcvd: 123 + + + > .... + > dnsmasq[1]: auth[A] zeppelin.gaia-dev.aglais.uk from 81.187.247.196 + > dnsmasq[1]: /etc/dnsmasq/hosts 128.232.227.197 is zeppelin.gaia-dev.aglais.uk + > .... + + + dig '@infra-ops.aglais.uk' 'zeppelin.gaia-prod.aglais.uk' + + > ; <<>> DiG 9.11.26-RedHat-9.11.26-2.fc32 <<>> @infra-ops.aglais.uk zeppelin.gaia-prod.aglais.uk + > ; (1 server found) + > ;; global options: +cmd + > ;; Got answer: + > ;; ->>HEADER<<- opcode: QUERY, status: NOERROR, id: 46737 + > ;; flags: qr aa rd ad; QUERY: 1, ANSWER: 1, AUTHORITY: 1, ADDITIONAL: 1 + > ;; WARNING: recursion requested but not available + > + > ;; OPT PSEUDOSECTION: + > ; EDNS: version: 0, flags:; udp: 4096 + > ;; QUESTION SECTION: + > ;zeppelin.gaia-prod.aglais.uk. IN A + > + > ;; ANSWER SECTION: + > zeppelin.gaia-prod.aglais.uk. 300 IN A 128.232.227.197 + > + > ;; AUTHORITY SECTION: + > gaia-prod.aglais.uk. 300 IN NS infra-ops.aglais.uk. + > + > ;; Query time: 15 msec + > ;; SERVER: 46.101.32.198#53(46.101.32.198) + > ;; WHEN: Tue Feb 23 01:43:51 GMT 2021 + > ;; MSG SIZE rcvd: 125 + + + > .... + > dnsmasq[1]: auth[A] zeppelin.gaia-prod.aglais.uk from 81.187.247.196 + > dnsmasq[1]: /etc/dnsmasq/hosts zeppelin.gaia-prod.aglais.uk is 128.232.227.197 + > .... + + + +# ----------------------------------------------------- +# Update our LCN nameserver records. +#[root@ansibler] + + infra-ops A 46.101.32.198 + www CNAME zeppelin.gaia-prod.aglais.uk + zeppelin CNAME zeppelin.gaia-prod.aglais.uk + gaia-dev NS infra-ops.aglais.uk + gaia-test NS infra-ops.aglais.uk + gaia-prod NS infra-ops.aglais.uk + + +# ----------------------------------------------------- +# Query the dev hostname via our local DNS. 
+#[user@desktop] + + dig 'zeppelin.gaia-dev.aglais.uk' + + > + > ; <<>> DiG 9.11.26-RedHat-9.11.26-2.fc32 <<>> zeppelin.gaia-dev.aglais.uk + > ;; global options: +cmd + > ;; Got answer: + > ;; ->>HEADER<<- opcode: QUERY, status: NOERROR, id: 63618 + > ;; flags: qr rd ra; QUERY: 1, ANSWER: 1, AUTHORITY: 13, ADDITIONAL: 1 + > + > ;; OPT PSEUDOSECTION: + > ; EDNS: version: 0, flags:; udp: 4096 + > ; COOKIE: 786bc567224a9e6584e88f4c603460c0d5dc757d25f6d17a (good) + > ;; QUESTION SECTION: + > ;zeppelin.gaia-dev.aglais.uk. IN A + > + > ;; ANSWER SECTION: + > zeppelin.gaia-dev.aglais.uk. 300 IN A 128.232.227.197 + > + > ;; AUTHORITY SECTION: + > . 79428 IN NS d.root-servers.net. + > . 79428 IN NS i.root-servers.net. + > . 79428 IN NS k.root-servers.net. + > . 79428 IN NS c.root-servers.net. + > . 79428 IN NS l.root-servers.net. + > . 79428 IN NS h.root-servers.net. + > . 79428 IN NS a.root-servers.net. + > . 79428 IN NS b.root-servers.net. + > . 79428 IN NS f.root-servers.net. + > . 79428 IN NS m.root-servers.net. + > . 79428 IN NS g.root-servers.net. + > . 79428 IN NS j.root-servers.net. + > . 79428 IN NS e.root-servers.net. + > + > ;; Query time: 62 msec + > ;; SERVER: 10.4.0.2#53(10.4.0.2) + > ;; WHEN: Tue Feb 23 01:56:16 GMT 2021 + > ;; MSG SIZE rcvd: 311 + + > .... + > dnsmasq[1]: auth[A] zeppelin.gaia-dev.aglais.uk from 90.155.53.34 + > dnsmasq[1]: /etc/dnsmasq/hosts 128.232.227.197 is zeppelin.gaia-dev.aglais.uk + > .... + + + # Query originates from our ISP. + + host 90.155.53.34 + + > 34.53.155.90.in-addr.arpa domain name pointer b-dns-thn.aa.net.uk. + + +# ----------------------------------------------------- +# Query the public service name via our local DNS. 
+#[user@desktop] + + dig 'zeppelin.aglais.uk' + + > ; <<>> DiG 9.11.26-RedHat-9.11.26-2.fc32 <<>> zeppelin.aglais.uk + > ;; global options: +cmd + > ;; Got answer: + > ;; ->>HEADER<<- opcode: QUERY, status: NOERROR, id: 4530 + > ;; flags: qr rd ra; QUERY: 1, ANSWER: 2, AUTHORITY: 13, ADDITIONAL: 1 + > + > ;; OPT PSEUDOSECTION: + > ; EDNS: version: 0, flags:; udp: 4096 + > ; COOKIE: 9fd63624d4944e80a40705ca60346180714b55bdf57e9829 (good) + > ;; QUESTION SECTION: + > ;zeppelin.aglais.uk. IN A + > + > ;; ANSWER SECTION: + > zeppelin.aglais.uk. 600 IN CNAME zeppelin.gaia-prod.aglais.uk. + > zeppelin.gaia-prod.aglais.uk. 300 IN A 128.232.227.197 + > + > ;; AUTHORITY SECTION: + > . 79236 IN NS i.root-servers.net. + > . 79236 IN NS c.root-servers.net. + > . 79236 IN NS j.root-servers.net. + > . 79236 IN NS a.root-servers.net. + > . 79236 IN NS g.root-servers.net. + > . 79236 IN NS l.root-servers.net. + > . 79236 IN NS m.root-servers.net. + > . 79236 IN NS b.root-servers.net. + > . 79236 IN NS h.root-servers.net. + > . 79236 IN NS d.root-servers.net. + > . 79236 IN NS k.root-servers.net. + > . 79236 IN NS e.root-servers.net. + > . 79236 IN NS f.root-servers.net. + > + > ;; Query time: 67 msec + > ;; SERVER: 10.4.0.2#53(10.4.0.2) + > ;; WHEN: Tue Feb 23 01:59:28 GMT 2021 + > ;; MSG SIZE rcvd: 335 + + diff --git a/notes/zrq/20210222-03-infra-ops.txt b/notes/zrq/20210222-03-infra-ops.txt new file mode 100644 index 00000000..c1d592ec --- /dev/null +++ b/notes/zrq/20210222-03-infra-ops.txt @@ -0,0 +1,147 @@ +# +# +# +# Copyright (c) 2021, ROE (http://www.roe.ac.uk/) +# +# This information is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
+#
+# This information is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see .
+#
+#
+#
+#zrq-notes-time
+#zrq-notes-indent
+#zrq-notes-crypto
+#zrq-notes-ansible
+#zrq-notes-osformat
+#zrq-notes-zeppelin
+#
+
+    Target:
+
+        Notes on a suspicious (possibly malicious) scan directed at our DNS service.
+        Probably just normal bad stuff you get if you have something listening on port 53.
+
+# -----------------------------------------------------
+
+    Seen in the DNSmasq logs on 2021-02-22
+
+        dnsmasq[1]: auth[TXT] version.bind from 80.82.77.139
+        dnsmasq[1]: auth[TXT] id.server from 80.82.77.139
+        dnsmasq[1]: auth[TXT] hostname.bind from 80.82.77.139
+        dnsmasq[1]: auth[A] direct.shodan.io from 80.82.77.139
+
+    The first three are attempts to identify what version of Bind we might be running.
+
+        https://serverfault.com/questions/215724/disable-bind-9-3-6-hostname-disclosure
+        https://www.osi.security/blog/determining-bind-dns-version-using-dig
+
+    The fourth is a request for the address of the IoT search engine.
+
+        https://www.shodan.io/
+
+    The IP address making the queries is from a hosting company registered in the Seychelles.
+
+        whois 80.82.77.139
+
+        > ....
+        > ....
+ > inetnum: 80.82.77.0 - 80.82.77.255 + > netname: NET-1-77 + > descr: IPV NETBLOCK + > country: NL + > geoloc: 52.370216 4.895168 + > org: ORG-IVI1-RIPE + > admin-c: IVI24-RIPE + > tech-c: IVI24-RIPE + > status: ASSIGNED PA + > mnt-by: IPV + > mnt-lower: IPV + > mnt-routes: IPV + > created: 2013-04-26T10:57:52Z + > last-modified: 2019-02-01T18:30:06Z + > source: RIPE + > + > organisation: ORG-IVI1-RIPE + > org-name: IP Volume inc + > org-type: OTHER + > address: Suite 9 + > address: Victoria, Mahe + > address: Seychelles + > abuse-c: IVNO1-RIPE + > mnt-ref: IPV + > mnt-by: IPV + > created: 2018-05-14T11:46:50Z + > last-modified: 2019-01-31T14:39:36Z + > source: RIPE # Filtered + > .... + > .... + + They do not have a good reputation. + + https://badpackets.net/a-conversation-with-ripe-ncc-regarding-quasi-networks-ltd/ + https://scamalytics.com/ip/isp/ip-volume-inc + + +# ----------------------------------------------------- +# Try sending our DNSmasq server the same query. +#[user@desktop] + + dig '@infra-ops.aglais.uk' 'direct.shodan.io' + + > ; <<>> DiG 9.11.26-RedHat-9.11.26-2.fc32 <<>> @infra-ops.aglais.uk direct.shodan.io + > ; (1 server found) + > ;; global options: +cmd + > ;; Got answer: + > ;; ->>HEADER<<- opcode: QUERY, status: NOERROR, id: 11408 + > ;; flags: qr rd ad; QUERY: 1, ANSWER: 0, AUTHORITY: 0, ADDITIONAL: 1 + > ;; WARNING: recursion requested but not available + > + > ;; OPT PSEUDOSECTION: + > ; EDNS: version: 0, flags:; udp: 4096 + > ;; QUESTION SECTION: + > ;direct.shodan.io. 
IN A + > + > ;; Query time: 16 msec + > ;; SERVER: 46.101.32.198#53(46.101.32.198) + > ;; WHEN: Mon Feb 22 19:58:28 GMT 2021 + > ;; MSG SIZE rcvd: 45 + + + host 'direct.shodan.io' 'infra-ops.aglais.uk' + + > Trying "direct.shodan.io" + > Using domain server: + > Name: infra-ops.aglais.uk + > Address: 46.101.32.198#53 + > Aliases: + > + > ;; ->>HEADER<<- opcode: QUERY, status: NOERROR, id: 56963 + > ;; flags: qr rd; QUERY: 1, ANSWER: 0, AUTHORITY: 0, ADDITIONAL: 0 + > + > ;; QUESTION SECTION: + > ;direct.shodan.io. IN ANY + > + > Received 34 bytes from 46.101.32.198#53 in 14 ms + + + Creates similar entries in the DNSmasq logs + + > dnsmasq[1]: auth[A] direct.shodan.io from 81.187.247.196 + > dnsmasq[1]: auth[A] direct.shodan.io from 81.187.247.196 + > dnsmasq[1]: auth[AAAA] direct.shodan.io from 81.187.247.196 + > dnsmasq[1]: auth[MX] direct.shodan.io from 81.187.247.196 + > dnsmasq[1]: auth[ANY] direct.shodan.io from 81.187.247.196 + + + + diff --git a/notes/zrq/20210223-01-infra-ops.txt b/notes/zrq/20210223-01-infra-ops.txt new file mode 100644 index 00000000..d406666a --- /dev/null +++ b/notes/zrq/20210223-01-infra-ops.txt @@ -0,0 +1,229 @@ +# +# +# +# Copyright (c) 2021, ROE (http://www.roe.ac.uk/) +# +# This information is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This information is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
+# +# +# +#zrq-notes-time +#zrq-notes-indent +#zrq-notes-crypto +#zrq-notes-ansible +#zrq-notes-osformat +#zrq-notes-zeppelin +# + + Target: + + Update DNSmasq hosts with new IP address. + + Result: + + Success - DNS record updated. + + +# ----------------------------------------------------- +# Update the hosts file +#[user@desktop] + + source "${HOME:?}/aglais.env" + + pushd "${AGLAIS_CODE:?}" + + pushd experiments/infra-ops/ansible + + gedit hosts.yml + + gaia-prod: + zeppelin: + ~ publicip4: '128.232.227.212' + + + popd + popd + + +# ----------------------------------------------------- +# Create a container to work with. +#[user@desktop] + + source "${HOME:?}/aglais.env" + + podman run \ + --rm \ + --tty \ + --interactive \ + --name infra-ops \ + --hostname infra-ops \ + --env "SSH_AUTH_SOCK=/mnt/ssh_auth_sock" \ + --volume "${SSH_AUTH_SOCK}:/mnt/ssh_auth_sock:rw,z" \ + --volume "${AGLAIS_CODE:?}/experiments/infra-ops:/infra-ops:ro,z" \ + atolmis/ansible-client:2020.12.02 \ + bash + +# ----------------------------------------------------- +# Install the Podman container plugin. +# TODO - Add this to our ansible-client image. +#[root@ansibler] + + ansible-galaxy collection install containers.podman + + > Process install dependency map + > Starting collection install process + > Installing 'containers.podman:1.4.3' to '/root/.ansible/collections/ansible_collections/containers/podman' + + +# ----------------------------------------------------- +# Run our Ansible scripts ... +#[root@ansibler] + + pushd "/infra-ops/ansible" + + ansible-playbook \ + --inventory "hosts.yml" \ + "01-ssh-config.yml" + + ansible-playbook \ + --inventory "hosts.yml" \ + "02-ping-test.yml" + + ansible-playbook \ + --inventory "hosts.yml" \ + "04-dns-hosts.yml" + + popd + + + > .... 
+ > TASK [Update DNSmasq host files] ************************************************************************** + > ok: [Hizzoria] => (item={'key': 'gaia-dev', 'value': {....}}) + > ok: [Hizzoria] => (item={'key': 'gaia-test', 'value': None}) + > changed: [Hizzoria] => (item={'key': 'gaia-prod', 'value': {'zeppelin': {'publicip4': '128.232.227.212'}}}) + > .... + + +# ----------------------------------------------------- +# Login to the Droplet and tail the DNSmasq log. +#[root@ansibler] + + ssh Hizzoria \ + ' + podman logs --follow dnsmasq + ' + + > .... + > .... + + +# ----------------------------------------------------- +# Test queries direct to our DNS server. +#[user@desktop] + + dig '@infra-ops.aglais.uk' 'zeppelin.gaia-prod.aglais.uk' + + + > ; <<>> DiG 9.11.26-RedHat-9.11.26-2.fc32 <<>> @infra-ops.aglais.uk zeppelin.gaia-prod.aglais.uk + > ; (1 server found) + > ;; global options: +cmd + > ;; Got answer: + > ;; ->>HEADER<<- opcode: QUERY, status: NOERROR, id: 10057 + > ;; flags: qr aa rd ad; QUERY: 1, ANSWER: 2, AUTHORITY: 1, ADDITIONAL: 1 + > ;; WARNING: recursion requested but not available + > + > ;; OPT PSEUDOSECTION: + > ; EDNS: version: 0, flags:; udp: 4096 + > ;; QUESTION SECTION: + > ;zeppelin.gaia-prod.aglais.uk. IN A + > + > ;; ANSWER SECTION: + > zeppelin.gaia-prod.aglais.uk. 300 IN A 128.232.227.212 + > zeppelin.gaia-prod.aglais.uk. 300 IN A 128.232.227.197 + > + > ;; AUTHORITY SECTION: + > gaia-prod.aglais.uk. 300 IN NS infra-ops.aglais.uk. + > + > ;; Query time: 15 msec + > ;; SERVER: 46.101.32.198#53(46.101.32.198) + > ;; WHEN: Tue Feb 23 06:54:35 GMT 2021 + > ;; MSG SIZE rcvd: 141 + + # + # Two answers for the same host - one is from the cache. + # Need to send a SIGHUP signal to flush the cache. + # + +# ----------------------------------------------------- +# Try send a SIGHUP signal to flush the cache. 
+#[root@ansibler]
+
+    ssh Hizzoria
+
+        podman kill --signal SIGHUP dnsmasq
+
+
+# -----------------------------------------------------
+# Test queries direct to our DNS server.
+#[user@desktop]
+
+    dig '@infra-ops.aglais.uk' 'zeppelin.gaia-prod.aglais.uk'
+
+    > ....
+    > ;; ANSWER SECTION:
+    > zeppelin.gaia-prod.aglais.uk. 300 IN A 128.232.227.212
+    > zeppelin.gaia-prod.aglais.uk. 300 IN A 128.232.227.197
+    > ....
+
+
+# -----------------------------------------------------
+# Try restarting DNSmasq to flush the cache.
+#[root@ansibler]
+
+    ssh Hizzoria
+
+        podman stop dnsmasq
+
+        sleep 1
+
+        podman start dnsmasq
+
+
+# -----------------------------------------------------
+# Test queries direct to our DNS server.
+#[user@desktop]
+
+    dig '@infra-ops.aglais.uk' 'zeppelin.gaia-prod.aglais.uk'
+
+    > ....
+    > ;; ANSWER SECTION:
+    > zeppelin.gaia-prod.aglais.uk. 300 IN A 128.232.227.212
+    > ....
+
+
+# -----------------------------------------------------
+# Query the public service name via our local DNS.
+#[user@desktop]
+
+    dig 'zeppelin.aglais.uk'
+
+    > ....
+    > ;; ANSWER SECTION:
+    > zeppelin.aglais.uk. 600 IN CNAME zeppelin.gaia-prod.aglais.uk.
+    > zeppelin.gaia-prod.aglais.uk. 300 IN A 128.232.227.212
+    > ....
+
+
+
+
diff --git a/notes/zrq/20210225-01-infra-ops.txt b/notes/zrq/20210225-01-infra-ops.txt
new file mode 100644
index 00000000..4e5cbf6a
--- /dev/null
+++ b/notes/zrq/20210225-01-infra-ops.txt
@@ -0,0 +1,1128 @@
+#
+#
+#
+# Copyright (c) 2021, ROE (http://www.roe.ac.uk/)
+#
+# This information is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This information is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +# +# +#zrq-notes-time +#zrq-notes-indent +#zrq-notes-crypto +#zrq-notes-ansible +#zrq-notes-osformat +#zrq-notes-zeppelin +# + + + Target: + + Investigate errors with DNS resolution. + Email from Dennis + + Just letting you know the IRIS system isn’t accessible at the minute. + It appears the login page doesn’t exist and/or the server can’t be found? + No pressure, I know there are issue at the minute, but wasn’t sure if you were aware of this. + + + Result: + + Restarting the Pod got the service working again, + but no clue as to why it failed and if/when it will happen again. + + + +# ----------------------------------------------------- +# Check the name resolves. +#[user@desktop] + + host zeppelin.aglais.uk + + > ;; connection timed out; no servers could be reached + + + host -a zeppelin.aglais.uk + + > .... + > .... + > ;; ANSWER SECTION: + > zeppelin.aglais.uk. 372 IN CNAME zeppelin.gaia-prod.aglais.uk. + > + > ;; AUTHORITY SECTION: + > aglais.uk. 171966 IN NS ns2.lcn.com. + > aglais.uk. 171966 IN NS ns1.lcn.com. + > aglais.uk. 171966 IN NS ns0.lcn.com. + + + host zeppelin.gaia-prod.aglais.uk. + + + > ;; connection timed out; no servers could be reached + + + # + # Suggests that our DNS service isn't working. + # + +# ----------------------------------------------------- +# Test query direct to our DNS server. +#[user@desktop] + + dig '@infra-ops.aglais.uk' 'zeppelin.gaia-prod.aglais.uk' + + > ; (1 server found) + > ;; global options: +cmd + > ;; connection timed out; no servers could be reached + + +# ----------------------------------------------------- +# Test query direct to the LCN server. +#[user@desktop] + + dig '@ns2.lcn.com' 'gaia-prod.aglais.uk' + + > .... + > ;; QUESTION SECTION: + > ;gaia-prod.aglais.uk. IN A + > + > ;; AUTHORITY SECTION: + > gaia-prod.aglais.uk. 
600 IN NS infra-ops.aglais.uk. + > + > ;; ADDITIONAL SECTION: + > infra-ops.aglais.uk. 600 IN A 46.101.32.198 + > .... + + +# ----------------------------------------------------- +# Create a container to work with. +#[user@desktop] + + source "${HOME:?}/aglais.env" + + podman run \ + --rm \ + --tty \ + --interactive \ + --name infra-ops \ + --hostname infra-ops \ + --env "SSH_AUTH_SOCK=/mnt/ssh_auth_sock" \ + --volume "${SSH_AUTH_SOCK}:/mnt/ssh_auth_sock:rw,z" \ + --volume "${AGLAIS_CODE:?}/experiments/infra-ops:/infra-ops:ro,z" \ + atolmis/ansible-client:2020.12.02 \ + bash + + +# ----------------------------------------------------- +# Run the Ansible script to generate our local SSH config. +#[root@ansibler] + + pushd "/infra-ops/ansible" + + ansible-playbook \ + --inventory "hosts.yml" \ + "01-ssh-config.yml" + + popd + + > .... + > .... + + +# ----------------------------------------------------- +# Login to the DNS server and see what we can find .. +#[root@ansibler] + + ssh Hizzoria + + +# ----------------------------------------------------- +# Check the Pod is still running +#[root@Hizzoria] + + podman ps -a + + > podman ps -a + > CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES + > 314312dfde70 docker.io/storytel/dnsmasq:latest dnsmasq 2 days ago Up 2 days ago dnsmasq + + +# ----------------------------------------------------- +# Check the logs. +#[root@Hizzoria] + + podman logs --follow dnsmasq + + > .... + > .... 
+ > dnsmasq[1]: auth[A] zeppelin.gaia-prod.aglais.uk from 90.155.53.33 + > dnsmasq[1]: /etc/dnsmasq/hosts 128.232.227.212 is zeppelin.gaia-prod.aglais.uk + > dnsmasq[1]: auth[RRSIG] pizzaseo.com from 5.157.62.2 + > dnsmasq[1]: auth[A] zeppelin.gaia-prod.aglais.uk from 129.215.17.202 + > dnsmasq[1]: /etc/dnsmasq/hosts 128.232.227.212 is zeppelin.gaia-prod.aglais.uk + > dnsmasq[1]: auth[TXT] VERSION.BIND from 185.173.35.25 + > dnsmasq[1]: auth[A] zeppelin.gaia-prod.aglais.uk from 129.215.16.12 + > dnsmasq[1]: /etc/dnsmasq/hosts 128.232.227.212 is zeppelin.gaia-prod.aglais.uk + > dnsmasq[1]: auth[RRSIG] pizzaseo.com from 188.166.8.202 + > dnsmasq[1]: auth[RRSIG] pizzaseo.com from 188.166.8.202 + > dnsmasq[1]: auth[ANY] mz.gov.pl from 178.79.138.36 + > dnsmasq[1]: auth[A] zeppelin.gaia-prod.aglais.uk from 81.97.95.154 + > dnsmasq[1]: /etc/dnsmasq/hosts 128.232.227.212 is zeppelin.gaia-prod.aglais.uk + > dnsmasq[1]: auth[A] 2e6520c6.asert-dns-research.com from 146.88.240.4 + > dnsmasq[1]: auth[A] ya.ru from 209.188.7.170 + > dnsmasq[1]: auth[A] www.wikipedia.org from 146.88.240.12 + > dnsmasq[1]: auth[AAAA] 67b.org from 80.82.65.90 + > dnsmasq[1]: auth[A] zeppelin.gaia-prod.aglais.uk from 199.30.231.12 + > dnsmasq[1]: /etc/dnsmasq/hosts 128.232.227.212 is zeppelin.gaia-prod.aglais.uk + > dnsmasq[1]: auth[TXT] hp.com from 123.58.180.77 + > dnsmasq[1]: auth[TXT] hp.com from 123.58.180.77 + > dnsmasq[1]: auth[TXT] hp.com from 123.58.180.77 + > dnsmasq[1]: auth[TXT] version.bind from 104.140.188.22 + > dnsmasq[1]: auth[A] zePpeLIN.gaiA-prOD.aglAIs.Uk from 35.172.165.0 + > dnsmasq[1]: /etc/dnsmasq/hosts 128.232.227.212 is zePpeLIN.gaiA-prOD.aglAIs.Uk + > dnsmasq[1]: auth[AAAA] ZEPpEliN.GaiA-PRod.aGlAIS.UK from 35.172.165.0 + > dnsmasq[1]: auth[TXT] dns-test.research.a10protects.com from 45.79.54.171 + > dnsmasq[1]: auth[TXT] ebay.com from 123.58.180.77 + > dnsmasq[1]: auth[ANY] com from 185.94.111.1 + > dnsmasq[1]: auth[A] researchscan541.eecs.umich.edu from 141.212.123.34 + 
> dnsmasq[1]: auth[A] dnsscan.shadowserver.org from 74.82.47.6
+    > dnsmasq[1]: auth[PTR] 213.1.168.192.in-addr.arpa from 167.248.133.27
+    > dnsmasq[1]: auth[A] invalid.parrotdns.com from 167.248.133.53
+    > dnsmasq[1]: auth[A] ip.parrotdns.com from 167.248.133.53
+    > dnsmasq[1]: auth[TXT] version.bind from 167.248.133.53
+    > dnsmasq[1]: auth[A] zeppelin.gaia-prod.aglais.uk from 207.102.138.19
+    > dnsmasq[1]: /etc/dnsmasq/hosts 128.232.227.212 is zeppelin.gaia-prod.aglais.uk
+    > dnsmasq[1]: auth[AAAA] zeppelin.gaia-prod.aglais.uk from 207.102.138.19
+    > dnsmasq[1]: auth[A] zeppelin.gaia-prod.aglais.uk from 66.249.66.203
+    > dnsmasq[1]: /etc/dnsmasq/hosts 128.232.227.212 is zeppelin.gaia-prod.aglais.uk
+    > dnsmasq[1]: auth[A] ZEPPELin.GAiA-prOd.aGLAis.uK from 3.233.239.118
+    > dnsmasq[1]: /etc/dnsmasq/hosts 128.232.227.212 is ZEPPELin.GAiA-prOd.aGLAis.uK
+    > dnsmasq[1]: auth[AAAA] ZEppelIN.GAiA-PRod.aGLaIS.uk from 3.233.239.118
+    > dnsmasq[1]: auth[A] researchscan541.eecs.umich.edu from 141.212.123.25
+    > dnsmasq[1]: auth[TXT] tmz.com from 123.58.180.77
+    > dnsmasq[1]: auth[TXT] tmz.com from 123.58.180.77
+
+    # TODO See if we can get a date stamp on these entries ?
+
+    # Looks like a mixture of genuine requests and malicious poking.
+
+    # This is an interesting one ...
+
+    > dnsmasq[1]: auth[A] ZEPPELin.GAiA-prOd.aGLAis.uK from 3.233.239.118
+    > dnsmasq[1]: /etc/dnsmasq/hosts 128.232.227.212 is ZEPPELin.GAiA-prOd.aGLAis.uK
+
+
+# -----------------------------------------------------
+# Check the disc space on the VM.
+#[root@Hizzoria] + + df -h + + > Filesystem Size Used Avail Use% Mounted on + > devtmpfs 470M 0 470M 0% /dev + > tmpfs 487M 84K 487M 1% /dev/shm + > tmpfs 195M 616K 195M 1% /run + > /dev/vda1 25G 1.7G 22G 8% / + > tmpfs 487M 0 487M 0% /tmp + > shm 63M 0 63M 0% /var/lib/containers/storage/overlay-containers/314312dfde7016a8ba0cebd36b752a5a9971e6f9523998dad025d5d0aedd5bcb/userdata/shm + > overlay 25G 1.7G 22G 8% /var/lib/containers/storage/overlay/9661016980dfe5613577610f8ede05969f5cc360f08b12548c5f35a7df160184/merged + > tmpfs 98M 0 98M 0% /run/user/0 + + +# ----------------------------------------------------- +# Check the disc space inside the Pod. +#[root@Hizzoria] + + podman exec -it dnsmasq df -h + + > Filesystem Size Used Available Use% Mounted on + > overlay 24.5G 1.7G 21.8G 7% / + > tmpfs 64.0M 0 64.0M 0% /dev + > shm 62.5M 0 62.5M 0% /dev/shm + > /dev/vda1 24.5G 1.7G 21.8G 7% /etc/dnsmasq + > tmpfs 194.7M 620.0K 194.1M 0% /etc/hostname + > tmpfs 194.7M 620.0K 194.1M 0% /etc/resolv.conf + > tmpfs 194.7M 620.0K 194.1M 0% /etc/hosts + > tmpfs 194.7M 620.0K 194.1M 0% /run/.containerenv + > tmpfs 194.7M 620.0K 194.1M 0% /run/secrets + + +# ----------------------------------------------------- +# Inspect the Pod. 
+#[root@Hizzoria] + + podman inspect dnsmasq + + > [ + > { + > "Id": "314312dfde7016a8ba0cebd36b752a5a9971e6f9523998dad025d5d0aedd5bcb", + > "Created": "2021-02-23T01:38:43.1790103Z", + > "Path": "dnsmasq", + > "Args": [ + > "dnsmasq" + > ], + > "State": { + > "OciVersion": "1.0.2-dev", + > "Status": "running", + > "Running": true, + > "Paused": false, + > "Restarting": false, + > "OOMKilled": false, + > "Dead": false, + > "Pid": 20076, + > "ConmonPid": 20073, + > "ExitCode": 0, + > "Error": "", + > "StartedAt": "2021-02-23T07:06:43.426937776Z", + > "FinishedAt": "2021-02-23T07:06:41.970034999Z", + > "Healthcheck": { + > "Status": "", + > "FailingStreak": 0, + > "Log": null + > } + > }, + > "Image": "a12355af408b83950f803716de133c0440c54c5808125044d6b71898bfb5bdf3", + > "ImageName": "docker.io/storytel/dnsmasq:latest", + > "Rootfs": "", + > "Pod": "", + > "ResolvConfPath": "/var/run/containers/storage/overlay-containers/314312dfde7016a8ba0cebd36b752a5a9971e6f9523998dad025d5d0aedd5bcb/userdata/resolv.conf", + > "HostnamePath": "/var/run/containers/storage/overlay-containers/314312dfde7016a8ba0cebd36b752a5a9971e6f9523998dad025d5d0aedd5bcb/userdata/hostname", + > "HostsPath": "/var/run/containers/storage/overlay-containers/314312dfde7016a8ba0cebd36b752a5a9971e6f9523998dad025d5d0aedd5bcb/userdata/hosts", + > "StaticDir": "/var/lib/containers/storage/overlay-containers/314312dfde7016a8ba0cebd36b752a5a9971e6f9523998dad025d5d0aedd5bcb/userdata", + > "OCIConfigPath": "/var/lib/containers/storage/overlay-containers/314312dfde7016a8ba0cebd36b752a5a9971e6f9523998dad025d5d0aedd5bcb/userdata/config.json", + > "OCIRuntime": "crun", + > "LogPath": "/var/lib/containers/storage/overlay-containers/314312dfde7016a8ba0cebd36b752a5a9971e6f9523998dad025d5d0aedd5bcb/userdata/ctr.log", + > "LogTag": "", + > "ConmonPidFile": "/var/run/containers/storage/overlay-containers/314312dfde7016a8ba0cebd36b752a5a9971e6f9523998dad025d5d0aedd5bcb/userdata/conmon.pid", + > "Name": "dnsmasq", + > 
"RestartCount": 0, + > "Driver": "overlay", + > "MountLabel": "system_u:object_r:container_file_t:s0:c385,c403", + > "ProcessLabel": "", + > "AppArmorProfile": "", + > "EffectiveCaps": [ + > "CAP_CHOWN", + > "CAP_DAC_OVERRIDE", + > "CAP_DAC_READ_SEARCH", + > "CAP_FOWNER", + > "CAP_FSETID", + > "CAP_KILL", + > "CAP_SETGID", + > "CAP_SETUID", + > "CAP_SETPCAP", + > "CAP_LINUX_IMMUTABLE", + > "CAP_NET_BIND_SERVICE", + > "CAP_NET_BROADCAST", + > "CAP_NET_ADMIN", + > "CAP_NET_RAW", + > "CAP_IPC_LOCK", + > "CAP_IPC_OWNER", + > "CAP_SYS_MODULE", + > "CAP_SYS_RAWIO", + > "CAP_SYS_CHROOT", + > "CAP_SYS_PTRACE", + > "CAP_SYS_PACCT", + > "CAP_SYS_ADMIN", + > "CAP_SYS_BOOT", + > "CAP_SYS_NICE", + > "CAP_SYS_RESOURCE", + > "CAP_SYS_TIME", + > "CAP_SYS_TTY_CONFIG", + > "CAP_MKNOD", + > "CAP_LEASE", + > "CAP_AUDIT_WRITE", + > "CAP_AUDIT_CONTROL", + > "CAP_SETFCAP", + > "CAP_MAC_OVERRIDE", + > "CAP_MAC_ADMIN", + > "CAP_SYSLOG", + > "CAP_WAKE_ALARM", + > "CAP_BLOCK_SUSPEND", + > "CAP_AUDIT_READ" + > ], + > "BoundingCaps": [ + > "CAP_CHOWN", + > "CAP_DAC_OVERRIDE", + > "CAP_DAC_READ_SEARCH", + > "CAP_FOWNER", + > "CAP_FSETID", + > "CAP_KILL", + > "CAP_SETGID", + > "CAP_SETUID", + > "CAP_SETPCAP", + > "CAP_LINUX_IMMUTABLE", + > "CAP_NET_BIND_SERVICE", + > "CAP_NET_BROADCAST", + > "CAP_NET_ADMIN", + > "CAP_NET_RAW", + > "CAP_IPC_LOCK", + > "CAP_IPC_OWNER", + > "CAP_SYS_MODULE", + > "CAP_SYS_RAWIO", + > "CAP_SYS_CHROOT", + > "CAP_SYS_PTRACE", + > "CAP_SYS_PACCT", + > "CAP_SYS_ADMIN", + > "CAP_SYS_BOOT", + > "CAP_SYS_NICE", + > "CAP_SYS_RESOURCE", + > "CAP_SYS_TIME", + > "CAP_SYS_TTY_CONFIG", + > "CAP_MKNOD", + > "CAP_LEASE", + > "CAP_AUDIT_WRITE", + > "CAP_AUDIT_CONTROL", + > "CAP_SETFCAP", + > "CAP_MAC_OVERRIDE", + > "CAP_MAC_ADMIN", + > "CAP_SYSLOG", + > "CAP_WAKE_ALARM", + > "CAP_BLOCK_SUSPEND", + > "CAP_AUDIT_READ" + > ], + > "ExecIDs": [], + > "GraphDriver": { + > "Name": "overlay", + > "Data": { + > "LowerDir": 
"/var/lib/containers/storage/overlay/7bb8a4351055007d2f87cb9bb2902da18fd7c410f9da470b4ef56e78b94080a3/diff:/var/lib/containers/storage/overlay/cd7100a72410606589a54b932cabd804a17f9ae5b42a1882bd56d263e02b6215/diff", + > "MergedDir": "/var/lib/containers/storage/overlay/9661016980dfe5613577610f8ede05969f5cc360f08b12548c5f35a7df160184/merged", + > "UpperDir": "/var/lib/containers/storage/overlay/9661016980dfe5613577610f8ede05969f5cc360f08b12548c5f35a7df160184/diff", + > "WorkDir": "/var/lib/containers/storage/overlay/9661016980dfe5613577610f8ede05969f5cc360f08b12548c5f35a7df160184/work" + > } + > }, + > "Mounts": [ + > { + > "Type": "bind", + > "Name": "", + > "Source": "/var/aglais/dnsmasq", + > "Destination": "/etc/dnsmasq", + > "Driver": "", + > "Mode": "", + > "Options": [ + > "rbind" + > ], + > "RW": false, + > "Propagation": "rprivate" + > } + > ], + > "Dependencies": [], + > "NetworkSettings": { + > "EndpointID": "", + > "Gateway": "", + > "IPAddress": "", + > "IPPrefixLen": 0, + > "IPv6Gateway": "", + > "GlobalIPv6Address": "", + > "GlobalIPv6PrefixLen": 0, + > "MacAddress": "", + > "Bridge": "", + > "SandboxID": "", + > "HairpinMode": false, + > "LinkLocalIPv6Address": "", + > "LinkLocalIPv6PrefixLen": 0, + > "Ports": {}, + > "SandboxKey": "" + > }, + > "ExitCommand": [ + > "/usr/bin/podman", + > "--root", + > "/var/lib/containers/storage", + > "--runroot", + > "/var/run/containers/storage", + > "--log-level", + > "error", + > "--cgroup-manager", + > "systemd", + > "--tmpdir", + > "/var/run/libpod", + > "--runtime", + > "crun", + > "--storage-driver", + > "overlay", + > "--storage-opt", + > "overlay.mountopt=nodev,metacopy=on", + > "--events-backend", + > "journald", + > "container", + > "cleanup", + > "314312dfde7016a8ba0cebd36b752a5a9971e6f9523998dad025d5d0aedd5bcb" + > ], + > "Namespace": "", + > "IsInfra": false, + > "Config": { + > "Hostname": "Hizzoria", + > "Domainname": "", + > "User": "", + > "AttachStdin": false, + > "AttachStdout": false, + > 
"AttachStderr": false, + > "Tty": false, + > "OpenStdin": false, + > "StdinOnce": false, + > "Env": [ + > "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", + > "TERM=xterm", + > "container=podman", + > "HOSTNAME=Hizzoria", + > "HOME=/root" + > ], + > "Cmd": [ + > "dnsmasq" + > ], + > "Image": "docker.io/storytel/dnsmasq:latest", + > "Volumes": null, + > "WorkingDir": "/", + > "Entrypoint": "", + > "OnBuild": null, + > "Labels": null, + > "Annotations": { + > "io.container.manager": "libpod", + > "io.kubernetes.cri-o.Created": "2021-02-23T01:38:43.1790103Z", + > "io.kubernetes.cri-o.TTY": "false", + > "io.podman.annotations.autoremove": "FALSE", + > "io.podman.annotations.init": "FALSE", + > "io.podman.annotations.privileged": "TRUE", + > "io.podman.annotations.publish-all": "FALSE", + > "org.opencontainers.image.stopSignal": "15" + > }, + > "StopSignal": 15, + > "CreateCommand": [ + > "podman", + > "container", + > "run", + > "--name", + > "dnsmasq", + > "--detach=True", + > "--privileged=True", + > "--network", + > "host", + > "--restart=on-failure:10", + > "--publish", + > "53:53/tcp", + > "--volume", + > "/var/aglais/dnsmasq:/etc/dnsmasq:ro", + > "storytel/dnsmasq" + > ], + > "Umask": "0022" + > }, + > "HostConfig": { + > "Binds": [ + > "/var/aglais/dnsmasq:/etc/dnsmasq:ro,rprivate,rbind" + > ], + > "CgroupManager": "systemd", + > "CgroupMode": "private", + > "ContainerIDFile": "", + > "LogConfig": { + > "Type": "k8s-file", + > "Config": null + > }, + > "NetworkMode": "host", + > "PortBindings": {}, + > "RestartPolicy": { + > "Name": "on-failure", + > "MaximumRetryCount": 10 + > }, + > "AutoRemove": false, + > "VolumeDriver": "", + > "VolumesFrom": null, + > "CapAdd": [], + > "CapDrop": [], + > "Dns": [], + > "DnsOptions": [], + > "DnsSearch": [], + > "ExtraHosts": [], + > "GroupAdd": [], + > "IpcMode": "private", + > "Cgroup": "", + > "Cgroups": "default", + > "Links": null, + > "OomScoreAdj": 0, + > "PidMode": "private", + > "Privileged": 
true, + > "PublishAllPorts": false, + > "ReadonlyRootfs": false, + > "SecurityOpt": [], + > "Tmpfs": {}, + > "UTSMode": "private", + > "UsernsMode": "", + > "ShmSize": 65536000, + > "Runtime": "oci", + > "ConsoleSize": [ + > 0, + > 0 + > ], + > "Isolation": "", + > "CpuShares": 0, + > "Memory": 0, + > "NanoCpus": 0, + > "CgroupParent": "", + > "BlkioWeight": 0, + > "BlkioWeightDevice": null, + > "BlkioDeviceReadBps": null, + > "BlkioDeviceWriteBps": null, + > "BlkioDeviceReadIOps": null, + > "BlkioDeviceWriteIOps": null, + > "CpuPeriod": 0, + > "CpuQuota": 0, + > "CpuRealtimePeriod": 0, + > "CpuRealtimeRuntime": 0, + > "CpusetCpus": "", + > "CpusetMems": "", + > "Devices": [], + > "DiskQuota": 0, + > "KernelMemory": 0, + > "MemoryReservation": 0, + > "MemorySwap": 0, + > "MemorySwappiness": 0, + > "OomKillDisable": false, + > "PidsLimit": 2048, + > "Ulimits": [ + > { + > "Name": "RLIMIT_NOFILE", + > "Soft": 1048576, + > "Hard": 1048576 + > }, + > { + > "Name": "RLIMIT_NPROC", + > "Soft": 4194304, + > "Hard": 4194304 + > } + > ], + > "CpuCount": 0, + > "CpuPercent": 0, + > "IOMaximumIOps": 0, + > "IOMaximumBandwidth": 0, + > "CgroupConf": null + > } + > } + > ] + + # I can't remember if the NetworkSettings block should be empty ? + + > .... + > "NetworkSettings": { + > "EndpointID": "", + > "Gateway": "", + > "IPAddress": "", + > "IPPrefixLen": 0, + > "IPv6Gateway": "", + > "GlobalIPv6Address": "", + > "GlobalIPv6PrefixLen": 0, + > "MacAddress": "", + > "Bridge": "", + > "SandboxID": "", + > "HairpinMode": false, + > "LinkLocalIPv6Address": "", + > "LinkLocalIPv6PrefixLen": 0, + > "Ports": {}, + > "SandboxKey": "" + > }, + > .... + + +# ----------------------------------------------------- +# Login to the Pod and check the filesystem is OK. +#[root@Hizzoria] + + podman exec -it dnsmasq /bin/sh + + +# ----------------------------------------------------- +# .... 
+#[root@dnsmasq] + + ls /etc/dnsmasq + + > aglais.conf + > hosts + + + cat /etc/dnsmasq/aglais.conf + + > .... + > .... + > no-hosts + > no-resolv + > no-daemon + > bogus-priv + > domain-needed + > keep-in-foreground + > + > auth-ttl=300 + > local-ttl=300 + > + > log-queries + > log-facility=- + > + > bind-dynamic + > except-interface=localhost + > + > hostsdir=/etc/dnsmasq/hosts + > + > auth-server=infra-ops.aglais.uk,eth0 + > + > auth-zone=gaia-dev.aglais.uk + > auth-zone=gaia-test.aglais.uk + > auth-zone=gaia-prod.aglais.uk + + + cat /etc/dnsmasq/hosts/gaia-prod.hosts + + > .... + > .... + > 128.232.227.212 zeppelin.gaia-prod.aglais.uk + + + +# ----------------------------------------------------- +# .... +#[root@dnsmasq] + + ifconfig + + > .... + > eth0 Link encap:Ethernet HWaddr 86:AB:2D:00:CB:2A + > inet addr:46.101.32.198 Bcast:46.101.63.255 Mask:255.255.192.0 + > inet6 addr: 2a03:b0c0:1:d0::b53:6001/64 Scope:Global + > inet6 addr: fe80::84ab:2dff:fe00:cb2a/64 Scope:Link + > UP BROADCAST RUNNING MULTICAST MTU:1500 Metric:1 + > RX packets:300997 errors:0 dropped:0 overruns:0 frame:0 + > TX packets:78428 errors:0 dropped:0 overruns:0 carrier:0 + > collisions:0 txqueuelen:1000 + > RX bytes:373962484 (356.6 MiB) TX bytes:7941941 (7.5 MiB) + > + > eth1 Link encap:Ethernet HWaddr BE:FF:FC:81:F0:74 + > inet addr:10.106.0.2 Bcast:10.106.15.255 Mask:255.255.240.0 + > inet6 addr: fe80::bcff:fcff:fe81:f074/64 Scope:Link + > UP BROADCAST RUNNING MULTICAST MTU:1500 Metric:1 + > RX packets:75 errors:0 dropped:0 overruns:0 frame:0 + > TX packets:83 errors:0 dropped:0 overruns:0 carrier:0 + > collisions:0 txqueuelen:1000 + > RX bytes:5326 (5.2 KiB) TX bytes:5850 (5.7 KiB) + > .... + + + +# ----------------------------------------------------- +# Out of ideas .. try restarting the Pod ? +#[root@Hizzoria] + + podman stop dnsmasq + sleep 5 + podman start dnsmasq + + > .... + > .... 
+ + +# ----------------------------------------------------- +# ----------------------------------------------------- +# Test query direct to our DNS server. +#[user@desktop] + + dig '@infra-ops.aglais.uk' 'zeppelin.gaia-prod.aglais.uk' + + > .... + > .... + > ;; QUESTION SECTION: + > ;zeppelin.gaia-prod.aglais.uk. IN A + > + > ;; ANSWER SECTION: + > zeppelin.gaia-prod.aglais.uk. 300 IN A 128.232.227.212 + > + > ;; AUTHORITY SECTION: + > gaia-prod.aglais.uk. 300 IN NS infra-ops.aglais.uk. + > + > ;; Query time: 14 msec + > ;; SERVER: 46.101.32.198#53(46.101.32.198) + > ;; WHEN: Thu Feb 25 17:23:48 GMT 2021 + > ;; MSG SIZE rcvd: 125 + + +# ----------------------------------------------------- +# ----------------------------------------------------- +# Inspect the Pod now it is working .... +#[root@Hizzoria] + + podman inspect dnsmasq + + > [ + > { + > "Id": "314312dfde7016a8ba0cebd36b752a5a9971e6f9523998dad025d5d0aedd5bcb", + > "Created": "2021-02-23T01:38:43.1790103Z", + > "Path": "dnsmasq", + > "Args": [ + > "dnsmasq" + > ], + > "State": { + > "OciVersion": "1.0.2-dev", + > "Status": "running", + > "Running": true, + > "Paused": false, + > "Restarting": false, + > "OOMKilled": false, + > "Dead": false, + > "Pid": 25313, + > "ConmonPid": 25310, + > "ExitCode": 0, + > "Error": "", + > "StartedAt": "2021-02-25T17:23:10.323794038Z", + > "FinishedAt": "2021-02-25T17:23:04.795965881Z", + > "Healthcheck": { + > "Status": "", + > "FailingStreak": 0, + > "Log": null + > } + > }, + > "Image": "a12355af408b83950f803716de133c0440c54c5808125044d6b71898bfb5bdf3", + > "ImageName": "docker.io/storytel/dnsmasq:latest", + > "Rootfs": "", + > "Pod": "", + > "ResolvConfPath": "/var/run/containers/storage/overlay-containers/314312dfde7016a8ba0cebd36b752a5a9971e6f9523998dad025d5d0aedd5bcb/userdata/resolv.conf", + > "HostnamePath": "/var/run/containers/storage/overlay-containers/314312dfde7016a8ba0cebd36b752a5a9971e6f9523998dad025d5d0aedd5bcb/userdata/hostname", + > "HostsPath": 
"/var/run/containers/storage/overlay-containers/314312dfde7016a8ba0cebd36b752a5a9971e6f9523998dad025d5d0aedd5bcb/userdata/hosts", + > "StaticDir": "/var/lib/containers/storage/overlay-containers/314312dfde7016a8ba0cebd36b752a5a9971e6f9523998dad025d5d0aedd5bcb/userdata", + > "OCIConfigPath": "/var/lib/containers/storage/overlay-containers/314312dfde7016a8ba0cebd36b752a5a9971e6f9523998dad025d5d0aedd5bcb/userdata/config.json", + > "OCIRuntime": "crun", + > "LogPath": "/var/lib/containers/storage/overlay-containers/314312dfde7016a8ba0cebd36b752a5a9971e6f9523998dad025d5d0aedd5bcb/userdata/ctr.log", + > "LogTag": "", + > "ConmonPidFile": "/var/run/containers/storage/overlay-containers/314312dfde7016a8ba0cebd36b752a5a9971e6f9523998dad025d5d0aedd5bcb/userdata/conmon.pid", + > "Name": "dnsmasq", + > "RestartCount": 0, + > "Driver": "overlay", + > "MountLabel": "system_u:object_r:container_file_t:s0:c385,c403", + > "ProcessLabel": "", + > "AppArmorProfile": "", + > "EffectiveCaps": [ + > "CAP_CHOWN", + > "CAP_DAC_OVERRIDE", + > "CAP_DAC_READ_SEARCH", + > "CAP_FOWNER", + > "CAP_FSETID", + > "CAP_KILL", + > "CAP_SETGID", + > "CAP_SETUID", + > "CAP_SETPCAP", + > "CAP_LINUX_IMMUTABLE", + > "CAP_NET_BIND_SERVICE", + > "CAP_NET_BROADCAST", + > "CAP_NET_ADMIN", + > "CAP_NET_RAW", + > "CAP_IPC_LOCK", + > "CAP_IPC_OWNER", + > "CAP_SYS_MODULE", + > "CAP_SYS_RAWIO", + > "CAP_SYS_CHROOT", + > "CAP_SYS_PTRACE", + > "CAP_SYS_PACCT", + > "CAP_SYS_ADMIN", + > "CAP_SYS_BOOT", + > "CAP_SYS_NICE", + > "CAP_SYS_RESOURCE", + > "CAP_SYS_TIME", + > "CAP_SYS_TTY_CONFIG", + > "CAP_MKNOD", + > "CAP_LEASE", + > "CAP_AUDIT_WRITE", + > "CAP_AUDIT_CONTROL", + > "CAP_SETFCAP", + > "CAP_MAC_OVERRIDE", + > "CAP_MAC_ADMIN", + > "CAP_SYSLOG", + > "CAP_WAKE_ALARM", + > "CAP_BLOCK_SUSPEND", + > "CAP_AUDIT_READ" + > ], + > "BoundingCaps": [ + > "CAP_CHOWN", + > "CAP_DAC_OVERRIDE", + > "CAP_DAC_READ_SEARCH", + > "CAP_FOWNER", + > "CAP_FSETID", + > "CAP_KILL", + > "CAP_SETGID", + > "CAP_SETUID", + > "CAP_SETPCAP", 
+ > "CAP_LINUX_IMMUTABLE", + > "CAP_NET_BIND_SERVICE", + > "CAP_NET_BROADCAST", + > "CAP_NET_ADMIN", + > "CAP_NET_RAW", + > "CAP_IPC_LOCK", + > "CAP_IPC_OWNER", + > "CAP_SYS_MODULE", + > "CAP_SYS_RAWIO", + > "CAP_SYS_CHROOT", + > "CAP_SYS_PTRACE", + > "CAP_SYS_PACCT", + > "CAP_SYS_ADMIN", + > "CAP_SYS_BOOT", + > "CAP_SYS_NICE", + > "CAP_SYS_RESOURCE", + > "CAP_SYS_TIME", + > "CAP_SYS_TTY_CONFIG", + > "CAP_MKNOD", + > "CAP_LEASE", + > "CAP_AUDIT_WRITE", + > "CAP_AUDIT_CONTROL", + > "CAP_SETFCAP", + > "CAP_MAC_OVERRIDE", + > "CAP_MAC_ADMIN", + > "CAP_SYSLOG", + > "CAP_WAKE_ALARM", + > "CAP_BLOCK_SUSPEND", + > "CAP_AUDIT_READ" + > ], + > "ExecIDs": [], + > "GraphDriver": { + > "Name": "overlay", + > "Data": { + > "LowerDir": "/var/lib/containers/storage/overlay/7bb8a4351055007d2f87cb9bb2902da18fd7c410f9da470b4ef56e78b94080a3/diff:/var/lib/containers/storage/overlay/cd7100a72410606589a54b932cabd804a17f9ae5b42a1882bd56d263e02b6215/diff", + > "MergedDir": "/var/lib/containers/storage/overlay/9661016980dfe5613577610f8ede05969f5cc360f08b12548c5f35a7df160184/merged", + > "UpperDir": "/var/lib/containers/storage/overlay/9661016980dfe5613577610f8ede05969f5cc360f08b12548c5f35a7df160184/diff", + > "WorkDir": "/var/lib/containers/storage/overlay/9661016980dfe5613577610f8ede05969f5cc360f08b12548c5f35a7df160184/work" + > } + > }, + > "Mounts": [ + > { + > "Type": "bind", + > "Name": "", + > "Source": "/var/aglais/dnsmasq", + > "Destination": "/etc/dnsmasq", + > "Driver": "", + > "Mode": "", + > "Options": [ + > "rbind" + > ], + > "RW": false, + > "Propagation": "rprivate" + > } + > ], + > "Dependencies": [], + > "NetworkSettings": { + > "EndpointID": "", + > "Gateway": "", + > "IPAddress": "", + > "IPPrefixLen": 0, + > "IPv6Gateway": "", + > "GlobalIPv6Address": "", + > "GlobalIPv6PrefixLen": 0, + > "MacAddress": "", + > "Bridge": "", + > "SandboxID": "", + > "HairpinMode": false, + > "LinkLocalIPv6Address": "", + > "LinkLocalIPv6PrefixLen": 0, + > "Ports": {}, + > "SandboxKey": 
"" + > }, + > "ExitCommand": [ + > "/usr/bin/podman", + > "--root", + > "/var/lib/containers/storage", + > "--runroot", + > "/var/run/containers/storage", + > "--log-level", + > "error", + > "--cgroup-manager", + > "systemd", + > "--tmpdir", + > "/var/run/libpod", + > "--runtime", + > "crun", + > "--storage-driver", + > "overlay", + > "--storage-opt", + > "overlay.mountopt=nodev,metacopy=on", + > "--events-backend", + > "journald", + > "container", + > "cleanup", + > "314312dfde7016a8ba0cebd36b752a5a9971e6f9523998dad025d5d0aedd5bcb" + > ], + > "Namespace": "", + > "IsInfra": false, + > "Config": { + > "Hostname": "Hizzoria", + > "Domainname": "", + > "User": "", + > "AttachStdin": false, + > "AttachStdout": false, + > "AttachStderr": false, + > "Tty": false, + > "OpenStdin": false, + > "StdinOnce": false, + > "Env": [ + > "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", + > "TERM=xterm", + > "container=podman", + > "HOSTNAME=Hizzoria", + > "HOME=/root" + > ], + > "Cmd": [ + > "dnsmasq" + > ], + > "Image": "docker.io/storytel/dnsmasq:latest", + > "Volumes": null, + > "WorkingDir": "/", + > "Entrypoint": "", + > "OnBuild": null, + > "Labels": null, + > "Annotations": { + > "io.container.manager": "libpod", + > "io.kubernetes.cri-o.Created": "2021-02-23T01:38:43.1790103Z", + > "io.kubernetes.cri-o.TTY": "false", + > "io.podman.annotations.autoremove": "FALSE", + > "io.podman.annotations.init": "FALSE", + > "io.podman.annotations.privileged": "TRUE", + > "io.podman.annotations.publish-all": "FALSE", + > "org.opencontainers.image.stopSignal": "15" + > }, + > "StopSignal": 15, + > "CreateCommand": [ + > "podman", + > "container", + > "run", + > "--name", + > "dnsmasq", + > "--detach=True", + > "--privileged=True", + > "--network", + > "host", + > "--restart=on-failure:10", + > "--publish", + > "53:53/tcp", + > "--volume", + > "/var/aglais/dnsmasq:/etc/dnsmasq:ro", + > "storytel/dnsmasq" + > ], + > "Umask": "0022" + > }, + > "HostConfig": { + > 
"Binds": [ + > "/var/aglais/dnsmasq:/etc/dnsmasq:ro,rprivate,rbind" + > ], + > "CgroupManager": "systemd", + > "CgroupMode": "private", + > "ContainerIDFile": "", + > "LogConfig": { + > "Type": "k8s-file", + > "Config": null + > }, + > "NetworkMode": "host", + > "PortBindings": {}, + > "RestartPolicy": { + > "Name": "on-failure", + > "MaximumRetryCount": 10 + > }, + > "AutoRemove": false, + > "VolumeDriver": "", + > "VolumesFrom": null, + > "CapAdd": [], + > "CapDrop": [], + > "Dns": [], + > "DnsOptions": [], + > "DnsSearch": [], + > "ExtraHosts": [], + > "GroupAdd": [], + > "IpcMode": "private", + > "Cgroup": "", + > "Cgroups": "default", + > "Links": null, + > "OomScoreAdj": 0, + > "PidMode": "private", + > "Privileged": true, + > "PublishAllPorts": false, + > "ReadonlyRootfs": false, + > "SecurityOpt": [], + > "Tmpfs": {}, + > "UTSMode": "private", + > "UsernsMode": "", + > "ShmSize": 65536000, + > "Runtime": "oci", + > "ConsoleSize": [ + > 0, + > 0 + > ], + > "Isolation": "", + > "CpuShares": 0, + > "Memory": 0, + > "NanoCpus": 0, + > "CgroupParent": "", + > "BlkioWeight": 0, + > "BlkioWeightDevice": null, + > "BlkioDeviceReadBps": null, + > "BlkioDeviceWriteBps": null, + > "BlkioDeviceReadIOps": null, + > "BlkioDeviceWriteIOps": null, + > "CpuPeriod": 0, + > "CpuQuota": 0, + > "CpuRealtimePeriod": 0, + > "CpuRealtimeRuntime": 0, + > "CpusetCpus": "", + > "CpusetMems": "", + > "Devices": [], + > "DiskQuota": 0, + > "KernelMemory": 0, + > "MemoryReservation": 0, + > "MemorySwap": 0, + > "MemorySwappiness": 0, + > "OomKillDisable": false, + > "PidsLimit": 2048, + > "Ulimits": [ + > { + > "Name": "RLIMIT_NOFILE", + > "Soft": 1048576, + > "Hard": 1048576 + > }, + > { + > "Name": "RLIMIT_NPROC", + > "Soft": 4194304, + > "Hard": 4194304 + > } + > ], + > "CpuCount": 0, + > "CpuPercent": 0, + > "IOMaximumIOps": 0, + > "IOMaximumBandwidth": 0, + > "CgroupConf": null + > } + > } + > ] + + # We can compare these later to see if there is any difference. 
+ + # Note the NetworkSettings block is indeed full of blanks. + + > .... + > "NetworkSettings": { + > "EndpointID": "", + > "Gateway": "", + > "IPAddress": "", + > "IPPrefixLen": 0, + > "IPv6Gateway": "", + > "GlobalIPv6Address": "", + > "GlobalIPv6PrefixLen": 0, + > "MacAddress": "", + > "Bridge": "", + > "SandboxID": "", + > "HairpinMode": false, + > "LinkLocalIPv6Address": "", + > "LinkLocalIPv6PrefixLen": 0, + > "Ports": {}, + > "SandboxKey": "" + > }, + > .... + +