From ffe21373c777b47a326269501d64578ef71b9ecf Mon Sep 17 00:00:00 2001 From: "zrq-github@metagrid.co.uk" Date: Thu, 11 Feb 2021 16:34:57 +0000 Subject: [PATCH 01/27] Removed gateway node --- .../hadoop-yarn/ansible/03-create-masters.yml | 4 +-- .../hadoop-yarn/ansible/04-create-workers.yml | 4 +-- .../hadoop-yarn/ansible/05-config-ssh.yml | 6 ++--- .../hadoop-yarn/ansible/06-config-dns.yml | 26 +++---------------- .../hadoop-yarn/ansible/07-host-keys.yml | 4 +-- .../hadoop-yarn/ansible/08-ping-test.yml | 2 +- .../ansible/25-create-zeppelin.yml | 15 ++++++++++- .../hadoop-yarn/ansible/51-cephfs-mount.yml | 2 +- .../hadoop-yarn/ansible/combined-01.yml | 1 - .../hadoop-yarn/ansible/create-all.yml | 3 +-- experiments/hadoop-yarn/ansible/hosts.yml | 4 --- .../ansible/templates/dns-hosts.j2 | 8 ++---- .../ansible/templates/hadoop-workers.j2 | 21 +++++++++++++++ .../ansible/templates/ssh-ansible.j2 | 19 ++++---------- 14 files changed, 58 insertions(+), 61 deletions(-) diff --git a/experiments/hadoop-yarn/ansible/03-create-masters.yml b/experiments/hadoop-yarn/ansible/03-create-masters.yml index facb7fb0..8c621019 100644 --- a/experiments/hadoop-yarn/ansible/03-create-masters.yml +++ b/experiments/hadoop-yarn/ansible/03-create-masters.yml @@ -35,7 +35,7 @@ register: mastersec - - name: "Add a rule to allow SSH from our gateway" + - name: "Add a rule to allow SSH from zeppelin" os_security_group_rule: cloud: "{{ cloudname }}" state: present @@ -44,7 +44,7 @@ protocol: 'tcp' port_range_min: 22 port_range_max: 22 - remote_group: "{{ security['gateway'] }}" + remote_group: "{{ security['zeppelin'] }}" - name: "Create our masters" os_server: diff --git a/experiments/hadoop-yarn/ansible/04-create-workers.yml b/experiments/hadoop-yarn/ansible/04-create-workers.yml index eeb43bb6..e6dc0cda 100644 --- a/experiments/hadoop-yarn/ansible/04-create-workers.yml +++ b/experiments/hadoop-yarn/ansible/04-create-workers.yml @@ -35,7 +35,7 @@ register: secgroup - - name: "Add a rule to allow ssh from the gateway" + - name: "Add a rule to allow ssh from zeppelin" os_security_group_rule: cloud: "{{ cloudname }}" state: present @@ -44,7 +44,7 @@ protocol: 'tcp' port_range_min: 22 port_range_max: 22 - remote_group: "{{ security['gateway'] }}" + remote_group: "{{ security['zeppelin'] }}" - name: "Create our workers" os_server: diff --git a/experiments/hadoop-yarn/ansible/05-config-ssh.yml b/experiments/hadoop-yarn/ansible/05-config-ssh.yml index 4f60131e..72a42edc 100644 --- a/experiments/hadoop-yarn/ansible/05-config-ssh.yml +++ b/experiments/hadoop-yarn/ansible/05-config-ssh.yml @@ -35,12 +35,12 @@ mode: 'u=rwx,g=rx,o=rx' state: directory - - name: "Discover our gateway nodes" + - name: "Discover our zeppelin node" os_server_info: cloud: "{{ cloudname }}" - server: "{{ deployname }}-gateway" + server: "{{ deployname }}-zeppelin" register: - gatewaynodes + zeppelinnodes - name: "Generate Ansible SSH config" template: diff --git a/experiments/hadoop-yarn/ansible/06-config-dns.yml b/experiments/hadoop-yarn/ansible/06-config-dns.yml index 1e20d587..0ec41589 100644 --- a/experiments/hadoop-yarn/ansible/06-config-dns.yml +++ b/experiments/hadoop-yarn/ansible/06-config-dns.yml @@ -26,12 +26,12 @@ - /tmp/ansible-vars.yml tasks: - - name: "Discover our gateway nodes" + - name: "Discover our Zeppelin node" os_server_info: cloud: "{{ cloudname }}" - server: "{{ deployname }}-gateway*" + server: "{{ deployname }}-zeppelin" register: - gatewaynodes + zeppelinnode - name: "Discover our master nodes" os_server_info: @@ -47,34 +47,16 
@@ register: workernodes - - name: "Discover our Zeppelin nodes" - os_server_info: - cloud: "{{ cloudname }}" - server: "{{ deployname }}-zeppelin" - register: - zeppelinnode - - name: "Generate our DNS hosts file" template: src: 'templates/dns-hosts.j2' dest: "/tmp/aglais-dns-hosts" -- hosts: gateway - gather_facts: false - tasks: - - name: "Deploy [/etc/hosts] to our gateway" - become: true - copy: - src: /tmp/aglais-dns-hosts - dest: /etc/hosts - owner: root - group: root - mode: u=rw,g=r,o=r - hosts: zeppelin gather_facts: false tasks: - - name: "Deploy [/etc/hosts] to our Zeppelin" + - name: "Deploy [/etc/hosts] to our Zeppelin node" become: true copy: src: /tmp/aglais-dns-hosts diff --git a/experiments/hadoop-yarn/ansible/07-host-keys.yml b/experiments/hadoop-yarn/ansible/07-host-keys.yml index ea87e946..fbfc803c 100644 --- a/experiments/hadoop-yarn/ansible/07-host-keys.yml +++ b/experiments/hadoop-yarn/ansible/07-host-keys.yml @@ -22,7 +22,7 @@ # https://everythingshouldbevirtual.com/automation/ansible-ssh-known-host-keys/ # -- hosts: gateway +- hosts: zeppelin gather_facts: false tasks: @@ -50,7 +50,7 @@ dest: "/tmp/aglais-ssh-hosts" -- hosts: gateway:masters:workers:zeppelin +- hosts: masters:workers:zeppelin gather_facts: false tasks: - name: "Deploy the known hosts file to [/etc/ssh/ssh_known_hosts]" diff --git a/experiments/hadoop-yarn/ansible/08-ping-test.yml b/experiments/hadoop-yarn/ansible/08-ping-test.yml index c07607ae..b8e8cd33 100644 --- a/experiments/hadoop-yarn/ansible/08-ping-test.yml +++ b/experiments/hadoop-yarn/ansible/08-ping-test.yml @@ -22,7 +22,7 @@ --- - name: "Ping tests" - hosts: gateway:masters:workers:zeppelin + hosts: zeppelin:masters:workers gather_facts: false tasks: diff --git a/experiments/hadoop-yarn/ansible/25-create-zeppelin.yml b/experiments/hadoop-yarn/ansible/25-create-zeppelin.yml index 9ce10005..5768e05e 100644 --- a/experiments/hadoop-yarn/ansible/25-create-zeppelin.yml +++ b/experiments/hadoop-yarn/ansible/25-create-zeppelin.yml @@ -35,17 +35,30 @@ register: zeppelinsec - - name: "Add a rule to allow ssh from the gateway" + - name: "Add a security rule for IPv4 SSH" os_security_group_rule: cloud: "{{ cloudname }}" state: present security_group: "{{ zeppelinsec.id }}" direction: 'ingress' protocol: 'tcp' + ethertype: 'IPv4' port_range_min: 22 port_range_max: 22 remote_ip_prefix: '0.0.0.0/0' + - name: "Add a security rule for IPv6 SSH" + os_security_group_rule: + cloud: "{{ cloudname }}" + state: present + security_group: "{{ zeppelinsec.id }}" + direction: 'ingress' + protocol: 'tcp' + ethertype: 'IPv6' + port_range_min: 22 + port_range_max: 22 + remote_ip_prefix: '::/0' + - name: "Add a security rule for IPv4 Port 8080" os_security_group_rule: cloud: "{{ cloudname }}" diff --git a/experiments/hadoop-yarn/ansible/51-cephfs-mount.yml b/experiments/hadoop-yarn/ansible/51-cephfs-mount.yml index 17b669f5..6704d885 100644 --- a/experiments/hadoop-yarn/ansible/51-cephfs-mount.yml +++ b/experiments/hadoop-yarn/ansible/51-cephfs-mount.yml @@ -38,7 +38,7 @@ --- - name: "Install and mount a CephFS share" - hosts: gateway:masters:workers:zeppelin + hosts: zeppelin:masters:workers gather_facts: false vars_files: - /tmp/ansible-vars.yml diff --git a/experiments/hadoop-yarn/ansible/combined-01.yml b/experiments/hadoop-yarn/ansible/combined-01.yml index 2015238a..bc7f5fc3 100644 --- a/experiments/hadoop-yarn/ansible/combined-01.yml +++ b/experiments/hadoop-yarn/ansible/combined-01.yml @@ -22,7 +22,6 @@ --- - import_playbook: 01-create-network.yml -- 
import_playbook: 02-create-gateway.yml - import_playbook: 03-create-masters.yml - import_playbook: 04-create-workers.yml - import_playbook: 25-create-zeppelin.yml diff --git a/experiments/hadoop-yarn/ansible/create-all.yml b/experiments/hadoop-yarn/ansible/create-all.yml index b021db8a..9476842c 100644 --- a/experiments/hadoop-yarn/ansible/create-all.yml +++ b/experiments/hadoop-yarn/ansible/create-all.yml @@ -22,10 +22,9 @@ --- - import_playbook: 01-create-keypair.yml - import_playbook: 01-create-network.yml -- import_playbook: 02-create-gateway.yml +- import_playbook: 25-create-zeppelin.yml - import_playbook: 03-create-masters.yml - import_playbook: 04-create-workers.yml -- import_playbook: 25-create-zeppelin.yml - import_playbook: 05-config-ssh.yml - import_playbook: 06-config-dns.yml diff --git a/experiments/hadoop-yarn/ansible/hosts.yml b/experiments/hadoop-yarn/ansible/hosts.yml index 67e1d8ad..150585eb 100644 --- a/experiments/hadoop-yarn/ansible/hosts.yml +++ b/experiments/hadoop-yarn/ansible/hosts.yml @@ -49,10 +49,6 @@ all: ansible_host_key_checking: false hosts: - gateway: - login: 'fedora' - image: 'Fedora-30-1.2' - flavor: 'general.v1.tiny' zeppelin: login: 'fedora' diff --git a/experiments/hadoop-yarn/ansible/templates/dns-hosts.j2 b/experiments/hadoop-yarn/ansible/templates/dns-hosts.j2 index 75aabc4d..b170552d 100644 --- a/experiments/hadoop-yarn/ansible/templates/dns-hosts.j2 +++ b/experiments/hadoop-yarn/ansible/templates/dns-hosts.j2 @@ -26,8 +26,8 @@ 127.0.0.1 localhost localhost.localdomain localhost4 localhost4.localdomain4 ::1 localhost localhost.localdomain localhost6 localhost6.localdomain6 -# Gateway nodes. -{% for node in gatewaynodes.openstack_servers %} +# Zeppelin nodes. +{% for node in zeppelinnode.openstack_servers %} {{ "%-15s" | format(node.private_v4,) }} {{ node.metadata.hostname }} {% endfor %} @@ -41,7 +41,3 @@ {{ "%-15s" | format(node.private_v4,) }} {{ node.metadata.hostname }} {% endfor %} -# Zeppelin nodes. -{% for node in zeppelinnode.openstack_servers %} -{{ "%-15s" | format(node.private_v4,) }} {{ node.metadata.hostname }} -{% endfor %} diff --git a/experiments/hadoop-yarn/ansible/templates/hadoop-workers.j2 b/experiments/hadoop-yarn/ansible/templates/hadoop-workers.j2 index ce03890f..66f637ef 100644 --- a/experiments/hadoop-yarn/ansible/templates/hadoop-workers.j2 +++ b/experiments/hadoop-yarn/ansible/templates/hadoop-workers.j2 @@ -1,3 +1,24 @@ +{# +# +# +# Copyright (c) 2020, ROE (http://www.roe.ac.uk/) +# +# This information is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This information is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
+# +# +#} + {% for worker in groups['workers'] %} {{ worker }} {% endfor %} diff --git a/experiments/hadoop-yarn/ansible/templates/ssh-ansible.j2 b/experiments/hadoop-yarn/ansible/templates/ssh-ansible.j2 index f995ac83..0cbdbe25 100644 --- a/experiments/hadoop-yarn/ansible/templates/ssh-ansible.j2 +++ b/experiments/hadoop-yarn/ansible/templates/ssh-ansible.j2 @@ -28,9 +28,9 @@ ServerAliveInterval 60 ServerAliveCountMax 5 # Primary gateway node. -Host gateway - User {{ hostvars['gateway'].login }} - HostName {{ gatewaynodes.openstack_servers[0].accessIPv4 }} +Host zeppelin + User {{ hostvars['zeppelin'].login }} + HostName {{ zeppelinnodes.openstack_servers[0].accessIPv4 }} ControlPath ~/.ssh/%r@%h:%p ControlMaster auto ControlPersist 5m @@ -39,7 +39,7 @@ Host gateway {% for hostname in groups['masters'] %} Host {{ hostname }} User {{ hostvars[hostname]['login'] }} - ProxyCommand ssh -W %h:%p -l {{ hostvars['gateway'].login }} -F {{ lookup('env','HOME') }}/.ssh/ansible-config gateway + ProxyCommand ssh -W %h:%p -l {{ hostvars['zeppelin'].login }} -F {{ lookup('env','HOME') }}/.ssh/ansible-config zeppelin ControlPath ~/.ssh/%r@%h:%p ControlMaster auto ControlPersist 5m @@ -49,19 +49,10 @@ Host {{ hostname }} {% for hostname in groups['workers'] %} Host {{ hostname }} User {{ hostvars[hostname]['login'] }} - ProxyCommand ssh -W %h:%p -l {{ hostvars['gateway'].login }} -F {{ lookup('env','HOME') }}/.ssh/ansible-config gateway + ProxyCommand ssh -W %h:%p -l {{ hostvars['zeppelin'].login }} -F {{ lookup('env','HOME') }}/.ssh/ansible-config zeppelin ControlPath ~/.ssh/%r@%h:%p ControlMaster auto ControlPersist 5m {% endfor %} - -# Zeppelin node -Host zeppelin - User {{ hostvars['zeppelin']['login'] }} - ProxyCommand ssh -W %h:%p -l {{ hostvars['zeppelin'].login }} -F {{ lookup('env','HOME') }}/.ssh/ansible-config gateway - ControlPath ~/.ssh/%r@%h:%p - ControlMaster auto - ControlPersist 5m - From 5f1cb10a28c9584a468cef4b61cc9a907c1e4198 Mon Sep 17 00:00:00 2001 From: "zrq-github@metagrid.co.uk" Date: Thu, 11 Feb 2021 16:44:53 +0000 Subject: [PATCH 02/27] Fix to catch all the keys created by create-all --- experiments/openstack/bin/delete-all.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/experiments/openstack/bin/delete-all.sh b/experiments/openstack/bin/delete-all.sh index 46dd5282..beb232e9 100755 --- a/experiments/openstack/bin/delete-all.sh +++ b/experiments/openstack/bin/delete-all.sh @@ -354,7 +354,7 @@ --os-cloud "${cloudname:?}" \ keypair list \ --format json \ - | jq -r '.[] | select(.Name | startswith("aglais")) | .Name' + | jq -r '.[] | select(.Name | startswith("'${cloudname:?}'")) | .Name' ) do echo "- Deleting key [${keyname:?}]" From d0906a1a31ac86d582748a52dbc87b2bab133275 Mon Sep 17 00:00:00 2001 From: "zrq-github@metagrid.co.uk" Date: Thu, 11 Feb 2021 16:45:26 +0000 Subject: [PATCH 03/27] Finish notes on cherry picking --- notes/zrq/20210206-01-git-cherry-pick.txt | 32 +++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/notes/zrq/20210206-01-git-cherry-pick.txt b/notes/zrq/20210206-01-git-cherry-pick.txt index 1421114d..00311762 100644 --- a/notes/zrq/20210206-01-git-cherry-pick.txt +++ b/notes/zrq/20210206-01-git-cherry-pick.txt @@ -788,6 +788,38 @@ git add 'notes/zrq/20210206-01-git-cherry-pick.txt' + git commit -m "Added notes on git cherry picking" + git push + + popd + + +# ----------------------------------------------------- +# Create a new working branch .... 
+#[user@desktop] + + prevbranch=${nextbranch:?} + nextbranch=$(date '+%Y%m%d')-zrq-working + + source "${HOME}/aglais.env" + pushd "${AGLAIS_CODE}" + git checkout -b "${nextbranch:?}" + + > Switched to a new branch '20210206-zrq-working' + + + git push --set-upstream 'origin' "${nextbranch:?}" + + > Total 0 (delta 0), reused 0 (delta 0), pack-reused 0 + > remote: + > remote: Create a pull request for '20210206-zrq-working' on GitHub by visiting: + > remote: https://github.com/Zarquan/aglais/pull/new/20210206-zrq-working + > remote: + > To github.com:Zarquan/aglais.git + > * [new branch] 20210206-zrq-working -> 20210206-zrq-working + > Branch '20210206-zrq-working' set up to track remote branch '20210206-zrq-working' from 'origin'. + + popd From 24324016ae2a43b97c63bcd23c2d4bce296805a9 Mon Sep 17 00:00:00 2001 From: "zrq-github@metagrid.co.uk" Date: Thu, 11 Feb 2021 16:58:44 +0000 Subject: [PATCH 04/27] Moved Hadoop, Spark and Zeppelin vars into hosts.yml --- .../hadoop-yarn/ansible/11-install-hadoop.yml | 7 --- .../ansible/12-config-hadoop-core.yml | 7 --- .../ansible/13-config-hdfs-namenode.yml | 6 --- .../ansible/14-config-hdfs-workers.yml | 7 --- .../ansible/16-config-yarn-masters.yml | 7 --- .../ansible/17-config-yarn-workers.yml | 7 --- .../hadoop-yarn/ansible/20-install-spark.yml | 8 --- .../ansible/22-config-spark-master.yml | 9 ---- .../ansible/24-install-pyspark.yml | 6 --- .../ansible/27-install-zeppelin.yml | 7 --- experiments/hadoop-yarn/ansible/hosts.yml | 50 +++++++++++++++++++ 11 files changed, 50 insertions(+), 71 deletions(-) diff --git a/experiments/hadoop-yarn/ansible/11-install-hadoop.yml b/experiments/hadoop-yarn/ansible/11-install-hadoop.yml index 427bbaa0..696d9e88 100644 --- a/experiments/hadoop-yarn/ansible/11-install-hadoop.yml +++ b/experiments/hadoop-yarn/ansible/11-install-hadoop.yml @@ -29,13 +29,6 @@ - name: "Install Hadoop" hosts: masters:workers:zeppelin gather_facts: false - vars: - hdname: "hadoop-3.1.3" - hdbase: "/opt" - hdhome: "/opt/hadoop" - hddata: "/var/local/hadoop" - hdhost: "{{groups['masters'][0]}}" - hduser: "{{hostvars[inventory_hostname].login}}" tasks: diff --git a/experiments/hadoop-yarn/ansible/12-config-hadoop-core.yml b/experiments/hadoop-yarn/ansible/12-config-hadoop-core.yml index bbbe6e71..2c4e50ad 100644 --- a/experiments/hadoop-yarn/ansible/12-config-hadoop-core.yml +++ b/experiments/hadoop-yarn/ansible/12-config-hadoop-core.yml @@ -23,13 +23,6 @@ - name: "Configure Hadoop [core-site.xml]" hosts: masters:workers:zeppelin gather_facts: false - vars: - hdname: "hadoop-3.1.3" - hdbase: "/opt" - hdhome: "/opt/hadoop" - hddata: "/var/local/hadoop" - hdhost: "{{groups['masters'][0]}}" - hduser: "{{hostvars[inventory_hostname].login}}" tasks: diff --git a/experiments/hadoop-yarn/ansible/13-config-hdfs-namenode.yml b/experiments/hadoop-yarn/ansible/13-config-hdfs-namenode.yml index caae6b5b..4b8344c4 100644 --- a/experiments/hadoop-yarn/ansible/13-config-hdfs-namenode.yml +++ b/experiments/hadoop-yarn/ansible/13-config-hdfs-namenode.yml @@ -24,12 +24,6 @@ hosts: master01:zeppelin gather_facts: false vars: - hdname: "hadoop-3.1.3" - hdbase: "/opt" - hdhome: "/opt/hadoop" - hddata: "/var/local/hadoop" - hdhost: "{{groups['masters'][0]}}" - hduser: "{{hostvars[inventory_hostname].login}}" tasks: diff --git a/experiments/hadoop-yarn/ansible/14-config-hdfs-workers.yml b/experiments/hadoop-yarn/ansible/14-config-hdfs-workers.yml index 6df8aafe..fe7341b0 100644 --- a/experiments/hadoop-yarn/ansible/14-config-hdfs-workers.yml +++ 
b/experiments/hadoop-yarn/ansible/14-config-hdfs-workers.yml @@ -23,13 +23,6 @@ - name: "Configure Hadoop workers" hosts: workers:zeppelin gather_facts: false - vars: - hdname: "hadoop-3.1.3" - hdbase: "/opt" - hdhome: "/opt/hadoop" - hddata: "/var/local/hadoop" - hdhost: "{{groups['masters'][0]}}" - hduser: "{{hostvars[inventory_hostname].login}}" tasks: diff --git a/experiments/hadoop-yarn/ansible/16-config-yarn-masters.yml b/experiments/hadoop-yarn/ansible/16-config-yarn-masters.yml index 47a32d7a..688bb1a2 100644 --- a/experiments/hadoop-yarn/ansible/16-config-yarn-masters.yml +++ b/experiments/hadoop-yarn/ansible/16-config-yarn-masters.yml @@ -23,13 +23,6 @@ - name: "Configure YARN masters" hosts: master01:zeppelin gather_facts: false - vars: - hdname: "hadoop-3.1.3" - hdbase: "/opt" - hdhome: "/opt/hadoop" - hddata: "/var/local/hadoop" - hdhost: "master01" - hduser: "{{hostvars[inventory_hostname].login}}" tasks: diff --git a/experiments/hadoop-yarn/ansible/17-config-yarn-workers.yml b/experiments/hadoop-yarn/ansible/17-config-yarn-workers.yml index d39524d4..b6e6a116 100644 --- a/experiments/hadoop-yarn/ansible/17-config-yarn-workers.yml +++ b/experiments/hadoop-yarn/ansible/17-config-yarn-workers.yml @@ -23,13 +23,6 @@ - name: "Configure YARN workers" hosts: workers:zeppelin gather_facts: false - vars: - hdname: "hadoop-3.1.3" - hdbase: "/opt" - hdhome: "/opt/hadoop" - hddata: "/var/local/hadoop" - hdhost: "{{groups['masters'][0]}}" - hduser: "{{hostvars[inventory_hostname].login}}" tasks: diff --git a/experiments/hadoop-yarn/ansible/20-install-spark.yml b/experiments/hadoop-yarn/ansible/20-install-spark.yml index 114a6717..dfefac4a 100644 --- a/experiments/hadoop-yarn/ansible/20-install-spark.yml +++ b/experiments/hadoop-yarn/ansible/20-install-spark.yml @@ -24,14 +24,6 @@ - name: "Install Spark" hosts: master01:zeppelin gather_facts: false - vars: - spname: "spark-2.4.7" - spfull: "spark-2.4.7-bin-hadoop2.7" - spbase: "/opt" - sphome: "/opt/spark" - spdata: "/var/local/spark" - sphost: "{{groups['masters'][0]}}" - spuser: "{{hostvars[inventory_hostname].login}}" tasks: diff --git a/experiments/hadoop-yarn/ansible/22-config-spark-master.yml b/experiments/hadoop-yarn/ansible/22-config-spark-master.yml index aa4ff104..66b452c4 100644 --- a/experiments/hadoop-yarn/ansible/22-config-spark-master.yml +++ b/experiments/hadoop-yarn/ansible/22-config-spark-master.yml @@ -23,15 +23,6 @@ - name: "Configure YARN masters" hosts: master01:zeppelin gather_facts: false - vars: - hdbase: "/opt" - hdhome: "/opt/hadoop" - hdhost: "{{groups['masters'][0]}}" - spbase: "/opt" - sphome: "/opt/spark" - spdata: "/var/local/spark" - sphost: "master01" - spuser: "{{hostvars[inventory_hostname].login}}" tasks: diff --git a/experiments/hadoop-yarn/ansible/24-install-pyspark.yml b/experiments/hadoop-yarn/ansible/24-install-pyspark.yml index 8ca2ef3d..75b330ac 100644 --- a/experiments/hadoop-yarn/ansible/24-install-pyspark.yml +++ b/experiments/hadoop-yarn/ansible/24-install-pyspark.yml @@ -24,12 +24,6 @@ - name: "Install PySpark" hosts: master01:zeppelin gather_facts: false - vars: - spbase: "/opt" - sphome: "/opt/spark" - spdata: "/var/local/spark" - sphost: "{{groups['masters'][0]}}" - spuser: "{{hostvars[inventory_hostname].login}}" tasks: diff --git a/experiments/hadoop-yarn/ansible/27-install-zeppelin.yml b/experiments/hadoop-yarn/ansible/27-install-zeppelin.yml index 2f091f8c..45a5eafa 100644 --- a/experiments/hadoop-yarn/ansible/27-install-zeppelin.yml +++ 
b/experiments/hadoop-yarn/ansible/27-install-zeppelin.yml @@ -24,13 +24,6 @@ hosts: zeppelin gather_facts: yes vars: - zepname: "zeppelin-0.8.2" - sphome: "/opt/spark" - hdhome: "/opt/hadoop" - zepbase: "/home/fedora" - zephome: "/home/fedora/zeppelin-0.8.2-bin-all" - zephost: "zeppelin" - zepuser: "{{hostvars[inventory_hostname].login}}" zeppelinconfig: | diff --git a/experiments/hadoop-yarn/ansible/hosts.yml b/experiments/hadoop-yarn/ansible/hosts.yml index 150585eb..8247e37a 100644 --- a/experiments/hadoop-yarn/ansible/hosts.yml +++ b/experiments/hadoop-yarn/ansible/hosts.yml @@ -48,6 +48,56 @@ all: # https://docs.ansible.com/ansible/latest/user_guide/intro_getting_started.html#host-key-checking ansible_host_key_checking: false + # Hadoop vars + + hdname: "hadoop-3.1.3" + hdbase: "/opt" + hdhome: "/opt/hadoop" + + hdconf: "{{hdhome}}/etc/hadoop" + hdhost: "master01" + hduser: "fedora" + + hddatalink: "/var/hadoop/data" + hddatadest: "/mnt/cinder/vdc/hadoop/data" + + hdlogslink: "/var/hadoop/logs" + hdlogsdest: "/mnt/cinder/vdc/hadoop/logs" + + # HDFS vars + + hdfsconf: "/var/hdfs/conf" + + hdfsmetalink: "/var/hdfs/meta" + hdfsmetadest: "/mnt/cinder/vdc/hdfs/meta" + + hdfslogslink: "/var/hdfs/logs" + hdfslogsdest: "/mnt/cinder/vdc/hdfs/logs" + + hdfsdatalink: "/var/hdfs/data" + hdfsdatadest: "/mnt/cinder/vdc/hdfs/data" + + # Spark vars + spname: "spark-2.4.7" + spfull: "spark-2.4.7-bin-hadoop2.7" + spbase: "/opt" + sphome: "/opt/spark" + sphost: "master01" + spuser: "fedora" + + sptemplink: "/var/spark/temp" + sptempdest: "/mnt/local/vdb/spark/temp" + + # Zeppelin vars + zepname: "zeppelin-0.8.2" + zepbase: "/home/fedora" + zephome: "/home/fedora/zeppelin-0.8.2-bin-all" + zephost: "zeppelin" + zepuser: "fedora" + + #zepdatalink: '/var/zeppelin/data' + #zepdatadest: "/mnt/cinder/vdc/zeppelin/data" + hosts: zeppelin: From 090ba72e62019b21d984e03be9b2ba1115049fcb Mon Sep 17 00:00:00 2001 From: "zrq-github@metagrid.co.uk" Date: Thu, 11 Feb 2021 17:09:41 +0000 Subject: [PATCH 05/27] Fix a problem with Fedora updates --- .../hadoop-yarn/ansible/04-update-fedora.yml | 50 +++++++++++ .../hadoop-yarn/ansible/09-worker-volumes.yml | 2 +- .../hadoop-yarn/ansible/combined-01.yml | 1 + .../hadoop-yarn/ansible/create-all.yml | 2 + notes/zrq/20210208-01-fedora-repo.txt | 86 +++++++++++++++++++ 5 files changed, 140 insertions(+), 1 deletion(-) create mode 100644 experiments/hadoop-yarn/ansible/04-update-fedora.yml create mode 100644 notes/zrq/20210208-01-fedora-repo.txt diff --git a/experiments/hadoop-yarn/ansible/04-update-fedora.yml b/experiments/hadoop-yarn/ansible/04-update-fedora.yml new file mode 100644 index 00000000..d5085c0e --- /dev/null +++ b/experiments/hadoop-yarn/ansible/04-update-fedora.yml @@ -0,0 +1,50 @@ +# +# +# +# Copyright (c) 2020, ROE (http://www.roe.ac.uk/) +# +# This information is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This information is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
+#
+#
+#
+#
+
+# ignore_errors
+# https://docs.ansible.com/ansible/latest/user_guide/playbooks_error_handling.html#ignoring-failed-commands
+
+- name: "DNF update"
+  gather_facts: false
+  hosts: masters:workers:zeppelin
+  vars_files:
+    - /tmp/ansible-vars.yml
+  tasks:
+
+    # This is a noop to force a cache-refresh.
+    - name: "Update the DNF cache"
+      become: true
+      ignore_errors: yes
+      dnf:
+        name: 'kernel'
+        state: present
+        update_cache: yes
+
+
+    - name: "Install monitoring tools"
+      become: true
+      dnf:
+        name:
+          - 'atop'
+          - 'htop'
+        state: present
+
diff --git a/experiments/hadoop-yarn/ansible/09-worker-volumes.yml b/experiments/hadoop-yarn/ansible/09-worker-volumes.yml
index de158418..3f71a848 100644
--- a/experiments/hadoop-yarn/ansible/09-worker-volumes.yml
+++ b/experiments/hadoop-yarn/ansible/09-worker-volumes.yml
@@ -45,7 +45,7 @@
       become: true
       dnf:
         name: btrfs-progs
-        state: latest
+        state: present
 
     - name: "Mount data volumes for {{ inventory_hostname }}"
       include_tasks: tasks/mount-volumes.yml
diff --git a/experiments/hadoop-yarn/ansible/combined-01.yml b/experiments/hadoop-yarn/ansible/combined-01.yml
index bc7f5fc3..35394d1c 100644
--- a/experiments/hadoop-yarn/ansible/combined-01.yml
+++ b/experiments/hadoop-yarn/ansible/combined-01.yml
@@ -31,4 +31,5 @@
 - import_playbook: 07-host-keys.yml
 - import_playbook: 08-ping-test.yml
 
+- import_playbook: 04-update-fedora.yml
 
diff --git a/experiments/hadoop-yarn/ansible/create-all.yml b/experiments/hadoop-yarn/ansible/create-all.yml
index 9476842c..8aae3fb4 100644
--- a/experiments/hadoop-yarn/ansible/create-all.yml
+++ b/experiments/hadoop-yarn/ansible/create-all.yml
@@ -32,6 +32,8 @@
 - import_playbook: 07-host-keys.yml
 - import_playbook: 08-ping-test.yml
 
+- import_playbook: 04-update-fedora.yml
+
 - import_playbook: 09-worker-volumes.yml
 - import_playbook: 26-zeppelin-volumes.yml
 
diff --git a/notes/zrq/20210208-01-fedora-repo.txt b/notes/zrq/20210208-01-fedora-repo.txt
new file mode 100644
index 00000000..efd961f3
--- /dev/null
+++ b/notes/zrq/20210208-01-fedora-repo.txt
@@ -0,0 +1,86 @@
+#
+#
+#
+# Copyright (c) 2021, ROE (http://www.roe.ac.uk/)
+#
+# This information is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This information is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+#
+#
+#zrq-notes-time
+#zrq-notes-indent
+#zrq-notes-crypto
+#zrq-notes-ansible
+#zrq-notes-osformat
+#zrq-notes-zeppelin
+#
+    Quick test to see if we can solve the problems with resolving the Fedora repositories.
+    Try configuring dnf to use the UK mirror service.
+
+        pushd /etc/yum.repos.d/
+        for repo in *.repo
+        do
+            echo "---- ----"
+            echo "Repo [${repo:?}]"
+            sudo sed -i '
+                s/^metalink/#metalink/
+                s/^#baseurl/baseurl/
+                s|http://download.fedoraproject.org|http://www.mirrorservice.org/sites/download.fedora.redhat.com|
+                ' "${repo:?}"
+
+        done
+        popd
+
+    Fails because mirrorservice only has data for 32 and 33?
+    Directories for 30 and 31 are empty.
+ + http://www.mirrorservice.org/sites/download.fedora.redhat.com/pub/fedora/linux/releases/30 - empty + http://www.mirrorservice.org/sites/download.fedora.redhat.com/pub/fedora/linux/releases/31 - empty + + http://www.mirrorservice.org/sites/download.fedora.redhat.com/pub/fedora/linux/releases/32 - OK + http://www.mirrorservice.org/sites/download.fedora.redhat.com/pub/fedora/linux/releases/33 - OK + + Suggests this would work - once we have updated. + Alternative - create our own caching proxy site. + Add this to the infrastructure node. + + + List of mirrors per version + + https://admin.fedoraproject.org/mirrormanager/ + https://admin.fedoraproject.org/mirrormanager/mirrors/Fedora/30/x86_64 + + http://mirrors.dotsrc.org/fedora-buffet/archive/fedora/linux/updates/30/ + + https://ftp-stud.hs-esslingen.de/pub/Mirrors/archive.fedoraproject.org/fedora/linux/updates/30/ + + + + + # + # Issue solved by forcing DNF to flush the metadata cache. + # Using the Ansible DNF plugin to check something is present. + # Picked 'kernel' because it is on every machine. + # + + - name: "Update the DNF cache" + become: true + dnf: + name: 'kernel' + state: present + update_cache: yes + + + + From 287a16406631e35df79841cc0e1fb1de73bbe2ad Mon Sep 17 00:00:00 2001 From: "zrq-github@metagrid.co.uk" Date: Thu, 11 Feb 2021 17:44:22 +0000 Subject: [PATCH 06/27] Volume mounts for temp space --- .../hadoop-yarn/ansible/09-worker-volumes.yml | 3 +- .../hadoop-yarn/ansible/11-install-hadoop.yml | 38 +- .../ansible/13-config-hdfs-namenode.yml | 16 +- .../ansible/14-config-hdfs-workers.yml | 19 +- .../ansible/16-config-yarn-masters.yml | 2 +- .../ansible/17-config-yarn-workers.yml | 2 +- .../hadoop-yarn/ansible/20-install-spark.yml | 2 +- .../ansible/22-config-spark-master.yml | 28 +- .../ansible/22-config-spark-workers.yml | 36 + .../ansible/24-install-pyspark.yml | 2 +- .../hadoop-yarn/ansible/combined-03.yml | 1 + .../hadoop-yarn/ansible/create-all.yml | 1 + experiments/hadoop-yarn/ansible/hosts.yml | 31 +- .../ansible/tasks/create-linked.yml | 61 ++ .../ansible/tasks/create-volumes.yml | 12 +- .../ansible/tasks/mount-volumes.yml | 18 +- notes/zrq/20210208-02-ansible-deploy.txt | 238 ++++++ notes/zrq/20210210-01-ansible-deploy.txt | 467 +++++++++++ notes/zrq/20210211-01-ansible-deploy.txt | 771 ++++++++++++++++++ 19 files changed, 1665 insertions(+), 83 deletions(-) create mode 100644 experiments/hadoop-yarn/ansible/22-config-spark-workers.yml create mode 100644 experiments/hadoop-yarn/ansible/tasks/create-linked.yml create mode 100644 notes/zrq/20210208-02-ansible-deploy.txt create mode 100644 notes/zrq/20210210-01-ansible-deploy.txt create mode 100644 notes/zrq/20210211-01-ansible-deploy.txt diff --git a/experiments/hadoop-yarn/ansible/09-worker-volumes.yml b/experiments/hadoop-yarn/ansible/09-worker-volumes.yml index 3f71a848..584f1a7a 100644 --- a/experiments/hadoop-yarn/ansible/09-worker-volumes.yml +++ b/experiments/hadoop-yarn/ansible/09-worker-volumes.yml @@ -47,10 +47,11 @@ name: btrfs-progs state: present - - name: "Mount data volumes for {{ inventory_hostname }}" + - name: "Call the mount-volumes task" include_tasks: tasks/mount-volumes.yml loop: "{{ hostvars[ inventory_hostname ].discs }}" loop_control: loop_var: disc + when: ((disc.type == 'cinder') or (disc.type == 'local')) diff --git a/experiments/hadoop-yarn/ansible/11-install-hadoop.yml b/experiments/hadoop-yarn/ansible/11-install-hadoop.yml index 696d9e88..2434d200 100644 --- a/experiments/hadoop-yarn/ansible/11-install-hadoop.yml +++ 
b/experiments/hadoop-yarn/ansible/11-install-hadoop.yml @@ -39,32 +39,26 @@ dest: "{{hdbase}}" remote_src: yes - - name: "Create a symbolic link" + - name: "Create a symlink for the Hadoop version" become: true file: src: "{{hdname}}" path: "{{hdhome}}" state: link - - name: "Create '{{hddata}}'" - become: true - file: - path: "{{hddata}}" - mode: 'u=rwx,g=rwxs,o=rx' - state: directory - recurse: yes - owner: "{{hduser}}" - group: "{{hduser}}" + - name: "Create Hadoop data directory" + include_tasks: "tasks/create-linked.yml" + vars: + linkdest: "{{hddatadest}}" + linkpath: "{{hddatalink}}" + linkuser: "{{hduser}}" - - name: "Create [{{hddata}}/logs]" - become: true - file: - path: "{{hddata}}/logs" - mode: 'u=rwx,g=rwxs,o=rx' - state: directory - recurse: yes - owner: "{{hduser}}" - group: "{{hduser}}" + - name: "Create Hadoop logs directory" + include_tasks: "tasks/create-linked.yml" + vars: + linkdest: "{{hdlogsdest}}" + linkpath: "{{hdlogslink}}" + linkuser: "{{hduser}}" # https://hadoop.apache.org/docs/r3.2.1/hadoop-project-dist/hadoop-common/ClusterSetup.html#Configuring_Environment_of_Hadoop_Daemons - name: "Create [/etc/profile.d/hadoop.sh]" @@ -82,8 +76,8 @@ export PATH=${PATH}:{{hdhome}}/bin:{{hdhome}}/sbin #export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:{{hdhome}}/lib/native export HADOOP_HOME={{hdhome}} - export HADOOP_DATA={{hddata}} - export HADOOP_CONF_DIR={{hdhome}}/etc/hadoop - export HADOOP_LOG_DIR=${HADOOP_DATA}/logs + export HADOOP_DATA={{hddatalink}} + export HADOOP_CONF_DIR={{hdconf}} + export HADOOP_LOG_DIR={{hdlogslink}} diff --git a/experiments/hadoop-yarn/ansible/13-config-hdfs-namenode.yml b/experiments/hadoop-yarn/ansible/13-config-hdfs-namenode.yml index 4b8344c4..7bfe899b 100644 --- a/experiments/hadoop-yarn/ansible/13-config-hdfs-namenode.yml +++ b/experiments/hadoop-yarn/ansible/13-config-hdfs-namenode.yml @@ -21,16 +21,24 @@ # - name: "Configure HDFS namenode" - hosts: master01:zeppelin + hosts: master01 gather_facts: false vars: + hdfsimage: "{{hdfsmetalink}}/namenode/fsimage" tasks: - - name: "Create [{{hddata}}/namenode/fsimage]" + - name: "Create HDFS metadata directory" + include_tasks: "tasks/create-linked.yml" + vars: + linkdest: "{{hdfsmetadest}}" + linkpath: "{{hdfsmetalink}}" + linkuser: "{{hduser}}" + + - name: "Create [{{hdfsimage}}]" become: true file: - path: "{{hddata}}/namenode/fsimage" + path: "{{hdfsimage}}" mode: 'u=rwx,g=rwxs,o=rx' state: directory recurse: yes @@ -53,7 +61,7 @@ +--> dfs.namenode.name.dir - {{hddata}}/namenode/fsimage + {{hdfsimage}} dfs.datanode.data.dir - /data-01/hdfs/data + {{hdfsdatalink}} diff --git a/experiments/hadoop-yarn/ansible/16-config-yarn-masters.yml b/experiments/hadoop-yarn/ansible/16-config-yarn-masters.yml index 688bb1a2..e5a9fa76 100644 --- a/experiments/hadoop-yarn/ansible/16-config-yarn-masters.yml +++ b/experiments/hadoop-yarn/ansible/16-config-yarn-masters.yml @@ -21,7 +21,7 @@ # - name: "Configure YARN masters" - hosts: master01:zeppelin + hosts: masters:zeppelin gather_facts: false tasks: diff --git a/experiments/hadoop-yarn/ansible/17-config-yarn-workers.yml b/experiments/hadoop-yarn/ansible/17-config-yarn-workers.yml index b6e6a116..7490cdc4 100644 --- a/experiments/hadoop-yarn/ansible/17-config-yarn-workers.yml +++ b/experiments/hadoop-yarn/ansible/17-config-yarn-workers.yml @@ -21,7 +21,7 @@ # - name: "Configure YARN workers" - hosts: workers:zeppelin + hosts: workers gather_facts: false tasks: diff --git a/experiments/hadoop-yarn/ansible/20-install-spark.yml 
b/experiments/hadoop-yarn/ansible/20-install-spark.yml index dfefac4a..ba9ce98a 100644 --- a/experiments/hadoop-yarn/ansible/20-install-spark.yml +++ b/experiments/hadoop-yarn/ansible/20-install-spark.yml @@ -22,7 +22,7 @@ --- - name: "Install Spark" - hosts: master01:zeppelin + hosts: masters:zeppelin gather_facts: false tasks: diff --git a/experiments/hadoop-yarn/ansible/22-config-spark-master.yml b/experiments/hadoop-yarn/ansible/22-config-spark-master.yml index 66b452c4..d46359d5 100644 --- a/experiments/hadoop-yarn/ansible/22-config-spark-master.yml +++ b/experiments/hadoop-yarn/ansible/22-config-spark-master.yml @@ -20,19 +20,21 @@ # # -- name: "Configure YARN masters" - hosts: master01:zeppelin +- name: "Configure Spark masters" + hosts: zeppelin:masters gather_facts: false tasks: - - name: Creates directory - file: - path: "{{sphome}}/local" - state: directory - owner: "{{spuser}}" - group: "{{spuser}}" - mode: 0775 + # + # The Zeppelin node is acting as our Spark Master. + - name: "Create Spark temp directory" + include_tasks: "tasks/create-linked.yml" + vars: + linkdest: "{{sptempdest}}" + linkpath: "{{sptemplink}}" + linkuser: "{{spuser}}" + # # Documentation # https://spark.apache.org/docs/3.0.0-preview2/running-on-yarn.html#configuration @@ -56,7 +58,7 @@ spark.yarn.am.cores 4 spark.eventLog.enabled true spark.driver.maxResultSize 8192m - spark.local.dir {{sphome}}/local + spark.local.dir {{sptemplink}} spark.master yarn spark.eventLog.enabled true spark.eventLog.dir hdfs://{{hdhost}}:9000/spark-log @@ -94,12 +96,6 @@ spark.yarn.appMasterEnv.YARN_CONF_DIR={{hdhome}}/etc/hadoop spark.yarn.appMasterEnv.HADOOP_CONF_DIR={{hdhome}}/etc/hadoop -# -# TODO Experiment -# Move Spark to master02, add Yarn config. -# {{hdhome}}/etc/hadoop/yarn-site.xml - - # # TODO History server. # https://spark.apache.org/docs/3.0.0-preview2/monitoring.html#viewing-after-the-fact diff --git a/experiments/hadoop-yarn/ansible/22-config-spark-workers.yml b/experiments/hadoop-yarn/ansible/22-config-spark-workers.yml new file mode 100644 index 00000000..7b086f91 --- /dev/null +++ b/experiments/hadoop-yarn/ansible/22-config-spark-workers.yml @@ -0,0 +1,36 @@ +# +# +# +# Copyright (c) 2020, ROE (http://www.roe.ac.uk/) +# +# This information is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This information is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
+# +# +# +# + +- name: "Configure Spark workers" + hosts: workers + gather_facts: false + + tasks: + +# - name: "Create Spark temp directory" +# include_tasks: "tasks/create-linked.yml" +# vars: +# linkdest: "{{sptempdest}}" +# linkpath: "{{sptemplink}}" +# linkuser: "{{spuser}}" + + diff --git a/experiments/hadoop-yarn/ansible/24-install-pyspark.yml b/experiments/hadoop-yarn/ansible/24-install-pyspark.yml index 75b330ac..78573095 100644 --- a/experiments/hadoop-yarn/ansible/24-install-pyspark.yml +++ b/experiments/hadoop-yarn/ansible/24-install-pyspark.yml @@ -22,7 +22,7 @@ --- - name: "Install PySpark" - hosts: master01:zeppelin + hosts: masters:zeppelin gather_facts: false tasks: diff --git a/experiments/hadoop-yarn/ansible/combined-03.yml b/experiments/hadoop-yarn/ansible/combined-03.yml index a2bea86a..ca8f4484 100644 --- a/experiments/hadoop-yarn/ansible/combined-03.yml +++ b/experiments/hadoop-yarn/ansible/combined-03.yml @@ -24,6 +24,7 @@ - import_playbook: 20-install-spark.yml - import_playbook: 21-config-spark-security.yml - import_playbook: 22-config-spark-master.yml +- import_playbook: 22-config-spark-workers.yml - import_playbook: 23-install-python.yml - import_playbook: 24-install-pyspark.yml diff --git a/experiments/hadoop-yarn/ansible/create-all.yml b/experiments/hadoop-yarn/ansible/create-all.yml index 8aae3fb4..67e91c0c 100644 --- a/experiments/hadoop-yarn/ansible/create-all.yml +++ b/experiments/hadoop-yarn/ansible/create-all.yml @@ -54,6 +54,7 @@ - import_playbook: 20-install-spark.yml - import_playbook: 21-config-spark-security.yml - import_playbook: 22-config-spark-master.yml +- import_playbook: 22-config-spark-workers.yml - import_playbook: 23-install-python.yml - import_playbook: 24-install-pyspark.yml diff --git a/experiments/hadoop-yarn/ansible/hosts.yml b/experiments/hadoop-yarn/ansible/hosts.yml index 8247e37a..9035f5f0 100644 --- a/experiments/hadoop-yarn/ansible/hosts.yml +++ b/experiments/hadoop-yarn/ansible/hosts.yml @@ -103,18 +103,23 @@ all: zeppelin: login: 'fedora' image: 'Fedora-30-1.2' - flavor: 'general.v1.small' + flavor: 'general.v1.medium' discs: - - size: 512 - name: data-02 - mntpath: '/data-02' - devpath: '/dev/vdb' + - type: 'local' + format: 'ext4' + mntpath: "/mnt/local/vdb" + devname: 'vdb' + - type: 'cinder' + size: 512 + format: 'btrfs' + mntpath: "/mnt/cinder/vdc" + devname: 'vdc' children: masters: hosts: - master[01:02]: + master[01:01]: vars: login: 'fedora' image: 'Fedora-30-1.2' @@ -128,9 +133,13 @@ all: image: 'Fedora-30-1.2' flavor: 'general.v1.small' discs: - - size: 512 - name: data-01 - mntpath: '/data-01' - devpath: '/dev/vdb' - +# - type: 'local' +# format: 'ext4' +# mntpath: "/mnt/local/vdb" +# devname: 'vdb' + - type: 'cinder' + size: 512 + format: 'btrfs' + mntpath: "/mnt/cinder/vdb" + devname: 'vdb' diff --git a/experiments/hadoop-yarn/ansible/tasks/create-linked.yml b/experiments/hadoop-yarn/ansible/tasks/create-linked.yml new file mode 100644 index 00000000..1c137b86 --- /dev/null +++ b/experiments/hadoop-yarn/ansible/tasks/create-linked.yml @@ -0,0 +1,61 @@ +# +# +# +# Copyright (c) 2020, ROE (http://www.roe.ac.uk/) +# +# This information is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
+# +# This information is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +# +# +# + +- name: "Create destination parent [{{linkdest | dirname}}]" + become: true + file: + path: "{{linkdest | dirname}}" + mode: 'u=rwx,g=rwxs,o=rx' + state: directory + recurse: yes + owner: 'root' + group: 'root' + +- name: "Create link destination [{{linkdest}}]" + become: true + file: + path: "{{linkdest}}" + mode: 'u=rwx,g=rwxs,o=rx' + state: directory + owner: "{{linkuser}}" + group: "{{linkuser}}" + + +- name: "Create link parent [{{linkpath | dirname}}]" + become: true + file: + path: "{{linkpath | dirname}}" + mode: 'u=rwx,g=rwxs,o=rx' + state: directory + recurse: yes + owner: 'root' + group: 'root' + +- name: "Create link [{{linkpath}} -> {{linkdest}}]" + become: true + file: + src: "{{linkdest}}" + path: "{{linkpath}}" + state: link +# owner: 'root' +# group: 'root' + diff --git a/experiments/hadoop-yarn/ansible/tasks/create-volumes.yml b/experiments/hadoop-yarn/ansible/tasks/create-volumes.yml index 6109fc3e..17b52456 100644 --- a/experiments/hadoop-yarn/ansible/tasks/create-volumes.yml +++ b/experiments/hadoop-yarn/ansible/tasks/create-volumes.yml @@ -20,23 +20,25 @@ # # -- name: "Create volumes for [{{ vmname }}]" +- name: "Create Cinder volumes for [{{ vmname }}]" os_volume: cloud: "{{ cloudname }}" state: present size: "{{ item.size }}" - display_name: "{{ deployname }}-{{ vmname }}-{{ item.name }}" + display_name: "{{ deployname }}-{{ vmname }}-{{ item.devname }}" loop: "{{ hostvars[vmname].discs }}" + when: item.type == 'cinder' -- name: "Attach volumes to [{{ vmname }}]" +- name: "Attach Cinder volumes to [{{ vmname }}]" os_server_volume: cloud: "{{ cloudname }}" state: present server: "{{ deployname }}-{{ vmname }}" - volume: "{{ deployname }}-{{ vmname }}-{{ item.name }}" - device: "{{ item.devpath }}" + volume: "{{ deployname }}-{{ vmname }}-{{ item.devname }}" + device: "/dev/{{ item.devname }}" loop: "{{ hostvars[vmname].discs }}" + when: item.type == 'cinder' diff --git a/experiments/hadoop-yarn/ansible/tasks/mount-volumes.yml b/experiments/hadoop-yarn/ansible/tasks/mount-volumes.yml index 512aab23..a6efa442 100644 --- a/experiments/hadoop-yarn/ansible/tasks/mount-volumes.yml +++ b/experiments/hadoop-yarn/ansible/tasks/mount-volumes.yml @@ -20,19 +20,19 @@ # # - -- name: "Create btrfs filesystem on {{disc.devpath}}" +- name: "Create [{{disc.format}}] filesystem on [/dev/{{disc.devname}}]" become: true filesystem: - fstype: btrfs - dev: "{{disc.devpath}}" + fstype: "{{disc.format}}" + dev: "/dev/{{disc.devname}}" + when: (disc.format == 'btrfs') -# TODO Only do this if not already created -- name: "Create the mount point {{disc.mntpath}}" +- name: "Create mount path [{{disc.mntpath}}]" become: true file: path: "{{disc.mntpath}}" state: directory + recurse: yes # TODO Only do this if not already mounted - name: "Create the mount-failed indicator" @@ -44,12 +44,12 @@ # TODO Only do this if not already mounted # TODO Mount using UUID rather than device path. 
-- name: "Mount device {{disc.devpath}} at {{disc.mntpath}}" +- name: "Mount [{{disc.format}}] [/dev/{{disc.devname}}] at [{{disc.mntpath}}]" become: true mount: - src: "{{disc.devpath}}" + src: "/dev/{{disc.devname}}" path: "{{disc.mntpath}}" - fstype: btrfs + fstype: "{{disc.format}}" state: mounted diff --git a/notes/zrq/20210208-02-ansible-deploy.txt b/notes/zrq/20210208-02-ansible-deploy.txt new file mode 100644 index 00000000..5caed874 --- /dev/null +++ b/notes/zrq/20210208-02-ansible-deploy.txt @@ -0,0 +1,238 @@ +# +# +# +# Copyright (c) 2021, ROE (http://www.roe.ac.uk/) +# +# This information is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This information is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +# +# +#zrq-notes-time +#zrq-notes-indent +#zrq-notes-crypto +#zrq-notes-ansible +#zrq-notes-osformat +#zrq-notes-zeppelin +# + + Target: + + Add support for different types of disc ... + + Results: + + Work in progress .... + + +# ----------------------------------------------------- +# Update the Openstack cloud name. +#[user@desktop] + + cloudname=gaia-dev + + sed -i ' + s/^\(AGLAIS_CLOUD\)=.*$/\1='${cloudname:?}'/ + ' "${HOME}/aglais.env" + + +# ----------------------------------------------------- +# Create a container to work with. +# (*) extra volume mount for /common +#[user@desktop] + + source "${HOME:?}/aglais.env" + + podman run \ + --rm \ + --tty \ + --interactive \ + --name ansibler \ + --hostname ansibler \ + --env "SSH_AUTH_SOCK=/mnt/ssh_auth_sock" \ + --volume "${SSH_AUTH_SOCK}:/mnt/ssh_auth_sock:rw,z" \ + --env "cloudname=${AGLAIS_CLOUD:?}" \ + --volume "${HOME:?}/clouds.yaml:/etc/openstack/clouds.yaml:ro,z" \ + --volume "${AGLAIS_CODE:?}/experiments/common:/common:ro,z" \ + --volume "${AGLAIS_CODE:?}/experiments/openstack:/openstack:ro,z" \ + --volume "${AGLAIS_CODE:?}/experiments/hadoop-yarn:/hadoop-yarn:ro,z" \ + atolmis/ansible-client:2020.12.02 \ + bash + + +# ----------------------------------------------------- +# Create our Aglais configuration. +#[root@kubernator] + +cat > '/tmp/aglais-config.yml' << EOF +aglais: + version: 1.0 + spec: + openstack: + cloud: '${cloudname:?}' + +EOF + + +# ----------------------------------------------------- +# Create everything from scratch. +#[root@ansibler] + + time \ + /openstack/bin/delete-all.sh \ + "${cloudname:?}" + + rm -f ~/.ssh/* + + time \ + /hadoop-yarn/bin/create-all.sh + + + > .... + > .... + + +# ----------------------------------------------------- +# Check the data directories. +#[root@ansibler] + + + ssh worker01 \ + ' + date + hostname + echo "----" + echo "/var/spark" + ls -l "/var/spark" + df -h "/var/spark/temp" + + echo "----" + echo "/var/hdfs" + ls -l "/var/hdfs" + df -h "/var/hdfs/data" + + echo "----" + echo "/var/hadoop" + ls -l "/var/hadoop" + df -h "/var/hadoop/data" + ' + + > Tue Feb 9 15:46:45 UTC 2021 + > gaia-dev-20210209-worker01.novalocal + > ---- + > /var/spark + > total 0 + > lrwxrwxrwx. 
+    > Filesystem      Size  Used Avail Use% Mounted on
+    > /dev/vdb         59G   53M   56G   1% /mnt/local/vdb
+    > ----
+    > /var/hdfs
+    > total 0
+    > lrwxrwxrwx. 1 root root 25 Feb  9 15:38 data -> /mnt/cinder/vdc/hdfs/data
+    > Filesystem      Size  Used Avail Use% Mounted on
+    > /dev/vdc        512G   17M  510G   1% /mnt/cinder/vdc
+    > ----
+    > /var/hadoop
+    > total 0
+    > lrwxrwxrwx. 1 root root 27 Feb  9 15:36 data -> /mnt/cinder/vdc/hadoop/data
+    > lrwxrwxrwx. 1 root root 27 Feb  9 15:36 logs -> /mnt/cinder/vdc/hadoop/logs
+    > Filesystem      Size  Used Avail Use% Mounted on
+    > /dev/vdc        512G   17M  510G   1% /mnt/cinder/vdc
+
+
+# -----------------------------------------------------
+# Check the deployment status.
+#[root@ansibler]
+
+    cat '/tmp/aglais-status.yml'
+
+    > aglais:
+    >   spec:
+    >     openstack:
+    >       cloud: gaia-dev
+    >   status:
+    >     deployment:
+    >       type: hadoop-yarn
+    >       name: gaia-dev-20210209
+    >       date: 20210209T194001
+
+    deployname=$(
+        yq read \
+            '/tmp/aglais-status.yml' \
+                'aglais.status.deployment.name'
+        )
+
+    echo "Deployment [${deployname}]"
+
+    > Deployment [gaia-dev-20210209]
+
+
+# -----------------------------------------------------
+# Get the public IP address of our Zeppelin node.
+#[root@ansibler]
+
+    zeppelinid=$(
+        openstack \
+            --os-cloud "${cloudname:?}" \
+            server list \
+            --format json \
+        | jq -r '.[] | select(.Name == "'${deployname:?}'-zeppelin") | .ID'
+        )
+
+    zeppelinip=$(
+        openstack \
+            --os-cloud "${cloudname:?}" \
+            server show \
+            --format json \
+            "${zeppelinid:?}" \
+        | jq -r '.addresses' \
+        | sed '
+            s/[[:space:]]//
+            s/.*=\(.*\)/\1/
+            s/.*,\(.*\)/\1/
+            '
+        )
+
+cat << EOF
+Zeppelin ID [${zeppelinid:?}]
+Zeppelin IP [${zeppelinip:?}]
+EOF
+
+    > Zeppelin ID [a10a9b20-812a-4cab-ae97-efb2ccaddc0f]
+    > Zeppelin IP [128.232.227.229]
+
+
+# -----------------------------------------------------
+# -----------------------------------------------------
+
+    Update our DNS
+
+
+# -----------------------------------------------------
+# -----------------------------------------------------
+# Login to Zeppelin ...
+#[user@desktop]
+
+    firefox --new-window "http://zeppelin.metagrid.xyz:8080/" &
+
+
+# -----------------------------------------------------
+# Run test notebooks ..
+#[user@zeppelin]
+
+
+    Import notebooks from GitHub, clear the output and run all the cells ...
+
+    Good astrometric solutions via ML Random Forrest classifier
+    https://raw.githubusercontent.com/wfau/aglais-notebooks/main/2FRPC4BFS/note.json
+
diff --git a/notes/zrq/20210210-01-ansible-deploy.txt b/notes/zrq/20210210-01-ansible-deploy.txt
new file mode 100644
index 00000000..6cdff1c3
--- /dev/null
+++ b/notes/zrq/20210210-01-ansible-deploy.txt
@@ -0,0 +1,467 @@
+#
+#
+#
+# Copyright (c) 2021, ROE (http://www.roe.ac.uk/)
+#
+# This information is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This information is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+# +# +# +#zrq-notes-time +#zrq-notes-indent +#zrq-notes-crypto +#zrq-notes-ansible +#zrq-notes-osformat +#zrq-notes-zeppelin +# + + Target: + + Get Spark to work with the new configuration. + + Test config: + no gateway + medium zeppelin + 4 medium workers + + Results: + + Work in progress .... + + +# ----------------------------------------------------- +# Update the Openstack cloud name. +#[user@desktop] + + cloudname=gaia-dev + + sed -i ' + s/^\(AGLAIS_CLOUD\)=.*$/\1='${cloudname:?}'/ + ' "${HOME}/aglais.env" + + +# ----------------------------------------------------- +# Create a container to work with. +# (*) extra volume mount for /common +#[user@desktop] + + source "${HOME:?}/aglais.env" + + podman run \ + --rm \ + --tty \ + --interactive \ + --name ansibler \ + --hostname ansibler \ + --env "SSH_AUTH_SOCK=/mnt/ssh_auth_sock" \ + --volume "${SSH_AUTH_SOCK}:/mnt/ssh_auth_sock:rw,z" \ + --env "cloudname=${AGLAIS_CLOUD:?}" \ + --volume "${HOME:?}/clouds.yaml:/etc/openstack/clouds.yaml:ro,z" \ + --volume "${AGLAIS_CODE:?}/experiments/common:/common:ro,z" \ + --volume "${AGLAIS_CODE:?}/experiments/openstack:/openstack:ro,z" \ + --volume "${AGLAIS_CODE:?}/experiments/hadoop-yarn:/hadoop-yarn:ro,z" \ + atolmis/ansible-client:2020.12.02 \ + bash + + +# ----------------------------------------------------- +# Create our Aglais configuration. +#[root@kubernator] + +cat > '/tmp/aglais-config.yml' << EOF +aglais: + version: 1.0 + spec: + openstack: + cloud: '${cloudname:?}' + +EOF + + +# ----------------------------------------------------- +# Create everything from scratch. +#[root@ansibler] + + time \ + /openstack/bin/delete-all.sh \ + "${cloudname:?}" + + rm -f ~/.ssh/* + + time \ + /hadoop-yarn/bin/create-all.sh + + + > .... + > .... + + +# ----------------------------------------------------- +# Check the deployment status. +#[root@ansibler] + + cat '/tmp/aglais-status.yml' + + > aglais: + > spec: + > openstack: + > cloud: gaia-dev + > status: + > deployment: + > type: hadoop-yarn + > name: gaia-dev-20210209 + > date: 20210209T194001 + + +# ----------------------------------------------------- +# Get the public IP address of our Zeppelin node. +#[root@ansibler] + + deployname=$( + yq read \ + '/tmp/aglais-status.yml' \ + 'aglais.status.deployment.name' + ) + + zeppelinid=$( + openstack \ + --os-cloud "${cloudname:?}" \ + server list \ + --format json \ + | jq -r '.[] | select(.Name == "'${deployname:?}'-zeppelin") | .ID' + ) + + zeppelinip=$( + openstack \ + --os-cloud "${cloudname:?}" \ + server show \ + --format json \ + "${zeppelinid:?}" \ + | jq -r '.addresses' \ + | sed ' + s/[[:space:]]// + s/.*=\(.*\)/\1/ + s/.*,\(.*\)/\1/ + ' + ) + +cat << EOF +Zeppelin ID [${zeppelinid:?}] +Zeppelin IP [${zeppelinip:?}] +EOF + + > Zeppelin ID [a10a9b20-812a-4cab-ae97-efb2ccaddc0f] + > Zeppelin IP [128.232.227.229] + + +# ----------------------------------------------------- +# ----------------------------------------------------- + + Update our DNS + + +# ----------------------------------------------------- +# ----------------------------------------------------- +# Login to Zeppelin ... +#[user@desktop] + + firefox --new-window "http://zeppelin.metagrid.xyz:8080/" & + + +# ----------------------------------------------------- +# ----------------------------------------------------- + + + Import notebooks from GitHub, clear the output and run all the cells ... 
+ + Good astrometric solutions via ML Random Forrest classifier + https://raw.githubusercontent.com/wfau/aglais-notebooks/main/2FRPC4BFS/note.json + + +# ----------------------------------------------------- +# ----------------------------------------------------- + + > org.apache.thrift.transport.TTransportException + > at org.apache.thrift.transport.TIOStreamTransport.read(TIOStreamTransport.java:132) + > at org.apache.thrift.transport.TTransport.readAll(TTransport.java:86) + > at org.apache.thrift.protocol.TBinaryProtocol.readAll(TBinaryProtocol.java:429) + > at org.apache.thrift.protocol.TBinaryProtocol.readI32(TBinaryProtocol.java:318) + > at org.apache.thrift.protocol.TBinaryProtocol.readMessageBegin(TBinaryProtocol.java:219) + > at org.apache.thrift.TServiceClient.receiveBase(TServiceClient.java:69) + > at org.apache.zeppelin.interpreter.thrift.RemoteInterpreterService$Client.recv_interpret(RemoteInterpreterService.java:274) + > at org.apache.zeppelin.interpreter.thrift.RemoteInterpreterService$Client.interpret(RemoteInterpreterService.java:258) + > at org.apache.zeppelin.interpreter.remote.RemoteInterpreter$4.call(RemoteInterpreter.java:233) + > at org.apache.zeppelin.interpreter.remote.RemoteInterpreter$4.call(RemoteInterpreter.java:229) + > at org.apache.zeppelin.interpreter.remote.RemoteInterpreterProcess.callRemoteFunction(RemoteInterpreterProcess.java:135) + > at org.apache.zeppelin.interpreter.remote.RemoteInterpreter.interpret(RemoteInterpreter.java:228) + > at org.apache.zeppelin.notebook.Paragraph.jobRun(Paragraph.java:449) + > at org.apache.zeppelin.scheduler.Job.run(Job.java:188) + > at org.apache.zeppelin.scheduler.RemoteScheduler$JobRunner.run(RemoteScheduler.java:315) + > at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) + > at java.util.concurrent.FutureTask.run(FutureTask.java:266) + > at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$201(ScheduledThreadPoolExecutor.java:180) + > at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:293) + > at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) + > at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) + > at java.lang.Thread.run(Thread.java:748) + + +# ----------------------------------------------------- +# Check the Zeppelin logs. +#[user@zeppelin] + + pushd /home/fedora/zeppelin-0.8.2-bin-all/logs + cat zeppelin-interpreter-spark-fedora-gaia-dev-20210210-zeppelin.novalocal.log + + > .... + > .... 
+ > INFO [2021-02-10 12:33:29,264] ({main} RemoteInterpreterServer.java[main]:261) - URL:jar:file:/home/fedora/zeppelin-0.8.2-bin-all/interpreter/spark/spark-interpreter-0.8.2.jar!/org/apache/zeppelin/interpreter/remote/RemoteInterpreterServer.class + > INFO [2021-02-10 12:33:29,284] ({main} RemoteInterpreterServer.java[]:162) - Launching ThriftServer at 10.10.2.210:36095 + > INFO [2021-02-10 12:33:29,286] ({main} RemoteInterpreterServer.java[]:166) - Starting remote interpreter server on port 36095 + > INFO [2021-02-10 12:33:29,287] ({Thread-3} RemoteInterpreterServer.java[run]:203) - Starting remote interpreter server on port 36095 + > INFO [2021-02-10 12:33:29,291] ({Thread-4} RemoteInterpreterUtils.java[registerInterpreter]:165) - callbackHost: 10.10.2.210, callbackPort: 42139, callbackInfo: CallbackInfo(host:10.10.2.210, port:36095) + > INFO [2021-02-10 12:33:29,354] ({pool-1-thread-1} RemoteInterpreterServer.java[createInterpreter]:311) - Instantiate interpreter org.apache.zeppelin.spark.SparkInterpreter + > INFO [2021-02-10 12:33:29,355] ({pool-1-thread-1} RemoteInterpreterServer.java[createInterpreter]:311) - Instantiate interpreter org.apache.zeppelin.spark.SparkSqlInterpreter + > INFO [2021-02-10 12:33:29,360] ({pool-1-thread-1} RemoteInterpreterServer.java[createInterpreter]:311) - Instantiate interpreter org.apache.zeppelin.spark.DepInterpreter + > INFO [2021-02-10 12:33:29,363] ({pool-1-thread-1} RemoteInterpreterServer.java[createInterpreter]:311) - Instantiate interpreter org.apache.zeppelin.spark.PySparkInterpreter + > INFO [2021-02-10 12:33:29,366] ({pool-1-thread-1} RemoteInterpreterServer.java[createInterpreter]:311) - Instantiate interpreter org.apache.zeppelin.spark.IPySparkInterpreter + > INFO [2021-02-10 12:33:29,368] ({pool-1-thread-1} RemoteInterpreterServer.java[createInterpreter]:311) - Instantiate interpreter org.apache.zeppelin.spark.SparkRInterpreter + > WARN [2021-02-10 12:33:29,456] ({pool-1-thread-1} ZeppelinConfiguration.java[create]:117) - Failed to load configuration, proceeding with a default + > INFO [2021-02-10 12:33:29,470] ({pool-1-thread-1} ZeppelinConfiguration.java[create]:129) - Server Host: 127.0.0.1 + > INFO [2021-02-10 12:33:29,470] ({pool-1-thread-1} ZeppelinConfiguration.java[create]:131) - Server Port: 8080 + > INFO [2021-02-10 12:33:29,470] ({pool-1-thread-1} ZeppelinConfiguration.java[create]:135) - Context Path: / + > INFO [2021-02-10 12:33:29,472] ({pool-1-thread-1} ZeppelinConfiguration.java[create]:136) - Zeppelin Version: 0.8.2 + > INFO [2021-02-10 12:33:29,472] ({pool-1-thread-1} SchedulerFactory.java[]:59) - Scheduler Thread Pool Size: 100 + > INFO [2021-02-10 12:33:29,475] ({pool-2-thread-2} SchedulerFactory.java[jobStarted]:114) - Job 20201013-131649_1734629667 started by scheduler interpreter_1097442532 + > INFO [2021-02-10 12:33:29,818] ({pool-2-thread-2} IPythonInterpreter.java[checkIPythonPrerequisite]:200) - IPython prerequisite is met + > INFO [2021-02-10 12:33:29,820] ({pool-2-thread-2} NewSparkInterpreter.java[open]:83) - Using Scala Version: 2.11 + > INFO [2021-02-10 12:33:33,057] ({pool-2-thread-2} Logging.scala[logInfo]:54) - Running Spark version 2.4.7 + > WARN [2021-02-10 12:33:33,104] ({pool-2-thread-2} Logging.scala[logWarning]:66) - Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN). 
+ > INFO [2021-02-10 12:33:33,113] ({pool-2-thread-2} Logging.scala[logInfo]:54) - Submitted application: Zeppelin + > INFO [2021-02-10 12:33:33,162] ({pool-2-thread-2} Logging.scala[logInfo]:54) - Changing view acls to: fedora + > INFO [2021-02-10 12:33:33,163] ({pool-2-thread-2} Logging.scala[logInfo]:54) - Changing modify acls to: fedora + > INFO [2021-02-10 12:33:33,163] ({pool-2-thread-2} Logging.scala[logInfo]:54) - Changing view acls groups to: + > INFO [2021-02-10 12:33:33,163] ({pool-2-thread-2} Logging.scala[logInfo]:54) - Changing modify acls groups to: + > INFO [2021-02-10 12:33:33,163] ({pool-2-thread-2} Logging.scala[logInfo]:54) - SecurityManager: authentication disabled; ui acls disabled; users with view permissions: Set(fedora); groups with view permissions: Set(); users with modify permissions: Set(fedora); groups with modify permissions: Set() + > INFO [2021-02-10 12:33:33,343] ({pool-2-thread-2} Logging.scala[logInfo]:54) - Successfully started service 'sparkDriver' on port 36301. + > INFO [2021-02-10 12:33:33,365] ({pool-2-thread-2} Logging.scala[logInfo]:54) - Registering MapOutputTracker + > INFO [2021-02-10 12:33:33,381] ({pool-2-thread-2} Logging.scala[logInfo]:54) - Registering BlockManagerMaster + > INFO [2021-02-10 12:33:33,383] ({pool-2-thread-2} Logging.scala[logInfo]:54) - Using org.apache.spark.storage.DefaultTopologyMapper for getting topology information + > INFO [2021-02-10 12:33:33,384] ({pool-2-thread-2} Logging.scala[logInfo]:54) - BlockManagerMasterEndpoint up + > ERROR [2021-02-10 12:33:33,394] ({pool-2-thread-2} Logging.scala[logError]:91) - Failed to create local dir in /var/spark/temp. Ignoring this directory. + > java.io.IOException: Failed to create a temp directory (under /var/spark/temp) after 10 attempts! 
+ > at org.apache.spark.util.Utils$.createDirectory(Utils.scala:311) + > at org.apache.spark.storage.DiskBlockManager$$anonfun$createLocalDirs$1.apply(DiskBlockManager.scala:141) + > at org.apache.spark.storage.DiskBlockManager$$anonfun$createLocalDirs$1.apply(DiskBlockManager.scala:139) + > at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241) + > at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241) + > at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33) + > at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186) + > at scala.collection.TraversableLike$class.flatMap(TraversableLike.scala:241) + > at scala.collection.mutable.ArrayOps$ofRef.flatMap(ArrayOps.scala:186) + > at org.apache.spark.storage.DiskBlockManager.createLocalDirs(DiskBlockManager.scala:139) + > at org.apache.spark.storage.DiskBlockManager.(DiskBlockManager.scala:42) + > at org.apache.spark.storage.BlockManager.(BlockManager.scala:143) + > at org.apache.spark.SparkEnv$.create(SparkEnv.scala:349) + > at org.apache.spark.SparkEnv$.createDriverEnv(SparkEnv.scala:175) + > at org.apache.spark.SparkContext.createSparkEnv(SparkContext.scala:257) + > at org.apache.spark.SparkContext.(SparkContext.scala:424) + > at org.apache.spark.SparkContext$.getOrCreate(SparkContext.scala:2520) + > at org.apache.spark.sql.SparkSession$Builder$$anonfun$7.apply(SparkSession.scala:930) + > at org.apache.spark.sql.SparkSession$Builder$$anonfun$7.apply(SparkSession.scala:921) + > at scala.Option.getOrElse(Option.scala:121) + > at org.apache.spark.sql.SparkSession$Builder.getOrCreate(SparkSession.scala:921) + > at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) + > at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) + > at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) + > at java.lang.reflect.Method.invoke(Method.java:498) + > at org.apache.zeppelin.spark.BaseSparkScalaInterpreter.spark2CreateContext(BaseSparkScalaInterpreter.scala:263) + > at org.apache.zeppelin.spark.BaseSparkScalaInterpreter.createSparkContext(BaseSparkScalaInterpreter.scala:182) + > at org.apache.zeppelin.spark.SparkScala211Interpreter.open(SparkScala211Interpreter.scala:90) + > at org.apache.zeppelin.spark.NewSparkInterpreter.open(NewSparkInterpreter.java:102) + > at org.apache.zeppelin.spark.SparkInterpreter.open(SparkInterpreter.java:62) + > at org.apache.zeppelin.interpreter.LazyOpenInterpreter.open(LazyOpenInterpreter.java:69) + > at org.apache.zeppelin.spark.IPySparkInterpreter.getSparkInterpreter(IPySparkInterpreter.java:94) + > at org.apache.zeppelin.spark.IPySparkInterpreter.open(IPySparkInterpreter.java:54) + > at org.apache.zeppelin.spark.PySparkInterpreter.open(PySparkInterpreter.java:129) + > at org.apache.zeppelin.interpreter.LazyOpenInterpreter.open(LazyOpenInterpreter.java:69) + > at org.apache.zeppelin.interpreter.remote.RemoteInterpreterServer$InterpretJob.jobRun(RemoteInterpreterServer.java:616) + > at org.apache.zeppelin.scheduler.Job.run(Job.java:188) + > at org.apache.zeppelin.scheduler.FIFOScheduler$1.run(FIFOScheduler.java:140) + > at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) + > at java.util.concurrent.FutureTask.run(FutureTask.java:266) + > at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$201(ScheduledThreadPoolExecutor.java:180) + > at 
java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:293)
+    >     at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
+    >     at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
+    >     at java.lang.Thread.run(Thread.java:748)
+    > ERROR [2021-02-10 12:33:33,396] ({pool-2-thread-2} Logging.scala[logError]:70) - Failed to create any local dir.
+    > INFO [2021-02-10 12:33:33,399] ({Thread-1} Logging.scala[logInfo]:54) - Shutdown hook called
+    > INFO [2021-02-10 12:33:33,400] ({Thread-1} Logging.scala[logInfo]:54) - Deleting directory /tmp/spark-c78cdeb5-667e-4ad4-bdf4-c8da522abd15
+
+    #
+    # Zeppelin is acting as the Spark master, and is looking for the /var/spark/temp local directory.
+    #
+
+    #
+    # Now that we have removed the gateway node, can we change Zeppelin into a medium node
+    # and add the /var/spark/temp local directory?
+    #
+    # Yes - needed to add the /var/spark/temp local directory to the Zeppelin node to get the notebook to run.
+    #
+
+    #
+    # The Zeppelin node is running a single-threaded Python task at 100% cpu.
+    # Logs on the Zeppelin node show it is sending out tasks to the other nodes ..
+    #
+
+    The 100% active thread is ipython_server
+
+    python /tmp/zeppelin_ipython8675898749474775789/ipython_server.py 43261
+
+
+# -----------------------------------------------------
+# Check the Zeppelin logs.
+#[user@zeppelin]
+
+    pushd /home/fedora
+
+    tail -f zeppelin-0.8.2-bin-all/logs/zeppelin-interpreter-spark-fedora-gaia-dev-20210210-zeppelin.novalocal.log
+
+    > ....
+    > ....
+    > INFO [2021-02-10 18:15:13,368] ({dispatcher-event-loop-4} Logging.scala[logInfo]:54) - Starting task 674.0 in stage 92.0 (TID 318172, worker01, executor 2, partition 674, PROCESS_LOCAL, 8450 bytes)
+    > INFO [2021-02-10 18:15:13,368] ({task-result-getter-2} Logging.scala[logInfo]:54) - Finished task 541.0 in stage 92.0 (TID 318039) in 201169 ms on worker01 (executor 2) (663/5720)
+    > INFO [2021-02-10 18:15:15,880] ({dispatcher-event-loop-11} Logging.scala[logInfo]:54) - Starting task 675.0 in stage 92.0 (TID 318173, worker02, executor 3, partition 675, PROCESS_LOCAL, 8450 bytes)
+    > INFO [2021-02-10 18:15:15,880] ({task-result-getter-1} Logging.scala[logInfo]:54) - Finished task 529.0 in stage 92.0 (TID 318027) in 215511 ms on worker02 (executor 3) (664/5720)
+    > INFO [2021-02-10 18:15:15,884] ({dispatcher-event-loop-1} Logging.scala[logInfo]:54) - Starting task 676.0 in stage 92.0 (TID 318174, worker04, executor 1, partition 676, PROCESS_LOCAL, 8450 bytes)
+    > INFO [2021-02-10 18:15:15,884] ({task-result-getter-0} Logging.scala[logInfo]:54) - Finished task 652.0 in stage 92.0 (TID 318150) in 57495 ms on worker04 (executor 1) (665/5720)
+    > INFO [2021-02-10 18:15:16,153] ({dispatcher-event-loop-2} Logging.scala[logInfo]:54) - Starting task 677.0 in stage 92.0 (TID 318175, worker04, executor 1, partition 677, PROCESS_LOCAL, 8450 bytes)
+    > INFO [2021-02-10 18:15:16,153] ({task-result-getter-3} Logging.scala[logInfo]:54) - Finished task 676.0 in stage 92.0 (TID 318174) in 269 ms on worker04 (executor 1) (666/5720)
+    > ....
+    > ....
+ > INFO [2021-02-10 18:15:42,232] ({dispatcher-event-loop-4} Logging.scala[logInfo]:54) - Starting task 678.0 in stage 92.0 (TID 318176, worker01, executor 2, partition 678, PROCESS_LOCAL, 8450 bytes) + > INFO [2021-02-10 18:15:42,232] ({task-result-getter-2} Logging.scala[logInfo]:54) - Finished task 628.0 in stage 92.0 (TID 318126) in 129777 ms on worker01 (executor 2) (667/5720) + > INFO [2021-02-10 18:15:42,662] ({dispatcher-event-loop-12} Logging.scala[logInfo]:54) - Starting task 679.0 in stage 92.0 (TID 318177, worker01, executor 2, partition 679, PROCESS_LOCAL, 8450 bytes) + > INFO [2021-02-10 18:15:42,662] ({task-result-getter-1} Logging.scala[logInfo]:54) - Finished task 678.0 in stage 92.0 (TID 318176) in 430 ms on worker01 (executor 2) (668/5720) + > INFO [2021-02-10 18:15:51,745] ({dispatcher-event-loop-0} Logging.scala[logInfo]:54) - Starting task 680.0 in stage 92.0 (TID 318178, worker04, executor 1, partition 680, PROCESS_LOCAL, 8450 bytes) + > INFO [2021-02-10 18:15:51,746] ({task-result-getter-0} Logging.scala[logInfo]:54) - Finished task 673.0 in stage 92.0 (TID 318171) in 60263 ms on worker04 (executor 1) (669/5720) + > INFO [2021-02-10 18:15:51,988] ({dispatcher-event-loop-7} Logging.scala[logInfo]:54) - Starting task 681.0 in stage 92.0 (TID 318179, worker04, executor 1, partition 681, PROCESS_LOCAL, 8450 bytes) + > INFO [2021-02-10 18:15:51,988] ({task-result-getter-3} Logging.scala[logInfo]:54) - Finished task 680.0 in stage 92.0 (TID 318178) in 243 ms on worker04 (executor 1) (670/5720) + > INFO [2021-02-10 18:15:52,201] ({dispatcher-event-loop-6} Logging.scala[logInfo]:54) - Starting task 682.0 in stage 92.0 (TID 318180, worker04, executor 1, partition 682, PROCESS_LOCAL, 8450 bytes) + > INFO [2021-02-10 18:15:52,202] ({task-result-getter-2} Logging.scala[logInfo]:54) - Finished task 681.0 in stage 92.0 (TID 318179) in 214 ms on worker04 (executor 1) (671/5720) + > INFO [2021-02-10 18:15:52,388] ({dispatcher-event-loop-9} Logging.scala[logInfo]:54) - Starting task 683.0 in stage 92.0 (TID 318181, worker04, executor 1, partition 683, PROCESS_LOCAL, 8450 bytes) + > INFO [2021-02-10 18:15:52,388] ({task-result-getter-1} Logging.scala[logInfo]:54) - Finished task 682.0 in stage 92.0 (TID 318180) in 187 ms on worker04 (executor 1) (672/5720) + > INFO [2021-02-10 18:15:52,822] ({dispatcher-event-loop-4} Logging.scala[logInfo]:54) - Starting task 684.0 in stage 92.0 (TID 318182, worker04, executor 1, partition 684, PROCESS_LOCAL, 8450 bytes) + > INFO [2021-02-10 18:15:52,822] ({task-result-getter-0} Logging.scala[logInfo]:54) - Finished task 683.0 in stage 92.0 (TID 318181) in 434 ms on worker04 (executor 1) (673/5720) + > INFO [2021-02-10 18:15:53,118] ({dispatcher-event-loop-10} Logging.scala[logInfo]:54) - Starting task 685.0 in stage 92.0 (TID 318183, worker04, executor 1, partition 685, PROCESS_LOCAL, 8450 bytes) + > INFO [2021-02-10 18:15:53,118] ({task-result-getter-3} Logging.scala[logInfo]:54) - Finished task 684.0 in stage 92.0 (TID 318182) in 296 ms on worker04 (executor 1) (674/5720) + > INFO [2021-02-10 18:15:53,312] ({dispatcher-event-loop-0} Logging.scala[logInfo]:54) - Starting task 686.0 in stage 92.0 (TID 318184, worker04, executor 1, partition 686, PROCESS_LOCAL, 8450 bytes) + > INFO [2021-02-10 18:15:53,313] ({task-result-getter-2} Logging.scala[logInfo]:54) - Finished task 685.0 in stage 92.0 (TID 318183) in 194 ms on worker04 (executor 1) (675/5720) + > INFO [2021-02-10 18:15:53,522] ({dispatcher-event-loop-7} Logging.scala[logInfo]:54) - Starting 
task 687.0 in stage 92.0 (TID 318185, worker04, executor 1, partition 687, PROCESS_LOCAL, 8450 bytes)
+    > INFO [2021-02-10 18:15:53,522] ({task-result-getter-1} Logging.scala[logInfo]:54) - Finished task 686.0 in stage 92.0 (TID 318184) in 210 ms on worker04 (executor 1) (676/5720)
+    > INFO [2021-02-10 18:15:54,158] ({dispatcher-event-loop-6} Logging.scala[logInfo]:54) - Starting task 688.0 in stage 92.0 (TID 318186, worker04, executor 1, partition 688, PROCESS_LOCAL, 8450 bytes)
+    > INFO [2021-02-10 18:15:54,159] ({task-result-getter-0} Logging.scala[logInfo]:54) - Finished task 687.0 in stage 92.0 (TID 318185) in 637 ms on worker04 (executor 1) (677/5720)
+    > INFO [2021-02-10 18:15:54,408] ({dispatcher-event-loop-9} Logging.scala[logInfo]:54) - Starting task 689.0 in stage 92.0 (TID 318187, worker04, executor 1, partition 689, PROCESS_LOCAL, 8450 bytes)
+    > INFO [2021-02-10 18:15:54,408] ({task-result-getter-3} Logging.scala[logInfo]:54) - Finished task 688.0 in stage 92.0 (TID 318186) in 250 ms on worker04 (executor 1) (678/5720)
+    > INFO [2021-02-10 18:15:57,157] ({dispatcher-event-loop-7} Logging.scala[logInfo]:54) - Starting task 690.0 in stage 92.0 (TID 318188, worker02, executor 3, partition 690, PROCESS_LOCAL, 8450 bytes)
+    > INFO [2021-02-10 18:15:57,157] ({task-result-getter-2} Logging.scala[logInfo]:54) - Finished task 675.0 in stage 92.0 (TID 318173) in 41277 ms on worker02 (executor 3) (679/5720)
+    > INFO [2021-02-10 18:15:57,825] ({dispatcher-event-loop-13} Logging.scala[logInfo]:54) - Starting task 691.0 in stage 92.0 (TID 318189, worker02, executor 3, partition 691, PROCESS_LOCAL, 8450 bytes)
+    > INFO [2021-02-10 18:15:57,825] ({task-result-getter-1} Logging.scala[logInfo]:54) - Finished task 690.0 in stage 92.0 (TID 318188) in 668 ms on worker02 (executor 3) (680/5720)
+
+    top on the worker nodes shows 90% idle, then peaks of activity
+
+    50%cpu to ceph-fuse
+    10%cpu to java
+
+# -----------------------------------------------------
+# Check the Spark temp on Zeppelin.
+#[user@zeppelin]
+
+    ls -1 /var/spark/temp/
+
+    > blockmgr-ddd69876-f595-49bf-bc89-b4c5d52204a3
+    > spark-668f91a5-2e20-4f67-8fa5-bc68e679243b
+
+
+    du -h -d 1 /var/spark/temp/
+
+    > 220K    /var/spark/temp/spark-668f91a5-2e20-4f67-8fa5-bc68e679243b
+    > 200K    /var/spark/temp/blockmgr-ddd69876-f595-49bf-bc89-b4c5d52204a3
+    > 424K    /var/spark/temp/
+
+
+# -----------------------------------------------------
+# Check the Spark temp on workers.
+#[user@worker01]
+
+    ls -1 /var/spark/temp/
+
+
+
+    du -h -d 1 /var/spark/temp/
+
+    > 4.0K    /var/spark/temp/
+
+
+    #
+    # The Zeppelin node is acting as the Spark master.
+    # The ipython process is a single thread at 100% cpu, the rest is idle.
+    # using <0.5M of spark/temp
+    #
+
+    #
+    # worker nodes are 10-50% ceph, 0-10% java
+    # mostly idle
+    # not using spark/temp
+    #
+
+
+    Initial select query (10%)
+    Took 17 min 48 sec. Last updated by gaiauser at February 10 2021, 6:22:12 PM.
+
+    First graph
+    Took 18 min 11 sec. Last updated by gaiauser at February 10 2021, 6:40:23 PM.
+
+    Good/bad selection
+    Took 42 min 13 sec. Last updated by gaiauser at February 10 2021, 7:22:37 PM.
+
+    RandomForestClassifier
+    Took 2 hrs 59 min 26 sec. Last updated by gaiauser at February 10 2021, 10:22:03 PM.
+
+    Confusion matrix
+    Took 42 min 6 sec. Last updated by gaiauser at February 10 2021, 11:04:09 PM.
+
+    Second graph
+    Took 1 hrs 18 min 18 sec. Last updated by gaiauser at February 11 2021, 12:22:28 AM.
+
+    Histogram
+    Took 19 min 17 sec. Last updated by gaiauser at February 11 2021, 12:41:45 AM.
+
+    Good plot
+    Took 40 min 3 sec. Last updated by gaiauser at February 11 2021, 1:21:48 AM.
+
+    Bad plot
+    Took 40 min 8 sec. Last updated by gaiauser at February 11 2021, 2:01:56 AM.
+
+    Good/bad count
+    Took 40 min 4 sec. Last updated by gaiauser at February 11 2021, 2:42:00 AM.
+
+    Histogram
+    Took 40 min 15 sec. Last updated by gaiauser at February 11 2021, 3:22:15 AM.
+
+    Null count
+    Took 25 min 30 sec. Last updated by gaiauser at February 11 2021, 3:47:45 AM.
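+
+
+    For reference, the 'Failed to create local dir in /var/spark/temp' error above
+    comes from Spark's spark.local.dir setting. A quick way to confirm where it points
+    (an untested sketch - assuming our deployment writes the setting into
+    spark-defaults.conf under /opt/spark):
+
+# -----------------------------------------------------
+# Check the Spark scratch directory setting.
+#[user@zeppelin]
+
+    grep -r "spark.local.dir" /opt/spark/conf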
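+
+
+# -----------------------------------------------------
+# Quick sanity check on what was created.
+# (*) not part of the original run - a hypothetical extra step,
+#     using the same openstack CLI as the steps below.
+#[root@ansibler]
+
+    openstack \
+        --os-cloud "${cloudname:?}" \
+        server list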
+
+
+# -----------------------------------------------------
+# Check the deployment status.
+#[root@ansibler]
+
+    cat '/tmp/aglais-status.yml'
+
+    > ....
+    > ....
+
+
+# -----------------------------------------------------
+# Get the public IP address of our Zeppelin node.
+#[root@ansibler]
+
+    deployname=$(
+        yq read \
+            '/tmp/aglais-status.yml' \
+            'aglais.status.deployment.name'
+        )
+
+    zeppelinid=$(
+        openstack \
+            --os-cloud "${cloudname:?}" \
+            server list \
+            --format json \
+        | jq -r '.[] | select(.Name == "'${deployname:?}'-zeppelin") | .ID'
+        )
+
+    zeppelinip=$(
+        openstack \
+            --os-cloud "${cloudname:?}" \
+            server show \
+            --format json \
+            "${zeppelinid:?}" \
+        | jq -r '.addresses' \
+        | sed '
+            s/[[:space:]]//
+            s/.*=\(.*\)/\1/
+            s/.*,\(.*\)/\1/
+            '
+        )
+
+cat << EOF
+Zeppelin ID [${zeppelinid:?}]
+Zeppelin IP [${zeppelinip:?}]
+EOF
+
+    > Zeppelin ID [e4db55cb-2106-4f24-afe6-e335f98ecca1]
+    > Zeppelin IP [128.232.227.230]
+
+
+# -----------------------------------------------------
+# -----------------------------------------------------
+
+    Update our DNS
+
+
+# -----------------------------------------------------
+# -----------------------------------------------------
+# Login to Zeppelin ...
+#[user@desktop]
+
+    firefox --new-window "http://zeppelin.metagrid.xyz:8080/" &
+
+
+# -----------------------------------------------------
+# -----------------------------------------------------
+
+
+    Import notebooks from GitHub, clear the output and run all the cells ...
+
+    Good astrometric solutions via ML Random Forrest classifier
+    https://raw.githubusercontent.com/wfau/aglais-notebooks/main/2FRPC4BFS/note.json
+
+
+# -----------------------------------------------------
+# -----------------------------------------------------
+
+
+# -----------------------------------------------------
+# Check the Zeppelin logs.
+#[user@zeppelin]
+
+    pushd /home/fedora
+
+    tail -f zeppelin-0.8.2-bin-all/logs/zeppelin-interpreter-spark-fedora-gaia-dev-20210211-zeppelin.novalocal.log
+
+
+    top on the worker nodes shows 90% idle, then peaks of activity
+
+    50%cpu to ceph-fuse
+    10%cpu to java
+
+# -----------------------------------------------------
+# Check the Spark temp on Zeppelin.
+#[root@ansibler]
+
+    ssh zeppelin \
+        '
+        date
+        hostname
+        echo
+        ls -1 /var/spark/temp/
+        echo
+        du -h -d 1 /var/spark/temp/
+        '
+
+    > Thu Feb 11 11:42:45 UTC 2021
+    > gaia-dev-20210211-zeppelin.novalocal
+    >
+    > blockmgr-6e40f938-4cea-4e51-aa7d-7b8e8d957fd9
+    > spark-02a20dcf-c44b-46fd-aba2-84dcd4092b77
+    >
+    > 220K    /var/spark/temp/spark-02a20dcf-c44b-46fd-aba2-84dcd4092b77
+    > 168K    /var/spark/temp/blockmgr-6e40f938-4cea-4e51-aa7d-7b8e8d957fd9
+    > 392K    /var/spark/temp/
+
+
+# -----------------------------------------------------
+# Check the Spark temp on workers.
+#[root@ansibler]
+
+    ssh zeppelin \
+        '
+        ssh worker01 \
+            "
+            date
+            hostname
+            echo
+            ls -1 /var/spark/temp/
+            echo
+            du -h -d 1 /var/spark/temp/
+            "
+        '
+
+    > Thu Feb 11 11:43:12 UTC 2021
+    > gaia-dev-20210211-worker01.novalocal
+    >
+    > ls: cannot access '/var/spark/temp/': No such file or directory
+    >
+    > du: cannot access '/var/spark/temp/': No such file or directory
+
+
+
+# -----------------------------------------------------
+# Check /tmp on worker01.
+#[root@ansibler] + + ssh zeppelin \ + ' + ssh worker01 \ + " + date + hostname + echo + ls -1 /tmp/ + echo + du -h -d 1 /tmp/ + " + ' + + > Thu Feb 11 11:44:10 UTC 2021 + > gaia-dev-20210211-worker01.novalocal + > + > hadoop-fedora + > hadoop-fedora-datanode.pid + > hadoop-fedora-nodemanager.pid + > hsperfdata_fedora + > hsperfdata_root + > jetty-0.0.0.0-8042-node-_-any-222466267334014063.dir + > jetty-localhost-41565-datanode-_-any-8693046579271702220.dir + > systemd-private-25d0add6979849dcaa7ef3260c7db798-chronyd.service-WR7RUG + > systemd-private-25d0add6979849dcaa7ef3260c7db798-dbus-broker.service-YbHwCr + > + > 4.0K /tmp/jetty-localhost-41565-datanode-_-any-8693046579271702220.dir + > du: cannot read directory '/tmp/systemd-private-25d0add6979849dcaa7ef3260c7db798-chronyd.service-WR7RUG': Permission denied + > 4.0K /tmp/systemd-private-25d0add6979849dcaa7ef3260c7db798-chronyd.service-WR7RUG + > 4.0K /tmp/systemd-private-25d0add6979849dcaa7ef3260c7db798-dbus-broker.service-YbHwCr + > 4.0K /tmp/.ICE-unix + > 4.0K /tmp/.X11-unix + > 4.0K /tmp/.Test-unix + > 8.0K /tmp/jetty-0.0.0.0-8042-node-_-any-222466267334014063.dir + > 4.0K /tmp/.font-unix + > du: cannot read directory '/tmp/systemd-private-25d0add6979849dcaa7ef3260c7db798-dbus-broker.service-YbHwCr': Permission denied + > 100K /tmp/hsperfdata_fedora + > 4.0K /tmp/.XIM-unix + > 235M /tmp/hadoop-fedora + > 36K /tmp/hsperfdata_root + > 235M /tmp/ + + +# ----------------------------------------------------- +# Check the Zeppelin machine +#[user@zeppelin] + + ls -1 /home/fedora + + > spark-warehouse + > zeppelin-0.8.2-bin-all + + + ls -1 /home/fedora/spark-warehouse + + > - + + + ls -1 /home/fedora/zeppelin-0.8.2-bin-all + + > bin + > conf + > interpreter + > lib + > LICENSE + > licenses + > local-repo + > logs + > notebook + > NOTICE + > README.md + > run + > webapps + > zeppelin-web-0.8.2.war + + + ls -1 /home/fedora/zeppelin-0.8.2-bin-all/logs + + > zeppelin-fedora-gaia-dev-20210211-zeppelin.novalocal.log + > zeppelin-fedora-gaia-dev-20210211-zeppelin.novalocal.out + > zeppelin-interpreter-md-fedora-gaia-dev-20210211-zeppelin.novalocal.log + > zeppelin-interpreter-spark-fedora-gaia-dev-20210211-zeppelin.novalocal.log + + +# ----------------------------------------------------- +# Check the Zeppelin log +#[user@zeppelin] + + pushd /home/fedora/zeppelin-0.8.2-bin-all/logs + + ls -1 . + + > zeppelin-fedora-gaia-dev-20210211-zeppelin.novalocal.log + > zeppelin-fedora-gaia-dev-20210211-zeppelin.novalocal.out + > zeppelin-interpreter-md-fedora-gaia-dev-20210211-zeppelin.novalocal.log + > zeppelin-interpreter-spark-fedora-gaia-dev-20210211-zeppelin.novalocal.log + + + less zeppelin-fedora-gaia-dev-20210211-zeppelin.novalocal.log + + > INFO [2021-02-11 07:17:07,710] ({main} ZeppelinConfiguration.java[create]:121) - Load configuration from file:/home/fedora/zeppelin-0.8.2-bin-all/conf/zeppelin-site.xml + > INFO [2021-02-11 07:17:07,750] ({main} ZeppelinConfiguration.java[create]:129) - Server Host: 10.10.0.88 + > INFO [2021-02-11 07:17:07,750] ({main} ZeppelinConfiguration.java[create]:131) - Server Port: 8080 + > INFO [2021-02-11 07:17:07,751] ({main} ZeppelinConfiguration.java[create]:135) - Context Path: / + > INFO [2021-02-11 07:17:07,752] ({main} ZeppelinConfiguration.java[create]:136) - Zeppelin Version: 0.8.2 + > .... + > .... 
+ > INFO [2021-02-11 10:31:27,611] ({pool-2-thread-2} SchedulerFactory.java[jobStarted]:114) - Job 20201013-131059_546082898 started by scheduler org.apache.zeppelin.interpreter.remote.RemoteInterpreter-md:shared_proces + > s-shared_session + > INFO [2021-02-11 10:31:27,612] ({pool-2-thread-2} Paragraph.java[jobRun]:381) - Run paragraph [paragraph_id: 20201013-131059_546082898, interpreter: md, note_id: 2FYW1HNED, user: gaiauser] + > INFO [2021-02-11 10:31:27,612] ({pool-2-thread-2} ManagedInterpreterGroup.java[getOrCreateInterpreterProcess]:61) - Create InterpreterProcess for InterpreterGroup: md:shared_process + > INFO [2021-02-11 10:31:27,612] ({pool-2-thread-2} ShellScriptLauncher.java[launch]:48) - Launching Interpreter: md + > INFO [2021-02-11 10:31:27,623] ({pool-2-thread-2} RemoteInterpreterManagedProcess.java[start]:115) - Thrift server for callback will start. Port: 39353 + > INFO [2021-02-11 10:31:27,631] ({pool-2-thread-2} RemoteInterpreterManagedProcess.java[start]:190) - Run interpreter process [/home/fedora/zeppelin-0.8.2-bin-all/bin/interpreter.sh, -d, /home/fedora/zeppelin-0.8.2-b + > in-all/interpreter/md, -c, 10.10.0.88, -p, 39353, -r, :, -l, /home/fedora/zeppelin-0.8.2-bin-all/local-repo/md, -g, md] + > INFO [2021-02-11 10:31:27,890] ({pool-7-thread-1} RemoteInterpreterManagedProcess.java[callback]:123) - RemoteInterpreterServer Registered: CallbackInfo(host:10.10.0.88, port:33847) + > INFO [2021-02-11 10:31:27,925] ({pool-2-thread-2} RemoteInterpreter.java[call]:168) - Create RemoteInterpreter org.apache.zeppelin.markdown.Markdown + > INFO [2021-02-11 10:31:28,006] ({pool-2-thread-2} RemoteInterpreter.java[call]:142) - Open RemoteInterpreter org.apache.zeppelin.markdown.Markdown + > INFO [2021-02-11 10:31:28,006] ({pool-2-thread-2} RemoteInterpreter.java[pushAngularObjectRegistryToRemote]:436) - Push local angular object registry from ZeppelinServer to remote interpreter group md:shared_process + > INFO [2021-02-11 10:31:28,371] ({pool-2-thread-2} NotebookServer.java[afterStatusChange]:2314) - Job 20201013-131059_546082898 is finished successfully, status: FINISHED + > INFO [2021-02-11 10:31:28,438] ({pool-2-thread-2} VFSNotebookRepo.java[save]:196) - Saving note:2FYW1HNED + > INFO [2021-02-11 10:31:28,441] ({pool-2-thread-2} SchedulerFactory.java[jobFinished]:120) - Job 20201013-131059_546082898 finished by scheduler org.apache.zeppelin.interpreter.remote.RemoteInterpreter-md:shared_proc + > ess-shared_session + > INFO [2021-02-11 10:31:28,458] ({qtp1580893732-14} VFSNotebookRepo.java[save]:196) - Saving note:2FYW1HNED + > .... + > .... 
+ > INFO [2021-02-11 10:31:28,462] ({pool-2-thread-3} SchedulerFactory.java[jobStarted]:114) - Job 20201013-131649_1734629667 started by scheduler org.apache.zeppelin.interpreter.remote.RemoteInterpreter-spark:shared_pr + > ocess-shared_session + > INFO [2021-02-11 10:31:28,462] ({pool-2-thread-3} Paragraph.java[jobRun]:381) - Run paragraph [paragraph_id: 20201013-131649_1734629667, interpreter: spark.pyspark, note_id: 2FYW1HNED, user: gaiauser] + > INFO [2021-02-11 10:31:28,462] ({pool-2-thread-3} ManagedInterpreterGroup.java[getOrCreateInterpreterProcess]:61) - Create InterpreterProcess for InterpreterGroup: spark:shared_process + > INFO [2021-02-11 10:31:28,463] ({pool-2-thread-3} ShellScriptLauncher.java[launch]:48) - Launching Interpreter: spark + > INFO [2021-02-11 10:31:28,464] ({pool-2-thread-3} SparkInterpreterLauncher.java[buildEnvFromProperties]:108) - Run Spark under non-secure mode as no keytab and principal is specified + > INFO [2021-02-11 10:31:28,464] ({pool-2-thread-3} RemoteInterpreterManagedProcess.java[start]:115) - Thrift server for callback will start. Port: 39131 + > INFO [2021-02-11 10:31:28,965] ({pool-2-thread-3} RemoteInterpreterManagedProcess.java[start]:190) - Run interpreter process [/home/fedora/zeppelin-0.8.2-bin-all/bin/interpreter.sh, -d, /home/fedora/zeppelin-0.8.2-b + > in-all/interpreter/spark, -c, 10.10.0.88, -p, 39131, -r, :, -l, /home/fedora/zeppelin-0.8.2-bin-all/local-repo/spark, -g, spark] + > INFO [2021-02-11 10:31:30,280] ({pool-9-thread-1} RemoteInterpreterManagedProcess.java[callback]:123) - RemoteInterpreterServer Registered: CallbackInfo(host:10.10.0.88, port:39975) + > INFO [2021-02-11 10:31:30,282] ({pool-2-thread-3} RemoteInterpreter.java[call]:168) - Create RemoteInterpreter org.apache.zeppelin.spark.SparkInterpreter + > INFO [2021-02-11 10:31:30,336] ({pool-2-thread-3} RemoteInterpreter.java[call]:168) - Create RemoteInterpreter org.apache.zeppelin.spark.SparkSqlInterpreter + > INFO [2021-02-11 10:31:30,337] ({pool-2-thread-3} RemoteInterpreter.java[call]:168) - Create RemoteInterpreter org.apache.zeppelin.spark.DepInterpreter + > INFO [2021-02-11 10:31:30,342] ({pool-2-thread-3} RemoteInterpreter.java[call]:168) - Create RemoteInterpreter org.apache.zeppelin.spark.PySparkInterpreter + > INFO [2021-02-11 10:31:30,346] ({pool-2-thread-3} RemoteInterpreter.java[call]:168) - Create RemoteInterpreter org.apache.zeppelin.spark.IPySparkInterpreter + > INFO [2021-02-11 10:31:30,349] ({pool-2-thread-3} RemoteInterpreter.java[call]:168) - Create RemoteInterpreter org.apache.zeppelin.spark.SparkRInterpreter + > INFO [2021-02-11 10:31:30,350] ({pool-2-thread-3} RemoteInterpreter.java[call]:142) - Open RemoteInterpreter org.apache.zeppelin.spark.PySparkInterpreter + > INFO [2021-02-11 10:31:30,350] ({pool-2-thread-3} RemoteInterpreter.java[pushAngularObjectRegistryToRemote]:436) - Push local angular object registry from ZeppelinServer to remote interpreter group spark:shared_proc + > ess + > INFO [2021-02-11 10:32:38,857] ({pool-2-thread-3} NotebookServer.java[afterStatusChange]:2314) - Job 20201013-131649_1734629667 is finished successfully, status: FINISHED + > .... + > .... + > .... + > .... 
+ > INFO [2021-02-11 10:59:05,185] ({pool-2-thread-3} SchedulerFactory.java[jobStarted]:114) - Job 20201013-152110_1282917873 started by scheduler org.apache.zeppelin.interpreter.remote.RemoteInterpreter-spark:shared_process-shared_session + > INFO [2021-02-11 10:59:05,186] ({pool-2-thread-3} Paragraph.java[jobRun]:381) - Run paragraph [paragraph_id: 20201013-152110_1282917873, interpreter: spark.pyspark, note_id: 2FYW1HNED, user: gaiauser] + > WARN [2021-02-11 11:06:34,508] ({pool-2-thread-3} NotebookServer.java[afterStatusChange]:2316) - Job 20201013-152110_1282917873 is finished, status: ERROR, exception: null, result: %text ESC[0;31m---------------------------------------------------------------------------ESC[0m + > ESC[0;31mPy4JJavaErrorESC[0m Traceback (most recent call last) + > ESC[0;32mESC[0m in ESC[0;36mESC[0;34mESC[0m + > ESC[1;32m 6ESC[0m ESC[0;31m# instantiate a trained RF classifier, seeded for repeatability at this stage:ESC[0mESC[0;34mESC[0mESC[0;34mESC[0mESC[0;34mESC[0mESC[0m + > .... + > .... + > ESC[0;32m/opt/spark/python/lib/py4j-0.10.7-src.zip/py4j/protocol.pyESC[0m in ESC[0;36mget_return_valueESC[0;34m(answer, gateway_client, target_id, name)ESC[0m + > ESC[1;32m 326ESC[0m raise Py4JJavaError( + > ESC[1;32m 327ESC[0m ESC[0;34m"An error occurred while calling {0}{1}{2}.\n"ESC[0mESC[0;34m.ESC[0mESC[0;34mESC[0mESC[0;34mESC[0mESC[0m + > ESC[0;32m--> 328ESC[0;31m format(target_id, ".", name), value) + > ESC[0mESC[1;32m 329ESC[0m ESC[0;32melseESC[0mESC[0;34m:ESC[0mESC[0;34mESC[0mESC[0;34mESC[0mESC[0m + > ESC[1;32m 330ESC[0m raise Py4JError( + > + > ESC[0;31mPy4JJavaErrorESC[0m: An error occurred while calling o191.fit. + > : org.apache.spark.SparkException: Job aborted due to stage failure: Task 3226 in stage 35.0 failed 4 times, most recent failure: Lost task 3226.3 in stage 35.0 (TID 122005, worker05, executor 2): java.io.IOException: No space left on device + > at java.io.FileOutputStream.writeBytes(Native Method) + > at java.io.FileOutputStream.write(FileOutputStream.java:326) + > at org.apache.spark.storage.TimeTrackingOutputStream.write(TimeTrackingOutputStream.java:58) + > at java.io.BufferedOutputStream.flushBuffer(BufferedOutputStream.java:82) + > at java.io.BufferedOutputStream.flush(BufferedOutputStream.java:140) + > at net.jpountz.lz4.LZ4BlockOutputStream.finish(LZ4BlockOutputStream.java:260) + > at net.jpountz.lz4.LZ4BlockOutputStream.close(LZ4BlockOutputStream.java:190) + > at java.io.ObjectOutputStream$BlockDataOutputStream.close(ObjectOutputStream.java:1828) + > at java.io.ObjectOutputStream.close(ObjectOutputStream.java:742) + > at org.apache.spark.serializer.JavaSerializationStream.close(JavaSerializer.scala:57) + > at org.apache.spark.storage.DiskBlockObjectWriter.commitAndGet(DiskBlockObjectWriter.scala:173) + > at org.apache.spark.util.collection.ExternalSorter.writePartitionedFile(ExternalSorter.scala:701) + > at org.apache.spark.shuffle.sort.SortShuffleWriter.write(SortShuffleWriter.scala:71) + > at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99) + > at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55) + > at org.apache.spark.scheduler.Task.run(Task.scala:123) + > at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408) + > at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360) + > at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414) + > at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) + > at 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) + > at java.lang.Thread.run(Thread.java:748) + > + > Driver stacktrace: + > at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1925) + > at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1913) + > at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1912) + > at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59) + > at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48) + > at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1912) + > at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:948) + > at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:948) + > at scala.Option.foreach(Option.scala:257) + > at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:948) + > at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2146) + > at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2095) + > at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2084) + > at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49) + > at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:759) + > at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061) + > at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082) + > at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101) + > at org.apache.spark.SparkContext.runJob(SparkContext.scala:2126) + > at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:990) + > at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151) + > at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112) + > at org.apache.spark.rdd.RDD.withScope(RDD.scala:385) + > at org.apache.spark.rdd.RDD.collect(RDD.scala:989) + > at org.apache.spark.rdd.PairRDDFunctions$$anonfun$collectAsMap$1.apply(PairRDDFunctions.scala:743) + > at org.apache.spark.rdd.PairRDDFunctions$$anonfun$collectAsMap$1.apply(PairRDDFunctions.scala:742) + > at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151) + > at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112) + > at org.apache.spark.rdd.RDD.withScope(RDD.scala:385) + > at org.apache.spark.rdd.PairRDDFunctions.collectAsMap(PairRDDFunctions.scala:742) + > at org.apache.spark.ml.tree.impl.RandomForest$.findBestSplits(RandomForest.scala:567) + > at org.apache.spark.ml.tree.impl.RandomForest$.run(RandomForest.scala:201) + > at org.apache.spark.ml.classification.RandomForestClassifier$$anonfun$train$1.apply(RandomForestClassifier.scala:142) + > at org.apache.spark.ml.classification.RandomForestClassifier$$anonfun$train$1.apply(RandomForestClassifier.scala:120) + > at org.apache.spark.ml.util.Instrumentation$$anonfun$11.apply(Instrumentation.scala:185) + > at scala.util.Try$.apply(Try.scala:192) + > at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:185) + > at org.apache.spark.ml.classification.RandomForestClassifier.train(RandomForestClassifier.scala:120) + > at org.apache.spark.ml.classification.RandomForestClassifier.train(RandomForestClassifier.scala:46) + > at 
org.apache.spark.ml.Predictor.fit(Predictor.scala:118) + > at org.apache.spark.ml.Predictor.fit(Predictor.scala:82) + > at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) + > at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) + > at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) + > at java.lang.reflect.Method.invoke(Method.java:498) + > at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244) + > at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357) + > at py4j.Gateway.invoke(Gateway.java:282) + > at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132) + > at py4j.commands.CallCommand.execute(CallCommand.java:79) + > at py4j.GatewayConnection.run(GatewayConnection.java:238) + > at java.lang.Thread.run(Thread.java:748) + > + > Caused by: java.io.IOException: No space left on device + > at java.io.FileOutputStream.writeBytes(Native Method) + > at java.io.FileOutputStream.write(FileOutputStream.java:326) + > at org.apache.spark.storage.TimeTrackingOutputStream.write(TimeTrackingOutputStream.java:58) + > at java.io.BufferedOutputStream.flushBuffer(BufferedOutputStream.java:82) + > at java.io.BufferedOutputStream.flush(BufferedOutputStream.java:140) + > at net.jpountz.lz4.LZ4BlockOutputStream.finish(LZ4BlockOutputStream.java:260) + > at net.jpountz.lz4.LZ4BlockOutputStream.close(LZ4BlockOutputStream.java:190) + > at java.io.ObjectOutputStream$BlockDataOutputStream.close(ObjectOutputStream.java:1828) + > at java.io.ObjectOutputStream.close(ObjectOutputStream.java:742) + > at org.apache.spark.serializer.JavaSerializationStream.close(JavaSerializer.scala:57) + > at org.apache.spark.storage.DiskBlockObjectWriter.commitAndGet(DiskBlockObjectWriter.scala:173) + > at org.apache.spark.util.collection.ExternalSorter.writePartitionedFile(ExternalSorter.scala:701) + > at org.apache.spark.shuffle.sort.SortShuffleWriter.write(SortShuffleWriter.scala:71) + > at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99) + > at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55) + > at org.apache.spark.scheduler.Task.run(Task.scala:123) + > at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408) + > at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360) + > at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414) + > at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) + > at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) + > ... 1 more + > + > INFO [2021-02-11 11:06:34,565] ({pool-2-thread-3} VFSNotebookRepo.java[save]:196) - Saving note:2FYW1HNED + > INFO [2021-02-11 11:06:34,572] ({pool-2-thread-3} SchedulerFactory.java[jobFinished]:120) - Job 20201013-152110_1282917873 finished by scheduler org.apache.zeppelin.interpreter.remote.RemoteInterpreter-spark:shared_process-shared_session + > INFO [2021-02-11 11:30:45,031] ({SessionValidationThread-1} AbstractValidatingSessionManager.java[validateSessions]:275) - Validating all active sessions... + > INFO [2021-02-11 11:30:45,032] ({SessionValidationThread-1} AbstractValidatingSessionManager.java[validateSessions]:308) - Finished session validation. No sessions were stopped. + + + Lots of information in that .. + + - The exception was reported by (TID 122005, worker05, executor 2) + - I think the out of space was on worker05, not the Zeppelin node. 
+ + - The stack trace suggests that RandomForestClassifier understands org.apache.spark.rdd.RDD + - Which means at least part of the RandomForestClassifier training is offloaded to the workers. + + > at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59) + > at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48) + > at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1912) + > .... + > at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:948) + > at scala.Option.foreach(Option.scala:257) + > at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:948) + > .... + > at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2084) + > at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49) + > at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:759) + > at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061) + > .... + > at org.apache.spark.SparkContext.runJob(SparkContext.scala:2126) + > .... + > at org.apache.spark.rdd.RDD.withScope(RDD.scala:385) + > at org.apache.spark.rdd.RDD.collect(RDD.scala:989) + > .... + > at org.apache.spark.ml.tree.impl.RandomForest$.findBestSplits(RandomForest.scala:567) + > .... + > at org.apache.spark.ml.util.Instrumentation$$anonfun$11.apply(Instrumentation.scala:185) + > at scala.util.Try$.apply(Try.scala:192) + > at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:185) + > at org.apache.spark.ml.classification.RandomForestClassifier.train(RandomForestClassifier.scala:120) + > at org.apache.spark.ml.classification.RandomForestClassifier.train(RandomForestClassifier.scala:46) + > at org.apache.spark.ml.Predictor.fit(Predictor.scala:118) + + +# ----------------------------------------------------- +# Check the disc space on zeppelin +#[user@zeppelin] + + ls -1 /var/spark/temp/ + + > blockmgr-6e40f938-4cea-4e51-aa7d-7b8e8d957fd9 + > spark-02a20dcf-c44b-46fd-aba2-84dcd4092b77 + + + du -h -d 1 /var/spark/temp/ + + > 220K /var/spark/temp/spark-02a20dcf-c44b-46fd-aba2-84dcd4092b77 + > 168K /var/spark/temp/blockmgr-6e40f938-4cea-4e51-aa7d-7b8e8d957fd9 + > 392K /var/spark/temp/ + + +# ----------------------------------------------------- +# Check the disc space on worker05 +#[user@zeppelin] + + + ssh worker05 \ + ' + hostname + date + echo + ls -1 /var/spark/temp/ + echo + du -h -d 1 /var/spark/temp/ + ' + + > gaia-dev-20210211-worker05.novalocal + > Thu 11 Feb 12:16:18 UTC 2021 + > + > ls: cannot access '/var/spark/temp/': No such file or directory + > + > du: cannot access '/var/spark/temp/': No such file or directory + + # + # When we changed this back down to a small node we didn't create the spark temp directory. 
+ # + + + ssh worker05 \ + ' + hostname + date + echo + ls -1 /tmp/ + echo + du -h -d 1 /tmp/ + ' + + > gaia-dev-20210211-worker05.novalocal + > Thu 11 Feb 12:16:49 UTC 2021 + > + > hadoop-fedora + > hadoop-fedora-datanode.pid + > hadoop-fedora-nodemanager.pid + > hsperfdata_fedora + > hsperfdata_root + > jetty-0.0.0.0-8042-node-_-any-5267485435957391381.dir + > jetty-localhost-33243-datanode-_-any-3555236917512612600.dir + > systemd-private-e9fcce57f1be40acb5b15c979c850494-chronyd.service-0AFL7F + > systemd-private-e9fcce57f1be40acb5b15c979c850494-dbus-broker.service-ej71vh + > + > 4.0K /tmp/jetty-localhost-33243-datanode-_-any-3555236917512612600.dir + > 4.0K /tmp/.ICE-unix + > 4.0K /tmp/.X11-unix + > 4.0K /tmp/.Test-unix + > du: cannot read directory '/tmp/systemd-private-e9fcce57f1be40acb5b15c979c850494-chronyd.service-0AFL7F': Permission denied + > 4.0K /tmp/systemd-private-e9fcce57f1be40acb5b15c979c850494-chronyd.service-0AFL7F + > 8.0K /tmp/jetty-0.0.0.0-8042-node-_-any-5267485435957391381.dir + > 4.0K /tmp/.font-unix + > 100K /tmp/hsperfdata_fedora + > 4.0K /tmp/.XIM-unix + > 14G /tmp/hadoop-fedora + > 4.0K /tmp/systemd-private-e9fcce57f1be40acb5b15c979c850494-dbus-broker.service-ej71vh + > 36K /tmp/hsperfdata_root + > 14G /tmp/ + > du: cannot read directory '/tmp/systemd-private-e9fcce57f1be40acb5b15c979c850494-dbus-broker.service-ej71vh': Permission denied + + + ssh worker05 \ + ' + hostname + date + echo + ls -1 /tmp/hadoop-fedora + echo + du -h -d 1 /tmp/hadoop-fedora + ' + + > gaia-dev-20210211-worker05.novalocal + > Thu 11 Feb 12:18:19 UTC 2021 + > + > nm-local-dir + > + > 14G /tmp/hadoop-fedora/nm-local-dir + > 14G /tmp/hadoop-fedora + + + ssh worker05 \ + ' + hostname + date + echo + du -h /tmp/hadoop-fedora + ' + + > 4.0K /tmp/hadoop-fedora/nm-local-dir/nmPrivate/application_1613027823151_0001 + > 8.0K /tmp/hadoop-fedora/nm-local-dir/nmPrivate + > .... .... + > 284K /tmp/hadoop-fedora/nm-local-dir/usercache/fedora/filecache/12 + > 592K /tmp/hadoop-fedora/nm-local-dir/usercache/fedora/filecache/11 + > 231M /tmp/hadoop-fedora/nm-local-dir/usercache/fedora/filecache/13/__spark_libs__4343915086399681065.zip + > 231M /tmp/hadoop-fedora/nm-local-dir/usercache/fedora/filecache/13 + > .... .... + > 2.9M /tmp/hadoop-fedora/nm-local-dir/usercache/fedora/filecache/10/sparkr.zip + > 2.9M /tmp/hadoop-fedora/nm-local-dir/usercache/fedora/filecache/10 + > 52K /tmp/hadoop-fedora/nm-local-dir/usercache/fedora/filecache/14 + > 235M /tmp/hadoop-fedora/nm-local-dir/usercache/fedora/filecache + > .... .... 
+    > 51M     /tmp/hadoop-fedora/nm-local-dir/usercache/fedora/appcache/application_1613027823151_0001/blockmgr-690bd150-bff9-4542-8041-9f73d93d19dc/22
+    > 64M     /tmp/hadoop-fedora/nm-local-dir/usercache/fedora/appcache/application_1613027823151_0001/blockmgr-690bd150-bff9-4542-8041-9f73d93d19dc/1c
+    > 66M     /tmp/hadoop-fedora/nm-local-dir/usercache/fedora/appcache/application_1613027823151_0001/blockmgr-690bd150-bff9-4542-8041-9f73d93d19dc/24
+    > 414M    /tmp/hadoop-fedora/nm-local-dir/usercache/fedora/appcache/application_1613027823151_0001/blockmgr-690bd150-bff9-4542-8041-9f73d93d19dc/30
+    > 13G     /tmp/hadoop-fedora/nm-local-dir/usercache/fedora/appcache/application_1613027823151_0001/blockmgr-690bd150-bff9-4542-8041-9f73d93d19dc
+    > 4.0K    /tmp/hadoop-fedora/nm-local-dir/usercache/fedora/appcache/application_1613027823151_0001/filecache
+    > 13G     /tmp/hadoop-fedora/nm-local-dir/usercache/fedora/appcache/application_1613027823151_0001
+    > 13G     /tmp/hadoop-fedora/nm-local-dir/usercache/fedora/appcache
+    > 14G     /tmp/hadoop-fedora/nm-local-dir/usercache/fedora
+    > 14G     /tmp/hadoop-fedora/nm-local-dir/usercache
+    > 4.0K    /tmp/hadoop-fedora/nm-local-dir/filecache
+    > 14G     /tmp/hadoop-fedora/nm-local-dir
+    > 14G     /tmp/hadoop-fedora
+
+    Lots of information in that ..
+
+    - By the time the job gets here it is a Hadoop job, NOT a Spark job.
+    - The temp files are owned by the Hadoop node-manager and the Hadoop block-manager.
+    - To move them to another location we should use the Hadoop temp settings, not the Spark temp settings.
+
+
+# -----------------------------------------------------
+# -----------------------------------------------------
+
+
+    Things we have learned so far.
+
+    Even if we don't create the separate disc mounts, we should still create the spark
+    and hadoop temp directories (see the sketch at the end of this note).
+
+    Move the mount paths from a global setting to a host-specific setting.
+    Always create the same directories:
+
+        /var/spark/temp
+        /var/spark/data
+
+        /var/hadoop/temp
+        /var/hadoop/data
+
+    If this is a medium node, and the host config has mount paths for them,
+    then change some of them into links.
+
+    The master node isn't doing much.
+    Possibly managing the HDFS namenode ?
+    Is it actually managing the Yarn scheduling ?
+    Could all this be done by a tiny VM ?
+
+    The Zeppelin node is running the Spark interpreter.
+    The Spark interpreter is scheduling the Spark jobs.
+    The Spark interpreter aggregates the notebook results.
+
+    The Spark interpreter uses 392K of space in /var/spark/temp.
+    This could still probably be a small node.
+    The main cpu use is the ipython server running one thread at 100%.
+    The rest of the cores are idle most of the time.
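+
+
+# -----------------------------------------------------
+# Create the missing temp directories by hand.
+#
+# Untested sketch, not part of the original run - the real fix belongs
+# in the Ansible playbooks. Assumes the worker01..worker06 host aliases
+# in our SSH config, and passwordless sudo for the fedora user.
+#
+# Note: the 14G cache lands in /tmp/hadoop-fedora because Hadoop's
+# hadoop.tmp.dir defaults to /tmp/hadoop-${user.name}; pointing that
+# (or yarn.nodemanager.local-dirs) at /var/hadoop/temp is the
+# Hadoop-side setting to change.
+#[user@zeppelin]
+
+    for nodename in worker01 worker02 worker03 worker04 worker05 worker06
+    do
+        ssh "${nodename:?}" \
+            '
+            sudo mkdir -p /var/spark/temp /var/spark/data /var/hadoop/temp /var/hadoop/data
+            sudo chown fedora:fedora /var/spark/temp /var/spark/data /var/hadoop/temp /var/hadoop/data
+            '
+    done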
+
+
+
+
+
+
+
+
+
+

From ebdb26d822605b0040491e85dbf49ba2046a23ea Mon Sep 17 00:00:00 2001
From: "zrq-github@metagrid.co.uk"
Date: Sat, 13 Feb 2021 06:32:19 +0000
Subject: [PATCH 07/27] Notes on git branches

---
 notes/zrq/20210211-02-git-branches.txt | 457 +++++++++++++++++++++++++
 1 file changed, 457 insertions(+)
 create mode 100644 notes/zrq/20210211-02-git-branches.txt

diff --git a/notes/zrq/20210211-02-git-branches.txt b/notes/zrq/20210211-02-git-branches.txt
new file mode 100644
index 00000000..022dc660
--- /dev/null
+++ b/notes/zrq/20210211-02-git-branches.txt
@@ -0,0 +1,457 @@
+#
+#
+#
+# Copyright (c) 2021, ROE (http://www.roe.ac.uk/)
+#
+# This information is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This information is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+#
+#
+#zrq-notes-time
+#zrq-notes-indent
+#zrq-notes-crypto
+#zrq-notes-ansible
+#zrq-notes-osformat
+#zrq-notes-zeppelin
+#
+
+    Target:
+
+        Transfer work on the 20210206-zrq-working branch onto smaller task-specific branches.
+        We spent a while adding a mixture of changes to the working branch.
+        Needed to step back and commit the changes as separate task-specific PRs.
+
+    Result:
+
+        Work in progress ...
+
+# -----------------------------------------------------
+# Create a copy of the local working branch.
+#[user@desktop]
+
+    source "${HOME}/aglais.env"
+    pushd "${AGLAIS_HOME}"
+
+    pushd github-zrq
+
+    git add .
+    git commit -m "Adding everything to the working branch"
+    git push
+
+    popd
+
+    cp -a github-zrq github-working
+    mv github-zrq github-backup
+
+
+    popd
+
+# -----------------------------------------------------
+# Update the working copy with merged PRs from upstream.
+#[user@desktop]
+
+    source "${HOME}/aglais.env"
+    pushd "${AGLAIS_HOME}"
+
+    pushd github-working
+
+    git checkout master
+
+    > Switched to branch 'master'
+    > Your branch is up to date with 'origin/master'.
+
+    git pull
+
+    > Already up to date.
+
+    git fetch upstream
+
+    > remote: Enumerating objects: 7, done.
+    > remote: Counting objects: 100% (7/7), done.
+    > remote: Total 26 (delta 7), reused 7 (delta 7), pack-reused 19
+    > Unpacking objects: 100% (26/26), 45.43 KiB | 186.00 KiB/s, done.
+    > From github.com:wfau/aglais
+    >    7f642cd..01c7c74  master     -> upstream/master
+
+
+    git merge upstream/master
+
+    > Updating 7f642cd..01c7c74
+    > Fast-forward
+    >  experiments/hadoop-yarn/ansible/01-create-keypair.yml |  2 +-
+    >  experiments/hadoop-yarn/ansible/02-create-gateway.yml |  2 +-
+    >  ....
+    >  ....
+    >  create mode 100644 notes/zrq/20210205-02-resources.txt
+    >  create mode 100644 notes/zrq/20210206-01-git-cherry-pick.txt
+
+
+    git push
+
+    > Total 0 (delta 0), reused 0 (delta 0), pack-reused 0
+    > To github.com:Zarquan/aglais.git
+    >    01c7c74..f46bc2b  master -> master
+
+
+    git status
+
+    > On branch master
+    > Your branch is up to date with 'origin/master'.
+
+
+    popd
+    popd
+
+
+# -----------------------------------------------------
+# Delete merged branches.
+#[user@desktop]
+
+    source "${HOME}/aglais.env"
+    pushd "${AGLAIS_HOME}"
+
+        pushd github-working
+
+            git branch
+
+            >   20210113-zrq-source-build
+            >   20210125-zrq-format-notes
+            >   20210125-zrq-kubernetes-deploy
+            >   20210127-zrq-error-trap
+            >   20210127-zrq-oauth
+            >   20210127-zrq-working
+            >   20210205-zrq-deployname
+            >   20210205-zrq-error-trap
+            >   20210205-zrq-notes
+            >   20210205-zrq-testing
+            >   20210205-zrq-timeout
+            >   20210206-zrq-working
+            > * master
+
+            git branch -d 20210125-zrq-format-notes
+
+            > Deleted branch 20210125-zrq-format-notes (was fd1e449).
+
+            git branch -d 20210125-zrq-kubernetes-deploy
+
+            > Deleted branch 20210125-zrq-kubernetes-deploy (was 3ab3b55).
+
+            git branch -d 20210127-zrq-error-trap
+
+            > Deleted branch 20210127-zrq-error-trap (was 1b80704).
+
+            git branch -d 20210127-zrq-oauth
+
+            > Deleted branch 20210127-zrq-oauth (was d5af1da).
+
+            git branch -d 20210127-zrq-working
+
+            > warning: deleting branch '20210127-zrq-working' that has been merged to
+            >          'refs/remotes/origin/20210127-zrq-working', but not yet merged to HEAD.
+            >
+            > Deleted branch 20210127-zrq-working (was e12e24c).
+
+            git branch -d 20210205-zrq-deployname
+
+            > Deleted branch 20210205-zrq-deployname (was 64d0f2c).
+
+            git branch -d 20210205-zrq-error-trap
+
+            > Deleted branch 20210205-zrq-error-trap (was 1b80704).
+
+            git branch -d 20210205-zrq-notes
+
+            > Deleted branch 20210205-zrq-notes (was 9c73277).
+
+            git branch -d 20210205-zrq-testing
+
+            > Deleted branch 20210205-zrq-testing (was c148e78).
+
+            git branch -d 20210205-zrq-timeout
+
+            > Deleted branch 20210205-zrq-timeout (was 9c73277).
+
+            git branch
+
+            >   20210113-zrq-source-build
+            >   20210206-zrq-working
+            > * master
+
+        popd
+    popd
+
+
+# -----------------------------------------------------
+# Create a new branch for the gateway changes.
+#[user@desktop]
+
+    source "${HOME}/aglais.env"
+    pushd "${AGLAIS_HOME}"
+
+        pushd github-working
+
+            nextbranch=$(date '+%Y%m%d')-zrq-gateway
+
+            git checkout master
+
+            > Already on 'master'
+            > Your branch is up to date with 'origin/master'.
+
+            git checkout -b "${nextbranch:?}"
+
+            > Switched to a new branch '20210211-zrq-gateway'
+
+            git push --set-upstream origin "${nextbranch:?}"
+
+            > Total 0 (delta 0), reused 0 (delta 0), pack-reused 0
+            > remote:
+            > remote: Create a pull request for '20210211-zrq-gateway' on GitHub by visiting:
+            > remote:      https://github.com/Zarquan/aglais/pull/new/20210211-zrq-gateway
+            > remote:
+            > To github.com:Zarquan/aglais.git
+            >  * [new branch]      20210211-zrq-gateway -> 20210211-zrq-gateway
+            > Branch '20210211-zrq-gateway' set up to track remote branch '20210211-zrq-gateway' from 'origin'.
+
+        popd
+    popd
+
+
+# -----------------------------------------------------
+# Transfer the changes to remove the gateway node.
+#[user@desktop]
+
+    source "${HOME}/aglais.env"
+    pushd "${AGLAIS_HOME}"
+
+        meld github-backup github-working &
+
+        pushd github-working
+
+            meld . &
+
+            git branch
+
+            >   20210113-zrq-source-build
+            >   20210206-zrq-working
+            > * 20210211-zrq-gateway
+            >   master
+
+            git add .
+
+            git commit -m "Removed gateway node"
+
+            > [20210211-zrq-gateway ffe2137] Removed gateway node
+            >  14 files changed, 58 insertions(+), 61 deletions(-)
+
+            git push
+
+            > Enumerating objects: 39, done.
+            > Counting objects: 100% (39/39), done.
+            > Delta compression using up to 4 threads
+            > Compressing objects: 100% (20/20), done.
+            > Writing objects: 100% (20/20), 1.84 KiB | 470.00 KiB/s, done.
+            > Total 20 (delta 17), reused 0 (delta 0), pack-reused 0
+            > remote: Resolving deltas: 100% (17/17), completed with 16 local objects.
+            > To github.com:Zarquan/aglais.git
+            >    f46bc2b..ffe2137  20210211-zrq-gateway -> 20210211-zrq-gateway
+
+        popd
+    popd
+
+
+# -----------------------------------------------------
+# Create a new branch with a fix to delete-all.
+# Note - this branch follows on from the previous branch, carrying forward the changes.
+#[user@desktop]
+
+    source "${HOME}/aglais.env"
+    pushd "${AGLAIS_HOME}"
+
+        pushd github-working
+
+            git status
+
+            > On branch 20210211-zrq-gateway
+            > Your branch is up to date with 'origin/20210211-zrq-gateway'.
+
+            nextbranch=$(date '+%Y%m%d')-zrq-delete-fix
+
+            git checkout -b "${nextbranch:?}"
+
+            git push --set-upstream origin "${nextbranch:?}"
+
+            meld ../github-backup . &
+
+            git status
+
+            git add experiments/openstack/bin/delete-all.sh
+            git commit -m "Fix to catch all the keys created by create-all"
+
+            git add notes/zrq/20210206-01-git-cherry-pick.txt
+            git commit -m "Finish notes on cherry picking"
+
+            git push
+
+        popd
+    popd
+
+
+# -----------------------------------------------------
+# Create a new branch to move the Hadoop and Spark vars into the hosts file.
+# Note - this branch follows on from the previous branch, carrying forward the changes.
+#[user@desktop]
+
+    source "${HOME}/aglais.env"
+    pushd "${AGLAIS_HOME}"
+
+        pushd github-working
+
+            git status
+
+            > On branch 20210211-zrq-delete-fix
+            > Your branch is up to date with 'origin/20210211-zrq-delete-fix'.
+
+            nextbranch=$(date '+%Y%m%d')-zrq-move-vars
+
+            git checkout -b "${nextbranch:?}"
+
+            git push --set-upstream origin "${nextbranch:?}"
+
+            meld ../github-backup . &
+
+            git status
+
+            git add .
+
+            git commit -m "Moved Hadoop, Spark and Zeppelin vars into hosts.yml"
+
+            > [20210211-zrq-move-vars 2432401] Moved Hadoop, Spark and Zeppelin vars into hosts.yml
+            >  11 files changed, 50 insertions(+), 71 deletions(-)
+
+            git push
+
+        popd
+    popd
+
+
+# -----------------------------------------------------
+# Create a new branch to fix the issue with Fedora updates.
+# Note - this branch follows on from the previous branch, carrying forward the changes.
+#[user@desktop]
+
+    source "${HOME}/aglais.env"
+    pushd "${AGLAIS_HOME}"
+
+        pushd github-working
+
+            git status
+
+            > On branch 20210211-zrq-move-vars
+            > Your branch is up to date with 'origin/20210211-zrq-move-vars'.
+
+            nextbranch=$(date '+%Y%m%d')-zrq-fedora-updates
+
+            git checkout -b "${nextbranch:?}"
+
+            git push --set-upstream origin "${nextbranch:?}"
+
+            meld ../github-backup . &
+
+            git status
+
+            git add .
+
+            git commit -m "Fix a problem with Fedora updates"
+
+            git push
+
+        popd
+    popd
+
+
+# -----------------------------------------------------
+# Create a new branch to add misc notes.
+# Note - this branch follows on from the master branch, nothing to carry forward.
+#[user@desktop]
+
+    source "${HOME}/aglais.env"
+    pushd "${AGLAIS_HOME}"
+
+        pushd github-working
+
+            git checkout master
+
+            nextbranch=$(date '+%Y%m%d')-zrq-notes
+
+            git checkout -b "${nextbranch:?}"
+
+            meld ../github-backup . &
+
+            git status
+
+            git add .
+
+            git commit -m "Added new notes"
+
+            git push --set-upstream origin "${nextbranch:?}"
+
+        popd
+    popd
+
+
+# -----------------------------------------------------
+# Create a new branch to add misc notes.
+# Note - this branch follows on from a previous branch, carrying forward the changes.
+#[user@desktop]
+
+    source "${HOME}/aglais.env"
+    pushd "${AGLAIS_HOME}"
+
+        pushd github-working
+
+            git checkout 20210211-zrq-fedora-updates
+
+            nextbranch=$(date '+%Y%m%d')-zrq-volume-mounts
+
+            git checkout -b "${nextbranch:?}"
+
+            meld ../github-backup . &
+
+            git status
+
+            meld . &
+
+            git add .
+
+            git commit -m "Volume mounts for temp space"
+
+            git push --set-upstream origin "${nextbranch:?}"
+
+        popd
+    popd
+
From fb30bea8e69af97888df3451606f784563722b90 Mon Sep 17 00:00:00 2001
From: "zrq-github@metagrid.co.uk" <zrq-github@metagrid.co.uk>
Date: Sat, 13 Feb 2021 06:36:06 +0000
Subject: [PATCH 08/27] Configuring Hadoop and HDFS directories

---
 .../ansible/12-config-hadoop-core.yml         | 13 ++++++++++++
 .../ansible/12-config-ssh-access.yml          |  7 -------
 .../ansible/13-config-hdfs-namenode.yml       | 12 +++++------
 .../ansible/14-config-hdfs-workers.yml        | 11 ++++++++--
 experiments/hadoop-yarn/ansible/hosts.yml     | 20 +++++++++++--------
 .../ansible/tasks/create-linked.yml           |  2 --
 6 files changed, 40 insertions(+), 25 deletions(-)

diff --git a/experiments/hadoop-yarn/ansible/12-config-hadoop-core.yml b/experiments/hadoop-yarn/ansible/12-config-hadoop-core.yml
index 2c4e50ad..aa6e6693 100644
--- a/experiments/hadoop-yarn/ansible/12-config-hadoop-core.yml
+++ b/experiments/hadoop-yarn/ansible/12-config-hadoop-core.yml
@@ -26,6 +26,13 @@

   tasks:

+    - name: "Create Hadoop temp directory"
+      include_tasks: "tasks/create-linked.yml"
+      vars:
+        linkdest: "{{hdtempdest}}"
+        linkpath: "{{hdtemplink}}"
+        linkuser: "{{hduser}}"
+
     # https://hadoop.apache.org/docs/r3.2.1/hadoop-project-dist/hadoop-common/ClusterSetup.html#Configuring_the_Hadoop_Daemons
     # https://hadoop.apache.org/docs/r3.2.1/hadoop-project-dist/hadoop-common/core-default.html
     - name: "Configure [{{hdhome}}/etc/hadoop/core-site.xml]"
@@ -51,4 +58,10 @@
             <value>hdfs://{{hdhost}}:9000</value>
+
+          <property>
+            <name>hadoop.tmp.dir</name>
+            <value>{{hdtemplink}}</value>
+          </property>
+

diff --git a/experiments/hadoop-yarn/ansible/12-config-ssh-access.yml b/experiments/hadoop-yarn/ansible/12-config-ssh-access.yml
index 7fd83bca..20a372f0 100644
--- a/experiments/hadoop-yarn/ansible/12-config-ssh-access.yml
+++ b/experiments/hadoop-yarn/ansible/12-config-ssh-access.yml
@@ -102,13 +102,6 @@
 - name: "Configure Hadoop [workers] on master nodes"
   hosts: masters:zeppelin
   gather_facts: false
-  vars:
-    hdname: "hadoop-3.1.3"
-    hdbase: "/opt"
-    hdhome: "/opt/hadoop"
-    hddata: "/var/local/hadoop"
-    hdhost: "{{groups['masters'][0]}}"
-    hduser: "{{hostvars[inventory_hostname].login}}"

   tasks:

diff --git a/experiments/hadoop-yarn/ansible/13-config-hdfs-namenode.yml b/experiments/hadoop-yarn/ansible/13-config-hdfs-namenode.yml
index 7bfe899b..390d6656 100644
--- a/experiments/hadoop-yarn/ansible/13-config-hdfs-namenode.yml
+++ b/experiments/hadoop-yarn/ansible/13-config-hdfs-namenode.yml
@@ -33,7 +33,7 @@
       vars:
         linkdest: "{{hdfsmetadest}}"
        linkpath: "{{hdfsmetalink}}"
-        linkuser: "{{hduser}}"
+        linkuser: "{{hdfsuser}}"

     - name: "Create [{{hdfsimage}}]"
       become: true
@@ -42,8 +42,8 @@
         mode: 'u=rwx,g=rwxs,o=rx'
         state: directory
         recurse: yes
-        owner: "{{hduser}}"
-        group: "{{hduser}}"
+        owner: "{{hdfsuser}}"
+        group: "{{hdfsuser}}"

     # https://hadoop.apache.org/docs/r3.2.1/hadoop-project-dist/hadoop-common/ClusterSetup.html#Configuring_the_Hadoop_Daemons
     # https://hadoop.apache.org/docs/r3.2.1/hadoop-project-dist/hadoop-hdfs/hdfs-default.xml
     - name: "Configure [{{hdhome}}/etc/hadoop/hdfs-site.xml]"
@@ -56,7 +56,7 @@
       block: |
@@ -87,7 +87,7 @@
@@ -117,7 +117,7 @@
             <name>dfs.client.use.datanode.hostname</name>
             <value>true</value>
-
+
             <name>dfs.datanode.use.datanode.hostname</name>
            <value>true</value>

diff --git a/experiments/hadoop-yarn/ansible/14-config-hdfs-workers.yml b/experiments/hadoop-yarn/ansible/14-config-hdfs-workers.yml
index c3dd25d9..1842c580 100644
--- a/experiments/hadoop-yarn/ansible/14-config-hdfs-workers.yml
+++ b/experiments/hadoop-yarn/ansible/14-config-hdfs-workers.yml
@@ -31,7 +31,14 @@
       vars:
         linkdest: "{{hdfsdatadest}}"
         linkpath: "{{hdfsdatalink}}"
-        linkuser: "{{hduser}}"
+        linkuser: "{{hdfsuser}}"
+
+    - name: "Create HDFS logs directory"
+      include_tasks: "tasks/create-linked.yml"
+      vars:
+        linkdest: "{{hdfslogsdest}}"
+        linkpath: "{{hdfslogslink}}"
+        linkuser: "{{hdfsuser}}"

     # https://hadoop.apache.org/docs/r3.2.1/hadoop-project-dist/hadoop-common/ClusterSetup.html#Configuring_the_Hadoop_Daemons
     - name: "Configure [{{hdhome}}/etc/hadoop/hdfs-site.xml]"
@@ -46,7 +53,7 @@
            | If this is a comma-delimited list of directories, then data will be stored in all named directories, typically on different devices.
            | The directories should be tagged with corresponding storage types ([SSD]/[DISK]/[ARCHIVE]/[RAM_DISK]) for HDFS storage policies.
            | The default storage type will be DISK if the directory does not have a storage type tagged explicitly.
-           | Directories that do not exist will be created if local filesystem permission allows. 
+           | Directories that do not exist will be created if local filesystem permission allows.
            +-->
            <name>dfs.datanode.data.dir</name>

diff --git a/experiments/hadoop-yarn/ansible/hosts.yml b/experiments/hadoop-yarn/ansible/hosts.yml
index 9035f5f0..2c1804ef 100644
--- a/experiments/hadoop-yarn/ansible/hosts.yml
+++ b/experiments/hadoop-yarn/ansible/hosts.yml
@@ -61,12 +61,16 @@ all:
             hddatalink: "/var/hadoop/data"
             hddatadest: "/mnt/cinder/vdc/hadoop/data"

+            hdtemplink: "/var/hadoop/temp"
+            hdtempdest: "/mnt/local/vdb/hadoop/temp"
+
             hdlogslink: "/var/hadoop/logs"
             hdlogsdest: "/mnt/cinder/vdc/hadoop/logs"

             # HDFS vars
             hdfsconf: "/var/hdfs/conf"
+            hdfsuser: "fedora"

             hdfsmetalink: "/var/hdfs/meta"
             hdfsmetadest: "/mnt/cinder/vdc/hdfs/meta"
@@ -127,19 +131,19 @@ all:
     workers:
       hosts:
-        worker[01:06]:
+        worker[01:04]:
       vars:
         login: 'fedora'
         image: 'Fedora-30-1.2'
-        flavor: 'general.v1.small'
+        flavor: 'general.v1.medium'
         discs:
-#          - type: 'local'
-#            format: 'ext4'
-#            mntpath: "/mnt/local/vdb"
-#            devname: 'vdb'
+          - type: 'local'
+            format: 'ext4'
+            mntpath: "/mnt/local/vdb"
+            devname: 'vdb'
           - type: 'cinder'
             size: 512
             format: 'btrfs'
-            mntpath: "/mnt/cinder/vdb"
-            devname: 'vdb'
+            mntpath: "/mnt/cinder/vdc"
+            devname: 'vdc'

diff --git a/experiments/hadoop-yarn/ansible/tasks/create-linked.yml b/experiments/hadoop-yarn/ansible/tasks/create-linked.yml
index 1c137b86..1da47c38 100644
--- a/experiments/hadoop-yarn/ansible/tasks/create-linked.yml
+++ b/experiments/hadoop-yarn/ansible/tasks/create-linked.yml
@@ -26,7 +26,6 @@
         path: "{{linkdest | dirname}}"
         mode: 'u=rwx,g=rwxs,o=rx'
         state: directory
-        recurse: yes
         owner: 'root'
         group: 'root'
@@ -46,7 +45,6 @@
         path: "{{linkpath | dirname}}"
         mode: 'u=rwx,g=rwxs,o=rx'
         state: directory
-        recurse: yes
         owner: 'root'
         group: 'root'

From 7ba885d16c250a7c0191484a042af0cc849a210a Mon Sep 17 00:00:00 2001
From: "zrq-github@metagrid.co.uk" <zrq-github@metagrid.co.uk>
Date: Sat, 13 Feb 2021 06:37:24 +0000
Subject: [PATCH 09/27] Performance optimizations

---
 .../ansible/16-config-yarn-masters.yml        |   37 +-
 .../ansible/17-config-yarn-workers.yml        |   61 +-
 .../ansible/22-config-spark-master.yml        |   32 +-
 notes/zrq/20210211-03-ansible-deploy.txt      | 1016 +++++++++++++++++
 4 files changed, 1095 insertions(+), 51 deletions(-)
 create mode 100644 notes/zrq/20210211-03-ansible-deploy.txt
diff --git a/experiments/hadoop-yarn/ansible/16-config-yarn-masters.yml b/experiments/hadoop-yarn/ansible/16-config-yarn-masters.yml
index e5a9fa76..f2d3c00f 100644
--- a/experiments/hadoop-yarn/ansible/16-config-yarn-masters.yml
+++ b/experiments/hadoop-yarn/ansible/16-config-yarn-masters.yml
@@ -55,7 +55,7 @@
             <name>yarn.resourcemanager.hostname</name>
@@ -64,7 +64,7 @@
             <name>yarn.scheduler.maximum-allocation-mb</name>
-            <value>20000</value>
+            <value>43008</value>
-
-
-
-
-            <name>yarn.scheduler.maximum-allocation-mb</name>
-            <value>20000</value>
-
-
-
-
-            <name>yarn.scheduler.minimum-allocation-mb</name>
-            <value>2000</value>
+            <value>14336</value>
-
            +-->

     #
     # CapacityScheduler config.
@@ -161,7 +154,7 @@

diff --git a/experiments/hadoop-yarn/ansible/17-config-yarn-workers.yml b/experiments/hadoop-yarn/ansible/17-config-yarn-workers.yml
index 7490cdc4..a6330d76 100644
--- a/experiments/hadoop-yarn/ansible/17-config-yarn-workers.yml
+++ b/experiments/hadoop-yarn/ansible/17-config-yarn-workers.yml
@@ -54,30 +54,61 @@
             <name>yarn.resourcemanager.hostname</name>
             <value>{{hdhost}}</value>
-
+
             <name>yarn.scheduler.maximum-allocation-mb</name>
-            <value>20000</value>
+            <value>43008</value>
+
+
+
+            <name>yarn.scheduler.minimum-allocation-mb</name>
+            <value>14336</value>

             <name>yarn.nodemanager.resource.memory-mb</name>
-            <value>20000</value>
+            <value>43008</value>
+
+            <name>yarn.nodemanager.resource.cpu-vcores</name>
+            <value>13</value>
+
+
+
             <name>yarn.scheduler.maximum-allocation-vcores</name>
             <value>48</value>
@@ -89,8 +120,8 @@
             <name>yarn.nodemanager.aux-services</name>
@@ -141,10 +172,10 @@
            | HADOOP_CONF_DIR,
            | HADOOP_HDFS_HOME,
            | HADOOP_YARN_HOME,
-           | HADOOP_MAPRED_HOME, 
+           | HADOOP_MAPRED_HOME,
            | HADOOP_COMMON_HOME,
            | CLASSPATH_PREPEND_DISTCACHE
-           | 
+           |
             <name>yarn.nodemanager.env-whitelist</name>
@@ -159,7 +190,7 @@
             <value>false</value>
            +-->
-
+
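
    A quick sanity check on the numbers in this patch, using only the values
    quoted above (the flavor memory sizes themselves are not shown here):

        yarn.scheduler.minimum-allocation-mb   14336 = 14 x 1024
        yarn.scheduler.maximum-allocation-mb   43008 = 42 x 1024 = 3 x 14336
        yarn.nodemanager.resource.memory-mb    43008

    So each worker offers 42G of memory to Yarn, which is exactly three
    minimum-size 14G containers, and yarn.nodemanager.resource.cpu-vcores
    caps each worker at 13 cores.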