diff --git a/deployments/hadoop-yarn/bin/deploy.sh b/deployments/hadoop-yarn/bin/deploy.sh new file mode 100755 index 00000000..6a540f54 --- /dev/null +++ b/deployments/hadoop-yarn/bin/deploy.sh @@ -0,0 +1,139 @@ +#!/bin/bash +# +# +# Copyright (c) 2022, ROE (http://www.roe.ac.uk/) +# +# This information is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This information is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +# +# + +# ----------------------------------------------------- +# Delete everything. +#[root@ansibler] + + /deployments/openstack/bin/delete-all.sh \ + "${cloudname:?}" + + +# ----------------------------------------------------- +# Create everything. +# (*) apart from the user database. +#[root@ansibler] + + /deployments/hadoop-yarn/bin/create-all.sh \ + "${cloudname:?}" \ + "${configname:?}" \ + | tee /tmp/create-all.log + + +# ----------------------------------------------------- +# Create our shiro-auth database. +#[root@ansibler] + + /deployments/hadoop-yarn/bin/create-auth-database.sh \ + "${cloudname:?}" \ + "${configname:?}" \ + | tee /tmp/create-auth-database.log + + +# ----------------------------------------------------- +# Copy notebooks from the live server. +#[root@ansibler] + + ssh zeppelin \ + ' + sshuser=fedora + sshhost=zeppelin.aglais.uk + + sudo mkdir -p '/var/local/backups' + sudo mv "/home/fedora/zeppelin/notebook" \ + "/var/local/backups/notebook-$(date '+%Y%m%d%H%M%S')" + + ssh-keyscan "${sshhost:?}" >> "${HOME}/.ssh/known_hosts" + + rsync \ + --perms \ + --times \ + --group \ + --owner \ + --stats \ + --progress \ + --human-readable \ + --checksum \ + --recursive \ + "${sshuser:?}@${sshhost:?}:zeppelin/notebook/" \ + "/home/fedora/zeppelin/notebook" + ' + + + +# ----------------------------------------------------- +# re-start Zeppelin. +#[root@ansibler] + + ssh zeppelin \ + ' + zeppelin-daemon.sh restart + ' + + +# ----------------------------------------------------- +# Add the ssh key for our data node. +# This is used by the getpasshash function in the client container. +# TODO Add this to a client-setup.sh in ansible/client/bin. +#[root@ansibler] + + ssh-keyscan 'data.aglais.uk' 2>/dev/null >> "${HOME}/.ssh/known_hosts" + + +# ----------------------------------------------------- +# Get the IP address from the ssh config file. +# TODO Save the IP address during the deployment process. +#[root@ansibler] + + ipaddress=$( + + sed -n ' + /^Host zeppelin/,/^Host/ { + /HostName/ { + s/^[[:space:]]*HostName[[:space:]]\(.*\)/\1/ p + } + } + ' ~/.ssh/config + + ) + + +# ----------------------------------------------------- +# Add the Zeppelin IP address to our hosts file. +# TODO Add this to the Ansible deployment. +#[root@ansibler] + +cat >> /etc/hosts << EOF +# Zeppelin +${ipaddress} zeppelin +EOF + +# ----------------------------------------------------- +# Configure our client container. 
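+#
+# NOTE (sketch): the sed command in the section above pulls the HostName
+# value out of the "Host zeppelin" block in ~/.ssh/config. Assuming the
+# same file layout, an equivalent awk one-liner would be:
+#
+#   ipaddress=$(awk '/^Host zeppelin/ {flag=1} flag && /HostName/ {print $2; exit}' ~/.ssh/config)
+#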
+#[root@ansibler] + + dnf install -y git + + pip install git+https://github.com/wfau/aglais-testing@v0.2.3 + + + + diff --git a/notes/zrq/20220601-01-blue-deploy.txt b/notes/zrq/20220601-01-blue-deploy.txt new file mode 100644 index 00000000..5d490719 --- /dev/null +++ b/notes/zrq/20220601-01-blue-deploy.txt @@ -0,0 +1,188 @@ +# +# +# +# Copyright (c) 2022, ROE (http://www.roe.ac.uk/) +# +# This information is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This information is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +# +# +#zrq-notes-time +#zrq-notes-indent +#zrq-notes-crypto +#zrq-notes-ansible +#zrq-notes-osformat +#zrq-notes-zeppelin +# + + Target: + + Deployment used to run the concurrent tests. + + Names generated + https://www.name-generator.org.uk/ + + Result: + + Work in progress .. + +# ----------------------------------------------------- +# Create a container to work with. +#[user@desktop] + + source "${HOME:?}/aglais.env" + + podman run \ + --rm \ + --tty \ + --interactive \ + --name ansibler \ + --hostname ansibler \ + --publish 3000:3000 \ + --env "SSH_AUTH_SOCK=/mnt/ssh_auth_sock" \ + --volume "${SSH_AUTH_SOCK}:/mnt/ssh_auth_sock:rw,z" \ + --volume "${HOME:?}/clouds.yaml:/etc/openstack/clouds.yaml:ro,z" \ + --volume "${AGLAIS_CODE:?}/deployments:/deployments:ro,z" \ + ghcr.io/wfau/atolmis/ansible-client:2022.03.19 \ + bash + + +# ----------------------------------------------------- +# Set the target configuration. +#[root@ansibler] + + cloudbase='arcus' + cloudname='iris-gaia-blue' + configname=zeppelin-54.86-spark-6.26.43 + + +# ----------------------------------------------------- +# Delete everything. +#[root@ansibler] + + time \ + /deployments/openstack/bin/delete-all.sh \ + "${cloudname:?}" + + +# ----------------------------------------------------- +# Create everything. +# (*) apart from the user database. +#[root@ansibler] + + time \ + /deployments/hadoop-yarn/bin/create-all.sh \ + "${cloudname:?}" \ + "${configname:?}" \ + | tee /tmp/create-all.log + + +# ----------------------------------------------------- +# Create our shiro-auth database. +#[root@ansibler] + + time \ + /deployments/hadoop-yarn/bin/create-auth-database.sh \ + "${cloudname:?}" \ + "${configname:?}" \ + | tee /tmp/create-auth-database.log + + +# ----------------------------------------------------- +# Copy notebooks from the live server. +#[root@ansibler] + + ssh zeppelin \ + ' + sshuser=fedora + sshhost=zeppelin.aglais.uk + + sudo mkdir -p '/var/local/backups' + sudo mv "/home/fedora/zeppelin/notebook" \ + "/var/local/backups/notebook-$(date '+%Y%m%d%H%M%S')" + + ssh-keyscan "${sshhost:?}" >> "${HOME}/.ssh/known_hosts" + + rsync \ + --perms \ + --times \ + --group \ + --owner \ + --stats \ + --progress \ + --human-readable \ + --checksum \ + --recursive \ + "${sshuser:?}@${sshhost:?}:zeppelin/notebook/" \ + "/home/fedora/zeppelin/notebook" + ' + + +# ----------------------------------------------------- +# re-start Zeppelin. 
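+# NOTE (sketch): a --dry-run pass with the same rsync flags would show
+# what the notebook copy above is going to transfer before committing:
+#
+#   rsync --dry-run --checksum --recursive \
+#       "${sshuser:?}@${sshhost:?}:zeppelin/notebook/" \
+#       "/home/fedora/zeppelin/notebook"
+#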
+#[root@ansibler] + + ssh zeppelin \ + ' + zeppelin-daemon.sh restart + ' + + +# ----------------------------------------------------- +# Add the ssh key for our data node. +# This is used by the getpasshash function in the client container. +# TODO Add this to a client-setup.sh in ansible/client/bin. +#[root@ansibler] + + ssh-keyscan 'data.aglais.uk' 2>/dev/null >> "${HOME}/.ssh/known_hosts" + + +# ----------------------------------------------------- +# Get the IP address from the ssh config file. +# TODO Save the IP address during the deployment process. +#[root@ansibler] + + ipaddress=$( + + sed -n ' + /^Host zeppelin/,/^Host/ { + /HostName/ { + s/^[[:space:]]*HostName[[:space:]]\(.*\)/\1/ p + } + } + ' ~/.ssh/config + + ) + + +# ----------------------------------------------------- +# Add the Zeppelin IP address to our hosts file. +# TODO Add this to the Ansible deployment. +#[root@ansibler] + +cat >> /etc/hosts << EOF +# Zeppelin +${ipaddress} zeppelin +EOF + +# ----------------------------------------------------- +# Configure out client container. +#[root@ansibler] + + dnf install -y git + + pip install git+https://github.com/wfau/aglais-testing@v0.2.2 + + + diff --git a/notes/zrq/20220601-02-concurrent-tests.txt b/notes/zrq/20220601-02-concurrent-tests.txt new file mode 100644 index 00000000..756e3aae --- /dev/null +++ b/notes/zrq/20220601-02-concurrent-tests.txt @@ -0,0 +1,2663 @@ +# +# +# +# Copyright (c) 2022, ROE (http://www.roe.ac.uk/) +# +# This information is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This information is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +# +# +#zrq-notes-time +#zrq-notes-indent +#zrq-notes-crypto +#zrq-notes-ansible +#zrq-notes-osformat +#zrq-notes-zeppelin +# + + Target: + + Try to find out more about the limits on concurrent users. + Started with a clean deployment 20220601-01-blue-deploy.txt + + Result: + + Work in progress ... + +# ----------------------------------------------------- +# Create some test users. +# TODO Move the create-user-tools to ansible/client/bin. +# TODO Add ansible/client/bin to the client PATH. +#[root@ansibler] + + source /deployments/zeppelin/bin/create-user-tools.sh + + testnames01=( + Rhaelhall + Fipa + Mythicson + Balline + Hiness + Anskelisia + Iflee + Mischiellis + Kellaug + Liphima + Jarters + Williazoga + Carrovieus + Pierione + Hayesphasia + Collinotter + Adazoga + Harinabla + Sanderlotus + Bellgrin + ) + + testnames02=( + Hamar + Carclop + Halda + Jaden + Mavaca + Franilley + Masonania + Webbbron + Granwaler + Stelama + ) + + testnames03=( + Smical + Reyesfan + Evison + Surbron + Floresslight + ) + + createarrayusers \ + "${testnames01[@]}" \ + | tee /tmp/testusers-01.json \ + | jq '[ .users[] | {"name": .shirouser.name, "pass": .shirouser.pass} ]' + + > [ + > { + > "name": "Rhaelhall", + > "pass": "ea8aiqu1liubachohthahwieh3ko1O" + > } + > { + > "name": "Fipa", + > "pass": "eigheiZoo9Mei4fereim1ahp3weu1E" + > } + > .... + > .... 
+ > ] + + + createarrayusers \ + "${testnames02[@]}" \ + | tee /tmp/testusers-02.json \ + | jq '[ .users[] | {"name": .shirouser.name, "pass": .shirouser.pass} ]' + + > [ + > { + > "name": "Hamar", + > "pass": "fausaimuugh8ue3aNowaig4uozes6o" + > }, + > { + > "name": "Carclop", + > "pass": "aeX8Xie9oozeasiehoh4pheeyahliC" + > }, + > .... + > .... + > ] + + + createarrayusers \ + "${testnames03[@]}" \ + | tee /tmp/testusers-03.json \ + | jq '[ .users[] | {"name": .shirouser.name, "pass": .shirouser.pass} ]' + + > [ + > { + > "name": "Smical", + > "pass": "roh1ohpaeYohY4hiequeiseiMoh0ah" + > }, + > { + > "name": "Reyesfan", + > "pass": "eeyah5iegeis5ne6ohPh4hagaiduk8" + > }, + > .... + > .... + > ] + + +# ----------------------------------------------------- +# Create our benchmark script. +# TODO Create run-benchmark.py in ansible/client/bin. +# Learning Python: +# Command line args +# https://realpython.com/python-command-line-arguments/ +# String.format() +# https://docs.python.org/3/library/string.html#formatstrings +# Escape {} in format() +# https://stackoverflow.com/a/5466478 +#[root@ansibler] + + cat > /tmp/run-benchmark.py << 'EOF' +#!/bin/python3 +import sys +from aglais_benchmark import AglaisBenchmarker + +try: + + opts = [opt for opt in sys.argv[1:] if opt.startswith("-")] + args = [arg for arg in sys.argv[1:] if not arg.startswith("-")] + + endpoint = args[0] + testconfig = args[1] + userlist = args[2] + usercount = int(args[3]) + +except IndexError: + + raise SystemExit(f"Usage: {sys.argv[0]} ") + +print( +""" +{{ +\"config\": {{ + \"endpoint\": \"{}\", + \"testconfig\": \"{}\", + \"userlist\": \"{}\", + \"usercount\": \"{}\" + }} +}} +""".format( + endpoint, + testconfig, + userlist, + usercount + ) + ) + +AglaisBenchmarker( + testconfig, + userlist, + "/tmp/", + endpoint + ).run( + concurrent=True, + users=usercount + ) + +EOF + + chmod 'a+x' /tmp/run-benchmark.py + + +# ----------------------------------------------------- +# Run a quick test with one user. +#[root@ansibler] + + mkdir /tmp/results + + endpoint="http://zeppelin:8080" + + testconfig=/deployments/zeppelin/test/config/quick.json + + testusers=/tmp/testusers-01.json + testname=single-user-01 + usercount=1 + + /tmp/run-benchmark.py \ + "${endpoint:?}" \ + "${testconfig:?}" \ + "${testusers:?}" \ + "${usercount:?}" \ + | tee "/tmp/results/${testname:?}.txt" + + + > Test started [Multi User] + > b'Create notebook: 2H667AYP5\n' + > b'Create notebook: 2H5A6XXYR\n' + > b'Create notebook: 2H6QYSP33\n' + > b'Create notebook: 2H47M43R1\n' + > Test completed! (47.49 seconds) + > ------------ Test Result: [PASS] ------------ + > [{'GaiaDMPSetup': { .... }}] + + sed " + 0,/^----/ d + s/\"/#/g + s/'\(-\{0,1\}[0-9.]\{1,\}\)'/\1/g + s/:[[:space:]]*\([a-zA-Z]\{1,\}\)\([,}]\)/:'\1'\2/g + s/:[[:space:]]*\([,}]\),/: ''\1/g + s/'/\"/g + " \ + "/tmp/results/${testname:?}.txt" \ + | tee "/tmp/results/${testname:?}.json" \ + | jq ' + .[] | keys as $x | [ $x[] as $y | {name: $y, value: .[$y].result, time: .[$y].time.elapsed } ] + ' + + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "PASS", + > "time": 3.63 + > }, + > { + > "name": "Library_Validation.json", + > "value": "PASS", + > "time": 6.80 + > }, + > { + > "name": "Mean_proper_motions_over_the_sky", + > "value": "PASS", + > "time": 15.03 + > }, + > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 22.01 + > } + > ] + + +# ----------------------------------------------------- +# Run the same test 10 times .. 
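+# NOTE (sketch): the sed stanza above is a rough Python-repr to JSON
+# converter: it deletes everything up to the "----" banner, protects
+# embedded double quotes, unquotes numbers, quotes bare words, and then
+# swaps the remaining single quotes for double quotes. Assuming the
+# result line is a plain Python literal, the same conversion could be
+# done in one step with:
+#
+#   python3 -c 'import ast, json, sys; print(json.dumps(ast.literal_eval(sys.stdin.read())))'
+#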
+#[root@ansibler] + + usercount=1 + + for i in $(seq 0 9) + do + echo "" + echo "-----------------------------------------------------" + echo "Loop [${i}]" + testname="single-user-0${i}" + + /tmp/run-benchmark.py \ + "${endpoint:?}" \ + "${testconfig:?}" \ + "${testusers:?}" \ + "${usercount:?}" \ + | tee "/tmp/results/${testname:?}.txt" + done + + > ----------------------------------------------------- + > Loop [0] + > .... + > .... + > + > ----------------------------------------------------- + > Loop [8] + > ------------ Test Result: [PASS] ------------ + > [{'GaiaDMPSetup': { .... }}] + > .... + > ----------------------------------------------------- + > Loop [9] + > ------------ Test Result: [PASS] ------------ + > [{'GaiaDMPSetup': { .... }}] + + + +# ----------------------------------------------------- +# Try 2 concurrent users 10 times .. +#[root@ansibler] + + usercount=2 + loopcount=10 + + for i in $(seq 0 $((loopcount - 1))) + do + echo "" + echo "-----------------------------------------------------" + echo "Loop [${i}]" + testname="multi-user-$(printf "%02d" ${usercount})-$(printf "%02d" ${i})" + + /tmp/run-benchmark.py \ + "${endpoint:?}" \ + "${testconfig:?}" \ + "${testusers:?}" \ + "${usercount:?}" \ + | tee "/tmp/results/${testname:?}.txt" + done + + > .... + > .... + > ----------------------------------------------------- + > Loop [9] + > + > { + > "config": { + > "endpoint": "http://128.232.222.52:8080", + > "testconfig": "/deployments/zeppelin/test/config/quick.json", + > "userlist": "/tmp/testusers-01.json", + > "usercount": "2" + > } + > } + > + > /tmp/testusers-01.json + > Test started [Multi User] + > b'Create notebook: 2H48AHMJT\n' + > b'Create notebook: 2H7B1YN8S\n' + > b'Create notebook: 2H7E5N83J\n' + > b'Create notebook: 2H48UEFJY\n' + > b'Create notebook: 2H4872JVA\n' + > b'Create notebook: 2H6UWG925\n' + > b'Create notebook: 2H5ABEHER\n' + > b'Create notebook: 2H79XX5BV\n' + > Test completed! (28.12 seconds) + > ------------ Test Result: [PASS] ------------ + > [{'GaiaDMPSetup': { .... }}] + > + + + grep 'Test Result:' /tmp/results/multi-user-* + + > /tmp/results/multi-user-02-00.txt:------------ Test Result: [PASS] ------------ + > /tmp/results/multi-user-02-01.txt:------------ Test Result: [PASS] ------------ + > /tmp/results/multi-user-02-02.txt:------------ Test Result: [PASS] ------------ + > /tmp/results/multi-user-02-03.txt:------------ Test Result: [PASS] ------------ + > /tmp/results/multi-user-02-04.txt:------------ Test Result: [PASS] ------------ + > /tmp/results/multi-user-02-05.txt:------------ Test Result: [PASS] ------------ + > /tmp/results/multi-user-02-06.txt:------------ Test Result: [PASS] ------------ + > /tmp/results/multi-user-02-07.txt:------------ Test Result: [PASS] ------------ + > /tmp/results/multi-user-02-08.txt:------------ Test Result: [PASS] ------------ + > /tmp/results/multi-user-02-09.txt:------------ Test Result: [PASS] ------------ + + +# ----------------------------------------------------- +# Try 4 concurrent users 10 times .. +#[root@ansibler] + + usercount=4 + loopcount=10 + + for i in $(seq 0 $((loopcount - 1))) + do + echo "" + echo "-----------------------------------------------------" + echo "Loop [${i}]" + testname="multi-user-$(printf "%02d" ${usercount})-$(printf "%02d" ${i})" + + /tmp/run-benchmark.py \ + "${endpoint:?}" \ + "${testconfig:?}" \ + "${testusers:?}" \ + "${usercount:?}" \ + | tee "/tmp/results/${testname:?}.txt" + done + + > .... + > .... 
+ + > ----------------------------------------------------- + > Loop [1] + > .... + > Test started [Multi User] + > ERROR:root:list index out of range + > Traceback (most recent call last): + > File "/usr/local/lib/python3.10/site-packages/aglais_benchmark/aglais_benchmark.py", line 114, in run_notebook + > notebookid = text.split(": ")[1] + > IndexError: list index out of range + > b'Create notebook: 2H4XXB4A7\n' + > b'status_code:500\n' + > b'Create notebook: 2H4YU62GV\n' + > b'Create notebook: 2H48C7WRT\n' + > .... + > ------------ Test Result: [FAIL] ------------ + > .... + + > .... + > .... + + > .... + > .... + > ----------------------------------------------------- + > Loop [9] + > + > { + > "config": { + > "endpoint": "http://128.232.222.52:8080", + > "testconfig": "/deployments/zeppelin/test/config/quick.json", + > "userlist": "/tmp/testusers-01.json", + > "usercount": "4" + > } + > } + > + > /tmp/testusers-01.json + > Test started [Multi User] + > b'Create notebook: 2H76YPGQV\n' + > b'Create notebook: 2H654MVKV\n' + > b'Create notebook: 2H3WM2J3N\n' + > b'Create notebook: 2H4MZFKHJ\n' + > b'Create notebook: 2H77GBM2S\n' + > b'Create notebook: 2H4E6Z8A2\n' + > b'Create notebook: 2H5Q6NDGG\n' + > b'Create notebook: 2H55VSVK6\n' + > b'Create notebook: 2H58T3VGV\n' + > b'Create notebook: 2H4RJ9Q8U\n' + > b'Create notebook: 2H5QY7JHT\n' + > b'Create notebook: 2H43MEC8Z\n' + > b'Create notebook: 2H5KK6RNU\n' + > b'Create notebook: 2H6ETAKCK\n' + > b'Create notebook: 2H5VX7EMJ\n' + > b'Create notebook: 2H51QAZ4D\n' + > Test completed! (59.38 seconds) + > ------------ Test Result: [PASS] ------------ + > [{'GaiaDMPSetup': { .... }}] + + + sed " + 0,/^----/ d + s/\"/#/g + s/'\(-\{0,1\}[0-9.]\{1,\}\)'/\1/g + s/:[[:space:]]*\([a-zA-Z]\{1,\}\)\([,}]\)/:'\1'\2/g + s/:[[:space:]]*\([,}]\),/: ''\1/g + s/'/\"/g + " \ + "/tmp/results/${testname:?}.txt" \ + | tee "/tmp/results/${testname:?}.json" \ + | jq ' + .[] | keys as $x | [ $x[] as $y | {name: $y, value: .[$y].result, time: .[$y].time.elapsed } ] + ' + + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "PASS", + > "time": 3.62 + > }, + > { + > "name": "Library_Validation.json", + > "value": "PASS", + > "time": 5.61 + > }, + > { + > "name": "Mean_proper_motions_over_the_sky", + > "value": "PASS", + > "time": 6.91 + > }, + > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 17.44 + > } + > ] + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "PASS", + > "time": 3.64 + > }, + > { + > "name": "Library_Validation.json", + > "value": "PASS", + > "time": 5.66 + > }, + > { + > "name": "Mean_proper_motions_over_the_sky", + > "value": "PASS", + > "time": 5.83 + > }, + > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 19.37 + > } + > ] + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "PASS", + > "time": 3.64 + > }, + > { + > "name": "Library_Validation.json", + > "value": "PASS", + > "time": 6.49 + > }, + > { + > "name": "Mean_proper_motions_over_the_sky", + > "value": "PASS", + > "time": 5.78 + > }, + > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 43.44 + > } + > ] + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "PASS", + > "time": 3.61 + > }, + > { + > "name": "Library_Validation.json", + > "value": "PASS", + > "time": 6.75 + > }, + > { + > "name": "Mean_proper_motions_over_the_sky", + > "value": "PASS", + > "time": 5.85 + > }, + > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 19.37 + > } + > ] + + # 
# We are seeing failures in the notebook create steps, but the concurrent execution part works.
+    # Logged a couple of issues about creating multiple notebooks.
+    #
+    # Add error handling and diagnostics to create notebook step in AglaisBenchmarker
+    # https://github.com/wfau/aglais/issues/741
+    #
+    # Update AglaisBenchmarker to use existing notebooks
+    # https://github.com/wfau/aglais/issues/742
+    #
+
+    #
+    # As far as I know we have not seen errors during the create-user clone-notebook steps.
+    # Does this suggest we get errors when multiple users try to create notebooks at the same time?
+    #
+
+
+    grep 'Test Result:' /tmp/results/multi-user-04-*
+
+    > /tmp/results/multi-user-04-00.txt:------------ Test Result: [PASS] ------------
+    > /tmp/results/multi-user-04-01.txt:------------ Test Result: [FAIL] ------------
+    > /tmp/results/multi-user-04-02.txt:------------ Test Result: [FAIL] ------------
+    > /tmp/results/multi-user-04-03.txt:------------ Test Result: [FAIL] ------------
+    > /tmp/results/multi-user-04-04.txt:------------ Test Result: [PASS] ------------
+    > /tmp/results/multi-user-04-05.txt:------------ Test Result: [PASS] ------------
+    > /tmp/results/multi-user-04-06.txt:------------ Test Result: [PASS] ------------
+    > /tmp/results/multi-user-04-07.txt:------------ Test Result: [PASS] ------------
+    > /tmp/results/multi-user-04-08.txt:------------ Test Result: [PASS] ------------
+    > /tmp/results/multi-user-04-09.txt:------------ Test Result: [PASS] ------------
+
+
+    less /tmp/results/multi-user-04-01.txt
+
+    > ....
+    > Test started [Multi User]
+    > b'Create notebook: 2H4XXB4A7\n'
+    > b'status_code:500\n'
+    > b'Create notebook: 2H4YU62GV\n'
+    > b'Create notebook: 2H48C7WRT\n'
+    > b'Create notebook: 2H5HD1MFX\n'
+    > b'Create notebook: 2H4PCT4A8\n'
+    > b'Create notebook: 2H3WXBYAN\n'
+    > b'Create notebook: 2H69U63WE\n'
+    > b'Create notebook: 2H4PC8MUP\n'
+    > b'Create notebook: 2H46Y6415\n'
+    > b'Create notebook: 2H4YUSQQJ\n'
+    > b'Create notebook: 2H3VK8W99\n'
+    > b'Create notebook: 2H56SNMKC\n'
+    > b'Create notebook: 2H6MRPMZW\n'
+    > b'Create notebook: 2H74DYMXV\n'
+    > b'Create notebook: 2H5QM4HDU\n'
+    > Test completed! (119.43 seconds)
+    > ....
+
+
+    less /tmp/results/multi-user-04-02.txt
+
+    > ....
+    > Test started [Multi User]
+    > b'Create notebook: 2H4CX7YR7\n'
+    > b'Create notebook: 2H6UZJZSF\n'
+    > b'Create notebook: 2H4PA56Y8\n'
+    > b'Create notebook: 2H78GH1FT\n'
+    > b'Create notebook: 2H7CYF8BB\n'
+    > b'Create notebook: 2H4C2EAWZ\n'
+    > b'Create notebook: 2H3W9114F\n'
+    > b'Create notebook: 2H52F6BT9\n'
+    > b'Create notebook: 2H3RE238N\n'
+    > b'status_code:500\n'
+    > b'Create notebook: 2H6WAKH5P\n'
+    > b'Create notebook: 2H4PADFB6\n'
+    > b'Create notebook: 2H41T5HZW\n'
+    > b'Create notebook: 2H67DY58X\n'
+    > b'Create notebook: 2H5RDZZG5\n'
+    > b'Create notebook: 2H43H533Z\n'
+    > Test completed! (94.81 seconds)
+    > ....
+
+
+    less /tmp/results/multi-user-04-03.txt
+
+    > ....
+ > Test started [Multi User] + > b'Create notebook: 2H5W73K4Z\n' + > b'Create notebook: 2H7B8P9H8\n' + > b'Create notebook: 2H7DN6VFA\n' + > b'Create notebook: 2H49EM5AU\n' + > b'Create notebook: 2H623G7KT\n' + > b'Create notebook: 2H3PJHDJU\n' + > b'Create notebook: 2H4C9BQD6\n' + > b'Create notebook: 2H7FU2MRR\n' + > b'Create notebook: 2H49YTWRJ\n' + > b'Create notebook: 2H4DKNTFB\n' + > b'Create notebook: 2H5UTD5EM\n' + > b'Create notebook: 2H45NAVKB\n' + > b'Create notebook: 2H5MXKQDP\n' + > b'Create notebook: 2H6TM5FS7\n' + > b'Create notebook: 2H47V1E75\n' + > b'Create notebook: 2H49RA2C6\n' + > Test completed! (71.33 seconds) + > .... + + + testname=multi-user-04-01 + + sed " + 0,/^----/ d + s/\"/#/g + s/'\(-\{0,1\}[0-9.]\{1,\}\)'/\1/g + s/:[[:space:]]*\([a-zA-Z]\{1,\}\)\([,}]\)/:'\1'\2/g + s/:[[:space:]]*\([,}]\),/: ''\1/g + s/'/\"/g + " \ + "/tmp/results/${testname:?}.txt" \ + | tee "/tmp/results/${testname:?}.json" \ + | jq ' + .[] | keys as $x | [ $x[] as $y | {name: $y, value: .[$y].result, time: .[$y].time.elapsed } ] + ' + + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "PASS", + > "time": 3.86 + > }, + > { + > "name": "Library_Validation.json", + > "value": "PASS", + > "time": 6.43 + > }, + > { + > "name": "Mean_proper_motions_over_the_sky", + > "value": "PASS", + > "time": 19.41 + > }, + > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 78.24 + > } + > ] + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "PASS", + > "time": 3.82 + > }, + > { + > "name": "Library_Validation.json", + > "value": "PASS", + > "time": 6.52 + > }, + > { + > "name": "Mean_proper_motions_over_the_sky", + > "value": "PASS", + > "time": 5.92 + > }, + > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 10.30 + > } + > ] + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "PASS", + > "time": 3.88 + > }, + > { + > "name": "Library_Validation.json", + > "value": "PASS", + > "time": 6.49 + > }, + > { + > "name": "Mean_proper_motions_over_the_sky", + > "value": "PASS", + > "time": 19.42 + > }, + > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 89.61 + > } + > ] + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "PASS", + > "time": 3.91 + > }, + > { + > "name": "Library_Validation.json", + > "value": "PASS", + > "time": 6.54 + > }, + > { + > "name": "Mean_proper_motions_over_the_sky", + > "value": "FAIL", + > "time": 0.99 + > }, + > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 96.03 + > } + > ] + + # + # The 4th user fails on the proper_motions notebook. + # + + jq '.[3]' "/tmp/results/${testname:?}.json" + + > .... + > "Mean_proper_motions_over_the_sky": { + > "result": "FAIL", + > "outputs": { + > "valid": "True" + > }, + > "time": { + > "result": "FAST", + > "elapsed": 0.99, + > "expected": 55.00, + > "percent": -98.20, + > "start": "2022-06-01T14:06:33.174644", + > "finish": "2022-06-01T14:06:34.162608" + > }, + > "logs": "" + > }, + > .... + + # + # Nothing in the test results to indicate why it failed. + # + + # + # Nothing in the test output to show what user the test is using. + # Assume it is one of the first four users in the first list. 
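+    #
+    # NOTE (sketch): assuming the converted JSON layout shown earlier, a
+    # jq filter like this would pull out just the failed steps from a
+    # results file:
+    #
+    #   jq '.[] | to_entries[] | select(.value.result == "FAIL")
+    #           | {name: .key, elapsed: .value.time.elapsed}' \
+    #       "/tmp/results/${testname:?}.json"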
+ # + + jq ' + [ .users[0:4] | .[].shirouser | {name, pass} ] + ' /tmp/testusers-01.json \ + | tee /tmp/fourusers.json + + > [ + > { + > "name": "Rhaelhall", + > "pass": "ea8aiqu1liubachohthahwieh3ko1O" + > }, + > { + > "name": "Fipa", + > "pass": "eigheiZoo9Mei4fereim1ahp3weu1E" + > }, + > { + > "name": "Mythicson", + > "pass": "iereizuephiengoisaiqu9AeLai4Ee" + > }, + > { + > "name": "Balline", + > "pass": "IeJuthei9aijuvahwumahRei4Ogooy" + > } + > ] + + + for username in $( + jq -r '.[].name' /tmp/fourusers.json + ) + do + password=$( + jq -r ' + .[] | select(.name == "'${username}'") | .pass + ' /tmp/fourusers.json + ) + echo "Login [${username}][${password}]" + done + + > Login [Rhaelhall][ea8aiqu1liubachohthahwieh3ko1O] + > Login [Fipa][eigheiZoo9Mei4fereim1ahp3weu1E] + > Login [Mythicson][iereizuephiengoisaiqu9AeLai4Ee] + > Login [Balline][IeJuthei9aijuvahwumahRei4Ogooy] + + +# ----------------------------------------------------- +# Login to Zeppelin as a test user. +#[root@ansibler] + + zeppelinurl=http://zeppelin:8080 + + source '/deployments/zeppelin/bin/zeppelin-rest-tools.sh' + + zeplogin "${username:?}" "${password:?}" + + > { + > "status": "OK", + > "message": "", + > "body": { + > "principal": "Balline", + > "ticket": "f449766c-4056-496d-abef-7aa46a0d4302", + > "roles": "[\"user\"]" + > } + > } + + +# ----------------------------------------------------- +# List the user's notebooks +#[root@ansibler] + + curl \ + --silent \ + --cookie "${zepcookies:?}" \ + "${zeppelinurl:?}/api/notebook" \ + | jq "[.body[] | select(.path | startswith(\"/Users/${username:?}\"))]" + + > [ + > { + > "id": "2H6BY1RQ1", + > "path": "/Users/Balline/1. Start here" + > }, + > { + > "id": "2H5VA1B5H", + > "path": "/Users/Balline/2. Data holdings" + > }, + > { + > "id": "2H6KB3TMG", + > "path": "/Users/Balline/3. Source counts over the sky" + > }, + > { + > "id": "2H6SBW7SK", + > "path": "/Users/Balline/4. Mean proper motions over the sky" + > }, + > { + > "id": "2H6U6WHDZ", + > "path": "/Users/Balline/5. Working with Gaia XP spectra" + > }, + > { + > "id": "2H6YUJ3FU", + > "path": "/Users/Balline/6. Working with cross-matched surveys" + > }, + > { + > "id": "2H4KCJRQH", + > "path": "/Users/Balline/7. Good astrometric solutions via ML Random Forrest classifier" + > }, + > { + > "id": "2H493DJD8", + > "path": "/Users/Balline/8. Tips and tricks" + > } + > ] + + # + # These are the user's clone of the public notebooks. + # What about the test notebooks that AglaisBenchmarker created ? + # + +# ----------------------------------------------------- +# List the notebooks in /tmp. +#[root@ansibler] + + curl \ + --silent \ + --cookie "${zepcookies:?}" \ + "${zeppelinurl:?}/api/notebook" \ + | jq ' + [ + .body[] + | select( + .path | startswith("/tmp") + ) + ] + ' + + > [ + > { + > "id": "2H6BNM8B3", + > "path": "/tmp/09UKVH2TDO.json" + > }, + > { + > "id": "2H3RE238N", + > "path": "/tmp/0HYOKR8OXW.json" + > }, + > .... + > .... + > { + > "id": "2H3NUJZ7D", + > "path": "/tmp/YIV1CNDWAI.json" + > } + > ] + + +# ----------------------------------------------------- +# Count the notebooks in /tmp. +#[root@ansibler] + + curl \ + --silent \ + --cookie "${zepcookies:?}" \ + "${zeppelinurl:?}/api/notebook" \ + | jq ' + [ + .body[] + | select( + .path | startswith("/tmp") + ) + ] + | length + ' + + > 39 + + +# ----------------------------------------------------- +# List the permissions for a notebook. 
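+# NOTE (sketch): the notebook list/count queries above repeat the same
+# curl call; a small helper function (hypothetical, not part of
+# zeppelin-rest-tools.sh) could wrap it:
+#
+#   zepnotebooks()
+#       {
+#       curl --silent --cookie "${zepcookies:?}" "${zeppelinurl:?}/api/notebook" \
+#           | jq "[ .body[] | select(.path | startswith(\"${1:-/}\")) ]"
+#       }
+#
+#   zepnotebooks '/tmp' | jq 'length'
+#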
+#[root@ansibler] + + noteid=2H3NUJZ7D + + curl \ + --silent \ + --cookie "${zepcookies:?}" \ + "${zeppelinurl:?}/api/notebook/${noteid:?}/permissions" \ + | jq '.' + + > { + > "status": "OK", + > "message": "", + > "body": { + > "readers": [ + > "Balline" + > ], + > "owners": [ + > "Balline" + > ], + > "writers": [ + > "Balline" + > ], + > "runners": [ + > "Balline" + > ] + > } + > } + + +# ----------------------------------------------------- +# Putting it together. +#[root@ansibler] + + for noteid in $( + curl \ + --silent \ + --cookie "${zepcookies:?}" \ + "${zeppelinurl:?}/api/notebook" \ + | jq -r ' + .body[] + | select( + .path | startswith("/tmp") + ) + | .id + ' + ) + do + owner=$( + curl \ + --silent \ + --cookie "${zepcookies:?}" \ + "${zeppelinurl:?}/api/notebook/${noteid:?}/permissions" \ + | jq -r '.body.owners[0]' + ) +cat << EOF +{ +"id": "${noteid}", +"owner": "${owner}" +} +EOF + + done + + > { + > "id": "2H6BNM8B3", + > "owner": "Balline" + > } + > { + > "id": "2H3RE238N", + > "owner": "Balline" + > } + > { + > "id": "2H49RA2C6", + > "owner": "Balline" + > } + > { + > "id": "2H43RQS6R", + > "owner": "Balline" + > } + > { + > "id": "2H6ETAKCK", + > "owner": "Balline" + > } + > { + > "id": "2H5KK6RNU", + > "owner": "Balline" + > } + > { + > "id": "2H5HD1MFX", + > "owner": "Balline" + > } + > { + > "id": "2H6TM5FS7", + > "owner": "Balline" + > } + > { + > "id": "2H6MZ7VHR", + > "owner": "Balline" + > } + > { + > "id": "2H4PADFB6", + > "owner": "Balline" + > } + > { + > "id": "2H6WAKH5P", + > "owner": "Balline" + > } + > { + > "id": "2H5VX7EMJ", + > "owner": "Balline" + > } + > { + > "id": "2H4PCT4A8", + > "owner": "Balline" + > } + > { + > "id": "2H3R2QYQY", + > "owner": "Balline" + > } + > { + > "id": "2H3QYXU5F", + > "owner": "Balline" + > } + > { + > "id": "2H77J69XP", + > "owner": "Balline" + > } + > { + > "id": "2H51QAZ4D", + > "owner": "Balline" + > } + > { + > "id": "2H47V1E75", + > "owner": "Balline" + > } + > { + > "id": "2H46E9DUX", + > "owner": "Balline" + > } + > { + > "id": "2H5AEJTS6", + > "owner": "Balline" + > } + > { + > "id": "2H3WXBYAN", + > "owner": "Balline" + > } + > { + > "id": "2H4WE2NKY", + > "owner": "Balline" + > } + > { + > "id": "2H43RBB4D", + > "owner": "Balline" + > } + > { + > "id": "2H42RPNFR", + > "owner": "Balline" + > } + > { + > "id": "2H564QGVP", + > "owner": "Balline" + > } + > { + > "id": "2H5RZ7AED", + > "owner": "Balline" + > } + > { + > "id": "2H5K64FK5", + > "owner": "Balline" + > } + > { + > "id": "2H5ZT5JXY", + > "owner": "Balline" + > } + > { + > "id": "2H6Q8AFB8", + > "owner": "Balline" + > } + > { + > "id": "2H5MXKQDP", + > "owner": "Balline" + > } + > { + > "id": "2H65X6KSQ", + > "owner": "Balline" + > } + > { + > "id": "2H5CFXCJD", + > "owner": "Balline" + > } + > { + > "id": "2H45CYQNV", + > "owner": "Balline" + > } + > { + > "id": "2H3J89FQN", + > "owner": "Balline" + > } + > { + > "id": "2H43A3ZGD", + > "owner": "Balline" + > } + > { + > "id": "2H69U63WE", + > "owner": "Balline" + > } + > { + > "id": "2H74VFG5U", + > "owner": "Balline" + > } + > { + > "id": "2H4QBG6M7", + > "owner": "Balline" + > } + > { + > "id": "2H3NUJZ7D", + > "owner": "Balline" + > } + + # + # Why are all the notes owned by Balline ? + # + + # + # Add some debug to AglaisBenchmarker .. + # + + +# ----------------------------------------------------- +# Try 2 concurrent users 2 times .. 
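+# NOTE (sketch): the id/owner loop above prints one bare JSON object per
+# notebook; piping the whole loop through jq --slurp would gather them
+# into a single array for easier post-processing:
+#
+#   done | jq --slurp '.'
+#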
+#[root@ansibler] + + usercount=2 + loopcount=2 + + for i in $(seq 0 $((loopcount - 1))) + do + echo "" + echo "-----------------------------------------------------" + echo "Loop [${i}]" + testname="multi-user-$(printf "%02d" ${usercount})-$(printf "%02d" ${i})" + + /tmp/run-benchmark.py \ + "${endpoint:?}" \ + "${testconfig:?}" \ + "${testusers:?}" \ + "${usercount:?}" \ + | tee "/tmp/results/${testname:?}.txt" + done + + + > ----------------------------------------------------- + > Loop [0] + > + > { + > "config": { + > "endpoint": "http://zeppelin:8080", + > "testconfig": "/deployments/zeppelin/test/config/quick.json", + > "userlist": "/tmp/testusers-01.json", + > "usercount": "2" + > } + > } + > + > /tmp/testusers-01.json + > Test started [Multi User] + > ERROR:root:list index out of range + > Traceback (most recent call last): + > File "/usr/local/lib/python3.10/site-packages/aglais_benchmark/aglais_benchmark.py", line 115, in run_notebook + > notebookid = text.split(": ")[1] + > IndexError: list index out of range + > Config [/tmp/user2.yml] + > b'status_code:500\n' + > Config [/tmp/user2.yml] + > b'Create notebook: 2H6RDY45N\n' + > Config [/tmp/user2.yml] + > b'Create notebook: 2H65TFVMD\n' + > Config [/tmp/user2.yml] + > b'Create notebook: 2H6833AW7\n' + > Config [/tmp/user1.yml] + > b'Create notebook: 2H49223X3\n' + > Config [/tmp/user1.yml] + > b'Create notebook: 2H3X184V7\n' + > Config [/tmp/user1.yml] + > b'Create notebook: 2H6S7EYJ7\n' + > Config [/tmp/user1.yml] + > b'Create notebook: 2H3XGAPAQ\n' + > Test completed! (45.41 seconds) + > ------------ Test Result: [FAIL] ------------ + > [{'GaiaDMPSetup': { .... }}] + > + > + > + > ----------------------------------------------------- + > Loop [1] + > + > { + > "config": { + > "endpoint": "http://zeppelin:8080", + > "testconfig": "/deployments/zeppelin/test/config/quick.json", + > "userlist": "/tmp/testusers-01.json", + > "usercount": "2" + > } + > } + > + > /tmp/testusers-01.json + > Test started [Multi User] + > Config [/tmp/user1.yml] + > b'Create notebook: 2H3XCN3YN\n' + > Config [/tmp/user1.yml] + > b'Create notebook: 2H5ED6FGA\n' + > Config [/tmp/user1.yml] + > b'Create notebook: 2H44AKRYY\n' + > Config [/tmp/user1.yml] + > b'Create notebook: 2H4N42MQU\n' + > Config [/tmp/user2.yml] + > b'Create notebook: 2H4SHWP5W\n' + > Config [/tmp/user2.yml] + > b'Create notebook: 2H5TMJFK5\n' + > Config [/tmp/user2.yml] + > b'Create notebook: 2H78HBZSB\n' + > Config [/tmp/user2.yml] + > b'Create notebook: 2H5WEBRUB\n' + > Test completed! (31.55 seconds) + > ------------ Test Result: [PASS] ------------ + > [{'GaiaDMPSetup': { .... }}] + + # + # OK, so it is using two of the user configurations. + # + + cat /tmp/user1.yml + + > zeppelin_url: http://zeppelin:8080 + > zeppelin_auth: true + > zeppelin_user: Fipa + > zeppelin_password: eigheiZoo9Mei4fereim1ahp3weu1E + + cat /tmp/user2.yml + + > zeppelin_url: http://zeppelin:8080 + > zeppelin_auth: true + > zeppelin_user: Mythicson + > zeppelin_password: iereizuephiengoisaiqu9AeLai4Ee + + +# ----------------------------------------------------- +# Count the notebooks in /tmp. +#[root@ansibler] + + curl \ + --silent \ + --cookie "${zepcookies:?}" \ + "${zeppelinurl:?}/api/notebook" \ + | jq ' + [ + .body[] + | select( + .path | startswith("/tmp") + ) + ] + | length + ' + + > 39 + + +# ----------------------------------------------------- +# Check the owner of the test notebooks. 
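+# NOTE (sketch): the user1.yml file above is plain key/value YAML, so an
+# equivalent config could be generated from the test user list with jq
+# (index 1 selects Fipa, matching user1.yml above):
+#
+#   jq -r '
+#       .users[1].shirouser
+#       | "zeppelin_url: http://zeppelin:8080",
+#         "zeppelin_auth: true",
+#         "zeppelin_user: \(.name)",
+#         "zeppelin_password: \(.pass)"
+#       ' /tmp/testusers-01.json
+#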
+#[root@ansibler] + + for noteid in $( + curl \ + --silent \ + --cookie "${zepcookies:?}" \ + "${zeppelinurl:?}/api/notebook" \ + | jq -r ' + .body[] + | select( + .path | startswith("/tmp") + ) + | .id + ' + ) + do + owner=$( + curl \ + --silent \ + --cookie "${zepcookies:?}" \ + "${zeppelinurl:?}/api/notebook/${noteid:?}/permissions" \ + | jq -r '.body.owners[0]' + ) +cat << EOF +{ +"id": "${noteid}", +"owner": "${owner}" +} +EOF + done + + > { + > "id": "2H6BNM8B3", + > "owner": "Balline" + > } + > { + > "id": "2H3RE238N", + > "owner": "Balline" + > } + > .... + > .... + + # + # Same list ... becase that is who we are logged in as. + # We are logged in as Balline so we only see Balline's notebooks. + # Duh! + # + + + + + +# ----------------------------------------------------- +# Loop through the first four users and count the user's notebooks in /tmp. +#[root@ansibler] + + jq ' + [ .users[0:4] | .[].shirouser | {name, pass} ] + ' /tmp/testusers-01.json \ + | tee /tmp/fourusers.json + + + for username in $( + jq -r '.[].name' /tmp/fourusers.json + ) + do + password=$( + jq -r ' + .[] | select(.name == "'${username}'") | .pass + ' /tmp/fourusers.json + ) + + echo + echo "-----------------------------------------------------" + echo "Login [${username}]" + + zeplogin "${username:?}" "${password:?}" + + curl \ + --silent \ + --cookie "${zepcookies:?}" \ + "${zeppelinurl:?}/api/notebook" \ + | jq ' + [ + .body[] + | select( + .path | startswith("/tmp") + ) + ] + | length + ' + done + + > ----------------------------------------------------- + > Login [Rhaelhall] + > { + > "status": "OK", + > "message": "", + > "body": { + > "principal": "Rhaelhall", + > "ticket": "5ddfca82-8a83-4463-acb4-a0aad0be3f8f", + > "roles": "[\"user\"]" + > } + > } + > 0 + > + > ----------------------------------------------------- + > Login [Fipa] + > { + > "status": "OK", + > "message": "", + > "body": { + > "principal": "Fipa", + > "ticket": "5fa2a098-ce5c-4d36-aedc-8af7b0cf53cf", + > "roles": "[\"user\"]" + > } + > } + > 140 + > + > ----------------------------------------------------- + > Login [Mythicson] + > { + > "status": "OK", + > "message": "", + > "body": { + > "principal": "Mythicson", + > "ticket": "b3036c86-a728-49cb-8e25-ab8ae32f16e3", + > "roles": "[\"user\"]" + > } + > } + > 87 + > + > ----------------------------------------------------- + > Login [Balline] + > { + > "status": "OK", + > "message": "", + > "body": { + > "principal": "Balline", + > "ticket": "f449766c-4056-496d-abef-7aa46a0d4302", + > "roles": "[\"user\"]" + > } + > } + > 39 + + +# ----------------------------------------------------- +# Loop through the first four users and list the user's notebooks in /tmp. 
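+# NOTE (sketch): inside the same loop, capturing the count in a variable
+# gives a one-line summary per user that is easier to scan:
+#
+#   count=$(
+#       curl --silent --cookie "${zepcookies:?}" "${zeppelinurl:?}/api/notebook" \
+#           | jq '[ .body[] | select(.path | startswith("/tmp")) ] | length'
+#       )
+#   echo "User [${username}] notebooks [${count}]"
+#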
+#[root@ansibler] + + for username in $( + jq -r '.[].name' /tmp/fourusers.json + ) + do + password=$( + jq -r ' + .[] | select(.name == "'${username}'") | .pass + ' /tmp/fourusers.json + ) + + echo + echo "-----------------------------------------------------" + echo "Login [${username}]" + + zeplogin "${username:?}" "${password:?}" + + curl \ + --silent \ + --cookie "${zepcookies:?}" \ + "${zeppelinurl:?}/api/notebook" \ + | jq ' + [ + .body[] + | select( + .path | startswith("/tmp") + ) + ] + ' + done + + + > + > ----------------------------------------------------- + > Login [Rhaelhall] + > { + > "status": "OK", + > "message": "", + > "body": { + > "principal": "Rhaelhall", + > "ticket": "5ddfca82-8a83-4463-acb4-a0aad0be3f8f", + > "roles": "[\"user\"]" + > } + > } + > [] + > + > ----------------------------------------------------- + > Login [Fipa] + > { + > "status": "OK", + > "message": "", + > "body": { + > "principal": "Fipa", + > "ticket": "5fa2a098-ce5c-4d36-aedc-8af7b0cf53cf", + > "roles": "[\"user\"]" + > } + > } + > [ + > { + > "id": "2H59GEJKF", + > "path": "/tmp/022R7M2V83.json" + > }, + > { + > "id": "2H66Y9K13", + > "path": "/tmp/0FQ0MYHY03.json" + > }, + > { + > "id": "2H46Y6415", + > "path": "/tmp/105JZ3HW87.json" + > }, + > { + > "id": "2H4GKACU8", + > "path": "/tmp/133PZPR7IP.json" + > }, + > { + > "id": "2H5ZE87KH", + > "path": "/tmp/15SRZ9RFOZ.json" + > }, + > { + > "id": "2H4CX7YR7", + > "path": "/tmp/1FVCRYLLWU.json" + > }, + > { + > "id": "2H78GH1FT", + > "path": "/tmp/1UP4CEQ030.json" + > }, + > { + > "id": "2H6C6PBN1", + > "path": "/tmp/27MZ3NBUEJ.json" + > }, + > { + > "id": "2H4816V59", + > "path": "/tmp/2RS0QRA48Z.json" + > }, + > { + > "id": "2H4CZCEFB", + > "path": "/tmp/3IR5X8IN3T.json" + > }, + > { + > "id": "2H61J4CU4", + > "path": "/tmp/3P5WG9HMBR.json" + > }, + > { + > "id": "2H43RRGS7", + > "path": "/tmp/40KBLJ92DG.json" + > }, + > { + > "id": "2H6ZV9ZZX", + > "path": "/tmp/48ZVFH5BAL.json" + > }, + > { + > "id": "2H5U9N9JP", + > "path": "/tmp/4GWGY005HK.json" + > }, + > { + > "id": "2H6QYSP33", + > "path": "/tmp/4W1HF3M8HY.json" + > }, + > { + > "id": "2H7D4W72G", + > "path": "/tmp/53Y96C03WT.json" + > }, + > { + > "id": "2H76M4DGM", + > "path": "/tmp/57B4L0N8R5.json" + > }, + > { + > "id": "2H5KDYF7M", + > "path": "/tmp/59UU952R0U.json" + > }, + > { + > "id": "2H5X8SRQD", + > "path": "/tmp/5S8QAL1HSF.json" + > }, + > { + > "id": "2H5ED6FGA", + > "path": "/tmp/5SVVV4WFJM.json" + > }, + > { + > "id": "2H3UJAB31", + > "path": "/tmp/5TWE1O2T6G.json" + > }, + > { + > "id": "2H6S7EYJ7", + > "path": "/tmp/69DV6JIQBM.json" + > }, + > { + > "id": "2H5DDTQUA", + > "path": "/tmp/69O40VYIB0.json" + > }, + > { + > "id": "2H4VRHFU7", + > "path": "/tmp/6C583WLR6X.json" + > }, + > { + > "id": "2H7C25UT9", + > "path": "/tmp/6GWCLVOYLT.json" + > }, + > { + > "id": "2H6UYX3P3", + > "path": "/tmp/6RNTRY88TQ.json" + > }, + > { + > "id": "2H6EZ6DJW", + > "path": "/tmp/6SBUI2HETY.json" + > }, + > { + > "id": "2H49223X3", + > "path": "/tmp/77Z0OVRY7S.json" + > }, + > { + > "id": "2H5RTZS15", + > "path": "/tmp/8CCMY7UDW0.json" + > }, + > { + > "id": "2H4B1JB72", + > "path": "/tmp/90FK6F71ZH.json" + > }, + > { + > "id": "2H4YUSQQJ", + > "path": "/tmp/93P71ADYMZ.json" + > }, + > { + > "id": "2H4KJEX4G", + > "path": "/tmp/9FGCUNENZJ.json" + > }, + > { + > "id": "2H48AHMJT", + > "path": "/tmp/A1T8AMLHJT.json" + > }, + > { + > "id": "2H5A6XXYR", + > "path": "/tmp/A6G0T5A3NI.json" + > }, + > { + > "id": "2H6H4RUA6", + > "path": "/tmp/AC5Y5CFFNP.json" + > }, + > { + > "id": 
"2H42Y7X6V", + > "path": "/tmp/AFAEF5SKPM.json" + > }, + > { + > "id": "2H4F644A7", + > "path": "/tmp/AN5MRNU0G8.json" + > }, + > { + > "id": "2H42GH33E", + > "path": "/tmp/ANMYK1U0Y5.json" + > }, + > { + > "id": "2H4V7VJ19", + > "path": "/tmp/BPIUBIL0DS.json" + > }, + > { + > "id": "2H49YTWRJ", + > "path": "/tmp/CRL2HYCR4P.json" + > }, + > { + > "id": "2H7E5N83J", + > "path": "/tmp/CT2XL34HCH.json" + > }, + > { + > "id": "2H4PJ869P", + > "path": "/tmp/D4GKFQGM90.json" + > }, + > { + > "id": "2H6UJZQMD", + > "path": "/tmp/D6KWMDTL32.json" + > }, + > { + > "id": "2H6ZKAVCQ", + > "path": "/tmp/DUIV6WT1UU.json" + > }, + > { + > "id": "2H7BWG83K", + > "path": "/tmp/DVJXJFQ24R.json" + > }, + > { + > "id": "2H4JBS2YW", + > "path": "/tmp/DXFNBQ4PAB.json" + > }, + > { + > "id": "2H54AW8BM", + > "path": "/tmp/DXZFOQ6YGR.json" + > }, + > { + > "id": "2H5VGD2J8", + > "path": "/tmp/ECS2N9LM74.json" + > }, + > { + > "id": "2H4H99QME", + > "path": "/tmp/EH6F3CMC6A.json" + > }, + > { + > "id": "2H45NAVKB", + > "path": "/tmp/EH9XFGVQWQ.json" + > }, + > { + > "id": "2H7AJ2CXJ", + > "path": "/tmp/EINUPFN12V.json" + > }, + > { + > "id": "2H4MZFKHJ", + > "path": "/tmp/EW3BA5GCBQ.json" + > }, + > { + > "id": "2H6TDSFQP", + > "path": "/tmp/F7OFFBPRJZ.json" + > }, + > { + > "id": "2H6H13VMT", + > "path": "/tmp/F91TW47GX2.json" + > }, + > { + > "id": "2H3UZ7D51", + > "path": "/tmp/FDFKXBOP2N.json" + > }, + > { + > "id": "2H548U9VD", + > "path": "/tmp/FJXYHVMTYF.json" + > }, + > { + > "id": "2H3XGAPAQ", + > "path": "/tmp/FPDZB0PDLZ.json" + > }, + > { + > "id": "2H5Z2T5UX", + > "path": "/tmp/FQO91RK18Q.json" + > }, + > { + > "id": "2H5XN2Q6V", + > "path": "/tmp/FZ6XX9YK4A.json" + > }, + > { + > "id": "2H61NBVVF", + > "path": "/tmp/G6AFC2879C.json" + > }, + > { + > "id": "2H6NM2Y47", + > "path": "/tmp/GFWVBA9YLW.json" + > }, + > { + > "id": "2H5Q8NRDW", + > "path": "/tmp/GI4UCF8Y7U.json" + > }, + > { + > "id": "2H64HJNJW", + > "path": "/tmp/GMZE1O6420.json" + > }, + > { + > "id": "2H3X184V7", + > "path": "/tmp/GTWIBS6910.json" + > }, + > { + > "id": "2H4WZMYBZ", + > "path": "/tmp/GVXIXV68UH.json" + > }, + > { + > "id": "2H78899PG", + > "path": "/tmp/H2OJ5ZHTS5.json" + > }, + > { + > "id": "2H3Z12XPF", + > "path": "/tmp/HJZBJH3FTE.json" + > }, + > { + > "id": "2H3KZHCAY", + > "path": "/tmp/HNPF8IG8W3.json" + > }, + > { + > "id": "2H5WB11GM", + > "path": "/tmp/HOP8RGEZRM.json" + > }, + > { + > "id": "2H574VQMD", + > "path": "/tmp/HRV31LU3K9.json" + > }, + > { + > "id": "2H568J4VY", + > "path": "/tmp/HYH4TVT8YH.json" + > }, + > { + > "id": "2H4KF1RYF", + > "path": "/tmp/HZQQOQSPPK.json" + > }, + > { + > "id": "2H6ACC5NB", + > "path": "/tmp/IF43YWJFDS.json" + > }, + > { + > "id": "2H443CBPC", + > "path": "/tmp/JKRSUH4TQ0.json" + > }, + > { + > "id": "2H3XCN3YN", + > "path": "/tmp/JPL5NPWC7O.json" + > }, + > { + > "id": "2H3XTWMXC", + > "path": "/tmp/JSZ4GJR4F5.json" + > }, + > { + > "id": "2H5D41ZT1", + > "path": "/tmp/K43XH0UJ6O.json" + > }, + > { + > "id": "2H47M43R1", + > "path": "/tmp/LE3Y8UCH96.json" + > }, + > { + > "id": "2H6VH55HN", + > "path": "/tmp/LYA9S5405S.json" + > }, + > { + > "id": "2H7973RPH", + > "path": "/tmp/M91IP70AKW.json" + > }, + > { + > "id": "2H4H3ZAV9", + > "path": "/tmp/MKRVOILKAU.json" + > }, + > { + > "id": "2H3ZXE99D", + > "path": "/tmp/MMZP51EDNE.json" + > }, + > { + > "id": "2H626U6WH", + > "path": "/tmp/MRZU8ON5VS.json" + > }, + > { + > "id": "2H5EY7CDK", + > "path": "/tmp/N9F1AQ9PJ1.json" + > }, + > { + > "id": "2H4SRWZNK", + > "path": "/tmp/NGL9ILJ4SG.json" + > }, + > { + > "id": 
"2H5C1QNKH", + > "path": "/tmp/NLBHEJ8GY8.json" + > }, + > { + > "id": "2H46DW2JT", + > "path": "/tmp/NLQL7886ZC.json" + > }, + > { + > "id": "2H6J1HCC6", + > "path": "/tmp/NUTCTPNQNN.json" + > }, + > { + > "id": "2H4BGR6M9", + > "path": "/tmp/NUVSLSTD5H.json" + > }, + > { + > "id": "2H3YPJQAU", + > "path": "/tmp/O40L8D7HS1.json" + > }, + > { + > "id": "2H4N42MQU", + > "path": "/tmp/O8XVALB2HF.json" + > }, + > { + > "id": "2H6STG5FG", + > "path": "/tmp/OMMN0WNKIP.json" + > }, + > { + > "id": "2H6DAKW7X", + > "path": "/tmp/OUT98TRUBK.json" + > }, + > { + > "id": "2H72R9WEQ", + > "path": "/tmp/PEDT3WL0EM.json" + > }, + > { + > "id": "2H6B5Y4BC", + > "path": "/tmp/PM8PB35OL1.json" + > }, + > { + > "id": "2H4KJR49E", + > "path": "/tmp/QCOTOO1UP2.json" + > }, + > { + > "id": "2H6JAEXHH", + > "path": "/tmp/QVMDJCW5UG.json" + > }, + > { + > "id": "2H4PC8MUP", + > "path": "/tmp/QW455FPI76.json" + > }, + > { + > "id": "2H4PA56Y8", + > "path": "/tmp/QZ1NG52WYN.json" + > }, + > { + > "id": "2H719STY7", + > "path": "/tmp/R1W4ZNMZNZ.json" + > }, + > { + > "id": "2H58JX2DJ", + > "path": "/tmp/R6VRWBPORS.json" + > }, + > { + > "id": "2H3MUAV71", + > "path": "/tmp/R8DFL93OWO.json" + > }, + > { + > "id": "2H4EHABB9", + > "path": "/tmp/RCRCO6Q9SX.json" + > }, + > { + > "id": "2H7DGY5BJ", + > "path": "/tmp/RLFVC7ANMW.json" + > }, + > { + > "id": "2H3RWY4T3", + > "path": "/tmp/RUON9YEFL2.json" + > }, + > { + > "id": "2H5J5GPRT", + > "path": "/tmp/S0P5VGGH1X.json" + > }, + > { + > "id": "2H76YPGQV", + > "path": "/tmp/S3QJ9XY9WZ.json" + > }, + > { + > "id": "2H7EC37T2", + > "path": "/tmp/SRIDFBZ84T.json" + > }, + > { + > "id": "2H667AYP5", + > "path": "/tmp/T7DW02BKTN.json" + > }, + > { + > "id": "2H5UTD5EM", + > "path": "/tmp/TBNPDGDYRV.json" + > }, + > { + > "id": "2H6UCUZCF", + > "path": "/tmp/TL8T4WCLOM.json" + > }, + > { + > "id": "2H4Y1D3PT", + > "path": "/tmp/TQJR7EPDTE.json" + > }, + > { + > "id": "2H5VUTQQZ", + > "path": "/tmp/TR0JWQH38B.json" + > }, + > { + > "id": "2H7B1YN8S", + > "path": "/tmp/TTZ2B41UHL.json" + > }, + > { + > "id": "2H44AKRYY", + > "path": "/tmp/U71TT7149H.json" + > }, + > { + > "id": "2H3WM2J3N", + > "path": "/tmp/U8U3FRKXDN.json" + > }, + > { + > "id": "2H72DVTGP", + > "path": "/tmp/UD0U3GLNWN.json" + > }, + > { + > "id": "2H3QKXAGH", + > "path": "/tmp/UFO784N17C.json" + > }, + > { + > "id": "2H6HWC2KU", + > "path": "/tmp/UUN1EV5FWN.json" + > }, + > { + > "id": "2H48UEFJY", + > "path": "/tmp/UWZV3KAG0V.json" + > }, + > { + > "id": "2H4B7CV9K", + > "path": "/tmp/UZAHP43K2M.json" + > }, + > { + > "id": "2H654MVKV", + > "path": "/tmp/VEBGE5YMCN.json" + > }, + > { + > "id": "2H7ECETYM", + > "path": "/tmp/VXLPMAKTPH.json" + > }, + > { + > "id": "2H4KBJRY7", + > "path": "/tmp/WB3SYJHWJB.json" + > }, + > { + > "id": "2H5V3T2AG", + > "path": "/tmp/WG8H0C0T2C.json" + > }, + > { + > "id": "2H64AWF8J", + > "path": "/tmp/WN8BBX6761.json" + > }, + > { + > "id": "2H73JFFPP", + > "path": "/tmp/WYCWMKYSEN.json" + > }, + > { + > "id": "2H6U5P5JN", + > "path": "/tmp/WYDQK9NRR1.json" + > }, + > { + > "id": "2H4DKNTFB", + > "path": "/tmp/X2MVYGJHVO.json" + > }, + > { + > "id": "2H5PKN1Q9", + > "path": "/tmp/X3ZUOYNGHD.json" + > }, + > { + > "id": "2H3S98NHP", + > "path": "/tmp/XWJVR4UGHK.json" + > }, + > { + > "id": "2H3VK8W99", + > "path": "/tmp/XYTUPTIQBU.json" + > }, + > { + > "id": "2H6NF6C3D", + > "path": "/tmp/Y0LHFNGREQ.json" + > }, + > { + > "id": "2H4ZR25YW", + > "path": "/tmp/Y75PD9WABF.json" + > }, + > { + > "id": "2H43C7Y1Z", + > "path": "/tmp/YS1LITVII4.json" + > }, + > { + > "id": 
"2H6UZJZSF", + > "path": "/tmp/YSQD56ZD1W.json" + > }, + > { + > "id": "2H559D6GA", + > "path": "/tmp/YUQQDK5YMG.json" + > }, + > { + > "id": "2H4EVN2UU", + > "path": "/tmp/ZATB76TL21.json" + > }, + > { + > "id": "2H4WFTPZ6", + > "path": "/tmp/ZD61L7EDGC.json" + > }, + > { + > "id": "2H4KXXGBB", + > "path": "/tmp/ZPH1KAF24P.json" + > } + > ] + > + > ----------------------------------------------------- + > Login [Mythicson] + > { + > "status": "OK", + > "message": "", + > "body": { + > "principal": "Mythicson", + > "ticket": "b3036c86-a728-49cb-8e25-ab8ae32f16e3", + > "roles": "[\"user\"]" + > } + > } + > [ + > { + > "id": "2H4872JVA", + > "path": "/tmp/0TJUG7VHRR.json" + > }, + > { + > "id": "2H5KC4XCW", + > "path": "/tmp/1U71FHT4G4.json" + > }, + > { + > "id": "2H6YRQTRC", + > "path": "/tmp/296LJK6LVI.json" + > }, + > { + > "id": "2H4ZJKFTJ", + > "path": "/tmp/2FQWKLZ4DL.json" + > }, + > { + > "id": "2H55AFVDE", + > "path": "/tmp/2I43ITIDDG.json" + > }, + > { + > "id": "2H75HDV7F", + > "path": "/tmp/2SI94BS4VC.json" + > }, + > { + > "id": "2H623G7KT", + > "path": "/tmp/2VXZEQLBX6.json" + > }, + > { + > "id": "2H7FU2MRR", + > "path": "/tmp/3APXG7JVUX.json" + > }, + > { + > "id": "2H3YTV7SY", + > "path": "/tmp/3Z3XL4SY5A.json" + > }, + > { + > "id": "2H7B897E9", + > "path": "/tmp/4MCJQ34TKR.json" + > }, + > { + > "id": "2H4YVKSZR", + > "path": "/tmp/4NS07CLBT4.json" + > }, + > { + > "id": "2H6VXFT27", + > "path": "/tmp/53IZQ3F4QT.json" + > }, + > { + > "id": "2H5PTNF1G", + > "path": "/tmp/62XT04DLVY.json" + > }, + > { + > "id": "2H5F5Y4BW", + > "path": "/tmp/73T1U2VXZQ.json" + > }, + > { + > "id": "2H4AK3AV4", + > "path": "/tmp/7SFWC7XNQX.json" + > }, + > { + > "id": "2H52F6BT9", + > "path": "/tmp/8AML4Z4EXA.json" + > }, + > { + > "id": "2H3PJHDJU", + > "path": "/tmp/9MZQLTGFRS.json" + > }, + > { + > "id": "2H5ABEHER", + > "path": "/tmp/A6WBFEVEBQ.json" + > }, + > { + > "id": "2H7DUJU9T", + > "path": "/tmp/AA3U4IMX2R.json" + > }, + > { + > "id": "2H4PAR2SK", + > "path": "/tmp/AT15DYU6EM.json" + > }, + > { + > "id": "2H4XXB4A7", + > "path": "/tmp/AXPTY45S4Z.json" + > }, + > { + > "id": "2H5TMJFK5", + > "path": "/tmp/B5UEGMXG1T.json" + > }, + > { + > "id": "2H79XX5BV", + > "path": "/tmp/BC39WRIW68.json" + > }, + > { + > "id": "2H4DCYKX3", + > "path": "/tmp/BIE950BZCX.json" + > }, + > { + > "id": "2H6VHKCP8", + > "path": "/tmp/BKP6NNNEL9.json" + > }, + > { + > "id": "2H65TFVMD", + > "path": "/tmp/BRHX2IEJ6U.json" + > }, + > { + > "id": "2H5Q5M6ZM", + > "path": "/tmp/CEJ6Y4RJ1M.json" + > }, + > { + > "id": "2H5YXJR6F", + > "path": "/tmp/CIK3T3SWKO.json" + > }, + > { + > "id": "2H57PJFTU", + > "path": "/tmp/CKJU8KWOW9.json" + > }, + > { + > "id": "2H5PTZT67", + > "path": "/tmp/D3MPQ687P4.json" + > }, + > { + > "id": "2H77GBM2S", + > "path": "/tmp/EMOW54U9PO.json" + > }, + > { + > "id": "2H6GWTB3S", + > "path": "/tmp/FCAHMWNYXH.json" + > }, + > { + > "id": "2H4YU62GV", + > "path": "/tmp/G2LSMUN5PE.json" + > }, + > { + > "id": "2H4TKMASU", + > "path": "/tmp/G7AIN3CZ06.json" + > }, + > { + > "id": "2H4TQC2RC", + > "path": "/tmp/GBE3J2DNLO.json" + > }, + > { + > "id": "2H48PA9Q4", + > "path": "/tmp/GIG8HC1RCL.json" + > }, + > { + > "id": "2H5XGFP7T", + > "path": "/tmp/GVD0Z9F33X.json" + > }, + > { + > "id": "2H3RF6M8U", + > "path": "/tmp/H9SIZTF7UM.json" + > }, + > { + > "id": "2H5URS6D2", + > "path": "/tmp/HEO3I3RRJA.json" + > }, + > { + > "id": "2H5BA914H", + > "path": "/tmp/HVBHA4VEAW.json" + > }, + > { + > "id": "2H4C9BQD6", + > "path": "/tmp/HWT0Z44R80.json" + > }, + > { + > "id": "2H6NWE6NN", 
+ > "path": "/tmp/IRN6U5TWN3.json" + > }, + > { + > "id": "2H7CYF8BB", + > "path": "/tmp/J3U7H23XY8.json" + > }, + > { + > "id": "2H45CAU7Y", + > "path": "/tmp/K190YEP5V3.json" + > }, + > { + > "id": "2H3WGFWRJ", + > "path": "/tmp/KKHCGIZKIV.json" + > }, + > { + > "id": "2H5JQ2VQP", + > "path": "/tmp/KXRRDTWKFI.json" + > }, + > { + > "id": "2H5WEBRUB", + > "path": "/tmp/L6JQ95Z137.json" + > }, + > { + > "id": "2H4X25BZE", + > "path": "/tmp/L9SN7ITRXT.json" + > }, + > { + > "id": "2H42XGQN9", + > "path": "/tmp/M02GM977UA.json" + > }, + > { + > "id": "2H3MVW8XW", + > "path": "/tmp/MCFJKNALYD.json" + > }, + > { + > "id": "2H3MA2V4A", + > "path": "/tmp/MPIMUVP9YC.json" + > }, + > { + > "id": "2H48WVDKM", + > "path": "/tmp/MRNAHBIY3E.json" + > }, + > { + > "id": "2H46MA5K3", + > "path": "/tmp/N07WPBO8KW.json" + > }, + > { + > "id": "2H55SPQ5M", + > "path": "/tmp/N9VS7NL3QR.json" + > }, + > { + > "id": "2H6X5Y2CU", + > "path": "/tmp/O34MMYYK3A.json" + > }, + > { + > "id": "2H6U3E8N4", + > "path": "/tmp/O4WYZDZYJY.json" + > }, + > { + > "id": "2H6JAQSRR", + > "path": "/tmp/O8O0FN3M1H.json" + > }, + > { + > "id": "2H4SHWP5W", + > "path": "/tmp/OO0NDL6NEX.json" + > }, + > { + > "id": "2H4C2EAWZ", + > "path": "/tmp/PQAF8UUN6F.json" + > }, + > { + > "id": "2H3KQREJQ", + > "path": "/tmp/PYI7RWGSMK.json" + > }, + > { + > "id": "2H5YBEJ22", + > "path": "/tmp/PZD3QOWDM1.json" + > }, + > { + > "id": "2H62V5ARM", + > "path": "/tmp/Q86Y489VCW.json" + > }, + > { + > "id": "2H3W9114F", + > "path": "/tmp/QF23185FCY.json" + > }, + > { + > "id": "2H55VSVK6", + > "path": "/tmp/RI1DW2B08X.json" + > }, + > { + > "id": "2H5CEUKC1", + > "path": "/tmp/RLS1M1YOOA.json" + > }, + > { + > "id": "2H6W4BPSV", + > "path": "/tmp/RT8G9QRUFT.json" + > }, + > { + > "id": "2H6833AW7", + > "path": "/tmp/RYM7WNSJ26.json" + > }, + > { + > "id": "2H5DHDU2T", + > "path": "/tmp/S8611GAM44.json" + > }, + > { + > "id": "2H48C7WRT", + > "path": "/tmp/S9YRTKXMRH.json" + > }, + > { + > "id": "2H6JG1X4F", + > "path": "/tmp/SCC2KIUHCR.json" + > }, + > { + > "id": "2H5U6FUCD", + > "path": "/tmp/SJL8ZNQRAX.json" + > }, + > { + > "id": "2H3T8YR14", + > "path": "/tmp/SNSBSUWV9W.json" + > }, + > { + > "id": "2H46TXTNW", + > "path": "/tmp/SV647A878F.json" + > }, + > { + > "id": "2H6UWG925", + > "path": "/tmp/SX0NS7SBHM.json" + > }, + > { + > "id": "2H78HBZSB", + > "path": "/tmp/T8FZI7RCSS.json" + > }, + > { + > "id": "2H53JRJ1F", + > "path": "/tmp/TAPDG2D4QM.json" + > }, + > { + > "id": "2H6RDY45N", + > "path": "/tmp/THQVGYUC0O.json" + > }, + > { + > "id": "2H5Q6NDGG", + > "path": "/tmp/TNBT7ZOOCW.json" + > }, + > { + > "id": "2H7E23JXF", + > "path": "/tmp/TUB43BURDT.json" + > }, + > { + > "id": "2H4DFPV2K", + > "path": "/tmp/U8SWHBXBX2.json" + > }, + > { + > "id": "2H7231BFU", + > "path": "/tmp/UHYRZEP1EL.json" + > }, + > { + > "id": "2H462AVYB", + > "path": "/tmp/UOZR3K7CQS.json" + > }, + > { + > "id": "2H6WNDATP", + > "path": "/tmp/UPLR6VS44M.json" + > }, + > { + > "id": "2H5V94KT9", + > "path": "/tmp/UQAMRH2UTK.json" + > }, + > { + > "id": "2H6MCCZBQ", + > "path": "/tmp/WCJ3SGV47P.json" + > }, + > { + > "id": "2H3S1TZ9M", + > "path": "/tmp/YWP8GTSOU8.json" + > }, + > { + > "id": "2H4E6Z8A2", + > "path": "/tmp/Z1PAB6EI4Y.json" + > } + > ] + > + > ----------------------------------------------------- + > Login [Balline] + > { + > "status": "OK", + > "message": "", + > "body": { + > "principal": "Balline", + > "ticket": "f449766c-4056-496d-abef-7aa46a0d4302", + > "roles": "[\"user\"]" + > } + > } + > [ + > { + > "id": "2H6BNM8B3", + > "path": 
"/tmp/09UKVH2TDO.json" + > }, + > { + > "id": "2H3RE238N", + > "path": "/tmp/0HYOKR8OXW.json" + > }, + > { + > "id": "2H49RA2C6", + > "path": "/tmp/0PONYR2NL7.json" + > }, + > { + > "id": "2H43RQS6R", + > "path": "/tmp/1KEH0UM16N.json" + > }, + > { + > "id": "2H6ETAKCK", + > "path": "/tmp/3AEQJ5A42O.json" + > }, + > { + > "id": "2H5KK6RNU", + > "path": "/tmp/3NYCOV9NPM.json" + > }, + > { + > "id": "2H5HD1MFX", + > "path": "/tmp/3Y5ADJ7WE8.json" + > }, + > { + > "id": "2H6TM5FS7", + > "path": "/tmp/4ISOU8J6DY.json" + > }, + > { + > "id": "2H6MZ7VHR", + > "path": "/tmp/4Y8LW90FT8.json" + > }, + > { + > "id": "2H4PADFB6", + > "path": "/tmp/5Z0VRGJU64.json" + > }, + > { + > "id": "2H6WAKH5P", + > "path": "/tmp/6F5QRM85F2.json" + > }, + > { + > "id": "2H5VX7EMJ", + > "path": "/tmp/74TPUCKVU9.json" + > }, + > { + > "id": "2H4PCT4A8", + > "path": "/tmp/7VRO2XDW0X.json" + > }, + > { + > "id": "2H3R2QYQY", + > "path": "/tmp/CTKRC426N9.json" + > }, + > { + > "id": "2H3QYXU5F", + > "path": "/tmp/D3T55EFR83.json" + > }, + > { + > "id": "2H77J69XP", + > "path": "/tmp/EQ6GDVS5UP.json" + > }, + > { + > "id": "2H51QAZ4D", + > "path": "/tmp/FB9BL6P4L4.json" + > }, + > { + > "id": "2H47V1E75", + > "path": "/tmp/FOBTUAC9QB.json" + > }, + > { + > "id": "2H46E9DUX", + > "path": "/tmp/FUPN1FH4F1.json" + > }, + > { + > "id": "2H5AEJTS6", + > "path": "/tmp/GLIDE6E9BY.json" + > }, + > { + > "id": "2H3WXBYAN", + > "path": "/tmp/GW3VVKCRT3.json" + > }, + > { + > "id": "2H4WE2NKY", + > "path": "/tmp/HLNTGZ5EKQ.json" + > }, + > { + > "id": "2H43RBB4D", + > "path": "/tmp/HRJM10QDWJ.json" + > }, + > { + > "id": "2H42RPNFR", + > "path": "/tmp/IUAPBKKU8M.json" + > }, + > { + > "id": "2H564QGVP", + > "path": "/tmp/J5ZQ0WHPQ8.json" + > }, + > { + > "id": "2H5RZ7AED", + > "path": "/tmp/K23I0HVPYZ.json" + > }, + > { + > "id": "2H5K64FK5", + > "path": "/tmp/KGRLO5X2D6.json" + > }, + > { + > "id": "2H5ZT5JXY", + > "path": "/tmp/L3PW5XOLKD.json" + > }, + > { + > "id": "2H6Q8AFB8", + > "path": "/tmp/N01D7XINSX.json" + > }, + > { + > "id": "2H5MXKQDP", + > "path": "/tmp/O744JYL4PI.json" + > }, + > { + > "id": "2H65X6KSQ", + > "path": "/tmp/OKYQ6GRE0Z.json" + > }, + > { + > "id": "2H5CFXCJD", + > "path": "/tmp/P6WQTOVZ12.json" + > }, + > { + > "id": "2H45CYQNV", + > "path": "/tmp/PF5FMJVIZD.json" + > }, + > { + > "id": "2H3J89FQN", + > "path": "/tmp/PNZ3SUXEX6.json" + > }, + > { + > "id": "2H43A3ZGD", + > "path": "/tmp/S18M179NL1.json" + > }, + > { + > "id": "2H69U63WE", + > "path": "/tmp/URUNWDNPE1.json" + > }, + > { + > "id": "2H74VFG5U", + > "path": "/tmp/X13PJEY4J9.json" + > }, + > { + > "id": "2H4QBG6M7", + > "path": "/tmp/X2ZADIA763.json" + > }, + > { + > "id": "2H3NUJZ7D", + > "path": "/tmp/YIV1CNDWAI.json" + > } + > ] + + # + # Each user creates a separate set of notebooks - check + # Each notebook gets a unique id in the repository - check + # Each notebook gets a unique path/name in the repository - check + # + + + + + + diff --git a/notes/zrq/20220602-01-concurrent-tests.txt b/notes/zrq/20220602-01-concurrent-tests.txt new file mode 100644 index 00000000..8ce15b06 --- /dev/null +++ b/notes/zrq/20220602-01-concurrent-tests.txt @@ -0,0 +1,2698 @@ +# +# +# +# Copyright (c) 2022, ROE (http://www.roe.ac.uk/) +# +# This information is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
+# +# This information is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +# +# +#zrq-notes-time +#zrq-notes-indent +#zrq-notes-crypto +#zrq-notes-ansible +#zrq-notes-osformat +#zrq-notes-zeppelin +# + + Target: + + Try to find out more about the limits on concurrent users. + Started with a clean deployment 20220601-01-blue-deploy.txt + + Result: + + Work in progress ... + +# ----------------------------------------------------- +# Create some test users. +# TODO Move the create-user-tools to ansible/client/bin. +# TODO Add ansible/client/bin to the client PATH. +#[root@ansibler] + + source /deployments/zeppelin/bin/create-user-tools.sh + + testnames01=( + Rhaelhall + Fipa + Mythicson + Balline + Hiness + Anskelisia + Iflee + Mischiellis + Kellaug + Liphima + Jarters + Williazoga + Carrovieus + Pierione + Hayesphasia + Collinotter + Adazoga + Harinabla + Sanderlotus + Bellgrin + ) + + testnames02=( + Hamar + Carclop + Halda + Jaden + Mavaca + Franilley + Masonania + Webbbron + Granwaler + Stelama + ) + + testnames03=( + Smical + Reyesfan + Evison + Surbron + Floresslight + ) + + createarrayusers \ + "${testnames01[@]}" \ + | tee /tmp/testusers-01.json \ + | jq '[ .users[] | {"name": .shirouser.name, "pass": .shirouser.pass} ]' + + > [ + > { + > "name": "Rhaelhall", + > "pass": "ea8aiqu1liubachohthahwieh3ko1O" + > } + > { + > "name": "Fipa", + > "pass": "eigheiZoo9Mei4fereim1ahp3weu1E" + > } + > .... + > .... + > ] + + + createarrayusers \ + "${testnames02[@]}" \ + | tee /tmp/testusers-02.json \ + | jq '[ .users[] | {"name": .shirouser.name, "pass": .shirouser.pass} ]' + + > [ + > { + > "name": "Hamar", + > "pass": "fausaimuugh8ue3aNowaig4uozes6o" + > }, + > { + > "name": "Carclop", + > "pass": "aeX8Xie9oozeasiehoh4pheeyahliC" + > }, + > .... + > .... + > ] + + + createarrayusers \ + "${testnames03[@]}" \ + | tee /tmp/testusers-03.json \ + | jq '[ .users[] | {"name": .shirouser.name, "pass": .shirouser.pass} ]' + + > [ + > { + > "name": "Smical", + > "pass": "roh1ohpaeYohY4hiequeiseiMoh0ah" + > }, + > { + > "name": "Reyesfan", + > "pass": "eeyah5iegeis5ne6ohPh4hagaiduk8" + > }, + > .... + > .... + > ] + + +# ----------------------------------------------------- +# Create our benchmark script. +# TODO Create run-benchmark.py in ansible/client/bin. 
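+# The wrapper script below takes four positional arguments
+# (endpoint, testconfig, userlist, usercount), echoes them back as a
+# JSON config block, and hands them to AglaisBenchmarker.run()
+# with concurrent=True.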
+# Learning Python:
+#   Command line args
+#   https://realpython.com/python-command-line-arguments/
+#   String.format()
+#   https://docs.python.org/3/library/string.html#formatstrings
+#   Escape {} in format()
+#   https://stackoverflow.com/a/5466478
+#[root@ansibler]
+
+    cat > /tmp/run-benchmark.py << 'EOF'
+#!/bin/python3
+import sys
+from aglais_benchmark import AglaisBenchmarker
+
+try:
+
+    opts = [opt for opt in sys.argv[1:] if opt.startswith("-")]
+    args = [arg for arg in sys.argv[1:] if not arg.startswith("-")]
+
+    endpoint = args[0]
+    testconfig = args[1]
+    userlist = args[2]
+    usercount = int(args[3])
+
+except IndexError:
+
+    raise SystemExit(f"Usage: {sys.argv[0]} <endpoint> <testconfig> <userlist> <usercount>")
+
+print(
+"""
+{{
+\"config\": {{
+    \"endpoint\": \"{}\",
+    \"testconfig\": \"{}\",
+    \"userlist\": \"{}\",
+    \"usercount\": \"{}\"
+    }}
+}}
+""".format(
+    endpoint,
+    testconfig,
+    userlist,
+    usercount
+    )
+    )
+
+AglaisBenchmarker(
+    testconfig,
+    userlist,
+    "/tmp/",
+    endpoint
+    ).run(
+        concurrent=True,
+        users=usercount
+        )
+
+EOF
+
+    chmod 'a+x' /tmp/run-benchmark.py
+
+
+# -----------------------------------------------------
+# Run a quick test with one user.
+#[root@ansibler]
+
+    mkdir /tmp/results
+
+    endpoint="http://zeppelin:8080"
+
+    testconfig=/deployments/zeppelin/test/config/quick.json
+
+    testusers=/tmp/testusers-01.json
+    testname=single-user-01
+    usercount=1
+
+    /tmp/run-benchmark.py \
+        "${endpoint:?}" \
+        "${testconfig:?}" \
+        "${testusers:?}" \
+        "${usercount:?}" \
+    | tee "/tmp/results/${testname:?}.txt"
+
+    > b'Create notebook: 2H4STBJ8Y\n'
+    > b'Create notebook: 2H74SGMY6\n'
+    > b'Create notebook: 2H6XN9CJP\n'
+    > b'Create notebook: 2H7BKXBZW\n'
+    > Test completed! (34.56 seconds)
+    > ------------ Test Result: [PASS] ------------
+    > [{'GaiaDMPSetup': { .... }}]
+
+
+    sed "
+        0,/^----/ d
+        s/\"/#/g
+        s/'\(-\{0,1\}[0-9.]\{1,\}\)'/\1/g
+        s/:[[:space:]]*\([a-zA-Z]\{1,\}\)\([,}]\)/:'\1'\2/g
+        s/:[[:space:]]*\([,}]\),/: ''\1/g
+        s/'/\"/g
+        " \
+        "/tmp/results/${testname:?}.txt" \
+    | tee "/tmp/results/${testname:?}.json" \
+    | jq '
+        .[] | keys as $x | [ $x[] as $y | {name: $y, value: .[$y].result, time: .[$y].time.elapsed } ]
+        '
+
+    > [
+    >   {
+    >     "name": "GaiaDMPSetup",
+    >     "value": "PASS",
+    >     "time": 3.42
+    >   },
+    >   {
+    >     "name": "Library_Validation.json",
+    >     "value": "PASS",
+    >     "time": 9.98
+    >   },
+    >   {
+    >     "name": "Mean_proper_motions_over_the_sky",
+    >     "value": "PASS",
+    >     "time": 5.50
+    >   },
+    >   {
+    >     "name": "Source_counts_over_the_sky.json",
+    >     "value": "PASS",
+    >     "time": 15.64
+    >   }
+    > ]
+
+
+# -----------------------------------------------------
+# Step up to 4 users.
+#[root@ansibler]
+
+    loopcount=0
+    usercount=4
+    testname="multi-user-$(printf "%02d" ${usercount})-$(printf "%02d" ${loopcount})"
+
+    /tmp/run-benchmark.py \
+        "${endpoint:?}" \
+        "${testconfig:?}" \
+        "${testusers:?}" \
+        "${usercount:?}" \
+    | tee "/tmp/results/${testname:?}.txt"
+
+    > Test started [Multi User]
+    > b'Create notebook: 2H68V7CMA\n'
+    > b'Create notebook: 2H76YA1VH\n'
+    > b'Create notebook: 2H6PH6UMX\n'
+    > b'Create notebook: 2H3Z1747B\n'
+    > b'Create notebook: 2H4DCCJ4G\n'
+    > b'Create notebook: 2H7A9GPYV\n'
+    > b'Create notebook: 2H73F4DPN\n'
+    > b'Create notebook: 2H5FHSQAE\n'
+    > b'Create notebook: 2H49QWXPN\n'
+    > b'Create notebook: 2H6VHC35E\n'
+    > b'Create notebook: 2H6X7UU2N\n'
+    > b'Create notebook: 2H68BYQ2P\n'
+    > b'Create notebook: 2H469V5GK\n'
+    > b'Create notebook: 2H4MEZTQR\n'
+    > b'Create notebook: 2H4Q8DYEE\n'
+    > b'Create notebook: 2H4GYCUYS\n'
+    > Test completed!
(288.09 seconds) + > ------------ Test Result: [PASS] ------------ + > [{'GaiaDMPSetup': { .... }}] + + + sed " + 0,/^----/ d + s/\"/#/g + s/'\(-\{0,1\}[0-9.]\{1,\}\)'/\1/g + s/:[[:space:]]*\([a-zA-Z]\{1,\}\)\([,}]\)/:'\1'\2/g + s/:[[:space:]]*\([,}]\),/: ''\1/g + s/'/\"/g + " \ + "/tmp/results/${testname:?}.txt" \ + | tee "/tmp/results/${testname:?}.json" \ + | jq ' + .[] | keys as $x | [ $x[] as $y | {name: $y, value: .[$y].result, time: .[$y].time.elapsed } ] + ' + + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "PASS", + > "time": 3.76 + > }, + > { + > "name": "Library_Validation.json", + > "value": "PASS", + > "time": 5.82 + > }, + > { + > "name": "Mean_proper_motions_over_the_sky", + > "value": "PASS", + > "time": 16.08 + > }, + > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 18.80 + > } + > ] + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "PASS", + > "time": 51.87 + > }, + > { + > "name": "Library_Validation.json", + > "value": "PASS", + > "time": 9.71 + > }, + > { + > "name": "Mean_proper_motions_over_the_sky", + > "value": "PASS", + > "time": 40.85 + > }, + > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 19.91 + > } + > ] + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "PASS", + > "time": 107.21 + > }, + > { + > "name": "Library_Validation.json", + > "value": "PASS", + > "time": 9.60 + > }, + > { + > "name": "Mean_proper_motions_over_the_sky", + > "value": "PASS", + > "time": 126.54 + > }, + > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 44.72 + > } + > ] + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "PASS", + > "time": 107.22 + > }, + > { + > "name": "Library_Validation.json", + > "value": "PASS", + > "time": 9.64 + > }, + > { + > "name": "Mean_proper_motions_over_the_sky", + > "value": "PASS", + > "time": 94.14 + > }, + > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 13.41 + > } + > ] + + +# ----------------------------------------------------- +# Step up to 6 users. +#[root@ansibler] + + loopcount=0 + usercount=6 + testname="multi-user-$(printf "%02d" ${usercount})-$(printf "%02d" ${loopcount})" + + /tmp/run-benchmark.py \ + "${endpoint:?}" \ + "${testconfig:?}" \ + "${testusers:?}" \ + "${usercount:?}" \ + | tee "/tmp/results/${testname:?}.txt" + + + > Test started [Multi User] + > b'Create notebook: 2H7ASRFR5\n' + > b'Create notebook: 2H52UD857\n' + > .... + > .... + > b'Create notebook: 2H5U6N7R3\n' + > b'Create notebook: 2H584PBEF\n' + > Test completed! (190.47 seconds) + > ------------ Test Result: [PASS] ------------ + > [{'GaiaDMPSetup': { .... 
}}] + + + sed " + 0,/^----/ d + s/\"/#/g + s/'\(-\{0,1\}[0-9.]\{1,\}\)'/\1/g + s/:[[:space:]]*\([a-zA-Z]\{1,\}\)\([,}]\)/:'\1'\2/g + s/:[[:space:]]*\([,}]\),/: ''\1/g + s/'/\"/g + " \ + "/tmp/results/${testname:?}.txt" \ + | tee "/tmp/results/${testname:?}.json" \ + | jq ' + .[] | keys as $x | [ $x[] as $y | {name: $y, value: .[$y].result, time: .[$y].time.elapsed } ] + ' + + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "PASS", + > "time": 4.12 + > }, + > { + > "name": "Library_Validation.json", + > "value": "PASS", + > "time": 5.61 + > }, + > { + > "name": "Mean_proper_motions_over_the_sky", + > "value": "PASS", + > "time": 19.33 + > }, + > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 18.78 + > } + > ] + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "PASS", + > "time": 4.17 + > }, + > { + > "name": "Library_Validation.json", + > "value": "PASS", + > "time": 6.54 + > }, + > { + > "name": "Mean_proper_motions_over_the_sky", + > "value": "PASS", + > "time": 16.27 + > }, + > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 32.62 + > } + > ] + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "PASS", + > "time": 4.16 + > }, + > { + > "name": "Library_Validation.json", + > "value": "PASS", + > "time": 6.56 + > }, + > { + > "name": "Mean_proper_motions_over_the_sky", + > "value": "PASS", + > "time": 16.32 + > }, + > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 34.70 + > } + > ] + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "PASS", + > "time": 4.17 + > }, + > { + > "name": "Library_Validation.json", + > "value": "PASS", + > "time": 5.53 + > }, + > { + > "name": "Mean_proper_motions_over_the_sky", + > "value": "PASS", + > "time": 20.37 + > }, + > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 57.37 + > } + > ] + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "PASS", + > "time": 107.36 + > }, + > { + > "name": "Library_Validation.json", + > "value": "PASS", + > "time": 9.56 + > }, + > { + > "name": "Mean_proper_motions_over_the_sky", + > "value": "PASS", + > "time": 55.84 + > }, + > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 17.67 + > } + > ] + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "PASS", + > "time": 108.38 + > }, + > { + > "name": "Library_Validation.json", + > "value": "PASS", + > "time": 8.60 + > }, + > { + > "name": "Mean_proper_motions_over_the_sky", + > "value": "PASS", + > "time": 54.76 + > }, + > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 16.78 + > } + > ] + + +# ----------------------------------------------------- +# Step up to 6 users run 6 times. 
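+# The loop below saves each run as /tmp/results/multi-user-06-<nn>.txt
+# plus a filtered .json; a quick aggregate over the .json files (a
+# sketch, assuming the sed-converted structure shown above):
+#
+#     jq -s '[ .[][] | to_entries[] | .value.time.elapsed ] | add / length' \
+#         /tmp/results/multi-user-06-*.json
+#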
+#[root@ansibler] + + loopcount=6 + usercount=6 + + for i in $(seq 0 $((loopcount - 1))) + do + echo "" + echo "-------------" + echo "Loop [${i}]" + testname="multi-user-$(printf "%02d" ${usercount})-$(printf "%02d" ${i})" + + /tmp/run-benchmark.py \ + "${endpoint:?}" \ + "${testconfig:?}" \ + "${testusers:?}" \ + "${usercount:?}" \ + | tee "/tmp/results/${testname:?}.txt" + + done + + + + > Test started [Multi User] + > ERROR:root:list index out of range + > Traceback (most recent call last): + > File "/usr/local/lib/python3.10/site-packages/aglais_benchmark/aglais_benchmark.py", line 114, in run_notebook + > notebookid = text.split(": ")[1] + > IndexError: list index out of range + > b'Create notebook: 2H6ZRHKWC\n' + > b'Create notebook: 2H4KNWDFC\n' + > b'Create notebook: 2H6XNQ84T\n' + > b'Create notebook: 2H5SM7Q62\n' + > b'status_code:500\n' + > b'Create notebook: 2H7GPPB4S\n' + > b'Create notebook: 2H3Y5DQJ8\n' + > b'Create notebook: 2H6Q37HQT\n' + > b'Create notebook: 2H52UWVZW\n' + > b'Create notebook: 2H6944K21\n' + > b'Create notebook: 2H656UC5E\n' + > b'Create notebook: 2H5XT1J5P\n' + > b'Create notebook: 2H669UWG5\n' + > b'Create notebook: 2H3PDE7SC\n' + > b'Create notebook: 2H56PXTTU\n' + > b'Create notebook: 2H4YBGHD3\n' + > b'Create notebook: 2H6X7DEDG\n' + > b'Create notebook: 2H4DD8DA5\n' + > b'Create notebook: 2H5K5X7YD\n' + > b'Create notebook: 2H4BMKJ6R\n' + > b'Create notebook: 2H74Q14H5\n' + > b'Create notebook: 2H54EKUWZ\n' + > b'Create notebook: 2H6TNZ9RS\n' + > b'Create notebook: 2H6Z9WD4H\n' + > Test completed! (97.63 seconds) + > ------------ Test Result: [FAIL] ------------ + > [{'GaiaDMPSetup': { .... }}] + + + filter-results() + { + local testname=${1:?'testname required'} + sed " + 0,/^----/ d + s/\"/#/g + s/'\(-\{0,1\}[0-9.]\{1,\}\)'/\1/g + s/:[[:space:]]*\([a-zA-Z]\{1,\}\)\([,}]\)/:'\1'\2/g + s/:[[:space:]]*\([,}]\),/: ''\1/g + s/'/\"/g + " \ + "/tmp/results/${testname:?}.txt" \ + | tee "/tmp/results/${testname:?}.json" \ + | jq ' + .[] | keys as $x | [ $x[] as $y | {name: $y, value: .[$y].result, time: .[$y].time.elapsed } ] + ' + } + + filter-results "${testname}" + + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "PASS", + > "time": 3.84 + > }, + > { + > "name": "Library_Validation.json", + > "value": "PASS", + > "time": 6.52 + > }, + > { + > "name": "Mean_proper_motions_over_the_sky", + > "value": "PASS", + > "time": 7.05 + > }, + > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 44.80 + > } + > ] + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "PASS", + > "time": 3.89 + > }, + > { + > "name": "Library_Validation.json", + > "value": "PASS", + > "time": 4.52 + > }, + > { + > "name": "Mean_proper_motions_over_the_sky", + > "value": "PASS", + > "time": 18.35 + > }, + > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 17.81 + > } + > ] + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "PASS", + > "time": 3.94 + > }, + > { + > "name": "Library_Validation.json", + > "value": "PASS", + > "time": 6.43 + > }, + > { + > "name": "Mean_proper_motions_over_the_sky", + > "value": "PASS", + > "time": 6.93 + > }, + > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 80.30 + > } + > ] + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "PASS", + > "time": 3.91 + > }, + > { + > "name": "Library_Validation.json", + > "value": "PASS", + > "time": 6.42 + > }, + > { + > "name": "Mean_proper_motions_over_the_sky", + > "value": "PASS", + > "time": 6.95 + > }, + > { 
+ > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 48.99 + > } + > ] + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "FAIL", + > "time": 1.14 + > }, + > { + > "name": "Library_Validation.json", + > "value": "PASS", + > "time": 5.39 + > }, + > { + > "name": "Mean_proper_motions_over_the_sky", + > "value": "PASS", + > "time": 5.67 + > }, + > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 38.74 + > } + > ] + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "PASS", + > "time": 3.93 + > }, + > { + > "name": "Library_Validation.json", + > "value": "PASS", + > "time": 6.48 + > }, + > { + > "name": "Mean_proper_motions_over_the_sky", + > "value": "PASS", + > "time": 5.85 + > }, + > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 37.53 + > } + > ] + + +# ----------------------------------------------------- +# repeat 6 users run 6 times. +#[root@ansibler] + + loopcount=6 + usercount=6 + + for i in $(seq 0 $((loopcount - 1))) + do + echo "" + echo "-------------" + echo "Loop [${i}]" + testname="multi-user-$(printf "%02d" ${usercount})-$(printf "%02d" ${i})" + + /tmp/run-benchmark.py \ + "${endpoint:?}" \ + "${testconfig:?}" \ + "${testusers:?}" \ + "${usercount:?}" \ + | tee "/tmp/results/${testname:?}.txt" + + filter-results "${testname:?}" + + done + + + + > ------------- + > Loop [0] + > + > { + > "config": { + > "endpoint": "http://zeppelin:8080", + > "testconfig": "/deployments/zeppelin/test/config/quick.json", + > "userlist": "/tmp/testusers-01.json", + > "usercount": "6" + > } + > } + > + > /tmp/testusers-01.json + > Test started [Multi User] + > ERROR:root:list index out of range + > Traceback (most recent call last): + > File "/usr/local/lib/python3.10/site-packages/aglais_benchmark/aglais_benchmark.py", line 114, in run_notebook + > notebookid = text.split(": ")[1] + > IndexError: list index out of range + > ERROR:root:list index out of range + > Traceback (most recent call last): + > File "/usr/local/lib/python3.10/site-packages/aglais_benchmark/aglais_benchmark.py", line 114, in run_notebook + > notebookid = text.split(": ")[1] + > IndexError: list index out of range + > b'Create notebook: 2H71J3HT8\n' + > b'status_code:500\n' + > b'Create notebook: 2H3M55AZQ\n' + > b'Create notebook: 2H71K64B2\n' + > b'Create notebook: 2H7F7ECPE\n' + > b'Create notebook: 2H6Y6K2X3\n' + > b'Create notebook: 2H4SN74SQ\n' + > b'Create notebook: 2H6M2UTS9\n' + > b'Create notebook: 2H5GQUMGX\n' + > b'Create notebook: 2H4F27JU6\n' + > b'Create notebook: 2H6S33VH6\n' + > b'Create notebook: 2H5Z6UY1V\n' + > b'Create notebook: 2H4T2VS63\n' + > b'Create notebook: 2H54W82HW\n' + > b'Create notebook: 2H6V3RJ14\n' + > b'Create notebook: 2H4X2ZAWN\n' + > b'Create notebook: 2H4RKGA4E\n' + > b'Create notebook: 2H714QB6B\n' + > b'Create notebook: 2H69ZASTW\n' + > b'Create notebook: 2H7ESVDH7\n' + > b'status_code:500\n' + > b'Create notebook: 2H5HYUVU8\n' + > b'Create notebook: 2H3XZMVD2\n' + > b'Create notebook: 2H4SES5ND\n' + > Test completed! (98.09 seconds) + > ------------ Test Result: [FAIL] ------------ + > [{'GaiaDMPSetup': { .... 
}}] + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "PASS", + > "time": 4.05 + > }, + > { + > "name": "Library_Validation.json", + > "value": "PASS", + > "time": 6.40 + > }, + > { + > "name": "Mean_proper_motions_over_the_sky", + > "value": "PASS", + > "time": 6.88 + > }, + > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 79.32 + > } + > ] + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "PASS", + > "time": 4.09 + > }, + > { + > "name": "Library_Validation.json", + > "value": "PASS", + > "time": 5.68 + > }, + > { + > "name": "Mean_proper_motions_over_the_sky", + > "value": "FAIL", + > "time": 0.96 + > }, + > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 27.31 + > } + > ] + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "FAIL", + > "time": 1.23 + > }, + > { + > "name": "Library_Validation.json", + > "value": "PASS", + > "time": 6.47 + > }, + > { + > "name": "Mean_proper_motions_over_the_sky", + > "value": "PASS", + > "time": 7.93 + > }, + > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 82.42 + > } + > ] + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "PASS", + > "time": 4.12 + > }, + > { + > "name": "Library_Validation.json", + > "value": "PASS", + > "time": 6.48 + > }, + > { + > "name": "Mean_proper_motions_over_the_sky", + > "value": "PASS", + > "time": 17.26 + > }, + > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 21.91 + > } + > ] + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "PASS", + > "time": 4.08 + > }, + > { + > "name": "Library_Validation.json", + > "value": "PASS", + > "time": 6.53 + > }, + > { + > "name": "Mean_proper_motions_over_the_sky", + > "value": "PASS", + > "time": 17.30 + > }, + > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 34.36 + > } + > ] + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "PASS", + > "time": 4.07 + > }, + > { + > "name": "Library_Validation.json", + > "value": "PASS", + > "time": 6.52 + > }, + > { + > "name": "Mean_proper_motions_over_the_sky", + > "value": "PASS", + > "time": 16.23 + > }, + > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 29.16 + > } + > ] + > + > ------------- + > Loop [1] + > + > { + > "config": { + > "endpoint": "http://zeppelin:8080", + > "testconfig": "/deployments/zeppelin/test/config/quick.json", + > "userlist": "/tmp/testusers-01.json", + > "usercount": "6" + > } + > } + > + > /tmp/testusers-01.json + > Test started [Multi User] + > ERROR:root:list index out of range + > Traceback (most recent call last): + > File "/usr/local/lib/python3.10/site-packages/aglais_benchmark/aglais_benchmark.py", line 114, in run_notebook + > notebookid = text.split(": ")[1] + > IndexError: list index out of range + > b'Create notebook: 2H64VTQ6Z\n' + > b'status_code:500\n' + > b'Create notebook: 2H3KU4T52\n' + > b'Create notebook: 2H4E8HU89\n' + > b'Create notebook: 2H5DDF7N5\n' + > b'Create notebook: 2H7JWA1WU\n' + > b'Create notebook: 2H4YA7Q18\n' + > b'Create notebook: 2H7J1TU3E\n' + > b'Create notebook: 2H4TZJZKR\n' + > b'Create notebook: 2H5JEE9A6\n' + > b'Create notebook: 2H3PXTCZ1\n' + > b'Create notebook: 2H5EUURTH\n' + > b'Create notebook: 2H6D2T488\n' + > b'Create notebook: 2H3PNNPBY\n' + > b'Create notebook: 2H6KY7RCP\n' + > b'Create notebook: 2H5V9T2B1\n' + > b'Create notebook: 2H434WDEV\n' + > b'Create notebook: 2H4AHWZ96\n' + > b'Create notebook: 2H44MU5KU\n' + > b'Create notebook: 
2H5DY9QH9\n' + > b'Create notebook: 2H5R7VTWU\n' + > b'Create notebook: 2H3SDTRBK\n' + > b'Create notebook: 2H4VWQ993\n' + > b'Create notebook: 2H3VEV4DZ\n' + > Test completed! (95.58 seconds) + > ------------ Test Result: [FAIL] ------------ + > [{'GaiaDMPSetup': { .... }}] + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "PASS", + > "time": 3.95 + > }, + > { + > "name": "Library_Validation.json", + > "value": "PASS", + > "time": 6.42 + > }, + > { + > "name": "Mean_proper_motions_over_the_sky", + > "value": "PASS", + > "time": 7.04 + > }, + > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 78.13 + > } + > ] + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "PASS", + > "time": 4.01 + > }, + > { + > "name": "Library_Validation.json", + > "value": "PASS", + > "time": 6.42 + > }, + > { + > "name": "Mean_proper_motions_over_the_sky", + > "value": "PASS", + > "time": 18.51 + > }, + > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 17.74 + > } + > ] + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "PASS", + > "time": 3.96 + > }, + > { + > "name": "Library_Validation.json", + > "value": "PASS", + > "time": 6.41 + > }, + > { + > "name": "Mean_proper_motions_over_the_sky", + > "value": "PASS", + > "time": 7.99 + > }, + > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 77.16 + > } + > ] + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "PASS", + > "time": 4.05 + > }, + > { + > "name": "Library_Validation.json", + > "value": "PASS", + > "time": 5.63 + > }, + > { + > "name": "Mean_proper_motions_over_the_sky", + > "value": "PASS", + > "time": 19.49 + > }, + > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 22.97 + > } + > ] + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "PASS", + > "time": 3.96 + > }, + > { + > "name": "Library_Validation.json", + > "value": "PASS", + > "time": 6.56 + > }, + > { + > "name": "Mean_proper_motions_over_the_sky", + > "value": "PASS", + > "time": 5.95 + > }, + > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 33.45 + > } + > ] + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "PASS", + > "time": 4.00 + > }, + > { + > "name": "Library_Validation.json", + > "value": "PASS", + > "time": 5.45 + > }, + > { + > "name": "Mean_proper_motions_over_the_sky", + > "value": "FAIL", + > "time": 1.07 + > }, + > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 22.99 + > } + > ] + > + > ------------- + > Loop [2] + > + > { + > "config": { + > "endpoint": "http://zeppelin:8080", + > "testconfig": "/deployments/zeppelin/test/config/quick.json", + > "userlist": "/tmp/testusers-01.json", + > "usercount": "6" + > } + > } + > + > /tmp/testusers-01.json + > Test started [Multi User] + > ERROR:root:list index out of range + > Traceback (most recent call last): + > File "/usr/local/lib/python3.10/site-packages/aglais_benchmark/aglais_benchmark.py", line 114, in run_notebook + > notebookid = text.split(": ")[1] + > IndexError: list index out of range + > b'Create notebook: 2H5NTYU65\n' + > b'Create notebook: 2H4N87JGP\n' + > b'Create notebook: 2H7EQ7TRJ\n' + > b'Create notebook: 2H66J62DZ\n' + > b'Create notebook: 2H7A4HWHA\n' + > b'Create notebook: 2H4C5PQTK\n' + > b'Create notebook: 2H4H9KX2N\n' + > b'Create notebook: 2H6EV6UQA\n' + > b'status_code:500\n' + > b'Create notebook: 2H6B3RUZT\n' + > b'Create notebook: 2H5W18ZZ6\n' + > b'Create notebook: 2H57CE3CR\n' + > b'Create 
notebook: 2H73CMGMP\n' + > b'Create notebook: 2H5VHG9SC\n' + > b'Create notebook: 2H5QBHK89\n' + > b'Create notebook: 2H75TQEKP\n' + > b'Create notebook: 2H6R8SWDT\n' + > b'Create notebook: 2H6XGT5U3\n' + > b'Create notebook: 2H71ME4GZ\n' + > b'Create notebook: 2H7FYHW14\n' + > Test completed! (96.42 seconds) + > ------------ Test Result: [FAIL] ------------ + > [{'GaiaDMPSetup': { .... }}] + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "PASS", + > "time": 3.93 + > }, + > { + > "name": "Library_Validation.json", + > "value": "PASS", + > "time": 6.48 + > }, + > { + > "name": "Mean_proper_motions_over_the_sky", + > "value": "PASS", + > "time": 7.83 + > }, + > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 51.12 + > } + > ] + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "PASS", + > "time": 3.90 + > }, + > { + > "name": "Library_Validation.json", + > "value": "PASS", + > "time": 5.46 + > }, + > { + > "name": "Mean_proper_motions_over_the_sky", + > "value": "PASS", + > "time": 17.26 + > }, + > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 19.82 + > } + > ] + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "PASS", + > "time": 3.98 + > }, + > { + > "name": "Library_Validation.json", + > "value": "PASS", + > "time": 6.35 + > }, + > { + > "name": "Mean_proper_motions_over_the_sky", + > "value": "PASS", + > "time": 6.79 + > }, + > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 79.26 + > } + > ] + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "PASS", + > "time": 3.97 + > }, + > { + > "name": "Library_Validation.json", + > "value": "PASS", + > "time": 6.50 + > }, + > { + > "name": "Mean_proper_motions_over_the_sky", + > "value": "PASS", + > "time": 5.80 + > }, + > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 18.78 + > } + > ] + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "FAIL", + > "time": 1.14 + > }, + > { + > "name": "Library_Validation.json", + > "value": "PASS", + > "time": 6.42 + > }, + > { + > "name": "Mean_proper_motions_over_the_sky", + > "value": "PASS", + > "time": 5.68 + > }, + > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 33.33 + > } + > ] + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "PASS", + > "time": 3.95 + > }, + > { + > "name": "Library_Validation.json", + > "value": "PASS", + > "time": 6.49 + > }, + > { + > "name": "Mean_proper_motions_over_the_sky", + > "value": "PASS", + > "time": 17.23 + > }, + > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 27.11 + > } + > ] + > + > ------------- + > Loop [3] + > + > { + > "config": { + > "endpoint": "http://zeppelin:8080", + > "testconfig": "/deployments/zeppelin/test/config/quick.json", + > "userlist": "/tmp/testusers-01.json", + > "usercount": "6" + > } + > } + > + > /tmp/testusers-01.json + > Test started [Multi User] + > b'Create notebook: 2H3WSPQHX\n' + > b'Create notebook: 2H55KSSGZ\n' + > b'Create notebook: 2H4BV3PZD\n' + > b'Create notebook: 2H3WZWTG7\n' + > b'Create notebook: 2H6K7UYBV\n' + > b'Create notebook: 2H6K7SUKZ\n' + > b'Create notebook: 2H5N5X153\n' + > b'Create notebook: 2H57EFGB9\n' + > b'Create notebook: 2H3TD2JT7\n' + > b'Create notebook: 2H6JCAVR5\n' + > b'Create notebook: 2H6SR47MC\n' + > b'Create notebook: 2H4FY7KRT\n' + > b'Create notebook: 2H4E7M2ZA\n' + > b'Create notebook: 2H44PRK63\n' + > b'Create notebook: 2H4G8QSTR\n' + > b'Create notebook: 2H5ETM5B2\n' + > 
b'Create notebook: 2H5TRMM9M\n' + > b'Create notebook: 2H6QBGEXU\n' + > b'Create notebook: 2H6UVUXNA\n' + > b'Create notebook: 2H6JF4PJ6\n' + > b'Create notebook: 2H3ZGTCR2\n' + > b'Create notebook: 2H4D2GT23\n' + > b'Create notebook: 2H45DHY4V\n' + > b'Create notebook: 2H4BF9UNQ\n' + > Test completed! (67.85 seconds) + > ------------ Test Result: [PASS] ------------ + > [{'GaiaDMPSetup': { .... }}] + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "PASS", + > "time": 4.19 + > }, + > { + > "name": "Library_Validation.json", + > "value": "PASS", + > "time": 5.49 + > }, + > { + > "name": "Mean_proper_motions_over_the_sky", + > "value": "PASS", + > "time": 14.37 + > }, + > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 39.81 + > } + > ] + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "PASS", + > "time": 4.26 + > }, + > { + > "name": "Library_Validation.json", + > "value": "PASS", + > "time": 6.66 + > }, + > { + > "name": "Mean_proper_motions_over_the_sky", + > "value": "PASS", + > "time": 18.38 + > }, + > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 20.87 + > } + > ] + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "PASS", + > "time": 4.20 + > }, + > { + > "name": "Library_Validation.json", + > "value": "PASS", + > "time": 6.51 + > }, + > { + > "name": "Mean_proper_motions_over_the_sky", + > "value": "PASS", + > "time": 7.03 + > }, + > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 50.03 + > } + > ] + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "PASS", + > "time": 4.16 + > }, + > { + > "name": "Library_Validation.json", + > "value": "PASS", + > "time": 6.52 + > }, + > { + > "name": "Mean_proper_motions_over_the_sky", + > "value": "PASS", + > "time": 6.00 + > }, + > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 29.29 + > } + > ] + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "PASS", + > "time": 4.18 + > }, + > { + > "name": "Library_Validation.json", + > "value": "PASS", + > "time": 6.44 + > }, + > { + > "name": "Mean_proper_motions_over_the_sky", + > "value": "PASS", + > "time": 17.42 + > }, + > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 25.04 + > } + > ] + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "PASS", + > "time": 4.28 + > }, + > { + > "name": "Library_Validation.json", + > "value": "PASS", + > "time": 5.40 + > }, + > { + > "name": "Mean_proper_motions_over_the_sky", + > "value": "PASS", + > "time": 7.03 + > }, + > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 51.06 + > } + > ] + > + > ------------- + > Loop [4] + > + > { + > "config": { + > "endpoint": "http://zeppelin:8080", + > "testconfig": "/deployments/zeppelin/test/config/quick.json", + > "userlist": "/tmp/testusers-01.json", + > "usercount": "6" + > } + > } + > + > /tmp/testusers-01.json + > Test started [Multi User] + > ERROR:root:list index out of range + > Traceback (most recent call last): + > File "/usr/local/lib/python3.10/site-packages/aglais_benchmark/aglais_benchmark.py", line 114, in run_notebook + > notebookid = text.split(": ")[1] + > IndexError: list index out of range + > + > + > b'status_code:500\n' + > b'Create notebook: 2H5F1XAGM\n' + > b'Create notebook: 2H4NPGZ7H\n' + > b'Create notebook: 2H5CVRRD5\n' + > b'Create notebook: 2H5ZM37WR\n' + > b'Create notebook: 2H7FSWM1N\n' + > b'Create notebook: 2H627GV3Y\n' + > b'Create notebook: 2H7GCDYHU\n' + > b'Create notebook: 
2H5DZV9VE\n' + > b'Create notebook: 2H73QDRE4\n' + > b'Create notebook: 2H6WTZHVJ\n' + > b'Create notebook: 2H56DS6T9\n' + > b'Create notebook: 2H3MN37E1\n' + > b'Create notebook: 2H45XA8KY\n' + > b'Create notebook: 2H5KGM7K7\n' + > b'Create notebook: 2H7JNGYXN\n' + > b'Create notebook: 2H5T78U9T\n' + > b'Create notebook: 2H48YEN47\n' + > b'Create notebook: 2H6V9K9BJ\n' + > b'Create notebook: 2H6F99UVX\n' + > b'Create notebook: 2H79A5X5C\n' + > b'Create notebook: 2H6PJDV9U\n' + > b'Create notebook: 2H6AP11R8\n' + > b'Create notebook: 2H5MYZ1ZN\n' + > Test completed! (64.28 seconds) + > ------------ Test Result: [FAIL] ------------ + > [{'GaiaDMPSetup': { .... }}] + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "PASS", + > "time": 4.14 + > }, + > { + > "name": "Library_Validation.json", + > "value": "PASS", + > "time": 5.45 + > }, + > { + > "name": "Mean_proper_motions_over_the_sky", + > "value": "PASS", + > "time": 6.42 + > }, + > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 48.23 + > } + > ] + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "PASS", + > "time": 4.13 + > }, + > { + > "name": "Library_Validation.json", + > "value": "PASS", + > "time": 6.39 + > }, + > { + > "name": "Mean_proper_motions_over_the_sky", + > "value": "PASS", + > "time": 7.95 + > }, + > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 16.67 + > } + > ] + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "PASS", + > "time": 4.13 + > }, + > { + > "name": "Library_Validation.json", + > "value": "PASS", + > "time": 6.53 + > }, + > { + > "name": "Mean_proper_motions_over_the_sky", + > "value": "PASS", + > "time": 7.07 + > }, + > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 45.02 + > } + > ] + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "FAIL", + > "time": 1.31 + > }, + > { + > "name": "Library_Validation.json", + > "value": "PASS", + > "time": 6.63 + > }, + > { + > "name": "Mean_proper_motions_over_the_sky", + > "value": "PASS", + > "time": 5.72 + > }, + > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 20.19 + > } + > ] + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "PASS", + > "time": 4.10 + > }, + > { + > "name": "Library_Validation.json", + > "value": "PASS", + > "time": 6.46 + > }, + > { + > "name": "Mean_proper_motions_over_the_sky", + > "value": "PASS", + > "time": 6.81 + > }, + > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 34.54 + > } + > ] + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "PASS", + > "time": 4.10 + > }, + > { + > "name": "Library_Validation.json", + > "value": "PASS", + > "time": 6.51 + > }, + > { + > "name": "Mean_proper_motions_over_the_sky", + > "value": "PASS", + > "time": 7.10 + > }, + > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 28.13 + > } + > ] + > + > ------------- + > Loop [5] + > + > { + > "config": { + > "endpoint": "http://zeppelin:8080", + > "testconfig": "/deployments/zeppelin/test/config/quick.json", + > "userlist": "/tmp/testusers-01.json", + > "usercount": "6" + > } + > } + > + > /tmp/testusers-01.json + > Test started [Multi User] + > b'Create notebook: 2H545BCKX\n' + > b'Create notebook: 2H4F9G2SR\n' + > b'Create notebook: 2H4A4A6CD\n' + > b'Create notebook: 2H5VRG689\n' + > b'Create notebook: 2H3SZ5V3D\n' + > b'Create notebook: 2H7EC9H2K\n' + > b'Create notebook: 2H7K2MAYX\n' + > b'Create notebook: 2H485E7B4\n' + > b'Create notebook: 
2H6KUZ8A4\n' + > b'Create notebook: 2H3ZTEYR5\n' + > b'Create notebook: 2H6NYK2P5\n' + > b'Create notebook: 2H4XW9EFE\n' + > b'Create notebook: 2H41RHXRP\n' + > b'Create notebook: 2H4DZ1XJG\n' + > b'Create notebook: 2H5ENFVAZ\n' + > b'Create notebook: 2H4NHM9EQ\n' + > b'Create notebook: 2H573N67Z\n' + > b'Create notebook: 2H71Y12QU\n' + > b'Create notebook: 2H6QCD9MF\n' + > b'Create notebook: 2H5DU71F2\n' + > b'Create notebook: 2H6HGY3DB\n' + > b'Create notebook: 2H5KNJPT7\n' + > b'Create notebook: 2H3PTMUD4\n' + > b'Create notebook: 2H7FRWQXZ\n' + > Test completed! (65.45 seconds) + > ------------ Test Result: [PASS] ------------ + > [{'GaiaDMPSetup': { .... }}] + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "PASS", + > "time": 4.16 + > }, + > { + > "name": "Library_Validation.json", + > "value": "PASS", + > "time": 5.54 + > }, + > { + > "name": "Mean_proper_motions_over_the_sky", + > "value": "PASS", + > "time": 7.54 + > }, + > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 48.18 + > } + > ] + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "PASS", + > "time": 4.12 + > }, + > { + > "name": "Library_Validation.json", + > "value": "PASS", + > "time": 6.79 + > }, + > { + > "name": "Mean_proper_motions_over_the_sky", + > "value": "PASS", + > "time": 7.60 + > }, + > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 16.84 + > } + > ] + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "PASS", + > "time": 4.16 + > }, + > { + > "name": "Library_Validation.json", + > "value": "PASS", + > "time": 5.54 + > }, + > { + > "name": "Mean_proper_motions_over_the_sky", + > "value": "PASS", + > "time": 7.64 + > }, + > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 46.01 + > } + > ] + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "PASS", + > "time": 3.99 + > }, + > { + > "name": "Library_Validation.json", + > "value": "PASS", + > "time": 6.51 + > }, + > { + > "name": "Mean_proper_motions_over_the_sky", + > "value": "PASS", + > "time": 6.10 + > }, + > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 20.92 + > } + > ] + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "PASS", + > "time": 4.19 + > }, + > { + > "name": "Library_Validation.json", + > "value": "PASS", + > "time": 6.53 + > }, + > { + > "name": "Mean_proper_motions_over_the_sky", + > "value": "PASS", + > "time": 8.52 + > }, + > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 33.52 + > } + > ] + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "PASS", + > "time": 4.11 + > }, + > { + > "name": "Library_Validation.json", + > "value": "PASS", + > "time": 6.64 + > }, + > { + > "name": "Mean_proper_motions_over_the_sky", + > "value": "PASS", + > "time": 6.38 + > }, + > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 28.41 + > } + > ] + + + # + # 4/6 tests failing, with a 500 error creating a notebook. 
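+    # A short random pause before each create-notebook call might avoid
+    # the collision between concurrent POSTs (the idea noted below); a
+    # minimal client-side sketch, not yet wired into aglais_benchmark:
+    #
+    #     import time, random
+    #     # spread simultaneous notebook creation over ~2 seconds
+    #     time.sleep(random.uniform(0.0, 2.0))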
+ # + + grep 'Result:' /tmp/results/multi-user-06-*.txt + + > /tmp/results/multi-user-06-00.txt:------------ Test Result: [FAIL] ------------ + > /tmp/results/multi-user-06-01.txt:------------ Test Result: [FAIL] ------------ + > /tmp/results/multi-user-06-02.txt:------------ Test Result: [FAIL] ------------ + > /tmp/results/multi-user-06-03.txt:------------ Test Result: [PASS] ------------ + > /tmp/results/multi-user-06-04.txt:------------ Test Result: [FAIL] ------------ + > /tmp/results/multi-user-06-05.txt:------------ Test Result: [PASS] ------------ + + # + # Nothing visible in the Zeppelin logs. + # + + # + # Try adding a random pause before the create notebook step. + # + + + +# ----------------------------------------------------- +# Try 6 users run 6 times. +#[root@ansibler] + + loopcount=6 + usercount=6 + + for i in $(seq 0 $((loopcount - 1))) + do + echo "" + echo "-------------" + echo "Loop [${i}]" + testname="multi-user-$(printf "%02d" ${usercount})-$(printf "%02d" ${i})" + + /tmp/run-benchmark.py \ + "${endpoint:?}" \ + "${testconfig:?}" \ + "${testusers:?}" \ + "${usercount:?}" \ + | tee "/tmp/results/${testname:?}.txt" + + filter-results "${testname:?}" + done + + + > .... + > ERROR [2022-06-02 14:26:45,195] ({qtp686466458-26908} WebApplicationExceptionMapper.java[toResponse]:49) - Error response + > java.lang.OutOfMemoryError: GC overhead limit exceeded + > .... + + > .... + > INFO [2022-06-02 14:21:53,009] ({qtp686466458-26219} VFSNotebookRepo.java[save]:144) - Saving note 2H6DXDKCJ to tmp/K1MB2HNMUH.json_2H6DXDKCJ.zpln + > ERROR [2022-06-02 14:21:53,009] ({LuceneSearch13} NoteEventAsyncListener.java[run]:128) - Fail to handle NoteEvent + > org.apache.lucene.store.AlreadyClosedException: this IndexWriter is closed + > at org.apache.lucene.index.IndexWriter.ensureOpen(IndexWriter.java:877) + > at org.apache.lucene.index.IndexWriter.ensureOpen(IndexWriter.java:891) + > at org.apache.lucene.index.IndexWriter.updateDocuments(IndexWriter.java:1468) + > at org.apache.lucene.index.IndexWriter.updateDocument(IndexWriter.java:1757) + > at org.apache.zeppelin.search.LuceneSearch.updateDoc(LuceneSearch.java:240) + > at org.apache.zeppelin.search.LuceneSearch.indexNoteName(LuceneSearch.java:398) + > at org.apache.zeppelin.search.LuceneSearch.addIndexDocAsync(LuceneSearch.java:324) + > at org.apache.zeppelin.search.LuceneSearch.addNoteIndex(LuceneSearch.java:305) + > at org.apache.zeppelin.search.SearchService.handleNoteCreateEvent(SearchService.java:108) + > at org.apache.zeppelin.notebook.NoteEventAsyncListener$EventHandling.run(NoteEventAsyncListener.java:113) + > at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) + > at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) + > at java.lang.Thread.run(Thread.java:748) + > Caused by: java.lang.OutOfMemoryError: Java heap space + > INFO [2022-06-02 14:21:53,010] ({qtp686466458-26219} NotebookRestApi.java[initParagraph]:1105) - Init Paragraph for user Fipa + > ERROR [2022-06-02 14:21:53,010] ({LuceneSearch13} NoteEventAsyncListener.java[run]:128) - Fail to handle NoteEvent + > org.apache.lucene.store.AlreadyClosedException: this IndexWriter is closed + > at org.apache.lucene.index.IndexWriter.ensureOpen(IndexWriter.java:877) + > at org.apache.lucene.index.IndexWriter.ensureOpen(IndexWriter.java:891) + > at org.apache.lucene.index.IndexWriter.updateDocuments(IndexWriter.java:1468) + > at org.apache.lucene.index.IndexWriter.updateDocument(IndexWriter.java:1757) + > at 
org.apache.zeppelin.search.LuceneSearch.updateDoc(LuceneSearch.java:240) + > at org.apache.zeppelin.search.LuceneSearch.addParagraphIndex(LuceneSearch.java:314) + > at org.apache.zeppelin.search.SearchService.handleParagraphCreateEvent(SearchService.java:123) + > at org.apache.zeppelin.notebook.NoteEventAsyncListener$EventHandling.run(NoteEventAsyncListener.java:119) + > at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) + > at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) + > at java.lang.Thread.run(Thread.java:748) + > Caused by: java.lang.OutOfMemoryError: Java heap space + > INFO [2022-06-02 14:21:53,010] ({qtp686466458-26219} NotebookRestApi.java[configureParagraph]:1116) - Configure Paragraph for user Fipa + > INFO [2022-06-02 14:21:53,071] ({qtp686466458-26219} NotebookRestApi.java[initParagraph]:1105) - Init Paragraph for user Fipa + > .... + + # + # So now the whole thing is locking up .... + # Not sure about Zeppelin, but the test client hangs .. + # + + +# ----------------------------------------------------- +# Try 4 users run 4 times. +#[root@ansibler] + + loopcount=4 + usercount=4 + + for i in $(seq 0 $((loopcount - 1))) + do + echo "" + echo "-------------" + echo "Loop [${i}]" + testname="multi-user-$(printf "%02d" ${usercount})-$(printf "%02d" ${i})" + + /tmp/run-benchmark.py \ + "${endpoint:?}" \ + "${testconfig:?}" \ + "${testusers:?}" \ + "${usercount:?}" \ + | tee "/tmp/results/${testname:?}.txt" + + filter-results "${testname:?}" + done + + # + # Locked up, nothing running on the server. + # + + + One job is listed as ACCEPTED but not RUNNING in the Hadoop UI. + + > [Thu Jun 02 15:31:12 +0000 2022] + > Application is added to the scheduler and is not yet activated. + > Queue's AM resource limit exceeded. + > Details : + > AM Partition = ; + > AM Resource Request = ; + > Queue Resource Limit for AM = ; + > User AM Resource Limit of the queue = ; + > Queue AM Resource Usage = ; + + # + # Notebooks run via the Zeppelin UI get stalled as soon as they try to run a Spark task. + # + + +# ----------------------------------------------------- +# Restart the Hadoop services. +#[root@ansibler] + + + ssh master01 + + stop-all.sh + + > WARNING: Stopping all Apache Hadoop daemons as fedora in 10 seconds. + > WARNING: Use CTRL-C to abort. + > Stopping namenodes on [master01] + > Stopping datanodes + > Stopping secondary namenodes [iris-gaia-blue-20220602-master01] + > iris-gaia-blue-20220602-master01: fedora@iris-gaia-blue-20220602-master01: Permission denied (publickey,gssapi-keyex,gssapi-with-mic). + > Stopping nodemanagers + > worker06: WARNING: nodemanager did not stop gracefully after 5 seconds: Trying to kill with kill -9 + > worker02: WARNING: nodemanager did not stop gracefully after 5 seconds: Trying to kill with kill -9 + > worker01: WARNING: nodemanager did not stop gracefully after 5 seconds: Trying to kill with kill -9 + > worker05: WARNING: nodemanager did not stop gracefully after 5 seconds: Trying to kill with kill -9 + > worker03: WARNING: nodemanager did not stop gracefully after 5 seconds: Trying to kill with kill -9 + > worker04: WARNING: nodemanager did not stop gracefully after 5 seconds: Trying to kill with kill -9 + > Stopping resourcemanager + + + start-all.sh + + > WARNING: Attempting to start all Apache Hadoop daemons as fedora in 10 seconds. + > WARNING: This is not a recommended production deployment configuration. + > WARNING: Use CTRL-C to abort. 
+ > Starting namenodes on [master01] + > Starting datanodes + > Starting secondary namenodes [iris-gaia-blue-20220602-master01] + > iris-gaia-blue-20220602-master01: fedora@iris-gaia-blue-20220602-master01: Permission denied (publickey,gssapi-keyex,gssapi-with-mic). + > Starting resourcemanager + > Starting nodemanagers + + + # + # Notebooks run via the Zeppelin UI work OK. + # + + + # + # Benchmark tests fail with + # + + + > .... + > ------------ Test Result: [ERROR] ------------ + > [{'GaiaDMPSetup': {'result': 'PASS', 'outputs': {'valid': True}, 'time': {'result': 'FAST', 'elapsed': '10.29', 'expected': '45.00', 'percent': '-77.13', 'start': '2022-06-02T16:35:28.967009', 'finish': '2022-06-02T16:35:39.257402'}, 'logs': ''}, 'Mean_proper_motions_over_the_sky': {'result': 'PASS', 'outputs': {'valid': True}, 'time': {'result': 'FAST', 'elapsed': '14.95', 'expected': '55.00', 'percent': '-72.82', 'start': '2022-06-02T16:35:39.257542', 'finish': '2022-06-02T16:35:54.204476'}, 'logs': ''}, 'Source_counts_over_the_sky.json': {'result': 'PASS', 'outputs': {'valid': True}, 'time': {'result': 'FAST', 'elapsed': '21.66', 'expected': '22.00', 'percent': '-1.53', 'start': '2022-06-02T16:35:54.205012', 'finish': '2022-06-02T16:36:15.867640'}, 'logs': ''}, 'Library_Validation.json': {'result': 'PASS', 'outputs': {'valid': True}, 'time': {'result': 'FAST', 'elapsed': '13.98', 'expected': '60.00', 'percent': '-76.70', 'start': '2022-06-02T16:36:15.868025', 'finish': '2022-06-02T16:36:29.847677'}, 'logs': ''}}, {'GaiaDMPSetup': {'result': 'ERROR', 'outputs': {'valid': True}, 'time': {'result': 'FAST', 'elapsed': '11.59', 'expected': '45.00', 'percent': '-74.23', 'start': '2022-06-02T16:35:28.968149', 'finish': '2022-06-02T16:35:40.563038'}, 'logs': "Py4JJavaError: An error occurred while calling o471.toLocalIterator.\n: java.lang.IllegalStateException: Cannot call methods on a stopped SparkContext.\nThis stopped SparkContext was created at:\n\norg.apache.spark.sql.SparkSession$Builder.getOrCreate(SparkSession.scala:939)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native 
Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.zeppelin.spark.BaseSparkScalaInterpreter.spark2CreateContext(BaseSparkScalaInterpreter.scala:299)\norg.apache.zeppelin.spark.BaseSparkScalaInterpreter.createSparkContext(BaseSparkScalaInterpreter.scala:228)\norg.apache.zeppelin.spark.SparkScala212Interpreter.open(SparkScala212Interpreter.scala:88)\norg.apache.zeppelin.spark.SparkInterpreter.open(SparkInterpreter.java:121)\norg.apache.zeppelin.interpreter.LazyOpenInterpreter.open(LazyOpenInterpreter.java:70)\norg.apache.zeppelin.interpreter.Interpreter.getInterpreterInTheSameSessionByClassName(Interpreter.java:322)\norg.apache.zeppelin.interpreter.Interpreter.getInterpreterInTheSameSessionByClassName(Interpreter.java:333)\norg.apache.zeppelin.spark.PySparkInterpreter.open(PySparkInterpreter.java:90)\norg.apache.zeppelin.interpreter.LazyOpenInterpreter.open(LazyOpenInterpreter.java:70)\norg.apache.zeppelin.interpreter.remote.RemoteInterpreterServer$InterpretJob.jobRun(RemoteInterpreterServer.java:833)\norg.apache.zeppelin.interpreter.remote.RemoteInterpreterServer$InterpretJob.jobRun(RemoteInterpreterServer.java:741)\norg.apache.zeppelin.scheduler.Job.run(Job.java:172)\norg.apache.zeppelin.scheduler.AbstractScheduler.runJob(AbstractScheduler.java:132)\norg.apache.zeppelin.scheduler.FIFOScheduler.lambda$runJobInScheduler$0(FIFOScheduler.java:42)\njava.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)\n\nThe currently active SparkContext was created at:\n\n(No active SparkContext.)\n \n\tat org.apache.spark.SparkContext.assertNotStopped(SparkContext.scala:118)\n\tat org.apache.spark.SparkContext.defaultParallelism(SparkContext.scala:2492)\n\tat org.apache.spark.sql.execution.LocalTableScanExec.rdd$lzycompute(LocalTableScanExec.scala:52)\n\tat org.apache.spark.sql.execution.LocalTableScanExec.rdd(LocalTableScanExec.scala:48)\n\tat org.apache.spark.sql.execution.LocalTableScanExec.doExecute(LocalTableScanExec.scala:59)\n\tat org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:180)\n\tat org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:218)\n\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)\n\tat org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:215)\n\tat org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:176)\n\tat org.apache.spark.sql.execution.SparkPlan.getByteArrayRdd(SparkPlan.scala:321)\n\tat org.apache.spark.sql.execution.SparkPlan.executeToIterator(SparkPlan.scala:409)\n\tat org.apache.spark.sql.Dataset.$anonfun$toLocalIterator$1(Dataset.scala:2996)\n\tat org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3687)\n\tat org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:103)\n\tat org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)\n\tat org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:90)\n\tat org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)\n\tat org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)\n\tat org.apache.spark.sql.Dataset.withAction(Dataset.scala:3685)\n\tat org.apache.spark.sql.Dataset.toLocalIterator(Dataset.scala:2994)\n\tat 
sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\n\tat sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n\tat java.lang.reflect.Method.invoke(Method.java:498)\n\tat py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\n\tat py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\n\tat py4j.Gateway.invoke(Gateway.java:282)\n\tat py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\n\tat py4j.commands.CallCommand.execute(CallCommand.java:79)\n\tat py4j.GatewayConnection.run(GatewayConnection.java:238)\n\tat java.lang.Thread.run(Thread.java:748)\n\n(, Py4JJavaError('An error occurred while calling o471.toLocalIterator.\\n', JavaObject id=o472), )"}, 'Mean_proper_motions_over_the_sky': {'result': 'ERROR', 'outputs': {'valid': True}, 'time': {'result': 'FAST', 'elapsed': '9.77', 'expected': '55.00', 'percent': '-82.23', 'start': '2022-06-02T16:35:40.563176', 'finish': '2022-06-02T16:35:50.334953'}, 'logs': "Py4JJavaError: An error occurred while calling o481.cache.\n: java.lang.IllegalStateException: Cannot call methods on a stopped SparkContext.\nThis stopped SparkContext was created at:\n\norg.apache.spark.sql.SparkSession$Builder.getOrCreate(SparkSession.scala:939)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.zeppelin.spark.BaseSparkScalaInterpreter.spark2CreateContext(BaseSparkScalaInterpreter.scala:299)\norg.apache.zeppelin.spark.BaseSparkScalaInterpreter.createSparkContext(BaseSparkScalaInterpreter.scala:228)\norg.apache.zeppelin.spark.SparkScala212Interpreter.open(SparkScala212Interpreter.scala:88)\norg.apache.zeppelin.spark.SparkInterpreter.open(SparkInterpreter.java:121)\norg.apache.zeppelin.interpreter.LazyOpenInterpreter.open(LazyOpenInterpreter.java:70)\norg.apache.zeppelin.interpreter.Interpreter.getInterpreterInTheSameSessionByClassName(Interpreter.java:322)\norg.apache.zeppelin.interpreter.Interpreter.getInterpreterInTheSameSessionByClassName(Interpreter.java:333)\norg.apache.zeppelin.spark.PySparkInterpreter.open(PySparkInterpreter.java:90)\norg.apache.zeppelin.interpreter.LazyOpenInterpreter.open(LazyOpenInterpreter.java:70)\norg.apache.zeppelin.interpreter.remote.RemoteInterpreterServer$InterpretJob.jobRun(RemoteInterpreterServer.java:833)\norg.apache.zeppelin.interpreter.remote.RemoteInterpreterServer$InterpretJob.jobRun(RemoteInterpreterServer.java:741)\norg.apache.zeppelin.scheduler.Job.run(Job.java:172)\norg.apache.zeppelin.scheduler.AbstractScheduler.runJob(AbstractScheduler.java:132)\norg.apache.zeppelin.scheduler.FIFOScheduler.lambda$runJobInScheduler$0(FIFOScheduler.java:42)\njava.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)\n\nThe currently active SparkContext was created at:\n\n(No active SparkContext.)\n \n\tat org.apache.spark.SparkContext.assertNotStopped(SparkContext.scala:118)\n\tat org.apache.spark.sql.SparkSession.(SparkSession.scala:108)\n\tat org.apache.spark.sql.SparkSession.cloneSession(SparkSession.scala:272)\n\tat org.apache.spark.sql.SparkSession$.getOrCloneSessionWithConfigsOff(SparkSession.scala:1079)\n\tat org.apache.spark.sql.execution.CacheManager.cacheQuery(CacheManager.scala:96)\n\tat 
org.apache.spark.sql.Dataset.persist(Dataset.scala:3165)\n\tat org.apache.spark.sql.Dataset.cache(Dataset.scala:3175)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\n\tat sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n\tat java.lang.reflect.Method.invoke(Method.java:498)\n\tat py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\n\tat py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\n\tat py4j.Gateway.invoke(Gateway.java:282)\n\tat py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\n\tat py4j.commands.CallCommand.execute(CallCommand.java:79)\n\tat py4j.GatewayConnection.run(GatewayConnection.java:238)\n\tat java.lang.Thread.run(Thread.java:748)\n\n(, Py4JJavaError('An error occurred while calling o481.cache.\\n', JavaObject id=o482), )"}, 'Source_counts_over_the_sky.json': {'result': 'ERROR', 'outputs': {'valid': True}, 'time': {'result': 'FAST', 'elapsed': '11.07', 'expected': '22.00', 'percent': '-49.68', 'start': '2022-06-02T16:35:50.335146', 'finish': '2022-06-02T16:36:01.405690'}, 'logs': "Py4JJavaError: An error occurred while calling o487.javaToPython.\n: org.apache.spark.sql.catalyst.errors.package$TreeNodeException: execute, tree:\nExchange hashpartitioning(FLOOR((cast(source_id#47L as double) / 1.40737488355328E14))#279L, 200), ENSURE_REQUIREMENTS, [id=#181]\n+- *(1) HashAggregate(keys=[FLOOR((cast(source_id#47L as double) / 1.40737488355328E14)) AS FLOOR((cast(source_id#47L as double) / 1.40737488355328E14))#279L], functions=[partial_count(1)], output=[FLOOR((cast(source_id#47L as double) / 1.40737488355328E14))#279L, count#281L])\n +- *(1) ColumnarToRow\n +- FileScan parquet gaiaedr3.gaia_source[source_id#47L] Batched: true, DataFilters: [], Format: Parquet, Location: InMemoryFileIndex[file:/data/gaia/GEDR3/GEDR3_GAIASOURCE], PartitionFilters: [], PushedFilters: [], ReadSchema: struct\n\n\tat org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:56)\n\tat org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.doExecute(ShuffleExchangeExec.scala:163)\n\tat org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:180)\n\tat org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:218)\n\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)\n\tat org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:215)\n\tat org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:176)\n\tat org.apache.spark.sql.execution.InputAdapter.inputRDD(WholeStageCodegenExec.scala:525)\n\tat org.apache.spark.sql.execution.InputRDDCodegen.inputRDDs(WholeStageCodegenExec.scala:453)\n\tat org.apache.spark.sql.execution.InputRDDCodegen.inputRDDs$(WholeStageCodegenExec.scala:452)\n\tat org.apache.spark.sql.execution.InputAdapter.inputRDDs(WholeStageCodegenExec.scala:496)\n\tat org.apache.spark.sql.execution.aggregate.HashAggregateExec.inputRDDs(HashAggregateExec.scala:141)\n\tat org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:746)\n\tat org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:180)\n\tat org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:218)\n\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)\n\tat 
org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:215)\n\tat org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:176)\n\tat org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:132)\n\tat org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:131)\n\tat org.apache.spark.sql.Dataset.javaToPython(Dataset.scala:3510)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\n\tat sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n\tat java.lang.reflect.Method.invoke(Method.java:498)\n\tat py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\n\tat py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\n\tat py4j.Gateway.invoke(Gateway.java:282)\n\tat py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\n\tat py4j.commands.CallCommand.execute(CallCommand.java:79)\n\tat py4j.GatewayConnection.run(GatewayConnection.java:238)\n\tat java.lang.Thread.run(Thread.java:748)\nCaused by: java.lang.IllegalStateException: Cannot call methods on a stopped SparkContext.\nThis stopped SparkContext was created at:\n\norg.apache.spark.sql.SparkSession$Builder.getOrCreate(SparkSession.scala:939)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.zeppelin.spark.BaseSparkScalaInterpreter.spark2CreateContext(BaseSparkScalaInterpreter.scala:299)\norg.apache.zeppelin.spark.BaseSparkScalaInterpreter.createSparkContext(BaseSparkScalaInterpreter.scala:228)\norg.apache.zeppelin.spark.SparkScala212Interpreter.open(SparkScala212Interpreter.scala:88)\norg.apache.zeppelin.spark.SparkInterpreter.open(SparkInterpreter.java:121)\norg.apache.zeppelin.interpreter.LazyOpenInterpreter.open(LazyOpenInterpreter.java:70)\norg.apache.zeppelin.interpreter.Interpreter.getInterpreterInTheSameSessionByClassName(Interpreter.java:322)\norg.apache.zeppelin.interpreter.Interpreter.getInterpreterInTheSameSessionByClassName(Interpreter.java:333)\norg.apache.zeppelin.spark.PySparkInterpreter.open(PySparkInterpreter.java:90)\norg.apache.zeppelin.interpreter.LazyOpenInterpreter.open(LazyOpenInterpreter.java:70)\norg.apache.zeppelin.interpreter.remote.RemoteInterpreterServer$InterpretJob.jobRun(RemoteInterpreterServer.java:833)\norg.apache.zeppelin.interpreter.remote.RemoteInterpreterServer$InterpretJob.jobRun(RemoteInterpreterServer.java:741)\norg.apache.zeppelin.scheduler.Job.run(Job.java:172)\norg.apache.zeppelin.scheduler.AbstractScheduler.runJob(AbstractScheduler.java:132)\norg.apache.zeppelin.scheduler.FIFOScheduler.lambda$runJobInScheduler$0(FIFOScheduler.java:42)\njava.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)\n\nThe currently active SparkContext was created at:\n\n(No active SparkContext.)\n \n\tat org.apache.spark.SparkContext.assertNotStopped(SparkContext.scala:118)\n\tat org.apache.spark.SparkContext.broadcast(SparkContext.scala:1506)\n\tat org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat.buildReaderWithPartitionValues(ParquetFileFormat.scala:231)\n\tat org.apache.spark.sql.execution.FileSourceScanExec.inputRDD$lzycompute(DataSourceScanExec.scala:407)\n\tat 
org.apache.spark.sql.execution.FileSourceScanExec.inputRDD(DataSourceScanExec.scala:398)\n\tat org.apache.spark.sql.execution.FileSourceScanExec.doExecuteColumnar(DataSourceScanExec.scala:497)\n\tat org.apache.spark.sql.execution.SparkPlan.$anonfun$executeColumnar$1(SparkPlan.scala:207)\n\tat org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:218)\n\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)\n\tat org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:215)\n\tat org.apache.spark.sql.execution.SparkPlan.executeColumnar(SparkPlan.scala:203)\n\tat org.apache.spark.sql.execution.InputAdapter.doExecuteColumnar(WholeStageCodegenExec.scala:519)\n\tat org.apache.spark.sql.execution.SparkPlan.$anonfun$executeColumnar$1(SparkPlan.scala:207)\n\tat org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:218)\n\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)\n\tat org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:215)\n\tat org.apache.spark.sql.execution.SparkPlan.executeColumnar(SparkPlan.scala:203)\n\tat org.apache.spark.sql.execution.ColumnarToRowExec.inputRDDs(Columnar.scala:202)\n\tat org.apache.spark.sql.execution.aggregate.HashAggregateExec.inputRDDs(HashAggregateExec.scala:141)\n\tat org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:746)\n\tat org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:180)\n\tat org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:218)\n\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)\n\tat org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:215)\n\tat org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:176)\n\tat org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.inputRDD$lzycompute(ShuffleExchangeExec.scala:118)\n\tat org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.inputRDD(ShuffleExchangeExec.scala:118)\n\tat org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.shuffleDependency$lzycompute(ShuffleExchangeExec.scala:151)\n\tat org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.shuffleDependency(ShuffleExchangeExec.scala:149)\n\tat org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.$anonfun$doExecute$1(ShuffleExchangeExec.scala:166)\n\tat org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:52)\n\t... 
31 more\n\n(, Py4JJavaError('An error occurred while calling o487.javaToPython.\\n', JavaObject id=o492), )"}, 'Library_Validation.json': {'result': 'PASS', 'outputs': {'valid': True}, 'time': {'result': 'FAST', 'elapsed': '15.91', 'expected': '60.00', 'percent': '-73.48', 'start': '2022-06-02T16:36:01.405845', 'finish': '2022-06-02T16:36:17.316469'}, 'logs': ''}}, {'GaiaDMPSetup': {'result': 'ERROR', 'outputs': {'valid': True}, 'time': {'result': 'FAST', 'elapsed': '12.21', 'expected': '45.00', 'percent': '-72.87', 'start': '2022-06-02T16:35:28.968954', 'finish': '2022-06-02T16:35:41.175695'}, 'logs': "Py4JJavaError: An error occurred while calling o538.toLocalIterator.\n: java.lang.IllegalStateException: Cannot call methods on a stopped SparkContext.\nThis stopped SparkContext was created at:\n\norg.apache.spark.sql.SparkSession$Builder.getOrCreate(SparkSession.scala:939)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.zeppelin.spark.BaseSparkScalaInterpreter.spark2CreateContext(BaseSparkScalaInterpreter.scala:299)\norg.apache.zeppelin.spark.BaseSparkScalaInterpreter.createSparkContext(BaseSparkScalaInterpreter.scala:228)\norg.apache.zeppelin.spark.SparkScala212Interpreter.open(SparkScala212Interpreter.scala:88)\norg.apache.zeppelin.spark.SparkInterpreter.open(SparkInterpreter.java:121)\norg.apache.zeppelin.interpreter.LazyOpenInterpreter.open(LazyOpenInterpreter.java:70)\norg.apache.zeppelin.interpreter.Interpreter.getInterpreterInTheSameSessionByClassName(Interpreter.java:322)\norg.apache.zeppelin.interpreter.Interpreter.getInterpreterInTheSameSessionByClassName(Interpreter.java:333)\norg.apache.zeppelin.spark.PySparkInterpreter.open(PySparkInterpreter.java:90)\norg.apache.zeppelin.interpreter.LazyOpenInterpreter.open(LazyOpenInterpreter.java:70)\norg.apache.zeppelin.interpreter.remote.RemoteInterpreterServer$InterpretJob.jobRun(RemoteInterpreterServer.java:833)\norg.apache.zeppelin.interpreter.remote.RemoteInterpreterServer$InterpretJob.jobRun(RemoteInterpreterServer.java:741)\norg.apache.zeppelin.scheduler.Job.run(Job.java:172)\norg.apache.zeppelin.scheduler.AbstractScheduler.runJob(AbstractScheduler.java:132)\norg.apache.zeppelin.scheduler.FIFOScheduler.lambda$runJobInScheduler$0(FIFOScheduler.java:42)\njava.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)\n\nThe currently active SparkContext was created at:\n\n(No active SparkContext.)\n \n\tat org.apache.spark.SparkContext.assertNotStopped(SparkContext.scala:118)\n\tat org.apache.spark.SparkContext.defaultParallelism(SparkContext.scala:2492)\n\tat org.apache.spark.sql.execution.LocalTableScanExec.rdd$lzycompute(LocalTableScanExec.scala:52)\n\tat org.apache.spark.sql.execution.LocalTableScanExec.rdd(LocalTableScanExec.scala:48)\n\tat org.apache.spark.sql.execution.LocalTableScanExec.doExecute(LocalTableScanExec.scala:59)\n\tat org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:180)\n\tat org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:218)\n\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)\n\tat org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:215)\n\tat org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:176)\n\tat 
org.apache.spark.sql.execution.SparkPlan.getByteArrayRdd(SparkPlan.scala:321)\n\tat org.apache.spark.sql.execution.SparkPlan.executeToIterator(SparkPlan.scala:409)\n\tat org.apache.spark.sql.Dataset.$anonfun$toLocalIterator$1(Dataset.scala:2996)\n\tat org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3687)\n\tat org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:103)\n\tat org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)\n\tat org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:90)\n\tat org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)\n\tat org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)\n\tat org.apache.spark.sql.Dataset.withAction(Dataset.scala:3685)\n\tat org.apache.spark.sql.Dataset.toLocalIterator(Dataset.scala:2994)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\n\tat sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n\tat java.lang.reflect.Method.invoke(Method.java:498)\n\tat py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\n\tat py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\n\tat py4j.Gateway.invoke(Gateway.java:282)\n\tat py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\n\tat py4j.commands.CallCommand.execute(CallCommand.java:79)\n\tat py4j.GatewayConnection.run(GatewayConnection.java:238)\n\tat java.lang.Thread.run(Thread.java:748)\n\n(, Py4JJavaError('An error occurred while calling o538.toLocalIterator.\\n', JavaObject id=o539), )"}, 'Mean_proper_motions_over_the_sky': {'result': 'ERROR', 'outputs': {'valid': True}, 'time': {'result': 'FAST', 'elapsed': '10.08', 'expected': '55.00', 'percent': '-81.67', 'start': '2022-06-02T16:35:41.175941', 'finish': '2022-06-02T16:35:51.255030'}, 'logs': "Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.\n: java.lang.IllegalStateException: SparkContext has been shutdown\n\tat org.apache.spark.SparkContext.runJob(SparkContext.scala:2188)\n\tat org.apache.spark.SparkContext.runJob(SparkContext.scala:2217)\n\tat org.apache.spark.SparkContext.runJob(SparkContext.scala:2236)\n\tat org.apache.spark.SparkContext.runJob(SparkContext.scala:2261)\n\tat org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1030)\n\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)\n\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)\n\tat org.apache.spark.rdd.RDD.withScope(RDD.scala:414)\n\tat org.apache.spark.rdd.RDD.collect(RDD.scala:1029)\n\tat org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:180)\n\tat org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\n\tat sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n\tat java.lang.reflect.Method.invoke(Method.java:498)\n\tat py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\n\tat py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\n\tat py4j.Gateway.invoke(Gateway.java:282)\n\tat py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\n\tat 
py4j.commands.CallCommand.execute(CallCommand.java:79)\n\tat py4j.GatewayConnection.run(GatewayConnection.java:238)\n\tat java.lang.Thread.run(Thread.java:748)\n\n(, Py4JJavaError('An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.\\n', JavaObject id=o556), )"}, 'Source_counts_over_the_sky.json': {'result': 'ERROR', 'outputs': {'valid': True}, 'time': {'result': 'FAST', 'elapsed': '13.71', 'expected': '22.00', 'percent': '-37.67', 'start': '2022-06-02T16:35:51.255301', 'finish': '2022-06-02T16:36:04.966840'}, 'logs': "Py4JJavaError: An error occurred while calling o561.javaToPython.\n: org.apache.spark.sql.catalyst.errors.package$TreeNodeException: execute, tree:\nExchange hashpartitioning(FLOOR((cast(source_id#39L as double) / 1.40737488355328E14))#833L, 200), ENSURE_REQUIREMENTS, [id=#281]\n+- *(1) HashAggregate(keys=[FLOOR((cast(source_id#39L as double) / 1.40737488355328E14)) AS FLOOR((cast(source_id#39L as double) / 1.40737488355328E14))#833L], functions=[partial_count(1)], output=[FLOOR((cast(source_id#39L as double) / 1.40737488355328E14))#833L, count#835L])\n +- *(1) ColumnarToRow\n +- FileScan parquet gaiaedr3.gaia_source[source_id#39L] Batched: true, DataFilters: [], Format: Parquet, Location: InMemoryFileIndex[file:/data/gaia/GEDR3/GEDR3_GAIASOURCE], PartitionFilters: [], PushedFilters: [], ReadSchema: struct\n\n\tat org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:56)\n\tat org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.doExecute(ShuffleExchangeExec.scala:163)\n\tat org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:180)\n\tat org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:218)\n\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)\n\tat org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:215)\n\tat org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:176)\n\tat org.apache.spark.sql.execution.InputAdapter.inputRDD(WholeStageCodegenExec.scala:525)\n\tat org.apache.spark.sql.execution.InputRDDCodegen.inputRDDs(WholeStageCodegenExec.scala:453)\n\tat org.apache.spark.sql.execution.InputRDDCodegen.inputRDDs$(WholeStageCodegenExec.scala:452)\n\tat org.apache.spark.sql.execution.InputAdapter.inputRDDs(WholeStageCodegenExec.scala:496)\n\tat org.apache.spark.sql.execution.aggregate.HashAggregateExec.inputRDDs(HashAggregateExec.scala:141)\n\tat org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:746)\n\tat org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:180)\n\tat org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:218)\n\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)\n\tat org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:215)\n\tat org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:176)\n\tat org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:132)\n\tat org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:131)\n\tat org.apache.spark.sql.Dataset.javaToPython(Dataset.scala:3510)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\n\tat sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n\tat java.lang.reflect.Method.invoke(Method.java:498)\n\tat 
py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\n\tat py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\n\tat py4j.Gateway.invoke(Gateway.java:282)\n\tat py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\n\tat py4j.commands.CallCommand.execute(CallCommand.java:79)\n\tat py4j.GatewayConnection.run(GatewayConnection.java:238)\n\tat java.lang.Thread.run(Thread.java:748)\nCaused by: java.lang.IllegalStateException: Cannot call methods on a stopped SparkContext.\nThis stopped SparkContext was created at:\n\norg.apache.spark.sql.SparkSession$Builder.getOrCreate(SparkSession.scala:939)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.zeppelin.spark.BaseSparkScalaInterpreter.spark2CreateContext(BaseSparkScalaInterpreter.scala:299)\norg.apache.zeppelin.spark.BaseSparkScalaInterpreter.createSparkContext(BaseSparkScalaInterpreter.scala:228)\norg.apache.zeppelin.spark.SparkScala212Interpreter.open(SparkScala212Interpreter.scala:88)\norg.apache.zeppelin.spark.SparkInterpreter.open(SparkInterpreter.java:121)\norg.apache.zeppelin.interpreter.LazyOpenInterpreter.open(LazyOpenInterpreter.java:70)\norg.apache.zeppelin.interpreter.Interpreter.getInterpreterInTheSameSessionByClassName(Interpreter.java:322)\norg.apache.zeppelin.interpreter.Interpreter.getInterpreterInTheSameSessionByClassName(Interpreter.java:333)\norg.apache.zeppelin.spark.PySparkInterpreter.open(PySparkInterpreter.java:90)\norg.apache.zeppelin.interpreter.LazyOpenInterpreter.open(LazyOpenInterpreter.java:70)\norg.apache.zeppelin.interpreter.remote.RemoteInterpreterServer$InterpretJob.jobRun(RemoteInterpreterServer.java:833)\norg.apache.zeppelin.interpreter.remote.RemoteInterpreterServer$InterpretJob.jobRun(RemoteInterpreterServer.java:741)\norg.apache.zeppelin.scheduler.Job.run(Job.java:172)\norg.apache.zeppelin.scheduler.AbstractScheduler.runJob(AbstractScheduler.java:132)\norg.apache.zeppelin.scheduler.FIFOScheduler.lambda$runJobInScheduler$0(FIFOScheduler.java:42)\njava.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)\n\nThe currently active SparkContext was created at:\n\n(No active SparkContext.)\n \n\tat org.apache.spark.SparkContext.assertNotStopped(SparkContext.scala:118)\n\tat org.apache.spark.SparkContext.broadcast(SparkContext.scala:1506)\n\tat org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat.buildReaderWithPartitionValues(ParquetFileFormat.scala:231)\n\tat org.apache.spark.sql.execution.FileSourceScanExec.inputRDD$lzycompute(DataSourceScanExec.scala:407)\n\tat org.apache.spark.sql.execution.FileSourceScanExec.inputRDD(DataSourceScanExec.scala:398)\n\tat org.apache.spark.sql.execution.FileSourceScanExec.doExecuteColumnar(DataSourceScanExec.scala:497)\n\tat org.apache.spark.sql.execution.SparkPlan.$anonfun$executeColumnar$1(SparkPlan.scala:207)\n\tat org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:218)\n\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)\n\tat org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:215)\n\tat org.apache.spark.sql.execution.SparkPlan.executeColumnar(SparkPlan.scala:203)\n\tat org.apache.spark.sql.execution.InputAdapter.doExecuteColumnar(WholeStageCodegenExec.scala:519)\n\tat 
org.apache.spark.sql.execution.SparkPlan.$anonfun$executeColumnar$1(SparkPlan.scala:207)\n\tat org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:218)\n\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)\n\tat org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:215)\n\tat org.apache.spark.sql.execution.SparkPlan.executeColumnar(SparkPlan.scala:203)\n\tat org.apache.spark.sql.execution.ColumnarToRowExec.inputRDDs(Columnar.scala:202)\n\tat org.apache.spark.sql.execution.aggregate.HashAggregateExec.inputRDDs(HashAggregateExec.scala:141)\n\tat org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:746)\n\tat org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:180)\n\tat org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:218)\n\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)\n\tat org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:215)\n\tat org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:176)\n\tat org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.inputRDD$lzycompute(ShuffleExchangeExec.scala:118)\n\tat org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.inputRDD(ShuffleExchangeExec.scala:118)\n\tat org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.shuffleDependency$lzycompute(ShuffleExchangeExec.scala:151)\n\tat org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.shuffleDependency(ShuffleExchangeExec.scala:149)\n\tat org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.$anonfun$doExecute$1(ShuffleExchangeExec.scala:166)\n\tat org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:52)\n\t... 31 more\n\n(, Py4JJavaError('An error occurred while calling o561.javaToPython.\\n', JavaObject id=o566), )"}, 'Library_Validation.json': {'result': 'PASS', 'outputs': {'valid': True}, 'time': {'result': 'FAST', 'elapsed': '13.08', 'expected': '60.00', 'percent': '-78.20', 'start': '2022-06-02T16:36:04.967016', 'finish': '2022-06-02T16:36:18.049045'}, 'logs': ''}}, {'GaiaDMPSetup': {'result': 'PASS', 'outputs': {'valid': True}, 'time': {'result': 'FAST', 'elapsed': '8.67', 'expected': '45.00', 'percent': '-80.73', 'start': '2022-06-02T16:35:28.969251', 'finish': '2022-06-02T16:35:37.640126'}, 'logs': ''}, 'Mean_proper_motions_over_the_sky': {'result': 'PASS', 'outputs': {'valid': True}, 'time': {'result': 'FAST', 'elapsed': '12.93', 'expected': '55.00', 'percent': '-76.48', 'start': '2022-06-02T16:35:37.640288', 'finish': '2022-06-02T16:35:50.574353'}, 'logs': ''}, 'Source_counts_over_the_sky.json': {'result': 'PASS', 'outputs': {'valid': True}, 'time': {'result': 'FAST', 'elapsed': '21.99', 'expected': '22.00', 'percent': '-0.03', 'start': '2022-06-02T16:35:50.574665', 'finish': '2022-06-02T16:36:12.567678'}, 'logs': ''}, 'Library_Validation.json': {'result': 'PASS', 'outputs': {'valid': True}, 'time': {'result': 'FAST', 'elapsed': '14.47', 'expected': '60.00', 'percent': '-75.89', 'start': '2022-06-02T16:36:12.568058', 'finish': '2022-06-02T16:36:27.036493'}, 'logs': ''}}]
+    > ....
+
+    #
+    # restart Zeppelin
+    #
+
+    #
+    # Zeppelin user interface tasks seem to work OK.
+    #
+
+    #
+    # Benchmark tests fail with different errors ..
+    #
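+
+    #
+    # The 'stopped SparkContext' errors suggest the interpreter process kept
+    # a dead context across the restart. A lighter-weight thing to try next
+    # time (sketch, untested here): restart just the Spark interpreter via
+    # the Zeppelin REST API rather than bouncing the whole daemon.
+    # The port (8080) and the 'spark' setting id are assumptions - list the
+    # settings first to get the real id.
+    #[root@ansibler]
+
+        curl "http://zeppelin:8080/api/interpreter/setting"
+
+        curl \
+            --request PUT \
+            "http://zeppelin:8080/api/interpreter/setting/restart/spark"
+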
+    > ....
+    > ------------ Test Result: [ERROR] ------------
+    > [{'GaiaDMPSetup': {'result': 'PASS', 'outputs': {'valid': True}, 'time': {'result': 'SLOW', 'elapsed': '46.18', 'expected': '45.00', 'percent': '2.63', 'start': '2022-06-02T16:39:07.998712', 'finish': '2022-06-02T16:39:54.182142'}, 'logs': ''}, 'Mean_proper_motions_over_the_sky': {'result': 'PASS', 'outputs': {'valid': True}, 'time': {'result': 'FAST', 'elapsed': '43.55', 'expected': '55.00', 'percent': '-20.83', 'start': '2022-06-02T16:39:54.182441', 'finish': '2022-06-02T16:40:37.727502'}, 'logs': ''}, 'Source_counts_over_the_sky.json': {'result': 'PASS', 'outputs': {'valid': True}, 'time': {'result': 'FAST', 'elapsed': '20.79', 'expected': '22.00', 'percent': '-5.51', 'start': '2022-06-02T16:40:37.727799', 'finish': '2022-06-02T16:40:58.515407'}, 'logs': ''}, 'Library_Validation.json': {'result': 'PASS', 'outputs': {'valid': True}, 'time': {'result': 'FAST', 'elapsed': '15.63', 'expected': '60.00', 'percent': '-73.96', 'start': '2022-06-02T16:40:58.516020', 'finish': '2022-06-02T16:41:14.142263'}, 'logs': ''}}, {'GaiaDMPSetup': {'result': 'PASS', 'outputs': {'valid': True}, 'time': {'result': 'SLOW', 'elapsed': '60.86', 'expected': '45.00', 'percent': '35.24', 'start': '2022-06-02T16:39:07.998887', 'finish': '2022-06-02T16:40:08.855578'}, 'logs': ''}, 'Mean_proper_motions_over_the_sky': {'result': 'PASS', 'outputs': {'valid': True}, 'time': {'result': 'FAST', 'elapsed': '40.98', 'expected': '55.00', 'percent': '-25.48', 'start': '2022-06-02T16:40:08.855781', 'finish': '2022-06-02T16:40:49.840709'}, 'logs': ''}, 'Source_counts_over_the_sky.json': {'result': 'PASS', 'outputs': {'valid': True}, 'time': {'result': 'SLOW', 'elapsed': '23.11', 'expected': '22.00', 'percent': '5.04', 'start': '2022-06-02T16:40:49.841887', 'finish': '2022-06-02T16:41:12.950354'}, 'logs': ''}, 'Library_Validation.json': {'result': 'PASS', 'outputs': {'valid': True}, 'time': {'result': 'FAST', 'elapsed': '16.19', 'expected': '60.00', 'percent': '-73.01', 'start': '2022-06-02T16:41:12.951192', 'finish': '2022-06-02T16:41:29.143872'}, 'logs': ''}}, {'GaiaDMPSetup': {'result': 'ERROR', 'outputs': {'valid': True}, 'time': {'result': 'FAST', 'elapsed': '11.47', 'expected': '45.00', 'percent': '-74.50', 'start': '2022-06-02T16:39:07.999016', 'finish': '2022-06-02T16:39:19.472115'}, 'logs': 'Unexpected exception: java.util.ConcurrentModificationException\n\tat java.util.HashMap$ValueSpliterator.forEachRemaining(HashMap.java:1633)\n\tat java.util.stream.AbstractPipeline.copyInto(AbstractPipeline.java:482)\n\tat java.util.stream.AbstractPipeline.wrapAndCopyInto(AbstractPipeline.java:472)\n\tat java.util.stream.ReduceOps$ReduceOp.evaluateSequential(ReduceOps.java:708)\n\tat java.util.stream.AbstractPipeline.evaluate(AbstractPipeline.java:234)\n\tat java.util.stream.ReferencePipeline.collect(ReferencePipeline.java:566)\n\tat org.apache.zeppelin.service.JobManagerService.getNoteJobInfoByUnixTime(JobManagerService.java:90)\n\tat org.apache.zeppelin.socket.NotebookServer.broadcastUpdateNoteJobInfo(NotebookServer.java:519)\n\tat org.apache.zeppelin.socket.NotebookServer.onStatusChange(NotebookServer.java:2007)\n\tat org.apache.zeppelin.socket.NotebookServer.onStatusChange(NotebookServer.java:105)\n\tat org.apache.zeppelin.scheduler.Job.setStatus(Job.java:141)\n\tat org.apache.zeppelin.notebook.Paragraph.setStatus(Paragraph.java:398)\n\tat org.apache.zeppelin.notebook.Paragraph.execute(Paragraph.java:349)\n\tat org.apache.zeppelin.notebook.Note.run(Note.java:873)\n\tat 
org.apache.zeppelin.service.NotebookService.runParagraph(NotebookService.java:390)\n\tat org.apache.zeppelin.rest.NotebookRestApi.runParagraph(NotebookRestApi.java:849)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\n\tat sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n\tat java.lang.reflect.Method.invoke(Method.java:498)\n\tat org.glassfish.jersey.server.model.internal.ResourceMethodInvocationHandlerFactory.lambda$static$0(ResourceMethodInvocationHandlerFactory.java:52)\n\tat org.glassfish.jersey.server.model.internal.AbstractJavaResourceMethodDispatcher$1.run(AbstractJavaResourceMethodDispatcher.java:124)\n\tat org.glassfish.jersey.server.model.internal.AbstractJavaResourceMethodDispatcher.invoke(AbstractJavaResourceMethodDispatcher.java:167)\n\tat org.glassfish.jersey.server.model.internal.JavaResourceMethodDispatcherProvider$ResponseOutInvoker.doDispatch(JavaResourceMethodDispatcherProvider.java:176)\n\tat org.glassfish.jersey.server.model.internal.AbstractJavaResourceMethodDispatcher.dispatch(AbstractJavaResourceMethodDispatcher.java:79)\n\tat org.glassfish.jersey.server.model.ResourceMethodInvoker.invoke(ResourceMethodInvoker.java:469)\n\tat org.glassfish.jersey.server.model.ResourceMethodInvoker.apply(ResourceMethodInvoker.java:391)\n\tat org.glassfish.jersey.server.model.ResourceMethodInvoker.apply(ResourceMethodInvoker.java:80)\n\tat org.glassfish.jersey.server.ServerRuntime$1.run(ServerRuntime.java:253)\n\tat org.glassfish.jersey.internal.Errors$1.call(Errors.java:248)\n\tat org.glassfish.jersey.internal.Errors$1.call(Errors.java:244)\n\tat org.glassfish.jersey.internal.Errors.process(Errors.java:292)\n\tat org.glassfish.jersey.internal.Errors.process(Errors.java:274)\n\tat org.glassfish.jersey.internal.Errors.process(Errors.java:244)\n\tat org.glassfish.jersey.process.internal.RequestScope.runInScope(RequestScope.java:265)\n\tat org.glassfish.jersey.server.ServerRuntime.process(ServerRuntime.java:232)\n\tat org.glassfish.jersey.server.ApplicationHandler.handle(ApplicationHandler.java:680)\n\tat org.glassfish.jersey.servlet.WebComponent.serviceImpl(WebComponent.java:394)\n\tat org.glassfish.jersey.servlet.WebComponent.service(WebComponent.java:346)\n\tat org.glassfish.jersey.servlet.ServletContainer.service(ServletContainer.java:366)\n\tat org.glassfish.jersey.servlet.ServletContainer.service(ServletContainer.java:319)\n\tat org.glassfish.jersey.servlet.ServletContainer.service(ServletContainer.java:205)\n\tat org.eclipse.jetty.servlet.ServletHolder.handle(ServletHolder.java:763)\n\tat org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1651)\n\tat org.apache.shiro.web.servlet.ProxiedFilterChain.doFilter(ProxiedFilterChain.java:61)\n\tat org.apache.shiro.web.servlet.AdviceFilter.executeChain(AdviceFilter.java:108)\n\tat org.apache.shiro.web.servlet.AdviceFilter.doFilterInternal(AdviceFilter.java:137)\n\tat org.apache.shiro.web.servlet.OncePerRequestFilter.doFilter(OncePerRequestFilter.java:125)\n\tat org.apache.shiro.web.servlet.ProxiedFilterChain.doFilter(ProxiedFilterChain.java:66)\n\tat org.apache.shiro.web.servlet.AdviceFilter.executeChain(AdviceFilter.java:108)\n\tat org.apache.shiro.web.servlet.AdviceFilter.doFilterInternal(AdviceFilter.java:137)\n\tat org.apache.shiro.web.servlet.OncePerRequestFilter.doFilter(OncePerRequestFilter.java:125)\n\tat 
org.apache.shiro.web.servlet.ProxiedFilterChain.doFilter(ProxiedFilterChain.java:66)\n\tat org.apache.shiro.web.servlet.AbstractShiroFilter.executeChain(AbstractShiroFilter.java:450)\n\tat org.apache.shiro.web.servlet.AbstractShiroFilter$1.call(AbstractShiroFilter.java:365)\n\tat org.apache.shiro.subject.support.SubjectCallable.doCall(SubjectCallable.java:90)\n\tat org.apache.shiro.subject.support.SubjectCallable.call(SubjectCallable.java:83)\n\tat org.apache.shiro.subject.support.DelegatingSubject.execute(DelegatingSubject.java:387)\n\tat org.apache.shiro.web.servlet.AbstractShiroFilter.doFilterInternal(AbstractShiroFilter.java:362)\n\tat org.apache.shiro.web.servlet.OncePerRequestFilter.doFilter(OncePerRequestFilter.java:125)\n\tat org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1638)\n\tat org.apache.zeppelin.server.CorsFilter.doFilter(CorsFilter.java:64)\n\tat org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1638)\n\tat org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:567)\n\tat org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:143)\n\tat org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:602)\n\tat org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:127)\n\tat org.eclipse.jetty.server.handler.ScopedHandler.nextHandle(ScopedHandler.java:235)\n\tat org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:1610)\n\tat org.eclipse.jetty.server.handler.ScopedHandler.nextHandle(ScopedHandler.java:233)\n\tat org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1377)\n\tat org.eclipse.jetty.server.handler.ScopedHandler.nextScope(ScopedHandler.java:188)\n\tat org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:507)\n\tat org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:1580)\n\tat org.eclipse.jetty.server.handler.ScopedHandler.nextScope(ScopedHandler.java:186)\n\tat org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1292)\n\tat org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:141)\n\tat org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:234)\n\tat io.micrometer.core.instrument.binder.jetty.TimedHandler.handle(TimedHandler.java:120)\n\tat org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:127)\n\tat org.eclipse.jetty.server.Server.handle(Server.java:501)\n\tat org.eclipse.jetty.server.HttpChannel.lambda$handle$1(HttpChannel.java:383)\n\tat org.eclipse.jetty.server.HttpChannel.dispatch(HttpChannel.java:556)\n\tat org.eclipse.jetty.server.HttpChannel.handle(HttpChannel.java:375)\n\tat org.eclipse.jetty.server.HttpConnection.onFillable(HttpConnection.java:273)\n\tat org.eclipse.jetty.io.AbstractConnection$ReadCallback.succeeded(AbstractConnection.java:311)\n\tat org.eclipse.jetty.io.FillInterest.fillable(FillInterest.java:105)\n\tat org.eclipse.jetty.io.ChannelEndPoint$1.run(ChannelEndPoint.java:104)\n\tat org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.runTask(EatWhatYouKill.java:336)\n\tat org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.doProduce(EatWhatYouKill.java:313)\n\tat org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.tryProduce(EatWhatYouKill.java:171)\n\tat org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.produce(EatWhatYouKill.java:135)\n\tat 
org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:806)\n\tat org.eclipse.jetty.util.thread.QueuedThreadPool$Runner.run(QueuedThreadPool.java:938)\n\tat java.lang.Thread.run(Thread.java:748)'}, 'Mean_proper_motions_over_the_sky': {'result': 'ERROR', 'outputs': {'valid': True}, 'time': {'result': 'SLOW', 'elapsed': '55.94', 'expected': '55.00', 'percent': '1.71', 'start': '2022-06-02T16:39:19.472382', 'finish': '2022-06-02T16:40:15.410366'}, 'logs': 'Fail to execute line 13: df = spark.sql(query).cache()\nTraceback (most recent call last):\n File "/tmp/1654188010798-0/zeppelin_python.py", line 158, in \n exec(code, _zcUserQueryNameSpace)\n File "", line 13, in \n File "/opt/spark/python/pyspark/sql/session.py", line 723, in sql\n return DataFrame(self._jsparkSession.sql(sqlQuery), self._wrapped)\n File "/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py", line 1305, in __call__\n answer, self.gateway_client, self.target_id, self.name)\n File "/opt/spark/python/pyspark/sql/utils.py", line 117, in deco\n raise converted from None\npyspark.sql.utils.AnalysisException: Table or view not found: gaia_source; line 1 pos 121;\n\'Aggregate [\'hpx_id], [\'floor((\'source_id / 140737488355328)) AS hpx_id#0, count(1) AS n#1L, \'AVG(\'pmra) AS avg_pmra#2, \'AVG(\'pmdec) AS avg_pmdec#3]\n+- \'UnresolvedRelation [gaia_source], [], false'}, 'Source_counts_over_the_sky.json': {'result': 'ERROR', 'outputs': {'valid': True}, 'time': {'result': 'FAST', 'elapsed': '11.41', 'expected': '22.00', 'percent': '-48.14', 'start': '2022-06-02T16:40:15.410599', 'finish': '2022-06-02T16:40:26.819835'}, 'logs': 'Fail to execute line 21: df = spark.sql("SELECT FLOOR(source_id / %d"%(divisor) + ") AS hpx_id, COUNT(*) AS n FROM gaia_source GROUP BY hpx_id")\nTraceback (most recent call last):\n File "/tmp/1654188010798-0/zeppelin_python.py", line 158, in \n exec(code, _zcUserQueryNameSpace)\n File "", line 21, in \n File "/opt/spark/python/pyspark/sql/session.py", line 723, in sql\n return DataFrame(self._jsparkSession.sql(sqlQuery), self._wrapped)\n File "/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py", line 1305, in __call__\n answer, self.gateway_client, self.target_id, self.name)\n File "/opt/spark/python/pyspark/sql/utils.py", line 117, in deco\n raise converted from None\npyspark.sql.utils.AnalysisException: Table or view not found: gaia_source; line 1 pos 72;\n\'Aggregate [\'hpx_id], [\'FLOOR((\'source_id / 140737488355328)) AS hpx_id#5, count(1) AS n#6L]\n+- \'UnresolvedRelation [gaia_source], [], false'}, 'Library_Validation.json': {'result': 'PASS', 'outputs': {'valid': True}, 'time': {'result': 'FAST', 'elapsed': '19.64', 'expected': '60.00', 'percent': '-67.26', 'start': '2022-06-02T16:40:26.820121', 'finish': '2022-06-02T16:40:46.462633'}, 'logs': ''}}, {'GaiaDMPSetup': {'result': 'ERROR', 'outputs': {'valid': True}, 'time': {'result': 'FAST', 'elapsed': '11.48', 'expected': '45.00', 'percent': '-74.50', 'start': '2022-06-02T16:39:07.999045', 'finish': '2022-06-02T16:39:19.475757'}, 'logs': 'Unexpected exception: java.util.ConcurrentModificationException\n\tat java.util.HashMap$ValueSpliterator.forEachRemaining(HashMap.java:1633)\n\tat java.util.stream.AbstractPipeline.copyInto(AbstractPipeline.java:482)\n\tat java.util.stream.AbstractPipeline.wrapAndCopyInto(AbstractPipeline.java:472)\n\tat java.util.stream.ReduceOps$ReduceOp.evaluateSequential(ReduceOps.java:708)\n\tat java.util.stream.AbstractPipeline.evaluate(AbstractPipeline.java:234)\n\tat 
java.util.stream.ReferencePipeline.collect(ReferencePipeline.java:566)\n\tat org.apache.zeppelin.service.JobManagerService.getNoteJobInfoByUnixTime(JobManagerService.java:90)\n\tat org.apache.zeppelin.socket.NotebookServer.broadcastUpdateNoteJobInfo(NotebookServer.java:519)\n\tat org.apache.zeppelin.socket.NotebookServer.onStatusChange(NotebookServer.java:2007)\n\tat org.apache.zeppelin.socket.NotebookServer.onStatusChange(NotebookServer.java:105)\n\tat org.apache.zeppelin.scheduler.Job.setStatus(Job.java:141)\n\tat org.apache.zeppelin.notebook.Paragraph.setStatus(Paragraph.java:398)\n\tat org.apache.zeppelin.notebook.Paragraph.execute(Paragraph.java:349)\n\tat org.apache.zeppelin.notebook.Note.run(Note.java:873)\n\tat org.apache.zeppelin.service.NotebookService.runParagraph(NotebookService.java:390)\n\tat org.apache.zeppelin.rest.NotebookRestApi.runParagraph(NotebookRestApi.java:849)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\n\tat sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n\tat java.lang.reflect.Method.invoke(Method.java:498)\n\tat org.glassfish.jersey.server.model.internal.ResourceMethodInvocationHandlerFactory.lambda$static$0(ResourceMethodInvocationHandlerFactory.java:52)\n\tat org.glassfish.jersey.server.model.internal.AbstractJavaResourceMethodDispatcher$1.run(AbstractJavaResourceMethodDispatcher.java:124)\n\tat org.glassfish.jersey.server.model.internal.AbstractJavaResourceMethodDispatcher.invoke(AbstractJavaResourceMethodDispatcher.java:167)\n\tat org.glassfish.jersey.server.model.internal.JavaResourceMethodDispatcherProvider$ResponseOutInvoker.doDispatch(JavaResourceMethodDispatcherProvider.java:176)\n\tat org.glassfish.jersey.server.model.internal.AbstractJavaResourceMethodDispatcher.dispatch(AbstractJavaResourceMethodDispatcher.java:79)\n\tat org.glassfish.jersey.server.model.ResourceMethodInvoker.invoke(ResourceMethodInvoker.java:469)\n\tat org.glassfish.jersey.server.model.ResourceMethodInvoker.apply(ResourceMethodInvoker.java:391)\n\tat org.glassfish.jersey.server.model.ResourceMethodInvoker.apply(ResourceMethodInvoker.java:80)\n\tat org.glassfish.jersey.server.ServerRuntime$1.run(ServerRuntime.java:253)\n\tat org.glassfish.jersey.internal.Errors$1.call(Errors.java:248)\n\tat org.glassfish.jersey.internal.Errors$1.call(Errors.java:244)\n\tat org.glassfish.jersey.internal.Errors.process(Errors.java:292)\n\tat org.glassfish.jersey.internal.Errors.process(Errors.java:274)\n\tat org.glassfish.jersey.internal.Errors.process(Errors.java:244)\n\tat org.glassfish.jersey.process.internal.RequestScope.runInScope(RequestScope.java:265)\n\tat org.glassfish.jersey.server.ServerRuntime.process(ServerRuntime.java:232)\n\tat org.glassfish.jersey.server.ApplicationHandler.handle(ApplicationHandler.java:680)\n\tat org.glassfish.jersey.servlet.WebComponent.serviceImpl(WebComponent.java:394)\n\tat org.glassfish.jersey.servlet.WebComponent.service(WebComponent.java:346)\n\tat org.glassfish.jersey.servlet.ServletContainer.service(ServletContainer.java:366)\n\tat org.glassfish.jersey.servlet.ServletContainer.service(ServletContainer.java:319)\n\tat org.glassfish.jersey.servlet.ServletContainer.service(ServletContainer.java:205)\n\tat org.eclipse.jetty.servlet.ServletHolder.handle(ServletHolder.java:763)\n\tat org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1651)\n\tat 
org.apache.shiro.web.servlet.ProxiedFilterChain.doFilter(ProxiedFilterChain.java:61)\n\tat org.apache.shiro.web.servlet.AdviceFilter.executeChain(AdviceFilter.java:108)\n\tat org.apache.shiro.web.servlet.AdviceFilter.doFilterInternal(AdviceFilter.java:137)\n\tat org.apache.shiro.web.servlet.OncePerRequestFilter.doFilter(OncePerRequestFilter.java:125)\n\tat org.apache.shiro.web.servlet.ProxiedFilterChain.doFilter(ProxiedFilterChain.java:66)\n\tat org.apache.shiro.web.servlet.AdviceFilter.executeChain(AdviceFilter.java:108)\n\tat org.apache.shiro.web.servlet.AdviceFilter.doFilterInternal(AdviceFilter.java:137)\n\tat org.apache.shiro.web.servlet.OncePerRequestFilter.doFilter(OncePerRequestFilter.java:125)\n\tat org.apache.shiro.web.servlet.ProxiedFilterChain.doFilter(ProxiedFilterChain.java:66)\n\tat org.apache.shiro.web.servlet.AbstractShiroFilter.executeChain(AbstractShiroFilter.java:450)\n\tat org.apache.shiro.web.servlet.AbstractShiroFilter$1.call(AbstractShiroFilter.java:365)\n\tat org.apache.shiro.subject.support.SubjectCallable.doCall(SubjectCallable.java:90)\n\tat org.apache.shiro.subject.support.SubjectCallable.call(SubjectCallable.java:83)\n\tat org.apache.shiro.subject.support.DelegatingSubject.execute(DelegatingSubject.java:387)\n\tat org.apache.shiro.web.servlet.AbstractShiroFilter.doFilterInternal(AbstractShiroFilter.java:362)\n\tat org.apache.shiro.web.servlet.OncePerRequestFilter.doFilter(OncePerRequestFilter.java:125)\n\tat org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1638)\n\tat org.apache.zeppelin.server.CorsFilter.doFilter(CorsFilter.java:64)\n\tat org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1638)\n\tat org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:567)\n\tat org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:143)\n\tat org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:602)\n\tat org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:127)\n\tat org.eclipse.jetty.server.handler.ScopedHandler.nextHandle(ScopedHandler.java:235)\n\tat org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:1610)\n\tat org.eclipse.jetty.server.handler.ScopedHandler.nextHandle(ScopedHandler.java:233)\n\tat org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1377)\n\tat org.eclipse.jetty.server.handler.ScopedHandler.nextScope(ScopedHandler.java:188)\n\tat org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:507)\n\tat org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:1580)\n\tat org.eclipse.jetty.server.handler.ScopedHandler.nextScope(ScopedHandler.java:186)\n\tat org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1292)\n\tat org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:141)\n\tat org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:234)\n\tat io.micrometer.core.instrument.binder.jetty.TimedHandler.handle(TimedHandler.java:120)\n\tat org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:127)\n\tat org.eclipse.jetty.server.Server.handle(Server.java:501)\n\tat org.eclipse.jetty.server.HttpChannel.lambda$handle$1(HttpChannel.java:383)\n\tat org.eclipse.jetty.server.HttpChannel.dispatch(HttpChannel.java:556)\n\tat org.eclipse.jetty.server.HttpChannel.handle(HttpChannel.java:375)\n\tat 
org.eclipse.jetty.server.HttpConnection.onFillable(HttpConnection.java:273)\n\tat org.eclipse.jetty.io.AbstractConnection$ReadCallback.succeeded(AbstractConnection.java:311)\n\tat org.eclipse.jetty.io.FillInterest.fillable(FillInterest.java:105)\n\tat org.eclipse.jetty.io.ChannelEndPoint$1.run(ChannelEndPoint.java:104)\n\tat org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.runTask(EatWhatYouKill.java:336)\n\tat org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.doProduce(EatWhatYouKill.java:313)\n\tat org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.tryProduce(EatWhatYouKill.java:171)\n\tat org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.produce(EatWhatYouKill.java:135)\n\tat org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:806)\n\tat org.eclipse.jetty.util.thread.QueuedThreadPool$Runner.run(QueuedThreadPool.java:938)\n\tat java.lang.Thread.run(Thread.java:748)'}, 'Mean_proper_motions_over_the_sky': {'result': 'ERROR', 'outputs': {'valid': True}, 'time': {'result': 'FAST', 'elapsed': '53.77', 'expected': '55.00', 'percent': '-2.24', 'start': '2022-06-02T16:39:19.475890', 'finish': '2022-06-02T16:40:13.241193'}, 'logs': 'Fail to execute line 13: df = spark.sql(query).cache()\nTraceback (most recent call last):\n File "/tmp/1654188008780-0/zeppelin_python.py", line 158, in \n exec(code, _zcUserQueryNameSpace)\n File "", line 13, in \n File "/opt/spark/python/pyspark/sql/session.py", line 723, in sql\n return DataFrame(self._jsparkSession.sql(sqlQuery), self._wrapped)\n File "/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py", line 1305, in __call__\n answer, self.gateway_client, self.target_id, self.name)\n File "/opt/spark/python/pyspark/sql/utils.py", line 117, in deco\n raise converted from None\npyspark.sql.utils.AnalysisException: Table or view not found: gaia_source; line 1 pos 121;\n\'Aggregate [\'hpx_id], [\'floor((\'source_id / 140737488355328)) AS hpx_id#0, count(1) AS n#1L, \'AVG(\'pmra) AS avg_pmra#2, \'AVG(\'pmdec) AS avg_pmdec#3]\n+- \'UnresolvedRelation [gaia_source], [], false'}, 'Source_counts_over_the_sky.json': {'result': 'ERROR', 'outputs': {'valid': True}, 'time': {'result': 'FAST', 'elapsed': '15.53', 'expected': '22.00', 'percent': '-29.40', 'start': '2022-06-02T16:40:13.241490', 'finish': '2022-06-02T16:40:28.773163'}, 'logs': 'Fail to execute line 21: df = spark.sql("SELECT FLOOR(source_id / %d"%(divisor) + ") AS hpx_id, COUNT(*) AS n FROM gaia_source GROUP BY hpx_id")\nTraceback (most recent call last):\n File "/tmp/1654188008780-0/zeppelin_python.py", line 158, in \n exec(code, _zcUserQueryNameSpace)\n File "", line 21, in \n File "/opt/spark/python/pyspark/sql/session.py", line 723, in sql\n return DataFrame(self._jsparkSession.sql(sqlQuery), self._wrapped)\n File "/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py", line 1305, in __call__\n answer, self.gateway_client, self.target_id, self.name)\n File "/opt/spark/python/pyspark/sql/utils.py", line 117, in deco\n raise converted from None\npyspark.sql.utils.AnalysisException: Table or view not found: gaia_source; line 1 pos 72;\n\'Aggregate [\'hpx_id], [\'FLOOR((\'source_id / 140737488355328)) AS hpx_id#5, count(1) AS n#6L]\n+- \'UnresolvedRelation [gaia_source], [], false'}, 'Library_Validation.json': {'result': 'PASS', 'outputs': {'valid': True}, 'time': {'result': 'FAST', 'elapsed': '18.72', 'expected': '60.00', 'percent': '-68.80', 'start': '2022-06-02T16:40:28.773333', 'finish': '2022-06-02T16:40:47.495359'}, 'logs': ''}}]
+    > ....
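+
+    #
+    # Two of the concurrent sessions fail with 'Table or view not found:
+    # gaia_source', which looks like a lost database context rather than
+    # missing data. A quick sanity check for next time (sketch, assuming
+    # the spark-sql CLI is on the path on the zeppelin node):
+    #[root@ansibler]
+
+        ssh zeppelin \
+            '
+            spark-sql -e "SHOW TABLES IN gaiaedr3"
+            '
+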
+
+
+    > ....
+    > ------------ Test Result: [ERROR] ------------
+    > [{'GaiaDMPSetup': {'result': 'PASS', 'outputs': {'valid': True}, 'time': {'result': 'FAST', 'elapsed': '12.54', 'expected': '45.00', 'percent': '-72.12', 'start': '2022-06-02T17:01:13.039052', 'finish': '2022-06-02T17:01:25.583576'}, 'logs': ''}, 'Mean_proper_motions_over_the_sky': {'result': 'PASS', 'outputs': {'valid': True}, 'time': {'result': 'FAST', 'elapsed': '14.17', 'expected': '55.00', 'percent': '-74.24', 'start': '2022-06-02T17:01:25.583700', 'finish': '2022-06-02T17:01:39.749606'}, 'logs': ''}, 'Source_counts_over_the_sky.json': {'result': 'PASS', 'outputs': {'valid': True}, 'time': {'result': 'FAST', 'elapsed': '21.24', 'expected': '22.00', 'percent': '-3.46', 'start': '2022-06-02T17:01:39.749986', 'finish': '2022-06-02T17:02:00.988141'}, 'logs': ''}, 'Library_Validation.json': {'result': 'PASS', 'outputs': {'valid': True}, 'time': {'result': 'FAST', 'elapsed': '13.74', 'expected': '60.00', 'percent': '-77.10', 'start': '2022-06-02T17:02:00.988895', 'finish': '2022-06-02T17:02:14.730454'}, 'logs': ''}}, {'GaiaDMPSetup': {'result': 'PASS', 'outputs': {'valid': True}, 'time': {'result': 'FAST', 'elapsed': '12.43', 'expected': '45.00', 'percent': '-72.37', 'start': '2022-06-02T17:01:13.040544', 'finish': '2022-06-02T17:01:25.472640'}, 'logs': ''}, 'Mean_proper_motions_over_the_sky': {'result': 'PASS', 'outputs': {'valid': True}, 'time': {'result': 'FAST', 'elapsed': '12.40', 'expected': '55.00', 'percent': '-77.46', 'start': '2022-06-02T17:01:25.472924', 'finish': '2022-06-02T17:01:37.869753'}, 'logs': ''}, 'Source_counts_over_the_sky.json': {'result': 'PASS', 'outputs': {'valid': True}, 'time': {'result': 'SLOW', 'elapsed': '30.66', 'expected': '22.00', 'percent': '39.35', 'start': '2022-06-02T17:01:37.870181', 'finish': '2022-06-02T17:02:08.528274'}, 'logs': ''}, 'Library_Validation.json': {'result': 'PASS', 'outputs': {'valid': True}, 'time': {'result': 'FAST', 'elapsed': '14.72', 'expected': '60.00', 'percent': '-75.47', 'start': '2022-06-02T17:02:08.528688', 'finish': '2022-06-02T17:02:23.246514'}, 'logs': ''}}, {'GaiaDMPSetup': {'result': 'PASS', 'outputs': {'valid': True}, 'time': {'result': 'FAST', 'elapsed': '9.89', 'expected': '45.00', 'percent': '-78.03', 'start': '2022-06-02T17:01:13.041417', 'finish': '2022-06-02T17:01:22.928968'}, 'logs': ''}, 'Mean_proper_motions_over_the_sky': {'result': 'PASS', 'outputs': {'valid': True}, 'time': {'result': 'FAST', 'elapsed': '12.16', 'expected': '55.00', 'percent': '-77.88', 'start': '2022-06-02T17:01:22.929136', 'finish': '2022-06-02T17:01:35.093425'}, 'logs': ''}, 'Source_counts_over_the_sky.json': {'result': 'PASS', 'outputs': {'valid': True}, 'time': {'result': 'SLOW', 'elapsed': '34.88', 'expected': '22.00', 'percent': '58.53', 'start': '2022-06-02T17:01:35.093743', 'finish': '2022-06-02T17:02:09.970958'}, 'logs': ''}, 'Library_Validation.json': {'result': 'ERROR', 'outputs': {'valid': True}, 'time': {'result': 'FAST', 'elapsed': '9.38', 'expected': '60.00', 'percent': '-84.37', 'start': '2022-06-02T17:02:09.971602', 'finish': '2022-06-02T17:02:19.347475'}, 'logs': 'Unexpected exception: java.util.ConcurrentModificationException\n\tat java.util.HashMap$ValueSpliterator.forEachRemaining(HashMap.java:1633)\n\tat java.util.stream.AbstractPipeline.copyInto(AbstractPipeline.java:482)\n\tat java.util.stream.AbstractPipeline.wrapAndCopyInto(AbstractPipeline.java:472)\n\tat java.util.stream.ReduceOps$ReduceOp.evaluateSequential(ReduceOps.java:708)\n\tat 
java.util.stream.AbstractPipeline.evaluate(AbstractPipeline.java:234)\n\tat java.util.stream.ReferencePipeline.collect(ReferencePipeline.java:566)\n\tat org.apache.zeppelin.service.JobManagerService.getNoteJobInfoByUnixTime(JobManagerService.java:90)\n\tat org.apache.zeppelin.socket.NotebookServer.broadcastUpdateNoteJobInfo(NotebookServer.java:519)\n\tat org.apache.zeppelin.socket.NotebookServer.onStatusChange(NotebookServer.java:2007)\n\tat org.apache.zeppelin.socket.NotebookServer.onStatusChange(NotebookServer.java:105)\n\tat org.apache.zeppelin.scheduler.Job.setStatus(Job.java:141)\n\tat org.apache.zeppelin.notebook.Paragraph.setStatus(Paragraph.java:398)\n\tat org.apache.zeppelin.notebook.Paragraph.execute(Paragraph.java:349)\n\tat org.apache.zeppelin.notebook.Note.run(Note.java:873)\n\tat org.apache.zeppelin.service.NotebookService.runParagraph(NotebookService.java:390)\n\tat org.apache.zeppelin.rest.NotebookRestApi.runParagraph(NotebookRestApi.java:849)\n\tat sun.reflect.GeneratedMethodAccessor13.invoke(Unknown Source)\n\tat sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n\tat java.lang.reflect.Method.invoke(Method.java:498)\n\tat org.glassfish.jersey.server.model.internal.ResourceMethodInvocationHandlerFactory.lambda$static$0(ResourceMethodInvocationHandlerFactory.java:52)\n\tat org.glassfish.jersey.server.model.internal.AbstractJavaResourceMethodDispatcher$1.run(AbstractJavaResourceMethodDispatcher.java:124)\n\tat org.glassfish.jersey.server.model.internal.AbstractJavaResourceMethodDispatcher.invoke(AbstractJavaResourceMethodDispatcher.java:167)\n\tat org.glassfish.jersey.server.model.internal.JavaResourceMethodDispatcherProvider$ResponseOutInvoker.doDispatch(JavaResourceMethodDispatcherProvider.java:176)\n\tat org.glassfish.jersey.server.model.internal.AbstractJavaResourceMethodDispatcher.dispatch(AbstractJavaResourceMethodDispatcher.java:79)\n\tat org.glassfish.jersey.server.model.ResourceMethodInvoker.invoke(ResourceMethodInvoker.java:469)\n\tat org.glassfish.jersey.server.model.ResourceMethodInvoker.apply(ResourceMethodInvoker.java:391)\n\tat org.glassfish.jersey.server.model.ResourceMethodInvoker.apply(ResourceMethodInvoker.java:80)\n\tat org.glassfish.jersey.server.ServerRuntime$1.run(ServerRuntime.java:253)\n\tat org.glassfish.jersey.internal.Errors$1.call(Errors.java:248)\n\tat org.glassfish.jersey.internal.Errors$1.call(Errors.java:244)\n\tat org.glassfish.jersey.internal.Errors.process(Errors.java:292)\n\tat org.glassfish.jersey.internal.Errors.process(Errors.java:274)\n\tat org.glassfish.jersey.internal.Errors.process(Errors.java:244)\n\tat org.glassfish.jersey.process.internal.RequestScope.runInScope(RequestScope.java:265)\n\tat org.glassfish.jersey.server.ServerRuntime.process(ServerRuntime.java:232)\n\tat org.glassfish.jersey.server.ApplicationHandler.handle(ApplicationHandler.java:680)\n\tat org.glassfish.jersey.servlet.WebComponent.serviceImpl(WebComponent.java:394)\n\tat org.glassfish.jersey.servlet.WebComponent.service(WebComponent.java:346)\n\tat org.glassfish.jersey.servlet.ServletContainer.service(ServletContainer.java:366)\n\tat org.glassfish.jersey.servlet.ServletContainer.service(ServletContainer.java:319)\n\tat org.glassfish.jersey.servlet.ServletContainer.service(ServletContainer.java:205)\n\tat org.eclipse.jetty.servlet.ServletHolder.handle(ServletHolder.java:763)\n\tat org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1651)\n\tat 
org.apache.shiro.web.servlet.ProxiedFilterChain.doFilter(ProxiedFilterChain.java:61)\n\tat org.apache.shiro.web.servlet.AdviceFilter.executeChain(AdviceFilter.java:108)\n\tat org.apache.shiro.web.servlet.AdviceFilter.doFilterInternal(AdviceFilter.java:137)\n\tat org.apache.shiro.web.servlet.OncePerRequestFilter.doFilter(OncePerRequestFilter.java:125)\n\tat org.apache.shiro.web.servlet.ProxiedFilterChain.doFilter(ProxiedFilterChain.java:66)\n\tat org.apache.shiro.web.servlet.AdviceFilter.executeChain(AdviceFilter.java:108)\n\tat org.apache.shiro.web.servlet.AdviceFilter.doFilterInternal(AdviceFilter.java:137)\n\tat org.apache.shiro.web.servlet.OncePerRequestFilter.doFilter(OncePerRequestFilter.java:125)\n\tat org.apache.shiro.web.servlet.ProxiedFilterChain.doFilter(ProxiedFilterChain.java:66)\n\tat org.apache.shiro.web.servlet.AbstractShiroFilter.executeChain(AbstractShiroFilter.java:450)\n\tat org.apache.shiro.web.servlet.AbstractShiroFilter$1.call(AbstractShiroFilter.java:365)\n\tat org.apache.shiro.subject.support.SubjectCallable.doCall(SubjectCallable.java:90)\n\tat org.apache.shiro.subject.support.SubjectCallable.call(SubjectCallable.java:83)\n\tat org.apache.shiro.subject.support.DelegatingSubject.execute(DelegatingSubject.java:387)\n\tat org.apache.shiro.web.servlet.AbstractShiroFilter.doFilterInternal(AbstractShiroFilter.java:362)\n\tat org.apache.shiro.web.servlet.OncePerRequestFilter.doFilter(OncePerRequestFilter.java:125)\n\tat org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1638)\n\tat org.apache.zeppelin.server.CorsFilter.doFilter(CorsFilter.java:64)\n\tat org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1638)\n\tat org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:567)\n\tat org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:143)\n\tat org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:602)\n\tat org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:127)\n\tat org.eclipse.jetty.server.handler.ScopedHandler.nextHandle(ScopedHandler.java:235)\n\tat org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:1610)\n\tat org.eclipse.jetty.server.handler.ScopedHandler.nextHandle(ScopedHandler.java:233)\n\tat org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1377)\n\tat org.eclipse.jetty.server.handler.ScopedHandler.nextScope(ScopedHandler.java:188)\n\tat org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:507)\n\tat org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:1580)\n\tat org.eclipse.jetty.server.handler.ScopedHandler.nextScope(ScopedHandler.java:186)\n\tat org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1292)\n\tat org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:141)\n\tat org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:234)\n\tat io.micrometer.core.instrument.binder.jetty.TimedHandler.handle(TimedHandler.java:120)\n\tat org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:127)\n\tat org.eclipse.jetty.server.Server.handle(Server.java:501)\n\tat org.eclipse.jetty.server.HttpChannel.lambda$handle$1(HttpChannel.java:383)\n\tat org.eclipse.jetty.server.HttpChannel.dispatch(HttpChannel.java:556)\n\tat org.eclipse.jetty.server.HttpChannel.handle(HttpChannel.java:375)\n\tat 
org.eclipse.jetty.server.HttpConnection.onFillable(HttpConnection.java:273)\n\tat org.eclipse.jetty.io.AbstractConnection$ReadCallback.succeeded(AbstractConnection.java:311)\n\tat org.eclipse.jetty.io.FillInterest.fillable(FillInterest.java:105)\n\tat org.eclipse.jetty.io.ChannelEndPoint$1.run(ChannelEndPoint.java:104)\n\tat org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.runTask(EatWhatYouKill.java:336)\n\tat org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.doProduce(EatWhatYouKill.java:313)\n\tat org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.tryProduce(EatWhatYouKill.java:171)\n\tat org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.run(EatWhatYouKill.java:129)\n\tat org.eclipse.jetty.util.thread.ReservedThreadExecutor$ReservedThread.run(ReservedThreadExecutor.java:375)\n\tat org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:806)\n\tat org.eclipse.jetty.util.thread.QueuedThreadPool$Runner.run(QueuedThreadPool.java:938)\n\tat java.lang.Thread.run(Thread.java:748)'}}, {'GaiaDMPSetup': {'result': 'PASS', 'outputs': {'valid': True}, 'time': {'result': 'FAST', 'elapsed': '13.63', 'expected': '45.00', 'percent': '-69.71', 'start': '2022-06-02T17:01:13.041688', 'finish': '2022-06-02T17:01:26.672366'}, 'logs': ''}, 'Mean_proper_motions_over_the_sky': {'result': 'PASS', 'outputs': {'valid': True}, 'time': {'result': 'FAST', 'elapsed': '13.92', 'expected': '55.00', 'percent': '-74.69', 'start': '2022-06-02T17:01:26.672652', 'finish': '2022-06-02T17:01:40.593446'}, 'logs': ''}, 'Source_counts_over_the_sky.json': {'result': 'PASS', 'outputs': {'valid': True}, 'time': {'result': 'SLOW', 'elapsed': '34.83', 'expected': '22.00', 'percent': '58.30', 'start': '2022-06-02T17:01:40.593777', 'finish': '2022-06-02T17:02:15.420529'}, 'logs': ''}, 'Library_Validation.json': {'result': 'PASS', 'outputs': {'valid': True}, 'time': {'result': 'FAST', 'elapsed': '11.82', 'expected': '60.00', 'percent': '-80.30', 'start': '2022-06-02T17:02:15.421003', 'finish': '2022-06-02T17:02:27.240596'}, 'logs': ''}}] + > .... + + # + # Restarting Zeppelin and Hadoop doesn't make everything better. + # + + # + # Rebooting the machines ? + # + +# ----------------------------------------------------- +# Reboot the machines ... +#[root@ansibler] + + ssh worker01 \ + ' + sudo reboot + ' + + ssh worker02 \ + ' + sudo reboot + ' + + ssh worker03 \ + ' + sudo reboot + ' + + ssh worker04 \ + ' + sudo reboot + ' + + ssh worker05 \ + ' + sudo reboot + ' + + ssh worker06 \ + ' + sudo reboot + ' + + ssh master01 \ + ' + sudo reboot + ' + + ssh zeppelin \ + ' + sudo reboot + ' + + ssh master01 \ + ' + start-dfs.sh + start-yarn.sh + ' + + ssh zeppelin \ + ' + zeppelin-daemon.sh start + ' + + # + # Still failing ... + # + + > .... + > Test completed! 
(196.54 seconds) + > ------------ Test Result: [FAIL] ------------ + > [{'GaiaDMPSetup': {'result': 'ERROR', 'outputs': {'valid': True}, 'time': {'result': 'FAST', 'elapsed': '15.75', 'expected': '45.00', 'percent': '-65.01', 'start': '2022-06-02T17:15:07.611831', 'finish': '2022-06-02T17:15:23.357102'}, 'logs': 'Unexpected exception: java.util.ConcurrentModificationException\n\tat java.util.HashMap$ValueSpliterator.forEachRemaining(HashMap.java:1633)\n\tat java.util.stream.AbstractPipeline.copyInto(AbstractPipeline.java:482)\n\tat java.util.stream.AbstractPipeline.wrapAndCopyInto(AbstractPipeline.java:472)\n\tat java.util.stream.ReduceOps$ReduceOp.evaluateSequential(ReduceOps.java:708)\n\tat java.util.stream.AbstractPipeline.evaluate(AbstractPipeline.java:234)\n\tat java.util.stream.ReferencePipeline.collect(ReferencePipeline.java:566)\n\tat org.apache.zeppelin.service.JobManagerService.getNoteJobInfoByUnixTime(JobManagerService.java:90)\n\tat org.apache.zeppelin.socket.NotebookServer.broadcastUpdateNoteJobInfo(NotebookServer.java:519)\n\tat org.apache.zeppelin.socket.NotebookServer.onStatusChange(NotebookServer.java:2007)\n\tat org.apache.zeppelin.socket.NotebookServer.onStatusChange(NotebookServer.java:105)\n\tat org.apache.zeppelin.scheduler.Job.setStatus(Job.java:141)\n\tat org.apache.zeppelin.notebook.Paragraph.setStatus(Paragraph.java:398)\n\tat org.apache.zeppelin.notebook.Paragraph.execute(Paragraph.java:349)\n\tat org.apache.zeppelin.notebook.Note.run(Note.java:873)\n\tat org.apache.zeppelin.service.NotebookService.runParagraph(NotebookService.java:390)\n\tat org.apache.zeppelin.rest.NotebookRestApi.runParagraph(NotebookRestApi.java:849)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\n\tat sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n\tat java.lang.reflect.Method.invoke(Method.java:498)\n\tat org.glassfish.jersey.server.model.internal.ResourceMethodInvocationHandlerFactory.lambda$static$0(ResourceMethodInvocationHandlerFactory.java:52)\n\tat org.glassfish.jersey.server.model.internal.AbstractJavaResourceMethodDispatcher$1.run(AbstractJavaResourceMethodDispatcher.java:124)\n\tat org.glassfish.jersey.server.model.internal.AbstractJavaResourceMethodDispatcher.invoke(AbstractJavaResourceMethodDispatcher.java:167)\n\tat org.glassfish.jersey.server.model.internal.JavaResourceMethodDispatcherProvider$ResponseOutInvoker.doDispatch(JavaResourceMethodDispatcherProvider.java:176)\n\tat org.glassfish.jersey.server.model.internal.AbstractJavaResourceMethodDispatcher.dispatch(AbstractJavaResourceMethodDispatcher.java:79)\n\tat org.glassfish.jersey.server.model.ResourceMethodInvoker.invoke(ResourceMethodInvoker.java:469)\n\tat org.glassfish.jersey.server.model.ResourceMethodInvoker.apply(ResourceMethodInvoker.java:391)\n\tat org.glassfish.jersey.server.model.ResourceMethodInvoker.apply(ResourceMethodInvoker.java:80)\n\tat org.glassfish.jersey.server.ServerRuntime$1.run(ServerRuntime.java:253)\n\tat org.glassfish.jersey.internal.Errors$1.call(Errors.java:248)\n\tat org.glassfish.jersey.internal.Errors$1.call(Errors.java:244)\n\tat org.glassfish.jersey.internal.Errors.process(Errors.java:292)\n\tat org.glassfish.jersey.internal.Errors.process(Errors.java:274)\n\tat org.glassfish.jersey.internal.Errors.process(Errors.java:244)\n\tat org.glassfish.jersey.process.internal.RequestScope.runInScope(RequestScope.java:265)\n\tat 
org.glassfish.jersey.server.ServerRuntime.process(ServerRuntime.java:232)\n\tat org.glassfish.jersey.server.ApplicationHandler.handle(ApplicationHandler.java:680)\n\tat org.glassfish.jersey.servlet.WebComponent.serviceImpl(WebComponent.java:394)\n\tat org.glassfish.jersey.servlet.WebComponent.service(WebComponent.java:346)\n\tat org.glassfish.jersey.servlet.ServletContainer.service(ServletContainer.java:366)\n\tat org.glassfish.jersey.servlet.ServletContainer.service(ServletContainer.java:319)\n\tat org.glassfish.jersey.servlet.ServletContainer.service(ServletContainer.java:205)\n\tat org.eclipse.jetty.servlet.ServletHolder.handle(ServletHolder.java:763)\n\tat org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1651)\n\tat org.apache.shiro.web.servlet.ProxiedFilterChain.doFilter(ProxiedFilterChain.java:61)\n\tat org.apache.shiro.web.servlet.AdviceFilter.executeChain(AdviceFilter.java:108)\n\tat org.apache.shiro.web.servlet.AdviceFilter.doFilterInternal(AdviceFilter.java:137)\n\tat org.apache.shiro.web.servlet.OncePerRequestFilter.doFilter(OncePerRequestFilter.java:125)\n\tat org.apache.shiro.web.servlet.ProxiedFilterChain.doFilter(ProxiedFilterChain.java:66)\n\tat org.apache.shiro.web.servlet.AdviceFilter.executeChain(AdviceFilter.java:108)\n\tat org.apache.shiro.web.servlet.AdviceFilter.doFilterInternal(AdviceFilter.java:137)\n\tat org.apache.shiro.web.servlet.OncePerRequestFilter.doFilter(OncePerRequestFilter.java:125)\n\tat org.apache.shiro.web.servlet.ProxiedFilterChain.doFilter(ProxiedFilterChain.java:66)\n\tat org.apache.shiro.web.servlet.AbstractShiroFilter.executeChain(AbstractShiroFilter.java:450)\n\tat org.apache.shiro.web.servlet.AbstractShiroFilter$1.call(AbstractShiroFilter.java:365)\n\tat org.apache.shiro.subject.support.SubjectCallable.doCall(SubjectCallable.java:90)\n\tat org.apache.shiro.subject.support.SubjectCallable.call(SubjectCallable.java:83)\n\tat org.apache.shiro.subject.support.DelegatingSubject.execute(DelegatingSubject.java:387)\n\tat org.apache.shiro.web.servlet.AbstractShiroFilter.doFilterInternal(AbstractShiroFilter.java:362)\n\tat org.apache.shiro.web.servlet.OncePerRequestFilter.doFilter(OncePerRequestFilter.java:125)\n\tat org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1638)\n\tat org.apache.zeppelin.server.CorsFilter.doFilter(CorsFilter.java:64)\n\tat org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1638)\n\tat org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:567)\n\tat org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:143)\n\tat org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:602)\n\tat org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:127)\n\tat org.eclipse.jetty.server.handler.ScopedHandler.nextHandle(ScopedHandler.java:235)\n\tat org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:1610)\n\tat org.eclipse.jetty.server.handler.ScopedHandler.nextHandle(ScopedHandler.java:233)\n\tat org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1377)\n\tat org.eclipse.jetty.server.handler.ScopedHandler.nextScope(ScopedHandler.java:188)\n\tat org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:507)\n\tat org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:1580)\n\tat org.eclipse.jetty.server.handler.ScopedHandler.nextScope(ScopedHandler.java:186)\n\tat 
org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1292)\n\tat org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:141)\n\tat org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:234)\n\tat io.micrometer.core.instrument.binder.jetty.TimedHandler.handle(TimedHandler.java:120)\n\tat org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:127)\n\tat org.eclipse.jetty.server.Server.handle(Server.java:501)\n\tat org.eclipse.jetty.server.HttpChannel.lambda$handle$1(HttpChannel.java:383)\n\tat org.eclipse.jetty.server.HttpChannel.dispatch(HttpChannel.java:556)\n\tat org.eclipse.jetty.server.HttpChannel.handle(HttpChannel.java:375)\n\tat org.eclipse.jetty.server.HttpConnection.onFillable(HttpConnection.java:273)\n\tat org.eclipse.jetty.io.AbstractConnection$ReadCallback.succeeded(AbstractConnection.java:311)\n\tat org.eclipse.jetty.io.FillInterest.fillable(FillInterest.java:105)\n\tat org.eclipse.jetty.io.ChannelEndPoint$1.run(ChannelEndPoint.java:104)\n\tat org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.runTask(EatWhatYouKill.java:336)\n\tat org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.doProduce(EatWhatYouKill.java:313)\n\tat org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.tryProduce(EatWhatYouKill.java:171)\n\tat org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.produce(EatWhatYouKill.java:135)\n\tat org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:806)\n\tat org.eclipse.jetty.util.thread.QueuedThreadPool$Runner.run(QueuedThreadPool.java:938)\n\tat java.lang.Thread.run(Thread.java:748)'}, 'Mean_proper_motions_over_the_sky': {'result': 'ERROR', 'outputs': {'valid': True}, 'time': {'result': 'SLOW', 'elapsed': '55.17', 'expected': '55.00', 'percent': '0.30', 'start': '2022-06-02T17:15:23.357388', 'finish': '2022-06-02T17:16:18.524291'}, 'logs': 'Fail to execute line 13: df = spark.sql(query).cache()\nTraceback (most recent call last):\n File "/tmp/1654190174607-0/zeppelin_python.py", line 158, in \n exec(code, _zcUserQueryNameSpace)\n File "", line 13, in \n File "/opt/spark/python/pyspark/sql/session.py", line 723, in sql\n return DataFrame(self._jsparkSession.sql(sqlQuery), self._wrapped)\n File "/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py", line 1305, in __call__\n answer, self.gateway_client, self.target_id, self.name)\n File "/opt/spark/python/pyspark/sql/utils.py", line 117, in deco\n raise converted from None\npyspark.sql.utils.AnalysisException: Table or view not found: gaia_source; line 1 pos 121;\n\'Aggregate [\'hpx_id], [\'floor((\'source_id / 140737488355328)) AS hpx_id#0, count(1) AS n#1L, \'AVG(\'pmra) AS avg_pmra#2, \'AVG(\'pmdec) AS avg_pmdec#3]\n+- \'UnresolvedRelation [gaia_source], [], false'}, 'Source_counts_over_the_sky.json': {'result': 'ERROR', 'outputs': {'valid': True}, 'time': {'result': 'FAST', 'elapsed': '13.97', 'expected': '22.00', 'percent': '-36.51', 'start': '2022-06-02T17:16:18.524520', 'finish': '2022-06-02T17:16:32.491473'}, 'logs': 'Fail to execute line 21: df = spark.sql("SELECT FLOOR(source_id / %d"%(divisor) + ") AS hpx_id, COUNT(*) AS n FROM gaia_source GROUP BY hpx_id")\nTraceback (most recent call last):\n File "/tmp/1654190174607-0/zeppelin_python.py", line 158, in \n exec(code, _zcUserQueryNameSpace)\n File "", line 21, in \n File "/opt/spark/python/pyspark/sql/session.py", line 723, in sql\n return DataFrame(self._jsparkSession.sql(sqlQuery), self._wrapped)\n File 
"/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py", line 1305, in __call__\n answer, self.gateway_client, self.target_id, self.name)\n File "/opt/spark/python/pyspark/sql/utils.py", line 117, in deco\n raise converted from None\npyspark.sql.utils.AnalysisException: Table or view not found: gaia_source; line 1 pos 72;\n\'Aggregate [\'hpx_id], [\'FLOOR((\'source_id / 140737488355328)) AS hpx_id#5, count(1) AS n#6L]\n+- \'UnresolvedRelation [gaia_source], [], false'}, 'Library_Validation.json': {'result': 'PASS', 'outputs': {'valid': True}, 'time': {'result': 'FAST', 'elapsed': '19.35', 'expected': '60.00', 'percent': '-67.75', 'start': '2022-06-02T17:16:32.491733', 'finish': '2022-06-02T17:16:51.842261'}, 'logs': ''}}, {'GaiaDMPSetup': {'result': 'PASS', 'outputs': {'valid': True}, 'time': {'result': 'SLOW', 'elapsed': '53.49', 'expected': '45.00', 'percent': '18.87', 'start': '2022-06-02T17:15:07.612010', 'finish': '2022-06-02T17:16:01.102911'}, 'logs': ''}, 'Mean_proper_motions_over_the_sky': {'result': 'PASS', 'outputs': {'valid': True}, 'time': {'result': 'SLOW', 'elapsed': '93.22', 'expected': '55.00', 'percent': '69.50', 'start': '2022-06-02T17:16:01.103126', 'finish': '2022-06-02T17:17:34.325558'}, 'logs': ''}, 'Source_counts_over_the_sky.json': {'result': 'PASS', 'outputs': {'valid': True}, 'time': {'result': 'SLOW', 'elapsed': '32.61', 'expected': '22.00', 'percent': '48.21', 'start': '2022-06-02T17:17:34.325949', 'finish': '2022-06-02T17:18:06.931299'}, 'logs': ''}, 'Library_Validation.json': {'result': 'PASS', 'outputs': {'valid': True}, 'time': {'result': 'FAST', 'elapsed': '17.20', 'expected': '60.00', 'percent': '-71.34', 'start': '2022-06-02T17:18:06.931769', 'finish': '2022-06-02T17:18:24.127882'}, 'logs': ''}}, {'GaiaDMPSetup': {'result': 'FAIL', 'outputs': {'valid': True}, 'time': {'result': 'FAST', 'elapsed': '8.72', 'expected': '45.00', 'percent': '-80.62', 'start': '2022-06-02T17:15:07.612168', 'finish': '2022-06-02T17:15:16.333407'}, 'logs': ''}, 'Mean_proper_motions_over_the_sky': {'result': 'ERROR', 'outputs': {'valid': True}, 'time': {'result': 'FAST', 'elapsed': '54.87', 'expected': '55.00', 'percent': '-0.24', 'start': '2022-06-02T17:15:16.333582', 'finish': '2022-06-02T17:16:11.199682'}, 'logs': 'Fail to execute line 13: df = spark.sql(query).cache()\nTraceback (most recent call last):\n File "/tmp/1654190167519-0/zeppelin_python.py", line 158, in \n exec(code, _zcUserQueryNameSpace)\n File "", line 13, in \n File "/opt/spark/python/pyspark/sql/session.py", line 723, in sql\n return DataFrame(self._jsparkSession.sql(sqlQuery), self._wrapped)\n File "/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py", line 1305, in __call__\n answer, self.gateway_client, self.target_id, self.name)\n File "/opt/spark/python/pyspark/sql/utils.py", line 117, in deco\n raise converted from None\npyspark.sql.utils.AnalysisException: Table or view not found: gaia_source; line 1 pos 121;\n\'Aggregate [\'hpx_id], [\'floor((\'source_id / 140737488355328)) AS hpx_id#0, count(1) AS n#1L, \'AVG(\'pmra) AS avg_pmra#2, \'AVG(\'pmdec) AS avg_pmdec#3]\n+- \'UnresolvedRelation [gaia_source], [], false'}, 'Source_counts_over_the_sky.json': {'result': 'ERROR', 'outputs': {'valid': True}, 'time': {'result': 'FAST', 'elapsed': '12.28', 'expected': '22.00', 'percent': '-44.17', 'start': '2022-06-02T17:16:11.199863', 'finish': '2022-06-02T17:16:23.482303'}, 'logs': 'Fail to execute line 21: df = spark.sql("SELECT FLOOR(source_id / %d"%(divisor) + ") AS hpx_id, COUNT(*) AS n 
FROM gaia_source GROUP BY hpx_id")\nTraceback (most recent call last):\n File "/tmp/1654190167519-0/zeppelin_python.py", line 158, in \n exec(code, _zcUserQueryNameSpace)\n File "", line 21, in \n File "/opt/spark/python/pyspark/sql/session.py", line 723, in sql\n return DataFrame(self._jsparkSession.sql(sqlQuery), self._wrapped)\n File "/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py", line 1305, in __call__\n answer, self.gateway_client, self.target_id, self.name)\n File "/opt/spark/python/pyspark/sql/utils.py", line 117, in deco\n raise converted from None\npyspark.sql.utils.AnalysisException: Table or view not found: gaia_source; line 1 pos 72;\n\'Aggregate [\'hpx_id], [\'FLOOR((\'source_id / 140737488355328)) AS hpx_id#5, count(1) AS n#6L]\n+- \'UnresolvedRelation [gaia_source], [], false'}, 'Library_Validation.json': {'result': 'PASS', 'outputs': {'valid': True}, 'time': {'result': 'FAST', 'elapsed': '18.86', 'expected': '60.00', 'percent': '-68.57', 'start': '2022-06-02T17:16:23.482567', 'finish': '2022-06-02T17:16:42.343461'}, 'logs': ''}}, {'GaiaDMPSetup': {'result': 'PASS', 'outputs': {'valid': True}, 'time': {'result': 'SLOW', 'elapsed': '59.78', 'expected': '45.00', 'percent': '32.85', 'start': '2022-06-02T17:15:07.612183', 'finish': '2022-06-02T17:16:07.392690'}, 'logs': ''}, 'Mean_proper_motions_over_the_sky': {'result': 'PASS', 'outputs': {'valid': True}, 'time': {'result': 'SLOW', 'elapsed': '87.92', 'expected': '55.00', 'percent': '59.85', 'start': '2022-06-02T17:16:07.392815', 'finish': '2022-06-02T17:17:35.310943'}, 'logs': ''}, 'Source_counts_over_the_sky.json': {'result': 'PASS', 'outputs': {'valid': True}, 'time': {'result': 'SLOW', 'elapsed': '31.44', 'expected': '22.00', 'percent': '42.91', 'start': '2022-06-02T17:17:35.311292', 'finish': '2022-06-02T17:18:06.751329'}, 'logs': ''}, 'Library_Validation.json': {'result': 'PASS', 'outputs': {'valid': True}, 'time': {'result': 'FAST', 'elapsed': '16.68', 'expected': '60.00', 'percent': '-72.20', 'start': '2022-06-02T17:18:06.751983', 'finish': '2022-06-02T17:18:23.429412'}, 'logs': ''}}] + > .... + + + Looks like this might be it ... + https://issues.apache.org/jira/browse/ZEPPELIN-5237 + + + + + # + # Left it to settle for several hours. + # Tried starting again. + # Fails to create notebooks. + # + # Several different things going wrong. + # None of them to do with Spark scheduler. + # + + > .... + > ERROR [2022-06-03 04:33:24,748] ({qtp2128029086-18426} WebApplicationExceptionMapper.java[toResponse]:49) - Error response + > java.lang.OutOfMemoryError: Java heap space + > .... + + + > .... 
+ > INFO [2022-06-03 04:35:26,535] ({qtp2128029086-18468} LoginRestApi.java[postLogin]:249) - {"status":"OK","message":"","body":{"principal":"Balline","ticket":"db09e8f1-0b33-4041-a80a-fcc8275a0bdc","roles":"[\"user\"]"}} + > INFO [2022-06-03 04:35:27,854] ({qtp2128029086-18510} NotebookRestApi.java[createNote]:385) - Creating new note by JSON {"paragraphs": [{"text": "%pyspark\n\n# Check Numpy\n\nimport numpy\nassert numpy.__version__ == \"1.20.3\" ", "user": "gaiauser", "dateUpdated": "2022-03-16T18:34:45+0000", "progress": 0, "config": {"editorSetting": {"language": "python", "editOnDblClick": false, "completionKey": "TAB", "completionSupport": true}, "colWidth": 12, "editorMode": "ace/mode/python", "fontSize": 9, "results": {}, "enabled": true}, "settings": {"params": {}, "forms": {}}, "apps": [], "runtimeInfos": {}, "progressUpdateIntervalMs": 5 + > eCreated": "2022-03-16T18:35:53+0000", "dateStarted": "2022-03-17T14:55:20+0000", "dateFinished": "2022-03-17T14:55:20+0000", "status": "FINISHED", "$$hashKey": "object:2633"}], "name": "/tmp/8H6C2GHHFY.json", "id": "2GZ96Z759", "defaultInterpreterGroup": "spark", "version": "0.10.0", "noteParams": {}, "noteForms": {}, "angularObjects": {}, "config": {"isZeppelinNotebookCronEnable": false, "looknfeel": "default", "personalizedMode": "false"}, "info": {}, "path": "/tmp/libraries.json"} + > INFO [2022-06-03 04:35:29,631] ({qtp2128029086-18510} LocalConfigStorage.java[save]:70) - Save notebook authorization to file: /home/fedora/zeppelin/conf/notebook-authorization.json + > ERROR [2022-06-03 04:36:20,708] ({qtp2128029086-18510} WebApplicationExceptionMapper.java[toResponse]:49) - Error response + > java.lang.OutOfMemoryError: Java heap space + > INFO [2022-06-03 04:36:25,419] ({qtp2128029086-18498} LoginRestApi.java[postLogin]:249) - {"status":"OK","message":"","body":{"principal":"Fipa","ticket":"cf949dcd-2fef-48d8-9d17-c1d46d34b925","roles":"[\"user\"]"}} + > .... + + + + > .... + > INFO [2022-06-03 04:44:58,084] ({qtp2128029086-18884} LocalConfigStorage.java[save]:70) - Save notebook authorization to file: /home/fedora/zeppelin/conf/notebook-authorization.json + > ERROR [2022-06-03 04:45:27,485] ({qtp2128029086-18877} WebApplicationExceptionMapper.java[toResponse]:49) - Error response + > java.util.ConcurrentModificationException + > at java.util.HashMap$EntrySpliterator.forEachRemaining(HashMap.java:1704) + > at java.util.stream.AbstractPipeline.copyInto(AbstractPipeline.java:482) + > at java.util.stream.AbstractPipeline.wrapAndCopyInto(AbstractPipeline.java:472) + > at java.util.stream.ReduceOps$ReduceOp.evaluateSequential(ReduceOps.java:708) + > at java.util.stream.AbstractPipeline.evaluate(AbstractPipeline.java:234) + > at java.util.stream.ReferencePipeline.collect(ReferencePipeline.java:566) + > at org.apache.zeppelin.notebook.Notebook.getNotesInfo(Notebook.java:662) + > at org.apache.zeppelin.service.NotebookService.listNotesInfo(NotebookService.java:245) + > at org.apache.zeppelin.rest.NotebookRestApi.getNoteList(NotebookRestApi.java:318) + > at sun.reflect.GeneratedMethodAccessor11.invoke(Unknown Source) + > at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) + > at java.lang.reflect.Method.invoke(Method.java:498) + > .... + + This might help .. 
+ https://stackoverflow.com/a/67390602 + + + Enabling HDFS Storage for Zeppelin Notebooks + https://docs.cloudera.com/HDPDocuments/HDP2/HDP-2.6.3/bk_zeppelin-component-guide/content/ch_zeppelin_upgrade_hdfs_storage.html + + We are currently using GitNotebookRepo + zeppelin.notebook.storage + org.apache.zeppelin.notebook.repo.GitNotebookRepo + + + Separate the notebook storage from Zeppelin .. + https://zeppelin.apache.org/docs/0.10.0/setup/storage/storage.html#notebook-storage-in-mongodb + +--------------------------------------------------------------------------------------- + + All of the tests fail during create-note . + + Try a simple test + + loopcount=1 + usercount=1 + + > .... + > INFO [2022-06-03 05:21:59,892] ({qtp2128029086-19579} LocalConfigStorage.java[save]:70) - Save notebook authorization to file: /home/fedora/zeppelin/conf/notebook-authorization.json + > ERROR [2022-06-03 05:22:00,491] ({qtp2128029086-19579} WebApplicationExceptionMapper.java[toResponse]:49) - Error response + > java.lang.OutOfMemoryError: Java heap space + > .... + + Restart Zeppelin + + ssh zeppelin \ + ' + zeppelin-daemon.sh restart + ' + + > .... + > INFO [2022-06-03 05:24:05,084] ({main} ZeppelinLocationStrategy.java[locate]:44) - Load configuration from /home/fedora/zeppelin/conf/zeppelin-site.xml + > INFO [2022-06-03 05:24:05,088] ({main} ZeppelinLocationStrategy.java[locate]:44) - Load configuration from /home/fedora/zeppelin/conf/zeppelin-site.xml + > INFO [2022-06-03 05:24:05,146] ({main} ZeppelinConfiguration.java[create]:135) - Server Host: 0.0.0.0 + > INFO [2022-06-03 05:24:05,146] ({main} ZeppelinConfiguration.java[create]:139) - Server Port: 8080 + > INFO [2022-06-03 05:24:05,158] ({main} ZeppelinConfiguration.java[create]:141) - Context Path: / + > INFO [2022-06-03 05:24:05,158] ({main} ZeppelinConfiguration.java[create]:142) - Zeppelin Version: 0.10.0 + > INFO [2022-06-03 05:24:05,171] ({main} Log.java[initialized]:169) - Logging initialized @614ms to org.eclipse.jetty.util.log.Slf4jLog + > WARN [2022-06-03 05:24:05,426] ({main} ZeppelinConfiguration.java[getConfigFSDir]:653) - zeppelin.config.fs.dir is not specified, fall back to local conf directory zeppelin.conf.dir + > WARN [2022-06-03 05:24:05,430] ({main} ZeppelinConfiguration.java[getConfigFSDir]:653) - zeppelin.config.fs.dir is not specified, fall back to local conf directory zeppelin.conf.dir + > WARN [2022-06-03 05:24:05,430] ({main} ZeppelinConfiguration.java[getConfigFSDir]:653) - zeppelin.config.fs.dir is not specified, fall back to local conf directory zeppelin.conf.dir + > WARN [2022-06-03 05:24:05,466] ({main} LocalConfigStorage.java[loadCredentials]:88) - Credential file /home/fedora/zeppelin/conf/credentials.json is not existed + > INFO [2022-06-03 05:24:05,505] ({ImmediateThread-1654233845421} PluginManager.java[loadNotebookRepo]:78) - Loading NotebookRepo Plugin: org.apache.zeppelin.notebook.repo.GitNotebookRepo + > INFO [2022-06-03 05:24:05,579] ({ImmediateThread-1654233845421} VFSNotebookRepo.java[setNotebookDirectory]:69) - Using notebookDir: /home/fedora/zeppelin/notebook + > INFO [2022-06-03 05:24:05,606] ({main} ZeppelinServer.java[setupWebAppContext]:577) - warPath is: /home/fedora/zeppelin/zeppelin-web-0.10.0.war + > INFO [2022-06-03 05:24:05,606] ({main} ZeppelinServer.java[setupWebAppContext]:590) - ZeppelinServer Webapp path: /home/fedora/zeppelin/webapps + > INFO [2022-06-03 05:24:05,626] ({main} ZeppelinServer.java[setupWebAppContext]:577) - warPath is: /home/fedora/zeppelin/zeppelin-web-angular-0.10.0.war 
+ > INFO [2022-06-03 05:24:05,627] ({main} ZeppelinServer.java[setupWebAppContext]:590) - ZeppelinServer Webapp path: /home/fedora/zeppelin/webapps/next + > INFO [2022-06-03 05:24:05,670] ({main} NotebookServer.java[]:156) - NotebookServer instantiated: org.apache.zeppelin.socket.NotebookServer@1bd39d3c + > INFO [2022-06-03 05:24:05,671] ({main} NotebookServer.java[setNotebook]:167) - Injected NotebookProvider + > INFO [2022-06-03 05:24:05,671] ({main} NotebookServer.java[setServiceLocator]:161) - Injected ServiceLocator: ServiceLocatorImpl(shared-locator,0,895281180) + > INFO [2022-06-03 05:24:05,671] ({main} NotebookServer.java[setNotebookService]:174) - Injected NotebookServiceProvider + > INFO [2022-06-03 05:24:05,671] ({main} NotebookServer.java[setAuthorizationServiceProvider]:181) - Injected NotebookAuthorizationServiceProvider + > INFO [2022-06-03 05:24:05,671] ({main} NotebookServer.java[setConnectionManagerProvider]:187) - Injected ConnectionManagerProvider + > INFO [2022-06-03 05:24:05,671] ({ImmediateThread-1654233845421} GitNotebookRepo.java[init]:77) - Opening a git repo at '/home/fedora/zeppelin/notebook' + > INFO [2022-06-03 05:24:05,672] ({main} ZeppelinServer.java[setupClusterManagerServer]:467) - Cluster mode is disabled + > INFO [2022-06-03 05:24:05,672] ({main} ZeppelinServer.java[main]:251) - Starting zeppelin server + > .... + + + Try a simple test + + loopcount=1 + usercount=1 + + > .... + > ------------ Test Result: [PASS] ------------ + > .... + + + Try a harder test + + loopcount=2 + usercount=2 + + > .... + > ------------ Test Result: [PASS] ------------ + > .... + > ------------ Test Result: [PASS] ------------ + > .... + + Try a harder test + + loopcount=4 + usercount=5 + + + + > .... + > INFO [2022-06-03 05:43:42,352] ({qtp2128029086-6278} NotebookRestApi.java[initParagraph]:1105) - Init Paragraph for user Fipa + > INFO [2022-06-03 05:43:42,352] ({qtp2128029086-6278} NotebookRestApi.java[configureParagraph]:1116) - Configure Paragraph for user Fipa + > ERROR [2022-06-03 05:43:42,356] ({qtp2128029086-6345} WebApplicationExceptionMapper.java[toResponse]:49) - Error response + > java.util.ConcurrentModificationException + > at java.util.HashMap$EntrySpliterator.forEachRemaining(HashMap.java:1704) + > at java.util.stream.AbstractPipeline.copyInto(AbstractPipeline.java:482) + > at java.util.stream.AbstractPipeline.wrapAndCopyInto(AbstractPipeline.java:472) + > at java.util.stream.ReduceOps$ReduceOp.evaluateSequential(ReduceOps.java:708) + > at java.util.stream.AbstractPipeline.evaluate(AbstractPipeline.java:234) + > at java.util.stream.ReferencePipeline.collect(ReferencePipeline.java:566) + > at org.apache.zeppelin.notebook.Notebook.getNotesInfo(Notebook.java:662) + > at org.apache.zeppelin.service.NotebookService.listNotesInfo(NotebookService.java:245) + > at org.apache.zeppelin.rest.NotebookRestApi.getNoteList(NotebookRestApi.java:318) + > at sun.reflect.GeneratedMethodAccessor13.invoke(Unknown Source) + > at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) + > at java.lang.reflect.Method.invoke(Method.java:498) + > .... + + + > .... 
+ > at org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.tryProduce(EatWhatYouKill.java:171) + > at org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.run(EatWhatYouKill.java:129) + > at org.eclipse.jetty.util.thread.ReservedThreadExecutor$ReservedThread.run(ReservedThreadExecutor.java:375) + > at org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:806) + > at org.eclipse.jetty.util.thread.QueuedThreadPool$Runner.run(QueuedThreadPool.java:938) + > at java.lang.Thread.run(Thread.java:748) + > INFO [2022-06-03 05:43:57,686] ({qtp2128029086-6525} LoginRestApi.java[postLogin]:249) - {"status":"FORBIDDEN","message":""} + > ERROR [2022-06-03 05:43:57,746] ({LuceneSearch5} NoteEventAsyncListener.java[run]:128) - Fail to handle NoteEvent + > java.lang.IllegalStateException: this writer hit an unrecoverable error; cannot complete commit + > at org.apache.lucene.index.IndexWriter.finishCommit(IndexWriter.java:3801) + > at org.apache.lucene.index.IndexWriter.commitInternal(IndexWriter.java:3779) + > at org.apache.lucene.index.IndexWriter.commit(IndexWriter.java:3729) + > at org.apache.zeppelin.search.LuceneSearch.updateDoc(LuceneSearch.java:241) + > at org.apache.zeppelin.search.LuceneSearch.addIndexDocAsync(LuceneSearch.java:326) + > at org.apache.zeppelin.search.LuceneSearch.addNoteIndex(LuceneSearch.java:305) + > at org.apache.zeppelin.search.SearchService.handleNoteCreateEvent(SearchService.java:108) + > at org.apache.zeppelin.notebook.NoteEventAsyncListener$EventHandling.run(NoteEventAsyncListener.java:113) + > at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) + > at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) + > at java.lang.Thread.run(Thread.java:748) + > Caused by: java.lang.OutOfMemoryError: GC overhead limit exceeded + > ERROR [2022-06-03 05:43:57,747] ({LuceneSearch5} NoteEventAsyncListener.java[run]:128) - Fail to handle NoteEvent + > org.apache.lucene.store.AlreadyClosedException: this IndexWriter is closed + > at org.apache.lucene.index.IndexWriter.ensureOpen(IndexWriter.java:877) + > at org.apache.lucene.index.IndexWriter.ensureOpen(IndexWriter.java:891) + > at org.apache.lucene.index.IndexWriter.updateDocuments(IndexWriter.java:1468) + > at org.apache.lucene.index.IndexWriter.updateDocument(IndexWriter.java:1757) + > at org.apache.zeppelin.search.LuceneSearch.updateDoc(LuceneSearch.java:240) + > at org.apache.zeppelin.search.LuceneSearch.addParagraphIndex(LuceneSearch.java:314) + > at org.apache.zeppelin.search.SearchService.handleParagraphCreateEvent(SearchService.java:123) + > at org.apache.zeppelin.notebook.NoteEventAsyncListener$EventHandling.run(NoteEventAsyncListener.java:119) + > at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) + > .... + + + > .... 
+ > INFO [2022-06-03 05:44:07,033] ({qtp2128029086-6665} LocalConfigStorage.java[save]:70) - Save notebook authorization to file: /home/fedora/zeppelin/conf/notebook-authorization.json + > INFO [2022-06-03 05:44:07,155] ({qtp2128029086-6665} VFSNotebookRepo.java[save]:144) - Saving note 2H6BTBEZW to tmp/6G8WI0Q8EA.json_2H6BTBEZW.zpln + > ERROR [2022-06-03 05:44:07,155] ({LuceneSearch5} NoteEventAsyncListener.java[run]:128) - Fail to handle NoteEvent + > org.apache.lucene.store.AlreadyClosedException: this IndexWriter is closed + > at org.apache.lucene.index.IndexWriter.ensureOpen(IndexWriter.java:877) + > at org.apache.lucene.index.IndexWriter.ensureOpen(IndexWriter.java:891) + > at org.apache.lucene.index.IndexWriter.updateDocuments(IndexWriter.java:1468) + > at org.apache.lucene.index.IndexWriter.updateDocument(IndexWriter.java:1757) + > at org.apache.zeppelin.search.LuceneSearch.updateDoc(LuceneSearch.java:240) + > at org.apache.zeppelin.search.LuceneSearch.indexNoteName(LuceneSearch.java:398) + > at org.apache.zeppelin.search.LuceneSearch.addIndexDocAsync(LuceneSearch.java:324) + > at org.apache.zeppelin.search.LuceneSearch.addNoteIndex(LuceneSearch.java:305) + > at org.apache.zeppelin.search.SearchService.handleNoteCreateEvent(SearchService.java:108) + > at org.apache.zeppelin.notebook.NoteEventAsyncListener$EventHandling.run(NoteEventAsyncListener.java:113) + > at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) + > at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) + > at java.lang.Thread.run(Thread.java:748) + > Caused by: java.lang.OutOfMemoryError: GC overhead limit exceeded + > .... + + > .... + > at org.eclipse.jetty.io.ChannelEndPoint$1.run(ChannelEndPoint.java:104) + > at org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:806) + > at org.eclipse.jetty.util.thread.QueuedThreadPool$Runner.run(QueuedThreadPool.java:938) + > at java.lang.Thread.run(Thread.java:748) + > INFO [2022-06-03 05:44:44,195] ({qtp2128029086-7031} LoginRestApi.java[postLogin]:249) - {"status":"FORBIDDEN","message":""} + > ERROR [2022-06-03 05:44:53,301] ({SchedulerFactory73} Job.java[run]:174) - Job failed + > java.lang.OutOfMemoryError: Java heap space + > ERROR [2022-06-03 05:44:53,303] ({SchedulerFactory73} NotebookServer.java[onStatusChange]:1978) - Error + > java.lang.OutOfMemoryError: Java heap space + > WARN [2022-06-03 05:44:53,303] ({SchedulerFactory73} NotebookServer.java[onStatusChange]:1986) - Job paragraph_1654235056489_233652329 is finished, status: ERROR, exception: java.lang.OutOfMemoryError: Java heap space, result: null + > INFO [2022-06-03 05:44:53,303] ({SchedulerFactory73} VFSNotebookRepo.java[save]:144) - Saving note 2H3SQNHFX to tmp/A2J3VR1BSY.json_2H3SQNHFX.zpln + > WARN [2022-06-03 05:44:53,428] ({qtp2128029086-7118} QueuedThreadPool.java[run]:950) - + > java.lang.OutOfMemoryError: GC overhead limit exceeded + > INFO [2022-06-03 05:44:53,677] ({SchedulerFactory73} AbstractScheduler.java[runJob]:154) - Job paragraph_1654235056489_233652329 finished by scheduler RemoteInterpreter-spark-Fipa-shared_session with status ERROR + > ERROR [2022-06-03 05:44:53,747] ({qtp2128029086-7118} LoginRestApi.java[proceedToLogin]:213) - Exception in login: + > org.apache.shiro.authc.AuthenticationException: Authentication token of type [class org.apache.shiro.authc.UsernamePasswordToken] could not be authenticated by any configured realms. Please ensure that at least one realm can authenticate these tokens. 
+    > at org.apache.shiro.authc.pam.AtLeastOneSuccessfulStrategy.afterAllAttempts(AtLeastOneSuccessfulStrategy.java:58)
+    > at org.apache.shiro.authc.pam.ModularRealmAuthenticator.doMultiRealmAuthentication(ModularRealmAuthenticator.java:241)
+    > at org.apache.shiro.authc.pam.ModularRealmAuthenticator.doAuthenticate(ModularRealmAuthenticator.java:275)
+    > ....
+
+
+    #
+    # 1226 notebooks in the authorization file.
+
+    jq '.authInfo | length' /home/fedora/zeppelin/conf/notebook-authorization.json
+
+    > 1226
+
+    #
+    # Everything has ground to a halt.
+    # Nothing to do with Spark, everything to do with Zeppelin.
+    #
+    # Browser login auth works but unable to display home page - hangs
+    #
+
+    #
+    # Delete all the notebooks in 'tmp'
+    # Delete the authorization file.
+    # Restart Zeppelin ...
+
+    pushd /home/fedora/zeppelin/notebook/tmp/
+        rm *
+    popd
+    rm /home/fedora/zeppelin/conf/notebook-authorization.json
+
+    zeppelin-daemon.sh restart
+
+    start the same test again ..
+
+
+    #
+    # Locks up, first few notes created, but basically this Zeppelin instance is borked.
+    # Restarting things doesn't solve it.
+    #
+
+    loopcount=2
+    usercount=2
+
+    NOW we start to see applications blocked in Hadoop
+    Queue's AM resource limit exceeded.
+    Details : AM Partition = ;
+    AM Resource Request = ;
+    Queue Resource Limit for AM = ;
+    User AM Resource Limit of the queue = ;
+    Queue AM Resource Usage = ;
+
+
+    Kill all applications on YARN which are in RUNNING state:
+    https://stackoverflow.com/a/56035711
+
+    ssh master01
+
+        for x in $(yarn application -list -appStates RUNNING | awk 'NR > 2 { print $1 }'); do yarn application -kill $x; done
+
+
+    restart zeppelin
+
+
+    try again
+
+    loopcount=2
+    usercount=2
+
+    > ....
+    > INFO [2022-06-03 07:44:30,067] ({SchedulerFactory2} AbstractScheduler.java[runJob]:127) - Job paragraph_1654242268481_2129599498 started by scheduler RemoteInterpreter-spark-Fipa-shared_session
+    > WARN [2022-06-03 07:44:30,068] ({qtp2128029086-37} NotebookServer.java[onStatusChange]:1986) - Job paragraph_1654242267571_1686053048 is finished, status: ERROR, exception: java.util.ConcurrentModificationException, result: %text Unexpected exception: java.util.ConcurrentModificationException
+    > at java.util.HashMap$ValueSpliterator.forEachRemaining(HashMap.java:1633)
+    > at java.util.stream.AbstractPipeline.copyInto(AbstractPipeline.java:482)
+    > at java.util.stream.AbstractPipeline.wrapAndCopyInto(AbstractPipeline.java:472)
+    > at java.util.stream.ReduceOps$ReduceOp.evaluateSequential(ReduceOps.java:708)
+    > at java.util.stream.AbstractPipeline.evaluate(AbstractPipeline.java:234)
+    > at java.util.stream.ReferencePipeline.collect(ReferencePipeline.java:566)
+    > at org.apache.zeppelin.service.JobManagerService.getNoteJobInfoByUnixTime(JobManagerService.java:90)
+    > at org.apache.zeppelin.socket.NotebookServer.broadcastUpdateNoteJobInfo(NotebookServer.java:519)
+    > at org.apache.zeppelin.socket.NotebookServer.onStatusChange(NotebookServer.java:2007)
+    > at org.apache.zeppelin.socket.NotebookServer.onStatusChange(NotebookServer.java:105)
+    > at org.apache.zeppelin.scheduler.Job.setStatus(Job.java:141)
+    > at org.apache.zeppelin.notebook.Paragraph.setStatus(Paragraph.java:398)
+    > at org.apache.zeppelin.notebook.Paragraph.execute(Paragraph.java:349)
+    > at org.apache.zeppelin.notebook.Note.run(Note.java:873)
+    > at org.apache.zeppelin.service.NotebookService.runParagraph(NotebookService.java:390)
+    > at org.apache.zeppelin.rest.NotebookRestApi.runParagraph(NotebookRestApi.java:849)
+    > ....
+
+
+    try again
+
+    loopcount=2
+    usercount=2
+
+
+    > ....
+
+    > ------------ Test Result: [PASS] ------------
+    > ....
+    > ------------ Test Result: [PASS] ------------
+    > ....
+
+    let's see how long it will last
+
+    loopcount=20
+    usercount=2
+
+    > ....
+    > ------------ Test Result: [PASS] ------------
+    > ....
+    > ------------ Test Result: [PASS] ------------
+    > ....
+
+
+    loopcount=4
+    usercount=2
+
+    for i in $(seq 0 $((loopcount - 1)))
+    do
+        echo ""
+        echo "-------------"
+        echo "Loop [${i}]"
+        testname="multi-user-$(printf "%02d" ${usercount})-$(printf "%02d" ${i})"
+        echo "Name [${testname}]"
+
+        /tmp/run-benchmark.py \
+            "${endpoint:?}" \
+            "${testconfig:?}" \
+            "${testusers:?}" \
+            "${usercount:?}" \
+        | tee "/tmp/results/${testname:?}.txt"
+
+        filter-results "${testname:?}"
+    done
+
+    grep 'Result' /tmp/results/multi-user-02*
+
+    > /tmp/results/multi-user-02-00.txt:------------ Test Result: [PASS] ------------
+    > /tmp/results/multi-user-02-01.txt:------------ Test Result: [PASS] ------------
+    > /tmp/results/multi-user-02-02.txt:------------ Test Result: [PASS] ------------
+    > /tmp/results/multi-user-02-03.txt:------------ Test Result: [PASS] ------------
+
+
+    loopcount=4
+    usercount=4
+
+    for i in $(seq 0 $((loopcount - 1)))
+    do
+        echo ""
+        echo "-------------"
+        echo "Loop [${i}]"
+        testname="multi-user-$(printf "%02d" ${usercount})-$(printf "%02d" ${i})"
+        echo "Name [${testname}]"
+
+        /tmp/run-benchmark.py \
+            "${endpoint:?}" \
+            "${testconfig:?}" \
+            "${testusers:?}" \
+            "${usercount:?}" \
+        | tee "/tmp/results/${testname:?}.txt"
+
+        filter-results "${testname:?}"
+    done
+
+    Doesn't even get off the ground ..
+    Fails almost immediately
+
+    > ....
+    > ERROR [2022-06-03 14:57:14,159] ({SchedulerFactory83} NotebookServer.java[onStatusChange]:1978) - Error
+    > java.lang.OutOfMemoryError: Java heap space
+    > ....
+
+
+    Options:
+        Increase space for Zeppelin (kick the can)
+        Modify tests to re-use notebooks (in progress)
+        Change notebook repo to plain file rather than git - might help
+        Move notebook repo to a MongoDB database - interesting.
+
+    Count the number of notebooks that caused the problem.
+
+    jq '.authInfo | length' /home/fedora/zeppelin/conf/notebook-authorization.json
+
+    > 781
+
+    find /home/fedora/zeppelin/notebook -name '*.zpln' | wc -l
+
+    > 779
+
+    Fewer than I was expecting.
+
+
+    The places where it fails are:
+
+    File "/usr/local/lib/python3.10/site-packages/aglais_benchmark/aglais_benchmark.py", line 121, in run_notebook
+
+    > ....
+    >     # Make notebook
+    >     batcmd="zdairi --config " + config + " notebook create --filepath " + tmpfile
+    >     pipe = subprocess.Popen(batcmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True)
+    >     result = pipe.communicate()[0]
+    >     print (result)
+    >
+    >     result = result.decode().split("\n")
+    >     text = result[0]
+    >     notebookid = text.split(": ")[1]
+    > ....
+
+
+    File "/usr/local/lib/python3.10/site-packages/aglais_benchmark/aglais_benchmark.py", line 133, in run_notebook
+
+    > ....
+    >     # Print notebook
+    >     batcmd="zdairi --config " + config + " notebook print --notebook " + notebookid
+    >     pipe = subprocess.Popen(batcmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True)
+    >     result = pipe.communicate()[0]
+    >     result = result.decode().split("\n")
+    >     json_notebook = json.loads("".join(result), strict=False)
+    > ....
+
+    In both cases AglaisBenchmarker is expecting nice JSON and it gets an error response instead.
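+
+    Both failures come from blindly parsing the zdairi output.
+    A defensive check would surface the Zeppelin error message instead of
+    an IndexError or a JSON parse error. A minimal sketch (hypothetical
+    helper, not part of AglaisBenchmarker; the 'Create notebook: <id>'
+    output format is taken from the test logs above):
+
+        import subprocess
+
+        def create_notebook(config, tmpfile):
+            # Run zdairi and capture stdout and stderr together,
+            # the same way aglais_benchmark.py does.
+            batcmd = "zdairi --config " + config + " notebook create --filepath " + tmpfile
+            pipe = subprocess.Popen(batcmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True)
+            text = pipe.communicate()[0].decode().split("\n")[0]
+            # On success the first line is 'Create notebook: <id>';
+            # anything else is an error response from the server.
+            if not text.startswith("Create notebook:"):
+                raise RuntimeError("notebook create failed: " + text)
+            return text.split(": ")[1]
+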
+
+
+--------------------------------------------------------------------------
+
+    Why is this deployment having these problems, when previous deployments had fewer?
+
+    Problems with the notebook repo dominate current testing
+
+        20220602-01-concurrent-tests.txt
+        20220601-02-concurrent-tests.txt
+
+
+    Problems with the notebook repo were there in earlier testing
+
+        20220529-02-concurrent-tests.txt
+
+    > ERROR:root:list index out of range
+    > Traceback (most recent call last):
+    >   File "/usr/local/lib/python3.10/site-packages/aglais_benchmark/aglais_benchmark.py", line 114, in run_notebook
+    >     notebookid = text.split(": ")[1]
+
+
+    We can change the notebook repository handler
+
+        zeppelin.notebook.storage
+            org.apache.zeppelin.notebook.repo.GitNotebookRepo
+            org.apache.zeppelin.notebook.repo.MongoNotebookRepo
+
+    We can install MongoDB on a Fedora node.
+    https://tecadmin.net/install-mongodb-on-fedora/
+
+    We can install MongoDB using Ansible
+    https://docs.ansible.com/ansible/latest/collections/community/mongodb/index.html
+
+    We can change the memory available to Zeppelin.
+    https://zeppelin.apache.org/docs/0.10.0/setup/operation/configuration.html#zeppelin_mem
+
+        ZEPPELIN_MEM    JVM memory options
+        default "-Xmx1024m -XX:MaxMetaspaceSize=512m"
+
diff --git a/notes/zrq/20220602-02-metrics-connect.txt b/notes/zrq/20220602-02-metrics-connect.txt
new file mode 100644
index 00000000..5cd39e5d
--- /dev/null
+++ b/notes/zrq/20220602-02-metrics-connect.txt
@@ -0,0 +1,218 @@
+#
+#
+#
+# Copyright (c) 2022, ROE (http://www.roe.ac.uk/)
+#
+# This information is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This information is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see .
+#
+#
+#
+#zrq-notes-time
+#zrq-notes-indent
+#zrq-notes-crypto
+#zrq-notes-ansible
+#zrq-notes-osformat
+#zrq-notes-zeppelin
+#
+
+    Target:
+
+        Connect to the Spark and Grafana user interfaces via a ssh proxy in the client container.
+
+    Result:
+
+        Work in progress ...
+
+# -----------------------------------------------------
+# Setup a SSH tunnel SOCKS proxy.
+# https://www.digitalocean.com/community/tutorials/how-to-route-web-traffic-securely-without-a-vpn-using-a-socks-tunnel
+# Running 'htop' on the Zeppelin node to keep the connection alive.
+#[user@desktop]
+
+    podman exec \
+        --tty \
+        --interactive \
+        ansibler \
+        bash -c \
+            '
+            ssh \
+                -t \
+                -D "3000" \
+                zeppelin \
+                "
+                htop
+                "
+            '
+
+# -----------------------------------------------------
+# Login to the Spark UI using Firefox.
+# (*) using FoxyProxy Firefox plugin to select the SOCKS proxy for internal hostnames.
+#[user@desktop]
+
+    firefox --new-window 'http://master01:8088/cluster' &
+
+
+# -----------------------------------------------------
+# Login to Grafana using Firefox.
+# (*) using FoxyProxy Firefox plugin to select the SOCKS proxy for internal hostnames.
+#[user@desktop] + + firefox --new-window 'http://monitor:3000/login' & + + +# ----------------------------------------------------- +# ----------------------------------------------------- + + # + # Firefox is unable to access the proxied URLs. + # Keep getting 'Connection reset' errors. + # + + # + # Try using curl. + # https://stackoverflow.com/a/9445516 + + curl \ + --head \ + --verbose \ + --proxy 'http://localhost:3000' \ + 'http://zeppelin:8080/' + + > * Trying 127.0.0.1:3000... + > * Connected to localhost (127.0.0.1) port 3000 (#0) + > > HEAD http://zeppelin:8080/ HTTP/1.1 + > > Host: zeppelin:8080 + > > User-Agent: curl/7.79.1 + > > Accept: */* + > > Proxy-Connection: Keep-Alive + > > + > * Recv failure: Connection reset by peer + > * Closing connection 0 + + + + + # + # Socks proxy via podman isn't working. + # + +# ----------------------------------------------------- +# +#[root@ansibler] + + # + # curl via socks inside the container works + # + + + curl \ + --head \ + --verbose \ + --socks5 'localhost:3000' \ + 'http://zeppelin:8080/' + + > * Trying 127.0.0.1:3000... + > * SOCKS5 connect to IPv4 128.232.222.138:8080 (locally resolved) + > * SOCKS5 request granted. + > * Connected to localhost (127.0.0.1) port 3000 (#0) + > > HEAD / HTTP/1.1 + > > Host: zeppelin:8080 + > > User-Agent: curl/7.81.0 + > > Accept: */* + > > + > * Mark bundle as not supporting multiuse + > < HTTP/1.1 200 OK + > HTTP/1.1 200 OK + > < Date: Thu, 02 Jun 2022 15:34:20 GMT + > Date: Thu, 02 Jun 2022 15:34:20 GMT + > < Access-Control-Allow-Credentials: true + > Access-Control-Allow-Credentials: true + > < Access-Control-Allow-Headers: authorization,Content-Type + > Access-Control-Allow-Headers: authorization,Content-Type + > < Access-Control-Allow-Methods: POST, GET, OPTIONS, PUT, HEAD, DELETE + > Access-Control-Allow-Methods: POST, GET, OPTIONS, PUT, HEAD, DELETE + > < X-FRAME-OPTIONS: SAMEORIGIN + > X-FRAME-OPTIONS: SAMEORIGIN + > < X-XSS-Protection: 1; mode=block + > X-XSS-Protection: 1; mode=block + > < X-Content-Type-Options: nosniff + > X-Content-Type-Options: nosniff + > < Last-Modified: Tue, 17 Aug 2021 13:58:44 GMT + > Last-Modified: Tue, 17 Aug 2021 13:58:44 GMT + > < Content-Type: text/html + > Content-Type: text/html + > < Accept-Ranges: bytes + > Accept-Ranges: bytes + > < Content-Length: 4660 + > Content-Length: 4660 + > < Server: + > Server: + > + > < + > * Connection #0 to host localhost left intact + + + # + # curl via socks outside the container fails + # + + curl \ + --head \ + --verbose \ + --socks5 'localhost:3000' \ + 'http://zeppelin:8080/' + + > * Trying 127.0.0.1:3000... + > * Unable to receive initial SOCKS5 response. + > * Closing connection 0 + > curl: (97) Unable to receive initial SOCKS5 response. + + + +# ----------------------------------------------------- +# ----------------------------------------------------- +# Setup a SSH tunnel SOCKS proxy. +# https://www.digitalocean.com/community/tutorials/how-to-route-web-traffic-securely-without-a-vpn-using-a-socks-tunnel +# Running 'htop' on the Zeppelin node to keep the connection alive. +#[user@desktop] + + sshhost=128.232.222.138 + sshuser=fedora + + ssh "${sshuser:?}@${sshhost:?}" \ + -t \ + -D "3001" \ + ' + htop + ' + +# ----------------------------------------------------- +# Login to the Spark UI using Firefox. +# (*) using FoxyProxy Firefox plugin to select the SOCKS proxy for internal hostnames. 
+#[user@desktop] + + firefox --new-window 'http://master01:8088/cluster' & + + +# ----------------------------------------------------- +# Login to Grafana using Firefox. +# (*) using FoxyProxy Firefox plugin to select the SOCKS proxy for internal hostnames. +#[user@desktop] + + firefox --new-window 'http://monitor:3000/login' & + + + + + diff --git a/notes/zrq/20220605-01-blue-deploy.txt b/notes/zrq/20220605-01-blue-deploy.txt new file mode 100644 index 00000000..b61dee0d --- /dev/null +++ b/notes/zrq/20220605-01-blue-deploy.txt @@ -0,0 +1,207 @@ +# +# +# +# Copyright (c) 2022, ROE (http://www.roe.ac.uk/) +# +# This information is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This information is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +# +# +#zrq-notes-time +#zrq-notes-indent +#zrq-notes-crypto +#zrq-notes-ansible +#zrq-notes-osformat +#zrq-notes-zeppelin +# + + Target: + + Deployment used to run the concurrent tests. + + Result: + + Work in progress .. + +# ----------------------------------------------------- +# Create a container to work with. +#[user@desktop] + + source "${HOME:?}/aglais.env" + + podman run \ + --rm \ + --tty \ + --interactive \ + --name ansibler \ + --hostname ansibler \ + --publish 3000:3000 \ + --env "SSH_AUTH_SOCK=/mnt/ssh_auth_sock" \ + --volume "${SSH_AUTH_SOCK}:/mnt/ssh_auth_sock:rw,z" \ + --volume "${HOME:?}/clouds.yaml:/etc/openstack/clouds.yaml:ro,z" \ + --volume "${AGLAIS_CODE:?}/deployments:/deployments:ro,z" \ + ghcr.io/wfau/atolmis/ansible-client:2022.03.19 \ + bash + + +# ----------------------------------------------------- +# Set the target configuration. +#[root@ansibler] + + cloudbase='arcus' + cloudname='iris-gaia-blue' + configname=zeppelin-54.86-spark-6.26.43 + + +# ----------------------------------------------------- +# Deploy everything. +#[root@ansibler] + + time \ + source /deployments/hadoop-yarn/bin/deploy.sh + + > .... + > .... + + # + # Fails with SSH errors. + # + +# ----------------------------------------------------- +# SELinux rules are preventing the SSH client in the container from accessing the SSH agent socket on laptop. +#[root@ansibler] + + ssh -v zeppelin + + > OpenSSH_8.8p1, OpenSSL 3.0.0 7 sep 2021 + > debug1: Reading configuration data /root/.ssh/config + > debug1: /root/.ssh/config line 31: Applying options for zeppelin + > .... + > .... + > debug1: Connecting to 128.232.222.170 [128.232.222.170] port 22. + > debug1: Connection established. + > .... + > .... + > debug1: Host '128.232.222.170' is known and matches the ED25519 host key. + > debug1: Found key in /root/.ssh/known_hosts:1 + > .... + > .... 
+    > debug1: Next authentication method: publickey
+    > debug1: Trying private key: /root/.ssh/id_rsa
+    > debug1: Trying private key: /root/.ssh/id_dsa
+    > debug1: Trying private key: /root/.ssh/id_ecdsa
+    > debug1: Trying private key: /root/.ssh/id_ecdsa_sk
+    > debug1: Trying private key: /root/.ssh/id_ed25519
+    > debug1: Trying private key: /root/.ssh/id_ed25519_sk
+    > debug1: Trying private key: /root/.ssh/id_xmss
+    > debug1: No more authentication methods to try.
+    > fedora@128.232.222.170: Permission denied (publickey,gssapi-keyex,gssapi-with-mic).
+
+
+    > SELinux is preventing ssh from connectto access on the unix_stream_socket /run/user/1000/keyring/ssh.
+    >
+    > ***** Plugin catchall (100. confidence) suggests **************************
+    >
+    > If you believe that ssh should be allowed connectto access on the ssh unix_stream_socket by default.
+    > Then you should report this as a bug.
+    > You can generate a local policy module to allow this access.
+    > Do
+    > allow this access for now by executing:
+    > # ausearch -c 'ssh' --raw | audit2allow -M my-ssh
+    > # semodule -X 300 -i my-ssh.pp
+    >
+    > Additional Information:
+    > Source Context                system_u:system_r:container_t:s0:c5,c38
+    > Target Context                unconfined_u:unconfined_r:unconfined_t:s0-
+    >                               s0:c0.c1023
+    > Target Objects                /run/user/1000/keyring/ssh [ unix_stream_socket ]
+    > Source                        ssh
+    > Source Path                   ssh
+    > Port
+    > Host                          fedora
+    > Source RPM Packages
+    > Target RPM Packages
+    > SELinux Policy RPM            selinux-policy-targeted-36.10-1.fc36.noarch
+    > Local Policy RPM              selinux-policy-targeted-36.10-1.fc36.noarch
+    > Selinux Enabled               True
+    > Policy Type                   targeted
+    > Enforcing Mode                Enforcing
+    > Host Name                     fedora
+    > Platform                      Linux fedora 5.17.12-300.fc36.x86_64 #1 SMP
+    >                               PREEMPT Mon May 30 16:56:53 UTC 2022 x86_64 x86_64
+    > Alert Count                   6
+    > First Seen                    2022-06-05 23:50:31 BST
+    > Last Seen                     2022-06-06 00:30:09 BST
+    > Local ID                      af7b1ea4-8999-44eb-b262-63580fb19ae9
+    >
+    > Raw Audit Messages
+    > type=AVC msg=audit(1654471809.349:356): avc: denied { connectto } for pid=7310 comm="ssh" path="/run/user/1000/keyring/ssh" scontext=system_u:system_r:container_t:s0:c5,c38 tcontext=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 tclass=unix_stream_socket permissive=0
+    >
+    > Hash: ssh,container_t,unconfined_t,unix_stream_socket,connectto
+
+
+# -----------------------------------------------------
+# Allow the SSH client in a container to access our SSH agent.
+#[user@laptop]
+
+    sudo ausearch -c 'ssh' --raw | audit2allow -M container-ssh
+
+    sudo semodule -X 300 -i container-ssh.pp
+
+
+# -----------------------------------------------------
+# Try again ....
+#[user@desktop]
+
+    source "${HOME:?}/aglais.env"
+
+    podman run \
+        --rm \
+        --tty \
+        --interactive \
+        --name ansibler \
+        --hostname ansibler \
+        --publish 3000:3000 \
+        --env "SSH_AUTH_SOCK=/mnt/ssh_auth_sock" \
+        --volume "${SSH_AUTH_SOCK}:/mnt/ssh_auth_sock:rw,z" \
+        --volume "${HOME:?}/clouds.yaml:/etc/openstack/clouds.yaml:ro,z" \
+        --volume "${AGLAIS_CODE:?}/deployments:/deployments:ro,z" \
+        ghcr.io/wfau/atolmis/ansible-client:2022.03.19 \
+        bash
+
+
+# -----------------------------------------------------
+# Set the target configuration.
+#[root@ansibler]
+
+    cloudbase='arcus'
+    cloudname='iris-gaia-blue'
+    configname=zeppelin-54.86-spark-6.26.43
+
+
+# -----------------------------------------------------
+# Deploy everything.
+#[root@ansibler]
+
+    time \
+        source /deployments/hadoop-yarn/bin/deploy.sh
+
+    > ....
+    > ....
+    > real    49m14.575s
+    > user    13m36.205s
+    > sys      2m25.241s
+
+
+
diff --git a/notes/zrq/20220605-02-concurrent-tests.txt b/notes/zrq/20220605-02-concurrent-tests.txt
new file mode 100644
index 00000000..74755cac
--- /dev/null
+++ b/notes/zrq/20220605-02-concurrent-tests.txt
@@ -0,0 +1,646 @@
+#
+#
+#
+# Copyright (c) 2022, ROE (http://www.roe.ac.uk/)
+#
+# This information is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This information is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see .
+#
+#
+#
+#zrq-notes-time
+#zrq-notes-indent
+#zrq-notes-crypto
+#zrq-notes-ansible
+#zrq-notes-osformat
+#zrq-notes-zeppelin
+#
+
+    Target:
+
+        Try to find out more about the limits on concurrent users.
+        Started with a clean deployment 20220605-01-blue-deploy.txt
+
+    Result:
+
+        Work in progress ..
+
+
+# -----------------------------------------------------
+# Create some test users.
+# TODO Move the create-user-tools to ansible/client/bin.
+# TODO Add ansible/client/bin to the client PATH.
+#[root@ansibler]
+
+    #
+    # Only create a small set to see if that reduces problems with too many notebooks.
+    #
+
+    source /deployments/zeppelin/bin/create-user-tools.sh
+
+    testnames02=(
+        Hamar
+        Carclop
+        Halda
+        Jaden
+        Mavaca
+        Franilley
+        Masonania
+        Webbbron
+        Granwaler
+        )
+
+    createarrayusers \
+        "${testnames02[@]}" \
+        | tee /tmp/testusers-02.json \
+        | jq '[ .users[] | {"name": .shirouser.name, "pass": .shirouser.pass} ]'
+
+    > [
+    >   {
+    >     "name": "Hamar",
+    >     "pass": "TieshukeduM8iij2dujeed5viuKoov"
+    >   },
+    > ....
+    > ....
+    >   {
+    >     "name": "Granwaler",
+    >     "pass": "Keik0oiph9moh4Aedahphee7bou2ji"
+    >   }
+    > ]
+
+
+# -----------------------------------------------------
+# Create our benchmark script.
+# TODO Create run-benchmark.py in ansible/client/bin.
+# Learning Python:
+#   Command line args
+#   https://realpython.com/python-command-line-arguments/
+#   String.format()
+#   https://docs.python.org/3/library/string.html#formatstrings
+#   Escape {} in format()
+#   https://stackoverflow.com/a/5466478
+#[root@ansibler]
+
+    cat > /tmp/run-benchmark.py << 'EOF'
+#!/bin/python3
+import sys
+from aglais_benchmark import AglaisBenchmarker
+
+try:
+
+    opts = [opt for opt in sys.argv[1:] if opt.startswith("-")]
+    args = [arg for arg in sys.argv[1:] if not arg.startswith("-")]
+
+    endpoint = args[0]
+    testconfig = args[1]
+    userlist = args[2]
+    usercount = int(args[3])
+
+except IndexError:
+
+    raise SystemExit(f"Usage: {sys.argv[0]} <endpoint> <testconfig> <userlist> <usercount>")
+
+print(
+"""
+{{
+\"config\": {{
+  \"endpoint\": \"{}\",
+  \"testconfig\": \"{}\",
+  \"userlist\": \"{}\",
+  \"usercount\": \"{}\"
+  }}
+}}
+""".format(
+    endpoint,
+    testconfig,
+    userlist,
+    usercount
+    )
+)
+
+AglaisBenchmarker(
+    testconfig,
+    userlist,
+    "/tmp/",
+    endpoint
+    ).run(
+        concurrent=True,
+        users=usercount
+        )
+
+EOF
+
+    chmod 'a+x' /tmp/run-benchmark.py
+
+
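+# -----------------------------------------------------
+# Check the usage message.
+# (sanity check - the usage line shown is what the SystemExit above should print,
+#  not captured output)
+#[root@ansibler]
+
+    /tmp/run-benchmark.py
+
+    > Usage: /tmp/run-benchmark.py <endpoint> <testconfig> <userlist> <usercount>
+
+
+# -----------------------------------------------------
+# Run a quick test with one user.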
+#[root@ansibler] + + mkdir /tmp/results + + endpoint="http://zeppelin:8080" + + testconfig=/deployments/zeppelin/test/config/quick.json + + testusers=/tmp/testusers-02.json + testname=single-user-01 + usercount=1 + + /tmp/run-benchmark.py \ + "${endpoint:?}" \ + "${testconfig:?}" \ + "${testusers:?}" \ + "${usercount:?}" \ + | tee "/tmp/results/${testname:?}.txt" + + > Test started [Multi User] + > b'Create notebook: 2H58E738R\n' + > b'Create notebook: 2H7NGHV9Q\n' + > b'Create notebook: 2H7SKTKDR\n' + > b'Create notebook: 2H6FTDJU8\n' + > Test completed! (117.60 seconds) + > ------------ Test Result: [PASS] ------------ + > [{'GaiaDMPSetup': { .... }}] + + + sed " + 0,/^----/ d + s/\"/#/g + s/'\(-\{0,1\}[0-9.]\{1,\}\)'/\1/g + s/:[[:space:]]*\([a-zA-Z]\{1,\}\)\([,}]\)/:'\1'\2/g + s/:[[:space:]]*\([,}]\),/: ''\1/g + s/'/\"/g + " \ + "/tmp/results/${testname:?}.txt" \ + | tee "/tmp/results/${testname:?}.json" \ + | jq ' + .[] | keys as $x | [ $x[] as $y | {name: $y, value: .[$y].result, time: .[$y].time.elapsed } ] + ' + + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "PASS", + > "time": 37.52 + > }, + > { + > "name": "Library_Validation.json", + > "value": "PASS", + > "time": 9.53 + > }, + > { + > "name": "Mean_proper_motions_over_the_sky", + > "value": "PASS", + > "time": 52.57 + > }, + > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 17.98 + > } + > ] + + +# ----------------------------------------------------- +# Add a function to filter our results. +#[root@ansibler] + + filter-results() + { + local testname=${1:?'testname required'} + sed " + 0,/^----/ d + s/\"/#/g + s/'\(-\{0,1\}[0-9.]\{1,\}\)'/\1/g + s/:[[:space:]]*\([a-zA-Z]\{1,\}\)\([,}]\)/:'\1'\2/g + s/:[[:space:]]*\([,}]\),/: ''\1/g + s/'/\"/g + " \ + "/tmp/results/${testname:?}.txt" \ + | tee "/tmp/results/${testname:?}.json" \ + | jq ' + .[] | keys as $x | [ $x[] as $y | {name: $y, value: .[$y].result, time: .[$y].time.elapsed } ] + ' + } + +# ----------------------------------------------------- +# Step up to 4 users run 4 times. +#[root@ansibler] + + loopcount=4 + usercount=4 + + for i in $(seq 0 $((loopcount - 1))) + do + echo "" + echo "-------------" + echo "Loop [${i}]" + testname="multi-user-$(printf "%02d" ${usercount})-$(printf "%02d" ${i})" + echo "Name [${testname}]" + + /tmp/run-benchmark.py \ + "${endpoint:?}" \ + "${testconfig:?}" \ + "${testusers:?}" \ + "${usercount:?}" \ + | tee "/tmp/results/${testname:?}.txt" + + filter-results "${testname:?}" + done + + + > ------------- + > Loop [3] + > Name [multi-user-04-03] + > + > { + > "config": { + > "endpoint": "http://zeppelin:8080", + > "testconfig": "/deployments/zeppelin/test/config/quick.json", + > "userlist": "/tmp/testusers-02.json", + > "usercount": "4" + > } + > } + > + > /tmp/testusers-02.json + > Test started [Multi User] + > b'Create notebook: 2H7GQG7V2\n' + > b'Create notebook: 2H66KYKND\n' + > b'Create notebook: 2H59W8Q1X\n' + > b'Create notebook: 2H7CZHEXV\n' + > b'Create notebook: 2H5WHXUU2\n' + > b'Create notebook: 2H4K3W85M\n' + > b'Create notebook: 2H7S46HA2\n' + > b'Create notebook: 2H79NJ6VA\n' + > b'Create notebook: 2H72AEZ7R\n' + > b'Create notebook: 2H741XUWP\n' + > b'Create notebook: 2H6RU7Z9J\n' + > b'Create notebook: 2H5W28DC8\n' + > b'Create notebook: 2H7HXT4H7\n' + > b'Create notebook: 2H4X3N7HH\n' + > b'Create notebook: 2H5FNWPZY\n' + > b'Create notebook: 2H5D6RZHR\n' + > Test completed! (59.21 seconds) + > ------------ Test Result: [PASS] ------------ + > [{'GaiaDMPSetup': { .... 
}}] + + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "PASS", + > "time": 3.70 + > }, + > .... + > .... + > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 31.35 + > } + > ] + + +# ----------------------------------------------------- +# Step up to 4 users run 8 times. +#[root@ansibler] + + loopcount=8 + usercount=4 + + for i in $(seq 0 $((loopcount - 1))) + do + echo "" + echo "-------------" + echo "Loop [${i}]" + testname="multi-user-$(printf "%02d" ${usercount})-$(printf "%02d" ${i})" + echo "Name [${testname}]" + + /tmp/run-benchmark.py \ + "${endpoint:?}" \ + "${testconfig:?}" \ + "${testusers:?}" \ + "${usercount:?}" \ + | tee "/tmp/results/${testname:?}.txt" + + filter-results "${testname:?}" + done + + + > .... + > .... + > ------------- + > Loop [7] + > Name [multi-user-04-07] + > + > { + > "config": { + > "endpoint": "http://zeppelin:8080", + > "testconfig": "/deployments/zeppelin/test/config/quick.json", + > "userlist": "/tmp/testusers-02.json", + > "usercount": "4" + > } + > } + > + > /tmp/testusers-02.json + > Test started [Multi User] + > b'Create notebook: 2H6RAKEXJ\n' + > b'Create notebook: 2H4Y746VQ\n' + > b'Create notebook: 2H659RJFZ\n' + > b'Create notebook: 2H4FXKAF9\n' + > b'Create notebook: 2H7V2WFV1\n' + > b'Create notebook: 2H4HM8V81\n' + > b'Create notebook: 2H6945DZW\n' + > b'Create notebook: 2H5KYXT6H\n' + > b'Create notebook: 2H3VKBC5C\n' + > b'Create notebook: 2H7Q6XTDY\n' + > b'Create notebook: 2H658Z5CB\n' + > b'Create notebook: 2H3X85YM8\n' + > b'Create notebook: 2H661MM95\n' + > b'Create notebook: 2H6DWSQK4\n' + > b'Create notebook: 2H6VQ7D83\n' + > b'Create notebook: 2H7UWX4M9\n' + > Test completed! (62.27 seconds) + > ------------ Test Result: [PASS] ------------ + > [{'GaiaDMPSetup': { .... }}] + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "PASS", + > "time": 3.93 + > }, + > .... + > .... + > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 20.11 + > } + > ] + + +# ----------------------------------------------------- +# Step up to 4 users run 16 times. +#[root@ansibler] + + loopcount=16 + usercount=4 + + for i in $(seq 0 $((loopcount - 1))) + do + echo "" + echo "-------------" + echo "Loop [${i}]" + testname="multi-user-$(printf "%02d" ${usercount})-$(printf "%02d" ${i})" + echo "Name [${testname}]" + + /tmp/run-benchmark.py \ + "${endpoint:?}" \ + "${testconfig:?}" \ + "${testusers:?}" \ + "${usercount:?}" \ + | tee "/tmp/results/${testname:?}.txt" + + filter-results "${testname:?}" + done + + # + # Starting to see failures with notebooks access. + # 4 users means only 4 Spark contexts, so only 4 Hadoop applications. + # + + > ------------- + > Loop [0] + > Name [multi-user-04-00] + > .... + > Test completed! (70.05 seconds) + > ------------ Test Result: [PASS] ------------ + > .... + > ------------- + > Loop [1] + > Name [multi-user-04-01] + > .... + > Test completed! (60.42 seconds) + > ------------ Test Result: [PASS] ------------ + > .... + > ------------- + > Loop [2] + > Name [multi-user-04-02] + > .... + > Test completed! (59.80 seconds) + > ------------ Test Result: [PASS] ------------ + > .... + > ------------- + > Loop [3] + > Name [multi-user-04-03] + > .... + > Test completed! (60.88 seconds) + > ------------ Test Result: [PASS] ------------ + > .... + > ------------- + > Loop [4] + > Name [multi-user-04-04] + > .... + > Test completed! (58.60 seconds) + > ------------ Test Result: [FAIL] ------------ + > .... 
+ > ------------- + > Loop [5] + > Name [multi-user-04-05] + > .... + > Test completed! (63.50 seconds) + > ------------ Test Result: [PASS] ------------ + > .... + > ------------- + > Loop [6] + > Name [multi-user-04-06] + > .... + > Test completed! (62.36 seconds) + > ------------ Test Result: [PASS] ------------ + > .... + > ------------- + > Loop [7] + > Name [multi-user-04-07] + > .... + > ------------ Test Result: [PASS] ------------ + > .... + > ------------- + > Loop [8] + > Name [multi-user-04-08] + > .... + > Test completed! (60.83 seconds) + > ------------ Test Result: [PASS] ------------ + > .... + > ------------- + > Loop [9] + > Name [multi-user-04-09] + > .... + > Test completed! (60.60 seconds) + > ------------ Test Result: [PASS] ------------ + > .... + > ------------- + > Loop [10] + > Name [multi-user-04-10] + > .... + > Test completed! (59.41 seconds) + > ------------ Test Result: [FAIL] ------------ + > .... + > ------------- + > Loop [11] + > Name [multi-user-04-11] + > .... + > Test completed! (61.45 seconds) + > ------------ Test Result: [PASS] ------------ + > .... + > ------------- + > Loop [12] + > Name [multi-user-04-12] + > .... + > Test completed! (62.53 seconds) + > ------------ Test Result: [PASS] ------------ + > .... + > ------------- + > Loop [13] + > Name [multi-user-04-13] + > .... + > Test completed! (59.47 seconds) + > ------------ Test Result: [FAIL] ------------ + > .... + > ------------- + > Loop [14] + > Name [multi-user-04-14] + > .... + > Test completed! (51.11 seconds) + > ------------ Test Result: [FAIL] ------------ + > .... + > ------------- + > Loop [15] + > Name [multi-user-04-15] + > .... + > Test completed! (50.79 seconds) + > ------------ Test Result: [FAIL] ------------ + > .... + + + grep 'Result:' /tmp/results/multi-user-04-*.txt + + > /tmp/results/multi-user-04-00.txt:------------ Test Result: [PASS] ------------ + > /tmp/results/multi-user-04-01.txt:------------ Test Result: [PASS] ------------ + > /tmp/results/multi-user-04-02.txt:------------ Test Result: [PASS] ------------ + > /tmp/results/multi-user-04-03.txt:------------ Test Result: [PASS] ------------ + > /tmp/results/multi-user-04-04.txt:------------ Test Result: [FAIL] ------------ + > /tmp/results/multi-user-04-05.txt:------------ Test Result: [PASS] ------------ + > /tmp/results/multi-user-04-06.txt:------------ Test Result: [PASS] ------------ + > /tmp/results/multi-user-04-07.txt:------------ Test Result: [PASS] ------------ + > /tmp/results/multi-user-04-08.txt:------------ Test Result: [PASS] ------------ + > /tmp/results/multi-user-04-09.txt:------------ Test Result: [PASS] ------------ + > /tmp/results/multi-user-04-10.txt:------------ Test Result: [FAIL] ------------ + > /tmp/results/multi-user-04-11.txt:------------ Test Result: [PASS] ------------ + > /tmp/results/multi-user-04-12.txt:------------ Test Result: [PASS] ------------ + > /tmp/results/multi-user-04-13.txt:------------ Test Result: [FAIL] ------------ + > /tmp/results/multi-user-04-14.txt:------------ Test Result: [FAIL] ------------ + > /tmp/results/multi-user-04-15.txt:------------ Test Result: [FAIL] ------------ + + +# ----------------------------------------------------- +# Step up to 4 users run 32 times. 
+#[root@ansibler]
+
+    loopcount=32
+    usercount=4
+
+    for i in $(seq 0 $((loopcount - 1)))
+    do
+        echo ""
+        echo "-------------"
+        echo "Loop [${i}]"
+        testname="multi-user-$(printf "%02d" ${usercount})-$(printf "%02d" ${i})"
+        echo "Name [${testname}]"
+
+        /tmp/run-benchmark.py \
+            "${endpoint:?}" \
+            "${testconfig:?}" \
+            "${testusers:?}" \
+            "${usercount:?}" \
+        | tee "/tmp/results/${testname:?}.txt"
+
+        filter-results "${testname:?}"
+    done
+
+    #
+    # Notebook management failures.
+    #
+
+    > ....
+    > INFO [2022-06-06 02:07:45,640] ({qtp686466458-32159} NotebookRestApi.java[initParagraph]:1105) - Init Paragraph for user Carclop
+    > INFO [2022-06-06 02:07:45,640] ({qtp686466458-32159} NotebookRestApi.java[configureParagraph]:1116) - Configure Paragraph for user Carclop
+    > ERROR [2022-06-06 02:07:46,537] ({LuceneSearch7} NoteEventAsyncListener.java[run]:128) - Fail to handle NoteEvent
+    > org.apache.lucene.store.AlreadyClosedException: refusing to delete any files: this IndexWriter hit an unrecoverable exception
+    >   at org.apache.lucene.index.IndexFileDeleter.ensureOpen(IndexFileDeleter.java:349)
+    >   at org.apache.lucene.index.IndexFileDeleter.deleteFiles(IndexFileDeleter.java:669)
+    >   at org.apache.lucene.index.IndexFileDeleter.decRef(IndexFileDeleter.java:589)
+    >   at org.apache.lucene.index.IndexFileDeleter.checkpoint(IndexFileDeleter.java:531)
+    >   at org.apache.lucene.index.IndexWriter.checkpoint(IndexWriter.java:2717)
+    >   at org.apache.lucene.index.IndexWriter.publishFlushedSegment(IndexWriter.java:2795)
+    >   at org.apache.lucene.index.IndexWriter.lambda$publishFlushedSegments$22(IndexWriter.java:5385)
+    >   at org.apache.lucene.index.DocumentsWriterFlushQueue.innerPurge(DocumentsWriterFlushQueue.java:119)
+    >   at org.apache.lucene.index.DocumentsWriterFlushQueue.tryPurge(DocumentsWriterFlushQueue.java:150)
+    >   at org.apache.lucene.index.DocumentsWriter.purgeFlushTickets(DocumentsWriter.java:191)
+    >   at org.apache.lucene.index.IndexWriter.publishFlushedSegments(IndexWriter.java:5365)
+    >   at org.apache.lucene.index.IndexWriter.access$300(IndexWriter.java:219)
+    >   at org.apache.lucene.index.IndexWriter$1.afterSegmentsFlushed(IndexWriter.java:446)
+    >   at org.apache.lucene.index.DocumentsWriter.doFlush(DocumentsWriter.java:525)
+    >   at org.apache.lucene.index.DocumentsWriter.flushAllThreads(DocumentsWriter.java:660)
+    >   at org.apache.lucene.index.IndexWriter.prepareCommitInternal(IndexWriter.java:3365)
+    >   at org.apache.lucene.index.IndexWriter.commitInternal(IndexWriter.java:3771)
+    >   at org.apache.lucene.index.IndexWriter.commit(IndexWriter.java:3729)
+    >   at org.apache.zeppelin.search.LuceneSearch.updateDoc(LuceneSearch.java:241)
+    >   at org.apache.zeppelin.search.LuceneSearch.addIndexDocAsync(LuceneSearch.java:326)
+    >   at org.apache.zeppelin.search.LuceneSearch.addNoteIndex(LuceneSearch.java:305)
+    >   at org.apache.zeppelin.search.SearchService.handleNoteCreateEvent(SearchService.java:108)
+    >   at org.apache.zeppelin.notebook.NoteEventAsyncListener$EventHandling.run(NoteEventAsyncListener.java:113)
+    >   at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
+    >   at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
+    >   at java.lang.Thread.run(Thread.java:748)
+    > Caused by: java.lang.OutOfMemoryError: GC overhead limit exceeded
+    > ERROR [2022-06-06 02:07:46,539] ({LuceneSearch7} NoteEventAsyncListener.java[run]:128) - Fail to handle NoteEvent
+    > org.apache.lucene.store.AlreadyClosedException: this IndexWriter is closed
+    >   at 
org.apache.lucene.index.IndexWriter.ensureOpen(IndexWriter.java:877)
+    >   at org.apache.lucene.index.IndexWriter.ensureOpen(IndexWriter.java:891)
+    >   at org.apache.lucene.index.IndexWriter.updateDocuments(IndexWriter.java:1468)
+    >   at org.apache.lucene.index.IndexWriter.updateDocument(IndexWriter.java:1757)
+    >   at org.apache.zeppelin.search.LuceneSearch.updateDoc(LuceneSearch.java:240)
+    >   at org.apache.zeppelin.search.LuceneSearch.addParagraphIndex(LuceneSearch.java:314)
+    >   at org.apache.zeppelin.search.SearchService.handleParagraphCreateEvent(SearchService.java:123)
+    >   at org.apache.zeppelin.notebook.NoteEventAsyncListener$EventHandling.run(NoteEventAsyncListener.java:119)
+    >   at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
+    >   at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
+    >   at java.lang.Thread.run(Thread.java:748)
+    > Caused by: java.lang.OutOfMemoryError: GC overhead limit exceeded
+    > ....
+
+    #
+    # Zeppelin UI fails too ..
+    #
+
+    > ....
+    > HTTP ERROR 500 java.lang.OutOfMemoryError: GC overhead limit exceeded
+    > ....
+
+    #
+    # Everything locks up and the test fails ...
+    #
+    # 4 users lots of times creates lots of notebooks.
+    # Notebook management tools in Zeppelin fail with memory errors.
+    #
+
diff --git a/notes/zrq/20220607-01-concurrent-tests.txt b/notes/zrq/20220607-01-concurrent-tests.txt
new file mode 100644
index 00000000..9615a2f3
--- /dev/null
+++ b/notes/zrq/20220607-01-concurrent-tests.txt
@@ -0,0 +1,241 @@
+#
+#
+#
+# Copyright (c) 2022, ROE (http://www.roe.ac.uk/)
+#
+# This information is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This information is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see .
+#
+#
+#
+#zrq-notes-time
+#zrq-notes-indent
+#zrq-notes-crypto
+#zrq-notes-ansible
+#zrq-notes-osformat
+#zrq-notes-zeppelin
+#
+
+    Target:
+
+        Try to find out more about the limits on concurrent users.
+        Following on from the broken deployment 20220605-02-concurrent-tests.txt
+
+    Result:
+
+        Work in progress ...
+
+# -----------------------------------------------------
+# Client container is no longer running on the laptop.
+#[user@laptop]
+
+    podman ps -a
+
+    > CONTAINER ID  IMAGE       COMMAND     CREATED     STATUS      PORTS       NAMES
+
+
+# -----------------------------------------------------
+# Start a new client container.
+#[user@laptop]
+
+    source "${HOME:?}/aglais.env"
+
+    podman run \
+        --rm \
+        --tty \
+        --interactive \
+        --name ansibler \
+        --hostname ansibler \
+        --publish 3000:3000 \
+        --env "SSH_AUTH_SOCK=/mnt/ssh_auth_sock" \
+        --volume "${SSH_AUTH_SOCK}:/mnt/ssh_auth_sock:rw,z" \
+        --volume "${HOME:?}/clouds.yaml:/etc/openstack/clouds.yaml:ro,z" \
+        --volume "${AGLAIS_CODE:?}/deployments:/deployments:ro,z" \
+        ghcr.io/wfau/atolmis/ansible-client:2022.03.19 \
+        bash
+
+
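+# -----------------------------------------------------
+# Check the forwarded SSH agent is visible inside the new container.
+# (sanity check - ssh-add should list the keys held by the agent on the laptop;
+#  an error here would suggest the SELinux module from 20220605-01-blue-deploy.txt is not loaded)
+#[root@ansibler]
+
+    ssh-add -l
+
+
+# -----------------------------------------------------
+# Re-create our ansible-vars file.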
+# Copied from hadoop-yarn/bin/create-all.sh +#[root@ansibler] + + configyml=/tmp/ansible-vars.yml + + cloudbase='arcus' + cloudname='iris-gaia-blue' + configname=zeppelin-54.86-spark-6.26.43 + + deployconf=${configname:?} + deployname=${cloudname:?}-20220605 + deploydate=20220605T000000 + + touch "${configyml:?}" + + yq eval \ + --inplace \ + ".aglais.status.deployment.type = \"hadoop-yarn\"" \ + "${configyml:?}" + + yq eval \ + --inplace \ + ".aglais.status.deployment.conf = \"${deployconf}\"" \ + "${configyml:?}" + + yq eval \ + --inplace \ + ".aglais.status.deployment.name = \"${deployname}\"" \ + "${configyml:?}" + + yq eval \ + --inplace \ + ".aglais.status.deployment.date = \"${deploydate}\"" \ + "${configyml:?}" + + yq eval \ + --inplace \ + ".aglais.spec.openstack.cloud.base = \"${cloudbase}\"" \ + "${configyml:?}" + + yq eval \ + --inplace \ + ".aglais.spec.openstack.cloud.name = \"${cloudname}\"" \ + "${configyml:?}" + + + cat "${configyml:?}" + + > aglais: + > status: + > deployment: + > type: hadoop-yarn + > conf: zeppelin-54.86-spark-6.26.43 + > name: iris-gaia-blue-20220605 + > date: 20220605T000000 + > spec: + > openstack: + > cloud: + > base: arcus + > name: iris-gaia-blue + +# ----------------------------------------------------- +# Run the ssh configuration step. +#[root@ansibler] + + inventory="/deployments/hadoop-yarn/ansible/config/${deployconf:?}.yml" + + pushd "/deployments/hadoop-yarn/ansible" + + ansible-playbook \ + --inventory "${inventory:?}" \ + "05-config-ssh.yml" + + popd + + +# ----------------------------------------------------- +# Get the IP address for the Zeppelin node from the ssh config file. +# TODO Save the IP address during the deployment process. +#[root@ansibler] + + ipaddress=$( + sed -n ' + /^Host zeppelin/,/^Host/ { + /HostName/ { + s/^[[:space:]]*HostName[[:space:]]\(.*\)/\1/ p + } + } + ' ~/.ssh/config + ) + + +# ----------------------------------------------------- +# Add the Zeppelin IP address to our hosts file. +# TODO Add this to the Ansible deployment. +#[root@ansibler] + +cat >> /etc/hosts << EOF +# Zeppelin +${ipaddress} zeppelin +EOF + + +# ----------------------------------------------------- +# Check we can ssh into the Zeppelin node. +#[root@ansibler] + + ssh zeppelin \ + ' + hostname + date + ' + + > iris-gaia-blue-20220605-zeppelin + > Tue Jun 7 10:37:47 UTC 2022 + + +# ----------------------------------------------------- +# Check we can access the Zeppelin webapp. +#[root@ansibler] + + endpoint="http://zeppelin:8080" + + curl --head "${endpoint:?}" + + # + # Timeout - I'm guessing Zeppelin is still locked up ? + # + +# ----------------------------------------------------- +# Re-start the Zeppelin service. +#[root@ansibler] + + ssh zeppelin \ + ' + hostname + date + zeppelin-daemon.sh restart + ' + + > iris-gaia-blue-20220605-zeppelin + > Tue Jun 7 10:45:46 UTC 2022 + > Zeppelin stop [ OK ] + > Zeppelin start [ OK ] + + +# ----------------------------------------------------- +# Try again .... +#[root@ansibler] + + curl --head "${endpoint:?}" + + > HTTP/1.1 200 OK + > Date: Tue, 07 Jun 2022 10:46:31 GMT + > .... + > .... 
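+
+
+# -----------------------------------------------------
+# Count the notebooks on the server.
+# (untested sketch - Zeppelin lists notebooks via its REST API at /api/notebook;
+#  a secured deployment may need a Shiro login cookie first)
+#[root@ansibler]
+
+    curl --silent "${endpoint:?}/api/notebook" \
+        | jq '.body | length'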
+ + + + +# ----------------------------------------------------- +# ----------------------------------------------------- + + + Next steps + count the notebooks + delete all the notebooks + increase the available memory + re-create the test and see if this kicks the can + + + diff --git a/notes/zrq/20220613-01-blue-deploy.txt b/notes/zrq/20220613-01-blue-deploy.txt new file mode 100644 index 00000000..2b6efdc0 --- /dev/null +++ b/notes/zrq/20220613-01-blue-deploy.txt @@ -0,0 +1,79 @@ +# +# +# +# Copyright (c) 2022, ROE (http://www.roe.ac.uk/) +# +# This information is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This information is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +# +# +#zrq-notes-time +#zrq-notes-indent +#zrq-notes-crypto +#zrq-notes-ansible +#zrq-notes-osformat +#zrq-notes-zeppelin +# + + Target: + + Deployment used to run the concurrent tests. + + Result: + + Work in progress .. + +# ----------------------------------------------------- +# Create a container to work with. +#[user@desktop] + + source "${HOME:?}/aglais.env" + + podman run \ + --rm \ + --tty \ + --interactive \ + --name ansibler \ + --hostname ansibler \ + --publish 3000:3000 \ + --env "SSH_AUTH_SOCK=/mnt/ssh_auth_sock" \ + --volume "${SSH_AUTH_SOCK}:/mnt/ssh_auth_sock:rw,z" \ + --volume "${HOME:?}/clouds.yaml:/etc/openstack/clouds.yaml:ro,z" \ + --volume "${AGLAIS_CODE:?}/deployments:/deployments:ro,z" \ + ghcr.io/wfau/atolmis/ansible-client:2022.03.19 \ + bash + + +# ----------------------------------------------------- +# Set the target configuration. +#[root@ansibler] + + cloudbase='arcus' + cloudname='iris-gaia-blue' + configname=zeppelin-54.86-spark-6.26.43 + + +# ----------------------------------------------------- +# Deploy everything. +#[root@ansibler] + + time \ + source /deployments/hadoop-yarn/bin/deploy.sh + + > real 51m14.916s + > user 17m33.190s + > sys 4m4.385s + + + diff --git a/notes/zrq/20220613-02-concurrent-tests.txt b/notes/zrq/20220613-02-concurrent-tests.txt new file mode 100644 index 00000000..8e03400c --- /dev/null +++ b/notes/zrq/20220613-02-concurrent-tests.txt @@ -0,0 +1,1277 @@ +# +# +# +# Copyright (c) 2022, ROE (http://www.roe.ac.uk/) +# +# This information is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This information is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +# +# +#zrq-notes-time +#zrq-notes-indent +#zrq-notes-crypto +#zrq-notes-ansible +#zrq-notes-osformat +#zrq-notes-zeppelin +# + + Target: + + Try to find out more about the limits on concurrent users. 
+        Based on a clean deployment using 20220613-01-blue-deploy.txt.
+
+    Result:
+
+        Work in progress ...
+
+        TODO move from quick to complex test sets
+        TODO move from 4 to 8 concurrent users
+
+
+# -----------------------------------------------------
+# Create some test users.
+# TODO Move the create-user-tools to ansible/client/bin.
+# TODO Add ansible/client/bin to the client PATH.
+#[root@ansibler]
+
+    source /deployments/zeppelin/bin/create-user-tools.sh
+
+    testnames02=(
+        Hamar
+        Carclop
+        Halda
+        Jaden
+        Mavaca
+        Franilley
+        Masonania
+        Webbbron
+        Granwaler
+        )
+
+    createarrayusers \
+        "${testnames02[@]}" \
+        | tee /tmp/testusers-02.json \
+        | jq '[ .users[] | {"name": .shirouser.name, "pass": .shirouser.pass} ]'
+
+    > [
+    >   {
+    >     "name": "Hamar",
+    >     "pass": "bu2hohmohthiesuNg1deiy5IeshaeD"
+    >   },
+    > ....
+    > ....
+    >   {
+    >     "name": "Granwaler",
+    >     "pass": "Su1ie7akaethae6eic0ien5wiChaeC"
+    >   }
+    > ]
+
+
+# -----------------------------------------------------
+# Create our benchmark script.
+# TODO Create run-benchmark.py in ansible/client/bin.
+# Learning Python:
+#   Command line args
+#   https://realpython.com/python-command-line-arguments/
+#   String.format()
+#   https://docs.python.org/3/library/string.html#formatstrings
+#   Escape {} in format()
+#   https://stackoverflow.com/a/5466478
+#[root@ansibler]
+
+    cat > /tmp/run-benchmark.py << 'EOF'
+#!/bin/python3
+import sys
+from aglais_benchmark import AglaisBenchmarker
+
+try:
+
+    opts = [opt for opt in sys.argv[1:] if opt.startswith("-")]
+    args = [arg for arg in sys.argv[1:] if not arg.startswith("-")]
+
+    endpoint = args[0]
+    testconfig = args[1]
+    userlist = args[2]
+    usercount = int(args[3])
+    delaystart = int(args[4])
+    delaynotebook = int(args[5])
+
+except IndexError:
+
+    raise SystemExit(f"Usage: {sys.argv[0]} <endpoint> <testconfig> <userlist> <usercount> <delaystart> <delaynotebook>")
+
+print(
+"""
+{{
+\"config\": {{
+  \"endpoint\": \"{}\",
+  \"testconfig\": \"{}\",
+  \"userlist\": \"{}\",
+  \"usercount\": \"{}\",
+  \"delaystart\": \"{}\",
+  \"delaynotebook\": \"{}\"
+  }}
+}}
+""".format(
+    endpoint,
+    testconfig,
+    userlist,
+    usercount,
+    delaystart,
+    delaynotebook
+    )
+)
+
+AglaisBenchmarker(
+    testconfig,
+    userlist,
+    "/tmp/",
+    endpoint
+    ).run(
+        concurrent=True,
+        users=usercount,
+        delay_start=delaystart,
+        delay_notebook=delaynotebook
+        )
+
+EOF
+
+    chmod 'a+x' /tmp/run-benchmark.py
+
+
+# -----------------------------------------------------
+# Add a function to filter our results.
+#[root@ansibler]
+
+    filter-results()
+    {
+    local testname=${1:?'testname required'}
+    sed "
+        0,/^----/ d
+        s/\"/#/g
+        s/'\(-\{0,1\}[0-9.]\{1,\}\)'/\1/g
+        s/:[[:space:]]*\([a-zA-Z]\{1,\}\)\([,}]\)/:'\1'\2/g
+        s/:[[:space:]]*\([,}]\),/: ''\1/g
+        s/'/\"/g
+        " \
+        "/tmp/results/${testname:?}.txt" \
+    | tee "/tmp/results/${testname:?}.json" \
+    | jq '
+        .[] | keys as $x | [ $x[] as $y | {name: $y, value: .[$y].result, time: .[$y].time.elapsed , start: .[$y].time.start, finish: .[$y].time.finish } ]
+        '
+    }
+
+
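+# -----------------------------------------------------
+# Add a helper to sort filtered results by start time.
+# (untested sketch - the filtered results are not guaranteed to be in run order,
+#  so a view sorted on "start" should make the start/between delays easier to check;
+#  reads the .json file written by filter-results above)
+#[root@ansibler]
+
+    sort-results()
+    {
+    local testname=${1:?'testname required'}
+    jq '
+        .[] | keys as $x | [ $x[] as $y | {name: $y, start: .[$y].time.start, finish: .[$y].time.finish} ] | sort_by(.start)
+        ' "/tmp/results/${testname:?}.json"
+    }
+
+
+# -----------------------------------------------------
+# Run a quick test with one user, one second start delay and one second between.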
+#[root@ansibler]
+
+    mkdir /tmp/results
+
+    endpoint="http://zeppelin:8080"
+
+    testconfig=/deployments/zeppelin/test/config/quick.json
+
+    testusers=/tmp/testusers-02.json
+    testname=single-user-01
+    usercount=1
+
+    delaystart=1
+    delaynotebook=1
+
+    /tmp/run-benchmark.py \
+        "${endpoint:?}" \
+        "${testconfig:?}" \
+        "${testusers:?}" \
+        "${usercount:?}" \
+        "${delaystart:?}" \
+        "${delaynotebook:?}" \
+    | tee "/tmp/results/${testname:?}.txt"
+
+    filter-results "${testname:?}"
+
+    > Test started [Multi User]
+    > Test completed! (125.28 seconds)
+    > ------------ Test Result: [PASS] ------------
+    > [{'GaiaDMPSetup': { .... }}]
+
+    > [
+    >   {
+    >     "name": "GaiaDMPSetup",
+    >     "value": "PASS",
+    >     "time": 37.51,
+    >     "start": "2022-06-13T12:35:15.419507",
+    >     "finish": "2022-06-13T12:35:52.934486"
+    >   },
+    >   {
+    >     "name": "Library_Validation.json",
+    >     "value": "PASS",
+    >     "time": 9.87,
+    >     "start": "2022-06-13T12:37:09.804486",
+    >     "finish": "2022-06-13T12:37:19.672728"
+    >   },
+    >   {
+    >     "name": "Mean_proper_motions_over_the_sky",
+    >     "value": "PASS",
+    >     "time": 55.74,
+    >     "start": "2022-06-13T12:35:53.935352",
+    >     "finish": "2022-06-13T12:36:49.678480"
+    >   },
+    >   {
+    >     "name": "Source_counts_over_the_sky.json",
+    >     "value": "PASS",
+    >     "time": 18.12,
+    >     "start": "2022-06-13T12:36:50.679349",
+    >     "finish": "2022-06-13T12:37:08.803166"
+    >   }
+    > ]
+
+    #
+    # Looks like the 1 second between-notebooks delay is working.
+    # Although the results are not listed in chronological order.
+    # Library_Validation is listed second, when it is actually run last.
+    #
+
+    #
+    # Re-ordering into the correct sequence shows a 1 second delay between a notebook "finish" and the next "start".
+    #
+
+    > "start": "2022-06-13T12:35:15.419507",
+    > "finish": "2022-06-13T12:35:52.934486"
+
+    > "start": "2022-06-13T12:35:53.935352",
+    > "finish": "2022-06-13T12:36:49.678480"
+
+    > "start": "2022-06-13T12:36:50.679349",
+    > "finish": "2022-06-13T12:37:08.803166"
+
+    > "start": "2022-06-13T12:37:09.804486",
+    > "finish": "2022-06-13T12:37:19.672728"
+
+
+# -----------------------------------------------------
+# Step up to 4 users run 4 times.
+# One second start delay and one second between.
+#[root@ansibler]
+
+    loopcount=4
+    usercount=4
+
+    for i in $(seq 0 $((loopcount - 1)))
+    do
+        echo ""
+        echo "-------------"
+        echo "Loop [${i}]"
+        testname="multi-user-$(printf "%02d" ${usercount})-$(printf "%02d" ${i})"
+        echo "Name [${testname}]"
+
+        /tmp/run-benchmark.py \
+            "${endpoint:?}" \
+            "${testconfig:?}" \
+            "${testusers:?}" \
+            "${usercount:?}" \
+            "${delaystart:?}" \
+            "${delaynotebook:?}" \
+        | tee "/tmp/results/${testname:?}.txt"
+
+        filter-results "${testname:?}"
+    done
+
+    > -------------
+    > Loop [0]
+    > Name [multi-user-04-00]
+    >
+    > {
+    >   "config": {
+    >     "endpoint": "http://zeppelin:8080",
+    >     "testconfig": "/deployments/zeppelin/test/config/quick.json",
+    >     "userlist": "/tmp/testusers-02.json",
+    >     "usercount": "4",
+    >     "delaystart": "1",
+    >     "delaynotebook": "1"
+    >   }
+    > }
+    >
+    > Test started [Multi User]
+    > Test completed! (243.42 seconds)
+    > ------------ Test Result: [PASS] ------------
+    > [{'GaiaDMPSetup': { .... }}]
+
+    > [
+    >   {
+    >     "name": "GaiaDMPSetup",
+    >     "value": "PASS",
+    >     "time": 37.13,
+    >     "start": "2022-06-13T12:43:40.946927",
+    >     "finish": "2022-06-13T12:44:18.077720"
+    >   },
+    > ....
+    > ....
+ > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 25.15, + > "start": "2022-06-13T12:47:07.494554", + > "finish": "2022-06-13T12:47:32.647725" + > } + > ] + + > ------------- + > Loop [1] + > Name [multi-user-04-01] + > + > { + > "config": { + > "endpoint": "http://zeppelin:8080", + > "testconfig": "/deployments/zeppelin/test/config/quick.json", + > "userlist": "/tmp/testusers-02.json", + > "usercount": "4", + > "delaystart": "1", + > "delaynotebook": "1" + > } + > } + > + > Test started [Multi User] + > Test completed! (221.44 seconds) + > ------------ Test Result: [PASS] ------------ + > [{'GaiaDMPSetup': { .... }}] + + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "PASS", + > "time": 35.80, + > "start": "2022-06-13T12:47:44.843521", + > "finish": "2022-06-13T12:48:20.648000" + > }, + > .... + > .... + > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 15.89, + > "start": "2022-06-13T12:49:09.141460", + > "finish": "2022-06-13T12:49:25.027373" + > } + > ] + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "PASS", + > "time": 105.85, + > "start": "2022-06-13T12:47:46.845395", + > "finish": "2022-06-13T12:49:32.697761" + > }, + > .... + > .... + > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 12.78, + > "start": "2022-06-13T12:50:20.245473", + > "finish": "2022-06-13T12:50:33.024777" + > } + > ] + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "PASS", + > "time": 105.87, + > "start": "2022-06-13T12:47:47.846643", + > "finish": "2022-06-13T12:49:33.713476" + > }, + > .... + > .... + > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 25.13, + > "start": "2022-06-13T12:50:49.314344", + > "finish": "2022-06-13T12:51:14.443252" + > } + > ] + + > ------------- + > Loop [2] + > Name [multi-user-04-02] + > + > { + > "config": { + > "endpoint": "http://zeppelin:8080", + > "testconfig": "/deployments/zeppelin/test/config/quick.json", + > "userlist": "/tmp/testusers-02.json", + > "usercount": "4", + > "delaystart": "1", + > "delaynotebook": "1" + > } + > } + > + > Test started [Multi User] + > Test completed! (240.41 seconds) + > ------------ Test Result: [PASS] ------------ + > [{'GaiaDMPSetup': { .... }}] + + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "PASS", + > "time": 36.48, + > "start": "2022-06-13T12:51:26.761263", + > "finish": "2022-06-13T12:52:03.237429" + > }, + > .... + > .... + > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 14.64, + > "start": "2022-06-13T12:52:38.382663", + > "finish": "2022-06-13T12:52:53.019742" + > } + > ] + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "PASS", + > "time": 53.55, + > "start": "2022-06-13T12:51:27.761776", + > "finish": "2022-06-13T12:52:21.312996" + > }, + > .... + > .... + > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 21.16, + > "start": "2022-06-13T12:53:03.598394", + > "finish": "2022-06-13T12:53:24.762894" + > } + > ] + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "PASS", + > "time": 108.40, + > "start": "2022-06-13T12:51:28.763434", + > "finish": "2022-06-13T12:53:17.166334" + > }, + > .... + > .... 
+ > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 12.59, + > "start": "2022-06-13T12:54:14.300348", + > "finish": "2022-06-13T12:54:26.887134" + > } + > ] + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "PASS", + > "time": 101.92, + > "start": "2022-06-13T12:51:29.764477", + > "finish": "2022-06-13T12:53:11.684486" + > }, + > .... + > .... + > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 27.23, + > "start": "2022-06-13T12:54:48.190761", + > "finish": "2022-06-13T12:55:15.419630" + > } + > ] + + > ------------- + > Loop [3] + > Name [multi-user-04-03] + > + > { + > "config": { + > "endpoint": "http://zeppelin:8080", + > "testconfig": "/deployments/zeppelin/test/config/quick.json", + > "userlist": "/tmp/testusers-02.json", + > "usercount": "4", + > "delaystart": "1", + > "delaynotebook": "1" + > } + > } + > + > Test started [Multi User] + > Test completed! (240.41 seconds) + > ------------ Test Result: [PASS] ------------ + > [{'GaiaDMPSetup': { .... }}] + + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "PASS", + > "time": 36.48, + > "start": "2022-06-13T12:51:26.761263", + > "finish": "2022-06-13T12:52:03.237429" + > }, + > .... + > .... + > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 14.64, + > "start": "2022-06-13T12:52:38.382663", + > "finish": "2022-06-13T12:52:53.019742" + > } + > ] + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "PASS", + > "time": 53.55, + > "start": "2022-06-13T12:51:27.761776", + > "finish": "2022-06-13T12:52:21.312996" + > }, + > .... + > .... + > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 21.16, + > "start": "2022-06-13T12:53:03.598394", + > "finish": "2022-06-13T12:53:24.762894" + > } + > ] + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "PASS", + > "time": 108.40, + > "start": "2022-06-13T12:51:28.763434", + > "finish": "2022-06-13T12:53:17.166334" + > }, + > .... + > .... + > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 12.59, + > "start": "2022-06-13T12:54:14.300348", + > "finish": "2022-06-13T12:54:26.887134" + > } + > ] + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "PASS", + > "time": 101.92, + > "start": "2022-06-13T12:51:29.764477", + > "finish": "2022-06-13T12:53:11.684486" + > }, + > .... + > .... + > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 27.23, + > "start": "2022-06-13T12:54:48.190761", + > "finish": "2022-06-13T12:55:15.419630" + > } + > ] + + + grep 'Result:' /tmp/results/multi-user-04-*.txt + + > /tmp/results/multi-user-04-00.txt:------------ Test Result: [PASS] ------------ + > /tmp/results/multi-user-04-01.txt:------------ Test Result: [PASS] ------------ + > /tmp/results/multi-user-04-02.txt:------------ Test Result: [PASS] ------------ + > /tmp/results/multi-user-04-03.txt:------------ Test Result: [PASS] ------------ + + + # + # Checking the start times shows a 1 second delay between each user. + # + + > "start": "2022-06-13T12:43:40.946927", + > "finish": "2022-06-13T12:44:18.077720" + > + > "start": "2022-06-13T12:43:41.947520", + > "finish": "2022-06-13T12:44:29.383222" + > + > "start": "2022-06-13T12:43:42.948604", + > "finish": "2022-06-13T12:45:56.059248" + > + > "start": "2022-06-13T12:43:43.950581", + > "finish": "2022-06-13T12:45:57.127126" + + # + # Need to vary the delays to confirm this in more detail. 
+ # At the moment, 1 second for both start and between makes it hard to separate them. + # + + +# ----------------------------------------------------- +# Step up to 4 users run 8 times. +#[root@ansibler] + + loopcount=8 + usercount=4 + + for i in $(seq 0 $((loopcount - 1))) + do + echo "" + echo "-------------" + echo "Loop [${i}]" + testname="multi-user-$(printf "%02d" ${usercount})-$(printf "%02d" ${i})" + echo "Name [${testname}]" + + /tmp/run-benchmark.py \ + "${endpoint:?}" \ + "${testconfig:?}" \ + "${testusers:?}" \ + "${usercount:?}" \ + "${delaystart:?}" \ + "${delaynotebook:?}" \ + | tee "/tmp/results/${testname:?}.txt" + + filter-results "${testname:?}" + done + + grep 'Result:' /tmp/results/multi-user-04-*.txt + + > /tmp/results/multi-user-04-00.txt:------------ Test Result: [PASS] ------------ + > /tmp/results/multi-user-04-01.txt:------------ Test Result: [PASS] ------------ + > /tmp/results/multi-user-04-02.txt:------------ Test Result: [PASS] ------------ + > /tmp/results/multi-user-04-03.txt:------------ Test Result: [PASS] ------------ + > /tmp/results/multi-user-04-04.txt:------------ Test Result: [PASS] ------------ + > /tmp/results/multi-user-04-05.txt:------------ Test Result: [PASS] ------------ + > /tmp/results/multi-user-04-06.txt:------------ Test Result: [PASS] ------------ + > /tmp/results/multi-user-04-07.txt:------------ Test Result: [PASS] ------------ + + +# ----------------------------------------------------- +# Step up to 4 users run 16 times. +#[root@ansibler] + + loopcount=16 + usercount=4 + + for i in $(seq 0 $((loopcount - 1))) + do + echo "" + echo "-------------" + echo "Loop [${i}]" + testname="multi-user-$(printf "%02d" ${usercount})-$(printf "%02d" ${i})" + echo "Name [${testname}]" + + /tmp/run-benchmark.py \ + "${endpoint:?}" \ + "${testconfig:?}" \ + "${testusers:?}" \ + "${usercount:?}" \ + "${delaystart:?}" \ + "${delaynotebook:?}" \ + | tee "/tmp/results/${testname:?}.txt" + + filter-results "${testname:?}" + done + + grep 'Result:' /tmp/results/multi-user-04-*.txt + + > /tmp/results/multi-user-04-00.txt:------------ Test Result: [PASS] ------------ + > /tmp/results/multi-user-04-01.txt:------------ Test Result: [PASS] ------------ + > .... + > .... + > /tmp/results/multi-user-04-14.txt:------------ Test Result: [PASS] ------------ + > /tmp/results/multi-user-04-15.txt:------------ Test Result: [PASS] ------------ + + +# ----------------------------------------------------- +# Update our benchmark script. 
+#[root@ansibler]
+
+    cat > /tmp/run-benchmark.py << 'EOF'
+#!/bin/python3
+import sys
+from aglais_benchmark import AglaisBenchmarker
+
+try:
+
+    opts = [opt for opt in sys.argv[1:] if opt.startswith("-")]
+    args = [arg for arg in sys.argv[1:] if not arg.startswith("-")]
+
+    endpoint = args[0]
+    testconfig = args[1]
+    userlist = args[2]
+    usercount = int(args[3])
+    delaystart = int(args[4])
+    delaynotebook = int(args[5])
+
+except IndexError:
+
+    raise SystemExit(f"Usage: {sys.argv[0]} <endpoint> <testconfig> <userlist> <usercount> <delaystart> <delaynotebook>")
+
+print(
+"""
+{{
+\"config\": {{
+  \"endpoint\": \"{}\",
+  \"testconfig\": \"{}\",
+  \"userlist\": \"{}\",
+  \"usercount\": \"{}\",
+  \"delaystart\": \"{}\",
+  \"delaynotebook\": \"{}\"
+  }},
+\"output\": {{
+---start---
+""".format(
+    endpoint,
+    testconfig,
+    userlist,
+    usercount,
+    delaystart,
+    delaynotebook
+    )
+)
+
+AglaisBenchmarker(
+    testconfig,
+    userlist,
+    "/tmp/",
+    endpoint
+    ).run(
+        concurrent=True,
+        users=usercount,
+        delay_start=delaystart,
+        delay_notebook=delaynotebook
+        )
+
+print(
+"""
+---end---
+  }
+}
+"""
+    )
+EOF
+
+    chmod 'a+x' /tmp/run-benchmark.py
+
+
+# -----------------------------------------------------
+# Update our filter function.
+#[root@ansibler]
+
+    filter-results()
+    {
+    local testname=${1:?'testname required'}
+    sed "
+        /^--*start--*/,/^--*end--*/ {
+            /^--*start/,/^--* Test Result/ {
+                /Test Result/ ! {
+                    d
+                    }
+                /Test Result/ {
+                    s/^.*Test Result: \[\(.*\)\].*$/'testcode': '\1',/
+                    a \"threads\":
+                    }
+                }
+            s/\"/'/g
+            s/'\(-\{0,1\}[0-9.]\{1,\}\)'/\1/g
+            s/:[[:space:]]*\([a-zA-Z]\{1,\}\)\([,}]\)/:'\1'\2/g
+            s/:[[:space:]]*\([,}]\),/: ''\1/g
+            s/'/\"/g
+            }
+        /^--*end--*/ {
+            d
+            }
+        " \
+        "/tmp/results/${testname:?}.txt" \
+    | tee "/tmp/results/${testname:?}.json" \
+    | jq '
+        .output.threads[] | keys as $x | [ $x[] as $y | {name: $y, value: .[$y].result, time: .[$y].time.elapsed , start: .[$y].time.start, finish: .[$y].time.finish } ]
+        '
+    }
+
+    filter-results "${testname:?}"
+
+
+# -----------------------------------------------------
+# Create a test-loop function.
+#[root@ansibler]
+
+    test-loop()
+    {
+    local loopcount=${1:?'loopcount required'}
+    local usercount=${2:?'usercount required'}
+
+cat << EOF
+[
+EOF
+
+    local comma=''
+    for i in $(seq 0 $((loopcount - 1)))
+    do
+
+        testname="multi-user-$(printf "%02d" ${usercount})-$(printf "%02d" ${i})"
+
+cat << EOF
+    ${comma}{
+    "iteration": ${i},
+    "testname": "${testname}",
+    "threads":
+EOF
+comma=','
+
+        /tmp/run-benchmark.py \
+            "${endpoint:?}" \
+            "${testconfig:?}" \
+            "${testusers:?}" \
+            "${usercount:?}" \
+            "${delaystart:?}" \
+            "${delaynotebook:?}" \
+        > "/tmp/results/${testname:?}.txt"
+
+        filter-results "${testname:?}"
+
+cat << EOF
+    }
+EOF
+
+    done
+
+cat << EOF
+]
+EOF
+    }
+
+
+# -----------------------------------------------------
+# Test the new function.
+#[root@ansibler]
+
+    test-loop 1 1 \
+        | tee /tmp/test-loop.json \
+        | jq '.'
+
+    > [
+    >   {
+    >     "iteration": 0,
+    >     "testname": "multi-user-01-00",
+    >     "threads": [
+    >       {
+    >         "name": "GaiaDMPSetup",
+    >         "value": "PASS",
+    >         "time": 33.70,
+    >         "start": "2022-06-13T17:35:08.001440",
+    >         "finish": "2022-06-13T17:35:41.703389"
+    >       },
+    > ....
+    > ....
+    >       {
+    >         "name": "Source_counts_over_the_sky.json",
+    >         "value": "PASS",
+    >         "time": 15.49,
+    >         "start": "2022-06-13T17:36:59.063435",
+    >         "finish": "2022-06-13T17:37:14.555184"
+    >       }
+    >     ]
+    >   }
+    > ]
+
+
+    test-loop 2 2 \
+        | tee /tmp/test-loop.json \
+        | jq '.'
+
+
+    #
+    # Mix of Python and Bash is complicating things.
+    # Incompatible non-JSON output from the test is making things really hard.
+ # Need to jump from Python to Bash in order to run sed on the non-JSON output. + # + + # + # Underneath - the test works fine for loop[0], but fails for loop[1]. + # One user account works, one user account fails. <-- this is wrong + # I think the error is: + # UnresolvedRelation [gaia_source] + # + + +# ----------------------------------------------------- +# Test with 1 loop, 4 users. +#[root@ansibler] + + test-loop 1 4 \ + | tee /tmp/test-loop.json + + + > [ + > { + > "iteration": 0, + > "testname": "multi-user-04-00", + > "threads": + > [ + > { + > .... + > "value": "PASS", + > "value": "PASS", + > "value": "PASS", + > "value": "PASS", + > .... + > } + > ] + > [ + > { + > .... + > "value": "PASS", + > "value": "PASS", + > "value": "PASS", + > "value": "PASS", + > .... + > } + > ] + > [ + > { + > .... + > "value": "PASS", + > "value": "PASS", + > "value": "PASS", + > "value": "PASS", + > .... + > } + > ] + > [ + > { + > .... + > "value": "PASS", + > "value": "PASS", + > "value": "PASS", + > "value": "PASS", + > .... + > } + > ] + > } + > ] + + +# ----------------------------------------------------- +# Test with 2 loops, 4 users. +#[root@ansibler] + + test-loop 2 4 \ + | tee /tmp/test-loop.json + + + > [ + > { + > "iteration": 0, + > "testname": "multi-user-04-00", + > "threads": + > [ + > { + > .... + > "value": "PASS", + > "value": "PASS", + > "value": "PASS", + > "value": "PASS", + > .... + > .... + > "value": "PASS", + > "value": "PASS", + > "value": "PASS", + > "value": "PASS", + > .... + > } + > ] + > } + > ,{ + > "iteration": 1, + > "testname": "multi-user-04-01", + > "threads": + > parse error: Invalid numeric literal at line 14, column 638 + > } + > ] + + # + # Same error ON THE SECOND PASS. + # + + less /tmp/results/multi-user-04-01.json + + > .... + > pyspark.sql.utils.AnalysisException: Table or view not found: gaia_source + > .... + + + # + # System got twisted and took a while to sort itself out. + # Manual intervention restarting the interpreters. + # Not precise what I did .. clicked on some buttons .. + # + + +# ----------------------------------------------------- +# Test with 4 loops, 4 users. +#[root@ansibler] + + test-loop 4 4 \ + | tee /tmp/test-loop.json + + > [ + > { + > "iteration": 0, + > "testname": "multi-user-04-00", + > "threads": + > [ + > { + > .... + > "value": "PASS", + > "value": "PASS", + > "value": "PASS", + > "value": "PASS", + > .... + > .... + > "value": "PASS", + > "value": "PASS", + > "value": "PASS", + > "value": "PASS", + > .... + > } + > ] + > } + > ,{ + > "iteration": 1, + > "testname": "multi-user-04-01", + > "threads": + > [ + > { + > .... + > "value": "PASS", + > "value": "PASS", + > "value": "PASS", + > "value": "PASS", + > .... + > .... + > "value": "PASS", + > "value": "PASS", + > "value": "PASS", + > "value": "PASS", + > .... + > } + > ] + > } + > ,{ + > "iteration": 2, + > "testname": "multi-user-04-02", + > "threads": + > [ + > { + > .... + > "value": "PASS", + > "value": "PASS", + > "value": "PASS", + > "value": "PASS", + > .... + > .... + > "value": "PASS", + > "value": "PASS", + > "value": "PASS", + > "value": "PASS", + > .... + > } + > ] + > } + > ,{ + > "iteration": 3, + > "testname": "multi-user-04-03", + > "threads": + > [ + > { + > .... + > "value": "PASS", + > "value": "PASS", + > "value": "PASS", + > "value": "PASS", + > .... + > .... + > "value": "PASS", + > "value": "PASS", + > "value": "PASS", + > "value": "PASS", + > .... 
+ > } + > ] + > } + > ] + + + grep 'Result:' /tmp/results/multi-user-04-*.txt + + > /tmp/results/multi-user-04-00.txt:------------ Test Result: [PASS] ------------ + > /tmp/results/multi-user-04-01.txt:------------ Test Result: [PASS] ------------ + > /tmp/results/multi-user-04-02.txt:------------ Test Result: [PASS] ------------ + > /tmp/results/multi-user-04-03.txt:------------ Test Result: [PASS] ------------ + + +# ----------------------------------------------------- +# Back to 2 loops, 4 users. +#[root@ansibler] + + rm -f /tmp/results/multi-user-* + + test-loop 2 4 \ + | tee /tmp/test-loop.json + + + > [ + > { + > "iteration": 0, + > "testname": "multi-user-04-00", + > "threads": + > [ + > { + > .... + > "value": "PASS", + > "value": "PASS", + > "value": "PASS", + > "value": "PASS", + > .... + > .... + > "value": "PASS", + > "value": "PASS", + > "value": "PASS", + > "value": "PASS", + > .... + > } + > ] + > } + > ,{ + > "iteration": 1, + > "testname": "multi-user-04-01", + > "threads": + > [ + > { + > .... + > "value": "PASS", + > "value": "PASS", + > "value": "PASS", + > "value": "PASS", + > .... + > .... + > "value": "PASS", + > "value": "PASS", + > "value": "PASS", + > "value": "PASS", + > .... + > } + > ] + > } + > ] + + + grep 'Result:' /tmp/results/multi-user-*.txt + + > /tmp/results/multi-user-04-00.txt:------------ Test Result: [PASS] ------------ + > /tmp/results/multi-user-04-01.txt:------------ Test Result: [PASS] ------------ + + # + # So we are OK now !? + # + + +# ----------------------------------------------------- +# Try 8 loops, 4 users. +#[root@ansibler] + + rm -f /tmp/results/multi-user-* + + test-loop 8 4 \ + | tee /tmp/test-loop.json + + + > [ + > { + > "iteration": 0, + > "testname": "multi-user-04-00", + > "threads": + > [ + > { + > .... + > "value": "PASS", + > "value": "PASS", + > "value": "PASS", + > "value": "PASS", + > .... + > .... + > "value": "PASS", + > "value": "PASS", + > "value": "PASS", + > "value": "PASS", + > .... + > } + > ] + > } + > ] + + + grep 'Result:' /tmp/results/multi-user-*.txt + + > /tmp/results/multi-user-04-00.txt:------------ Test Result: [PASS] ------------ + > /tmp/results/multi-user-04-01.txt:------------ Test Result: [PASS] ------------ + > /tmp/results/multi-user-04-02.txt:------------ Test Result: [PASS] ------------ + > /tmp/results/multi-user-04-03.txt:------------ Test Result: [PASS] ------------ + > /tmp/results/multi-user-04-04.txt:------------ Test Result: [PASS] ------------ + > /tmp/results/multi-user-04-05.txt:------------ Test Result: [PASS] ------------ + > /tmp/results/multi-user-04-06.txt:------------ Test Result: [PASS] ------------ + > /tmp/results/multi-user-04-07.txt:------------ Test Result: [PASS] ------------ + + +# ----------------------------------------------------- +# Try 32 loops, 4 users. +#[root@ansibler] + + rm -f /tmp/results/multi-user-* + + test-loop 32 4 \ + | tee /tmp/test-loop.json + + grep 'Result:' /tmp/results/multi-user-*.txt + + > /tmp/results/multi-user-04-00.txt:------------ Test Result: [PASS] ------------ + > /tmp/results/multi-user-04-01.txt:------------ Test Result: [PASS] ------------ + > .... + > .... + > /tmp/results/multi-user-04-30.txt:------------ Test Result: [PASS] ------------ + > /tmp/results/multi-user-04-31.txt:------------ Test Result: [PASS] ------------ + + +# ----------------------------------------------------- +# ... and another loops, 4 users. 
+#[root@ansibler]
+
+    rm -f /tmp/results/multi-user-*
+
+    test-loop 32 4 \
+        | tee /tmp/test-loop.json
+
+    grep 'Result:' /tmp/results/multi-user-*.txt
+
+    > /tmp/results/multi-user-04-00.txt:------------ Test Result: [PASS] ------------
+    > /tmp/results/multi-user-04-01.txt:------------ Test Result: [PASS] ------------
+    > ....
+    > ....
+    > /tmp/results/multi-user-04-07.txt:------------ Test Result: [PASS] ------------
+    > /tmp/results/multi-user-04-08.txt:------------ Test Result: [PASS] ------------
+
diff --git a/notes/zrq/20220614-01-concurrent-tests.txt b/notes/zrq/20220614-01-concurrent-tests.txt
new file mode 100644
index 00000000..a692708f
--- /dev/null
+++ b/notes/zrq/20220614-01-concurrent-tests.txt
@@ -0,0 +1,1225 @@
+#
+#
+#
+# Copyright (c) 2022, ROE (http://www.roe.ac.uk/)
+#
+# This information is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This information is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see .
+#
+#
+#
+#zrq-notes-time
+#zrq-notes-indent
+#zrq-notes-crypto
+#zrq-notes-ansible
+#zrq-notes-osformat
+#zrq-notes-zeppelin
+#
+
+    Target:
+
+        Try to find out more about the limits on concurrent users.
+        Based on a clean deployment using 20220613-01-blue-deploy.txt.
+
+    Result:
+
+        Work in progress ...
+
+
+# -----------------------------------------------------
+# Create our benchmark script.
+#[root@ansibler]
+
+    cat > /tmp/run-benchmark.py << 'EOF'
+#!/bin/python3
+import sys
+from aglais_benchmark import AglaisBenchmarker
+
+try:
+
+    opts = [opt for opt in sys.argv[1:] if opt.startswith("-")]
+    args = [arg for arg in sys.argv[1:] if not arg.startswith("-")]
+
+    endpoint = args[0]
+    testconfig = args[1]
+    userlist = args[2]
+    usercount = int(args[3])
+    delaystart = int(args[4])
+    delaynotebook = int(args[5])
+
+except IndexError:
+
+    raise SystemExit(f"Usage: {sys.argv[0]} <endpoint> <testconfig> <userlist> <usercount> <delaystart> <delaynotebook>")
+
+print(
+"""
+{{
+\"config\": {{
+  \"endpoint\": \"{}\",
+  \"testconfig\": \"{}\",
+  \"userlist\": \"{}\",
+  \"usercount\": \"{}\",
+  \"delaystart\": \"{}\",
+  \"delaynotebook\": \"{}\"
+  }},
+\"output\": {{
+""".format(
+    endpoint,
+    testconfig,
+    userlist,
+    usercount,
+    delaystart,
+    delaynotebook
+    )
+)
+
+print(
+    "---start---"
+    )
+AglaisBenchmarker(
+    testconfig,
+    userlist,
+    "/tmp/",
+    endpoint
+    ).run(
+        concurrent=True,
+        users=usercount,
+        delay_start=delaystart,
+        delay_notebook=delaynotebook
+        )
+print(
+    "---end---"
+    )
+print(
+"""
+  }
+}
+"""
+    )
+EOF
+
+    chmod 'a+x' /tmp/run-benchmark.py
+
+
+# -----------------------------------------------------
+# Create our filter function.
+# https://github.com/wfau/aglais/issues/602
+#[root@ansibler]
+
+    filter-results()
+    {
+    local testname=${1:?'testname required'}
+    sed "
+        /^--*start--*/,/^--*end--*/ {
+            /^--*start/,/^--* Test Result/ {
+                /Test Result/ ! 
{ + d + } + /Test Result/ { + s/^.*Test Result: \[\(.*\)\].*$/'testcode': '\1',/ + a \"threads\": + } + } + s/\"/'/g + s/'\(-\{0,1\}[0-9.]\{1,\}\)'/\1/g + s/:[[:space:]]*\([a-zA-Z]\{1,\}\)\([,}]\)/:'\1'\2/g + s/:[[:space:]]*\([,}]\),/: ''\1/g + s/'/\"/g + } + /^--*end--*/ { + d + } + " \ + "/tmp/results/${testname:?}.txt" \ + | tee "/tmp/results/${testname:?}.json" \ + | jq ' + .output.threads[] | keys as $x | [ $x[] as $y | {name: $y, value: .[$y].result, time: .[$y].time.elapsed , start: .[$y].time.start, finish: .[$y].time.finish } ] + ' + } + + +# ----------------------------------------------------- +# Create our test-loop function. +#[root@ansibler] + + test-loop() + { + local loopcount=${1:?'loopcount required'} + local usercount=${2:?'usercount required'} + + rm -f /tmp/results/* + + echo "[" + + local comma='' + for i in $(seq 0 $((loopcount - 1))) + do + echo "${comma}" ; comma=',' + + testname="multi-user-$(printf "%02d" ${usercount})-$(printf "%02d" ${i})" + +cat << EOF + { + "iteration": ${i}, + "testname": "${testname}", + "threads": +EOF + + /tmp/run-benchmark.py \ + "${endpoint:?}" \ + "${testconfig:?}" \ + "${testusers:?}" \ + "${usercount:?}" \ + "${delaystart:?}" \ + "${delaynotebook:?}" \ + > "/tmp/results/${testname:?}.txt" + + filter-results "${testname:?}" + + echo "}" + + done + + echo "]" + + } + + +# ----------------------------------------------------- +# Test with 1 user doing 1 loop. +#[root@ansibler] + + test-loop 1 1 \ + | tee /tmp/test-loop.json + + jq '.' /tmp/test-loop.json + + > [ + > { + > "iteration": 0, + > "testname": "multi-user-01-00", + > "threads": [ + > { + > "name": "GaiaDMPSetup", + > "value": "PASS", + > "time": 36.04, + > "start": "2022-06-14T12:02:17.073449", + > "finish": "2022-06-14T12:02:53.115954" + > }, + > { + > "name": "Library_Validation.json", + > "value": "PASS", + > "time": 10.14, + > "start": "2022-06-14T12:04:19.243331", + > "finish": "2022-06-14T12:04:29.385640" + > }, + > { + > "name": "Mean_proper_motions_over_the_sky", + > "value": "PASS", + > "time": 65.06, + > "start": "2022-06-14T12:02:54.117116", + > "finish": "2022-06-14T12:03:59.174176" + > }, + > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 18.07, + > "start": "2022-06-14T12:04:00.175377", + > "finish": "2022-06-14T12:04:18.241466" + > } + > ] + > } + > ] + + + grep 'Result:' /tmp/results/*.txt + + > ------------ Test Result: [PASS] ------------ + + +# ----------------------------------------------------- +# Test with 2 users doing 1 loop (wrong, was 1 user doing 2 loops). +#[root@ansibler] + + test-loop 2 1 \ + | tee /tmp/test-loop.json + + jq '.' /tmp/test-loop.json + + + > [ + > + > { + > "iteration": 0, + > "testname": "multi-user-01-00", + > "threads": + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "PASS", + > "time": 36.63, + > "start": "2022-06-14T12:06:46.929067", + > "finish": "2022-06-14T12:07:23.560850" + > }, + > .... + > .... 
+ > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 21.70, + > "start": "2022-06-14T12:08:31.218326", + > "finish": "2022-06-14T12:08:52.923187" + > } + > ] + > } + > , + > { + > "iteration": 1, + > "testname": "multi-user-01-01", + > "threads": + > parse error: Invalid numeric literal at line 15, column 638 + > } + > ] + > parse error: Unmatched '}' at line 43, column 1 + + + grep 'Result:' /tmp/results/*.txt + + > /tmp/results/multi-user-01-00.txt:------------ Test Result: [PASS] ------------ + > /tmp/results/multi-user-01-01.txt:------------ Test Result: [FAIL] ------------ + + # + # Failing the second test is back. + # Go figure. + # + + +# ----------------------------------------------------- +# Test with 3 users doing 1 loop (wrong, was 1 user doing 3 loops). +#[root@ansibler] + + test-loop 3 1 \ + | tee /tmp/test-loop.json + + jq '.' /tmp/test-loop.json + + > [ + > + > { + > "iteration": 0, + > "testname": "multi-user-01-00", + > "threads": + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "PASS", + > "time": 34.87, + > "start": "2022-06-14T12:16:20.059583", + > "finish": "2022-06-14T12:16:54.933593" + > }, + > .... + > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 21.35, + > "start": "2022-06-14T12:18:09.150417", + > "finish": "2022-06-14T12:18:30.503974" + > } + > ] + > } + > , + > { + > "iteration": 1, + > "testname": "multi-user-01-01", + > "threads": + > parse error: Invalid numeric literal at line 15, column 7462 + > } + > , + > { + > "iteration": 2, + > "testname": "multi-user-01-02", + > "threads": + > parse error: Invalid numeric literal at line 15, column 638 + > } + > ] + > parse error: Unmatched '}' at line 43, column 1 + + + + grep 'Result:' /tmp/results/*.txt + + > /tmp/results/multi-user-01-00.txt:------------ Test Result: [PASS] ------------ + > /tmp/results/multi-user-01-01.txt:------------ Test Result: [ERROR] ------------ + > /tmp/results/multi-user-01-02.txt:------------ Test Result: [FAIL] ------------ + + # + # What is the difference between an ERROR and a FAIL ? + # + + vi /tmp/results/multi-user-01-01.txt + + + > .... + > Test started [Multi User] + > Test completed! (83.97 seconds) + > ------------ Test Result: [ERROR] ------------ + > [ + > { + > 'GaiaDMPSetup': { + > 'result': 'ERROR', + > 'outputs': { + > 'valid': True + > }, + > 'time': { + > 'result': 'FAST', + > 'elapsed': '33.45', + > 'expected': '45.00', + > 'percent': '-25.67', + > 'start': '2022-06-14T12:18:44.237768', + > 'finish': '2022-06-14T12:19:17.685659' + > }, + > 'logs': ' + > org.apache.zeppelin.interpreter.InterpreterException: java.io.IOException: + > Interpreter Process creation is time out in 30 seconds + > You can increase timeout threshold via setting zeppelin.interpreter.connect.timeout of this interpreter. 
+ > Interpreter download command: + > /etc/alternatives/jre/bin/java + > -Dfile.encoding=UTF-8 + > -Dlog4j.configuration=file:///home/fedora/zeppelin/conf/log4j.properties + > -Dlog4j.configurationFile=file:///home/fedora/zeppelin/conf/log4j2.properties + > -Dzeppelin.log.file=/home/fedora/zeppelin/logs/zeppelin-interpreter-spark-Carclop-Carclop-fedora-iris-gaia-blue-20220613-zeppelin.log + > -cp :/home/fedora/zeppelin/interpreter/spark/* + > :/home/fedora/zeppelin/interpreter/zeppelin-interpreter-shaded-0.10.0.jar + > :/home/fedora/zeppelin/interpreter/spark/spark-interpreter-0.10.0.jar + > :/opt/hadoop/etc/hadoop org.apache.zeppelin.interpreter.remote.RemoteInterpreterDownloader + > 10.10.2.210 35643 + > spark + > /home/fedora/zeppelin/local-repo/spark + > + > [INFO] Interpreter launch command: + > /opt/spark/bin/spark-submit + > --class org.apache.zeppelin.interpreter.remote.RemoteInterpreterServer + > --driver-class-path + > :/home/fedora/zeppelin/local-repo/spark/* + > :/home/fedora/zeppelin/interpreter/spark/* + > :/home/fedora/zeppelin/interpreter/zeppelin-interpreter-shaded-0.10.0.jar + > :/home/fedora/zeppelin/interpreter/spark/spark-interpreter-0.10.0.jar + > :/opt/hadoop/etc/hadoop + > --driver-java-options + > -Dfile.encoding=UTF-8 + > -Dlog4j.configuration=file:///home/fedora/zeppelin/conf/log4j.properties + > -Dlog4j.configurationFile=file:///home/fedora/zeppelin/conf/log4j2.properties + > -Dzeppelin.log.file=/home/fedora/zeppelin/logs/zeppelin-interpreter-spark-Carclop-Carclop-fedora-iris-gaia-blue-20220613-zeppelin.log + > --proxy-user Carclop + > --conf spark.yarn.dist.archives=/opt/spark/R/lib/sparkr.zip#sparkr + > --conf spark.submit.deployMode=client + > --conf spark.webui.yarn.useProxy=false + > --conf spark.yarn.isPython=true + > --conf spark.app.name=spark-Carclop + > --conf spark.master=yarn + > /home/fedora/zeppelin/interpreter/spark/spark-interpreter-0.10.0.jar + > 10.10.2.210 35643 + > spark-Carclop : + > + > SLF4J: Class path contains multiple SLF4J bindings. + > SLF4J: Found binding in [jar:file:/home/fedora/zeppelin-0.10.0-bin-all/interpreter/spark/spark-interpreter-0.10.0.jar!/org/slf4j/impl/StaticLoggerBinder.class] + > SLF4J: Found binding in [jar:file:/opt/spark-3.1.2-bin-hadoop3.2/jars/slf4j-log4j12-1.7.30.jar!/org/slf4j/impl/StaticLoggerBinder.class] + > SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation. 
+ > SLF4J: Actual binding is of type [org.slf4j.impl.Log4jLoggerFactory] + > + > at org.apache.zeppelin.interpreter.remote.RemoteInterpreter.open(RemoteInterpreter.java:129) + > at org.apache.zeppelin.interpreter.remote.RemoteInterpreter.getFormType(RemoteInterpreter.java:271) + > at org.apache.zeppelin.notebook.Paragraph.jobRun(Paragraph.java:440) + > at org.apache.zeppelin.notebook.Paragraph.jobRun(Paragraph.java:71) + > at org.apache.zeppelin.scheduler.Job.run(Job.java:172) + > at org.apache.zeppelin.scheduler.AbstractScheduler.runJob(AbstractScheduler.java:132) + > at org.apache.zeppelin.scheduler.RemoteScheduler$JobRunner.run(RemoteScheduler.java:182) + > at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) + > at java.util.concurrent.FutureTask.run(FutureTask.java:266) + > at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$201(ScheduledThreadPoolExecutor.java:180) + > at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:293) + > at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) + > at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) + > at java.lang.Thread.run(Thread.java:748) + > + > Caused by: java.io.IOException: Interpreter Process creation is time out in 30 seconds + > You can increase timeout threshold via setting zeppelin.interpreter.connect.timeout of this interpreter. + > Interpreter download command: .... + > [INFO] Interpreter launch command: .... + > + > SLF4J: Class path contains multiple SLF4J bindings. + > SLF4J: Found binding in [jar:file:/home/fedora/zeppelin-0.10.0-bin-all/interpreter/spark/spark-interpreter-0.10.0.jar!/org/slf4j/impl/StaticLoggerBinder.class] + > SLF4J: Found binding in [jar:file:/opt/spark-3.1.2-bin-hadoop3.2/jars/slf4j-log4j12-1.7.30.jar!/org/slf4j/impl/StaticLoggerBinder.class] + > SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation. + > SLF4J: Actual binding is of type [org.slf4j.impl.Log4jLoggerFactory] + > + > at org.apache.zeppelin.interpreter.remote.ExecRemoteInterpreterProcess.start(ExecRemoteInterpreterProcess.java:93) + > at org.apache.zeppelin.interpreter.ManagedInterpreterGroup.getOrCreateInterpreterProcess(ManagedInterpreterGroup.java:68) + > at org.apache.zeppelin.interpreter.remote.RemoteInterpreter.getOrCreateInterpreterProcess(RemoteInterpreter.java:104) + > at org.apache.zeppelin.interpreter.remote.RemoteInterpreter.internal_create(RemoteInterpreter.java:154) + > at org.apache.zeppelin.interpreter.remote.RemoteInterpreter.open(RemoteInterpreter.java:126) + > ... 13 more + > ' + > }, + > 'Mean_proper_motions_over_the_sky': { + > 'result': 'ERROR', + > 'outputs': { + > 'valid': True + > }, + > 'time': { + > 'result': 'FAST', + > 'elapsed': '29.76', + > 'expected': '55.00', + > 'percent': '-45.89', + > 'start': '2022-06-14T12:19:18.687135', + > 'finish': '2022-06-14T12:19:48.447845' + > }, + > 'logs': ' + > Fail to execute line 13: df = spark.sql(query).cache() + > .... 
+ > pyspark.sql.utils.AnalysisException: Table or view not found: gaia_source; line 1 pos 121; + > \'Aggregate [\'hpx_id], [\'floor((\'source_id / 140737488355328)) AS hpx_id#0, count(1) AS n#1L, \'AVG(\'pmra) AS avg_pmra#2, \'AVG(\'pmdec) AS avg_pmdec#3] + > +- \'UnresolvedRelation [gaia_source], [], false + > ' + > }, + > 'Source_counts_over_the_sky.json': { + > 'result': 'ERROR', + > 'outputs': { + > 'valid': True + > }, + > 'time': { + > 'result': 'FAST', + > 'elapsed': '5.74', + > 'expected': '22.00', + > 'percent': '-73.90', + > 'start': '2022-06-14T12:19:49.448736', + > 'finish': '2022-06-14T12:19:55.190901' + > }, + > 'logs': ' + > Fail to execute line 21: df = spark.sql("SELECT FLOOR(source_id / %d"%(divisor) + ") AS hpx_id, COUNT(*) AS n FROM gaia_source GROUP BY hpx_id") + > .... + > pyspark.sql.utils.AnalysisException: Table or view not found: gaia_source; line 1 pos 72; + > \'Aggregate [\'hpx_id], [\'FLOOR((\'source_id / 140737488355328)) AS hpx_id#5, count(1) AS n#6L] + > +- \'UnresolvedRelation [gaia_source], [], false + > ' + > }, + > 'Library_Validation.json': { + > 'result': 'PASS', + > 'outputs': { + > 'valid': True + > }, + > 'time': { + > 'result': 'FAST', + > 'elapsed': '10.99', + > 'expected': '60.00', + > 'percent': '-81.68', + > 'start': '2022-06-14T12:19:56.191334', + > 'finish': '2022-06-14T12:20:07.180597' + > }, + > 'logs': '' + > } + > } + > ] + > .... + + # + # I'm guessing that GaiaDMPSetup ran first, got half way through creating the schema and failed. + # This left the top level schema partially defined, so the test in the subsequent notebooks skipped the schema creation. + # + + + vi /tmp/results/multi-user-01-02.txt + + + > .... + > Test started [Multi User] + > Test completed! (56.67 seconds) + > ------------ Test Result: [FAIL] ------------ + > [ + > { + > 'GaiaDMPSetup': { + > 'result': 'FAIL', + > 'outputs': { + > 'valid': True + > }, + > 'time': { + > 'result': 'FAST', + > 'elapsed': '4.61', + > 'expected': '45.00', + > 'percent': '-89.76', + > 'start': '2022-06-14T12:20:08.716961', + > 'finish': '2022-06-14T12:20:13.325401' + > }, + > 'logs': '' + > }, + > 'Mean_proper_motions_over_the_sky': { + > 'result': 'ERROR', + > 'outputs': { + > 'valid': True + > }, + > 'time': { + > 'result': 'FAST', + > 'elapsed': '30.62', + > 'expected': '55.00', + > 'percent': '-44.33', + > 'start': '2022-06-14T12:20:14.326399', + > 'finish': '2022-06-14T12:20:44.947081' + > }, + > 'logs': ' + > Fail to execute line 13: df = spark.sql(query).cache() + > .... + > pyspark.sql.utils.AnalysisException: Table or view not found: gaia_source; line 1 pos 121; + > \'Aggregate [\'hpx_id], [\'floor((\'source_id / 140737488355328)) AS hpx_id#0, count(1) AS n#1L, \'AVG(\'pmra) AS avg_pmra#2, \'AVG(\'pmdec) AS avg_pmdec#3] + > +- \'UnresolvedRelation [gaia_source], [], false + > ' + > }, + > 'Source_counts_over_the_sky.json': { + > 'result': 'ERROR', + > 'outputs': { + > 'valid': True + > }, + > 'time': { + > 'result': 'FAST', + > 'elapsed': '5.66', + > 'expected': '22.00', + > 'percent': '-74.25', + > 'start': '2022-06-14T12:20:45.948445', + > 'finish': '2022-06-14T12:20:51.613161' + > }, + > 'logs': ' + > Fail to execute line 21: df = spark.sql("SELECT FLOOR(source_id / %d"%(divisor) + ") AS hpx_id, COUNT(*) AS n FROM gaia_source GROUP BY hpx_id") + > .... 
+    > pyspark.sql.utils.AnalysisException: Table or view not found: gaia_source; line 1 pos 72;
+    > \'Aggregate [\'hpx_id], [\'FLOOR((\'source_id / 140737488355328)) AS hpx_id#5, count(1) AS n#6L]
+    > +- \'UnresolvedRelation [gaia_source], [], false
+    > '
+    > },
+    > 'Library_Validation.json': {
+    >     'result': 'PASS',
+    >     'outputs': {
+    >         'valid': True
+    >         },
+    >     'time': {
+    >         'result': 'FAST',
+    >         'elapsed': '11.76',
+    >         'expected': '60.00',
+    >         'percent': '-80.41',
+    >         'start': '2022-06-14T12:20:52.614339',
+    >         'finish': '2022-06-14T12:21:04.370259'
+    >         },
+    >     'logs': ''
+    >     }
+    > }
+    > ]
+    > ....
+
+    #
+    # In this one, GaiaDMPSetup failed (no details).
+    # This left the top level schema partially defined, so the test in the subsequent notebooks skipped the schema creation.
+    #
+
+
+# -----------------------------------------------------
+# Test again with 3 users doing 1 loop (wrong, was 1 user doing 3 loops).
+#[root@ansibler]
+
+    test-loop 3 1 \
+        | tee /tmp/test-loop.json
+
+    jq '.' /tmp/test-loop.json
+
+    grep 'Result:' /tmp/results/*.txt
+
+    > /tmp/results/multi-user-01-00.txt:------------ Test Result: [PASS] ------------
+    > /tmp/results/multi-user-01-01.txt:------------ Test Result: [ERROR] ------------
+    > /tmp/results/multi-user-01-02.txt:------------ Test Result: [ERROR] ------------
+
+    #
+    # Slightly different results.
+    # One pass and two errors ..
+    #
+
+
+# -----------------------------------------------------
+# Test again with 3 users doing 1 loop (wrong, was 1 user doing 3 loops).
+#[root@ansibler]
+
+    test-loop 3 1 \
+        | tee /tmp/test-loop.json
+
+    jq '.' /tmp/test-loop.json
+
+    grep 'Result:' /tmp/results/*.txt
+
+    > /tmp/results/multi-user-01-00.txt:------------ Test Result: [PASS] ------------
+    > /tmp/results/multi-user-01-01.txt:------------ Test Result: [ERROR] ------------
+    > /tmp/results/multi-user-01-02.txt:------------ Test Result: [FAIL] ------------
+
+    #
+    # Slightly different results.
+    # A pass, an error and a fail ..
+    #
+
+    #
+    # A guess ... this started after I had killed a test run using Ctrl-C.
+    # Perhaps it has left a mis-configured interpreter active ?
+    # See what we can find out via the REST API.
+    #
+
+
+# -----------------------------------------------------
+# Get the user login details.
+#[root@ansibler]
+
+    less /tmp/results/multi-user-01-01.txt
+
+    > {
+    >   "config": {
+    >     "endpoint": "http://zeppelin:8080",
+    >     "testconfig": "/deployments/zeppelin/test/config/quick.json",
+    >     "userlist": "/tmp/testusers-02.json",
+    >     "usercount": "1",
+    >     "delaystart": "1",
+    >     "delaynotebook": "1"
+    >   },
+    > ....
+    > ....
+
+
+    jq '.users[].shirouser | {name, pass}' /tmp/testusers-02.json
+
+    > {
+    >   "name": "Hamar",
+    >   "pass": "########"
+    > }
+    > {
+    >   "name": "Carclop",
+    >   "pass": "########"
+    > }
+    > {
+    >   "name": "Halda",
+    >   "pass": "########"
+    > }
+    > {
+    >   "name": "Jaden",
+    >   "pass": "########"
+    > }
+    > ....
+    > ....
+
+
+# -----------------------------------------------------
+# Select the username and password for the second test user.
+#[root@ansibler]
+
+    # Note - the JSON array is zero indexed, but AglaisBenchmarker skips the first entry.
+    # So use [2] to get the second test user.
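+
+    #
+    # A sketch to double check the indexing (assuming the testusers file
+    # layout shown above): list each array index alongside its shirouser
+    # name, so the off-by-one bookkeeping can be checked by eye.
+    #
+
+    jq -r '
+        .users
+        | to_entries[]
+        | "\(.key) \(.value.shirouser.name)"
+        ' /tmp/testusers-02.json
+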
+
+    jq '.users[2].shirouser | {name, pass}' /tmp/testusers-02.json
+
+    > {
+    >   "name": "Halda",
+    >   "pass": "########"
+    > }
+
+
+    testername=$(
+        jq -r '.users[2].shirouser.name' /tmp/testusers-02.json
+        )
+    testerpass=$(
+        jq -r '.users[2].shirouser.pass' /tmp/testusers-02.json
+        )
+
+
+# -----------------------------------------------------
+# Login to Zeppelin as the test user.
+#[root@ansibler]
+
+    source '/deployments/zeppelin/bin/zeppelin-rest-tools.sh'
+
+    zeppelinurl='http://zeppelin:8080'
+    zepcookies=/tmp/${testername:?}.cookies
+
+    zeplogin "${testername:?}" "${testerpass:?}" \
+        | jq '.'
+
+    > {
+    >   "status": "OK",
+    >   "message": "",
+    >   "body": {
+    >     "principal": "Halda",
+    >     "ticket": "5d168ff0-a9cd-48e9-9b04-064219d8f4a9",
+    >     "roles": "[\"user\"]"
+    >   }
+    > }
+
+
+# -----------------------------------------------------
+# List the registered interpreters.
+#[root@ansibler]
+
+    curl \
+        --silent \
+        --cookie "${zepcookies:?}" \
+        "${zeppelinurl:?}/api/interpreter" \
+    | tee /tmp/interpreter-list.json \
+    | jq '.'
+
+    > {
+    >   "status": "OK",
+    >   "message": "",
+    >   "body": {
+    >     "python": {
+    >       "id": "python",
+    >       "name": "python",
+    >       "group": "python",
+    >       "properties": {
+    >         ....
+    >       },
+    >       "status": "READY",
+    >       "interpreterGroup": [
+    >         ....
+    >       ],
+    >       "dependencies": [],
+    >       "option": {
+    >         "remote": true,
+    >         "port": -1,
+    >         "isExistingProcess": false,
+    >         "setPermission": false,
+    >         "isUserImpersonate": false
+    >       }
+    >     },
+    >     "spark": {
+    >       "id": "spark",
+    >       "name": "spark",
+    >       "group": "spark",
+    >       "properties": {
+    >         ....
+    >       },
+    >       "status": "READY",
+    >       "interpreterGroup": [
+    >         ....
+    >       ],
+    >       "dependencies": [],
+    >       "option": {
+    >         "remote": true,
+    >         "port": -1,
+    >         "isExistingProcess": false,
+    >         "setPermission": false,
+    >         "isUserImpersonate": false
+    >       }
+    >     },
+    >     "sh": {
+    >       "id": "sh",
+    >       "name": "sh",
+    >       "group": "sh",
+    >       "properties": {
+    >         ....
+    >       },
+    >       "status": "READY",
+    >       "interpreterGroup": [
+    >         ....
+    >       ],
+    >       "dependencies": [],
+    >       "option": {
+    >         "remote": true,
+    >         "port": -1,
+    >         "isExistingProcess": false,
+    >         "setPermission": false,
+    >         "isUserImpersonate": false
+    >       }
+    >     },
+    >     "md": {
+    >       "id": "md",
+    >       "name": "md",
+    >       "group": "md",
+    >       "properties": {
+    >         ....
+    >       },
+    >       "status": "READY",
+    >       "interpreterGroup": [
+    >         ....
+    >       ],
+    >       "dependencies": [],
+    >       "option": {
+    >         "remote": true,
+    >         "port": -1,
+    >         "isExistingProcess": false,
+    >         "setPermission": false,
+    >         "isUserImpersonate": false
+    >       }
+    >     }
+    >   }
+    > }
+
+    #
+    # These interpreters all have 'isUserImpersonate' set to 'false'.
+    # Should they ?
+    #
+
+    #
+    # According to the Zeppelin documentation, restarting an interpreter is done on a per notebook basis.
+    # https://zeppelin.apache.org/docs/0.10.0/usage/rest_api/interpreter.html#restart-an-interpreter
+
+    PUT
+    ${zeppelinurl:?}/api/interpreter/setting/restart/[interpreter ID]
+
+    {
+        "noteId": "2AVQJVC8N"
+    }
+
+    #
+    # .. but if the tests deleted the notebooks after each test run,
+    # then we won't have an interpreter attached to a notebook.
+    #
+    # Is the interpreter per user account or per notebook ?
+    # The interpreter settings page says:
+    # http://zeppelin:8080/#/interpreter
+    #
+    # spark:
+    #     The interpreter will be instantiated [per user] in [isolated] process.
+    #
+    # Checked the UI, we do have one notebook in /tmp.
+ # name : 4XM670LBGE.json + # ident : 2H6GREYK3 + # + + # + # Run the test to confirm errors + # Restart the interpreter + # Run the test to confirm errors + # + +# ----------------------------------------------------- +# Test baseline with 3 users doing 1 loop (wrong, was 1 user doing 3 loops). +#[root@ansibler] + + test-loop 3 1 \ + | tee /tmp/test-loop.json + + jq '.' /tmp/test-loop.json + + grep 'Result:' /tmp/results/*.txt + + > /tmp/results/multi-user-01-00.txt:------------ Test Result: [PASS] ------------ + > /tmp/results/multi-user-01-01.txt:------------ Test Result: [ERROR] ------------ + > /tmp/results/multi-user-01-02.txt:------------ Test Result: [ERROR] ------------ + + +# ----------------------------------------------------- +# Try restarting our pyspark interpreter. +# https://zeppelin.apache.org/docs/0.10.0/usage/rest_api/interpreter.html#restart-an-interpreter +#[root@ansibler] + + interpreter=spark + + curl \ + --silent \ + --cookie "${zepcookies:?}" \ + --header "Content-Type: application/json" \ + --request PUT \ + "${zeppelinurl:?}/api/interpreter/setting/restart/${interpreter}" + + + > {"status":"OK","message":"","body":{"id":"spark","name":"spark","group":"spark","properties":{"SPARK_HOME":{"name":"SPARK_HOME","value":"/opt/spark","type":"string","description":"Location of spark distribution"},"spark.master":{"name":"spark.master","value":"yarn","type":"string","description":"Spark master uri. local | yarn-client | yarn-cluster | spark master address of standalone mode, ex) spark://master_host:7077"},"spark.submit.deployMode":{"name":"spark.submit.deployMode","value":"client","type":"string","description":"The deploy mode of Spark driver program, either \"client\" or \"cluster\", Which means to launch driver program locally (\"client\") or remotely (\"cluster\") on one of the nodes inside the cluster."},"spark.app.name":{"name":"spark.app.name","value":"","type":"string","description":"The name of spark application."},"spark.driver.cores":{"name":"spark.driver.cores","value":"","type":"number","description":"Number of cores to use for the driver process, only in cluster mode."},"spark.driver.memory":{"name":"spark.driver.memory","value":"","type":"string","description":"Amount of memory to use for the driver process, i.e. where SparkContext is initialized, in the same format as JVM memory strings with a size unit suffix (\"k\", \"m\", \"g\" or \"t\") (e.g. 512m, 2g)."},"spark.executor.cores":{"name":"spark.executor.cores","value":"","type":"number","description":"The number of cores to use on each executor"},"spark.executor.memory":{"name":"spark.executor.memory","value":"","type":"string","description":"Executor memory per worker instance. ex) 512m, 32g"},"spark.executor.instances":{"name":"spark.executor.instances","value":"","type":"number","description":"The number of executors for static allocation."},"spark.files":{"name":"spark.files","value":"","type":"string","description":"Comma-separated list of files to be placed in the working directory of each executor. Globs are allowed."},"spark.jars":{"name":"spark.jars","value":"","type":"string","description":"Comma-separated list of jars to include on the driver and executor classpaths. Globs are allowed."},"spark.jars.packages":{"name":"spark.jars.packages","value":"","type":"string","description":"Comma-separated list of Maven coordinates of jars to include on the driver and executor classpaths. The coordinates should be groupId:artifactId:version. 
If spark.jars.ivySettings is given artifacts will be resolved according to the configuration in the file, otherwise artifacts will be searched for in the local maven repo, then maven central and finally any additional remote repositories given by the command-line option --repositories."},"zeppelin.spark.useHiveContext":{"name":"zeppelin.spark.useHiveContext","value":true,"type":"checkbox","description":"Use HiveContext instead of SQLContext if it is true. Enable hive for SparkSession."},"zeppelin.spark.run.asLoginUser":{"name":"zeppelin.spark.run.asLoginUser","value":false,"type":"checkbox","description":"Whether run spark job as the zeppelin login user, it is only applied when running spark job in hadoop yarn cluster and shiro is enabled"},"zeppelin.spark.printREPLOutput":{"name":"zeppelin.spark.printREPLOutput","value":true,"type":"checkbox","description":"Print REPL output"},"zeppelin.spark.maxResult":{"name":"zeppelin.spark.maxResult","value":"1000","type":"number","description":"Max number of result to display."},"zeppelin.spark.enableSupportedVersionCheck":{"name":"zeppelin.spark.enableSupportedVersionCheck","value":true,"type":"checkbox","description":"Whether checking supported spark version. Developer only setting, not for production use"},"zeppelin.spark.uiWebUrl":{"name":"zeppelin.spark.uiWebUrl","value":"","type":"string","description":"Override Spark UI default URL. In Kubernetes mode, value can be Jinja template string with 3 template variables \u0027PORT\u0027, \u0027SERVICE_NAME\u0027 and \u0027SERVICE_DOMAIN\u0027. (ex: http://{{PORT}}-{{SERVICE_NAME}}.{{SERVICE_DOMAIN}})"},"zeppelin.spark.ui.hidden":{"name":"zeppelin.spark.ui.hidden","value":false,"type":"checkbox","description":"Whether hide spark ui in zeppelin ui"},"spark.webui.yarn.useProxy":{"name":"spark.webui.yarn.useProxy","value":false,"type":"checkbox","description":"whether use yarn proxy url as spark weburl, e.g. http://localhost:8088/proxy/application_1583396598068_0004"},"zeppelin.spark.scala.color":{"name":"zeppelin.spark.scala.color","value":true,"type":"checkbox","description":"Whether enable color output of spark scala interpreter"},"zeppelin.spark.deprecatedMsg.show":{"name":"zeppelin.spark.deprecatedMsg.show","value":true,"type":"checkbox","description":"Whether show the spark deprecated message, spark 2.2 and before are deprecated. Zeppelin will display warning message by default"},"zeppelin.spark.concurrentSQL":{"name":"zeppelin.spark.concurrentSQL","value":true,"type":"checkbox","description":"Execute multiple SQL concurrently if set true."},"zeppelin.spark.concurrentSQL.max":{"name":"zeppelin.spark.concurrentSQL.max","value":"10","type":"number","description":"Max number of SQL concurrently executed"},"zeppelin.spark.sql.stacktrace":{"name":"zeppelin.spark.sql.stacktrace","value":true,"type":"checkbox","description":"Show full exception stacktrace for SQL queries if set to true."},"zeppelin.spark.sql.interpolation":{"name":"zeppelin.spark.sql.interpolation","value":false,"type":"checkbox","description":"Enable ZeppelinContext variable interpolation into spark sql"},"PYSPARK_PYTHON":{"name":"PYSPARK_PYTHON","value":"python","type":"string","description":"Python binary executable to use for PySpark in both driver and workers (default is python2.7 if available, otherwise python). 
Property `spark.pyspark.python` take precedence if it is set"},"PYSPARK_DRIVER_PYTHON":{"name":"PYSPARK_DRIVER_PYTHON","value":"python","type":"string","description":"Python binary executable to use for PySpark in driver only (default is `PYSPARK_PYTHON`). Property `spark.pyspark.driver.python` take precedence if it is set"},"zeppelin.pyspark.useIPython":{"name":"zeppelin.pyspark.useIPython","value":false,"type":"checkbox","description":"Whether use IPython when it is available"},"zeppelin.R.knitr":{"name":"zeppelin.R.knitr","value":true,"type":"checkbox","description":"Whether use knitr or not"},"zeppelin.R.cmd":{"name":"zeppelin.R.cmd","value":"R","type":"string","description":"R binary executable path"},"zeppelin.R.image.width":{"name":"zeppelin.R.image.width","value":"100%","type":"number","description":"Image width of R plotting"},"zeppelin.R.render.options":{"name":"zeppelin.R.render.options","value":"out.format \u003d \u0027html\u0027, comment \u003d NA, echo \u003d FALSE, results \u003d \u0027asis\u0027, message \u003d F, warning \u003d F, fig.retina \u003d 2","type":"textarea","description":""},"zeppelin.R.shiny.portRange":{"name":"zeppelin.R.shiny.portRange","value":":","type":"string","description":"Shiny app would launch a web app at some port, this property is to specify the portRange via format \u0027\u003cstart\u003e:\u003cend\u003e\u0027, e.g. \u00275000:5001\u0027. By default it is \u0027:\u0027 which means any port"},"zeppelin.kotlin.shortenTypes":{"name":"zeppelin.kotlin.shortenTypes","value":true,"type":"checkbox","description":"Show short types instead of full, e.g. List\u003cString\u003e or kotlin.collections.List\u003ckotlin.String\u003e"}},"status":"READY","interpreterGroup":[{"name":"spark","class":"org.apache.zeppelin.spark.SparkInterpreter","defaultInterpreter":true,"editor":{"language":"scala","editOnDblClick":false,"completionKey":"TAB","completionSupport":true}},{"name":"sql","class":"org.apache.zeppelin.spark.SparkSqlInterpreter","defaultInterpreter":false,"editor":{"language":"sql","editOnDblClick":false,"completionKey":"TAB","completionSupport":true}},{"name":"pyspark","class":"org.apache.zeppelin.spark.PySparkInterpreter","defaultInterpreter":false,"editor":{"language":"python","editOnDblClick":false,"completionKey":"TAB","completionSupport":true}},{"name":"ipyspark","class":"org.apache.zeppelin.spark.IPySparkInterpreter","defaultInterpreter":false,"editor":{"language":"python","editOnDblClick":false,"completionSupport":true,"completionKey":"TAB"}},{"name":"r","class":"org.apache.zeppelin.spark.SparkRInterpreter","defaultInterpreter":false,"editor":{"language":"r","editOnDblClick":false,"completionSupport":false,"completionKey":"TAB"}},{"name":"ir","class":"org.apache.zeppelin.spark.SparkIRInterpreter","defaultInterpreter":false,"editor":{"language":"r","editOnDblClick":false,"completionSupport":true,"completionKey":"TAB"}},{"name":"shiny","class":"org.apache.zeppelin.spark.SparkShinyInterpreter","defaultInterpreter":false,"editor":{"language":"r","editOnDblClick":false,"completionSupport":true,"completionKey":"TAB"}},{"name":"kotlin","class":"org.apache.zeppelin.spark.KotlinSparkInterpreter","defaultInterpreter":false,"editor":{"language":"kotlin","editOnDblClick":false,"completionKey":"TAB","completionSupport":false}}],"dependencies":[],"option":{"remote":true,"port":-1,"perNote":"shared","perUser":"isolated","isExistingProcess":false,"setPermission":false,"owners":[],"isUserImpersonate":true}}}[root@ansibler /]# + + # + # Re-starting the spark 
interpreter by user 'Halda', closing interpreters for 'spark-Halda' and 'spark-anonymous', but the logs show it is also closing the interpreter for 'spark-Mavaca' too.
+    # ... but what about the third user 'Carclop' ?
+    # Actually, Mavaca is the 5th user (not used in these tests), so perhaps that was a clean up.
+    #
+    # At this point I was thinking the test was running 3 users doing 1 loop, which was wrong, it was 1 user doing 3 loops.
+    # So the fixes I was trying wouldn't have worked anyway.
+    #
+
+
+    > ....
+    > INFO [2022-06-15 02:25:22,413] ({qtp686466458-214460} InterpreterRestApi.java[restartSetting]:199) - Restart interpreterSetting spark, msg=
+    > INFO [2022-06-15 02:25:22,414] ({qtp686466458-214460} InterpreterSetting.java[close]:535) - Close InterpreterSetting: spark
+    > INFO [2022-06-15 02:25:22,418] ({spark-close} ManagedInterpreterGroup.java[close]:91) - Close InterpreterGroup: spark-anonymous
+    > INFO [2022-06-15 02:25:22,418] ({spark-close} ManagedInterpreterGroup.java[close]:91) - Close InterpreterGroup: spark-Halda
+    > INFO [2022-06-15 02:25:22,418] ({spark-close} ManagedInterpreterGroup.java[close]:102) - Close Session: shared_session for interpreter setting: spark
+    > INFO [2022-06-15 02:25:22,418] ({spark-close} ManagedInterpreterGroup.java[close]:91) - Close InterpreterGroup: spark-Mavaca
+    > ....
+    > INFO [2022-06-15 02:25:22,419] ({RemoteInterpreter-close} SchedulerFactory.java[removeScheduler]:110) - Remove scheduler: RemoteInterpreter-spark-anonymous-shared_session
+    > INFO [2022-06-15 02:25:22,419] ({RemoteInterpreter-close} SchedulerFactory.java[removeScheduler]:110) - Remove scheduler: RemoteInterpreter-spark-Halda-shared_session
+    > ....
+    > INFO [2022-06-15 02:25:22,420] ({ConfInterpreter-close} SchedulerFactory.java[createOrGetFIFOScheduler]:76) - Create FIFOScheduler: interpreter_1769331079
+    > ....
+    > ....
+    > INFO [2022-06-15 02:25:22,426] ({RemoteInterpreter-close} SchedulerFactory.java[removeScheduler]:110) - Remove scheduler: RemoteInterpreter-spark-Halda-shared_session
+    > INFO [2022-06-15 02:25:22,426] ({RemoteInterpreter-close} SchedulerFactory.java[removeScheduler]:110) - Remove scheduler: RemoteInterpreter-spark-Mavaca-shared_session
+    > INFO [2022-06-15 02:25:22,427] ({RemoteInterpreter-close} SchedulerFactory.java[removeScheduler]:110) - Remove scheduler: RemoteInterpreter-spark-anonymous-shared_session
+    > ....
+    > INFO [2022-06-15 02:25:22,427] ({spark-close} ManagedInterpreterGroup.java[close]:106) - Remove this InterpreterGroup: spark-Halda as all the sessions are closed
+    > ....
+    > INFO [2022-06-15 02:25:22,427] ({spark-close} ManagedInterpreterGroup.java[close]:106) - Remove this InterpreterGroup: spark-anonymous as all the sessions are closed
+    > ....
+    > INFO [2022-06-15 02:25:22,430] ({spark-close} ManagedInterpreterGroup.java[close]:106) - Remove this InterpreterGroup: spark-Mavaca as all the sessions are closed
+    > ....
+ > INFO [2022-06-15 02:25:22,430] ({spark-close} RemoteInterpreterManagedProcess.java[stop]:80) - Stop interpreter process for interpreter group: spark-Mavaca + > INFO [2022-06-15 02:25:22,437] ({pool-7-thread-773} RemoteInterpreterEventServer.java[unRegisterInterpreterProcess]:190) - Unregister interpreter process: spark-Mavaca + > WARN [2022-06-15 02:25:22,437] ({pool-7-thread-773} RemoteInterpreterEventServer.java[unRegisterInterpreterProcess]:194) - Unable to unregister interpreter process because no such interpreterGroup: spark-Mavaca + > WARN [2022-06-15 02:25:22,662] ({Exec Default Executor} ExecRemoteInterpreterProcess.java[onProcessComplete]:226) - Process is exited with exit value 0 + > INFO [2022-06-15 02:25:22,663] ({Exec Default Executor} ProcessLauncher.java[transition]:109) - Process state is transitioned to COMPLETED + > INFO [2022-06-15 02:25:24,942] ({spark-close} ExecRemoteInterpreterProcess.java[stop]:136) - Remote exec process of interpreter group: spark-Mavaca is terminated + > .... + + # + # Spark interpreter restart called by user 'Halda', closing interpreters for 'spark-anonymous', 'spark-Halda' and 'spark-Mavaca'. + # + + +# ----------------------------------------------------- +# Test with 3 users doing 1 loop (wrong, was 1 user doing 3 loops). +#[root@ansibler] + + test-loop 3 1 \ + | tee /tmp/test-loop.json + + jq '.' /tmp/test-loop.json + + grep 'Result:' /tmp/results/*.txt + + > /tmp/results/multi-user-01-00.txt:------------ Test Result: [PASS] ------------ + > /tmp/results/multi-user-01-01.txt:------------ Test Result: [ERROR] ------------ + > /tmp/results/multi-user-01-02.txt:------------ Test Result: [ERROR] ------------ + + # + # Still failing .. + # + + > .... + > WARN [2022-06-15 02:30:43,448] ({pool-7-thread-835} RemoteInterpreterEventServer.java[registerInterpreterProcess]:172) - Unable to register interpreter process, because no such interpreterGroup: spark-Carclop + > WARN [2022-06-15 02:31:10,155] ({SchedulerFactory17} ExecRemoteInterpreterProcess.java[waitForReady]:199) - Ready timeout reached + > WARN [2022-06-15 02:31:10,155] ({SchedulerFactory17} ProcessLauncher.java[onTimeout]:113) - Process launch is time out. + > WARN [2022-06-15 02:31:10,158] ({SchedulerFactory17} NotebookServer.java[onStatusChange]:1986) - + > Job paragraph_1655260239378_183842340 is finished, + > status: ERROR, + > exception: null, + > result: %text org.apache.zeppelin.interpreter.InterpreterException: + > java.io.IOException: + > Interpreter Process creation is time out in 30 seconds + > You can increase timeout threshold via setting zeppelin.interpreter.connect.timeout of this interpreter. 
+    > Interpreter download command:
+    >     /etc/alternatives/jre/bin/java
+    >     -Dfile.encoding=UTF-8
+    >     -Dlog4j.configuration=file:///home/fedora/zeppelin/conf/log4j.properties
+    >     -Dlog4j.configurationFile=file:///home/fedora/zeppelin/conf/log4j2.properties
+    >     -Dzeppelin.log.file=/home/fedora/zeppelin/logs/zeppelin-interpreter-spark-Carclop-Carclop-fedora-iris-gaia-blue-20220613-zeppelin.log
+    >     -cp :/home/fedora/zeppelin/interpreter/spark/*
+    >     :/home/fedora/zeppelin/interpreter/zeppelin-interpreter-shaded-0.10.0.jar
+    >     :/home/fedora/zeppelin/interpreter/spark/spark-interpreter-0.10.0.jar
+    >     :/opt/hadoop/etc/hadoop org.apache.zeppelin.interpreter.remote.RemoteInterpreterDownloader
+    >     10.10.2.210
+    >     35643
+    >     spark
+    >     /home/fedora/zeppelin/local-repo/spark
+    > [INFO] Interpreter launch command:
+    >     /opt/spark/bin/spark-submit
+    >     --class org.apache.zeppelin.interpreter.remote.RemoteInterpreterServer
+    >     --driver-class-path
+    >     :/home/fedora/zeppelin/local-repo/spark/*
+    >     :/home/fedora/zeppelin/interpreter/spark/*
+    >     :/home/fedora/zeppelin/interpreter/zeppelin-interpreter-shaded-0.10.0.jar
+    >     :/home/fedora/zeppelin/interpreter/spark/spark-interpreter-0.10.0.jar
+    >     :/opt/hadoop/etc/hadoop
+    >     --driver-java-options
+    >     -Dfile.encoding=UTF-8
+    >     -Dlog4j.configuration=file:///home/fedora/zeppelin/conf/log4j.properties
+    >     -Dlog4j.configurationFile=file:///home/fedora/zeppelin/conf/log4j2.properties
+    >     -Dzeppelin.log.file=/home/fedora/zeppelin/logs/zeppelin-interpreter-spark-Carclop-Carclop-fedora-iris-gaia-blue-20220613-zeppelin.log
+    >     --proxy-user Carclop
+    >     --conf spark.yarn.dist.archives=/opt/spark/R/lib/sparkr.zip#sparkr
+    >     --conf spark.submit.deployMode=client
+    >     --conf spark.webui.yarn.useProxy=false
+    >     --conf spark.yarn.isPython=true
+    >     --conf spark.app.name=spark-Carclop
+    >     --conf spark.master=yarn
+    >     /home/fedora/zeppelin/interpreter/spark/spark-interpreter-0.10.0.jar
+    >     10.10.2.210
+    >     35643
+    >     spark-Carclop
+    > ....
+
+    #
+    # ** The three test runs were for the same user **
+    # I've been getting it wrong all day :-(
+    # First iteration works.
+    # Second and third iterations fail.
+    #
+    # Add a 10 second delay between test loops ?
+    #
+
+    #
+    # Swap the usercount and loopcount params, because I keep getting it wrong.
+    #
+
+# -----------------------------------------------------
+# Update our test-loop function.
+#[root@ansibler]
+
+    test-loop()
+        {
+        local usercount=${1:?'usercount required'}
+        local loopcount=${2:?'loopcount required'}
+        local looppause=${3:-10}
+        local delaystart=${4:-1}
+        local delaynotebook=${5:-1}
+
+        rm -f /tmp/results/*
+
+cat << EOF
+    {
+    "usercount": "${usercount}",
+    "loopcount": "${loopcount}",
+    "looppause": "${looppause}",
+    "delaystart": "${delaystart}",
+    "delaynotebook": "${delaynotebook}",
+    "iterations": [
+EOF
+
+        local comma=''
+        for i in $(seq 0 $((loopcount - 1)))
+        do
+
+            testname="multi-user-$(printf "%02d" ${usercount})-$(printf "%02d" ${i})"
+
+cat << EOF
+    ${comma}
+    {
+    "iteration": ${i},
+    "testname": "${testname}",
+    "threads":
+EOF
+
+            sleep "${looppause}"
+
+            /tmp/run-benchmark.py \
+                "${endpoint:?}" \
+                "${testconfig:?}" \
+                "${testusers:?}" \
+                "${usercount:?}" \
+                "${delaystart:?}" \
+                "${delaynotebook:?}" \
+                > "/tmp/results/${testname:?}.txt"
+
+            filter-results "${testname:?}"
+
+cat << EOF
+    }
+EOF
+            comma=','
+
+        done
+
+cat << EOF
+    ]
+    }
+EOF
+        }
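+
+
+    #
+    # A usage sketch for the updated function, to make the new argument
+    # order explicit. The first two arguments are now usercount then
+    # loopcount; the optional trailing three are looppause, delaystart
+    # and delaynotebook, defaulting to 10, 1 and 1 seconds:
+    #
+    #   test-loop 3 5           # 3 users, 5 loops, defaults
+    #   test-loop 3 5 30 2 2    # 3 users, 5 loops, 30s pause, 2s delays
+    #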
+
+
+# -----------------------------------------------------
+# Test with 1 user doing 3 loops.
+#[root@ansibler]
+
+    test-loop 1 3 \
+        | tee /tmp/test-loop.json
+
+    jq '.' /tmp/test-loop.json
+
+    grep 'Result:' /tmp/results/*.txt
+
+    > /tmp/results/multi-user-01-00.txt:------------ Test Result: [PASS] ------------
+    > /tmp/results/multi-user-01-01.txt:------------ Test Result: [PASS] ------------
+    > /tmp/results/multi-user-01-02.txt:------------ Test Result: [PASS] ------------
+
+
+# -----------------------------------------------------
+# Test with 2 users doing 3 loops.
+#[root@ansibler]
+
+    test-loop 2 3 \
+        | tee /tmp/test-loop.json
+
+    jq '.' /tmp/test-loop.json
+
+    grep 'Result:' /tmp/results/*.txt
+
+    > /tmp/results/multi-user-02-00.txt:------------ Test Result: [PASS] ------------
+    > /tmp/results/multi-user-02-01.txt:------------ Test Result: [PASS] ------------
+    > /tmp/results/multi-user-02-02.txt:------------ Test Result: [PASS] ------------
+
+
+# -----------------------------------------------------
+# Test with 4 users doing 4 loops.
+#[root@ansibler]
+
+    test-loop 4 4 \
+        | tee /tmp/test-loop.json
+
+    jq '.' /tmp/test-loop.json
+
+    grep 'Result:' /tmp/results/*.txt
+
+    > /tmp/results/multi-user-04-00.txt:------------ Test Result: [PASS] ------------
+    > /tmp/results/multi-user-04-01.txt:------------ Test Result: [PASS] ------------
+    > /tmp/results/multi-user-04-02.txt:------------ Test Result: [PASS] ------------
+    > /tmp/results/multi-user-04-03.txt:------------ Test Result: [PASS] ------------
+
+
+# -----------------------------------------------------
+# Test with 6 users doing 4 loops.
+#[root@ansibler]
+
+    test-loop 6 4 \
+        | tee /tmp/test-loop.json
+
+    jq '.' /tmp/test-loop.json
+
+    grep 'Result:' /tmp/results/*.txt
+
+    > /tmp/results/multi-user-06-00.txt:------------ Test Result: [PASS] ------------
+    > /tmp/results/multi-user-06-01.txt:------------ Test Result: [PASS] ------------
+    > /tmp/results/multi-user-06-02.txt:------------ Test Result: [PASS] ------------
+    > /tmp/results/multi-user-06-03.txt:------------ Test Result: [PASS] ------------
+
+
+# -----------------------------------------------------
+# Test with 6 users doing 50 loops.
+#[root@ansibler]
+
+    test-loop 6 50 \
+        | tee /tmp/test-loop.json
+
+    jq '.' /tmp/test-loop.json
+
+    grep 'Result:' /tmp/results/*.txt
+
+    > /tmp/results/multi-user-06-00.txt:------------ Test Result: [PASS] ------------
+    > /tmp/results/multi-user-06-01.txt:------------ Test Result: [PASS] ------------
+    > ....
+    > ....
+    > /tmp/results/multi-user-06-48.txt:------------ Test Result: [PASS] ------------
+    > /tmp/results/multi-user-06-49.txt:------------ Test Result: [PASS] ------------
+
+    #
+    # Adding a 10 second delay between loops solves a problem with creating notebooks.
+    # There was a race condition between deleting the notebooks from the previous loop
+    # and creating the notebooks for the next one, which failed with the interpreter
+    # timeout below.
+    #
+
+    > org.apache.zeppelin.interpreter.InterpreterException: java.io.IOException:
+    > Interpreter Process creation is time out in 30 seconds
+    > You can increase timeout threshold via setting zeppelin.interpreter.connect.timeout of this interpreter.
+
+
diff --git a/notes/zrq/20220615-01-concurrent-tests.txt b/notes/zrq/20220615-01-concurrent-tests.txt
new file mode 100644
index 00000000..76de0654
--- /dev/null
+++ b/notes/zrq/20220615-01-concurrent-tests.txt
@@ -0,0 +1,1231 @@
+#
+#
+#
+# Copyright (c) 2022, ROE (http://www.roe.ac.uk/)
+#
+# This information is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This information is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see .
+#
+#
+#
+#zrq-notes-time
+#zrq-notes-indent
+#zrq-notes-crypto
+#zrq-notes-ansible
+#zrq-notes-osformat
+#zrq-notes-zeppelin
+#
+
+    Target:
+
+        Try to find out more about the limits on concurrent users.
+        Follow on from yesterday's notes, 20220614-01-concurrent-tests.txt.
+
+    Result:
+
+        Identified a number of limits and race conditions.
+
+        Race condition limits creating Interpreter process
+        https://github.com/wfau/aglais/issues/778
+
+        Race condition triggers ConcurrentModificationException in notebook handler
+        https://github.com/wfau/aglais/issues/779
+
+        Fixed limit to number of ports for Spark UI
+        https://github.com/wfau/aglais/issues/780
+
+        Active SparkContexts block the Yarn Application queue
+        https://github.com/wfau/aglais/issues/781
+
+        Too many notebooks causes Zeppelin to fail.
+        https://github.com/wfau/aglais/issues/783
+
+
+# -----------------------------------------------------
+# Create our benchmark script.
+#[root@ansibler]
+
+    cat > /tmp/run-benchmark.py << 'EOF'
+#!/bin/python3
+import sys
+from aglais_benchmark import AglaisBenchmarker
+
+try:
+
+    opts = [opt for opt in sys.argv[1:] if opt.startswith("-")]
+    args = [arg for arg in sys.argv[1:] if not arg.startswith("-")]
+
+    endpoint = args[0]
+    testconfig = args[1]
+    userlist = args[2]
+    usercount = int(args[3])
+    delaystart = int(args[4])
+    delaynotebook = int(args[5])
+
+except IndexError:
+
+    raise SystemExit(f"Usage: {sys.argv[0]} <endpoint> <testconfig> <userlist> <usercount> <delaystart> <delaynotebook>")
+
+print(
+"""
+{{
+\"config\": {{
+    \"endpoint\": \"{}\",
+    \"testconfig\": \"{}\",
+    \"userlist\": \"{}\",
+    \"usercount\": \"{}\",
+    \"delaystart\": \"{}\",
+    \"delaynotebook\": \"{}\"
+    }},
+\"output\": {{
+""".format(
+    endpoint,
+    testconfig,
+    userlist,
+    usercount,
+    delaystart,
+    delaynotebook
+    )
+    )
+
+print(
+    "---start---"
+    )
+AglaisBenchmarker(
+    testconfig,
+    userlist,
+    "/tmp/",
+    endpoint
+    ).run(
+        concurrent=True,
+        users=usercount,
+        delay_start=delaystart,
+        delay_notebook=delaynotebook
+        )
+print(
+    "---end---"
+    )
+print(
+"""
+    }
+}
+"""
+    )
+EOF
+
+    chmod 'a+x' /tmp/run-benchmark.py
+
+
+# -----------------------------------------------------
+# Create our filter function.
+# https://github.com/wfau/aglais/issues/602
+#[root@ansibler]
+
+    filter-results()
+        {
+        local testname=${1:?'testname required'}
+        sed "
+            /^--*start--*/,/^--*end--*/ {
+                /^--*start/,/^--* Test Result/ {
+                    /Test Result/ ! {
+                        d
+                        }
+                    /Test Result/ {
+                        s/^.*Test Result: \[\(.*\)\].*$/'testcode': '\1',/
+                        a \"threads\":
+                        }
+                    }
+                s/\"/'/g
+                s/'\(-\{0,1\}[0-9.]\{1,\}\)'/\1/g
+                s/:[[:space:]]*\([a-zA-Z]\{1,\}\)\([,}]\)/:'\1'\2/g
+                s/:[[:space:]]*\([,}]\),/: ''\1/g
+                s/'/\"/g
+                }
+            /^--*end--*/ {
+                d
+                }
+            " \
+            "/tmp/results/${testname:?}.txt" \
+        | tee "/tmp/results/${testname:?}.json" \
+        | jq '
+            .output.threads[] | keys as $x | [ $x[] as $y | {name: $y, value: .[$y].result, time: .[$y].time.elapsed , start: .[$y].time.start, finish: .[$y].time.finish } ]
+            '
+        }
+
+
+# -----------------------------------------------------
+# Create our test-loop function.
+#[root@ansibler]
+
+    test-loop()
+        {
+        local usercount=${1:?'usercount required'}
+        local loopcount=${2:?'loopcount required'}
+        local looppause=${3:-10}
+        local delaystart=${4:-1}
+        local delaynotebook=${5:-1}
+
+        rm -f /tmp/results/*
+
+cat << EOF
+    {
+    "usercount": "${usercount}",
+    "loopcount": "${loopcount}",
+    "looppause": "${looppause}",
+    "delaystart": "${delaystart}",
+    "delaynotebook": "${delaynotebook}",
+    "iterations": [
+EOF
+
+        local comma=''
+        for i in $(seq 0 $((loopcount - 1)))
+        do
+
+            testname="multi-user-$(printf "%02d" ${usercount})-$(printf "%02d" ${i})"
+
+cat << EOF
+    ${comma}
+    {
+    "iteration": ${i},
+    "testname": "${testname}",
+    "threads":
+EOF
+
+            sleep "${looppause}"
+
+            /tmp/run-benchmark.py \
+                "${endpoint:?}" \
+                "${testconfig:?}" \
+                "${testusers:?}" \
+                "${usercount:?}" \
+                "${delaystart:?}" \
+                "${delaynotebook:?}" \
+                > "/tmp/results/${testname:?}.txt"
+
+            filter-results "${testname:?}"
+
+cat << EOF
+    }
+EOF
+            comma=','
+
+        done
+
+cat << EOF
+    ]
+    }
+EOF
+        }
+
+
+# -----------------------------------------------------
+# Test with 6 users doing 50 loops.
+#[root@ansibler]
+
+    test-loop 6 50 \
+        | tee /tmp/test-loop.json
+
+    jq '.' /tmp/test-loop.json
+
+    grep 'Result:' /tmp/results/*.txt
+
+    > /tmp/results/multi-user-06-00.txt:------------ Test Result: [PASS] ------------
+    > /tmp/results/multi-user-06-01.txt:------------ Test Result: [PASS] ------------
+    > ....
+    > ....
+    > /tmp/results/multi-user-06-48.txt:------------ Test Result: [PASS] ------------
+    > /tmp/results/multi-user-06-49.txt:------------ Test Result: [PASS] ------------
+
+
+# -----------------------------------------------------
+# Test with 6 users doing 50 loops.
+#[root@ansibler]
+
+    test-loop 6 50 \
+        | tee /tmp/test-loop.json
+
+    jq '.' /tmp/test-loop.json
+
+    #
+    # 6 users is fine.
+    # Adding a 7th user by logging in to the Zeppelin UI and running notebooks manually caused the system to lock up.
+    # All applications in the list are state ACCEPTED.
+    # http://master01:8088/cluster/apps
+    #
+    # Running some notebooks as the 7th user was fine.
+    # It could be coincidence that the lock up occurred when I ran the [Random Forrest classifier] notebook.
+    # It could be that just running any notebook at that point would have caused the lockup.
+    #
+
+    > ....
+    > [Wed Jun 15 10:04:03 +0000 2022] Application is Activated, waiting for resources to be assigned for AM.
+    > Details : AM Partition =
+    >     Partition Resource =
+    >     Queue's Absolute capacity = 100.0 %
+    >     Queue's Absolute used capacity = 99.60318 %
+    >     Queue's Absolute max capacity = 100.0 %
+    >     Queue's capacity (absolute resource) =
+    >     Queue's used capacity (absolute resource) =
+    >     Queue's max capacity (absolute resource) = ;
+    > ....
+
+    #
+    # Looks like the number of cores is fine, but we have reached the limit for memory.
+    # Available 258048
+    # Used 257024
+    #
+
+    #
+    # Now we know how to cause it.
+    # How do we fix it ?
+    #
+    # Um .... I went browsing Reddit for, er, stuff, and when I came back, it was working again.
+    # So perhaps just waiting will free it if/when one of them times out ?
+    #
+    # 6 users running again ..
+    # Logged in as the 7th, looking at the [Random Forrest classifier] notebook.
+    # Clear all cells.
+    # Run all cells ..
+    #
+
+    #
+    # Running with 6 users is fine.
+    # Running with 7 users - borderline, some jobs delayed.
+    # Running with 8 users fails.
+    #
+
+
+
+# -----------------------------------------------------
+# Tailing the spark interpreter logs for a user.
+#[user@zeppelin] + + tail -f zeppelin-interpreter-spark-Mavaca-Mavaca-fedora-iris-gaia-blue-20220613-zeppelin.log + + > .... + > INFO [2022-06-15 13:32:00,996] ({Thread-47} Logging.scala[logInfo]:57) - Starting job: hasNext at NativeMethodAccessorImpl.java:0 + > INFO [2022-06-15 13:32:01,014] ({dag-scheduler-event-loop} Logging.scala[logInfo]:57) - Got job 0 (hasNext at NativeMethodAccessorImpl.java:0) with 1 output partitions + > INFO [2022-06-15 13:32:01,015] ({dag-scheduler-event-loop} Logging.scala[logInfo]:57) - Final stage: ResultStage 0 (hasNext at NativeMethodAccessorImpl.java:0) + > INFO [2022-06-15 13:32:01,016] ({dag-scheduler-event-loop} Logging.scala[logInfo]:57) - Parents of final stage: List() + > INFO [2022-06-15 13:32:01,017] ({dag-scheduler-event-loop} Logging.scala[logInfo]:57) - Missing parents: List() + > INFO [2022-06-15 13:32:01,022] ({dag-scheduler-event-loop} Logging.scala[logInfo]:57) - Submitting ResultStage 0 (MapPartitionsRDD[7] at toLocalIterator at NativeMethodAccessorImpl.java:0), which has no missing parents + > INFO [2022-06-15 13:32:01,068] ({dag-scheduler-event-loop} Logging.scala[logInfo]:57) - Block broadcast_0 stored as values in memory (estimated size 4.7 KiB, free 30.5 GiB) + > INFO [2022-06-15 13:32:01,114] ({dag-scheduler-event-loop} Logging.scala[logInfo]:57) - Block broadcast_0_piece0 stored as bytes in memory (estimated size 2.5 KiB, free 30.5 GiB) + > INFO [2022-06-15 13:32:01,117] ({dispatcher-BlockManagerMaster} Logging.scala[logInfo]:57) - Added broadcast_0_piece0 in memory on zeppelin:41313 (size: 2.5 KiB, free: 30.5 GiB) + > INFO [2022-06-15 13:32:01,120] ({dag-scheduler-event-loop} Logging.scala[logInfo]:57) - Created broadcast 0 from broadcast at DAGScheduler.scala:1388 + > INFO [2022-06-15 13:32:01,139] ({dag-scheduler-event-loop} Logging.scala[logInfo]:57) - Submitting 1 missing tasks from ResultStage 0 (MapPartitionsRDD[7] at toLocalIterator at NativeMethodAccessorImpl.java:0) (first 15 tasks are for partitions Vector(0)) + > INFO [2022-06-15 13:32:01,140] ({dag-scheduler-event-loop} Logging.scala[logInfo]:57) - Adding task set 0.0 with 1 tasks resource profile 0 + > INFO [2022-06-15 13:32:01,166] ({dag-scheduler-event-loop} Logging.scala[logInfo]:57) - Added task set TaskSet_0.0 tasks to pool default + > WARN [2022-06-15 13:32:16,167] ({Timer-0} Logging.scala[logWarning]:69) - Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources + > WARN [2022-06-15 13:32:31,166] ({Timer-0} Logging.scala[logWarning]:69) - Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources + > WARN [2022-06-15 13:32:46,167] ({Timer-0} Logging.scala[logWarning]:69) - Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources + > WARN [2022-06-15 13:33:01,167] ({Timer-0} Logging.scala[logWarning]:69) - Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources + > WARN [2022-06-15 13:33:16,168] ({Timer-0} Logging.scala[logWarning]:69) - Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources + > WARN [2022-06-15 13:33:31,166] ({Timer-0} Logging.scala[logWarning]:69) - Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have 
sufficient resources
+    > WARN [2022-06-15 13:33:46,169] ({Timer-0} Logging.scala[logWarning]:69) - Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
+    > WARN [2022-06-15 13:34:01,166] ({Timer-0} Logging.scala[logWarning]:69) - Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
+    > WARN [2022-06-15 13:34:16,169] ({Timer-0} Logging.scala[logWarning]:69) - Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
+    > WARN [2022-06-15 13:34:31,166] ({Timer-0} Logging.scala[logWarning]:69) - Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
+    > WARN [2022-06-15 13:34:46,167] ({Timer-0} Logging.scala[logWarning]:69) - Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
+    > WARN [2022-06-15 13:35:01,167] ({Timer-0} Logging.scala[logWarning]:69) - Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
+    > WARN [2022-06-15 13:35:16,169] ({Timer-0} Logging.scala[logWarning]:69) - Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
+    > WARN [2022-06-15 13:35:31,167] ({Timer-0} Logging.scala[logWarning]:69) - Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
+    > ....
+
+    #
+    # 4 jobs are RUNNING .. including the RandomForestClassifier notebook for Hamar via the UI.
+    # 1 job for Masonania is stuck in ACCEPTED.
+    # RandomForestClassifier finishes and resources become available to the other jobs.
+    #
+    # Takes about 10 min for the system to get back to normal.
+    #
+
+    > ....
+    > INFO [2022-06-15 13:47:19,283] ({task-result-getter-3} Logging.scala[logInfo]:57) - Finished task 195.0 in stage 9.0 (TID 7255) in 1006 ms on worker02 (executor 17) (200/200)
+    > INFO [2022-06-15 13:47:19,283] ({task-result-getter-3} Logging.scala[logInfo]:57) - Removed TaskSet 9.0, whose tasks have all completed, from pool default
+    > INFO [2022-06-15 13:47:19,283] ({dag-scheduler-event-loop} Logging.scala[logInfo]:57) - ResultStage 9 (collect at :17) finished in 2.309 s
+    > INFO [2022-06-15 13:47:19,283] ({dag-scheduler-event-loop} Logging.scala[logInfo]:57) - Job 6 is finished. Cancelling potential speculative or zombie tasks for this job
+    > INFO [2022-06-15 13:47:19,283] ({dag-scheduler-event-loop} Logging.scala[logInfo]:57) - Killing all running tasks in stage 9: Stage finished
+    > INFO [2022-06-15 13:47:19,284] ({Thread-47} Logging.scala[logInfo]:57) - Job 6 finished: collect at :17, took 8.587013 s
+    > WARN [2022-06-15 13:47:20,340] ({Thread-47} PooledRemoteClient.java[releaseBrokenClient]:80) - release broken client
+    > WARN [2022-06-15 13:47:20,341] ({Thread-47} PooledRemoteClient.java[releaseBrokenClient]:80) - release broken client
+    > WARN [2022-06-15 13:47:20,341] ({Thread-47} PooledRemoteClient.java[releaseBrokenClient]:80) - release broken client
+    > ....
+
+    #
+    # The Spark session for Hamar is still listed as RUNNING.
+    # Started [10:18:37]
+    # It is listed as RUNNING, but it doesn't seem to be taking up that many resources.
+    # All 6 test users are getting on with their work ..
+    #
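+
+    #
+    # One way to check the leftover session from the master node would be
+    # the standard Hadoop CLI (a sketch; the application id below is a
+    # placeholder): list what Yarn still considers RUNNING, and kill the
+    # idle UI session by hand if it turns out to be blocking the queue.
+    #
+    #   yarn application -list -appStates RUNNING
+    #   yarn application -kill application_0000000000000_0000
+    #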
+
+
+# -----------------------------------------------------
+# Check the results ...
+#[root@ansibler]
+
+    grep 'Result:' /tmp/results/*.txt
+
+    > /tmp/results/multi-user-06-00.txt:------------ Test Result: [PASS] ------------
+    > /tmp/results/multi-user-06-01.txt:------------ Test Result: [PASS] ------------
+    > ....
+    > ....
+    > /tmp/results/multi-user-06-48.txt:------------ Test Result: [PASS] ------------
+    > /tmp/results/multi-user-06-49.txt:------------ Test Result: [PASS] ------------
+
+
+    #
+    # All 50 loops completed.
+    # The RandomForestClassifier notebook is still open in the UI.
+    # The corresponding app is still listed as RUNNING in Hadoop.
+    # http://master01:8088/cluster/apps/RUNNING
+    #
+
+
+
+# -----------------------------------------------------
+# Test with 7 users doing 5 loops.
+#[root@ansibler]
+
+    test-loop 7 5 \
+        | tee /tmp/test-loop.json
+
+
+    grep 'Result:' /tmp/results/*.txt
+
+    > /tmp/results/multi-user-07-00.txt:------------ Test Result: [PASS] ------------
+    > /tmp/results/multi-user-07-01.txt:------------ Test Result: [PASS] ------------
+    > /tmp/results/multi-user-07-02.txt:------------ Test Result: [PASS] ------------
+    > /tmp/results/multi-user-07-03.txt:------------ Test Result: [PASS] ------------
+    > /tmp/results/multi-user-07-04.txt:------------ Test Result: [PASS] ------------
+
+    #
+    # Survives the run, all tests PASS.
+    # The corresponding Hadoop/Yarn applications have all been deleted.
+    #
+    # The login via the Zeppelin UI (left from the previous day) still has a corresponding Hadoop/Yarn application listed as RUNNING.
+    # Left for >12 hrs, the Hadoop/Yarn application is still listed as RUNNING.
+    #
+    # Cleared all the cells and ran the RandomForestClassifier notebook again.
+    # The RandomForestClassifier notebook completed.
+    # Running the RandomForestClassifier notebook uses the same Hadoop/Yarn application again.
+    # The Hadoop/Yarn application has been listed as RUNNING since the notebook first started.
+    #
+
+# -----------------------------------------------------
+# Test with 7 users doing 5 loops.
+#[root@ansibler]
+
+    test-loop 7 5 \
+        | tee /tmp/test-loop.json
+
+    #
+    # ... wait until they show up in the Hadoop/Yarn application queue listed as RUNNING,
+    # AND then run the RandomForestClassifier via the Zeppelin UI.
+    #
+
+    #
+    # Not all the new applications get listed as RUNNING.
+    # Two of the new applications are listed as ACCEPTED.
+    #
+    # The RandomForestClassifier jumped the queue ....
+
+    #
+    # Start to see resource warnings in the logs ..
+
+    tail -f zeppelin-interpreter-spark-Mavaca-Mavaca-fedora-iris-gaia-blue-20220613-zeppelin.log
+
+    > ....
+
+# -----------------------------------------------------
+# Test with 7 users doing 5 loops.
+#[root@ansibler]
+
+    test-loop 7 5 \
+        | tee /tmp/test-loop.json
+
+    #
+    # ... wait until they show up in the Hadoop/Yarn application queue listed as RUNNING.
+    # AND then run the RandomForestClassifier via the Zeppelin UI.
+    #
+
+    #
+    # Not all the new applications get listed as RUNNING.
+    # Two of the new applications are listed as ACCEPTED.
+    #
+    # The RandomForestClassifier jumped the queue by
+
+    #
+    # Start to see resource warnings in the logs ..
+
+    tail -f zeppelin-interpreter-spark-Mavaca-Mavaca-fedora-iris-gaia-blue-20220613-zeppelin.log
+
+    > ....
+    >
+    > WARN [2022-06-16 10:21:13,281] ({Timer-0} Logging.scala[logWarning]:69) - Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
+    > WARN [2022-06-16 10:21:28,280] ({Timer-0} Logging.scala[logWarning]:69) - Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
+    > WARN [2022-06-16 10:21:43,281] ({Timer-0} Logging.scala[logWarning]:69) - Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
+    > WARN [2022-06-16 10:21:58,281] ({Timer-0} Logging.scala[logWarning]:69) - Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
+    > WARN [2022-06-16 10:22:13,280] ({Timer-0} Logging.scala[logWarning]:69) - Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
+    > WARN [2022-06-16 10:22:28,280] ({Timer-0} Logging.scala[logWarning]:69) - Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
+    > WARN [2022-06-16 10:22:43,281] ({Timer-0} Logging.scala[logWarning]:69) - Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
+    > WARN [2022-06-16 10:22:58,281] ({Timer-0} Logging.scala[l....
+
+    #
+    # ACCEPTED applications have the same diagnostics message.
+    # http://master01:8088/cluster/app/application_1655122472463_1100
+
+    > [Thu Jun 16 10:20:30 +0000 2022] Application is Activated, waiting for resources to be assigned for AM.
+    > Details : AM Partition =
+    > Partition Resource =
+    > Queue's Absolute capacity = 100.0 %
+    > Queue's Absolute used capacity = 85.31746 %
+    > Queue's Absolute max capacity = 100.0 %
+    > Queue's capacity (absolute resource) =
+    > Queue's used capacity (absolute resource) =
+    > Queue's max capacity (absolute resource) =
+
+
+    #
+    # 11:44 Still 2 applications held at ACCEPTED
+    # Only 4 applications RUNNING (one of which is the RandomForestClassifier).
+    # RandomForestClassifier is running, but very slowly.
+    #
+
+    # 11:45 RandomForestClassifier is still processing the first select [Raw catalogue with selected columns] cell
+    # [Raw catalogue with selected columns] cell has been running for > 20min.
+
+    # 11:46 Zeppelin UI pop-up:
+
+        "Note is now running sequentially. Can not be performed: COMMIT_PARAGRAPH"
+        [Close]
+
+    > ....
+    > ERROR [2022-06-16 10:45:47,994] ({qtp686466458-621448} NotebookServer.java[onMessage]:463)
+    >   Can't handle message:
+    >   {
+    >     "op":"COMMIT_PARAGRAPH",
+    >     "data":{
+    >       "id":"20201013-132418_278702125",
+    >       "noteId":"2H7PSR6CB",
+    >       "title":"Raw catalogue with selected columns",
+    >       "paragraph": "..."
+    > ....
+
+    # 11:48 Only 4 applications RUNNING (one of which is the RandomForestClassifier).
+    # Still 2 applications held at ACCEPTED
+
+    # 11:49 Click the [Close] button
+
+    # 11:54 RandomForestClassifier is still running, reached [Train up the Random Forrest]
+
+    #
+    # I think the COMMIT_PARAGRAPH error message is bogus.
+    # It is triggered by a Ctrl^C on the cell title, might be Zeppelin trying to save (non-existent) changes to the note title while the notebook is running.
+    #
+
+    # 11:58 RandomForestClassifier completes.
+    # Applications held at ACCEPTED move to RUNNING.
+    # Things get back to normal.
+
+
+    # RandomForestClassifier took longer than normal?
+    # 11:23:00 .. 11:56:44
+
+    # 12:04 RandomForestClassifier has finished, but application is still listed as RUNNING.
+    # Stress test has moved on to the second iteration.
+    # Looks like it is normal to have 2 applications waiting as ACCEPTED and 6 applications RUNNING.
+    # RandomForestClassifier is taking up a RUNNING slot even though it has finished.
+
+    #
+    # Test run completes.
+    #
+
+    grep 'Result:' /tmp/results/*.txt
+
+    > /tmp/results/multi-user-07-00.txt:------------ Test Result: [PASS] ------------
+    > /tmp/results/multi-user-07-01.txt:------------ Test Result: [PASS] ------------
+    > /tmp/results/multi-user-07-02.txt:------------ Test Result: [PASS] ------------
+    > /tmp/results/multi-user-07-03.txt:------------ Test Result: [PASS] ------------
+    > /tmp/results/multi-user-07-04.txt:------------ Test Result: [PASS] ------------
+
+    #
+    # Having the test runs delete their notebooks frees up the Hadoop/Yarn applications queue.
+    #
+
+# -----------------------------------------------------
+# Test with 10 users doing 5 loops.
+#[root@ansibler]
+
+    test-loop 10 5 \
+        | tee /tmp/test-loop.json
+
+    grep 'Result:' /tmp/results/*.txt
+
+    > /tmp/results/multi-user-10-00.txt:------------ Test Result: [FAIL] ------------
+    > /tmp/results/multi-user-10-01.txt:------------ Test Result: [FAIL] ------------
+    > /tmp/results/multi-user-10-02.txt:------------ Test Result: [FAIL] ------------
+    > /tmp/results/multi-user-10-03.txt:------------ Test Result: [FAIL] ------------
+    > /tmp/results/multi-user-10-04.txt:------------ Test Result: [FAIL] ------------
+
+    #
+    # OK, that was my fault.
+    #
+
+    > ....
+    > Exception encountered while trying to create a notebook: /tmp/NVIZBFZXAO.json for user in config: /tmp/user9.yml
+    > [Errno 2] No such file or directory: '/tmp/user9.yml'
+    > ....
+
+
+# -----------------------------------------------------
+# Create some test users.
+# TODO Move the create-user-tools to ansible/client/bin.
+# TODO Add ansible/client/bin to the client PATH.
+#[root@ansibler]
+
+    source /deployments/zeppelin/bin/create-user-tools.sh
+
+    testnames01=(
+        Rhaelhall
+        Fipa
+        Mythicson
+        Balline
+        Hiness
+        Anskelisia
+        Iflee
+        Mischiellis
+        Kellaug
+        Liphima
+        Jarters
+        Williazoga
+        Carrovieus
+        Pierione
+        Hayesphasia
+        Collinotter
+        Adazoga
+        Harinabla
+        Sanderlotus
+        Bellgrin
+        )
+
+    createarrayusers \
+        "${testnames01[@]}" \
+        | tee /tmp/testusers-01.json \
+        | jq '[ .users[] | {"name": .shirouser.name, "pass": .shirouser.pass} ]'
+
+    > [
+    >   {
+    >     "name": "Rhaelhall",
+    >     "pass": "ohxohT9fiew2ui1OhchiC0seeyeel9"
+    >   },
+    >   {
+    >     "name": "Fipa",
+    >     "pass": "aePhei7zei4gaeM1ACaique5eir8ad"
+    >   },
+    >   ....
+    >   ....
+    >   {
+    >     "name": "Bellgrin",
+    >     "pass": "ieheNg3AiXohV8aed6aesh5sah5zou"
+    >   }
+    > ]
+
+
+# -----------------------------------------------------
+# Test with 10 users doing 5 loops.
+#[root@ansibler]
+
+    testconfig=/deployments/zeppelin/test/config/quick.json
+    testusers=/tmp/testusers-01.json
+
+    test-loop 10 5 \
+        | tee /tmp/test-loop.json
+
+    #
+    # Hadoop/Yarn list shows 6 applications RUNNING and 5 applications ACCEPTED.
+    # 6 RUNNING and 4 ACCEPTED are from the test run, and the fifth ACCEPTED is our login via the Zeppelin GUI.
+    # If this runs to completion, then it suggests we have solved the issue for the test platform (delete the notebooks after the test run).
+    # It will still be a problem for notebooks run via the Zeppelin GUI, because they don't get released.
+    # TODO next - investigate setting the notebook expiry time.
+    #
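+
+    #
+    # One candidate for the expiry mechanism (untested here) is Zeppelin's
+    # interpreter lifecycle manager, which closes idle interpreters and so
+    # should release the corresponding Hadoop/Yarn application.
+    # A sketch of the zeppelin-site.xml settings, as documented upstream;
+    # the conf path is an assumption about our deployment:
+    #
+
+        ssh zeppelin
+
+        vi /home/fedora/zeppelin/conf/zeppelin-site.xml
+
+            <property>
+                <name>zeppelin.interpreter.lifecyclemanager.class</name>
+                <value>org.apache.zeppelin.interpreter.lifecycle.TimeoutLifecycleManager</value>
+            </property>
+            <property>
+                <name>zeppelin.interpreter.lifecyclemanager.timeout.threshold</name>
+                <value>3600000</value>
+            </property>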
+
+    grep 'Result:' /tmp/results/*.txt
+
+    > /tmp/results/multi-user-10-00.txt:------------ Test Result: [PASS] ------------
+    > /tmp/results/multi-user-10-01.txt:------------ Test Result: [PASS] ------------
+    > /tmp/results/multi-user-10-02.txt:------------ Test Result: [PASS] ------------
+    > /tmp/results/multi-user-10-03.txt:------------ Test Result: [PASS] ------------
+    > /tmp/results/multi-user-10-04.txt:------------ Test Result: [PASS] ------------
+
+
+# -----------------------------------------------------
+# Test with all 19 users doing 5 loops.
+# (*) benchmarker always skips one account.
+#[root@ansibler]
+
+    testconfig=/deployments/zeppelin/test/config/quick.json
+    testusers=/tmp/testusers-01.json
+
+    test-loop 19 5 \
+        | tee /tmp/test-loop.json
+
+    #
+    # Goes quiet at the end of the first iteration.
+    # Nothing running, nothing in the queue, just .. nothing.
+    # First 16 test users FINISHED.
+    # No sign of the final 4 ...
+    #
+
+    Harinabla
+    Sanderlotus
+    Bellgrin
+
+
+# -----------------------------------------------------
+# Check the Spark interpreter log for one of the stalled users.
+#[root@ansibler]
+
+    less zeppelin-interpreter-spark-Harinabla-Harinabla-fedora-iris-gaia-blue-20220613-zeppelin.log
+
+    > ....
+    > INFO [2022-06-16 15:03:29,249] ({FIFOScheduler-interpreter_327761460-Worker-1} Logging.scala[logInfo]:57) - Registering OutputCommitCoordinator
+    > INFO [2022-06-16 15:03:29,366] ({FIFOScheduler-interpreter_327761460-Worker-1} Log.java[initialized]:169) - Logging initialized @10988ms to org.sparkproject.jetty.util.log.Slf4jLog
+    > INFO [2022-06-16 15:03:29,468] ({FIFOScheduler-interpreter_327761460-Worker-1} Server.java[doStart]:375) - jetty-9.4.40.v20210413; built: 2021-04-13T20:42:42.668Z; git: b881a572662e1943a14ae12e7e1207989f218b74; jvm 1.8.0_272-b10
+    > INFO [2022-06-16 15:03:29,500] ({FIFOScheduler-interpreter_327761460-Worker-1} Server.java[doStart]:415) - Started @11121ms
+    > WARN [2022-06-16 15:03:29,535] ({FIFOScheduler-interpreter_327761460-Worker-1} Logging.scala[logWarning]:69) - Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
+    > WARN [2022-06-16 15:03:29,535] ({FIFOScheduler-interpreter_327761460-Worker-1} Logging.scala[logWarning]:69) - Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
+    > ....
+    > WARN [2022-06-16 15:03:29,540] ({FIFOScheduler-interpreter_327761460-Worker-1} Logging.scala[logWarning]:69) - Service 'SparkUI' could not bind on port 4054. Attempting port 4055.
+    > WARN [2022-06-16 15:03:29,540] ({FIFOScheduler-interpreter_327761460-Worker-1} Logging.scala[logWarning]:69) - Service 'SparkUI' could not bind on port 4055. Attempting port 4056.
+    > ERROR [2022-06-16 15:03:29,543] ({FIFOScheduler-interpreter_327761460-Worker-1} Logging.scala[logError]:94) - Failed to bind SparkUI
+    > java.net.BindException: Failed to bind to /0.0.0.0:4056: Service 'SparkUI' failed after 16 retries (starting from 4040)! Consider explicitly setting the appropriate port for the service 'SparkUI' (for example spark.ui.port for SparkUI) to an available port or increasing spark.port.maxRetries.
+    >   at org.sparkproject.jetty.server.ServerConnector.openAcceptChannel(ServerConnector.java:349)
+    >   at org.sparkproject.jetty.server.ServerConnector.open(ServerConnector.java:310)
+    >   at org.sparkproject.jetty.server.AbstractNetworkConnector.doStart(AbstractNetworkConnector.java:80)
+    >   at org.sparkproject.jetty.server.ServerConnector.doStart(ServerConnector.java:234)
+    >   at org.sparkproject.jetty.util.component.AbstractLifeCycle.start(AbstractLifeCycle.java:73)
+    >   at org.apache.spark.ui.JettyUtils$.newConnector$1(JettyUtils.scala:302)
+    >   at org.apache.spark.ui.JettyUtils$.httpConnect$1(JettyUtils.scala:333)
+    >   at org.apache.spark.ui.JettyUtils$.$anonfun$startJettyServer$5(JettyUtils.scala:336)
+    >   at org.apache.spark.ui.JettyUtils$.$anonfun$startJettyServer$5$adapted(JettyUtils.scala:336)
+    >   at org.apache.spark.util.Utils$.$anonfun$startServiceOnPort$2(Utils.scala:2331)
+    >   at scala.collection.immutable.Range.foreach$mVc$sp(Range.scala:158)
+    >   at org.apache.spark.util.Utils$.startServiceOnPort(Utils.scala:2323)
+    >   at org.apache.spark.ui.JettyUtils$.startJettyServer(JettyUtils.scala:337)
+    >   at org.apache.spark.ui.WebUI.bind(WebUI.scala:146)
+    >   at org.apache.spark.SparkContext.$anonfun$new$11(SparkContext.scala:486)
+    >   at org.apache.spark.SparkContext.$anonfun$new$11$adapted(SparkContext.scala:486)
+    >   at scala.Option.foreach(Option.scala:407)
+    >   at org.apache.spark.SparkContext.<init>(SparkContext.scala:486)
+    >   at org.apache.spark.SparkContext$.getOrCreate(SparkContext.scala:2672)
+    >   at org.apache.spark.sql.SparkSession$Builder.$anonfun$getOrCreate$2(SparkSession.scala:945)
+    >   at scala.Option.getOrElse(Option.scala:189)
+    >   at org.apache.spark.sql.SparkSession$Builder.getOrCreate(SparkSession.scala:939)
+    >   at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
+    >   at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
+    >   at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
+    >   at java.lang.reflect.Method.invoke(Method.java:498)
+    >   at org.apache.zeppelin.spark.BaseSparkScalaInterpreter.spark2CreateContext(BaseSparkScalaInterpreter.scala:299)
+    >   at org.apache.zeppelin.spark.BaseSparkScalaInterpreter.createSparkContext(BaseSparkScalaInterpreter.scala:228)
+    >   at org.apache.zeppelin.spark.SparkScala212Interpreter.open(SparkScala212Interpreter.scala:88)
+    >   at org.apache.zeppelin.spark.SparkInterpreter.open(SparkInterpreter.java:121)
+    >   at org.apache.zeppelin.interpreter.LazyOpenInterpreter.open(LazyOpenInterpreter.java:70)
+    >   at org.apache.zeppelin.interpreter.Interpreter.getInterpreterInTheSameSessionByClassName(Interpreter.java:322)
+    >   at org.apache.zeppelin.interpreter.Interpreter.getInterpreterInTheSameSessionByClassName(Interpreter.java:333)
+    >   at org.apache.zeppelin.spark.PySparkInterpreter.open(PySparkInterpreter.java:90)
+    >   at org.apache.zeppelin.interpreter.LazyOpenInterpreter.open(LazyOpenInterpreter.java:70)
+    >   at org.apache.zeppelin.interpreter.remote.RemoteInterpreterServer$InterpretJob.jobRun(RemoteInterpreterServer.java:833)
+    >   at org.apache.zeppelin.interpreter.remote.RemoteInterpreterServer$InterpretJob.jobRun(RemoteInterpreterServer.java:741)
+    >   at org.apache.zeppelin.scheduler.Job.run(Job.java:172)
+    >   at org.apache.zeppelin.scheduler.AbstractScheduler.runJob(AbstractScheduler.java:132)
+    >   at org.apache.zeppelin.scheduler.FIFOScheduler.lambda$runJobInScheduler$0(FIFOScheduler.java:42)
+    >   at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
+    >   at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
+    >   at java.lang.Thread.run(Thread.java:748)
+    > INFO [2022-06-16 15:03:29,545] ({ShutdownThread} RemoteInterpreterServer.java[run]:646) - Shutting down...
+
+    #
+    # £$%*& another failure mode
+    # TODO list all these in GitHub
+    #
+
+    #
+    # Possible cause :
+    # https://kontext.tech/article/525/fix-error-sparkui-failed-to-bind-sparkui
+    # Spark configuration spark.ui.port can be used to specify the default port of Spark UI.
+    # By default it is on port 4040.
+    # If the port number is occupied by other programs, Spark will try to increase the port
+    # number and try up to spark.port.maxRetries times.
+    # By default, the value for spark.port.maxRetries is 16.
+
+    # https://spark.apache.org/docs/latest/configuration.html
+    # spark.port.maxRetries
+    # Maximum number of retries when binding to a port before giving up.
+    # When a port is given a specific value (non 0), each subsequent retry will increment the port used in the previous attempt by 1 before retrying.
+    # This essentially allows it to try a range of ports from the start port specified to port + maxRetries.
+
+    #
+    # Looks like once it reaches 16 retries, it just gives up.
+    # With the default spark.ui.port of 4040 that means ports 4040-4056, i.e. at most 17 concurrent Spark UIs.
+    # Possibly the Spark job finishes with an error, but the test system doesn't catch it ?
+    # Either way, this places a hard limit on the number of separate Spark contexts we can run.
+    #
+    # TODO Try setting this higher and see if the problem goes away.
+    #
+
+    #
+    # TODO If we need to allow a large port range this may have implications for firewalls ?
+    #
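+
+    #
+    # (A sketch for gauging how close we are to that limit: count the
+    # listeners already bound in the 4040+ range on the zeppelin node.
+    # Assumes iproute2's ss is installed there, and the grep pattern is
+    # a quick approximation covering ports 4040-4099:)
+    #
+
+        ssh zeppelin \
+            '
+            ss -tln | grep -c ":40[4-9][0-9] "
+            '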
+# -----------------------------------------------------
+# Test with 1 user doing 5 loops.
+#[root@ansibler]
+
+    test-loop 1 5 \
+        | tee /tmp/test-loop.json
+
+    grep 'Result:' /tmp/results/*.txt
+
+    > /tmp/results/multi-user-01-00.txt:------------ Test Result: [PASS] ------------
+    > /tmp/results/multi-user-01-01.txt:------------ Test Result: [PASS] ------------
+    > /tmp/results/multi-user-01-02.txt:------------ Test Result: [PASS] ------------
+    > /tmp/results/multi-user-01-03.txt:------------ Test Result: [PASS] ------------
+    > /tmp/results/multi-user-01-04.txt:------------ Test Result: [PASS] ------------
+
+# -----------------------------------------------------
+# Edit the Spark settings.
+#[root@ansibler]
+
+    ssh zeppelin
+
+        vi /opt/spark/conf/spark-defaults.conf
+
+            spark.port.maxRetries 25
+
+# -----------------------------------------------------
+# Test with 19 users doing 2 loops.
+# (*) benchmarker always skips one account.
+#[root@ansibler]
+
+    test-loop 19 2 \
+        | tee /tmp/test-loop.json
+
+    #
+    # Still fails.
+    # Nothing is left active in Zeppelin, so I suspect the notebook failed, but the test system hasn't detected it.
+    # TODO Replicate and check the notebook status ..
+    #
+
+# -----------------------------------------------------
+# Re-start Zeppelin.
+#[root@ansibler]
+
+    ssh zeppelin
+
+        zeppelin-daemon.sh restart
+
+        > Zeppelin stop [ OK ]
+        > Zeppelin start [ OK ]
+
+
+# -----------------------------------------------------
+# Test with 19 users doing 2 loops.
+# (*) benchmarker always skips one account.
+#[root@ansibler]
+
+    test-loop 19 2 \
+        | tee /tmp/test-loop.json
+
+
+    #
+    # Hadoop/Yarn is allowing up to 7 applications RUNNING.
+    # The rest are queued as ACCEPTED.
+    #
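+
+    #
+    # (A sketch for counting them from the command line instead of the
+    # web UI; assumes the yarn CLI is configured on the zeppelin node:)
+    #
+
+        ssh zeppelin \
+            '
+            yarn application -list -appStates RUNNING  2>/dev/null | grep -c SPARK
+            yarn application -list -appStates ACCEPTED 2>/dev/null | grep -c SPARK
+            '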
+
+# -----------------------------------------------------
+# Check the Spark interpreter log for one of the stalled users.
+#[root@ansibler]
+
+    pushd ${HOME}/zeppelin/logs
+
+        tail -f zeppelin-interpreter-spark-Harinabla-Harinabla-fedora-iris-gaia-blue-20220613-zeppelin.log
+
+    > ....
+    > INFO [2022-06-16 17:17:35,986] ({dag-scheduler-event-loop} Logging.scala[logInfo]:57) - Block broadcast_0 stored as values in memory (estimated size 4.7 KiB, free 30.5 GiB)
+    > INFO [2022-06-16 17:17:36,026] ({dag-scheduler-event-loop} Logging.scala[logInfo]:57) - Block broadcast_0_piece0 stored as bytes in memory (estimated size 2.5 KiB, free 30.5 GiB)
+    > INFO [2022-06-16 17:17:36,029] ({dispatcher-BlockManagerMaster} Logging.scala[logInfo]:57) - Added broadcast_0_piece0 in memory on zeppelin:38315 (size: 2.5 KiB, free: 30.5 GiB)
+    > INFO [2022-06-16 17:17:36,031] ({dag-scheduler-event-loop} Logging.scala[logInfo]:57) - Created broadcast 0 from broadcast at DAGScheduler.scala:1388
+    > INFO [2022-06-16 17:17:36,052] ({dag-scheduler-event-loop} Logging.scala[logInfo]:57) - Submitting 1 missing tasks from ResultStage 0 (MapPartitionsRDD[7] at toLocalIterator at NativeMethodAccessorImpl.java:0) (first 15 tasks are for partitions Vector(0))
+    > INFO [2022-06-16 17:17:36,053] ({dag-scheduler-event-loop} Logging.scala[logInfo]:57) - Adding task set 0.0 with 1 tasks resource profile 0
+    > INFO [2022-06-16 17:17:36,079] ({dag-scheduler-event-loop} Logging.scala[logInfo]:57) - Added task set TaskSet_0.0 tasks to pool default
+    > WARN [2022-06-16 17:17:51,080] ({Timer-0} Logging.scala[logWarning]:69) - Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
+    > WARN [2022-06-16 17:18:06,080] ({Timer-0} Logging.scala[logWarning]:69) - Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
+    > ....
+    > ....
+    > WARN [2022-06-16 17:20:36,080] ({Timer-0} Logging.scala[logWarning]:69) - Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
+    > WARN [2022-06-16 17:20:51,080] ({Timer-0} Logging.scala[logWarning]:69) - Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
+    > INFO [2022-06-16 17:20:56,185] ({dispatcher-CoarseGrainedScheduler} Logging.scala[logInfo]:57) - Registered executor NettyRpcEndpointRef(spark-client://Executor) (10.10.2.198:45524) with ID 1, ResourceProfileId 0
+    > INFO [2022-06-16 17:20:56,188] ({spark-listener-group-executorManagement} Logging.scala[logInfo]:57) - New executor 1 has registered (new total is 2)
+    > INFO [2022-06-16 17:20:56,307] ({dispatcher-BlockManagerMaster} Logging.scala[logInfo]:57) - Registering block manager worker06:45025 with 3.6 GiB RAM, BlockManagerId(1, worker06, 45025, None)
+    > INFO [2022-06-16 17:20:56,776] ({dispatcher-CoarseGrainedScheduler} Logging.scala[logInfo]:57) - Starting task 0.0 in stage 0.0 (TID 0) (worker06, executor 1, partition 0, PROCESS_LOCAL, 4728 bytes) taskResourceAssignments Map()
+    > INFO [2022-06-16 17:20:57,074] ({dispatcher-BlockManagerMaster} Logging.scala[logInfo]:57) - Added broadcast_0_piece0 in memory on worker06:45025 (size: 2.5 KiB, free: 3.6 GiB)
+    > INFO [2022-06-16 17:20:57,536] ({task-result-getter-0} Logging.scala[logInfo]:57) - Finished task 0.0 in stage 0.0 (TID 0) in 771 ms on worker06 (executor 1) (1/1)
+    > ....
+
+    > ....
+ > INFO [2022-06-16 17:24:29,156] ({FIFOScheduler-interpreter_614510408-Worker-1} Logging.scala[logInfo]:57) - Submitting application application_1655122472463_1291 to ResourceManager + > INFO [2022-06-16 17:24:29,187] ({FIFOScheduler-interpreter_614510408-Worker-1} YarnClientImpl.java[submitApplication]:311) - Submitted application application_1655122472463_1291 + > INFO [2022-06-16 17:24:30,190] ({FIFOScheduler-interpreter_614510408-Worker-1} Logging.scala[logInfo]:57) - Application report for application_1655122472463_1291 (state: ACCEPTED) + > INFO [2022-06-16 17:24:30,193] ({FIFOScheduler-interpreter_614510408-Worker-1} Logging.scala[logInfo]:57) - + > client token: N/A + > diagnostics: [Thu Jun 16 17:24:29 +0000 2022] Application is added to the scheduler and is not yet activated. Queue's AM resource limit exceeded. Details : AM Partition = ; AM Resource Request = vCores:1>; Queue Resource Limit for AM = ; User AM Resource Limit of the queue = ; Queue AM Resource Usage = ; + > ApplicationMaster host: N/A + > ApplicationMaster RPC port: -1 + > queue: default + > start time: 1655400269168 + > final status: UNDEFINED + > tracking URL: http://master01:8088/proxy/application_1655122472463_1291/ + > user: Harinabla + > INFO [2022-06-16 17:24:31,195] ({FIFOScheduler-interpreter_614510408-Worker-1} Logging.scala[logInfo]:57) - Application report for application_1655122472463_1291 (state: ACCEPTED) + > INFO [2022-06-16 17:24:32,196] ({FIFOScheduler-interpreter_614510408-Worker-1} Logging.scala[logInfo]:57) - Application report for application_1655122472463_1291 (state: ACCEPTED) + > .... + > .... + > INFO [2022-06-16 17:33:03,115] ({FIFOScheduler-interpreter_614510408-Worker-1} Logging.scala[logInfo]:57) - Application report for application_1655122472463_1291 (state: RUNNING) + > INFO [2022-06-16 17:33:03,115] ({FIFOScheduler-interpreter_614510408-Worker-1} Logging.scala[logInfo]:57) - + > client token: N/A + > diagnostics: N/A + > ApplicationMaster host: 10.10.2.147 + > ApplicationMaster RPC port: -1 + > queue: default + > start time: 1655400269168 + > final status: UNDEFINED + > tracking URL: http://master01:8088/proxy/application_1655122472463_1291/ + > user: Harinabla + > INFO [2022-06-16 17:33:03,117] ({FIFOScheduler-interpreter_614510408-Worker-1} Logging.scala[logInfo]:57) - Application application_1655122472463_1291 has started running. + > INFO [2022-06-16 17:33:03,128] ({FIFOScheduler-interpreter_614510408-Worker-1} Logging.scala[logInfo]:57) - Successfully started service 'org.apache.spark.network.netty.NettyBlockTransferService' on port 46313. 
+ > INFO [2022-06-16 17:33:03,128] ({FIFOScheduler-interpreter_614510408-Worker-1} NettyBlockTransferService.scala[init]:81) - Server created on zeppelin:46313 + > INFO [2022-06-16 17:33:03,130] ({FIFOScheduler-interpreter_614510408-Worker-1} Logging.scala[logInfo]:57) - Using org.apache.spark.storage.RandomBlockReplicationPolicy for block replication policy + > INFO [2022-06-16 17:33:03,140] ({FIFOScheduler-interpreter_614510408-Worker-1} Logging.scala[logInfo]:57) - Registering BlockManager BlockManagerId(driver, zeppelin, 46313, None) + > INFO [2022-06-16 17:33:03,144] ({dispatcher-BlockManagerMaster} Logging.scala[logInfo]:57) - Registering block manager zeppelin:46313 with 30.5 GiB RAM, BlockManagerId(driver, zeppelin, 46313, None) + > INFO [2022-06-16 17:33:03,148] ({FIFOScheduler-interpreter_614510408-Worker-1} Logging.scala[logInfo]:57) - Registered BlockManager BlockManagerId(driver, zeppelin, 46313, None) + > INFO [2022-06-16 17:33:03,148] ({FIFOScheduler-interpreter_614510408-Worker-1} Logging.scala[logInfo]:57) - external shuffle service port = 7337 + > .... + +# ----------------------------------------------------- +# Check the test results. +#[root@ansibler] + + grep 'Result:' /tmp/results/*.txt + + > /tmp/results/multi-user-19-00.txt:------------ Test Result: [ERROR] ------------ + > /tmp/results/multi-user-19-01.txt:------------ Test Result: [PASS] ------------ + + + # + # NOT what I was expecting ... + # + + > .... + > [ + > { + > 'GaiaDMPSetup': { + > 'result': 'ERROR', + > 'outputs': { + > 'valid': True + > }, + > 'time': { + > 'result': 'FAST', + > 'elapsed': '4.40', + > 'expected': '45.00', + > 'percent': '-90.23', + > 'start': '2022-06-16T17:10:07.129116', + > 'finish': '2022-06-16T17:10:11.525329' + > }, + > 'logs': ' + > Unexpected exception: java.util.ConcurrentModificationException + > at java.util.HashMap$ValueSpliterator.forEachRemaining(HashMap.java:1633) + > at java.util.stream.AbstractPipeline.copyInto(AbstractPipeline.java:482) + > at java.util.stream.AbstractPipeline.wrapAndCopyInto(AbstractPipeline.java:472) + > at java.util.stream.ReduceOps$ReduceOp.evaluateSequential(ReduceOps.java:708) + > at java.util.stream.AbstractPipeline.evaluate(AbstractPipeline.java:234) + > at java.util.stream.ReferencePipeline.collect(ReferencePipeline.java:566) + > at org.apache.zeppelin.service.JobManagerService.getNoteJobInfoByUnixTime(JobManagerService.java:90) + > at org.apache.zeppelin.socket.NotebookServer.broadcastUpdateNoteJobInfo(NotebookServer.java:519) + > at org.apache.zeppelin.socket.NotebookServer.onStatusChange(NotebookServer.java:2007) + > at org.apache.zeppelin.socket.NotebookServer.onStatusChange(NotebookServer.java:105) + > at org.apache.zeppelin.scheduler.Job.setStatus(Job.java:141) + > at org.apache.zeppelin.notebook.Paragraph.setStatus(Paragraph.java:398) + > at org.apache.zeppelin.notebook.Paragraph.execute(Paragraph.java:349) + > at org.apache.zeppelin.notebook.Note.run(Note.java:873) + > at org.apache.zeppelin.service.NotebookService.runParagraph(NotebookService.java:390) + > at org.apache.zeppelin.rest.NotebookRestApi.runParagraph(NotebookRestApi.java:849) + > at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) + > at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) + > at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) + > at java.lang.reflect.Method.invoke(Method.java:498) + > at 
org.glassfish.jersey.server.model.internal.ResourceMethodInvocationHandlerFactory.lambda$static$0(ResourceMethodInvocationHandlerFactory.java:52) + > at org.glassfish.jersey.server.model.internal.AbstractJavaResourceMethodDispatcher$1.run(AbstractJavaResourceMethodDispatcher.java:124) + > at org.glassfish.jersey.server.model.internal.AbstractJavaResourceMethodDispatcher.invoke(AbstractJavaResourceMethodDispatcher.java:167) + > at org.glassfish.jersey.server.model.internal.JavaResourceMethodDispatcherProvider$ResponseOutInvoker.doDispatch(JavaResourceMethodDispatcherProvider.java:176) + > at org.glassfish.jersey.server.model.internal.AbstractJavaResourceMethodDispatcher.dispatch(AbstractJavaResourceMethodDispatcher.java:79) + > at org.glassfish.jersey.server.model.ResourceMethodInvoker.invoke(ResourceMethodInvoker.java:469) + > at org.glassfish.jersey.server.model.ResourceMethodInvoker.apply(ResourceMethodInvoker.java:391) + > at org.glassfish.jersey.server.model.ResourceMethodInvoker.apply(ResourceMethodInvoker.java:80) + > at org.glassfish.jersey.server.ServerRuntime$1.run(ServerRuntime.java:253) + > at org.glassfish.jersey.internal.Errors$1.call(Errors.java:248) + > at org.glassfish.jersey.internal.Errors$1.call(Errors.java:244) + > at org.glassfish.jersey.internal.Errors.process(Errors.java:292) + > at org.glassfish.jersey.internal.Errors.process(Errors.java:274) + > at org.glassfish.jersey.internal.Errors.process(Errors.java:244) + > at org.glassfish.jersey.process.internal.RequestScope.runInScope(RequestScope.java:265) + > at org.glassfish.jersey.server.ServerRuntime.process(ServerRuntime.java:232) + > at org.glassfish.jersey.server.ApplicationHandler.handle(ApplicationHandler.java:680) + > at org.glassfish.jersey.servlet.WebComponent.serviceImpl(WebComponent.java:394) + > at org.glassfish.jersey.servlet.WebComponent.service(WebComponent.java:346) + > at org.glassfish.jersey.servlet.ServletContainer.service(ServletContainer.java:366) + > at org.glassfish.jersey.servlet.ServletContainer.service(ServletContainer.java:319) + > at org.glassfish.jersey.servlet.ServletContainer.service(ServletContainer.java:205) + > at org.eclipse.jetty.servlet.ServletHolder.handle(ServletHolder.java:763) + > at org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1651) + > at org.apache.shiro.web.servlet.ProxiedFilterChain.doFilter(ProxiedFilterChain.java:61) + > at org.apache.shiro.web.servlet.AdviceFilter.executeChain(AdviceFilter.java:108) + > at org.apache.shiro.web.servlet.AdviceFilter.doFilterInternal(AdviceFilter.java:137) + > at org.apache.shiro.web.servlet.OncePerRequestFilter.doFilter(OncePerRequestFilter.java:125) + > at org.apache.shiro.web.servlet.ProxiedFilterChain.doFilter(ProxiedFilterChain.java:66) + > at org.apache.shiro.web.servlet.AdviceFilter.executeChain(AdviceFilter.java:108) + > at org.apache.shiro.web.servlet.AdviceFilter.doFilterInternal(AdviceFilter.java:137) + > at org.apache.shiro.web.servlet.OncePerRequestFilter.doFilter(OncePerRequestFilter.java:125) + > at org.apache.shiro.web.servlet.ProxiedFilterChain.doFilter(ProxiedFilterChain.java:66) + > at org.apache.shiro.web.servlet.AbstractShiroFilter.executeChain(AbstractShiroFilter.java:450) + > at org.apache.shiro.web.servlet.AbstractShiroFilter$1.call(AbstractShiroFilter.java:365) + > at org.apache.shiro.subject.support.SubjectCallable.doCall(SubjectCallable.java:90) + > at org.apache.shiro.subject.support.SubjectCallable.call(SubjectCallable.java:83) + > at 
org.apache.shiro.subject.support.DelegatingSubject.execute(DelegatingSubject.java:387) + > at org.apache.shiro.web.servlet.AbstractShiroFilter.doFilterInternal(AbstractShiroFilter.java:362) + > at org.apache.shiro.web.servlet.OncePerRequestFilter.doFilter(OncePerRequestFilter.java:125) + > at org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1638) + > at org.apache.zeppelin.server.CorsFilter.doFilter(CorsFilter.java:64) + > at org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1638) + > at org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:567) + > at org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:143) + > at org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:602) + > at org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:127) + > at org.eclipse.jetty.server.handler.ScopedHandler.nextHandle(ScopedHandler.java:235) + > at org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:1610) + > at org.eclipse.jetty.server.handler.ScopedHandler.nextHandle(ScopedHandler.java:233) + > at org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1377) + > at org.eclipse.jetty.server.handler.ScopedHandler.nextScope(ScopedHandler.java:188) + > at org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:507) + > at org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:1580) + > at org.eclipse.jetty.server.handler.ScopedHandler.nextScope(ScopedHandler.java:186) + > at org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1292) + > at org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:141) + > at org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:234) + > at io.micrometer.core.instrument.binder.jetty.TimedHandler.handle(TimedHandler.java:120) + > at org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:127) + > at org.eclipse.jetty.server.Server.handle(Server.java:501) + > at org.eclipse.jetty.server.HttpChannel.lambda$handle$1(HttpChannel.java:383) + > at org.eclipse.jetty.server.HttpChannel.dispatch(HttpChannel.java:556) + > at org.eclipse.jetty.server.HttpChannel.handle(HttpChannel.java:375) + > at org.eclipse.jetty.server.HttpConnection.onFillable(HttpConnection.java:273) + > at org.eclipse.jetty.io.AbstractConnection$ReadCallback.succeeded(AbstractConnection.java:311) + > at org.eclipse.jetty.io.FillInterest.fillable(FillInterest.java:105) + > at org.eclipse.jetty.io.ChannelEndPoint$1.run(ChannelEndPoint.java:104) + > at org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.runTask(EatWhatYouKill.java:336) + > at org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.doProduce(EatWhatYouKill.java:313) + > at org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.tryProduce(EatWhatYouKill.java:171) + > at org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.produce(EatWhatYouKill.java:135) + > at org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:806) + > at org.eclipse.jetty.util.thread.QueuedThreadPool$Runner.run(QueuedThreadPool.java:938) + > at java.lang.Thread.run(Thread.java:748) + > ' + > }, + > 'Mean_proper_motions_over_the_sky': { + > 'result': 'ERROR', + > 'outputs': { + > 'valid': True + > }, + > 'time': { + > 'result': 'SLOW', + > 'elapsed': '130.59', + > 'expected': '55.00', + > 'percent': '137.44', + > 'start': 
'2022-06-16T17:10:12.526309',
+    >         'finish': '2022-06-16T17:12:23.119005'
+    >       },
+    >       'logs': '
+    >         Fail to execute line 13: df = spark.sql(query).cache()
+    >         Traceback (most recent call last):
+    >         File "/tmp/1655399539296-1/zeppelin_python.py", line 158, in <module>
+    >           exec(code, _zcUserQueryNameSpace)
+    > ....
+
+    #
+    # Back to ConcurrentModificationException again :-(
+    #
+    # This looks a bit suspicious to me.
+    # If the notebooks are different each time, why are we getting ConcurrentModificationExceptions ?
+    #
+
+    > Unexpected exception: java.util.ConcurrentModificationException
+    >   at java.util.HashMap$ValueSpliterator.forEachRemaining(HashMap.java:1633)
+    >   at java.util.stream.AbstractPipeline.copyInto(AbstractPipeline.java:482)
+    >   at java.util.stream.AbstractPipeline.wrapAndCopyInto(AbstractPipeline.java:472)
+    >   at java.util.stream.ReduceOps$ReduceOp.evaluateSequential(ReduceOps.java:708)
+    >   at java.util.stream.AbstractPipeline.evaluate(AbstractPipeline.java:234)
+    >   at java.util.stream.ReferencePipeline.collect(ReferencePipeline.java:566)
+    >   at org.apache.zeppelin.service.JobManagerService.getNoteJobInfoByUnixTime(JobManagerService.java:90)
+    >   at org.apache.zeppelin.socket.NotebookServer.broadcastUpdateNoteJobInfo(NotebookServer.java:519)
+    >   at org.apache.zeppelin.socket.NotebookServer.onStatusChange(NotebookServer.java:2007)
+    >   at org.apache.zeppelin.socket.NotebookServer.onStatusChange(NotebookServer.java:105)
+    >   at org.apache.zeppelin.scheduler.Job.setStatus(Job.java:141)
+    >   at org.apache.zeppelin.notebook.Paragraph.setStatus(Paragraph.java:398)
+    >   at org.apache.zeppelin.notebook.Paragraph.execute(Paragraph.java:349)
+    >   at org.apache.zeppelin.notebook.Note.run(Note.java:873)
+    >   at org.apache.zeppelin.service.NotebookService.runParagraph(NotebookService.java:390)
+    >   at org.apache.zeppelin.rest.NotebookRestApi.runParagraph(NotebookRestApi.java:849)
+
+    #
+    # ConcurrentModificationException while iterating a HashMap.
+    #
+
+    > at java.util.HashMap$ValueSpliterator.forEachRemaining(HashMap.java:1633)
+
+    #
+    # Implies unsynchronised access to a global instance !?
+    #
+    # https://docs.oracle.com/javase/8/docs/api/java/util/HashMap.html
+
+        Note that this implementation is not synchronized. If multiple threads access a hash map
+        concurrently, and at least one of the threads modifies the map structurally, it must be
+        synchronized externally ...
+        This is typically accomplished by synchronizing on some object that naturally encapsulates
+        the map. If no such object exists, the map should be "wrapped" using the
+        Collections.synchronizedMap method.
+
+        The iterators returned by all of this class's "collection view methods" are fail-fast:
+        if the map is structurally modified at any time after the iterator is created, in any
+        way except through the iterator's own remove method, the iterator will throw a
+        ConcurrentModificationException.
+        Thus, in the face of concurrent modification, the iterator fails quickly and cleanly, rather
+        than risking arbitrary, non-deterministic behavior at an undetermined time in the future.
+
+    #
+    # Found a change to Zeppelin JobManagerService that matches this.
+    # https://github.com/apache/zeppelin/blame/master/zeppelin-server/src/main/java/org/apache/zeppelin/service/JobManagerService.java#L91-L99
+
+    # The GitHub blame view can show us the code before the change:
+    # https://github.com/apache/zeppelin/blame/bef579d87f7531480052d8e9451752cae1118e36/zeppelin-server/src/main/java/org/apache/zeppelin/service/JobManagerService.java#L86-L90
+    # Which matches the stack trace we see in our logs.
+
+    # The new code fixes an unrelated bug about a memory leak.
+    # https://issues.apache.org/jira/browse/ZEPPELIN-5559
+    # NoteManager never releases the note memory after a note has been read.
+
+    # The GitHub PullRequest
+    # https://github.com/apache/zeppelin/pull/4252
+    # https://github.com/zlosim/zeppelin/commit/621837900005ad9a990d84435972461534881336
+
+    # The new code also introduces a setting
+    # https://github.com/apache/zeppelin/blob/master/docs/setup/operation/configuration.md#zeppelin_note_cache_threshold
+
+        zeppelin.note.cache.threshold
+        Threshold for the number of notes in the cache before an eviction occurs.
+
+    # This was merged on 3rd Feb 2022, so it might be in the latest release :-)
+    # TODO - update to the latest release and re-test ?
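+
+    #
+    # (A sketch for checking which Zeppelin version we are actually running,
+    # via the REST API; assumes ${endpoint} holds the Zeppelin base URL used
+    # by the benchmark scripts:)
+    #
+
+        curl --silent "${endpoint:?}/api/version" | jq '.'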
+
+
+# -----------------------------------------------------
+# Test with 19 users doing 2 loops.
+#[root@ansibler]
+
+    #
+    # Increase the initial startup delay and the delay between notebooks.
+    #
+    # local usercount=${1:?'usercount required'}
+    # local loopcount=${2:?'loopcount required'}
+    # local looppause=${3:-10}
+    # local delaystart=${4:-1}
+    # local delaynotebook=${5:-1}
+
+    test-loop 19 2 10 5 5 \
+        | tee /tmp/test-loop.json
+
+    grep 'Result:' /tmp/results/*.txt
+
+    > /tmp/results/multi-user-19-00.txt:------------ Test Result: [PASS] ------------
+    > /tmp/results/multi-user-19-01.txt:------------ Test Result: [PASS] ------------
+
+
+# -----------------------------------------------------
+# Create our long-loop function.
+# https://stackoverflow.com/questions/17548064/how-to-have-a-bash-script-loop-until-a-specific-time
+# https://stackoverflow.com/a/17548151
+# https://linuxize.com/post/bash-increment-decrement-variable/
+#[root@ansibler]
+
+    long-loop()
+        {
+        local usercount=${1:?'usercount required'}
+        local loopfinish=${2:?'loopfinish required'}
+        local looppause=${3:-10}
+        local delaystart=${4:-1}
+        local delaynotebook=${5:-1}
+
+        rm -f /tmp/results/*
+
+cat << EOF
+    {
+    "usercount": "${usercount}",
+    "loopfinish": "${loopfinish}",
+    "looppause": "${looppause}",
+    "delaystart": "${delaystart}",
+    "delaynotebook": "${delaynotebook}",
+    "iterations": [
+EOF
+
+        local comma=''
+        local iter=0
+        while [ $(date "+%H") -lt ${loopfinish} ]
+        do
+
+            testname="test-$(date '+%Y%m%dT%H%M%S')-iter-$(printf "%02d" ${iter})"
+
+cat << EOF
+${comma}
+        {
+        "iteration": ${iter},
+        "testname": "${testname}",
+        "threads":
+EOF
+
+            sleep "${looppause}"
+
+            /tmp/run-benchmark.py \
+                "${endpoint:?}" \
+                "${testconfig:?}" \
+                "${testusers:?}" \
+                "${usercount:?}" \
+                "${delaystart:?}" \
+                "${delaynotebook:?}" \
+                > "/tmp/results/${testname:?}.txt"
+
+            filter-results "${testname:?}"
+
+cat << EOF
+        }
+EOF
+            comma=','
+            ((iter+=1))
+
+        done
+
+cat << EOF
+        ]
+    }
+EOF
+        }
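+
+    #
+    # Note: the while condition compares the current hour, so 'loopfinish'
+    # needs to be a later hour (0-23) on the same day; the loop will not
+    # survive past midnight.
+    # A quick check on the captured output once a run has finished
+    # (a sketch, assuming the stream parses as JSON):
+    #
+
+        jq '.iterations | length' /tmp/test-loop.json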
+
+
+# -----------------------------------------------------
+# Test with 19 users looping until 10:00.
+#[root@ansibler]
+
+    # local usercount=${1:?'usercount required'}
+    # local loopfinish=${2:?'loopfinish required'}
+    # local looppause=${3:-10}
+    # local delaystart=${4:-1}
+    # local delaynotebook=${5:-1}
+
+    long-loop 19 10 10 5 5 \
+        | tee /tmp/test-loop.json
+
+
+
diff --git a/notes/zrq/20220617-01-concurrent-tests.txt b/notes/zrq/20220617-01-concurrent-tests.txt
new file mode 100644
index 00000000..c5682238
--- /dev/null
+++ b/notes/zrq/20220617-01-concurrent-tests.txt
@@ -0,0 +1,871 @@
+#
+#
+#
+# Copyright (c) 2022, ROE (http://www.roe.ac.uk/)
+#
+# This information is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This information is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+#
+#
+#zrq-notes-time
+#zrq-notes-indent
+#zrq-notes-crypto
+#zrq-notes-ansible
+#zrq-notes-osformat
+#zrq-notes-zeppelin
+#
+
+    Target:
+
+        Try to find out more about the limits on concurrent users.
+        Follow on from previous notes 20220615-01-concurrent-tests.txt.
+
+    Result:
+
+        Work in progress ...
+        What should have been a stable test .. wasn't.
+        But I haven't figured out why yet.
+        Lots of big stack traces because I'm not sure what the key parts are yet.
+        Just scrappy notes at this point.
+
+
+# -----------------------------------------------------
+# Create our long-loop function.
+# https://stackoverflow.com/questions/17548064/how-to-have-a-bash-script-loop-until-a-specific-time
+# https://stackoverflow.com/a/17548151
+# https://linuxize.com/post/bash-increment-decrement-variable/
+#[root@ansibler]
+
+    long-loop()
+        {
+        local usercount=${1:?'usercount required'}
+        local loopfinish=${2:?'loopfinish required'}
+        local looppause=${3:-10}
+        local delaystart=${4:-1}
+        local delaynotebook=${5:-1}
+
+        rm -f /tmp/results/*
+
+cat << EOF
+    {
+    "usercount": "${usercount}",
+    "loopfinish": "${loopfinish}",
+    "looppause": "${looppause}",
+    "delaystart": "${delaystart}",
+    "delaynotebook": "${delaynotebook}",
+    "iterations": [
+EOF
+
+        local comma=''
+        local iter=0
+        while [ $(date "+%H") -lt ${loopfinish} ]
+        do
+
+            testname="test-$(date '+%Y%m%dT%H%M%S')-iter-$(printf "%02d" ${iter})"
+
+cat << EOF
+${comma}
+        {
+        "iteration": ${iter},
+        "testname": "${testname}",
+        "threads":
+EOF
+
+            sleep "${looppause}"
+
+            /tmp/run-benchmark.py \
+                "${endpoint:?}" \
+                "${testconfig:?}" \
+                "${testusers:?}" \
+                "${usercount:?}" \
+                "${delaystart:?}" \
+                "${delaynotebook:?}" \
+                > "/tmp/results/${testname:?}.txt"
+
+            filter-results "${testname:?}"
+
+cat << EOF
+        }
+EOF
+            comma=','
+            ((iter+=1))
+
+        done
+
+cat << EOF
+        ]
+    }
+EOF
+        }
+
+
+# -----------------------------------------------------
+# Test with 19 users looping until 10am.
+#[root@ansibler]
+
+    # local usercount=${1:?'usercount required'}
+    # local loopfinish=${2:?'loopfinish required'}
+    # local looppause=${3:-10}
+    # local delaystart=${4:-1}
+    # local delaynotebook=${5:-1}
+
+    long-loop 19 10 10 5 5 \
+        | tee /tmp/test-loop.json
+
+
+    > ....
+ > { + > "iteration": 13, + > "testname": "test-20220617T062603-iter-13", + > "threads": [ + > [ + > { + > "name": "GaiaDMPSetup", + > "value": "PASS", + > "time": 36.99, + > "start": "2022-06-17T06:26:13.509228", + > "finish": "2022-06-17T06:26:50.498948" + > }, + > .... + > .... + > { + > "name": "Source_counts_over_the_sky.json", + > "value": "PASS", + > "time": 17.83, + > "start": "2022-06-17T06:27:35.418406", + > "finish": "2022-06-17T06:27:53.252655" + > } + > ], + > [ .... ], + > [ .... ], + > [ .... ], + > [ .... ], + > [ .... ], + > [ .... ], + > [ .... ], + > [ .... ], + > [ .... ], + > [ .... ], + > [ .... ], + > [ .... ], + > [ .... ], + > [ .... ], + > [ .... ], + > [ .... ], + > [ .... ], + > [ .... ], + > }, + > { + > "iteration": 14, + > "testname": "test-20220617T064603-iter-14", + > "threads": [ + > [ + > .... + + # + # Test reached iteration 14, and stopped around 07:03 am. + # Everything looks normal .. just stopped. + # + + +# ----------------------------------------------------- +# Check the Zeppelin logs ... +#[user@zeppelin] + + # + # The Zeppelin log looks normal ... just stopped. + + less ${HOME}/zeppelin/logs/zeppelin-$(id -un)-$(hostname).log + + > .... + > .... + > WARN [2022-06-17 07:02:57,859] ({Exec Default Executor} ExecRemoteInterpreterProcess.java[onProcessComplete]:226) - Process is exited with exit value 0 + > INFO [2022-06-17 07:02:57,860] ({Exec Default Executor} ProcessLauncher.java[transition]:109) - Process state is transitioned to COMPLETED + > WARN [2022-06-17 07:02:58,055] ({Exec Default Executor} ExecRemoteInterpreterProcess.java[onProcessComplete]:226) - Process is exited with exit value 0 + > INFO [2022-06-17 07:02:58,056] ({Exec Default Executor} ProcessLauncher.java[transition]:109) - Process state is transitioned to COMPLETED + > INFO [2022-06-17 07:03:00,338] ({qtp2128029086-288820} ExecRemoteInterpreterProcess.java[stop]:136) - Remote exec process of interpreter group: spark-Bellgrin is terminated + > WARN [2022-06-17 07:03:00,338] ({qtp2128029086-288820} AuthorizationService.java[getOwners]:230) - No noteAuth found for noteId: 2H7RWP7WY + > INFO [2022-06-17 07:03:00,344] ({qtp2128029086-288823} ExecRemoteInterpreterProcess.java[stop]:136) - Remote exec process of interpreter group: md-Bellgrin is terminated + > INFO [2022-06-17 07:03:00,345] ({qtp2128029086-288823} InterpreterSettingManager.java[copyDependenciesFromLocalPath]:784) - Start to copy dependencies for interpreter: spark + > INFO [2022-06-17 07:03:00,345] ({qtp2128029086-288823} InterpreterSettingManager.java[copyDependenciesFromLocalPath]:795) - Finish copy dependencies for interpreter: spark + > INFO [2022-06-17 07:03:00,345] ({qtp2128029086-288823} InterpreterSettingManager.java[copyDependenciesFromLocalPath]:784) - Start to copy dependencies for interpreter: spark + > INFO [2022-06-17 07:03:00,345] ({qtp2128029086-288823} InterpreterSettingManager.java[copyDependenciesFromLocalPath]:795) - Finish copy dependencies for interpreter: spark + > INFO [2022-06-17 07:03:00,345] ({qtp2128029086-288823} InterpreterSettingManager.java[copyDependenciesFromLocalPath]:784) - Start to copy dependencies for interpreter: md + > INFO [2022-06-17 07:03:00,345] ({qtp2128029086-288823} InterpreterSettingManager.java[copyDependenciesFromLocalPath]:795) - Finish copy dependencies for interpreter: md + > INFO [2022-06-17 07:03:00,345] ({qtp2128029086-288823} InterpreterSettingManager.java[copyDependenciesFromLocalPath]:784) - Start to copy dependencies for interpreter: spark + > INFO 
[2022-06-17 07:03:00,346] ({qtp2128029086-288823} InterpreterSettingManager.java[copyDependenciesFromLocalPath]:795) - Finish copy dependencies for interpreter: spark + > INFO [2022-06-17 07:03:00,345] ({qtp2128029086-288316} ManagedInterpreterGroup.java[close]:102) - Close Session: shared_session for interpreter setting: md + > WARN [2022-06-17 07:03:00,346] ({qtp2128029086-288823} AuthorizationService.java[getOwners]:230) - No noteAuth found for noteId: 2H5PXXMKB + > INFO [2022-06-17 07:03:00,346] ({qtp2128029086-288316} ManagedInterpreterGroup.java[close]:106) - Remove this InterpreterGroup: md-Bellgrin as all the sessions are closed + > INFO [2022-06-17 07:03:00,346] ({qtp2128029086-288316} InterpreterSettingManager.java[copyDependenciesFromLocalPath]:784) - Start to copy dependencies for interpreter: md + > INFO [2022-06-17 07:03:00,346] ({qtp2128029086-288316} InterpreterSettingManager.java[copyDependenciesFromLocalPath]:795) - Finish copy dependencies for interpreter: md + > WARN [2022-06-17 07:03:00,347] ({qtp2128029086-288316} AuthorizationService.java[getOwners]:230) - No noteAuth found for noteId: 2H78KAXNA + > + > INFO [2022-06-17 07:10:07,760] ({SessionValidationThread-1} AbstractValidatingSessionManager.java[validateSessions]:275) - Validating all active sessions... + > INFO [2022-06-17 07:10:07,766] ({SessionValidationThread-1} AbstractValidatingSessionManager.java[validateSessions]:308) - Finished session validation. No sessions were stopped. + > + > INFO [2022-06-17 08:10:07,760] ({SessionValidationThread-1} AbstractValidatingSessionManager.java[validateSessions]:275) - Validating all active sessions... + > INFO [2022-06-17 08:10:07,765] ({SessionValidationThread-1} AbstractValidatingSessionManager.java[validateSessions]:308) - Finished session validation. No sessions were stopped. + > + > INFO [2022-06-17 09:10:07,760] ({SessionValidationThread-1} AbstractValidatingSessionManager.java[validateSessions]:275) - Validating all active sessions... + > INFO [2022-06-17 09:10:07,763] ({SessionValidationThread-1} AbstractValidatingSessionManager.java[validateSessions]:308) - Finished session validation. No sessions were stopped. + > + > INFO [2022-06-17 10:10:07,760] ({SessionValidationThread-1} AbstractValidatingSessionManager.java[validateSessions]:275) - Validating all active sessions... + > INFO [2022-06-17 10:10:07,764] ({SessionValidationThread-1} AbstractValidatingSessionManager.java[validateSessions]:308) - Finished session validation. No sessions were stopped. + + + # + # The Spark interpreter log has some clues. + + less zeppelin-interpreter-spark-Fipa-Fipa-$(id -un)-$(hostname).log + + > .... + > .... 
+ > INFO [2022-06-17 06:48:12,591] ({FIFOScheduler-interpreter_238135472-Worker-1} AbstractScheduler.java[runJob]:154) - Job paragraph_1655448486850_407152630 finished by scheduler interpreter_238135472 with status FINISHED + > INFO [2022-06-17 06:48:13,651] ({FIFOScheduler-interpreter_238135472-Worker-1} AbstractScheduler.java[runJob]:127) - Job paragraph_1655448486850_276207388 started by scheduler interpreter_238135472 + > INFO [2022-06-17 06:48:13,781] ({FIFOScheduler-interpreter_238135472-Worker-1} AbstractScheduler.java[runJob]:154) - Job paragraph_1655448486850_276207388 finished by scheduler interpreter_238135472 with status FINISHED + > INFO [2022-06-17 06:48:14,776] ({FIFOScheduler-interpreter_238135472-Worker-1} AbstractScheduler.java[runJob]:127) - Job paragraph_1655448486851_1039163803 started by scheduler interpreter_238135472 + > INFO [2022-06-17 06:48:14,780] ({FIFOScheduler-interpreter_238135472-Worker-1} AbstractScheduler.java[runJob]:154) - Job paragraph_1655448486851_1039163803 finished by scheduler interpreter_238135472 with status FINISHED + > INFO [2022-06-17 06:48:21,314] ({pool-3-thread-1} PySparkInterpreter.java[close]:112) - Close PySparkInterpreter + > INFO [2022-06-17 06:48:21,314] ({pool-3-thread-1} PythonInterpreter.java[close]:258) - Kill python process + > INFO [2022-06-17 06:48:21,322] ({pool-3-thread-1} RemoteInterpreterServer.java[shutdown]:245) - Unregister interpreter process + > WARN [2022-06-17 06:48:21,328] ({Exec Default Executor} ProcessLauncher.java[onProcessFailed]:134) - Process with cmd [python, /tmp/1655448401905-0/zeppelin_python.py, 10.10.2.210, 38015] is failed due to + > org.apache.commons.exec.ExecuteException: Process exited with an error: 143 (Exit value: 143) + > at org.apache.commons.exec.DefaultExecutor.executeInternal(DefaultExecutor.java:404) + > at org.apache.commons.exec.DefaultExecutor.access$200(DefaultExecutor.java:48) + > at org.apache.commons.exec.DefaultExecutor$1.run(DefaultExecutor.java:200) + > at java.lang.Thread.run(Thread.java:748) + > INFO [2022-06-17 06:48:21,330] ({Exec Default Executor} ProcessLauncher.java[transition]:109) - Process state is transitioned to TERMINATED + > INFO [2022-06-17 06:48:21,330] ({ShutdownThread} RemoteInterpreterServer.java[run]:646) - Shutting down... 
+ > INFO [2022-06-17 06:48:21,330] ({ShutdownThread} RemoteInterpreterServer.java[run]:647) - Shutdown initialized by ShutdownCall + > INFO [2022-06-17 06:48:21,330] ({ShutdownThread} SparkInterpreter.java[close]:182) - Close SparkInterpreter + > INFO [2022-06-17 06:48:21,349] ({ShutdownThread} BaseSparkScalaInterpreter.scala[cleanupStagingDirInternal]:218) - Deleted staging directory hdfs://master01:9000/albert/Fipa/.sparkStaging/application_1655122472463_1598 + > INFO [2022-06-17 06:48:21,361] ({ShutdownThread} AbstractConnector.java[doStop]:381) - Stopped Spark@52ba061{HTTP/1.1, (http/1.1)}{0.0.0.0:4040} + > INFO [2022-06-17 06:48:21,363] ({ShutdownThread} Logging.scala[logInfo]:57) - Stopped Spark web UI at http://zeppelin:4040 + > INFO [2022-06-17 06:48:21,368] ({YARN application state monitor} Logging.scala[logInfo]:57) - Interrupting monitor thread + > INFO [2022-06-17 06:48:21,370] ({ShutdownThread} Logging.scala[logInfo]:57) - Shutting down all executors + > INFO [2022-06-17 06:48:21,371] ({dispatcher-CoarseGrainedScheduler} Logging.scala[logInfo]:57) - Asking each executor to shut down + > INFO [2022-06-17 06:48:21,377] ({ShutdownThread} Logging.scala[logInfo]:57) - YARN client scheduler backend Stopped + > INFO [2022-06-17 06:48:21,385] ({dispatcher-event-loop-34} Logging.scala[logInfo]:57) - MapOutputTrackerMasterEndpoint stopped! + > WARN [2022-06-17 06:48:21,393] ({rpc-server-4-5} NioEventLoop.java[unexpectedSelectorWakeup]:554) - Selector.select() returned prematurely 512 times in a row; rebuilding Selector io.netty.channel.nio.SelectedSelectionKeySetSelector@78fe626c. + > INFO [2022-06-17 06:48:21,394] ({rpc-server-4-5} NioEventLoop.java[rebuildSelector0]:430) - Migrated 1 channel(s) to the new Selector. + > INFO [2022-06-17 06:48:21,397] ({ShutdownThread} Logging.scala[logInfo]:57) - MemoryStore cleared + > INFO [2022-06-17 06:48:21,397] ({ShutdownThread} Logging.scala[logInfo]:57) - BlockManager stopped + > INFO [2022-06-17 06:48:21,400] ({ShutdownThread} Logging.scala[logInfo]:57) - BlockManagerMaster stopped + > INFO [2022-06-17 06:48:21,406] ({dispatcher-event-loop-47} Logging.scala[logInfo]:57) - OutputCommitCoordinator stopped! + > INFO [2022-06-17 06:48:21,414] ({ShutdownThread} Logging.scala[logInfo]:57) - Successfully stopped SparkContext + > INFO [2022-06-17 06:48:21,414] ({ShutdownThread} Logging.scala[logInfo]:57) - SparkContext already stopped. 
+    > INFO [2022-06-17 06:48:21,417] ({ShutdownThread} SchedulerFactory.java[destroy]:61) - Destroy all executors
+    > INFO [2022-06-17 06:48:21,418] ({ShutdownThread} SchedulerFactory.java[destroy]:65) - Stopping Scheduler interpreter_238135472
+    > WARN [2022-06-17 06:48:21,418] ({SchedulerFactory2} AbstractScheduler.java[run]:91) - FIFOScheduler is interrupted
+    > INFO [2022-06-17 06:48:21,418] ({ShutdownThread} SchedulerFactory.java[destroy]:65) - Stopping Scheduler interpreter_1908998913
+    > INFO [2022-06-17 06:48:21,419] ({ShutdownThread} SchedulerFactory.java[destroy]:65) - Stopping Scheduler interpreter_182853103
+    > WARN [2022-06-17 06:48:21,419] ({SchedulerFactory7} AbstractScheduler.java[run]:91) - FIFOScheduler is interrupted
+    > WARN [2022-06-17 06:48:21,419] ({SchedulerFactory8} AbstractScheduler.java[run]:91) - FIFOScheduler is interrupted
+    > INFO [2022-06-17 06:48:21,419] ({ShutdownThread} SchedulerFactory.java[destroy]:65) - Stopping Scheduler interpreter_79060382
+    > INFO [2022-06-17 06:48:21,419] ({ShutdownThread} SchedulerFactory.java[destroy]:65) - Stopping Scheduler org.apache.zeppelin.spark.SparkSqlInterpreter861182032
+    > WARN [2022-06-17 06:48:21,419] ({SchedulerFactory6} AbstractScheduler.java[run]:91) - FIFOScheduler is interrupted
+    > WARN [2022-06-17 06:48:21,419] ({SchedulerFactory3} AbstractScheduler.java[run]:91) - ParallelScheduler is interrupted
+    > INFO [2022-06-17 06:48:21,419] ({ShutdownThread} SchedulerFactory.java[destroy]:65) - Stopping Scheduler org.apache.zeppelin.spark.SparkRInterpreter526653806
+    > INFO [2022-06-17 06:48:21,419] ({ShutdownThread} SchedulerFactory.java[destroy]:65) - Stopping Scheduler interpreter_768785502
+    > INFO [2022-06-17 06:48:21,419] ({ShutdownThread} SchedulerFactory.java[destroy]:65) - Stopping Scheduler interpreter_409799365
+    > WARN [2022-06-17 06:48:21,419] ({SchedulerFactory4} AbstractScheduler.java[run]:91) - FIFOScheduler is interrupted
+    > WARN [2022-06-17 06:48:21,419] ({SchedulerFactory5} AbstractScheduler.java[run]:91) - FIFOScheduler is interrupted
+    > WARN [2022-06-17 06:48:21,419] ({SchedulerFactory1} AbstractScheduler.java[run]:91) - FIFOScheduler is interrupted
+    > INFO [2022-06-17 06:48:21,420] ({RemoteInterpreterServer-Thread} RemoteInterpreterServer.java[run]:199) - RemoteInterpreterServer-Thread finished
+    > INFO [2022-06-17 06:48:21,420] ({main} RemoteInterpreterServer.java[main]:317) - RemoteInterpreterServer thread is finished
+    > INFO [2022-06-17 06:48:21,423] ({shutdown-hook-0} Logging.scala[logInfo]:57) - Shutdown hook called
+    > INFO [2022-06-17 06:48:21,424] ({shutdown-hook-0} Logging.scala[logInfo]:57) - Deleting directory /tmp/spark-3a13c1dd-b1a5-421b-8154-9d2f56dfe1d2
+    > INFO [2022-06-17 06:48:21,427] ({shutdown-hook-0} Logging.scala[logInfo]:57) - Deleting directory /mnt/cinder/vdc/spark/temp/spark-5759feb6-4497-4370-a5ca-ddbc6c067218/pyspark-de460db7-1e8a-4134-8ceb-f9d0a912610f
+    > INFO [2022-06-17 06:48:21,430] ({shutdown-hook-0} Logging.scala[logInfo]:57) - Deleting directory /mnt/cinder/vdc/spark/temp/spark-5759feb6-4497-4370-a5ca-ddbc6c067218
+
+    #
+    # Some of this looks bad.
+    # Exit code 143 means the process was killed with SIGTERM (128 + 15), which in a Yarn context is often a container killed for running out of memory.
+    # https://stackoverflow.com/questions/42972908/container-killed-by-the-applicationmaster-exit-code-is-143
+    # https://stackoverflow.com/a/52403247
+    #
+    # On the other hand, the Hadoop/Yarn UI shows this application as FINISHED.
+    # http://master01:8088/cluster/app/application_1655122472463_1598
+
+    > User: Fipa
+    > Name: spark-Fipa
+    > Application Type: SPARK
+    > Application Tags: -
+    > Application Priority: 0 (Higher Integer value indicates higher priority)
+    > YarnApplicationState: FINISHED
+    > Queue: default
+    > FinalStatus Reported by AM: SUCCEEDED
+    > Started: Fri Jun 17 06:46:27 +0000 2022
+    > Launched: Fri Jun 17 06:46:28 +0000 2022
+    > Finished: Fri Jun 17 06:48:21 +0000 2022
+    > Elapsed: 1mins, 53sec
+    > Tracking URL: History
+    > Log Aggregation Status: DISABLED
+    > Application Timeout (Remaining Time): Unlimited
+    > Diagnostics:
+    > Unmanaged Application: false
+    > Application Node Label expression:
+    > AM container Node Label expression:
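+
+    #
+    # (The same report is available from the command line; a sketch, assuming
+    # the yarn CLI is configured on the zeppelin node:)
+    #
+
+        ssh zeppelin \
+            '
+            yarn application -status application_1655122472463_1598
+            '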
+    #
+    # On the other hand, the Hadoop/Yarn UI shows this application as FINISHED.
+    # http://master01:8088/cluster/app/application_1655122472463_1598
+
+    > User: Fipa
+    > Name: spark-Fipa
+    > Application Type: SPARK
+    > Application Tags: -
+    > Application Priority: 0 (Higher Integer value indicates higher priority)
+    > YarnApplicationState: FINISHED
+    > Queue: default
+    > FinalStatus Reported by AM: SUCCEEDED
+    > Started: Fri Jun 17 06:46:27 +0000 2022
+    > Launched: Fri Jun 17 06:46:28 +0000 2022
+    > Finished: Fri Jun 17 06:48:21 +0000 2022
+    > Elapsed: 1mins, 53sec
+    > Tracking URL: History
+    > Log Aggregation Status: DISABLED
+    > Application Timeout (Remaining Time): Unlimited
+    > Diagnostics:
+    > Unmanaged Application: false
+    > Application Node Label expression:
+    > AM container Node Label expression:
+
+    #
+    # There is one application left in the RUNNING state.
+    # http://master01:8088/cluster/app/application_1655122472463_1609
+
+    > User: Carrovieus
+    > Name: spark-Carrovieus
+    > Application Type: SPARK
+    > Application Tags:
+    > Application Priority: 0 (Higher Integer value indicates higher priority)
+    > YarnApplicationState: RUNNING: AM has registered with RM and started running.
+    > Queue: default
+    > FinalStatus Reported by AM: Application has not completed yet.
+    > Started: Fri Jun 17 06:47:23 +0000 2022
+    > Launched: Fri Jun 17 06:51:01 +0000 2022
+    > Finished: N/A
+    > Elapsed: 4hrs, 9mins, 43sec
+    > Tracking URL: ApplicationMaster
+    > Log Aggregation Status: DISABLED
+    > Application Timeout (Remaining Time): Unlimited
+    > Diagnostics:
+    > Unmanaged Application: false
+    > Application Node Label expression:
+    > AM container Node Label expression:
+
+
+# -----------------------------------------------------
+# Check the Zeppelin log for Carrovieus ...
+#[user@zeppelin]
+
+    less zeppelin-interpreter-spark-Carrovieus-Carrovieus-$(id -un)-$(hostname).log
+
+    #
+    # This looks good ..
+
+    > ....
+    > INFO [2022-06-17 06:55:01,987] ({FIFOScheduler-interpreter_723077581-Worker-1} AbstractScheduler.java[runJob]:154) - Job paragraph_1655448901045_1404462898 finished by scheduler interpreter_723077581 with status FINISHED
+    > INFO [2022-06-17 06:55:02,083] ({FIFOScheduler-interpreter_723077581-Worker-1} AbstractScheduler.java[runJob]:127) - Job paragraph_1655448901045_635421648 started by scheduler interpreter_723077581
+    > INFO [2022-06-17 06:55:02,090] ({FIFOScheduler-interpreter_723077581-Worker-1} AbstractScheduler.java[runJob]:154) - Job paragraph_1655448901045_635421648 finished by scheduler interpreter_723077581 with status FINISHED
+    > INFO [2022-06-17 06:55:03,163] ({FIFOScheduler-interpreter_723077581-Worker-1} AbstractScheduler.java[runJob]:127) - Job paragraph_1655448901045_490513828 started by scheduler interpreter_723077581
+    > INFO [2022-06-17 06:55:03,325] ({FIFOScheduler-interpreter_723077581-Worker-1} AbstractScheduler.java[runJob]:154) - Job paragraph_1655448901045_490513828 finished by scheduler interpreter_723077581 with status FINISHED
+    > INFO [2022-06-17 06:55:04,290] ({FIFOScheduler-interpreter_723077581-Worker-1} AbstractScheduler.java[runJob]:127) - Job paragraph_1655448901045_1652921845 started by scheduler interpreter_723077581
+    > INFO [2022-06-17 06:55:04,294] ({FIFOScheduler-interpreter_723077581-Worker-1} AbstractScheduler.java[runJob]:154) - Job paragraph_1655448901045_1652921845 finished by scheduler interpreter_723077581 with status FINISHED
+    > ....
+ > INFO [2022-06-17 06:55:04,389] ({pool-3-thread-1} SchedulerFactory.java[createOrGetFIFOScheduler]:76) - Create FIFOScheduler: interpreter_1994647206 + > INFO [2022-06-17 06:55:04,389] ({pool-3-thread-1} SchedulerFactory.java[createOrGetFIFOScheduler]:76) - Create FIFOScheduler: org.apache.zeppelin.spark.SparkRInterpreter1459163571 + > INFO [2022-06-17 06:55:04,390] ({pool-3-thread-1} SchedulerFactory.java[createOrGetFIFOScheduler]:76) - Create FIFOScheduler: interpreter_600913622 + > INFO [2022-06-17 06:55:04,390] ({pool-3-thread-1} SchedulerFactory.java[createOrGetFIFOScheduler]:76) - Create FIFOScheduler: interpreter_1255350662 + > INFO [2022-06-17 06:55:04,390] ({pool-3-thread-1} SchedulerFactory.java[createOrGetFIFOScheduler]:76) - Create FIFOScheduler: interpreter_1530179903 + > .... + + # + # This does not look good .. + + > .... + > INFO [2022-06-17 06:55:52,336] ({spark-dynamic-executor-allocation} Logging.scala[logInfo]:57) - Requesting to kill executor(s) 2, 8, 3 + > INFO [2022-06-17 06:55:52,340] ({spark-dynamic-executor-allocation} Logging.scala[logInfo]:57) - Actual list of executor(s) to be killed is 2, 8, 3 + > INFO [2022-06-17 06:55:52,373] ({spark-dynamic-executor-allocation} Logging.scala[logInfo]:57) - Executors 2,8,3 removed due to idle timeout. + > INFO [2022-06-17 06:55:52,473] ({spark-dynamic-executor-allocation} Logging.scala[logInfo]:57) - Requesting to kill executor(s) 7, 1, 4, 6 + > INFO [2022-06-17 06:55:52,473] ({spark-dynamic-executor-allocation} Logging.scala[logInfo]:57) - Actual list of executor(s) to be killed is 7, 1, 4, 6 + > INFO [2022-06-17 06:55:52,479] ({spark-dynamic-executor-allocation} Logging.scala[logInfo]:57) - Executors 7,1,4,6 removed due to idle timeout. + > INFO [2022-06-17 06:55:52,765] ({dispatcher-CoarseGrainedScheduler} Logging.scala[logInfo]:57) - Disabling executor 8. + > INFO [2022-06-17 06:55:52,769] ({dag-scheduler-event-loop} Logging.scala[logInfo]:57) - Executor lost: 8 (epoch 2) + > INFO [2022-06-17 06:55:52,770] ({dispatcher-CoarseGrainedScheduler} Logging.scala[logInfo]:57) - Disabling executor 2. + > INFO [2022-06-17 06:55:52,770] ({dispatcher-BlockManagerMaster} Logging.scala[logInfo]:57) - Trying to remove executor 8 from BlockManagerMaster. + > WARN [2022-06-17 06:55:52,771] ({dispatcher-BlockManagerMaster} Logging.scala[logWarning]:69) - No more replicas available for rdd_15_16 ! + > WARN [2022-06-17 06:55:52,771] ({dispatcher-BlockManagerMaster} Logging.scala[logWarning]:69) - No more replicas available for rdd_15_32 ! + > WARN [2022-06-17 06:55:52,771] ({dispatcher-BlockManagerMaster} Logging.scala[logWarning]:69) - No more replicas available for rdd_15_8 ! + > WARN [2022-06-17 06:55:52,771] ({dispatcher-BlockManagerMaster} Logging.scala[logWarning]:69) - No more replicas available for rdd_15_24 ! + > WARN [2022-06-17 06:55:52,771] ({dispatcher-BlockManagerMaster} Logging.scala[logWarning]:69) - No more replicas available for rdd_15_0 ! + > INFO [2022-06-17 06:55:52,772] ({dispatcher-BlockManagerMaster} Logging.scala[logInfo]:57) - Removing block manager BlockManagerId(8, worker02, 36583, None) + > INFO [2022-06-17 06:55:52,772] ({dag-scheduler-event-loop} Logging.scala[logInfo]:57) - Removed 8 successfully in removeExecutor + > INFO [2022-06-17 06:55:52,773] ({dag-scheduler-event-loop} Logging.scala[logInfo]:57) - Executor lost: 2 (epoch 2) + > INFO [2022-06-17 06:55:52,773] ({dispatcher-BlockManagerMaster} Logging.scala[logInfo]:57) - Trying to remove executor 2 from BlockManagerMaster. 
+    > WARN [2022-06-17 06:55:52,773] ({dispatcher-BlockManagerMaster} Logging.scala[logWarning]:69) - No more replicas available for rdd_15_195 !
+    > WARN [2022-06-17 06:55:52,773] ({dispatcher-BlockManagerMaster} Logging.scala[logWarning]:69) - No more replicas available for rdd_15_23 !
+    > ....
+    > ....
+    > WARN [2022-06-17 06:55:52,774] ({dispatcher-BlockManagerMaster} Logging.scala[logWarning]:69) - No more replicas available for rdd_15_82 !
+    > WARN [2022-06-17 06:55:52,774] ({dispatcher-BlockManagerMaster} Logging.scala[logWarning]:69) - No more replicas available for rdd_15_159 !
+    > INFO [2022-06-17 06:55:52,774] ({dispatcher-BlockManagerMaster} Logging.scala[logInfo]:57) - Removing block manager BlockManagerId(2, worker02, 33739, None)
+    > INFO [2022-06-17 06:55:52,774] ({dag-scheduler-event-loop} Logging.scala[logInfo]:57) - Removed 2 successfully in removeExecutor
+    > INFO [2022-06-17 06:55:52,780] ({dispatcher-CoarseGrainedScheduler} Logging.scala[logInfo]:57) - Disabling executor 6.
+    > INFO [2022-06-17 06:55:52,780] ({dag-scheduler-event-loop} Logging.scala[logInfo]:57) - Executor lost: 6 (epoch 2)
+    > INFO [2022-06-17 06:55:52,780] ({dispatcher-CoarseGrainedScheduler} Logging.scala[logInfo]:57) - Disabling executor 4.
+    > INFO [2022-06-17 06:55:52,780] ({dispatcher-BlockManagerMaster} Logging.scala[logInfo]:57) - Trying to remove executor 6 from BlockManagerMaster.
+    > WARN [2022-06-17 06:55:52,780] ({dispatcher-BlockManagerMaster} Logging.scala[logWarning]:69) - No more replicas available for rdd_15_64 !
+    > WARN [2022-06-17 06:55:52,780] ({dispatcher-BlockManagerMaster} Logging.scala[logWarning]:69) - No more replicas available for rdd_15_38 !
+    > ....
+    > ....
+    > WARN [2022-06-17 06:55:52,841] ({dispatcher-BlockManagerMaster} Logging.scala[logWarning]:69) - No more replicas available for rdd_15_136 !
+    > WARN [2022-06-17 06:55:52,841] ({dispatcher-BlockManagerMaster} Logging.scala[logWarning]:69) - No more replicas available for rdd_15_177 !
+    > INFO [2022-06-17 06:55:52,841] ({dispatcher-BlockManagerMaster} Logging.scala[logInfo]:57) - Removing block manager BlockManagerId(3, worker04, 38819, None)
+    > INFO [2022-06-17 06:55:52,841] ({dag-scheduler-event-loop} Logging.scala[logInfo]:57) - Removed 3 successfully in removeExecutor
+    > INFO [2022-06-17 06:55:52,848] ({dispatcher-CoarseGrainedScheduler} Logging.scala[logInfo]:57) - Executor 7 on worker04 killed by driver.
+    > INFO [2022-06-17 06:55:52,870] ({dispatcher-CoarseGrainedScheduler} Logging.scala[logInfo]:57) - Executor 3 on worker04 killed by driver.
+    > INFO [2022-06-17 07:21:13,066] ({dispatcher-BlockManagerMaster} Logging.scala[logInfo]:57) - Removed broadcast_9_piece0 on zeppelin:36463 in memory (size: 13.1 KiB, free: 30.5 GiB)
+    > INFO [2022-06-17 07:21:13,067] ({dispatcher-BlockManagerMaster} Logging.scala[logInfo]:57) - Removed broadcast_9_piece0 on worker06:37225 in memory (size: 13.1 KiB, free: 3.6 GiB)
+    > INFO [2022-06-17 07:21:13,072] ({dispatcher-BlockManagerMaster} Logging.scala[logInfo]:57) - Removed broadcast_10_piece0 on zeppelin:36463 in memory (size: 17.0 KiB, free: 30.5 GiB)
+    > INFO [2022-06-17 07:21:13,072] ({dispatcher-BlockManagerMaster} Logging.scala[logInfo]:57) - Removed broadcast_10_piece0 on worker06:37225 in memory (size: 17.0 KiB, free: 3.6 GiB)
+
+
+# -----------------------------------------------------
+# The process list in the client container matches this:
+# the benchmark processes are still there, and most of the
+# zdairi children are defunct.
+#[root@ansibler]
+
+    ps -ef
+
+    > UID          PID    PPID  C STIME TTY          TIME CMD
+    > root           1       0  0 Jun13 pts/0    00:00:01 bash
+    > root       21693       0  0 Jun14 pts/1    00:00:00 bash
+    > root       21715       1  0 Jun14 ?        00:02:42 ssh: /root/.ssh/fedora@128.232.222.196:22 [mux]
+    > root       26273   21693  0 Jun15 pts/1    00:00:00 ssh zeppelin
+    > root       36558       0  0 Jun15 pts/2    00:00:00 bash
+    > root       36578   36558  0 Jun15 pts/2    00:00:00 ssh zeppelin
+    > root       44529       1  0 02:38 pts/0    00:00:00 bash
+    > root       44530       1  0 02:38 pts/0    00:00:00 tee /tmp/test-loop.json
+    > root       49242   44529  0 06:46 pts/0    00:00:00 /bin/python3 /tmp/run-benchmark.py http://zeppelin:8080 /deployments/zeppelin/test/config/quick.json
+    > root       49243   49242  0 06:46 pts/0    00:00:00 /bin/python3 /tmp/run-benchmark.py http://zeppelin:8080 /deployments/zeppelin/test/config/quick.json
+    > ....
+    > ....
+    > root       49260   49242  0 06:46 pts/0    00:00:00 /bin/python3 /tmp/run-benchmark.py http://zeppelin:8080 /deployments/zeppelin/test/config/quick.json
+    > root       49261   49242  0 06:46 pts/0    00:00:00 /bin/python3 /tmp/run-benchmark.py http://zeppelin:8080 /deployments/zeppelin/test/config/quick.json
+    > root       49322   49243  0 06:48 pts/0    00:00:00 [zdairi] <defunct>
+    > root       49323   49243  0 06:48 pts/0    00:00:00 [zdairi] <defunct>
+    > ....
+    > ....
+    > root       49447   49250  0 06:54 pts/0    00:00:00 [zdairi] <defunct>
+    > root       49448   49250  0 06:54 pts/0    00:00:00 [zdairi] <defunct>
+    > root       49464   49254  0 06:55 pts/0    00:00:59 /usr/bin/python3 /usr/local/bin/zdairi --config /tmp/user12.yml notebook run --notebook 2H65J1XS4
+    > root       49465   49252  0 06:55 pts/0    00:00:00 [zdairi] <defunct>
+    > root       49466   49252  0 06:55 pts/0    00:00:00 [zdairi] <defunct>
+    > ....
+    > ....
+    > root       49562   49261  0 07:02 pts/0    00:00:00 [zdairi] <defunct>
+    > root       49563   49261  0 07:02 pts/0    00:00:00 [zdairi] <defunct>
+    > root       49564       0  0 10:19 pts/3    00:00:00 bash
+    > root       49582   49564  0 10:20 pts/3    00:00:00 ps -ef
+
+
+    cat /tmp/user12.yml
+
+    > zeppelin_url: http://zeppelin:8080
+    > zeppelin_auth: true
+    > zeppelin_user: Carrovieus
+    > zeppelin_password: ########
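+
+
+    #
+    # Each test user appears to get its own zdairi config file in /tmp.
+    # Assuming the same userNN.yml pattern for the rest (only user12.yml
+    # is shown above), this would map the zdairi processes back to
+    # Zeppelin users:
+    #
+
+        grep 'zeppelin_user' /tmp/user*.yml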
+
+
+# -----------------------------------------------------
+# Check the memory and disk on the workers.
+#[root@ansibler]
+
+    workers=(
+        worker01
+        worker02
+        worker03
+        worker04
+        worker05
+        worker06
+        )
+
+    for worker in "${workers[@]}"
+    do
+        echo ""
+        echo "worker [${worker}]"
+        ssh "${worker}" \
+            '
+            hostname
+            date
+            echo
+            free -h
+            echo
+            df -h /
+            '
+    done
+
+    > worker [worker01]
+    > iris-gaia-blue-20220613-worker01
+    > Fri Jun 17 11:13:14 UTC 2022
+    >
+    >               total        used        free      shared  buff/cache   available
+    > Mem:           42Gi       2.8Gi        13Gi       2.0Mi        25Gi        38Gi
+    > Swap:            0B          0B          0B
+    >
+    > Filesystem      Size  Used Avail Use% Mounted on
+    > /dev/vda1        20G  4.2G   15G  23% /
+
+    > worker [worker02]
+    > iris-gaia-blue-20220613-worker02
+    > Fri Jun 17 11:13:14 UTC 2022
+    >
+    >               total        used        free      shared  buff/cache   available
+    > Mem:           42Gi       2.7Gi        14Gi       2.0Mi        25Gi        38Gi
+    > Swap:            0B          0B          0B
+    >
+    > Filesystem      Size  Used Avail Use% Mounted on
+    > /dev/vda1        20G  4.2G   15G  23% /
+
+    > worker [worker03]
+    > iris-gaia-blue-20220613-worker03
+    > Fri Jun 17 11:13:14 UTC 2022
+    >
+    >               total        used        free      shared  buff/cache   available
+    > Mem:           42Gi       3.1Gi        13Gi       2.0Mi        25Gi        38Gi
+    > Swap:            0B          0B          0B
+    >
+    > Filesystem      Size  Used Avail Use% Mounted on
+    > /dev/vda1        20G  4.2G   15G  23% /
+
+    > worker [worker04]
+    > iris-gaia-blue-20220613-worker04
+    > Fri Jun 17 11:13:14 UTC 2022
+    >
+    >               total        used        free      shared  buff/cache   available
+    > Mem:           42Gi       3.3Gi        14Gi       2.0Mi        24Gi        38Gi
+    > Swap:            0B          0B          0B
+    >
+    > Filesystem      Size  Used Avail Use% Mounted on
+    > /dev/vda1        20G  4.2G   15G  23% /
+
+    > worker [worker05]
+    > iris-gaia-blue-20220613-worker05
+    > Fri Jun 17 11:13:14 UTC 2022
+    >
+    >               total        used        free      shared  buff/cache   available
+    > Mem:           42Gi       2.8Gi        12Gi       2.0Mi        26Gi        38Gi
+    > Swap:            0B          0B          0B
+    >
+    > Filesystem      Size  Used Avail Use% Mounted on
+    > /dev/vda1        20G  4.2G   15G  23% /
+
+    > worker [worker06]
+    > iris-gaia-blue-20220613-worker06
+    > Fri Jun 17 11:13:14 UTC 2022
+    >
+    >               total        used        free      shared  buff/cache   available
+    > Mem:           42Gi       5.5Gi        10Gi       2.0Mi        25Gi        36Gi
+    > Swap:            0B          0B          0B
+    >
+    > Filesystem      Size  Used Avail Use% Mounted on
+    > /dev/vda1        20G  4.2G   15G  23% /
+
+
+# -----------------------------------------------------
+# Kill off the stuck application.
+#[root@ansibler]
+
+    #
+    # Kill all applications on YARN which are in RUNNING state:
+    # https://stackoverflow.com/a/56035711
+    #
+
+    ssh master01
+
+        for appid in $(
+            yarn application -list -appStates RUNNING | awk 'NR > 2 { print $1 }'
+            )
+        do
+            echo ""
+            echo "App ID [${appid}]"
+            yarn application -kill "${appid}"
+        done
+
+
+        >
+        > 2022-06-17 14:54:47,149 INFO client.RMProxy: Connecting to ResourceManager at master01/10.10.0.35:8032
+        >
+        > App ID [application_1655122472463_1609]
+        > 2022-06-17 14:54:48,486 INFO client.RMProxy: Connecting to ResourceManager at master01/10.10.0.35:8032
+        > Killing application application_1655122472463_1609
+        > 2022-06-17 14:54:49,087 INFO impl.YarnClientImpl: Killed application application_1655122472463_1609
+
+
+
+
+# -----------------------------------------------------
+# Check the Zeppelin interpreter log.
+#[user@zeppelin]
+
+    tail -f zeppelin-interpreter-spark-Carrovieus-Carrovieus-$(id -un)-$(hostname).log
+
+    > ....
+    > ....
+    > INFO [2022-06-17 07:21:13,072] ({dispatcher-BlockManagerMaster} Logging.scala[logInfo]:57) - Removed broadcast_10_piece0 on zeppelin:36463 in memory (size: 17.0 KiB, free: 30.5 GiB)
+    > INFO [2022-06-17 07:21:13,072] ({dispatcher-BlockManagerMaster} Logging.scala[logInfo]:57) - Removed broadcast_10_piece0 on worker06:37225 in memory (size: 17.0 KiB, free: 3.6 GiB)
+    > ....
+ > INFO [2022-06-17 14:54:49,658] ({dispatcher-CoarseGrainedScheduler} Logging.scala[logInfo]:57) - Disabling executor 5. + > INFO [2022-06-17 14:54:49,659] ({dag-scheduler-event-loop} Logging.scala[logInfo]:57) - Executor lost: 5 (epoch 2) + > INFO [2022-06-17 14:54:49,660] ({dispatcher-BlockManagerMaster} Logging.scala[logInfo]:57) - Trying to remove executor 5 from BlockManagerMaster. + > WARN [2022-06-17 14:54:49,660] ({dispatcher-BlockManagerMaster} Logging.scala[logWarning]:69) - No more replicas available for rdd_15_137 ! + > ERROR [2022-06-17 14:54:49,660] ({rpc-server-4-1} TransportClient.java[operationComplete]:337) - Failed to send RPC RPC 7738039617265107252 to /10.10.1.113:53550: java.nio.channels.ClosedChannelException + > java.nio.channels.ClosedChannelException + > at io.netty.channel.AbstractChannel$AbstractUnsafe.newClosedChannelException(AbstractChannel.java:957) + > at io.netty.channel.AbstractChannel$AbstractUnsafe.write(AbstractChannel.java:865) + > at io.netty.channel.DefaultChannelPipeline$HeadContext.write(DefaultChannelPipeline.java:1367) + > at io.netty.channel.AbstractChannelHandlerContext.invokeWrite0(AbstractChannelHandlerContext.java:717) + > at io.netty.channel.AbstractChannelHandlerContext.invokeWriteAndFlush(AbstractChannelHandlerContext.java:764) + > at io.netty.channel.AbstractChannelHandlerContext$WriteTask.run(AbstractChannelHandlerContext.java:1071) + > at io.netty.util.concurrent.AbstractEventExecutor.safeExecute(AbstractEventExecutor.java:164) + > at io.netty.util.concurrent.SingleThreadEventExecutor.runAllTasks(SingleThreadEventExecutor.java:472) + > at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:500) + > at io.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:989) + > at io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74) + > at io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30) + > at java.lang.Thread.run(Thread.java:748) + > WARN [2022-06-17 14:54:49,660] ({dispatcher-BlockManagerMaster} Logging.scala[logWarning]:69) - No more replicas available for rdd_15_106 ! + > WARN [2022-06-17 14:54:49,661] ({dispatcher-BlockManagerMaster} Logging.scala[logWarning]:69) - No more replicas available for rdd_15_17 ! + > .... + > WARN [2022-06-17 14:54:49,662] ({dispatcher-BlockManagerMaster} Logging.scala[logWarning]:69) - No more replicas available for rdd_15_149 ! + > WARN [2022-06-17 14:54:49,662] ({dispatcher-BlockManagerMaster} Logging.scala[logWarning]:69) - No more replicas available for rdd_15_9 ! + > INFO [2022-06-17 14:54:49,662] ({dispatcher-BlockManagerMaster} Logging.scala[logInfo]:57) - Removing block manager BlockManagerId(5, worker06, 37225, None) + > INFO [2022-06-17 14:54:49,662] ({dag-scheduler-event-loop} Logging.scala[logInfo]:57) - Removed 5 successfully in removeExecutor + > WARN [2022-06-17 14:54:49,665] ({rpc-server-4-1} Logging.scala[logWarning]:90) - Attempted to get executor loss reason for executor id 5 at RPC address 10.10.2.198:35570, but got no response. Marking as agent lost. + > java.io.IOException: Failed to send RPC RPC 7738039617265107252 to /10.10.1.113:53550: java.nio.channels.ClosedChannelException + > at org.apache.spark.network.client.TransportClient$RpcChannelListener.handleFailure(TransportClient.java:363) + > at org.apache.spark.network.client.TransportClient$StdChannelListener.operationComplete(TransportClient.java:340) + > .... + > .... 
+    > Caused by: java.nio.channels.ClosedChannelException
+    >   at io.netty.channel.AbstractChannel$AbstractUnsafe.newClosedChannelException(AbstractChannel.java:957)
+    >   ... 12 more
+    > ERROR [2022-06-17 14:54:49,667] ({dispatcher-CoarseGrainedScheduler} Logging.scala[logError]:73) - Lost executor 5 on worker06: Executor Process Lost
+    > INFO [2022-06-17 14:54:49,695] ({YARN application state monitor} Logging.scala[logInfo]:57) - Deleted staging directory hdfs://master01:9000/albert/Carrovieus/.sparkStaging/application_1655122472463_1609
+    > ERROR [2022-06-17 14:54:49,696] ({YARN application state monitor} Logging.scala[logError]:73) - YARN application has exited unexpectedly with state KILLED! Check the YARN application logs for more details.
+    > ERROR [2022-06-17 14:54:49,697] ({YARN application state monitor} Logging.scala[logError]:73) - Diagnostics message: Application application_1655122472463_1609 was killed by user fedora at 10.10.0.35
+    > INFO [2022-06-17 14:54:49,705] ({YARN application state monitor} AbstractConnector.java[doStop]:381) - Stopped Spark@1abede1a{HTTP/1.1, (http/1.1)}{0.0.0.0:4051}
+    > INFO [2022-06-17 14:54:49,708] ({YARN application state monitor} Logging.scala[logInfo]:57) - Stopped Spark web UI at http://zeppelin:4051
+    > ERROR [2022-06-17 14:54:49,712] ({rpc-server-4-1} TransportClient.java[operationComplete]:337) - Failed to send RPC RPC 8965578969429977538 to /10.10.1.113:53550: java.nio.channels.ClosedChannelException
+    > java.nio.channels.ClosedChannelException
+    >   at io.netty.channel.AbstractChannel$AbstractUnsafe.newClosedChannelException(AbstractChannel.java:957)
+    >   at io.netty.channel.AbstractChannel$AbstractUnsafe.write(AbstractChannel.java:865)
+    > ....
+    > ....
+    > ERROR [2022-06-17 14:54:49,713] ({rpc-server-4-1} Logging.scala[logError]:94) - Sending RequestExecutors(Map(),Map(),Map(),Set()) to AM was unsuccessful
+    > java.io.IOException: Failed to send RPC RPC 8965578969429977538 to /10.10.1.113:53550: java.nio.channels.ClosedChannelException
+    >   at org.apache.spark.network.client.TransportClient$RpcChannelListener.handleFailure(TransportClient.java:363)
+    >   at org.apache.spark.network.client.TransportClient$StdChannelListener.operationComplete(TransportClient.java:340)
+    > ....
+    > ....
+    >   ... 12 more
+    > INFO [2022-06-17 14:54:49,721] ({dispatcher-event-loop-42} Logging.scala[logInfo]:57) - MapOutputTrackerMasterEndpoint stopped!
+    > INFO [2022-06-17 14:54:49,737] ({YARN application state monitor} Logging.scala[logInfo]:57) - MemoryStore cleared
+    > INFO [2022-06-17 14:54:49,737] ({YARN application state monitor} Logging.scala[logInfo]:57) - BlockManager stopped
+    > INFO [2022-06-17 14:54:49,739] ({YARN application state monitor} Logging.scala[logInfo]:57) - BlockManagerMaster stopped
+    > INFO [2022-06-17 14:54:49,742] ({dispatcher-event-loop-46} Logging.scala[logInfo]:57) - OutputCommitCoordinator stopped!
+    > INFO [2022-06-17 14:54:49,753] ({YARN application state monitor} Logging.scala[logInfo]:57) - Successfully stopped SparkContext
+
+
+    #
+    # Our stuck application has gone (killed).
+    # Our AglaisBenchmarker test is still stuck.
+    #
+
+
+# -----------------------------------------------------
+# Kill off the test with Ctrl-C.
+#[root@ansibler]
+
+
+    > ....
+    > ....
+    > Process ForkPoolWorker-19:
+    > Process ForkPoolWorker-16:
+    > Process ForkPoolWorker-13:
+    > Process ForkPoolWorker-3:
+    > Process ForkPoolWorker-17:
+    > Process ForkPoolWorker-18:
+    > Process ForkPoolWorker-14:
+    > Process ForkPoolWorker-10:
+    > Process ForkPoolWorker-9:
+    > Process ForkPoolWorker-8:
+    > Traceback (most recent call last):
+    >   File "/tmp/run-benchmark.py", line 46, in <module>
+    >     AglaisBenchmarker(
+    >   File "/usr/local/lib/python3.10/site-packages/aglais_benchmark/aglais_benchmark.py", line 194, in run
+    >     results = self._run_parallel(users, delay_start, delay_notebook, delete)
+    >   File "/usr/local/lib/python3.10/site-packages/aglais_benchmark/aglais_benchmark.py", line 231, in _run_parallel
+    >     results = pool.starmap(self._run_single, list(zip(range(concurrent_users), [True]*concurrent_users, [delay_start]*concurrent_users, [delay_notebook]*concurrent_users, [delete]*concurrent_users)))
+    >   File "/usr/lib64/python3.10/multiprocessing/pool.py", line 372, in starmap
+    >     return self._map_async(func, iterable, starmapstar, chunksize).get()
+    >   File "/usr/lib64/python3.10/multiprocessing/pool.py", line 765, in get
+    >     self.wait(timeout)
+    >   File "/usr/lib64/python3.10/multiprocessing/pool.py", line 762, in wait
+    >     self._event.wait(timeout)
+    >   File "/usr/lib64/python3.10/threading.py", line 600, in wait
+    >     signaled = self._cond.wait(timeout)
+    >   File "/usr/lib64/python3.10/threading.py", line 320, in wait
+    >     waiter.acquire()
+    > KeyboardInterrupt
+
+
+# -----------------------------------------------------
+# Test with 19 users doing 2 loops until 5pm.
+#[root@ansibler]
+
+    #   local usercount=${1:?'usercount required'}
+    #   local loopfinish=${2:?'loopfinish required'}
+    #   local looppause=${3:-10}
+    #   local delaystart=${4:-1}
+    #   local delaynotebook=${5:-1}
+
+    long-loop 19 17 10 5 5 \
+        | tee /tmp/test-loop.json
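+
+    #
+    # The long-loop helper itself isn't recorded in these notes.
+    # A minimal sketch of the shape implied by the parameter comments
+    # above - an illustration only, not the actual implementation
+    # (the run-benchmark.py call is copied from the process list seen
+    # earlier; how usercount and the delays reach it isn't visible here):
+    #
+    #   long-loop()
+    #       {
+    #       local usercount=${1:?'usercount required'}
+    #       local loopfinish=${2:?'loopfinish required'}
+    #       local looppause=${3:-10}
+    #       local delaystart=${4:-1}
+    #       local delaynotebook=${5:-1}
+    #
+    #       # Loop until the local hour reaches the finish hour.
+    #       while [ "$(date '+%H')" -lt "${loopfinish}" ]
+    #       do
+    #           /bin/python3 /tmp/run-benchmark.py \
+    #               'http://zeppelin:8080' \
+    #               '/deployments/zeppelin/test/config/quick.json'
+    #           sleep "${looppause}"
+    #       done
+    #       }
+    #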
+
+
+    #
+    # Test started, but stalled waiting for the 19th user.
+    # Applications for the other users ran and are now FINISHED.
+    # No application was created for the user that was stalled in the last run.
+    #
+
+    #
+    # Looks like SparkInterpreter was created.
+    #
+
+    > ....
+    > INFO [2022-06-16 15:03:11,991] ({qtp686466458-725679} LoginRestApi.java[postLogin]:249) - {"status":"OK","message":"","body":{"principal":"Carrovieus","ticket":"8d1973e0-2796-43f3-82cd-968a6dee765a","roles":"[\"user\"]"}}
+    > INFO [2022-06-16 15:03:12,081] ({SchedulerFactory88} RemoteInterpreter.java[lambda$internal_create$1]:160) - Create RemoteInterpreter org.apache.zeppelin.spark.SparkInterpreter
+    > INFO [2022-06-16 15:03:12,082] ({qtp686466458-725598} NotebookRestApi.java[getNoteJobStatus]:783) - Get note job status.
+    > INFO [2022-06-16 15:03:12,125] ({qtp686466458-725563} NotebookRestApi.java[runParagraph]:836) - Run paragraph job asynchronously 2H6HKN3WH paragraph_1655391791337_454665494
+    > INFO [2022-06-16 15:03:12,127] ({qtp686466458-725563} NotebookService.java[runParagraph]:346) - Start to run paragraph: paragraph_1655391791337_454665494 of note: 2H6HKN3WH
+    > INFO [2022-06-16 15:03:12,127] ({qtp686466458-725563} VFSNotebookRepo.java[save]:144) - Saving note 2H6HKN3WH to tmp/ZDLBZVW2KC.json_2H6HKN3WH.zpln
+    > INFO [2022-06-16 15:03:12,131] ({qtp686466458-725563} InterpreterSetting.java[getOrCreateInterpreterGroup]:454) - Create InterpreterGroup with groupId: spark-Carrovieus for ExecutionContext{user='Carrovieus', noteId='2H6HKN3WH', interpreterGroupId='null', defaultInterpreterGroup='spark', inIsolatedMode=false, startTime=}
+    > INFO [2022-06-16 15:03:12,131] ({qtp686466458-725563} InterpreterSetting.java[createInterpreters]:823) - Interpreter org.apache.zeppelin.spark.SparkInterpreter created for user: Carrovieus, sessionId: shared_session
+    > INFO [2022-06-16 15:03:12,131] ({qtp686466458-725563} InterpreterSetting.java[createInterpreters]:823) - Interpreter org.apache.zeppelin.spark.SparkSqlInterpreter created for user: Carrovieus, sessionId: shared_session
+    > INFO [2022-06-16 15:03:12,131] ({qtp686466458-725563} InterpreterSetting.java[createInterpreters]:823) - Interpreter org.apache.zeppelin.spark.PySparkInterpreter created for user: Carrovieus, sessionId: shared_session
+    > INFO [2022-06-16 15:03:12,131] ({qtp686466458-725563} InterpreterSetting.java[createInterpreters]:823) - Interpreter org.apache.zeppelin.spark.IPySparkInterpreter created for user: Carrovieus, sessionId: shared_session
+    > INFO [2022-06-16 15:03:12,132] ({qtp686466458-725563} InterpreterSetting.java[createInterpreters]:823) - Interpreter org.apache.zeppelin.spark.SparkRInterpreter created for user: Carrovieus, sessionId: shared_session
+    > INFO [2022-06-16 15:03:12,132] ({qtp686466458-725563} InterpreterSetting.java[createInterpreters]:823) - Interpreter org.apache.zeppelin.spark.SparkIRInterpreter created for user: Carrovieus, sessionId: shared_session
+    > INFO [2022-06-16 15:03:12,132] ({qtp686466458-725563} InterpreterSetting.java[createInterpreters]:823) - Interpreter org.apache.zeppelin.spark.SparkShinyInterpreter created for user: Carrovieus, sessionId: shared_session
+    > INFO [2022-06-16 15:03:12,132] ({qtp686466458-725563} InterpreterSetting.java[createInterpreters]:823) - Interpreter org.apache.zeppelin.spark.KotlinSparkInterpreter created for user: Carrovieus, sessionId: shared_session
+    > INFO [2022-06-16 15:03:12,132] ({qtp686466458-725563} ManagedInterpreterGroup.java[getOrCreateSession]:180) - Create Session: shared_session in InterpreterGroup: spark-Carrovieus for user: Carrovieus
+    > INFO [2022-06-16 15:03:12,133]
({SchedulerFactory93} AbstractScheduler.java[runJob]:127) - Job paragraph_1655391791337_454665494 started by scheduler RemoteInterpreter-spark-Carrovieus-shared_session + > INFO [2022-06-16 15:03:12,135] ({SchedulerFactory93} Paragraph.java[jobRun]:416) - Run paragraph [paragraph_id: paragraph_1655391791337_454665494, interpreter: org.apache.zeppelin.spark.PySparkInterpreter, note_id: 2H6HKN3WH, user: Carrovieus] + > INFO [2022-06-16 15:03:12,135] ({SchedulerFactory93} ManagedInterpreterGroup.java[getOrCreateInterpreterProcess]:65) - Create InterpreterProcess for InterpreterGroup: spark-Carrovieus + > INFO [2022-06-16 15:03:12,137] ({SchedulerFactory93} PluginManager.java[loadInterpreterLauncher]:154) - Loading Interpreter Launcher Plugin: org.apache.zeppelin.interpreter.launcher.SparkInterpreterLauncher + > INFO [2022-06-16 15:03:12,137] ({SchedulerFactory93} StandardInterpreterLauncher.java[launchDirectly]:50) - Launching new interpreter process of spark + > INFO [2022-06-16 15:03:12,137] ({SchedulerFactory93} SparkInterpreterLauncher.java[buildEnvFromProperties]:213) - Run Spark under non-secure mode as no keytab and principal is specified + > INFO [2022-06-16 15:03:12,138] ({SchedulerFactory93} SparkInterpreterLauncher.java[buildEnvFromProperties]:245) - buildEnvFromProperties: {PATH=/home/fedora/.local/bin:/home/fedora/bin:/usr/local/bin:/usr/bin:/usr/local/sbin:/usr/sbin:/opt/aglais/bin:/opt/hadoop/bin:/opt/hadoop/sbin:/opt/spark/python:/opt/spark/bin:/home/fedora/zeppelin/bin, HADOOP_CONF_DIR=/opt/hadoop/etc/hadoop, ZEPPELIN_LOG_DIR=/home/fedora/zeppelin/logs, ZEPPELIN_WAR=/home/fedora/zeppelin/zeppelin-web-0.10.0.war, ZEPPELIN_ENCODING=UTF-8, ZEPPELIN_SPARK_CONF=--proxy-user|Carrovieus|--conf|spark.yarn.dist.archives=/opt/spark/R/lib/sparkr.zip#sparkr|--conf|spark.submit.deployMode=client|--conf|spark.webui.yarn.useProxy=false|--conf|spark.yarn.isPython=true|--conf|spark.app.name=spark-Carrovieus|--conf|spark.master=yarn, ZEPPELIN_NICENESS=0, HADOOP_DATA=/var/hadoop/data, JAVA_OPTS= -Dfile.encoding=UTF-8 -Xmx1024m -Dlog4j.configuration=file:///home/fedora/zeppelin/conf/log4j.properties -Dzeppelin.log.file=/home/fedora/zeppelin/logs/zeppelin-fedora-iris-gaia-blue-20220613-zeppelin.log, DBUS_SESSION_BUS_ADDRESS=unix:path=/run/user/1000/bus, JAVA_INTP_OPTS= -Dfile.encoding=UTF-8 -Dlog4j.configuration=file:///home/fedora/zeppelin/conf/log4j.properties -Dlog4j.configurationFile=file:///home/fedora/zeppelin/conf/log4j2.properties, ZEPPELIN_CONF_DIR=/home/fedora/zeppelin/conf, LOGNAME=fedora, PWD=/home/fedora, PYTHONPATH=:/opt/spark/python:/opt/spark/python/lib/py4j-0.10.4-src.zip, LESSOPEN=||/usr/bin/lesspipe.sh %s, SHELL=/bin/bash, ZEPPELIN_INTP_MEM=-Xmx1024m, SELINUX_USE_CURRENT_RANGE=, PYSPARK_DRIVER_PYTHON=python, ZEPPELIN_ANGULAR_WAR=/home/fedora/zeppelin/zeppelin-web-angular-0.10.0.war, HADOOP_HOME=/opt/hadoop, SHLVL=1, MASTER=yarn-client, JAVA_HOME=/etc/alternatives/jre, INTERPRETER_GROUP_ID=spark-Carrovieus, LANG=en_US.UTF-8, XDG_SESSION_ID=3, XDG_SESSION_TYPE=tty, SELINUX_LEVEL_REQUESTED=, PYSPARK_PYTHON=python, SELINUX_ROLE_REQUESTED=, SPARK_HOME=/opt/spark, ZEPPELIN_RUNNER=/etc/alternatives/jre/bin/java, _=/usr/bin/nohup, XDG_SESSION_CLASS=user, ZEPPELIN_HOME=/home/fedora/zeppelin, SSH_CLIENT=90.155.51.57 34062 22, USER=fedora, ZEPPELIN_PID_DIR=/home/fedora/zeppelin/run, ZEPPELIN_MEM=-Xmx1024m, SSH_AUTH_SOCK=/tmp/ssh-sdEjU07vpR/agent.1623, SSH_CONNECTION=90.155.51.57 34062 10.10.2.210 22, ZEPPELIN_IDENT_STRING=fedora, PYSPARK_PIN_THREAD=true, HADOOP_LOG_DIR=/var/hadoop/logs, 
ZEPPELIN_INTERPRETER_REMOTE_RUNNER=bin/interpreter.sh, XDG_RUNTIME_DIR=/run/user/1000, HOME=/home/fedora} + > INFO [2022-06-16 15:03:12,160] ({SchedulerFactory88} RemoteInterpreter.java[lambda$internal_create$1]:160) - Create RemoteInterpreter org.apache.zeppelin.spark.SparkSqlInterpreter + > INFO [2022-06-16 15:03:12,162] ({SchedulerFactory88} RemoteInterpreter.java[lambda$internal_create$1]:160) - Create RemoteInterpreter org.apache.zeppelin.spark.PySparkInterpreter + > INFO [2022-06-16 15:03:12,166] ({qtp686466458-725682} LoginRestApi.java[postLogin]:249) - {"status":"OK","message":"","body":{"principal":"Pierione","ticket":"a04d17a2-7dcd-4585-9271-96a1cefdfed9","roles":"[\"user\"]"}} + > INFO [2022-06-16 15:03:12,167] ({SchedulerFactory88} RemoteInterpreter.java[lambda$internal_create$1]:160) - Create RemoteInterpreter org.apache.zeppelin.spark.IPySparkInterpreter + > INFO [2022-06-16 15:03:12,171] ({SchedulerFactory88} RemoteInterpreter.java[lambda$internal_create$1]:160) - Create RemoteInterpreter org.apache.zeppelin.spark.SparkRInterpreter + > INFO [2022-06-16 15:03:12,196] ({SchedulerFactory88} RemoteInterpreter.java[lambda$internal_create$1]:160) - Create RemoteInterpreter org.apache.zeppelin.spark.SparkIRInterpreter + > INFO [2022-06-16 15:03:12,198] ({SchedulerFactory88} RemoteInterpreter.java[lambda$internal_create$1]:160) - Create RemoteInterpreter org.apache.zeppelin.spark.SparkShinyInterpreter + > INFO [2022-06-16 15:03:12,199] ({SchedulerFactory93} ProcessLauncher.java[transition]:109) - Process state is transitioned to LAUNCHED + > INFO [2022-06-16 15:03:12,199] ({SchedulerFactory93} ProcessLauncher.java[launch]:96) - Process is launched: [/home/fedora/zeppelin/bin/interpreter.sh, -d, /home/fedora/zeppelin/interpreter/spark, -c, 10.10.2.210, -p, 35643, -r, :, -i, spark-Carrovieus, -u, Carrovieus, -l, /home/fedora/zeppelin/local-repo/spark, -g, spark] + > .... + + # + # No idea .... + # + + +# ----------------------------------------------------- +# Restart Zeppelin ... +#[root@ansibler] + + /deployments/hadoop-yarn/bin/restart-zeppelin.sh + + > Zeppelin stop [ OK ] + > Zeppelin start [ OK ] + + # + # Test continues as soon as Zeppelin is restarted. 
+ # + + + INFO [2022-06-17 16:29:20,858] ({main} ZeppelinServer.java[main]:265) - Done, zeppelin server started + INFO [2022-06-17 16:29:21,337] ({Exec Stream Pumper} ProcessLauncher.java[processLine]:189) - [INFO] Interpreter launch command: ssh Carrovieus@localhost source /home/fedora/zeppelin/conf/zeppelin-env.sh; /etc/alternatives/jre/bin/java -Dfile.encoding=UTF-8 -Dlog4j.configuration=file:///home/fedora/zeppelin/conf/log4j.properties -Dlog4j.configurationFile=file:///home/fedora/zeppelin/conf/log4j2.properties -Dzeppelin.log.file=/home/fedora/zeppelin/logs/zeppelin-interpreter-md-Carrovieus-Carrovieus-fedora-iris-gaia-blue-20220613-zeppelin.log -Xmx1024m -cp :/home/fedora/zeppelin/local-repo/md/*:/home/fedora/zeppelin/interpreter/md/*:::/home/fedora/zeppelin/interpreter/zeppelin-interpreter-shaded-0.10.0.jar org.apache.zeppelin.interpreter.remote.RemoteInterpreterServer 10.10.2.210 39349 md-Carrovieus : + INFO [2022-06-17 16:29:23,096] ({pool-7-thread-2} RemoteInterpreterEventServer.java[registerInterpreterProcess]:183) - Register interpreter process: 10.10.2.210:43937, interpreterGroup: md-Carrovieus + INFO [2022-06-17 16:29:23,096] ({pool-7-thread-2} ProcessLauncher.java[transition]:109) - Process state is transitioned to RUNNING + INFO [2022-06-17 16:29:23,259] ({SchedulerFactory2} RemoteInterpreter.java[lambda$internal_create$1]:160) - Create RemoteInterpreter org.apache.zeppelin.markdown.Markdown + INFO [2022-06-17 16:29:23,343] ({SchedulerFactory2} RemoteInterpreter.java[lambda$open$0]:134) - Open RemoteInterpreter org.apache.zeppelin.markdown.Markdown + INFO [2022-06-17 16:29:23,343] ({SchedulerFactory2} RemoteInterpreter.java[pushAngularObjectRegistryToRemote]:393) - Push local angular object registry from ZeppelinServer to remote interpreter group md-Carrovieus + INFO [2022-06-17 16:29:23,508] ({JobStatusPoller-paragraph_1655483358576_1138661423} NotebookServer.java[onStatusChange]:1989) - Job paragraph_1655483358576_1138661423 starts to RUNNING + INFO [2022-06-17 16:29:23,509] ({JobStatusPoller-paragraph_1655483358576_1138661423} VFSNotebookRepo.java[save]:144) - Saving note 2H5C33YE1 to tmp/TNHOY4K62F.json_2H5C33YE1.zpln + INFO [2022-06-17 16:29:23,649] ({SchedulerFactory2} NotebookServer.java[onStatusChange]:1984) - Job paragraph_1655483358576_1138661423 is finished successfully, status: FINISHED + INFO [2022-06-17 16:29:23,650] ({SchedulerFactory2} VFSNotebookRepo.java[save]:144) - Saving note 2H5C33YE1 to tmp/TNHOY4K62F.json_2H5C33YE1.zpln + INFO [2022-06-17 16:29:23,653] ({SchedulerFactory2} AbstractScheduler.java[runJob]:154) - Job paragraph_1655483358576_1138661423 finished by scheduler RemoteInterpreter-md-Carrovieus-shared_session with status FINISHED + INFO [2022-06-17 16:29:24,038] ({qtp2128029086-40} NotebookRestApi.java[runParagraph]:836) - Run paragraph job asynchronously 2H5C33YE1 paragraph_1655483358578_1063825342 + INFO [2022-06-17 16:29:24,040] ({qtp2128029086-40} NotebookService.java[runParagraph]:346) - Start to run paragraph: paragraph_1655483358578_1063825342 of note: 2H5C33YE1 + INFO [2022-06-17 16:29:24,040] ({qtp2128029086-40} VFSNotebookRepo.java[save]:144) - Saving note 2H5C33YE1 to tmp/TNHOY4K62F.json_2H5C33YE1.zpln + INFO [2022-06-17 16:29:24,044] ({SchedulerFactory3} AbstractScheduler.java[runJob]:127) - Job paragraph_1655483358578_1063825342 started by scheduler RemoteInterpreter-spark-Carrovieus-shared_session + INFO [2022-06-17 16:29:24,044] ({SchedulerFactory3} Paragraph.java[jobRun]:416) - Run paragraph [paragraph_id: 
paragraph_1655483358578_1063825342, interpreter: org.apache.zeppelin.spark.PySparkInterpreter, note_id: 2H5C33YE1, user: Carrovieus] + + INFO [2022-06-17 16:29:24,049] ({SchedulerFactory3} ProcessLauncher.java[transition]:109) - Process state is transitioned to LAUNCHED + INFO [2022-06-17 16:29:24,049] ({SchedulerFactory3} ProcessLauncher.java[launch]:96) - Process is launched: [/home/fedora/zeppelin/bin/interpreter.sh, -d, /home/fedora/zeppelin/interpreter/spark, -c, 10.10.2.210, -p, 39349, -r, :, -i, spark-Carrovieus, -u, Carrovieus, -l, /home/fedora/zeppelin/local-repo/spark, -g, spark] + INFO [2022-06-17 16:29:24,519] ({Exec Stream Pumper} ProcessLauncher.java[processLine]:189) - [INFO] Interpreter launch command: /opt/spark/bin/spark-submit --class org.apache.zeppelin.interpreter.remote.RemoteInterpreterServer --driver-class-path :/home/fedora/zeppelin/local-repo/spark/*:/home/fedora/zeppelin/interpreter/spark/*:::/home/fedora/zeppelin/interpreter/zeppelin-interpreter-shaded-0.10.0.jar:/home/fedora/zeppelin/interpreter/spark/spark-interpreter-0.10.0.jar:/opt/hadoop/etc/hadoop --driver-java-options -Dfile.encoding=UTF-8 -Dlog4j.configuration=file:///home/fedora/zeppelin/conf/log4j.properties -Dlog4j.configurationFile=file:///home/fedora/zeppelin/conf/log4j2.properties -Dzeppelin.log.file=/home/fedora/zeppelin/logs/zeppelin-interpreter-spark-Carrovieus-Carrovieus-fedora-iris-gaia-blue-20220613-zeppelin.log --proxy-user Carrovieus --conf spark.yarn.dist.archives=/opt/spark/R/lib/sparkr.zip#sparkr --conf spark.submit.deployMode=client --conf spark.webui.yarn.useProxy=false --conf spark.yarn.isPython=true --conf spark.app.name=spark-Carrovieus --conf spark.master=yarn /home/fedora/zeppelin/interpreter/spark/spark-interpreter-0.10.0.jar 10.10.2.210 39349 spark-Carrovieus : + INFO [2022-06-17 16:29:27,413] ({pool-7-thread-5} RemoteInterpreterEventServer.java[registerInterpreterProcess]:183) - Register interpreter process: 10.10.2.210:37509, interpreterGroup: spark-Carrovieus + INFO [2022-06-17 16:29:27,413] ({pool-7-thread-5} ProcessLauncher.java[transition]:109) - Process state is transitioned to RUNNING + INFO [2022-06-17 16:29:27,538] ({qtp2128029086-125} NotebookServer.java[onOpen]:246) - New connection from 10.10.2.210:36970 + INFO [2022-06-17 16:29:27,548] ({SchedulerFactory3} RemoteInterpreter.java[lambda$internal_create$1]:160) - Create RemoteInterpreter org.apache.zeppelin.spark.SparkInterpreter + INFO [2022-06-17 16:29:27,619] ({SchedulerFactory3} RemoteInterpreter.java[lambda$internal_create$1]:160) - Create RemoteInterpreter org.apache.zeppelin.spark.SparkSqlInterpreter + INFO [2022-06-17 16:29:27,621] ({SchedulerFactory3} RemoteInterpreter.java[lambda$internal_create$1]:160) - Create RemoteInterpreter org.apache.zeppelin.spark.PySparkInterpreter + INFO [2022-06-17 16:29:27,624] ({SchedulerFactory3} RemoteInterpreter.java[lambda$internal_create$1]:160) - Create RemoteInterpreter org.apache.zeppelin.spark.IPySparkInterpreter + INFO [2022-06-17 16:29:27,627] ({SchedulerFactory3} RemoteInterpreter.java[lambda$internal_create$1]:160) - Create RemoteInterpreter org.apache.zeppelin.spark.SparkRInterpreter + INFO [2022-06-17 16:29:27,629] ({SchedulerFactory3} RemoteInterpreter.java[lambda$internal_create$1]:160) - Create RemoteInterpreter org.apache.zeppelin.spark.SparkIRInterpreter + INFO [2022-06-17 16:29:27,631] ({SchedulerFactory3} RemoteInterpreter.java[lambda$internal_create$1]:160) - Create RemoteInterpreter org.apache.zeppelin.spark.SparkShinyInterpreter + INFO [2022-06-17 
16:29:27,632] ({SchedulerFactory3} RemoteInterpreter.java[lambda$internal_create$1]:160) - Create RemoteInterpreter org.apache.zeppelin.spark.KotlinSparkInterpreter + + # + # Test fails because "UnresolvedRelation [gaia_source]" + # + + INFO [2022-06-17 16:29:27,662] ({SchedulerFactory3} RemoteInterpreter.java[lambda$open$0]:134) - Open RemoteInterpreter org.apache.zeppelin.spark.PySparkInterpreter + INFO [2022-06-17 16:29:27,663] ({SchedulerFactory3} RemoteInterpreter.java[pushAngularObjectRegistryToRemote]:393) - Push local angular object registry from ZeppelinServer to remote interpreter group spark-Carrovieus + INFO [2022-06-17 16:29:51,384] ({JobStatusPoller-paragraph_1655483358578_1063825342} NotebookServer.java[onStatusChange]:1989) - Job paragraph_1655483358578_1063825342 starts to RUNNING + INFO [2022-06-17 16:29:51,385] ({JobStatusPoller-paragraph_1655483358578_1063825342} VFSNotebookRepo.java[save]:144) - Saving note 2H5C33YE1 to tmp/TNHOY4K62F.json_2H5C33YE1.zpln + WARN [2022-06-17 16:29:53,398] ({SchedulerFactory3} NotebookServer.java[onStatusChange]:1986) - Job paragraph_1655483358578_1063825342 is finished, status: ERROR, exception: null, result: %text Fail to execute line 21: df = spark.sql("SELECT FLOOR(source_id / %d"%(divisor) + ") AS hpx_id, COUNT(*) AS n FROM gaia_source GROUP BY hpx_id") +Traceback (most recent call last): + File "/tmp/1655483391411-0/zeppelin_python.py", line 158, in + exec(code, _zcUserQueryNameSpace) + File "", line 21, in + File "/opt/spark/python/pyspark/sql/session.py", line 723, in sql + return DataFrame(self._jsparkSession.sql(sqlQuery), self._wrapped) + File "/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py", line 1305, in __call__ + answer, self.gateway_client, self.target_id, self.name) + File "/opt/spark/python/pyspark/sql/utils.py", line 117, in deco + raise converted from None +pyspark.sql.utils.AnalysisException: Table or view not found: gaia_source; line 1 pos 72; +'Aggregate ['hpx_id], ['FLOOR(('source_id / 140737488355328)) AS hpx_id#0, count(1) AS n#1L] ++- 'UnresolvedRelation [gaia_source], [], false + + # + # Test moves on to next iteration. + # It recovers, but the test is unreliable. + # + + + diff --git a/notes/zrq/20220617-02-hadoop-rest.txt b/notes/zrq/20220617-02-hadoop-rest.txt new file mode 100644 index 00000000..041d24ef --- /dev/null +++ b/notes/zrq/20220617-02-hadoop-rest.txt @@ -0,0 +1,87 @@ +# +# +# +# Copyright (c) 2022, ROE (http://www.roe.ac.uk/) +# +# This information is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This information is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +# +# +#zrq-notes-time +#zrq-notes-indent +#zrq-notes-crypto +#zrq-notes-ansible +#zrq-notes-osformat +#zrq-notes-zeppelin +# + + Target: + + Exploring the Hadoop REST API. + https://hadoop.apache.org/docs/stable/hadoop-yarn/hadoop-yarn-site/ResourceManagerRest.html + + Result: + + Work in progress ... 
+        Just scrappy notes at this point.
+
+
+# -----------------------------------------------------
+
+    ssh zeppelin
+
+        curl \
+            --silent \
+            http://master01:8088/ws/v1/cluster/apps \
+            | jq '.'
+
+        > {
+        >   "apps": {
+        >     "app": [
+        >       {
+        >         "id": "application_1655122472463_1076",
+        >         "user": "Mavaca",
+        >         "name": "spark-Mavaca",
+        >         ....
+        >         ....
+        >       },
+        >       ....
+        >       ....
+        >       {
+        >         "id": "application_1655122472463_1181",
+        >         "user": "Balline",
+        >         "name": "spark-Balline",
+        >         ....
+        >         ....
+        >       }
+        >     ]
+        >   }
+        > }
+
+
+        curl \
+            --silent \
+            http://master01:8088/ws/v1/cluster/scheduler \
+            | jq '.'
+
+        > {
+        > ....
+        > ....
+        > }
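+
+
+# -----------------------------------------------------
+
+    #
+    # The apps endpoint accepts query parameters, and jq can trim the
+    # response down to the fields we care about. A sketch, assuming the
+    # JSON layout shown above (the 'state' field is part of the standard
+    # ResourceManager API, although it is elided from the output above):
+    #
+
+        curl \
+            --silent \
+            'http://master01:8088/ws/v1/cluster/apps?states=RUNNING' \
+            | jq '.apps.app[] | {id, user, name, state}'
+
+    #
+    # The same API can also kill an application, via a PUT request to the
+    # application state resource, which would avoid the ssh + yarn CLI step
+    # used earlier. A sketch (the application id is just an example, and
+    # under simple authentication a user.name query parameter may be needed):
+    #
+
+        curl \
+            --silent \
+            --request PUT \
+            --header 'Content-Type: application/json' \
+            --data '{"state": "KILLED"}' \
+            'http://master01:8088/ws/v1/cluster/apps/application_1655122472463_1609/state'
+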