diff --git a/experiments/kubernetes/bin/create-all.sh b/experiments/kubernetes/bin/create-all.sh index 1ceaac73..e8e8274d 100755 --- a/experiments/kubernetes/bin/create-all.sh +++ b/experiments/kubernetes/bin/create-all.sh @@ -67,7 +67,7 @@ yq write \ --inplace \ "${statusyml:?}" \ - 'aglais.spec.openstack.cloudname' \ + 'aglais.status.openstack.cloudname' \ "${cloudname}" echo "---- ---- ----" @@ -99,7 +99,7 @@ yq write \ --inplace \ "${statusyml:?}" \ - 'aglais.status.openstack.cluster.id' \ + 'aglais.status.openstack.magnum.uuid' \ "${clusterid}" # ----------------------------------------------------- @@ -192,13 +192,22 @@ EOF #TODO Patch the k8s metrics + # We can't capture the external IP address here because it won't be ready yet. + + yq write \ + --inplace \ + "${statusyml:?}" \ + 'aglais.status.kubernetes.ingress.dashboard.hostname' \ + "${dashhost}" + # ----------------------------------------------------- # Mount the data shares. # Using a hard coded cloud name to make it portable. +# Hard coded mode to 'rw' due to problems with ReadOnlyMany sharelist='/common/manila/datashares.yaml' - sharemode='ro' + sharemode='rw' for shareid in $( yq read \ @@ -283,6 +292,14 @@ EOF "/kubernetes/helm/tools/zeppelin" \ --values "/tmp/zeppelin-values.yaml" + # We can't capture the IP address here because it won't be ready yet. + + yq write \ + --inplace \ + "${statusyml:?}" \ + 'aglais.status.kubernetes.ingress.zeppelin.hostname' \ + "${zepphost}" + # ----------------------------------------------------- # Install our Drupal chart. @@ -307,3 +324,57 @@ EOF cat > "/tmp/drupal-values.yaml" << EOF drupal_server_hostname: "${drupalhost:?}" EOF + + + + +# ----------------------------------------------------- +# Capture our Dashboard ingress IP address. +# ** This has to be done after a delay to allow Kubernetes time to allocate the IP address. + +# sleep 30 + + daship=$( + kubectl \ + --namespace "${namespace:?}" \ + get Ingress \ + --output json \ + | jq -r ' + .items[] + | select(.metadata.name == "aglais-dashboard-kubernetes-dashboard") + | .status.loadBalancer.ingress[0].ip + ' + ) + + yq write \ + --inplace \ + '/tmp/aglais-status.yml' \ + 'aglais.status.kubernetes.ingress.dashboard.ipv4' \ + "${daship}" + + +# ----------------------------------------------------- +# Capture our Zeppelin ingress IP address. +# ** This has to be done after a delay to allow Kubernetes time to allocate the IP address. + +# sleep 30 + + zeppip=$( + kubectl \ + --namespace "${namespace:?}" \ + get Ingress \ + --output json \ + | jq -r ' + .items[] + | select(.metadata.name == "zeppelin-server-ingress") + | .status.loadBalancer.ingress[0].ip + ' + ) + + yq write \ + --inplace \ + "${statusyml:?}" \ + 'aglais.status.kubernetes.ingress.zeppelin.ipv4' \ + "${zeppip}" + + diff --git a/notes/zrq/20210125-01-kubernetes-deploy.txt b/notes/zrq/20210125-01-kubernetes-deploy.txt new file mode 100644 index 00000000..fecbaaea --- /dev/null +++ b/notes/zrq/20210125-01-kubernetes-deploy.txt @@ -0,0 +1,992 @@ +# +# +# +# Copyright (c) 2021, ROE (http://www.roe.ac.uk/) +# +# This information is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This information is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +# +# +#zrq-notes-time +#zrq-notes-indent +#zrq-notes-crypto +#zrq-notes-ansible +#zrq-notes-osformat +#zrq-notes-zeppelin +# + + Target: + + Try to get the Kubernetes deployment to work. + + Results: + + Failed. + Failing to mount the PV claims, intermittent .. different results, different reasons. + + +# ----------------------------------------------------- +# Update the Openstack cloud name. +#[user@desktop] + + cloudname=gaia-dev + + sed -i ' + s/^\(AGLAIS_CLOUD\)=.*$/\1='${cloudname:?}'/ + ' "${HOME}/aglais.env" + +# ----------------------------------------------------- +# Create a container to work with. +# (*) extra volume mount for /common +# (*) mount kubernetes directory as read/write +#[user@desktop] + + source "${HOME:?}/aglais.env" + + podman run \ + --rm \ + --tty \ + --interactive \ + --name kubernator \ + --hostname kubernator \ + --env "SSH_AUTH_SOCK=/mnt/ssh_auth_sock" \ + --volume "${SSH_AUTH_SOCK}:/mnt/ssh_auth_sock:rw,z" \ + --env "cloudname=${AGLAIS_CLOUD:?}" \ + --volume "${HOME:?}/clouds.yaml:/etc/openstack/clouds.yaml:ro,z" \ + --volume "${AGLAIS_CODE:?}/experiments/common:/common:ro,z" \ + --volume "${AGLAIS_CODE:?}/experiments/openstack:/openstack:ro,z" \ + --volume "${AGLAIS_CODE:?}/experiments/kubernetes:/kubernetes:rw,z" \ + atolmis/ansible-client:latest \ + bash + + +# ----------------------------------------------------- +# Delete everything. +#[root@kubernator] + + /openstack/bin/delete-all.sh \ + "${cloudname:?}" + + > .... + > .... + + +# ----------------------------------------------------- +# Create our Aglais configuration. +#[root@kubernator] + +cat > '/tmp/aglais-config.yml' << EOF +aglais: + version: 1.0 + spec: + openstack: + cloudname: ${cloudname:?} + dashboard: + hostname: dashboard.metagrid.xyz + zeppelin: + hostname: zeppelin.metagrid.xyz + drupal: + hostname: drupal.metagrid.xyz +EOF + + +# ----------------------------------------------------- +# Create everything. +#[root@kubernator] + + /kubernetes/bin/create-all.sh + + + > .... + > .... + > Installing dashboard Helm chart + > Namespace [aglais-20210125] + > Dash host [dashboard.metagrid.xyz] + > Getting updates for unmanaged Helm repositories... + > ...Successfully got an update from the "https://kubernetes.github.io/dashboard" chart repository + > Saving 1 charts + > Downloading kubernetes-dashboard from repo https://kubernetes.github.io/dashboard + > Deleting outdated charts + > Release "aglais-dashboard" does not exist. Installing it now. + > Error: Internal error occurred: failed calling webhook "validate.nginx.ingress.kubernetes.io": Post https://aglais-ingress-nginx-controller-admission.aglais-20210125.svc:443/networking/v1beta1/ingresses?timeout=10s: dial tcp 10.254.121.232:443: connect: connection refused + > .... + > .... + + # + # Dashboard error is back ... + # + +# ----------------------------------------------------- +# Check the results. +#[root@kubernator] + + cat '/tmp/aglais-status.yml' + + > aglais: + > status: + > deployment: + > type: kubernetes + > name: aglais-20210125 + > date: 20210125:024859 + > openstack: + > cluster: + > id: 789e1e50-735a-4705-ade3-b409e6f62fc5 + > kubernetes: + > namespace: aglais-20210125 + > spec: + > openstack: + > cloudname: gaia-dev + + +# ----------------------------------------------------- +# Get the cluster ID and K8s namespace. 
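+# These values are written to /tmp/aglais-status.yml by create-all.sh.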
+#[root@kubernator] + + clusterid=$( + yq read '/tmp/aglais-status.yml' 'aglais.status.openstack.cluster.id' + ) + + namespace=$( + yq read '/tmp/aglais-status.yml' 'aglais.status.kubernetes.namespace' + ) + +cat << EOF +Cluster ID [${clusterid}] +Name space [${namespace}] +EOF + + + > Cluster ID [789e1e50-735a-4705-ade3-b409e6f62fc5] + > Name space [aglais-20210125] + + +# ----------------------------------------------------- +# Get the Dashboard ServiceAccount token. +#[root@kubernator] + + secretname=$( + kubectl \ + --output json \ + --namespace "${namespace:?}" \ + get ServiceAccount \ + "aglais-dashboard-kubernetes-dashboard" \ + | jq -r '.secrets[0].name' + ) + + kubectl \ + --output json \ + --namespace "${namespace:?}" \ + get Secret \ + "${secretname:?}" \ + | jq -r '.data.token | @base64d' + + + > .... + > .... + + +# ----------------------------------------------------- +# Get the Ingress address. +#[root@kubernator] + + kubectl \ + --namespace "${namespace:?}" \ + get Ingress + + > NAME HOSTS ADDRESS PORTS AGE + > zeppelin-server-ingress zeppelin.metagrid.xyz 128.232.227.215 80, 443 5m28s + + + zeppelinip=$( + kubectl \ + --namespace "${namespace:?}" \ + get Ingress \ + --output json \ + | jq -r ' + .items[] + | select(.metadata.name == "zeppelin-server-ingress") + | .status.loadBalancer.ingress[0].ip + ' + ) + + echo "Zeppelin IP [${zeppelinip:?}]" + + > Zeppelin IP [128.232.227.215] + + yq write \ + --inplace \ + '/tmp/aglais-status.yml' \ + 'aglais.status.kubernetes.ingress.zeppelin.ipv4' \ + "${zeppelinip:?}" + + +# ----------------------------------------------------- +# ----------------------------------------------------- + + # + # Update our DNS .. + # + + +# ----------------------------------------------------- +# Check the Dashboard page. +#[root@kubernator] + + curl --head --insecure "https://dashboard.metagrid.xyz/" + + > HTTP/2 404 + > date: Mon, 25 Jan 2021 03:30:30 GMT + > content-type: text/html + > content-length: 146 + > strict-transport-security: max-age=15724800; includeSubDomains + + # + # As expected .. + # + + +# ----------------------------------------------------- +# Check the Zeppelin page. +#[root@kubernator] + + curl --head --insecure "https://zeppelin.metagrid.xyz/" + + > HTTP/2 200 + > date: Mon, 25 Jan 2021 03:31:08 GMT + > .... + > .... + + +# ----------------------------------------------------- +# ----------------------------------------------------- +# Login to Zeppelin and test ... +#[user@desktop] + + firefox --new-window "https://zeppelin.metagrid.xyz/" & + + > Looks good. + > Login works :-) + + +# ----------------------------------------------------- +# ----------------------------------------------------- +# Mount each of the external catalogs in Spark. 
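+# Each share needs the same three executor volume properties.
+# A sketch of the pattern, using a hypothetical share called 'example':
+#
+#   spark.kubernetes.executor.volumes.persistentVolumeClaim.example.mount.path       /data/example
+#   spark.kubernetes.executor.volumes.persistentVolumeClaim.example.mount.readOnly   true
+#   spark.kubernetes.executor.volumes.persistentVolumeClaim.example.options.claimName example-claim
+#
+# The claimName must match the PersistentVolumeClaim created by the corresponding manila-share Helm chart.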
+#[user@zeppelin] + +# -------------------------------- +%spark.conf + +spark.kubernetes.executor.volumes.persistentVolumeClaim.aglais-gaia-dr2.mount.path /data/gaia/dr2 +spark.kubernetes.executor.volumes.persistentVolumeClaim.aglais-gaia-dr2.mount.readOnly true +spark.kubernetes.executor.volumes.persistentVolumeClaim.aglais-gaia-dr2.options.claimName aglais-gaia-dr2-claim + +spark.kubernetes.executor.volumes.persistentVolumeClaim.aglais-gaia-edr3.mount.path /data/gaia/edr3 +spark.kubernetes.executor.volumes.persistentVolumeClaim.aglais-gaia-edr3.mount.readOnly true +spark.kubernetes.executor.volumes.persistentVolumeClaim.aglais-gaia-edr3.options.claimName aglais-gaia-edr3-claim + +spark.kubernetes.executor.volumes.persistentVolumeClaim.aglais-wise-allwise.mount.path /data/wise/allwise +spark.kubernetes.executor.volumes.persistentVolumeClaim.aglais-wise-allwise.mount.readOnly true +spark.kubernetes.executor.volumes.persistentVolumeClaim.aglais-wise-allwise.options.claimName aglais-wise-allwise-claim + +spark.kubernetes.executor.volumes.persistentVolumeClaim.aglais-panstarrs-dr1.mount.path /data/panstarrs/dr1 +spark.kubernetes.executor.volumes.persistentVolumeClaim.aglais-panstarrs-dr1.mount.readOnly true +spark.kubernetes.executor.volumes.persistentVolumeClaim.aglais-panstarrs-dr1.options.claimName aglais-panstarrs-dr1-claim + +spark.kubernetes.executor.volumes.persistentVolumeClaim.aglais-twomass-allsky.mount.path /data/twomass/allsky +spark.kubernetes.executor.volumes.persistentVolumeClaim.aglais-twomass-allsky.mount.readOnly true +spark.kubernetes.executor.volumes.persistentVolumeClaim.aglais-twomass-allsky.options.claimName aglais-twomass-allsky-claim + + +# -------------------------------- +%spark.pyspark + +gaia_dr2 = sqlContext.read.parquet( + "/data/gaia/dr2" + ) + +print("gaia-dr2 count: ", gaia_dr2.count()) +print("gaia-dr2 partitions: ", gaia_dr2.rdd.getNumPartitions()) + + > org.apache.zeppelin.interpreter.InterpreterException: java.io.IOException: Launching zeppelin interpreter on kubernetes is time out, kill it now + > at org.apache.zeppelin.interpreter.remote.RemoteInterpreter.open(RemoteInterpreter.java:132) + > at org.apache.zeppelin.interpreter.remote.RemoteInterpreter.getFormType(RemoteInterpreter.java:279) + > at org.apache.zeppelin.notebook.Paragraph.jobRun(Paragraph.java:465) + > at org.apache.zeppelin.notebook.Paragraph.jobRun(Paragraph.java:73) + > at org.apache.zeppelin.scheduler.Job.run(Job.java:172) + > at org.apache.zeppelin.scheduler.AbstractScheduler.runJob(AbstractScheduler.java:130) + > at org.apache.zeppelin.scheduler.RemoteScheduler$JobRunner.run(RemoteScheduler.java:180).... + > .... + + # + # Failed to load the interpreter. + # + + +# ----------------------------------------------------- +# List the active Pods. 
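+# The dataset testpods stuck in 'ContainerCreating' below are the ones failing to mount their Manila shares.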
+#[root@kubernator] + + kubectl --namespace ${namespace} get pod + + > NAME READY STATUS RESTARTS AGE + > aglais-ceph-csi-cephfs-nodeplugin-j672m 3/3 Running 0 60m + > aglais-ceph-csi-cephfs-nodeplugin-vdswl 3/3 Running 0 60m + > aglais-ceph-csi-cephfs-nodeplugin-wdf9h 3/3 Running 0 60m + > aglais-ceph-csi-cephfs-nodeplugin-xplzq 3/3 Running 0 60m + > aglais-ceph-csi-cephfs-provisioner-f9ff8cd4c-2hzwq 6/6 Running 0 60m + > aglais-ceph-csi-cephfs-provisioner-f9ff8cd4c-fltvf 6/6 Running 0 60m + > aglais-ceph-csi-cephfs-provisioner-f9ff8cd4c-hn6xn 6/6 Running 0 60m + > aglais-dashboard-kubernetes-dashboard-b5f955c8f-7sjvf 2/2 Running 0 60m + > aglais-gaia-dr2-testpod 0/1 ContainerCreating 0 60m + > aglais-gaia-edr3-testpod 0/1 ContainerCreating 0 60m + > aglais-ingress-nginx-controller-54f444477b-65hj2 1/1 Running 0 60m + > aglais-openstack-manila-csi-controllerplugin-0 3/3 Running 0 60m + > aglais-openstack-manila-csi-nodeplugin-jbnqb 2/2 Running 0 60m + > aglais-openstack-manila-csi-nodeplugin-jwwgj 2/2 Running 0 60m + > aglais-openstack-manila-csi-nodeplugin-ssd5j 2/2 Running 0 60m + > aglais-openstack-manila-csi-nodeplugin-vprhp 2/2 Running 0 60m + > aglais-panstarrs-dr1-testpod 0/1 ContainerCreating 0 59m + > aglais-twomass-allsky-testpod 0/1 ContainerCreating 0 59m + > aglais-user-nch-testpod 1/1 Running 0 59m + > aglais-user-stv-testpod 0/1 ContainerCreating 0 58m + > aglais-user-zrq-testpod 0/1 ContainerCreating 0 59m + > aglais-wise-allwise-testpod 0/1 ContainerCreating 0 59m + > spark-sukuqz 0/1 Init:0/1 0 2s + > zeppelin-server-deploy-7cb7f54d5c-srs95 3/3 Running 0 58m + > .... + > .... + + +# ----------------------------------------------------- +# Check the Spark interpreter Pod. +#[root@kubernator] + + kubectl --namespace ${namespace} get pod spark-sukuqz + + > NAME READY STATUS RESTARTS AGE + > spark-sukuqz 0/1 Init:0/1 0 8s + + + kubectl --namespace ${namespace} describe pod spark-sukuqz + + > Name: spark-sukuqz + > Namespace: aglais-20210125 + > Node: aglais-20210125-cluster-rqiklyztkmq6-node-0/10.0.0.74 + > Start Time: Mon, 25 Jan 2021 03:57:36 +0000 + > Labels: app=spark-sukuqz + > interpreterGroupId=spark-shared_process + > interpreterSettingName=spark + > Annotations: + > Status: Pending + > .... + > .... 
+ > Events: + > Type Reason Age From Message + > ---- ------ ---- ---- ------- + > Normal Scheduled default-scheduler Successfully assigned aglais-20210125/spark-sukuqz to aglais-20210125-cluster-rqiklyztkmq6-node-0 + > Warning FailedMount 18s (x4 over 23s) kubelet, aglais-20210125-cluster-rqiklyztkmq6-node-0 MountVolume.MountDevice failed for volume "aglais-gaia-edr3-volume" : rpc error: code = InvalidArgument desc = stage secrets cannot be nil or empty + > Warning FailedMount 18s (x4 over 23s) kubelet, aglais-20210125-cluster-rqiklyztkmq6-node-0 MountVolume.MountDevice failed for volume "aglais-wise-allwise-volume" : rpc error: code = InvalidArgument desc = stage secrets cannot be nil or empty + > Warning FailedMount 17s (x4 over 23s) kubelet, aglais-20210125-cluster-rqiklyztkmq6-node-0 MountVolume.MountDevice failed for volume "aglais-gaia-dr2-volume" : rpc error: code = Internal desc = chmod /var/lib/kubelet/plugins/kubernetes.io/csi/pv/aglais-gaia-dr2-volume/globalmount: permission denied + > Warning FailedMount 15s (x5 over 24s) kubelet, aglais-20210125-cluster-rqiklyztkmq6-node-0 MountVolume.MountDevice failed for volume "aglais-panstarrs-dr1-volume" : rpc error: code = InvalidArgument desc = stage secrets cannot be nil or empty + > Warning FailedMount 15s (x4 over 20s) kubelet, aglais-20210125-cluster-rqiklyztkmq6-node-0 MountVolume.MountDevice failed for volume "aglais-user-stv-volume" : rpc error: code = InvalidArgument desc = stage secrets cannot be nil or empty + > Warning FailedMount 15s (x4 over 19s) kubelet, aglais-20210125-cluster-rqiklyztkmq6-node-0 MountVolume.MountDevice failed for volume "aglais-user-zrq-volume" : rpc error: code = InvalidArgument desc = stage secrets cannot be nil or empty + + +# ----------------------------------------------------- +# Check the DR2 test Pod. +#[root@kubernator] + + kubectl --namespace ${namespace} describe pod aglais-gaia-dr2-testpod + + > Name: aglais-gaia-dr2-testpod + > Namespace: aglais-20210125 + > Node: aglais-20210125-cluster-rqiklyztkmq6-node-2/10.0.0.81 + > Start Time: Mon, 25 Jan 2021 02:57:14 +0000 + > Labels: aglais.dataset=aglais-gaia-dr2 + > aglais.name=aglais-gaia-dr2-testpod + > app.kubernetes.io/instance=aglais-gaia-dr2 + > app.kubernetes.io/managed-by=Helm + > app.kubernetes.io/name=manila-share + > app.kubernetes.io/version=0.0.1 + > helm.sh/chart=manila-share-0.0.1 + > Annotations: meta.helm.sh/release-name: aglais-gaia-dr2 + > meta.helm.sh/release-namespace: aglais-20210125 + > Status: Pending + > .... + > .... 
+ > Events: + > Type Reason Age From Message + > ---- ------ ---- ---- ------- + > Warning FailedMount 45m (x4 over 65m) kubelet, aglais-20210125-cluster-rqiklyztkmq6-node-2 Unable to attach or mount volumes: unmounted volumes=[test-data], unattached volumes=[local-data default-token-swjxp test-data]: timed out waiting for the condition + > Warning FailedMount 22m (x3 over 54m) kubelet, aglais-20210125-cluster-rqiklyztkmq6-node-2 Unable to attach or mount volumes: unmounted volumes=[test-data], unattached volumes=[default-token-swjxp test-data local-data]: timed out waiting for the condition + > Warning FailedMount 6m26s (x17 over 63m) kubelet, aglais-20210125-cluster-rqiklyztkmq6-node-2 Unable to attach or mount volumes: unmounted volumes=[test-data], unattached volumes=[test-data local-data default-token-swjxp]: timed out waiting for the condition + > Warning FailedMount 2m7s (x34 over 66m) kubelet, aglais-20210125-cluster-rqiklyztkmq6-node-2 MountVolume.MountDevice failed for volume "aglais-gaia-dr2-volume" : rpc error: code = Internal desc = chmod /var/lib/kubelet/plugins/kubernetes.io/csi/pv/aglais-gaia-dr2-volume/globalmount: permission denied + +# ----------------------------------------------------- +# Check the eDR3 test Pod. +#[root@kubernator] + + kubectl --namespace ${namespace} describe pod aglais-gaia-edr3-testpod + + > Name: aglais-gaia-edr3-testpod + > Namespace: aglais-20210125 + > Node: aglais-20210125-cluster-rqiklyztkmq6-node-1/10.0.0.172 + > Start Time: Mon, 25 Jan 2021 02:57:27 +0000 + > Labels: aglais.dataset=aglais-gaia-edr3 + > aglais.name=aglais-gaia-edr3-testpod + > app.kubernetes.io/instance=aglais-gaia-edr3 + > app.kubernetes.io/managed-by=Helm + > app.kubernetes.io/name=manila-share + > app.kubernetes.io/version=0.0.1 + > helm.sh/chart=manila-share-0.0.1 + > Annotations: meta.helm.sh/release-name: aglais-gaia-edr3 + > meta.helm.sh/release-namespace: aglais-20210125 + > Status: Pending + > .... + > .... + > Events: + > Type Reason Age From Message + > ---- ------ ---- ---- ------- + > Warning FailedMount 22m (x16 over 63m) kubelet, aglais-20210125-cluster-rqiklyztkmq6-node-1 Unable to attach or mount volumes: unmounted volumes=[test-data], unattached volumes=[test-data local-data default-token-swjxp]: timed out waiting for the condition + > Warning FailedMount 7m5s (x5 over 65m) kubelet, aglais-20210125-cluster-rqiklyztkmq6-node-1 Unable to attach or mount volumes: unmounted volumes=[test-data], unattached volumes=[default-token-swjxp test-data local-data]: timed out waiting for the condition + > Warning FailedMount 2m47s (x39 over 68m) kubelet, aglais-20210125-cluster-rqiklyztkmq6-node-1 MountVolume.MountDevice failed for volume "aglais-gaia-edr3-volume" : rpc error: code = InvalidArgument desc = stage secrets cannot be nil or empty + + +# ----------------------------------------------------- +# Uninstall the DR2 Helm chart. 
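+# Uninstall and re-install the share chart to see whether the mount failure is repeatable.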
+#[root@kubernator] + + helm --namespace ${namespace} list + + > NAME NAMESPACE REVISION UPDATED STATUS CHART APP VERSION + > aglais aglais-20210125 1 2021-01-25 02:56:33.71216801 +0000 UTC deployed aglais-0.0.1 0.0.1 + > aglais-dashboard aglais-20210125 1 2021-01-25 02:56:57.237206389 +0000 UTC failed aglais-dashboard-0.0.1 0.0.1 + > aglais-gaia-dr2 aglais-20210125 1 2021-01-25 02:57:13.386058418 +0000 UTC deployed manila-share-0.0.1 0.0.1 + > aglais-gaia-edr3 aglais-20210125 1 2021-01-25 02:57:26.35641445 +0000 UTC deployed manila-share-0.0.1 0.0.1 + > aglais-panstarrs-dr1 aglais-20210125 1 2021-01-25 02:57:50.90011394 +0000 UTC deployed manila-share-0.0.1 0.0.1 + > aglais-twomass-allsky aglais-20210125 1 2021-01-25 02:58:02.568336858 +0000 UTC deployed manila-share-0.0.1 0.0.1 + > aglais-user-nch aglais-20210125 1 2021-01-25 02:58:15.395168148 +0000 UTC deployed manila-share-0.0.1 0.0.1 + > aglais-user-stv aglais-20210125 1 2021-01-25 02:58:40.132723076 +0000 UTC deployed manila-share-0.0.1 0.0.1 + > aglais-user-zrq aglais-20210125 1 2021-01-25 02:58:27.136220405 +0000 UTC deployed manila-share-0.0.1 0.0.1 + > aglais-wise-allwise aglais-20210125 1 2021-01-25 02:57:38.726474125 +0000 UTC deployed manila-share-0.0.1 0.0.1 + > aglais-zeppelin aglais-20210125 1 2021-01-25 02:58:42.919251268 +0000 UTC deployed aglais-zeppelin-0.0.1 0.0.1 + + + helm --namespace ${namespace} uninstall aglais-gaia-dr2 + + > release "aglais-gaia-dr2" uninstalled + + +# ----------------------------------------------------- +# Install the DR2 Helm chart. +#[root@kubernator] + + sharename=aglais-gaia-dr2 + mountpath=/data/gaia/dr2 + sharemode='rw' + + '/kubernetes/bin/cephfs-mount.sh' \ + 'gaia-prod' \ + "${namespace:?}" \ + "${sharename:?}" \ + "${mountpath:?}" \ + "${sharemode:?}" + + > ---- ---- ---- + > File [cephfs-mount.sh] + > Path [/kubernetes/bin] + > ---- ---- ---- + > Cloud name [gaia-prod] + > Namespace [aglais-20210125] + > Share name [aglais-gaia-dr2] + > Mount path [/data/gaia/dr2] + > Share mode [rw] + > ---- ---- ---- + > + > ---- + > Share uuid [2e46b5a5-c5d9-44c0-b11c-310c222f4818] + > ---- + > Share size [512] + > ---- + > Access rule [50ad6086-491d-4056-9092-c57ac49d4d3d] + > Release "aglais-gaia-dr2" does not exist. Installing it now. + > NAME: aglais-gaia-dr2 + > LAST DEPLOYED: Mon Jan 25 04:18:06 2021 + > NAMESPACE: aglais-20210125 + > STATUS: deployed + > REVISION: 1 + > TEST SUITE: None + > NOTES: + > Use the testpod to check access to the mounted volume. + + + kubectl --namespace ${namespace} describe pod aglais-gaia-dr2-testpod + + > Name: aglais-gaia-dr2-testpod + > Namespace: aglais-20210125 + > Node: aglais-20210125-cluster-rqiklyztkmq6-node-3/10.0.0.107 + > Start Time: Mon, 25 Jan 2021 04:18:06 +0000 + > Labels: aglais.dataset=aglais-gaia-dr2 + > aglais.name=aglais-gaia-dr2-testpod + > app.kubernetes.io/instance=aglais-gaia-dr2 + > app.kubernetes.io/managed-by=Helm + > app.kubernetes.io/name=manila-share + > app.kubernetes.io/version=0.0.1 + > helm.sh/chart=manila-share-0.0.1 + > Annotations: meta.helm.sh/release-name: aglais-gaia-dr2 + > meta.helm.sh/release-namespace: aglais-20210125 + > Status: Running + > .... + > .... 
+ > Events: + > Type Reason Age From Message + > ---- ------ ---- ---- ------- + > Normal Scheduled default-scheduler Successfully assigned aglais-20210125/aglais-gaia-dr2-testpod to aglais-20210125-cluster-rqiklyztkmq6-node-3 + > Normal Pulling 14s kubelet, aglais-20210125-cluster-rqiklyztkmq6-node-3 Pulling image "fedora:latest" + > Normal Pulled 7s kubelet, aglais-20210125-cluster-rqiklyztkmq6-node-3 Successfully pulled image "fedora:latest" + > Normal Created 6s kubelet, aglais-20210125-cluster-rqiklyztkmq6-node-3 Created container aglais-gaia-dr2-container + > Normal Started 6s kubelet, aglais-20210125-cluster-rqiklyztkmq6-node-3 Started container aglais-gaia-dr2-container + + +# ----------------------------------------------------- +# Uninstall the eDR3 Helm chart. +#[root@kubernator] + + helm --namespace ${namespace} uninstall aglais-gaia-edr3 + + > release "aglais-gaia-edr3" uninstalled + + +# ----------------------------------------------------- +# Install the eDR3 Helm chart. +#[root@kubernator] + + sharename=aglais-gaia-edr3 + mountpath=/data/gaia/edr3 + sharemode='rw' + + '/kubernetes/bin/cephfs-mount.sh' \ + 'gaia-prod' \ + "${namespace:?}" \ + "${sharename:?}" \ + "${mountpath:?}" \ + "${sharemode:?}" + + > + > ---- ---- ---- + > File [cephfs-mount.sh] + > Path [/kubernetes/bin] + > ---- ---- ---- + > Cloud name [gaia-prod] + > Namespace [aglais-20210125] + > Share name [aglais-gaia-edr3] + > Mount path [/data/gaia/edr3] + > Share mode [rw] + > ---- ---- ---- + > + > ---- + > Share uuid [ca8231c3-1f5c-4ebf-8ec0-d3cfe2629976] + > ---- + > Share size [540] + > ---- + > Access rule [0a4b37bc-e07e-4763-a8af-4d9cf3ae9620] + > Release "aglais-gaia-edr3" does not exist. Installing it now. + > NAME: aglais-gaia-edr3 + > LAST DEPLOYED: Mon Jan 25 04:39:15 2021 + > NAMESPACE: aglais-20210125 + > STATUS: deployed + > REVISION: 1 + > TEST SUITE: None + > NOTES: + > Use the testpod to check access to the mounted volume. + + + kubectl --namespace ${namespace} describe pod aglais-gaia-edr3-testpod + + > Name: aglais-gaia-edr3-testpod + > Namespace: aglais-20210125 + > Node: aglais-20210125-cluster-rqiklyztkmq6-node-1/10.0.0.172 + > Start Time: Mon, 25 Jan 2021 04:39:16 +0000 + > Labels: aglais.dataset=aglais-gaia-edr3 + > aglais.name=aglais-gaia-edr3-testpod + > app.kubernetes.io/instance=aglais-gaia-edr3 + > app.kubernetes.io/managed-by=Helm + > app.kubernetes.io/name=manila-share + > app.kubernetes.io/version=0.0.1 + > helm.sh/chart=manila-share-0.0.1 + > Annotations: meta.helm.sh/release-name: aglais-gaia-edr3 + > meta.helm.sh/release-namespace: aglais-20210125 + > Status: Pending + > .... + > .... + > Events: + > Type Reason Age From Message + > ---- ------ ---- ---- ------- + > Normal Scheduled default-scheduler Successfully assigned aglais-20210125/aglais-gaia-edr3-testpod to aglais-20210125-cluster-rqiklyztkmq6-node-1 + > Warning FailedMount 5s (x7 over 37s) kubelet, aglais-20210125-cluster-rqiklyztkmq6-node-1 MountVolume.MountDevice failed for volume "aglais-gaia-edr3-volume" : rpc error: code = InvalidArgument desc = stage secrets cannot be nil or empty + + + # + # Issue is intermittent .. + # Repeat the uninstall/install for DR2 and we get a different result. + # + + +# ----------------------------------------------------- +# Install the DR2 Helm chart. +#[root@kubernator] + + helm --namespace ${namespace} uninstall aglais-gaia-dr2 + + > release "aglais-gaia-dr2" uninstalled + + # + # Wait until the pod, pv and pvc have all gone. 
+ # + + sharename=aglais-gaia-dr2 + mountpath=/data/gaia/dr2 + sharemode='rw' + + '/kubernetes/bin/cephfs-mount.sh' \ + 'gaia-prod' \ + "${namespace:?}" \ + "${sharename:?}" \ + "${mountpath:?}" \ + "${sharemode:?}" + + > ---- ---- ---- + > File [cephfs-mount.sh] + > Path [/kubernetes/bin] + > ---- ---- ---- + > Cloud name [gaia-prod] + > Namespace [aglais-20210125] + > Share name [aglais-gaia-dr2] + > Mount path [/data/gaia/dr2] + > Share mode [rw] + > ---- ---- ---- + > + > ---- + > Share uuid [2e46b5a5-c5d9-44c0-b11c-310c222f4818] + > ---- + > Share size [512] + > ---- + > Access rule [50ad6086-491d-4056-9092-c57ac49d4d3d] + > Release "aglais-gaia-dr2" does not exist. Installing it now. + > NAME: aglais-gaia-dr2 + > LAST DEPLOYED: Mon Jan 25 04:43:46 2021 + > NAMESPACE: aglais-20210125 + > STATUS: deployed + > REVISION: 1 + > TEST SUITE: None + > NOTES: + > Use the testpod to check access to the mounted volume. + + + kubectl --namespace ${namespace} get pv + + > NAME CAPACITY ACCESS MODES RECLAIM POLICY STATUS CLAIM STORAGECLASS REASON AGE + > aglais-gaia-dr2-volume 512 RWX Retain Bound aglais-20210125/aglais-gaia-dr2-claim 18s + > .... + > .... + + + kubectl --namespace ${namespace} get pvc + + > NAME STATUS VOLUME CAPACITY ACCESS MODES STORAGECLASS AGE + > aglais-gaia-dr2-claim Bound aglais-gaia-dr2-volume 512 RWX 13s + > .... + > .... + + + kubectl --namespace ${namespace} describe pod aglais-gaia-dr2-testpod + + > Name: aglais-gaia-dr2-testpod + > Namespace: aglais-20210125 + > Node: aglais-20210125-cluster-rqiklyztkmq6-node-0/10.0.0.74 + > Start Time: Mon, 25 Jan 2021 04:43:47 +0000 + > Labels: aglais.dataset=aglais-gaia-dr2 + > aglais.name=aglais-gaia-dr2-testpod + > app.kubernetes.io/instance=aglais-gaia-dr2 + > app.kubernetes.io/managed-by=Helm + > app.kubernetes.io/name=manila-share + > app.kubernetes.io/version=0.0.1 + > helm.sh/chart=manila-share-0.0.1 + > Annotations: meta.helm.sh/release-name: aglais-gaia-dr2 + > meta.helm.sh/release-namespace: aglais-20210125 + > Status: Pending + > .... + > .... + > Events: + > Type Reason Age From Message + > ---- ------ ---- ---- ------- + > Normal Scheduled default-scheduler Successfully assigned aglais-20210125/aglais-gaia-dr2-testpod to aglais-20210125-cluster-rqiklyztkmq6-node-0 + > Warning FailedMount 9s (x6 over 25s) kubelet, aglais-20210125-cluster-rqiklyztkmq6-node-0 MountVolume.MountDevice failed for volume "aglais-gaia-dr2-volume" : rpc error: code = Internal desc = chmod /var/lib/kubelet/plugins/kubernetes.io/csi/pv/aglais-gaia-dr2-volume/globalmount: permission denied + + # + # Issue is intermittent .. + # Repeat the uninstall/install for DR2 and we get a different result. + # + +# ----------------------------------------------------- +# Try again ..... +#[root@kubernator] + + helm --namespace ${namespace} uninstall aglais-gaia-dr2 + + > release "aglais-gaia-dr2" uninstalled + + # + # Wait until the pod, pv and pvc have all gone. 
+ # + + sharename=aglais-gaia-dr2 + mountpath=/data/gaia/dr2 + sharemode='rw' + + '/kubernetes/bin/cephfs-mount.sh' \ + 'gaia-prod' \ + "${namespace:?}" \ + "${sharename:?}" \ + "${mountpath:?}" \ + "${sharemode:?}" + + > ---- ---- ---- + > File [cephfs-mount.sh] + > Path [/kubernetes/bin] + > ---- ---- ---- + > Cloud name [gaia-prod] + > Namespace [aglais-20210125] + > Share name [aglais-gaia-dr2] + > Mount path [/data/gaia/dr2] + > Share mode [rw] + > ---- ---- ---- + > + > ---- + > Share uuid [2e46b5a5-c5d9-44c0-b11c-310c222f4818] + > ---- + > Share size [512] + > ---- + > Access rule [50ad6086-491d-4056-9092-c57ac49d4d3d] + > Release "aglais-gaia-dr2" does not exist. Installing it now. + > NAME: aglais-gaia-dr2 + > LAST DEPLOYED: Mon Jan 25 04:47:22 2021 + > NAMESPACE: aglais-20210125 + > STATUS: deployed + > REVISION: 1 + > TEST SUITE: None + > NOTES: + > Use the testpod to check access to the mounted volume. + + + kubectl --namespace ${namespace} describe pod aglais-gaia-dr2-testpod + + + > Name: aglais-gaia-dr2-testpod + > Namespace: aglais-20210125 + > Node: aglais-20210125-cluster-rqiklyztkmq6-node-3/10.0.0.107 + > Start Time: Mon, 25 Jan 2021 04:47:23 +0000 + > Labels: aglais.dataset=aglais-gaia-dr2 + > aglais.name=aglais-gaia-dr2-testpod + > app.kubernetes.io/instance=aglais-gaia-dr2 + > app.kubernetes.io/managed-by=Helm + > app.kubernetes.io/name=manila-share + > app.kubernetes.io/version=0.0.1 + > helm.sh/chart=manila-share-0.0.1 + > Annotations: meta.helm.sh/release-name: aglais-gaia-dr2 + > meta.helm.sh/release-namespace: aglais-20210125 + > Status: Running + > .... + > .... + > Events: + > Type Reason Age From Message + > ---- ------ ---- ---- ------- + > Normal Scheduled default-scheduler Successfully assigned aglais-20210125/aglais-gaia-dr2-testpod to aglais-20210125-cluster-rqiklyztkmq6-node-3 + > Normal Pulling 33s kubelet, aglais-20210125-cluster-rqiklyztkmq6-node-3 Pulling image "fedora:latest" + > Normal Pulled 30s kubelet, aglais-20210125-cluster-rqiklyztkmq6-node-3 Successfully pulled image "fedora:latest" + > Normal Created 30s kubelet, aglais-20210125-cluster-rqiklyztkmq6-node-3 Created container aglais-gaia-dr2-container + > Normal Started 30s kubelet, aglais-20210125-cluster-rqiklyztkmq6-node-3 Started container aglais-gaia-dr2-container + + + # + # Issue is intermittent .. + # Repeat the uninstall/install for eDR3 and we get a different result. + # + +# ----------------------------------------------------- +# Try again ..... +#[root@kubernator] + + helm --namespace ${namespace} uninstall aglais-gaia-edr3 + + > release "aglais-gaia-edr3" uninstalled + + # + # Wait until the pod, pv and pvc have all gone. + # + + sharename=aglais-gaia-edr3 + mountpath=/data/gaia/edr3 + sharemode='rw' + + '/kubernetes/bin/cephfs-mount.sh' \ + 'gaia-prod' \ + "${namespace:?}" \ + "${sharename:?}" \ + "${mountpath:?}" \ + "${sharemode:?}" + + > ---- ---- ---- + > File [cephfs-mount.sh] + > Path [/kubernetes/bin] + > ---- ---- ---- + > Cloud name [gaia-prod] + > Namespace [aglais-20210125] + > Share name [aglais-gaia-edr3] + > Mount path [/data/gaia/edr3] + > Share mode [rw] + > ---- ---- ---- + > + > ---- + > Share uuid [ca8231c3-1f5c-4ebf-8ec0-d3cfe2629976] + > ---- + > Share size [540] + > ---- + > Access rule [0a4b37bc-e07e-4763-a8af-4d9cf3ae9620] + > Release "aglais-gaia-edr3" does not exist. Installing it now. 
+ > NAME: aglais-gaia-edr3 + > LAST DEPLOYED: Mon Jan 25 04:55:26 2021 + > NAMESPACE: aglais-20210125 + > STATUS: deployed + > REVISION: 1 + > TEST SUITE: None + > NOTES: + > Use the testpod to check access to the mounted volume. + + + kubectl --namespace ${namespace} describe pod aglais-gaia-edr3-testpod + + > Name: aglais-gaia-edr3-testpod + > Namespace: aglais-20210125 + > Node: aglais-20210125-cluster-rqiklyztkmq6-node-1/10.0.0.172 + > Start Time: Mon, 25 Jan 2021 04:55:27 +0000 + > Labels: aglais.dataset=aglais-gaia-edr3 + > aglais.name=aglais-gaia-edr3-testpod + > app.kubernetes.io/instance=aglais-gaia-edr3 + > app.kubernetes.io/managed-by=Helm + > app.kubernetes.io/name=manila-share + > app.kubernetes.io/version=0.0.1 + > helm.sh/chart=manila-share-0.0.1 + > Annotations: meta.helm.sh/release-name: aglais-gaia-edr3 + > meta.helm.sh/release-namespace: aglais-20210125 + > Status: Pending + > .... + > .... + > Events: + > Type Reason Age From Message + > ---- ------ ---- ---- ------- + > Normal Scheduled default-scheduler Successfully assigned aglais-20210125/aglais-gaia-edr3-testpod to aglais-20210125-cluster-rqiklyztkmq6-node-1 + > Warning FailedMount 2s (x7 over 34s) kubelet, aglais-20210125-cluster-rqiklyztkmq6-node-1 MountVolume.MountDevice failed for volume "aglais-gaia-edr3-volume" : rpc error: code = InvalidArgument desc = stage secrets cannot be nil or empty + + + # + # DR2 is intermittent. + # Haven't seen eDR3 work at all. + # Try 2mass ... + # + + +# ----------------------------------------------------- +# Try twomass +#[root@kubernator] + + helm --namespace ${namespace} uninstall aglais-twomass-allsky + + > release "aglais-twomass-allsky" uninstalled + + # + # Wait until the pod, pv and pvc have all gone. + # + + sharename=aglais-twomass-allsky + mountpath=/data/twomass/allsky + sharemode='rw' + + '/kubernetes/bin/cephfs-mount.sh' \ + 'gaia-prod' \ + "${namespace:?}" \ + "${sharename:?}" \ + "${mountpath:?}" \ + "${sharemode:?}" + + > ---- ---- ---- + > File [cephfs-mount.sh] + > Path [/kubernetes/bin] + > ---- ---- ---- + > Cloud name [gaia-prod] + > Namespace [aglais-20210125] + > Share name [aglais-twomass-allsky] + > Mount path [/data/twomass/allsky] + > Share mode [rw] + > ---- ---- ---- + > + > ---- + > Share uuid [9dc3016a-f010-48bc-89fc-a9cbd688b7cc] + > ---- + > Share size [40] + > ---- + > Access rule [5647d075-83fb-4a60-b562-a5248da54ec7] + > Release "aglais-twomass-allsky" does not exist. Installing it now. + > NAME: aglais-twomass-allsky + > LAST DEPLOYED: Mon Jan 25 04:59:54 2021 + > NAMESPACE: aglais-20210125 + > STATUS: deployed + > REVISION: 1 + > TEST SUITE: None + > NOTES: + > Use the testpod to check access to the mounted volume. + + + kubectl --namespace ${namespace} describe pod aglais-twomass-allsky-testpod + + > Name: aglais-twomass-allsky-testpod + > Namespace: aglais-20210125 + > Node: aglais-20210125-cluster-rqiklyztkmq6-node-0/10.0.0.74 + > Start Time: Mon, 25 Jan 2021 04:59:55 +0000 + > Labels: aglais.dataset=aglais-twomass-allsky + > aglais.name=aglais-twomass-allsky-testpod + > app.kubernetes.io/instance=aglais-twomass-allsky + > app.kubernetes.io/managed-by=Helm + > app.kubernetes.io/name=manila-share + > app.kubernetes.io/version=0.0.1 + > helm.sh/chart=manila-share-0.0.1 + > Annotations: meta.helm.sh/release-name: aglais-twomass-allsky + > meta.helm.sh/release-namespace: aglais-20210125 + > Status: Pending + > .... + > .... 
+ > Events: + > Type Reason Age From Message + > ---- ------ ---- ---- ------- + > Normal Scheduled default-scheduler Successfully assigned aglais-20210125/aglais-twomass-allsky-testpod to aglais-20210125-cluster-rqiklyztkmq6-node-0 + > Warning FailedMount 11s (x7 over 43s) kubelet, aglais-20210125-cluster-rqiklyztkmq6-node-0 MountVolume.MountDevice failed for volume "aglais-twomass-allsky-volume" : rpc error: code = Internal desc = chmod /var/lib/kubelet/plugins/kubernetes.io/csi/pv/aglais-twomass-allsky-volume/globalmount: permission denied + + + # + # DR2 works, some of the time. + # eDR3 fails with 'stage secrets cannot be nil'. + # twomass fails with 'permission denied' + # + + diff --git a/notes/zrq/20210125-02-kubernetes-deploy.txt b/notes/zrq/20210125-02-kubernetes-deploy.txt new file mode 100644 index 00000000..6925dc32 --- /dev/null +++ b/notes/zrq/20210125-02-kubernetes-deploy.txt @@ -0,0 +1,288 @@ +# +# +# +# Copyright (c) 2021, ROE (http://www.roe.ac.uk/) +# +# This information is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This information is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +# +# +#zrq-notes-time +#zrq-notes-indent +#zrq-notes-crypto +#zrq-notes-ansible +#zrq-notes-osformat +#zrq-notes-zeppelin +# + + Target: + + Try to get the Kubernetes deployment to work. + Starting from clean ... again. + + Results: + + Failed. + Failing to mount the PV claims, intermittent .. different results, different reasons. + +# ----------------------------------------------------- +# Update the Openstack cloud name. +#[user@desktop] + + cloudname=gaia-dev + + sed -i ' + s/^\(AGLAIS_CLOUD\)=.*$/\1='${cloudname:?}'/ + ' "${HOME}/aglais.env" + +# ----------------------------------------------------- +# Create a container to work with. +# (*) extra volume mount for /common +# (*) mount kubernetes directory as read/write +#[user@desktop] + + source "${HOME:?}/aglais.env" + + podman run \ + --rm \ + --tty \ + --interactive \ + --name kubernator \ + --hostname kubernator \ + --env "SSH_AUTH_SOCK=/mnt/ssh_auth_sock" \ + --volume "${SSH_AUTH_SOCK}:/mnt/ssh_auth_sock:rw,z" \ + --env "cloudname=${AGLAIS_CLOUD:?}" \ + --volume "${HOME:?}/clouds.yaml:/etc/openstack/clouds.yaml:ro,z" \ + --volume "${AGLAIS_CODE:?}/experiments/common:/common:ro,z" \ + --volume "${AGLAIS_CODE:?}/experiments/openstack:/openstack:ro,z" \ + --volume "${AGLAIS_CODE:?}/experiments/kubernetes:/kubernetes:rw,z" \ + atolmis/ansible-client:2020.12.02 \ + bash + + +# ----------------------------------------------------- +# Delete everything. +#[root@kubernator] + + /openstack/bin/delete-all.sh \ + "${cloudname:?}" + + > .... + > .... + + +# ----------------------------------------------------- +# Create our Aglais configuration. 
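+# Same Aglais configuration as the previous attempt (20210125-01).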
+#[root@kubernator] + +cat > '/tmp/aglais-config.yml' << EOF +aglais: + version: 1.0 + spec: + openstack: + cloudname: ${cloudname:?} + dashboard: + hostname: dashboard.metagrid.xyz + zeppelin: + hostname: zeppelin.metagrid.xyz + drupal: + hostname: drupal.metagrid.xyz +EOF + + +# ----------------------------------------------------- +# Create everything. +#[root@kubernator] + + /kubernetes/bin/create-all.sh + + > .... + > .... + > Installing dashboard Helm chart + > Namespace [aglais-20210125] + > Dash host [dashboard.metagrid.xyz] + > Getting updates for unmanaged Helm repositories... + > ...Successfully got an update from the "https://kubernetes.github.io/dashboard" chart repository + > Saving 1 charts + > Downloading kubernetes-dashboard from repo https://kubernetes.github.io/dashboard + > Deleting outdated charts + > Release "aglais-dashboard" does not exist. Installing it now. + > NAME: aglais-dashboard + > LAST DEPLOYED: Mon Jan 25 05:27:58 2021 + > NAMESPACE: aglais-20210125 + > STATUS: deployed + > REVISION: 1 + > TEST SUITE: None + > .... + > .... + + # + # Dashboard worked this time, no errors. + # + +# ----------------------------------------------------- +# Check the results. +#[root@kubernator] + + cat '/tmp/aglais-status.yml' + + > aglais: + > status: + > deployment: + > type: kubernetes + > name: aglais-20210125 + > date: 20210125:051944 + > openstack: + > cluster: + > id: 958b10a3-aa60-4762-8405-5101eaaf6e1f + > kubernetes: + > namespace: aglais-20210125 + > spec: + > openstack: + > cloudname: gaia-dev + + +# ----------------------------------------------------- +# Get the cluster ID and K8s namespace. +#[root@kubernator] + + clusterid=$( + yq read '/tmp/aglais-status.yml' 'aglais.status.openstack.cluster.id' + ) + + namespace=$( + yq read '/tmp/aglais-status.yml' 'aglais.status.kubernetes.namespace' + ) + +cat << EOF +Cluster ID [${clusterid}] +Name space [${namespace}] +EOF + + + > Cluster ID [958b10a3-aa60-4762-8405-5101eaaf6e1f] + > Name space [aglais-20210125] + + +# ----------------------------------------------------- +# Get the Dashboard ServiceAccount token. +#[root@kubernator] + + secretname=$( + kubectl \ + --output json \ + --namespace "${namespace:?}" \ + get ServiceAccount \ + "aglais-dashboard-kubernetes-dashboard" \ + | jq -r '.secrets[0].name' + ) + + kubectl \ + --output json \ + --namespace "${namespace:?}" \ + get Secret \ + "${secretname:?}" \ + | jq -r '.data.token | @base64d' + + + > .... + > .... + + +# ----------------------------------------------------- +# Get the Ingress address. +#[root@kubernator] + + kubectl \ + --namespace "${namespace:?}" \ + get Ingress + + > NAME HOSTS ADDRESS PORTS AGE + > aglais-dashboard-kubernetes-dashboard dashboard.metagrid.xyz 128.232.227.177 80 5m6s + > zeppelin-server-ingress zeppelin.metagrid.xyz 128.232.227.177 80, 443 3m20s + + + zeppelinip=$( + kubectl \ + --namespace "${namespace:?}" \ + get Ingress \ + --output json \ + | jq -r ' + .items[] + | select(.metadata.name == "zeppelin-server-ingress") + | .status.loadBalancer.ingress[0].ip + ' + ) + + echo "Zeppelin IP [${zeppelinip:?}]" + + > Zeppelin IP [128.232.227.177] + + yq write \ + --inplace \ + '/tmp/aglais-status.yml' \ + 'aglais.status.kubernetes.ingress.zeppelin.ipv4' \ + "${zeppelinip:?}" + + +# ----------------------------------------------------- +# ----------------------------------------------------- + + # + # Update our DNS .. + # + + +# ----------------------------------------------------- +# Check the Dashboard page. 
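+# The dashboard Helm chart installed cleanly this time, so this should return HTTP 200 rather than the 404 seen in the previous attempt.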
+#[root@kubernator] + + curl --head --insecure "https://dashboard.metagrid.xyz/" + + > HTTP/2 200 + > date: Mon, 25 Jan 2021 05:34:47 GMT + > .... + > .... + + +# ----------------------------------------------------- +# Check the Zeppelin page. +#[root@kubernator] + + curl --head --insecure "https://zeppelin.metagrid.xyz/" + + > HTTP/2 200 + > date: Mon, 25 Jan 2021 05:35:12 GMT + > .... + > .... + + +# ----------------------------------------------------- +# ----------------------------------------------------- +# Login to Dashboard and test ... +#[user@desktop] + + firefox --new-window "https://dashboard.metagrid.xyz/" & + + > Dashboard looks good. + > Token works :-) + + + # + # Checking in dashboard, none of the share mounts worked :-( + # Failing to mount the PV claims .. different results, different reasons. + # + + + diff --git a/notes/zrq/20210125-03-ansible-deploy.txt b/notes/zrq/20210125-03-ansible-deploy.txt new file mode 100644 index 00000000..d1c9c818 --- /dev/null +++ b/notes/zrq/20210125-03-ansible-deploy.txt @@ -0,0 +1,319 @@ +# +# +# +# Copyright (c) 2021, ROE (http://www.roe.ac.uk/) +# +# This information is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This information is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +# +# +#zrq-notes-time +#zrq-notes-indent +#zrq-notes-crypto +#zrq-notes-ansible +#zrq-notes-osformat +#zrq-notes-zeppelin +# + + Target: + + Test if the Manila shares work on the Ansible deploy ... + + Results: + + Success. + All the nodes deployed correctly. + Data shares appear to be mounted correctly. + + TODO: + + We still need some tools to verify the contents. + - https://github.com/wfau/aglais/issues/82 + - https://github.com/wfau/aglais/issues/323 + - https://github.com/wfau/aglais/issues/32 + + + +# ----------------------------------------------------- +# Update the Openstack cloud name. +#[user@desktop] + + cloudname=gaia-dev + + sed -i ' + s/^\(AGLAIS_CLOUD\)=.*$/\1='${cloudname:?}'/ + ' "${HOME}/aglais.env" + + +# ----------------------------------------------------- +# Create a container to work with. +# (*) extra volume mount for /common +#[user@desktop] + + source "${HOME:?}/aglais.env" + + podman run \ + --rm \ + --tty \ + --interactive \ + --name ansibler \ + --hostname ansibler \ + --env "SSH_AUTH_SOCK=/mnt/ssh_auth_sock" \ + --volume "${SSH_AUTH_SOCK}:/mnt/ssh_auth_sock:rw,z" \ + --env "cloudname=${AGLAIS_CLOUD:?}" \ + --volume "${HOME:?}/clouds.yaml:/etc/openstack/clouds.yaml:ro,z" \ + --volume "${AGLAIS_CODE:?}/experiments/common:/common:ro,z" \ + --volume "${AGLAIS_CODE:?}/experiments/openstack:/openstack:ro,z" \ + --volume "${AGLAIS_CODE:?}/experiments/hadoop-yarn:/hadoop-yarn:ro,z" \ + atolmis/ansible-client:2020.12.02 \ + bash + + +# ----------------------------------------------------- +# Create our Aglais configuration. +#[root@ansibler] + +cat > /tmp/aglais-config.yml << EOF +aglais: + version: 1.0 + spec: + openstack: + cloudname: ${cloudname:?} +EOF + + +# ----------------------------------------------------- +# Delete everything. 
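+# Clear out the previous Kubernetes deployment before testing the Hadoop-Yarn deploy.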
+#[root@ansibler] + + /openstack/bin/delete-all.sh \ + "${cloudname:?}" + + > .... + > .... + + +# ----------------------------------------------------- +# Create everything. +#[root@ansibler] + + /hadoop-yarn/bin/create-all.sh + + > .... + > .... + + +# ----------------------------------------------------- +# Check the results. +#[root@ansibler] + + cat '/tmp/aglais-status.yml' + + > aglais: + > status: + > deployment: + > type: hadoop-yarn + > name: aglais-20210125 + > date: 20210125:054847 + > spec: + > openstack: + > cloudname: gaia-dev + + buildtag=$( + yq read \ + '/tmp/aglais-status.yml' \ + 'aglais.status.deployment.name' + ) + + +# ----------------------------------------------------- +# Get the public IP address of our Zeppelin node. +#[root@ansibler] + + zeppelinid=$( + openstack \ + --os-cloud "${cloudname:?}" \ + server list \ + --format json \ + | jq -r '.[] | select(.Name == "'${buildtag:?}'-zeppelin") | .ID' + ) + + zeppelinip=$( + openstack \ + --os-cloud "${cloudname:?}" \ + server show \ + --format json \ + "${zeppelinid:?}" \ + | jq -r '.addresses' \ + | sed ' + s/[[:space:]]// + s/.*=\(.*\)/\1/ + s/.*,\(.*\)/\1/ + ' + ) + +cat << EOF +Zeppelin ID [${zeppelinid:?}] +Zeppelin IP [${zeppelinip:?}] +EOF + + > Zeppelin ID [ba030ca2-cce1-47b7-b8df-249691c92fa7] + > Zeppelin IP [128.232.227.242] + + +# ----------------------------------------------------- +# Login to the Zeppelin node and check the data shares. +#[root@ansibler] + + sharelist='/common/manila/datashares.yaml' + + for shareid in $( + yq read "${sharelist:?}" 'shares.[*].id' + ) + do + echo "" + echo "---- ----" + echo "Share [${shareid:?}]" + echo "----" + + sharename=$(yq read "${sharelist:?}" "shares.(id==${shareid:?}).sharename") + mountpath=$(yq read "${sharelist:?}" "shares.(id==${shareid:?}).mountpath") + + ssh "fedora@${zeppelinip:?}" \ + " + date + hostname + echo '----' + df -h '${mountpath:?}' + echo '----' + ls -al '${mountpath:?}' | tail + " + done + + + > ---- ---- + > Share [GDR2] + > ---- + > Mon Jan 25 12:01:46 UTC 2021 + > aglais-20210125-zeppelin.novalocal + > ---- + > Filesystem Size Used Avail Use% Mounted on + > ceph-fuse 512G 473G 40G 93% /data/gaia/dr2 + > ---- + > -rw-r--r--. 1 fedora fedora 30825240 Oct 24 17:59 part-06504-70392076-8b82-4457-8828-22069e7626e9-c000.snappy.parquet + > -rw-r--r--. 1 fedora fedora 31802127 Oct 24 17:59 part-06505-70392076-8b82-4457-8828-22069e7626e9-c000.snappy.parquet + > -rw-r--r--. 1 fedora fedora 31538538 Oct 24 17:59 part-06506-70392076-8b82-4457-8828-22069e7626e9-c000.snappy.parquet + > -rw-r--r--. 1 fedora fedora 31218434 Oct 24 17:59 part-06507-70392076-8b82-4457-8828-22069e7626e9-c000.snappy.parquet + > -rw-r--r--. 1 fedora fedora 30815074 Oct 24 17:59 part-06508-70392076-8b82-4457-8828-22069e7626e9-c000.snappy.parquet + > -rw-r--r--. 1 fedora fedora 30406730 Oct 24 17:59 part-06509-70392076-8b82-4457-8828-22069e7626e9-c000.snappy.parquet + > -rw-r--r--. 1 fedora fedora 29995058 Oct 24 17:59 part-06510-70392076-8b82-4457-8828-22069e7626e9-c000.snappy.parquet + > -rw-r--r--. 1 fedora fedora 29447614 Oct 24 17:59 part-06511-70392076-8b82-4457-8828-22069e7626e9-c000.snappy.parquet + > -rw-r--r--. 1 fedora fedora 28448646 Oct 24 17:59 part-06512-70392076-8b82-4457-8828-22069e7626e9-c000.snappy.parquet + > -rw-r--r--. 
1 fedora fedora 6317774 Oct 24 17:59 part-06513-70392076-8b82-4457-8828-22069e7626e9-c000.snappy.parquet + > + > ---- ---- + > Share [GEDR3] + > ---- + > Mon Jan 25 12:01:47 UTC 2021 + > aglais-20210125-zeppelin.novalocal + > ---- + > Filesystem Size Used Avail Use% Mounted on + > ceph-fuse 540G 533G 7.9G 99% /data/gaia/edr3 + > ---- + > -rw-r--r--. 1 root root 36858229 Jan 11 22:27 part-11922-59b9273a-2ef1-4988-8778-e00f67e65264-c000.snappy.parquet + > -rw-r--r--. 1 root root 35391788 Jan 11 22:27 part-11923-59b9273a-2ef1-4988-8778-e00f67e65264-c000.snappy.parquet + > -rw-r--r--. 1 root root 39969879 Jan 11 22:27 part-11924-59b9273a-2ef1-4988-8778-e00f67e65264-c000.snappy.parquet + > -rw-r--r--. 1 root root 38923149 Jan 11 22:27 part-11925-59b9273a-2ef1-4988-8778-e00f67e65264-c000.snappy.parquet + > -rw-r--r--. 1 root root 36280019 Jan 11 22:27 part-11926-59b9273a-2ef1-4988-8778-e00f67e65264-c000.snappy.parquet + > -rw-r--r--. 1 root root 39559908 Jan 11 22:27 part-11927-59b9273a-2ef1-4988-8778-e00f67e65264-c000.snappy.parquet + > -rw-r--r--. 1 root root 34715127 Jan 11 22:27 part-11928-59b9273a-2ef1-4988-8778-e00f67e65264-c000.snappy.parquet + > -rw-r--r--. 1 root root 35453747 Jan 11 22:27 part-11929-59b9273a-2ef1-4988-8778-e00f67e65264-c000.snappy.parquet + > -rw-r--r--. 1 root root 30599245 Jan 11 22:27 part-11930-59b9273a-2ef1-4988-8778-e00f67e65264-c000.snappy.parquet + > -rw-r--r--. 1 root root 10852913 Jan 11 22:27 part-11931-59b9273a-2ef1-4988-8778-e00f67e65264-c000.snappy.parquet + > + > ---- ---- + > Share [ALLWISE] + > ---- + > Mon Jan 25 12:01:49 UTC 2021 + > aglais-20210125-zeppelin.novalocal + > ---- + > Filesystem Size Used Avail Use% Mounted on + > ceph-fuse 350G 341G 9.9G 98% /data/wise/allwise + > ---- + > -rw-r--r--. 1 root root 21195981 Jan 11 21:26 part-09124-6f95fee1-90c7-4207-911a-ebcc0ef05615-c000.snappy.parquet + > -rw-r--r--. 1 root root 20760761 Jan 11 21:26 part-09125-6f95fee1-90c7-4207-911a-ebcc0ef05615-c000.snappy.parquet + > -rw-r--r--. 1 root root 37549253 Jan 11 21:26 part-09126-6f95fee1-90c7-4207-911a-ebcc0ef05615-c000.snappy.parquet + > -rw-r--r--. 1 root root 32687920 Jan 11 21:26 part-09127-6f95fee1-90c7-4207-911a-ebcc0ef05615-c000.snappy.parquet + > -rw-r--r--. 1 root root 30215740 Jan 11 21:26 part-09128-6f95fee1-90c7-4207-911a-ebcc0ef05615-c000.snappy.parquet + > -rw-r--r--. 1 root root 26528776 Jan 11 21:26 part-09129-6f95fee1-90c7-4207-911a-ebcc0ef05615-c000.snappy.parquet + > -rw-r--r--. 1 root root 36999673 Jan 11 21:26 part-09130-6f95fee1-90c7-4207-911a-ebcc0ef05615-c000.snappy.parquet + > -rw-r--r--. 1 root root 30382801 Jan 11 21:26 part-09131-6f95fee1-90c7-4207-911a-ebcc0ef05615-c000.snappy.parquet + > -rw-r--r--. 1 root root 31622359 Jan 11 21:26 part-09132-6f95fee1-90c7-4207-911a-ebcc0ef05615-c000.snappy.parquet + > -rw-r--r--. 1 root root 9956618 Jan 11 21:26 part-09133-6f95fee1-90c7-4207-911a-ebcc0ef05615-c000.snappy.parquet + > + > ---- ---- + > Share [PS1] + > ---- + > Mon Jan 25 12:01:52 UTC 2021 + > aglais-20210125-zeppelin.novalocal + > ---- + > Filesystem Size Used Avail Use% Mounted on + > ceph-fuse 300G 270G 31G 90% /data/panstarrs/dr1 + > ---- + > -rw-r--r--. 1 root root 27803868 Jan 11 19:43 part-07723-22b55fbd-2678-4993-8e3a-3f384b1854bc-c000.snappy.parquet + > -rw-r--r--. 1 root root 22025506 Jan 11 19:43 part-07724-22b55fbd-2678-4993-8e3a-3f384b1854bc-c000.snappy.parquet + > -rw-r--r--. 1 root root 25756891 Jan 11 19:43 part-07725-22b55fbd-2678-4993-8e3a-3f384b1854bc-c000.snappy.parquet + > -rw-r--r--. 
1 root root 31396660 Jan 11 19:43 part-07726-22b55fbd-2678-4993-8e3a-3f384b1854bc-c000.snappy.parquet + > -rw-r--r--. 1 root root 26859792 Jan 11 19:44 part-07727-22b55fbd-2678-4993-8e3a-3f384b1854bc-c000.snappy.parquet + > -rw-r--r--. 1 root root 24735889 Jan 11 19:44 part-07728-22b55fbd-2678-4993-8e3a-3f384b1854bc-c000.snappy.parquet + > -rw-r--r--. 1 root root 25470955 Jan 11 19:44 part-07729-22b55fbd-2678-4993-8e3a-3f384b1854bc-c000.snappy.parquet + > -rw-r--r--. 1 root root 25640631 Jan 11 19:44 part-07730-22b55fbd-2678-4993-8e3a-3f384b1854bc-c000.snappy.parquet + > -rw-r--r--. 1 root root 22504695 Jan 11 19:44 part-07731-22b55fbd-2678-4993-8e3a-3f384b1854bc-c000.snappy.parquet + > -rw-r--r--. 1 root root 13200198 Jan 11 19:44 part-07732-22b55fbd-2678-4993-8e3a-3f384b1854bc-c000.snappy.parquet + > + > ---- ---- + > Share [2MASS] + > ---- + > Mon Jan 25 12:01:53 UTC 2021 + > aglais-20210125-zeppelin.novalocal + > ---- + > Filesystem Size Used Avail Use% Mounted on + > ceph-fuse 40G 37G 3.5G 92% /data/twomass/allsky + > ---- + > -rw-r--r--. 1 root root 16875933 Jan 11 17:44 part-01176-ce75a128-1cde-4ce1-90fc-4a36208209b2-c000.snappy.parquet + > -rw-r--r--. 1 root root 31847987 Jan 11 17:44 part-01177-ce75a128-1cde-4ce1-90fc-4a36208209b2-c000.snappy.parquet + > -rw-r--r--. 1 root root 33978033 Jan 11 17:45 part-01178-ce75a128-1cde-4ce1-90fc-4a36208209b2-c000.snappy.parquet + > -rw-r--r--. 1 root root 33170642 Jan 11 17:45 part-01179-ce75a128-1cde-4ce1-90fc-4a36208209b2-c000.snappy.parquet + > -rw-r--r--. 1 root root 33115257 Jan 11 17:45 part-01180-ce75a128-1cde-4ce1-90fc-4a36208209b2-c000.snappy.parquet + > -rw-r--r--. 1 root root 33854964 Jan 11 17:45 part-01181-ce75a128-1cde-4ce1-90fc-4a36208209b2-c000.snappy.parquet + > -rw-r--r--. 1 root root 31874821 Jan 11 17:45 part-01182-ce75a128-1cde-4ce1-90fc-4a36208209b2-c000.snappy.parquet + > -rw-r--r--. 1 root root 33091386 Jan 11 17:45 part-01183-ce75a128-1cde-4ce1-90fc-4a36208209b2-c000.snappy.parquet + > -rw-r--r--. 1 root root 31078087 Jan 11 17:45 part-01184-ce75a128-1cde-4ce1-90fc-4a36208209b2-c000.snappy.parquet + > -rw-r--r--. 1 root root 14460710 Jan 11 17:45 part-01185-ce75a128-1cde-4ce1-90fc-4a36208209b2-c000.snappy.parquet + + + # + # Looks OK. + # + # We still need something to verify the contents. + # https://github.com/wfau/aglais/issues/82 + # https://github.com/wfau/aglais/issues/323 + # https://github.com/wfau/aglais/issues/32 + # + + + + + + diff --git a/notes/zrq/20210125-04-kubernetes-deploy.txt b/notes/zrq/20210125-04-kubernetes-deploy.txt new file mode 100644 index 00000000..f5b535da --- /dev/null +++ b/notes/zrq/20210125-04-kubernetes-deploy.txt @@ -0,0 +1,1284 @@ +# +# +# +# Copyright (c) 2021, ROE (http://www.roe.ac.uk/) +# +# This information is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This information is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
+# +# +# +#zrq-notes-time +#zrq-notes-indent +#zrq-notes-crypto +#zrq-notes-ansible +#zrq-notes-osformat +#zrq-notes-zeppelin +# + + Target: + + Try to get the Kubernetes deployment to work. + Starting from clean ... again. + + Results: + + Success. + Figured out what was causing the problems. + All the csi-manila shares mounted and tested. + + TODO: + + We still need some tools to verify the contents. + - https://github.com/wfau/aglais/issues/82 + - https://github.com/wfau/aglais/issues/323 + - https://github.com/wfau/aglais/issues/32 + + + +# ----------------------------------------------------- +# Update the Openstack cloud name. +#[user@desktop] + + cloudname=gaia-dev + + sed -i ' + s/^\(AGLAIS_CLOUD\)=.*$/\1='${cloudname:?}'/ + ' "${HOME}/aglais.env" + +# ----------------------------------------------------- +# Create a container to work with. +# (*) extra volume mount for /common +# (*) mount kubernetes directory as read/write +#[user@desktop] + + source "${HOME:?}/aglais.env" + + podman run \ + --rm \ + --tty \ + --interactive \ + --name kubernator \ + --hostname kubernator \ + --env "SSH_AUTH_SOCK=/mnt/ssh_auth_sock" \ + --volume "${SSH_AUTH_SOCK}:/mnt/ssh_auth_sock:rw,z" \ + --env "cloudname=${AGLAIS_CLOUD:?}" \ + --volume "${HOME:?}/clouds.yaml:/etc/openstack/clouds.yaml:ro,z" \ + --volume "${AGLAIS_CODE:?}/experiments/common:/common:ro,z" \ + --volume "${AGLAIS_CODE:?}/experiments/openstack:/openstack:ro,z" \ + --volume "${AGLAIS_CODE:?}/experiments/kubernetes:/kubernetes:rw,z" \ + atolmis/ansible-client:2020.12.02 \ + bash + + +# ----------------------------------------------------- +# Delete everything. +#[root@kubernator] + + /openstack/bin/delete-all.sh \ + "${cloudname:?}" + + > .... + > .... + + +# ----------------------------------------------------- +# Create our Aglais configuration. +#[root@kubernator] + +cat > '/tmp/aglais-config.yml' << EOF +aglais: + version: 1.0 + spec: + openstack: + cloudname: ${cloudname:?} + dashboard: + hostname: dashboard.metagrid.xyz + zeppelin: + hostname: zeppelin.metagrid.xyz + drupal: + hostname: drupal.metagrid.xyz +EOF + + +# ----------------------------------------------------- +# Create everything. +#[root@kubernator] + + /kubernetes/bin/create-all.sh + + > .... + > .... + + # + # Dashboard installed OK this time ... + # + + +# ----------------------------------------------------- +# Check the results. +#[root@kubernator] + + cat '/tmp/aglais-status.yml' + + > aglais: + > status: + > deployment: + > type: kubernetes + > name: aglais-20210125 + > date: 20210125:123446 + > openstack: + > cloudname: gaia-dev + > magnum: + > uuid: fb90b7c3-49d8-48e8-8b7b-60976ba3f187 + > kubernetes: + > namespace: aglais-20210125 + > ingress: + > dashboard: + > hostname: dashboard.metagrid.xyz + > ipv4: null + > zeppelin: + > hostname: zeppelin.metagrid.xyz + > ipv4: null + + +# ----------------------------------------------------- +# Get the cluster ID and K8s namespace. +#[root@kubernator] + + magnumid=$( + yq read '/tmp/aglais-status.yml' 'aglais.status.openstack.magnum.uuid' + ) + + namespace=$( + yq read '/tmp/aglais-status.yml' 'aglais.status.kubernetes.namespace' + ) + +cat << EOF +Magnum uuid [${magnumid}] +Name space [${namespace}] +EOF + + > Magnum uuid [fb90b7c3-49d8-48e8-8b7b-60976ba3f187] + > Name space [aglais-20210125] + + +# ----------------------------------------------------- +# Get the Dashboard ServiceAccount token. 
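# (*) The ServiceAccount lists its secrets; the first one holds the dashboard
#     login token, base64 encoded in its 'data.token' field.
#     A minimal one-liner sketch of the same lookup, assuming the default
#     Secret layout created for the ServiceAccount:
#
#       kubectl --namespace "${namespace:?}" \
#           get Secret "$(
#               kubectl --namespace "${namespace:?}" \
#                   get ServiceAccount 'aglais-dashboard-kubernetes-dashboard' \
#                   --output jsonpath='{.secrets[0].name}'
#               )" \
#           --output jsonpath='{.data.token}' \
#           | base64 --decode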
+#[root@kubernator] + + secretname=$( + kubectl \ + --output json \ + --namespace "${namespace:?}" \ + get ServiceAccount \ + "aglais-dashboard-kubernetes-dashboard" \ + | jq -r '.secrets[0].name' + ) + + kubectl \ + --output json \ + --namespace "${namespace:?}" \ + get Secret \ + "${secretname:?}" \ + | jq -r '.data.token | @base64d' + + + > .... + > .... + + +# ----------------------------------------------------- +# Check our ingress status. +# ** Kubernetes needs time time to allocate the IP address. +#[root@kubernator] + + kubectl \ + --namespace "${namespace:?}" \ + get Ingress + + > NAME HOSTS ADDRESS PORTS AGE + > aglais-dashboard-kubernetes-dashboard dashboard.metagrid.xyz 128.232.227.177 80 5m6s + > zeppelin-server-ingress zeppelin.metagrid.xyz 128.232.227.177 80, 443 3m20s + + +# ----------------------------------------------------- +# Capture our Dashboard ingress IP address. +# ** Kubernetes needs time time to allocate the IP address. + + daship=$( + kubectl \ + --namespace "${namespace:?}" \ + get Ingress \ + --output json \ + | jq -r ' + .items[] + | select(.metadata.name == "aglais-dashboard-kubernetes-dashboard") + | .status.loadBalancer.ingress[0].ip + ' + ) + + yq write \ + --inplace \ + '/tmp/aglais-status.yml' \ + 'aglais.status.kubernetes.ingress.dashboard.ipv4' \ + "${daship}" + + +# ----------------------------------------------------- +# Capture our Zeppelin ingress IP address. +# ** Kubernetes needs time time to allocate the IP address. + + zeppip=$( + kubectl \ + --namespace "${namespace:?}" \ + get Ingress \ + --output json \ + | jq -r ' + .items[] + | select(.metadata.name == "zeppelin-server-ingress") + | .status.loadBalancer.ingress[0].ip + ' + ) + + yq write \ + --inplace \ + '/tmp/aglais-status.yml' \ + 'aglais.status.kubernetes.ingress.zeppelin.ipv4' \ + "${zeppip}" + + + +# ----------------------------------------------------- +# ----------------------------------------------------- + + # + # Update our DNS .. + # + +# ----------------------------------------------------- +# ----------------------------------------------------- +# Check the Dashboard page. +#[root@kubernator] + + curl --head --insecure "https://dashboard.metagrid.xyz/" + + > HTTP/2 200 + > date: Mon, 25 Jan 2021 13:05:50 GMT + > .... + > .... + + +# ----------------------------------------------------- +# Check the Zeppelin page. +#[root@kubernator] + + curl --head --insecure "https://zeppelin.metagrid.xyz/" + + > HTTP/2 200 + > date: Mon, 25 Jan 2021 13:06:05 GMT + > .... + > .... + + +# ----------------------------------------------------- +# Check the test pod events for the data shares. 
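# (*) The loop below uses a field selector to pick out the events for each
#     testpod by name. Equivalent sketch for a single share, using one of the
#     pod names that appears in the output further down:
#
#       kubectl --namespace "${namespace:?}" \
#           get event \
#           --field-selector "involvedObject.name=aglais-gaia-dr2-testpod"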
+#[root@kubernator] + + sharelist='/common/manila/datashares.yaml' + + for shareid in $( + yq read "${sharelist:?}" 'shares.[*].id' + ) + do + echo "" + echo "---- ----" + echo "Share [${shareid:?}]" + echo "----" + + sharename=$(yq read "${sharelist:?}" "shares.(id==${shareid:?}).sharename") + mountpath=$(yq read "${sharelist:?}" "shares.(id==${shareid:?}).mountpath") + + kubectl \ + --namespace "${namespace:?}" \ + get event \ + --field-selector "involvedObject.name=${sharename:?}-testpod" + + done + + > ---- ---- + > Share [GDR2] + > ---- + > LAST SEEN TYPE REASON OBJECT MESSAGE + > Normal Scheduled pod/aglais-gaia-dr2-testpod Successfully assigned aglais-20210125/aglais-gaia-dr2-testpod to aglais-20210125-cluster-shr4k5gaja5a-node-2 + > 18m Warning FailedMount pod/aglais-gaia-dr2-testpod MountVolume.MountDevice failed for volume "aglais-gaia-dr2-volume" : rpc error: code = Internal desc = chmod /var/lib/kubelet/plugins/kubernetes.io/csi/pv/aglais-gaia-dr2-volume/globalmount: permission denied + > 8m33s Warning FailedMount pod/aglais-gaia-dr2-testpod Unable to attach or mount volumes: unmounted volumes=[test-data], unattached volumes=[test-data local-data default-token-fwhqc]: timed out waiting for the condition + > 3m59s Warning FailedMount pod/aglais-gaia-dr2-testpod Unable to attach or mount volumes: unmounted volumes=[test-data], unattached volumes=[local-data default-token-fwhqc test-data]: timed out waiting for the condition + > 33m Warning FailedMount pod/aglais-gaia-dr2-testpod Unable to attach or mount volumes: unmounted volumes=[test-data], unattached volumes=[default-token-fwhqc test-data local-data]: timed out waiting for the condition + > + > ---- ---- + > Share [GEDR3] + > ---- + > LAST SEEN TYPE REASON OBJECT MESSAGE + > Normal Scheduled pod/aglais-gaia-edr3-testpod Successfully assigned aglais-20210125/aglais-gaia-edr3-testpod to aglais-20210125-cluster-shr4k5gaja5a-node-3 + > 18m Warning FailedMount pod/aglais-gaia-edr3-testpod MountVolume.MountDevice failed for volume "aglais-gaia-edr3-volume" : rpc error: code = Internal desc = chmod /var/lib/kubelet/plugins/kubernetes.io/csi/pv/aglais-gaia-edr3-volume/globalmount: permission denied + > 8m17s Warning FailedMount pod/aglais-gaia-edr3-testpod Unable to attach or mount volumes: unmounted volumes=[test-data], unattached volumes=[test-data local-data default-token-fwhqc]: timed out waiting for the condition + > 3m46s Warning FailedMount pod/aglais-gaia-edr3-testpod Unable to attach or mount volumes: unmounted volumes=[test-data], unattached volumes=[default-token-fwhqc test-data local-data]: timed out waiting for the condition + > + > ---- ---- + > Share [ALLWISE] + > ---- + > LAST SEEN TYPE REASON OBJECT MESSAGE + > Normal Scheduled pod/aglais-wise-allwise-testpod Successfully assigned aglais-20210125/aglais-wise-allwise-testpod to aglais-20210125-cluster-shr4k5gaja5a-node-0 + > 17m Warning FailedMount pod/aglais-wise-allwise-testpod MountVolume.MountDevice failed for volume "aglais-wise-allwise-volume" : rpc error: code = Internal desc = chmod /var/lib/kubelet/plugins/kubernetes.io/csi/pv/aglais-wise-allwise-volume/globalmount: permission denied + > 3m30s Warning FailedMount pod/aglais-wise-allwise-testpod Unable to attach or mount volumes: unmounted volumes=[test-data], unattached volumes=[test-data local-data default-token-fwhqc]: timed out waiting for the condition + > 8m4s Warning FailedMount pod/aglais-wise-allwise-testpod Unable to attach or mount volumes: unmounted volumes=[test-data], unattached 
volumes=[default-token-fwhqc test-data local-data]: timed out waiting for the condition + > + > ---- ---- + > Share [PS1] + > ---- + > LAST SEEN TYPE REASON OBJECT MESSAGE + > Normal Scheduled pod/aglais-panstarrs-dr1-testpod Successfully assigned aglais-20210125/aglais-panstarrs-dr1-testpod to aglais-20210125-cluster-shr4k5gaja5a-node-0 + > 17m Warning FailedMount pod/aglais-panstarrs-dr1-testpod MountVolume.MountDevice failed for volume "aglais-panstarrs-dr1-volume" : rpc error: code = Internal desc = chmod /var/lib/kubelet/plugins/kubernetes.io/csi/pv/aglais-panstarrs-dr1-volume/globalmount: permission denied + > 30m Warning FailedMount pod/aglais-panstarrs-dr1-testpod Unable to attach or mount volumes: unmounted volumes=[test-data], unattached volumes=[default-token-fwhqc test-data local-data]: timed out waiting for the condition + > 3m17s Warning FailedMount pod/aglais-panstarrs-dr1-testpod Unable to attach or mount volumes: unmounted volumes=[test-data], unattached volumes=[test-data local-data default-token-fwhqc]: timed out waiting for the condition + > 37m Warning FailedMount pod/aglais-panstarrs-dr1-testpod Unable to attach or mount volumes: unmounted volumes=[test-data], unattached volumes=[local-data default-token-fwhqc test-data]: timed out waiting for the condition + > + > ---- ---- + > Share [2MASS] + > ---- + > LAST SEEN TYPE REASON OBJECT MESSAGE + > Normal Scheduled pod/aglais-twomass-allsky-testpod Successfully assigned aglais-20210125/aglais-twomass-allsky-testpod to aglais-20210125-cluster-shr4k5gaja5a-node-2 + > 76s Warning FailedMount pod/aglais-twomass-allsky-testpod MountVolume.MountDevice failed for volume "aglais-twomass-allsky-volume" : rpc error: code = Internal desc = chmod /var/lib/kubelet/plugins/kubernetes.io/csi/pv/aglais-twomass-allsky-volume/globalmount: permission denied + > 27m Warning FailedMount pod/aglais-twomass-allsky-testpod Unable to attach or mount volumes: unmounted volumes=[test-data], unattached volumes=[local-data default-token-fwhqc test-data]: timed out waiting for the condition + > 7m44s Warning FailedMount pod/aglais-twomass-allsky-testpod Unable to attach or mount volumes: unmounted volumes=[test-data], unattached volumes=[test-data local-data default-token-fwhqc]: timed out waiting for the condition + > 30m Warning FailedMount pod/aglais-twomass-allsky-testpod Unable to attach or mount volumes: unmounted volumes=[test-data], unattached volumes=[default-token-fwhqc test-data local-data]: timed out waiting for the condition + + # + # All of them failed this time. + # All of them reported the same error this time. + # + + +# ----------------------------------------------------- +# ----------------------------------------------------- +# Delete everything. +#[root@kubernator] + + /openstack/bin/delete-all.sh \ + "${cloudname:?}" + + > .... + > .... + +# ----------------------------------------------------- +# Create everything. +#[root@kubernator] + + /kubernetes/bin/create-all.sh + + > .... + > .... + + +# ----------------------------------------------------- +# Check the results. +#[root@kubernator] + + cat '/tmp/aglais-status.yml' + + > .... + > .... + + +# ----------------------------------------------------- +# Get the cluster ID and K8s namespace. 
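# (*) These values are read back from the deployment status file using the
#     yq (v3) 'read' syntax, which takes a dotted path into the YAML document.
#     Minimal sketch, assuming the same status file location:
#
#       yq read '/tmp/aglais-status.yml' 'aglais.status.deployment.name'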
+#[root@kubernator] + + magnumid=$( + yq read '/tmp/aglais-status.yml' 'aglais.status.openstack.magnum.uuid' + ) + + namespace=$( + yq read '/tmp/aglais-status.yml' 'aglais.status.kubernetes.namespace' + ) + +cat << EOF +Magnum uuid [${magnumid}] +Name space [${namespace}] +EOF + + > Magnum uuid [cc17f847-fb02-427b-909a-6750dbad2060] + > Name space [aglais-20210125] + + +# ----------------------------------------------------- +# Check the test pod events for the data shares. +#[root@kubernator] + + sharelist='/common/manila/datashares.yaml' + + for shareid in $( + yq read "${sharelist:?}" 'shares.[*].id' + ) + do + echo "" + echo "---- ----" + echo "Share [${shareid:?}]" + echo "----" + + sharename=$(yq read "${sharelist:?}" "shares.(id==${shareid:?}).sharename") + mountpath=$(yq read "${sharelist:?}" "shares.(id==${shareid:?}).mountpath") + + kubectl \ + --namespace "${namespace:?}" \ + get event \ + --field-selector "involvedObject.name=${sharename:?}-testpod" + + done + + > ---- ---- + > Share [GDR2] + > ---- + > LAST SEEN TYPE REASON OBJECT MESSAGE + > Normal Scheduled pod/aglais-gaia-dr2-testpod Successfully assigned aglais-20210125/aglais-gaia-dr2-testpod to aglais-20210125-cluster-bf3en5lv3e6a-node-0 + > 90s Warning FailedMount pod/aglais-gaia-dr2-testpod MountVolume.MountDevice failed for volume "aglais-gaia-dr2-volume" : rpc error: code = Internal desc = chmod /var/lib/kubelet/plugins/kubernetes.io/csi/pv/aglais-gaia-dr2-volume/globalmount: permission denied + > 97s Warning FailedMount pod/aglais-gaia-dr2-testpod Unable to attach or mount volumes: unmounted volumes=[test-data], unattached volumes=[test-data local-data default-token-4zmrq]: timed out waiting for the condition + > + > ---- ---- + > Share [GEDR3] + > ---- + > LAST SEEN TYPE REASON OBJECT MESSAGE + > Normal Scheduled pod/aglais-gaia-edr3-testpod Successfully assigned aglais-20210125/aglais-gaia-edr3-testpod to aglais-20210125-cluster-bf3en5lv3e6a-node-1 + > 78s Warning FailedMount pod/aglais-gaia-edr3-testpod MountVolume.MountDevice failed for volume "aglais-gaia-edr3-volume" : rpc error: code = Internal desc = chmod /var/lib/kubelet/plugins/kubernetes.io/csi/pv/aglais-gaia-edr3-volume/globalmount: permission denied + > 86s Warning FailedMount pod/aglais-gaia-edr3-testpod Unable to attach or mount volumes: unmounted volumes=[test-data], unattached volumes=[local-data default-token-4zmrq test-data]: timed out waiting for the condition + > + > ---- ---- + > Share [ALLWISE] + > ---- + > LAST SEEN TYPE REASON OBJECT MESSAGE + > Normal Scheduled pod/aglais-wise-allwise-testpod Successfully assigned aglais-20210125/aglais-wise-allwise-testpod to aglais-20210125-cluster-bf3en5lv3e6a-node-2 + > 66s Warning FailedMount pod/aglais-wise-allwise-testpod MountVolume.MountDevice failed for volume "aglais-wise-allwise-volume" : rpc error: code = Internal desc = chmod /var/lib/kubelet/plugins/kubernetes.io/csi/pv/aglais-wise-allwise-volume/globalmount: permission denied + > 73s Warning FailedMount pod/aglais-wise-allwise-testpod Unable to attach or mount volumes: unmounted volumes=[test-data], unattached volumes=[test-data local-data default-token-4zmrq]: timed out waiting for the condition + > + > ---- ---- + > Share [PS1] + > ---- + > LAST SEEN TYPE REASON OBJECT MESSAGE + > Normal Scheduled pod/aglais-panstarrs-dr1-testpod Successfully assigned aglais-20210125/aglais-panstarrs-dr1-testpod to aglais-20210125-cluster-bf3en5lv3e6a-node-2 + > 53s Warning FailedMount pod/aglais-panstarrs-dr1-testpod MountVolume.MountDevice failed for 
volume "aglais-panstarrs-dr1-volume" : rpc error: code = Internal desc = chmod /var/lib/kubelet/plugins/kubernetes.io/csi/pv/aglais-panstarrs-dr1-volume/globalmount: permission denied + > 60s Warning FailedMount pod/aglais-panstarrs-dr1-testpod Unable to attach or mount volumes: unmounted volumes=[test-data], unattached volumes=[test-data local-data default-token-4zmrq]: timed out waiting for the condition + > + > ---- ---- + > Share [2MASS] + > ---- + > LAST SEEN TYPE REASON OBJECT MESSAGE + > Normal Scheduled pod/aglais-twomass-allsky-testpod Successfully assigned aglais-20210125/aglais-twomass-allsky-testpod to aglais-20210125-cluster-bf3en5lv3e6a-node-3 + > 40s Warning FailedMount pod/aglais-twomass-allsky-testpod MountVolume.MountDevice failed for volume "aglais-twomass-allsky-volume" : rpc error: code = Internal desc = chmod /var/lib/kubelet/plugins/kubernetes.io/csi/pv/aglais-twomass-allsky-volume/globalmount: permission denied + > 47s Warning FailedMount pod/aglais-twomass-allsky-testpod Unable to attach or mount volumes: unmounted volumes=[test-data], unattached volumes=[default-token-4zmrq test-data local-data]: timed out waiting for the condition + > [root@kubernator /]# + + + # + # All of them failed this time. + # All of them reported the same error this time. + # + + +# ----------------------------------------------------- +# Check the test pod events for the user shares. +#[root@kubernator] + + sharelist='/common/manila/usershares.yaml' + + for shareid in $( + yq read "${sharelist:?}" 'shares.[*].id' + ) + do + echo "" + echo "---- ----" + echo "Share [${shareid:?}]" + echo "----" + + sharename=$(yq read "${sharelist:?}" "shares.(id==${shareid:?}).sharename") + mountpath=$(yq read "${sharelist:?}" "shares.(id==${shareid:?}).mountpath") + + kubectl \ + --namespace "${namespace:?}" \ + get event \ + --field-selector "involvedObject.name=${sharename:?}-testpod" + + done + + > ---- ---- + > Share [nch] + > ---- + > LAST SEEN TYPE REASON OBJECT MESSAGE + > Normal Scheduled pod/aglais-user-nch-testpod Successfully assigned aglais-20210125/aglais-user-nch-testpod to aglais-20210125-cluster-bf3en5lv3e6a-node-0 + > 6m1s Normal Pulling pod/aglais-user-nch-testpod Pulling image "fedora:latest" + > 5m54s Normal Pulled pod/aglais-user-nch-testpod Successfully pulled image "fedora:latest" + > 5m53s Normal Created pod/aglais-user-nch-testpod Created container aglais-user-nch-container + > 5m53s Normal Started pod/aglais-user-nch-testpod Started container aglais-user-nch-container + > + > ---- ---- + > Share [zrq] + > ---- + > LAST SEEN TYPE REASON OBJECT MESSAGE + > Normal Scheduled pod/aglais-user-zrq-testpod Successfully assigned aglais-20210125/aglais-user-zrq-testpod to aglais-20210125-cluster-bf3en5lv3e6a-node-2 + > 5m50s Warning FailedMount pod/aglais-user-zrq-testpod MountVolume.MountDevice failed for volume "aglais-user-zrq-volume" : rpc error: code = Internal desc = failed to retrieve share ff351afd-1f06-4d02-9f53-cbe20b0676cc: Request forbidden: [GET https://cumulus.openstack.hpc.cam.ac.uk:8786/v2/08e24c6d87f94740aa59c172462ed927/shares/ff351afd-1f06-4d02-9f53-cbe20b0676cc], error message: {"forbidden": {"message": "Policy doesn't allow share:get to be performed.", "code": 403}} + > 100s Warning FailedMount pod/aglais-user-zrq-testpod MountVolume.MountDevice failed for volume "aglais-user-zrq-volume" : rpc error: code = InvalidArgument desc = stage secrets cannot be nil or empty + > 3m48s Warning FailedMount pod/aglais-user-zrq-testpod Unable to attach or mount volumes: unmounted 
volumes=[test-data], unattached volumes=[test-data local-data default-token-4zmrq]: timed out waiting for the condition + > 90s Warning FailedMount pod/aglais-user-zrq-testpod Unable to attach or mount volumes: unmounted volumes=[test-data], unattached volumes=[default-token-4zmrq test-data local-data]: timed out waiting for the condition + > + > ---- ---- + > Share [stv] + > ---- + > LAST SEEN TYPE REASON OBJECT MESSAGE + > Normal Scheduled pod/aglais-user-stv-testpod Successfully assigned aglais-20210125/aglais-user-stv-testpod to aglais-20210125-cluster-bf3en5lv3e6a-node-0 + > 5m38s Warning FailedMount pod/aglais-user-stv-testpod MountVolume.MountDevice failed for volume "aglais-user-stv-volume" : rpc error: code = Internal desc = failed to retrieve share fe63568a-d90c-4fb0-8979-07504328809d: Request forbidden: [GET https://cumulus.openstack.hpc.cam.ac.uk:8786/v2/08e24c6d87f94740aa59c172462ed927/shares/fe63568a-d90c-4fb0-8979-07504328809d], error message: {"forbidden": {"message": "Policy doesn't allow share:get to be performed.", "code": 403}} + > 88s Warning FailedMount pod/aglais-user-stv-testpod MountVolume.MountDevice failed for volume "aglais-user-stv-volume" : rpc error: code = InvalidArgument desc = stage secrets cannot be nil or empty + > 3m36s Warning FailedMount pod/aglais-user-stv-testpod Unable to attach or mount volumes: unmounted volumes=[test-data], unattached volumes=[default-token-4zmrq test-data local-data]: timed out waiting for the condition + > 81s Warning FailedMount pod/aglais-user-stv-testpod Unable to attach or mount volumes: unmounted volumes=[test-data], unattached volumes=[local-data default-token-4zmrq test-data]: timed out waiting for the condition + + # + # One worked, the rest failed. + # Same error rmessage, but different to the data shares. + # + +# ----------------------------------------------------- +# ----------------------------------------------------- + + Looking at the Horizon GUI, the failed user shares were not public. + Updated the share properties, making all of them public. + +# ----------------------------------------------------- +# ----------------------------------------------------- +# Delete everything. +#[root@kubernator] + + /openstack/bin/delete-all.sh \ + "${cloudname:?}" + + > .... + > .... + +# ----------------------------------------------------- +# Create everything. +#[root@kubernator] + + /kubernetes/bin/create-all.sh + + > .... + > .... + + +# ----------------------------------------------------- +# Check the results. +#[root@kubernator] + + cat '/tmp/aglais-status.yml' + + > aglais: + > status: + > deployment: + > type: kubernetes + > name: aglais-20210125 + > date: 20210125:140610 + > openstack: + > cloudname: gaia-dev + > magnum: + > uuid: befc7a6f-57fd-4a8f-94a6-3694d20229b9 + > kubernetes: + > namespace: aglais-20210125 + > ingress: + > dashboard: + > hostname: dashboard.metagrid.xyz + > ipv4: + > zeppelin: + > hostname: zeppelin.metagrid.xyz + > ipv4: null + + +# ----------------------------------------------------- +# Get the cluster ID and K8s namespace. 
+#[root@kubernator] + + magnumid=$( + yq read '/tmp/aglais-status.yml' 'aglais.status.openstack.magnum.uuid' + ) + + namespace=$( + yq read '/tmp/aglais-status.yml' 'aglais.status.kubernetes.namespace' + ) + +cat << EOF +Magnum uuid [${magnumid}] +Name space [${namespace}] +EOF + + > Magnum uuid [befc7a6f-57fd-4a8f-94a6-3694d20229b9] + > Name space [aglais-20210125] + + +# ----------------------------------------------------- +# Check the test pod events for the user shares. +#[root@kubernator] + + sharelist='/common/manila/usershares.yaml' + + for shareid in $( + yq read "${sharelist:?}" 'shares.[*].id' + ) + do + echo "" + echo "---- ----" + echo "Share [${shareid:?}]" + echo "----" + + sharename=$(yq read "${sharelist:?}" "shares.(id==${shareid:?}).sharename") + mountpath=$(yq read "${sharelist:?}" "shares.(id==${shareid:?}).mountpath") + + kubectl \ + --namespace "${namespace:?}" \ + get event \ + --field-selector "involvedObject.name=${sharename:?}-testpod" + + done + + > ---- ---- + > Share [nch] + > ---- + > LAST SEEN TYPE REASON OBJECT MESSAGE + > Normal Scheduled pod/aglais-user-nch-testpod Successfully assigned aglais-20210125/aglais-user-nch-testpod to aglais-20210125-cluster-cmauzltjts5o-node-0 + > 2m22s Normal Pulling pod/aglais-user-nch-testpod Pulling image "fedora:latest" + > 2m15s Normal Pulled pod/aglais-user-nch-testpod Successfully pulled image "fedora:latest" + > 2m14s Normal Created pod/aglais-user-nch-testpod Created container aglais-user-nch-container + > 2m14s Normal Started pod/aglais-user-nch-testpod Started container aglais-user-nch-container + > + > ---- ---- + > Share [zrq] + > ---- + > LAST SEEN TYPE REASON OBJECT MESSAGE + > Normal Scheduled pod/aglais-user-zrq-testpod Successfully assigned aglais-20210125/aglais-user-zrq-testpod to aglais-20210125-cluster-cmauzltjts5o-node-2 + > 2m9s Normal Pulling pod/aglais-user-zrq-testpod Pulling image "fedora:latest" + > 2m1s Normal Pulled pod/aglais-user-zrq-testpod Successfully pulled image "fedora:latest" + > 2m1s Normal Created pod/aglais-user-zrq-testpod Created container aglais-user-zrq-container + > 2m1s Normal Started pod/aglais-user-zrq-testpod Started container aglais-user-zrq-container + > + > ---- ---- + > Share [stv] + > ---- + > LAST SEEN TYPE REASON OBJECT MESSAGE + > Normal Scheduled pod/aglais-user-stv-testpod Successfully assigned aglais-20210125/aglais-user-stv-testpod to aglais-20210125-cluster-cmauzltjts5o-node-2 + > 116s Normal Pulling pod/aglais-user-stv-testpod Pulling image "fedora:latest" + > 112s Normal Pulled pod/aglais-user-stv-testpod Successfully pulled image "fedora:latest" + > 112s Normal Created pod/aglais-user-stv-testpod Created container aglais-user-stv-container + > 112s Normal Started pod/aglais-user-stv-testpod Started container aglais-user-stv-container + + # + # OK - so they all worked. + # Needed to make them all public. + # + + +# ----------------------------------------------------- +# Check the test pod events for the data shares. 
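# (*) The user shares were made public by hand in the Horizon GUI above.
#     A CLI equivalent would be something along these lines - a hedged sketch,
#     assuming the python-manilaclient is available and using one of the share
#     UUIDs from the earlier error messages:
#
#       manila update 'ff351afd-1f06-4d02-9f53-cbe20b0676cc' --is-public True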
+#[root@kubernator] + + sharelist='/common/manila/datashares.yaml' + + for shareid in $( + yq read "${sharelist:?}" 'shares.[*].id' + ) + do + echo "" + echo "---- ----" + echo "Share [${shareid:?}]" + echo "----" + + sharename=$(yq read "${sharelist:?}" "shares.(id==${shareid:?}).sharename") + mountpath=$(yq read "${sharelist:?}" "shares.(id==${shareid:?}).mountpath") + + kubectl \ + --namespace "${namespace:?}" \ + get event \ + --field-selector "involvedObject.name=${sharename:?}-testpod" + + done + + + > ---- ---- + > Share [GDR2] + > ---- + > LAST SEEN TYPE REASON OBJECT MESSAGE + > Normal Scheduled pod/aglais-gaia-dr2-testpod Successfully assigned aglais-20210125/aglais-gaia-dr2-testpod to aglais-20210125-cluster-cmauzltjts5o-node-0 + > 2m7s Warning FailedMount pod/aglais-gaia-dr2-testpod MountVolume.MountDevice failed for volume "aglais-gaia-dr2-volume" : kubernetes.io/csi: attacher.MountDevice failed to create newCsiDriverClient: driver name cephfs.manila.csi.openstack.org not found in the list of registered CSI drivers + > 6s Warning FailedMount pod/aglais-gaia-dr2-testpod MountVolume.MountDevice failed for volume "aglais-gaia-dr2-volume" : rpc error: code = Internal desc = chmod /var/lib/kubelet/plugins/kubernetes.io/csi/pv/aglais-gaia-dr2-volume/globalmount: permission denied + > 12s Warning FailedMount pod/aglais-gaia-dr2-testpod Unable to attach or mount volumes: unmounted volumes=[test-data], unattached volumes=[local-data default-token-vc7cp test-data]: timed out waiting for the condition + > + > ---- ---- + > Share [GEDR3] + > ---- + > LAST SEEN TYPE REASON OBJECT MESSAGE + > Normal Scheduled pod/aglais-gaia-edr3-testpod Successfully assigned aglais-20210125/aglais-gaia-edr3-testpod to aglais-20210125-cluster-cmauzltjts5o-node-0 + > 57s Warning FailedMount pod/aglais-gaia-edr3-testpod MountVolume.MountDevice failed for volume "aglais-gaia-edr3-volume" : rpc error: code = Internal desc = chmod /var/lib/kubelet/plugins/kubernetes.io/csi/pv/aglais-gaia-edr3-volume/globalmount: permission denied + > + > ---- ---- + > Share [ALLWISE] + > ---- + > LAST SEEN TYPE REASON OBJECT MESSAGE + > Normal Scheduled pod/aglais-wise-allwise-testpod Successfully assigned aglais-20210125/aglais-wise-allwise-testpod to aglais-20210125-cluster-cmauzltjts5o-node-1 + > 44s Warning FailedMount pod/aglais-wise-allwise-testpod MountVolume.MountDevice failed for volume "aglais-wise-allwise-volume" : rpc error: code = Internal desc = chmod /var/lib/kubelet/plugins/kubernetes.io/csi/pv/aglais-wise-allwise-volume/globalmount: permission denied + > + > ---- ---- + > Share [PS1] + > ---- + > LAST SEEN TYPE REASON OBJECT MESSAGE + > Normal Scheduled pod/aglais-panstarrs-dr1-testpod Successfully assigned aglais-20210125/aglais-panstarrs-dr1-testpod to aglais-20210125-cluster-cmauzltjts5o-node-1 + > 32s Warning FailedMount pod/aglais-panstarrs-dr1-testpod MountVolume.MountDevice failed for volume "aglais-panstarrs-dr1-volume" : rpc error: code = Internal desc = chmod /var/lib/kubelet/plugins/kubernetes.io/csi/pv/aglais-panstarrs-dr1-volume/globalmount: permission denied + > + > ---- ---- + > Share [2MASS] + > ---- + > LAST SEEN TYPE REASON OBJECT MESSAGE + > Normal Scheduled pod/aglais-twomass-allsky-testpod Successfully assigned aglais-20210125/aglais-twomass-allsky-testpod to aglais-20210125-cluster-cmauzltjts5o-node-3 + > 20s Warning FailedMount pod/aglais-twomass-allsky-testpod MountVolume.MountDevice failed for volume "aglais-twomass-allsky-volume" : rpc error: code = Internal desc = chmod 
/var/lib/kubelet/plugins/kubernetes.io/csi/pv/aglais-twomass-allsky-volume/globalmount: permission denied + + + # + # First one is a glitch - the rest are as before. + # + + # + # Difference between user shares and data shares ? + # We create the data shares as ro and user shares as rw. + # The cephfs-mount script the read/write mode to select the access rule. + # ... but due to earlier issues with cephfs-csi, we always mount using ReadWriteMany. + # Hence the 'permission denied' errors ? + # + # + + # + # Edit the create all script to mount them as rw. + # + + +# ----------------------------------------------------- +# ----------------------------------------------------- +# Delete everything. +#[root@kubernator] + + /openstack/bin/delete-all.sh \ + "${cloudname:?}" + + > .... + > .... + +# ----------------------------------------------------- +# Create everything. +#[root@kubernator] + + /kubernetes/bin/create-all.sh + + > .... + > .... + + +# ----------------------------------------------------- +# Check the results. +#[root@kubernator] + + cat '/tmp/aglais-status.yml' + + > aglais: + > status: + > deployment: + > type: kubernetes + > name: aglais-20210125 + > date: 20210125:145052 + > openstack: + > cloudname: gaia-dev + > magnum: + > uuid: 96fc649b-f5be-4ac0-8293-59a4ffdf4e97 + > kubernetes: + > namespace: aglais-20210125 + > ingress: + > dashboard: + > hostname: dashboard.metagrid.xyz + > ipv4: + > zeppelin: + > hostname: zeppelin.metagrid.xyz + > ipv4: null + +# ----------------------------------------------------- +# Get the cluster ID and K8s namespace. +#[root@kubernator] + + magnumid=$( + yq read '/tmp/aglais-status.yml' 'aglais.status.openstack.magnum.uuid' + ) + + namespace=$( + yq read '/tmp/aglais-status.yml' 'aglais.status.kubernetes.namespace' + ) + +cat << EOF +Magnum uuid [${magnumid}] +Name space [${namespace}] +EOF + + > Magnum uuid [96fc649b-f5be-4ac0-8293-59a4ffdf4e97] + > Name space [aglais-20210125] + + +# ----------------------------------------------------- +# Check the test pod events for the user shares. 
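# (*) The access level behind each mount can also be checked from the CLI.
#     A hedged sketch, assuming the python-manilaclient and the GDR2 share name:
#
#       manila access-list aglais-gaia-dr2
#
#     An 'ro' access rule combined with a ReadWriteMany CSI mount is what was
#     producing the chmod 'permission denied' errors above.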
+#[root@kubernator] + + sharelist='/common/manila/usershares.yaml' + + for shareid in $( + yq read "${sharelist:?}" 'shares.[*].id' + ) + do + echo "" + echo "---- ----" + echo "Share [${shareid:?}]" + echo "----" + + sharename=$(yq read "${sharelist:?}" "shares.(id==${shareid:?}).sharename") + mountpath=$(yq read "${sharelist:?}" "shares.(id==${shareid:?}).mountpath") + + kubectl \ + --namespace "${namespace:?}" \ + get event \ + --field-selector "involvedObject.name=${sharename:?}-testpod" + + done + + > ---- ---- + > Share [nch] + > ---- + > LAST SEEN TYPE REASON OBJECT MESSAGE + > Normal Scheduled pod/aglais-user-nch-testpod Successfully assigned aglais-20210125/aglais-user-nch-testpod to aglais-20210125-cluster-jbncmdarhg4l-node-2 + > 9m5s Normal Pulling pod/aglais-user-nch-testpod Pulling image "fedora:latest" + > 9m2s Normal Pulled pod/aglais-user-nch-testpod Successfully pulled image "fedora:latest" + > 9m2s Normal Created pod/aglais-user-nch-testpod Created container aglais-user-nch-container + > 9m2s Normal Started pod/aglais-user-nch-testpod Started container aglais-user-nch-container + > + > ---- ---- + > Share [zrq] + > ---- + > LAST SEEN TYPE REASON OBJECT MESSAGE + > Normal Scheduled pod/aglais-user-zrq-testpod Successfully assigned aglais-20210125/aglais-user-zrq-testpod to aglais-20210125-cluster-jbncmdarhg4l-node-3 + > 8m53s Normal Pulling pod/aglais-user-zrq-testpod Pulling image "fedora:latest" + > 8m49s Normal Pulled pod/aglais-user-zrq-testpod Successfully pulled image "fedora:latest" + > 8m49s Normal Created pod/aglais-user-zrq-testpod Created container aglais-user-zrq-container + > 8m49s Normal Started pod/aglais-user-zrq-testpod Started container aglais-user-zrq-container + > + > ---- ---- + > Share [stv] + > ---- + > LAST SEEN TYPE REASON OBJECT MESSAGE + > Normal Scheduled pod/aglais-user-stv-testpod Successfully assigned aglais-20210125/aglais-user-stv-testpod to aglais-20210125-cluster-jbncmdarhg4l-node-1 + > 8m39s Normal Pulling pod/aglais-user-stv-testpod Pulling image "fedora:latest" + > 8m36s Normal Pulled pod/aglais-user-stv-testpod Successfully pulled image "fedora:latest" + > 8m36s Normal Created pod/aglais-user-stv-testpod Created container aglais-user-stv-container + > 8m36s Normal Started pod/aglais-user-stv-testpod Started container aglais-user-stv-container + + +# ----------------------------------------------------- +# Check the test pod events for the data shares. 
+#[root@kubernator] + + sharelist='/common/manila/datashares.yaml' + + for shareid in $( + yq read "${sharelist:?}" 'shares.[*].id' + ) + do + echo "" + echo "---- ----" + echo "Share [${shareid:?}]" + echo "----" + + sharename=$(yq read "${sharelist:?}" "shares.(id==${shareid:?}).sharename") + mountpath=$(yq read "${sharelist:?}" "shares.(id==${shareid:?}).mountpath") + + kubectl \ + --namespace "${namespace:?}" \ + get event \ + --field-selector "involvedObject.name=${sharename:?}-testpod" + + done + + > ---- ---- + > Share [GDR2] + > ---- + > LAST SEEN TYPE REASON OBJECT MESSAGE + > Normal Scheduled pod/aglais-gaia-dr2-testpod Successfully assigned aglais-20210125/aglais-gaia-dr2-testpod to aglais-20210125-cluster-jbncmdarhg4l-node-1 + > 10m Warning FailedMount pod/aglais-gaia-dr2-testpod MountVolume.MountDevice failed for volume "aglais-gaia-dr2-volume" : kubernetes.io/csi: attacher.MountDevice failed to create newCsiDriverClient: driver name cephfs.manila.csi.openstack.org not found in the list of registered CSI drivers + > 10m Normal Pulling pod/aglais-gaia-dr2-testpod Pulling image "fedora:latest" + > 10m Normal Pulled pod/aglais-gaia-dr2-testpod Successfully pulled image "fedora:latest" + > 10m Normal Created pod/aglais-gaia-dr2-testpod Created container aglais-gaia-dr2-container + > 10m Normal Started pod/aglais-gaia-dr2-testpod Started container aglais-gaia-dr2-container + > + > ---- ---- + > Share [GEDR3] + > ---- + > LAST SEEN TYPE REASON OBJECT MESSAGE + > Normal Scheduled pod/aglais-gaia-edr3-testpod Successfully assigned aglais-20210125/aglais-gaia-edr3-testpod to aglais-20210125-cluster-jbncmdarhg4l-node-2 + > 10m Normal Pulling pod/aglais-gaia-edr3-testpod Pulling image "fedora:latest" + > 10m Normal Pulled pod/aglais-gaia-edr3-testpod Successfully pulled image "fedora:latest" + > 10m Normal Created pod/aglais-gaia-edr3-testpod Created container aglais-gaia-edr3-container + > 10m Normal Started pod/aglais-gaia-edr3-testpod Started container aglais-gaia-edr3-container + > + > ---- ---- + > Share [ALLWISE] + > ---- + > LAST SEEN TYPE REASON OBJECT MESSAGE + > Normal Scheduled pod/aglais-wise-allwise-testpod Successfully assigned aglais-20210125/aglais-wise-allwise-testpod to aglais-20210125-cluster-jbncmdarhg4l-node-0 + > 10m Normal Pulling pod/aglais-wise-allwise-testpod Pulling image "fedora:latest" + > 9m53s Normal Pulled pod/aglais-wise-allwise-testpod Successfully pulled image "fedora:latest" + > 9m52s Normal Created pod/aglais-wise-allwise-testpod Created container aglais-wise-allwise-container + > 9m52s Normal Started pod/aglais-wise-allwise-testpod Started container aglais-wise-allwise-container + > + > ---- ---- + > Share [PS1] + > ---- + > LAST SEEN TYPE REASON OBJECT MESSAGE + > Normal Scheduled pod/aglais-panstarrs-dr1-testpod Successfully assigned aglais-20210125/aglais-panstarrs-dr1-testpod to aglais-20210125-cluster-jbncmdarhg4l-node-0 + > 9m48s Normal Pulling pod/aglais-panstarrs-dr1-testpod Pulling image "fedora:latest" + > 9m45s Normal Pulled pod/aglais-panstarrs-dr1-testpod Successfully pulled image "fedora:latest" + > 9m45s Normal Created pod/aglais-panstarrs-dr1-testpod Created container aglais-panstarrs-dr1-container + > 9m45s Normal Started pod/aglais-panstarrs-dr1-testpod Started container aglais-panstarrs-dr1-container + > + > ---- ---- + > Share [2MASS] + > ---- + > LAST SEEN TYPE REASON OBJECT MESSAGE + > Normal Scheduled pod/aglais-twomass-allsky-testpod Successfully assigned aglais-20210125/aglais-twomass-allsky-testpod to 
aglais-20210125-cluster-jbncmdarhg4l-node-3 + > 9m37s Normal Pulling pod/aglais-twomass-allsky-testpod Pulling image "fedora:latest" + > 9m29s Normal Pulled pod/aglais-twomass-allsky-testpod Successfully pulled image "fedora:latest" + > 9m28s Normal Created pod/aglais-twomass-allsky-testpod Created container aglais-twomass-allsky-container + + # + # So the failed mounts were due to a combination of two things. + # Due to known issue with Cephfs CSI plugin, credentials don't work for anything other than ReadWriteMany. + # We were trying to use an Openstack 'ro' access rule to mount a CSI volume as 'ReadWriteMany'. + # => permission error + # + # We were trying to access a Openstack Manila share that wasn't public. + # => stage secrets cannot be nil or empty + # + + +# ----------------------------------------------------- +# Check the CSI volumes, claims and testpods for the data volumes. +#[root@kubernator] + + sharelist='/common/manila/datashares.yaml' + + for shareid in $( + yq read "${sharelist:?}" 'shares.[*].id' + ) + do + echo "" + echo "---- ----" + echo "Share [${shareid:?}]" + + sharename=$(yq read "${sharelist:?}" "shares.(id==${shareid:?}).sharename") + mountpath=$(yq read "${sharelist:?}" "shares.(id==${shareid:?}).mountpath") + + podphase=$( + kubectl \ + --namespace "${namespace:?}" \ + get pod \ + --output json \ + "${sharename:?}-testpod" \ + | jq -r '.status.phase' + ) + + volphase=$( + kubectl \ + --namespace "${namespace:?}" \ + get PersistentVolume \ + --output json \ + "${sharename:?}-volume" \ + | jq -r '.status.phase' + ) + + claimphase=$( + kubectl \ + --namespace "${namespace:?}" \ + get PersistentVolumeClaim \ + --output json \ + "${sharename:?}-claim" \ + | jq -r '.status.phase' + ) + + echo "Testpod [${podphase}]" + echo "Volume [${volphase}]" + echo "Claim [${claimphase}]" + + yq write \ + --inplace \ + '/tmp/aglais-status.yml' \ + "aglais.status.kubernetes.csi-manila.${sharename:?}.testpod" \ + "${podphase}" + + yq write \ + --inplace \ + '/tmp/aglais-status.yml' \ + "aglais.status.kubernetes.csi-manila.${sharename:?}.volume" \ + "${volphase}" + + yq write \ + --inplace \ + '/tmp/aglais-status.yml' \ + "aglais.status.kubernetes.csi-manila.${sharename:?}.claim" \ + "${claimphase}" + + echo "----" + kubectl \ + --namespace "${namespace:?}" \ + exec \ + --tty \ + --stdin \ + "${sharename:?}-testpod" \ + -- \ + /usr/bin/df -h "${mountpath:?}" + echo "----" + + done + + > ---- ---- + > Share [GDR2] + > Testpod [Running] + > Volume [Bound] + > Claim [Bound] + > ---- + > Filesystem Size Used Avail Use% Mounted on + > ceph-fuse 512G 473G 40G 93% /data/gaia/dr2 + > ---- + > + > ---- ---- + > Share [GEDR3] + > Testpod [Running] + > Volume [Bound] + > Claim [Bound] + > ---- + > Filesystem Size Used Avail Use% Mounted on + > ceph-fuse 540G 533G 7.9G 99% /data/gaia/edr3 + > ---- + > + > ---- ---- + > Share [ALLWISE] + > Testpod [Running] + > Volume [Bound] + > Claim [Bound] + > ---- + > Filesystem Size Used Avail Use% Mounted on + > ceph-fuse 350G 341G 9.9G 98% /data/wise/allwise + > ---- + > + > ---- ---- + > Share [PS1] + > Testpod [Running] + > Volume [Bound] + > Claim [Bound] + > ---- + > Filesystem Size Used Avail Use% Mounted on + > ceph-fuse 300G 270G 31G 90% /data/panstarrs/dr1 + > ---- + > + > ---- ---- + > Share [2MASS] + > Testpod [Running] + > Volume [Bound] + > Claim [Bound] + > ---- + > Filesystem Size Used Avail Use% Mounted on + > ceph-fuse 40G 37G 3.5G 92% /data/twomass/allsky + > ---- + + +# ----------------------------------------------------- +# Check 
the CSI volumes, claims and testpods for the user volumes. +#[root@kubernator] + + sharelist='/common/manila/usershares.yaml' + + for shareid in $( + yq read "${sharelist:?}" 'shares.[*].id' + ) + do + echo "" + echo "---- ----" + echo "Share [${shareid:?}]" + + sharename=$(yq read "${sharelist:?}" "shares.(id==${shareid:?}).sharename") + mountpath=$(yq read "${sharelist:?}" "shares.(id==${shareid:?}).mountpath") + + podphase=$( + kubectl \ + --namespace "${namespace:?}" \ + get pod \ + --output json \ + "${sharename:?}-testpod" \ + | jq -r '.status.phase' + ) + + volphase=$( + kubectl \ + --namespace "${namespace:?}" \ + get PersistentVolume \ + --output json \ + "${sharename:?}-volume" \ + | jq -r '.status.phase' + ) + + claimphase=$( + kubectl \ + --namespace "${namespace:?}" \ + get PersistentVolumeClaim \ + --output json \ + "${sharename:?}-claim" \ + | jq -r '.status.phase' + ) + + echo "Testpod [${podphase}]" + echo "Volume [${volphase}]" + echo "Claim [${claimphase}]" + + yq write \ + --inplace \ + '/tmp/aglais-status.yml' \ + "aglais.status.kubernetes.csi-manila.${sharename:?}.testpod" \ + "${podphase}" + + yq write \ + --inplace \ + '/tmp/aglais-status.yml' \ + "aglais.status.kubernetes.csi-manila.${sharename:?}.volume" \ + "${volphase}" + + yq write \ + --inplace \ + '/tmp/aglais-status.yml' \ + "aglais.status.kubernetes.csi-manila.${sharename:?}.claim" \ + "${claimphase}" + + echo "----" + kubectl \ + --namespace "${namespace:?}" \ + exec \ + --tty \ + --stdin \ + "${sharename:?}-testpod" \ + -- \ + /usr/bin/df -h "${mountpath:?}" + echo "----" + + done + + > ---- ---- + > Share [nch] + > Testpod [Running] + > Volume [Bound] + > Claim [Bound] + > ---- + > Filesystem Size Used Avail Use% Mounted on + > ceph-fuse 10T 4.9T 5.2T 49% /user/nch + > ---- + > + > ---- ---- + > Share [zrq] + > Testpod [Running] + > Volume [Bound] + > Claim [Bound] + > ---- + > Filesystem Size Used Avail Use% Mounted on + > ceph-fuse 1.0T 30G 995G 3% /user/zrq + > ---- + > + > ---- ---- + > Share [stv] + > Testpod [Running] + > Volume [Bound] + > Claim [Bound] + > ---- + > Filesystem Size Used Avail Use% Mounted on + > ceph-fuse 1.0T 0 1.0T 0% /user/stv + > ---- + + +# ----------------------------------------------------- +# Check our results +#[root@kubernator] + + cat /tmp/aglais-status.yml + + > aglais: + > status: + > deployment: + > type: kubernetes + > name: aglais-20210125 + > date: 20210125:145052 + > openstack: + > cloudname: gaia-dev + > magnum: + > uuid: 96fc649b-f5be-4ac0-8293-59a4ffdf4e97 + > kubernetes: + > namespace: aglais-20210125 + > ingress: + > dashboard: + > hostname: dashboard.metagrid.xyz + > ipv4: + > zeppelin: + > hostname: zeppelin.metagrid.xyz + > ipv4: null + > csi-manila: + > aglais-gaia-dr2: + > testpod: Running + > volume: Bound + > claim: Bound + > aglais-gaia-edr3: + > testpod: Running + > volume: Bound + > claim: Bound + > aglais-wise-allwise: + > testpod: Running + > volume: Bound + > claim: Bound + > aglais-panstarrs-dr1: + > testpod: Running + > volume: Bound + > claim: Bound + > aglais-twomass-allsky: + > testpod: Running + > volume: Bound + > claim: Bound + > aglais-user-nch: + > testpod: Running + > volume: Bound + > claim: Bound + > aglais-user-zrq: + > testpod: Running + > volume: Bound + > claim: Bound + > aglais-user-stv: + > testpod: Running + > volume: Bound + + diff --git a/notes/zrq/20210127-01-kubernetes-deploy.txt b/notes/zrq/20210127-01-kubernetes-deploy.txt new file mode 100644 index 00000000..648ee825 --- /dev/null +++ 
b/notes/zrq/20210127-01-kubernetes-deploy.txt @@ -0,0 +1,465 @@ +# +# +# +# Copyright (c) 2021, ROE (http://www.roe.ac.uk/) +# +# This information is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This information is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +# +# +#zrq-notes-time +#zrq-notes-indent +#zrq-notes-crypto +#zrq-notes-ansible +#zrq-notes-osformat +#zrq-notes-zeppelin +# + + Target: + + Run the Kubernetes deployment. + + Results: + + Success. + + TODO: + + Store aglais-status in a persistent volume so it can be re-mounted. + Update our DNS records. + + +# ----------------------------------------------------- +# Update the Openstack cloud name. +#[user@desktop] + + cloudname=gaia-dev + + sed -i ' + s/^\(AGLAIS_CLOUD\)=.*$/\1='${cloudname:?}'/ + ' "${HOME}/aglais.env" + +# ----------------------------------------------------- +# Create a container to work with. +# (*) extra volume mount for /common +# (*) mount kubernetes directory as read/write +#[user@desktop] + + source "${HOME:?}/aglais.env" + + podman run \ + --rm \ + --tty \ + --interactive \ + --name kubernator \ + --hostname kubernator \ + --env "SSH_AUTH_SOCK=/mnt/ssh_auth_sock" \ + --volume "${SSH_AUTH_SOCK}:/mnt/ssh_auth_sock:rw,z" \ + --env "cloudname=${AGLAIS_CLOUD:?}" \ + --volume "${HOME:?}/clouds.yaml:/etc/openstack/clouds.yaml:ro,z" \ + --volume "${AGLAIS_CODE:?}/experiments/common:/common:ro,z" \ + --volume "${AGLAIS_CODE:?}/experiments/openstack:/openstack:ro,z" \ + --volume "${AGLAIS_CODE:?}/experiments/kubernetes:/kubernetes:rw,z" \ + atolmis/ansible-client:2020.12.02 \ + bash + + +# ----------------------------------------------------- +# Delete everything. +#[root@kubernator] + + /openstack/bin/delete-all.sh \ + "${cloudname:?}" + + > .... + > .... + + +# ----------------------------------------------------- +# Create our Aglais configuration. +#[root@kubernator] + +cat > '/tmp/aglais-config.yml' << EOF +aglais: + version: 1.0 + spec: + openstack: + cloudname: ${cloudname:?} + dashboard: + hostname: dashboard.metagrid.xyz + zeppelin: + hostname: zeppelin.metagrid.xyz + drupal: + hostname: drupal.metagrid.xyz +EOF + + +# ----------------------------------------------------- +# Create everything. +#[root@kubernator] + + /kubernetes/bin/create-all.sh + + > .... + > .... + + +# ----------------------------------------------------- +# Check the results. +#[root@kubernator] + + cat '/tmp/aglais-status.yml' + + > aglais: + > status: + > deployment: + > type: kubernetes + > name: aglais-20210127 + > date: 20210127:050320 + > openstack: + > cloudname: gaia-dev + > magnum: + > uuid: 350c310b-f343-439f-b265-3b5ac7f9d903 + > kubernetes: + > namespace: aglais-20210127 + > ingress: + > dashboard: + > hostname: dashboard.metagrid.xyz + > ipv4: 128.232.227.236 + > zeppelin: + > hostname: zeppelin.metagrid.xyz + > ipv4: null + + +# ----------------------------------------------------- +# Get the cluster ID and K8s namespace. 
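# (*) The Zeppelin ingress ipv4 is still null at this point; it can be
#     re-captured later, once the load balancer address has been allocated.
#     Sketch, reusing the jq filter from the earlier notes:
#
#       kubectl --namespace "${namespace:?}" \
#           get Ingress \
#           --output json \
#           | jq -r '
#               .items[]
#               | select(.metadata.name == "zeppelin-server-ingress")
#               | .status.loadBalancer.ingress[0].ip
#               '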
+#[root@kubernator] + + magnumid=$( + yq read '/tmp/aglais-status.yml' 'aglais.status.openstack.magnum.uuid' + ) + + namespace=$( + yq read '/tmp/aglais-status.yml' 'aglais.status.kubernetes.namespace' + ) + +cat << EOF +Magnum uuid [${magnumid}] +Name space [${namespace}] +EOF + + > Magnum uuid [350c310b-f343-439f-b265-3b5ac7f9d903] + > Name space [aglais-20210127] + + +# ----------------------------------------------------- +# Check the CSI volumes, claims and testpods for the data volumes. +#[root@kubernator] + + sharelist='/common/manila/datashares.yaml' + + for shareid in $( + yq read "${sharelist:?}" 'shares.[*].id' + ) + do + echo "" + echo "---- ----" + echo "Share [${shareid:?}]" + + sharename=$(yq read "${sharelist:?}" "shares.(id==${shareid:?}).sharename") + mountpath=$(yq read "${sharelist:?}" "shares.(id==${shareid:?}).mountpath") + + podphase=$( + kubectl \ + --namespace "${namespace:?}" \ + get pod \ + --output json \ + "${sharename:?}-testpod" \ + | jq -r '.status.phase' + ) + + volphase=$( + kubectl \ + --namespace "${namespace:?}" \ + get PersistentVolume \ + --output json \ + "${sharename:?}-volume" \ + | jq -r '.status.phase' + ) + + claimphase=$( + kubectl \ + --namespace "${namespace:?}" \ + get PersistentVolumeClaim \ + --output json \ + "${sharename:?}-claim" \ + | jq -r '.status.phase' + ) + + echo "Testpod [${podphase}]" + echo "Volume [${volphase}]" + echo "Claim [${claimphase}]" + + yq write \ + --inplace \ + '/tmp/aglais-status.yml' \ + "aglais.status.kubernetes.csi-manila.${sharename:?}.testpod" \ + "${podphase}" + + yq write \ + --inplace \ + '/tmp/aglais-status.yml' \ + "aglais.status.kubernetes.csi-manila.${sharename:?}.volume" \ + "${volphase}" + + yq write \ + --inplace \ + '/tmp/aglais-status.yml' \ + "aglais.status.kubernetes.csi-manila.${sharename:?}.claim" \ + "${claimphase}" + + echo "----" + kubectl \ + --namespace "${namespace:?}" \ + exec \ + --tty \ + --stdin \ + "${sharename:?}-testpod" \ + -- \ + /usr/bin/df -h "${mountpath:?}" + echo "----" + + done + + > ---- ---- + > Share [GDR2] + > Testpod [Running] + > Volume [Bound] + > Claim [Bound] + > ---- + > Filesystem Size Used Avail Use% Mounted on + > ceph-fuse 512G 473G 40G 93% /data/gaia/dr2 + > ---- + > + > ---- ---- + > Share [GEDR3] + > Testpod [Running] + > Volume [Bound] + > Claim [Bound] + > ---- + > Filesystem Size Used Avail Use% Mounted on + > ceph-fuse 540G 533G 7.9G 99% /data/gaia/edr3 + > ---- + > + > ---- ---- + > Share [ALLWISE] + > Testpod [Running] + > Volume [Bound] + > Claim [Bound] + > ---- + > Filesystem Size Used Avail Use% Mounted on + > ceph-fuse 350G 341G 9.9G 98% /data/wise/allwise + > ---- + > + > ---- ---- + > Share [PS1] + > Testpod [Running] + > Volume [Bound] + > Claim [Bound] + > ---- + > Filesystem Size Used Avail Use% Mounted on + > ceph-fuse 300G 270G 31G 90% /data/panstarrs/dr1 + > ---- + > + > ---- ---- + > Share [2MASS] + > Testpod [Running] + > Volume [Bound] + > Claim [Bound] + > ---- + > Filesystem Size Used Avail Use% Mounted on + > ceph-fuse 40G 37G 3.5G 92% /data/twomass/allsky + > ---- + + +# ----------------------------------------------------- +# Check the CSI volumes, claims and testpods for the user volumes. 
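# (*) A quick cross-check before walking through the per-share loop below
#     (sketch): list the phase of every pod in the namespace in one go.
#
#       kubectl --namespace "${namespace:?}" \
#           get pods \
#           --output custom-columns=NAME:.metadata.name,PHASE:.status.phase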
+#[root@kubernator] + + sharelist='/common/manila/usershares.yaml' + + for shareid in $( + yq read "${sharelist:?}" 'shares.[*].id' + ) + do + echo "" + echo "---- ----" + echo "Share [${shareid:?}]" + + sharename=$(yq read "${sharelist:?}" "shares.(id==${shareid:?}).sharename") + mountpath=$(yq read "${sharelist:?}" "shares.(id==${shareid:?}).mountpath") + + podphase=$( + kubectl \ + --namespace "${namespace:?}" \ + get pod \ + --output json \ + "${sharename:?}-testpod" \ + | jq -r '.status.phase' + ) + + volphase=$( + kubectl \ + --namespace "${namespace:?}" \ + get PersistentVolume \ + --output json \ + "${sharename:?}-volume" \ + | jq -r '.status.phase' + ) + + claimphase=$( + kubectl \ + --namespace "${namespace:?}" \ + get PersistentVolumeClaim \ + --output json \ + "${sharename:?}-claim" \ + | jq -r '.status.phase' + ) + + echo "Testpod [${podphase}]" + echo "Volume [${volphase}]" + echo "Claim [${claimphase}]" + + yq write \ + --inplace \ + '/tmp/aglais-status.yml' \ + "aglais.status.kubernetes.csi-manila.${sharename:?}.testpod" \ + "${podphase}" + + yq write \ + --inplace \ + '/tmp/aglais-status.yml' \ + "aglais.status.kubernetes.csi-manila.${sharename:?}.volume" \ + "${volphase}" + + yq write \ + --inplace \ + '/tmp/aglais-status.yml' \ + "aglais.status.kubernetes.csi-manila.${sharename:?}.claim" \ + "${claimphase}" + + echo "----" + kubectl \ + --namespace "${namespace:?}" \ + exec \ + --tty \ + --stdin \ + "${sharename:?}-testpod" \ + -- \ + /usr/bin/df -h "${mountpath:?}" + echo "----" + + done + + > ---- ---- + > Share [nch] + > Testpod [Running] + > Volume [Bound] + > Claim [Bound] + > ---- + > Filesystem Size Used Avail Use% Mounted on + > ceph-fuse 10T 4.9T 5.2T 49% /user/nch + > ---- + > + > ---- ---- + > Share [zrq] + > Testpod [Running] + > Volume [Bound] + > Claim [Bound] + > ---- + > Filesystem Size Used Avail Use% Mounted on + > ceph-fuse 1.0T 30G 995G 3% /user/zrq + > ---- + > + > ---- ---- + > Share [stv] + > Testpod [Running] + > Volume [Bound] + > Claim [Bound] + > ---- + > Filesystem Size Used Avail Use% Mounted on + > ceph-fuse 1.0T 0 1.0T 0% /user/stv + > ---- + + +# ----------------------------------------------------- +# Check our results +#[root@kubernator] + + cat /tmp/aglais-status.yml + + > aglais: + > status: + > deployment: + > type: kubernetes + > name: aglais-20210127 + > date: 20210127:050320 + > openstack: + > cloudname: gaia-dev + > magnum: + > uuid: 350c310b-f343-439f-b265-3b5ac7f9d903 + > kubernetes: + > namespace: aglais-20210127 + > ingress: + > dashboard: + > hostname: dashboard.metagrid.xyz + > ipv4: 128.232.227.236 + > zeppelin: + > hostname: zeppelin.metagrid.xyz + > ipv4: null + > csi-manila: + > aglais-gaia-dr2: + > testpod: Running + > volume: Bound + > claim: Bound + > aglais-gaia-edr3: + > testpod: Running + > volume: Bound + > claim: Bound + > aglais-wise-allwise: + > testpod: Running + > volume: Bound + > claim: Bound + > aglais-panstarrs-dr1: + > testpod: Running + > volume: Bound + > claim: Bound + > aglais-twomass-allsky: + > testpod: Running + > volume: Bound + > claim: Bound + > aglais-user-nch: + > testpod: Running + > volume: Bound + > claim: Bound + > aglais-user-zrq: + > testpod: Running + > volume: Bound + > claim: Bound + > aglais-user-stv: + > testpod: Running + > volume: Bound + > claim: Bound + + diff --git a/notes/zrq/20210127-02-google-oauth-proxy.txt b/notes/zrq/20210127-02-google-oauth-proxy.txt new file mode 100644 index 00000000..23666347 --- /dev/null +++ b/notes/zrq/20210127-02-google-oauth-proxy.txt @@ -0,0 
+1,667 @@ +# +# +# +# Copyright (c) 2021, ROE (http://www.roe.ac.uk/) +# +# This information is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This information is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +# +# +#zrq-notes-time +#zrq-notes-indent +#zrq-notes-crypto +#zrq-notes-ansible +#zrq-notes-osformat +#zrq-notes-zeppelin +# + + Target: + + Install and test OAuthProxy with Google OAuth. + + Results: + + Success :-) + + Source: + + Automated K8s deployment. + notes/zrq/20210127-01-kubernetes-deploy.txt + + +# ----------------------------------------------------- +# Create a container to work with. +# (*) extra volume mount for /common +# (*) mount kubernetes directory as read/write +#[user@desktop] + + source "${HOME:?}/aglais.env" + + podman run \ + --rm \ + --tty \ + --interactive \ + --name kubernator \ + --hostname kubernator \ + --env "SSH_AUTH_SOCK=/mnt/ssh_auth_sock" \ + --volume "${SSH_AUTH_SOCK}:/mnt/ssh_auth_sock:rw,z" \ + --env "cloudname=${AGLAIS_CLOUD:?}" \ + --volume "${HOME:?}/clouds.yaml:/etc/openstack/clouds.yaml:ro,z" \ + --volume "${AGLAIS_CODE:?}/experiments/common:/common:ro,z" \ + --volume "${AGLAIS_CODE:?}/experiments/openstack:/openstack:ro,z" \ + --volume "${AGLAIS_CODE:?}/experiments/kubernetes:/kubernetes:rw,z" \ + atolmis/ansible-client:2020.12.02 \ + bash + + +# ----------------------------------------------------- +# Configure our secret function. +#[root@kubernator] + + mkdir "${HOME}/bin" + + cat > "${HOME}/bin/secret" << 'EOF' +ssh -n \ + 'Zarquan@data.metagrid.co.uk' \ + "bin/secret '${1}'" +EOF + + chmod a+x "${HOME}/bin/secret" + + secret frog + + # FAILs + +# ----------------------------------------------------- +# Test SSH access to the server. +#[root@kubernator] + + ssh -v Zarquan@data.metagrid.co.uk \ + ' + date + hostname + ' + + > .... + > .... + > debug1: Next authentication method: publickey + > debug1: Offering public key: /home/Zarquan/.ssh/zrq.metagrid.co.uk.rsa RSA SHA256:26sAWXfK3hzPzWHrZCqvhj6gKCkmbG/N2U9/AvZaHzI agent + > debug1: send_pubkey_test: no mutual signature algorithm + > .... + > .... + + +# ----------------------------------------------------- +# Allow RSA keys. +# https://dev.to/bowmanjd/upgrade-ssh-client-keys-and-remote-servers-after-fedora-33-s-new-crypto-policy-47ag +#[root@kubernator] + + cat >> "${HOME}/.ssh/config" << EOF +# Allow RSA keys. +# https://dev.to/bowmanjd/upgrade-ssh-client-keys-and-remote-servers-after-fedora-33-s-new-crypto-policy-47ag +PubkeyAcceptedKeyTypes +ssh-rsa +EOF + + +# ----------------------------------------------------- +# Test SSH access to the server. +#[root@kubernator] + + ssh -v Zarquan@data.metagrid.co.uk \ + ' + date + hostname + ' + + > .... + > .... 
+ > debug1: Next authentication method: publickey + > debug1: Offering public key: /home/Zarquan/.ssh/zrq.metagrid.co.uk.rsa RSA SHA256:26sAWXfK3hzPzWHrZCqvhj6gKCkmbG/N2U9/AvZaHzI agent + > debug1: Server accepts key: /home/Zarquan/.ssh/zrq.metagrid.co.uk.rsa RSA SHA256:26sAWXfK3hzPzWHrZCqvhj6gKCkmbG/N2U9/AvZaHzI agent + > debug1: Authentication succeeded (publickey). + > .... + > .... + + +# ----------------------------------------------------- +# Test the secret function. +#[root@kubernator] + + secret frog + + > Green Frog + + +# ----------------------------------------------------- +# Get the connection details the first cluster in the list. +#[root@kubernator] + + clusterid=$( + openstack \ + --os-cloud "${cloudname:?}" \ + coe cluster list \ + --format json \ + | jq -r '.[0] | .uuid' + ) + + '/kubernetes/bin/cluster-config.sh' \ + "${cloudname:?}" \ + "${clusterid:?}" + + kubectl \ + cluster-info + + > Kubernetes master is running at https://128.232.224.75:6443 + > Heapster is running at https://128.232.224.75:6443/api/v1/namespaces/kube-system/services/heapster/proxy + > CoreDNS is running at https://128.232.224.75:6443/api/v1/namespaces/kube-system/services/kube-dns:dns/proxy + + +# ----------------------------------------------------- +# Get the name of the 'aglais' namespace. +#[root@kubernator] + + namespace=$( + kubectl \ + get namespace \ + --output json \ + | jq -r '.items[] | .metadata.name | select(. | startswith("aglais"))' + ) + + echo "Namespace [${namespace}]" + + > Namespace [aglais-20210127] + + +# ----------------------------------------------------- +# Get a token for the dashboard account. +#[root@kubernator] + + secretname=$( + kubectl \ + --output json \ + --namespace "${namespace:?}" \ + get ServiceAccount \ + "aglais-dashboard-kubernetes-dashboard" \ + | jq -r '.secrets[0].name' + ) + + dashtoken=$( + kubectl \ + --output json \ + --namespace "${namespace:?}" \ + get Secret \ + "${secretname:?}" \ + | jq -r '.data.token | @base64d' + ) + + echo ${dashtoken:?} + + > .... + > .... + + +# ----------------------------------------------------- +# Check the ingress address. +#[root@kubernator] + + kubectl \ + --namespace "${namespace}" \ + get ingress + + > NAME HOSTS ADDRESS PORTS AGE + > aglais-dashboard-kubernetes-dashboard dashboard.metagrid.xyz 128.232.227.227 80 44m + > zeppelin-server-ingress zeppelin.metagrid.xyz 128.232.227.227 80, 443 42m + +# ----------------------------------------------------- +# ----------------------------------------------------- +# Update our internal and external DNS +#[user@dns-serer] + + aglais-001.metagrid.xyz A 128.232.227.227 + + vernon.metagrid.xyz CNAME aglais-001.metagrid.xyz. + + +# ----------------------------------------------------- +# Check the zeppelin interface. +#[root@kubernator] + + curl --head --insecure 'https://zeppelin.metagrid.xyz/' + + > HTTP/2 200 + > date: Wed, 27 Jan 2021 05:38:25 GMT + > .... + > .... + + +# ----------------------------------------------------- +# Check the dashboard interface. +#[root@kubernator] + + curl --head --insecure 'https://dashboard.metagrid.xyz/' + + > HTTP/2 200 + > date: Wed, 27 Jan 2021 05:38:59 GMT + > .... + > .... + + +# ----------------------------------------------------- +# Get a copy of the Kubernetes NGINX Ingress project. +# https://github.com/kubernetes/ingress-nginx.git +#[user@kubernator] + + dnf install -y git + + cd ${HOME} + git clone https://github.com/kubernetes/ingress-nginx.git + + > Cloning into 'ingress-nginx'... 
+ > remote: Enumerating objects: 99890, done. + > remote: Total 99890 (delta 0), reused 0 (delta 0), pack-reused 99890 + > Receiving objects: 100% (99890/99890), 114.20 MiB | 1.62 MiB/s, done. + > Resolving deltas: 100% (56462/56462), done. + + +# ----------------------------------------------------- +# Deploy a test HTTP service. +# https://github.com/kubernetes/ingress-nginx/blob/master/docs/examples/PREREQUISITES.md#test-http-service +#[user@kubernator] + + pushd "${HOME}/ingress-nginx" + pushd 'docs/examples' + + kubectl create \ + --filename http-svc.yaml + + popd + popd + + > deployment.apps/http-svc created + > service/http-svc created + + +# ----------------------------------------------------- +# Configure our OAuth settings. +#[user@kubernator] + + deployname=google + + deployhostname=vernon.metagrid.xyz + deployauthpath=agromulupt + deploycallback=https://${deployhostname:?}/${deployauthpath:?}/callback + + +# ----------------------------------------------------- +# Create our SSL keys and store them in a Kubernetes secret. +# https://github.com/kubernetes/ingress-nginx/blob/master/docs/examples/PREREQUISITES.md#tls-certificates +#[user@kubernator] + + dnf install -y openssl + + openssl req \ + -x509 \ + -sha256 \ + -nodes \ + -days 365 \ + -newkey rsa:2048 \ + -keyout /tmp/tls.key \ + -out /tmp/tls.crt \ + -subj "/CN=${deployhostname:?}/O=Aglais" + + > Generating a RSA private key + > ................................................+++++ + > .................................+++++ + > writing new private key to '/tmp/tls.key' + + + kubectl create secret \ + tls \ + ${deployname:?}-tls-secret \ + --key /tmp/tls.key \ + --cert /tmp/tls.crt + + > secret/google-tls-secret created + + +# ----------------------------------------------------- +# Deploy a TLS test Ingress +# https://github.com/kubernetes/ingress-nginx/tree/master/docs/examples/tls-termination#deployment +# https://github.com/kubernetes/ingress-nginx/tree/master/docs/examples/tls-termination#validation +#[user@kubernator] + + cat << EOF > /tmp/${deployname:?}-tls-test.yaml +apiVersion: networking.k8s.io/v1beta1 +kind: Ingress +metadata: + name: ${deployname:?}-tls-test +spec: + tls: + - hosts: + - ${deployhostname} + secretName: ${deployname:?}-tls-secret + rules: + - host: ${deployhostname} + http: + paths: + - path: /tls-test + backend: + serviceName: http-svc + servicePort: 80 +EOF + + kubectl apply \ + --filename /tmp/${deployname:?}-tls-test.yaml + + > ingress.networking.k8s.io/google-tls-test created + + + kubectl describe \ + Ingress ${deployname:?}-tls-test + + > Name: google-tls-test + > Namespace: default + > Address: 128.232.227.227 + > Default backend: default-http-backend:80 () + > TLS: + > google-tls-secret terminates vernon.metagrid.xyz + > Rules: + > Host Path Backends + > ---- ---- -------- + > vernon.metagrid.xyz + > /tls-test http-svc:80 (10.100.1.14:8080) + > Annotations: Events: + > Type Reason Age From Message + > ---- ------ ---- ---- ------- + > Normal CREATE 12s nginx-ingress-controller Ingress default/google-tls-test + > Normal UPDATE 5s nginx-ingress-controller Ingress default/google-tls-test + + + ingressip=$( + kubectl get \ + Ingress ${deployname:?}-tls-test \ + --output json \ + | jq -r '.status.loadBalancer.ingress[0].ip' + ) + + echo "Ingress [${ingressip:?}]" + + > Ingress [128.232.227.227] + + +# ----------------------------------------------------- +# ----------------------------------------------------- +# Update our internal and external DNS +#[user@dns-serer] + + 
aglais-001.metagrid.xyz A 128.232.227.227 + + vernon.metagrid.xyz CNAME aglais-001.metagrid.xyz. + + +# ----------------------------------------------------- +# ----------------------------------------------------- +# Test our SSL keys. +# https://github.com/kubernetes/ingress-nginx/tree/master/docs/examples/tls-termination#deployment +# https://github.com/kubernetes/ingress-nginx/tree/master/docs/examples/tls-termination#validation +#[user@kubernator] + + + curl --head "http://${ingressip:?}/tls-test" + + > HTTP/1.1 404 Not Found + > Date: Wed, 27 Jan 2021 06:09:36 GMT + > .... + > .... + + + curl --head "http://${deployhostname:?}/tls-test" + + > HTTP/1.1 308 Permanent Redirect + > Date: Wed, 27 Jan 2021 06:09:55 GMT + > .... + > .... + > Location: https://vernon.metagrid.xyz/tls-test + + + curl --head "https://${deployhostname:?}/tls-test" + + > curl: (60) SSL certificate problem: self signed certificate + > More details here: https://curl.haxx.se/docs/sslcerts.html + > .... + > .... + + + curl --insecure --head "https://${deployhostname:?}/tls-test" + + > HTTP/2 200 + > date: Wed, 27 Jan 2021 06:10:44 GMT + > content-type: text/plain + > strict-transport-security: max-age=15724800; includeSubDomains + + +# ----------------------------------------------------- +# Configure our Google secrets. +#[user@kubernator] + + dnf install -y python + + OAUTH2_CLIENT_IDENT=$( + secret google.amdar.id + ) + OAUTH2_CLIENT_SECRET=$( + secret google.amdar.secret + ) + OAUTH2_COOKIE_SECRET=$( + python -c 'import os,base64; print(base64.b64encode(os.urandom(16)).decode("ascii"))' + ) + + +# ----------------------------------------------------- +# Configure our oauth2_proxy Service. +#[user@kubernator] + + cat > /tmp/${deployname:?}-oauth-proxy.yaml << EOF +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + k8s-app: ${deployname:?}-oauth-proxy + name: ${deployname:?}-oauth-proxy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + k8s-app: ${deployname:?}-oauth-proxy + template: + metadata: + labels: + k8s-app: ${deployname:?}-oauth-proxy + spec: + containers: + - name: ${deployname:?}-oauth-proxy + image: quay.io/oauth2-proxy/oauth2-proxy:latest + imagePullPolicy: Always + ports: + - containerPort: 4180 + protocol: TCP + args: + - --provider=google + - --email-domain=* + - --http-address=0.0.0.0:4180 + - --proxy-prefix=/${deployauthpath:?} + - --set-xauthrequest=true + - --client-id=${OAUTH2_CLIENT_IDENT:?} + - --client-secret=${OAUTH2_CLIENT_SECRET:?} + - --cookie-secret=${OAUTH2_COOKIE_SECRET:?} +--- +apiVersion: v1 +kind: Service +metadata: + labels: + k8s-app: ${deployname:?}-oauth-proxy + name: ${deployname:?}-oauth-proxy + namespace: default +spec: + ports: + - name: http + port: 4180 + protocol: TCP + targetPort: 4180 + selector: + k8s-app: ${deployname:?}-oauth-proxy +EOF + + +# ----------------------------------------------------- +# Deploy the OAuth proxy. +#[user@kubernator] + + kubectl create \ + --filename /tmp/${deployname:?}-oauth-proxy.yaml + + > deployment.apps/google-oauth-proxy created + > service/google-oauth-proxy created + + +# ----------------------------------------------------- +# Configure the oauth_proxy Ingress. +#[user@kubernator] + + # + # WARNING The auth-url and auth-signin URLs contain '$' values. + # WARNING If bash tries to fill them in, they will end up blank. + # https:///oauth2/auth + # https:///oauth2/start?rd + # WARNING This disables the authentication, leaving the protected resource exposed. 
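+    #
+    # In the heredoc below the '$' characters are escaped ('\$host', '\$escaped_request_uri')
+    # so that bash expands ${deployauthpath:?} but leaves the nginx variables intact
+    # for the ingress controller to resolve at request time.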
+ # + + cat > /tmp/${deployname:?}-oauth-ingress.yaml << EOF +--- +apiVersion: networking.k8s.io/v1beta1 +kind: Ingress +metadata: + annotations: + nginx.ingress.kubernetes.io/auth-url: "https://\$host/${deployauthpath:?}/auth" + nginx.ingress.kubernetes.io/auth-signin: "https://\$host/${deployauthpath:?}/start?rd=\$escaped_request_uri" + nginx.ingress.kubernetes.io/auth-response-headers: "x-auth-request-user, x-auth-request-email" + name: ${deployname:?}-oauth-protected + namespace: default +spec: + rules: + - host: ${deployhostname:?} + http: + paths: + - path: / + backend: + serviceName: http-svc + servicePort: 80 + tls: + - hosts: + - ${deployhostname:?} + secretName: ${deployname:?}-tls-secret + +--- +apiVersion: networking.k8s.io/v1beta1 +kind: Ingress +metadata: + name: ${deployname:?}-oauth-protector + namespace: default +spec: + rules: + - host: ${deployhostname:?} + http: + paths: + - path: /${deployauthpath:?} + backend: + serviceName: ${deployname:?}-oauth-proxy + servicePort: 4180 + + tls: + - hosts: + - ${deployhostname:?} + secretName: ${deployname:?}-tls-secret +EOF + + +# ----------------------------------------------------- +# Deploy the OAuth Ingress connectors. +#[user@kubernator] + + kubectl apply \ + --filename /tmp/${deployname:?}-oauth-ingress.yaml + + > ingress.networking.k8s.io/google-oauth-protected created + > ingress.networking.k8s.io/google-oauth-protector created + + +# ----------------------------------------------------- +# ----------------------------------------------------- +# Test the deployment. +#[user@desktop] + + firefox "http://vernon.metagrid.xyz/frog" & + + + > + > Hostname: http-svc-66b7b8b4c6-9dgxg + > + > Pod Information: + > node name: aglais-20210127-cluster-bq7hhlqwjr57-node-3 + > pod name: http-svc-66b7b8b4c6-9dgxg + > pod namespace: default + > pod IP: 10.100.1.14 + > + > Server values: + > server_version=nginx: 1.12.2 - lua: 10010 + > + > Request Information: + > client_address=10.100.3.3 + > method=GET + > real path=/frog + > query= + > request_version=1.1 + > request_scheme=http + > request_uri=http://vernon.metagrid.xyz:8080/frog + > + > Request Headers: + > accept=text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8 + > accept-encoding=gzip, deflate, br + > accept-language=en-GB,en;q=0.5 + > cookie=_oauth2_proxy=NjUekxcc........oHvV9yC8= + > dnt=1 + > host=vernon.metagrid.xyz + > referer=https://accounts.google.com/o/oauth2/auth/oauthchooseaccount.... + > upgrade-insecure-requests=1 + > user-agent=Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:84.0) Gecko/20100101 Firefox/84.0 + > x-auth-request-email=........ + > x-auth-request-user=........ + > x-forwarded-for=10.100.4.0 + > x-forwarded-host=vernon.metagrid.xyz + > x-forwarded-port=443 + > x-forwarded-proto=https + > x-real-ip=10.100.4.0 + > x-request-id=6c52........95a1 + > x-scheme=https + > + > Request Body: + > -no body in request- + > + + + + + diff --git a/notes/zrq/20210127-03-iris-oauth-proxy.txt b/notes/zrq/20210127-03-iris-oauth-proxy.txt new file mode 100644 index 00000000..a28e13bb --- /dev/null +++ b/notes/zrq/20210127-03-iris-oauth-proxy.txt @@ -0,0 +1,428 @@ +# +# +# +# Copyright (c) 2021, ROE (http://www.roe.ac.uk/) +# +# This information is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
+# +# This information is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +# +# +#zrq-notes-time +#zrq-notes-indent +#zrq-notes-crypto +#zrq-notes-ansible +#zrq-notes-osformat +#zrq-notes-zeppelin +# + + Target: + + Install and test OAuthProxy with IRIS IAM. + + Results: + + Success :-) + + Source: + + Automated K8s deployment. + notes/zrq/20210127-01-kubernetes-deploy.txt + + OAuthProxy deployment + notes/zrq/20210127-02-oauth-proxy-google.txt + + +# ----------------------------------------------------- +# Configure our OAuth settings. +#[user@kubernator] + + deployname=iris + + deployhostname=claire.metagrid.xyz + deployauthpath=pidjert + deploycallback=https://${deployhostname:?}/${deployauthpath:?}/callback + + +# ----------------------------------------------------- +# Create our SSL keys and store them in a Kubernetes secret. +# https://github.com/kubernetes/ingress-nginx/blob/master/docs/examples/PREREQUISITES.md#tls-certificates +#[user@kubernator] + + dnf install -y openssl + + openssl req \ + -x509 \ + -sha256 \ + -nodes \ + -days 365 \ + -newkey rsa:2048 \ + -keyout /tmp/tls.key \ + -out /tmp/tls.crt \ + -subj "/CN=${deployhostname:?}/O=Aglais" + + > Generating a RSA private key + > .........+++++ + > .............................+++++ + > writing new private key to '/tmp/tls.key' + + + kubectl create secret \ + tls \ + ${deployname:?}-tls-secret \ + --key /tmp/tls.key \ + --cert /tmp/tls.crt + + > secret/iris-tls-secret created + + +# ----------------------------------------------------- +# Deploy a TLS test Ingress +# https://github.com/kubernetes/ingress-nginx/tree/master/docs/examples/tls-termination#deployment +# https://github.com/kubernetes/ingress-nginx/tree/master/docs/examples/tls-termination#validation +#[user@kubernator] + + cat << EOF > /tmp/${deployname:?}-tls-test.yaml +apiVersion: networking.k8s.io/v1beta1 +kind: Ingress +metadata: + name: ${deployname:?}-tls-test +spec: + tls: + - hosts: + - ${deployhostname} + secretName: ${deployname:?}-tls-secret + rules: + - host: ${deployhostname} + http: + paths: + - path: /tls-test + backend: + serviceName: http-svc + servicePort: 80 +EOF + + kubectl apply \ + --filename /tmp/${deployname:?}-tls-test.yaml + + > ingress.networking.k8s.io/iris-tls-test created + + + kubectl describe \ + Ingress ${deployname:?}-tls-test + + > Name: iris-tls-test + > Namespace: default + > Address: 128.232.227.227 + > Default backend: default-http-backend:80 () + > TLS: + > iris-tls-secret terminates claire.metagrid.xyz + > Rules: + > Host Path Backends + > ---- ---- -------- + > claire.metagrid.xyz + > /tls-test http-svc:80 (10.100.1.14:8080) + > Annotations: Events: + > Type Reason Age From Message + > ---- ------ ---- ---- ------- + > Normal CREATE 63s nginx-ingress-controller Ingress default/iris-tls-test + > Normal UPDATE 11s nginx-ingress-controller Ingress default/iris-tls-test + + + ingressip=$( + kubectl get \ + Ingress ${deployname:?}-tls-test \ + --output json \ + | jq -r '.status.loadBalancer.ingress[0].ip' + ) + + echo "Ingress [${ingressip:?}]" + + > Ingress [128.232.227.227] + + +# ----------------------------------------------------- +# ----------------------------------------------------- +# Update our internal and external DNS 
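+# The claire.metagrid.xyz CNAME below needs to resolve to the ingress address
+# before the TLS tests further down will reach the cluster.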
+#[user@dns-serer] + + aglais-001.metagrid.xyz A 128.232.227.236 + + claire.metagrid.xyz CNAME aglais-001.metagrid.xyz. + + +# ----------------------------------------------------- +# ----------------------------------------------------- +# Test HTTP to HTTPS redirect and our TLS keys. +# https://github.com/kubernetes/ingress-nginx/tree/master/docs/examples/tls-termination#deployment +# https://github.com/kubernetes/ingress-nginx/tree/master/docs/examples/tls-termination#validation +#[user@kubernator] + + + curl --head "http://${ingressip:?}/tls-test" + + > HTTP/1.1 404 Not Found + > Date: Wed, 27 Jan 2021 17:51:46 GMT + > .... + > .... + + + curl --head "http://${deployhostname:?}/tls-test" + + > HTTP/1.1 308 Permanent Redirect + > Date: Wed, 27 Jan 2021 17:51:55 GMT + > .... + > .... + > Location: https://claire.metagrid.xyz/tls-test + + + curl --head "https://${deployhostname:?}/tls-test" + + > curl: (60) SSL certificate problem: self signed certificate + > More details here: https://curl.haxx.se/docs/sslcerts.html + > .... + > .... + + + curl --insecure --head "https://${deployhostname:?}/tls-test" + + > HTTP/2 200 + > date: Wed, 27 Jan 2021 17:52:24 GMT + > .... + > .... + + +# ----------------------------------------------------- +# Configure our IRIS secrets. +#[user@kubernator] + + dnf install -y python + + OAUTH2_CLIENT_IDENT=$( + secret iris-iam.oauth.client + ) + OAUTH2_CLIENT_SECRET=$( + secret iris-iam.oauth.secret + ) + OAUTH2_COOKIE_SECRET=$( + python -c 'import os,base64; print(base64.b64encode(os.urandom(16)).decode("ascii"))' + ) + + OAUTH2_SERVER_ISSUER=https://iris-iam.stfc.ac.uk/ + OAUTH2_SERVER_AUTH=https://iris-iam.stfc.ac.uk/authorize + OAUTH2_SERVER_TOKEN=https://iris-iam.stfc.ac.uk/token + OAUTH2_SERVER_USER=https://iris-iam.stfc.ac.uk/userinfo + OAUTH2_SERVER_JWKS=https://iris-iam.stfc.ac.uk/.well-known/openid-jwks + +# ----------------------------------------------------- +# Configure our oauth2_proxy Service. +#[user@kubernator] + + cat > /tmp/${deployname:?}-oauth-proxy.yaml << EOF +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + k8s-app: ${deployname:?}-oauth-proxy + name: ${deployname:?}-oauth-proxy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + k8s-app: ${deployname:?}-oauth-proxy + template: + metadata: + labels: + k8s-app: ${deployname:?}-oauth-proxy + spec: + containers: + - name: ${deployname:?}-oauth-proxy + image: quay.io/oauth2-proxy/oauth2-proxy:latest + imagePullPolicy: Always + ports: + - containerPort: 4180 + protocol: TCP + args: + - --provider=oidc + - --email-domain=* + - --http-address=0.0.0.0:4180 + - --proxy-prefix=/${deployauthpath:?} + - --redirect-url=${deploycallback:?} + - --set-xauthrequest=true + - --client-id=${OAUTH2_CLIENT_IDENT:?} + - --client-secret=${OAUTH2_CLIENT_SECRET:?} + - --oidc-issuer-url=${OAUTH2_SERVER_ISSUER:?} + - --login-url=${OAUTH2_SERVER_AUTH:?} + - --redeem-url=${OAUTH2_SERVER_TOKEN:?} + - --profile-url=${OAUTH2_SERVER_USER:?} + - --oidc-jwks-url=${OAUTH2_SERVER_JWKS:?} + - --cookie-secret=${OAUTH2_COOKIE_SECRET:?} + - --ssl-insecure-skip-verify + - --ssl-upstream-insecure-skip-verify + +--- +apiVersion: v1 +kind: Service +metadata: + labels: + k8s-app: ${deployname:?}-oauth-proxy + name: ${deployname:?}-oauth-proxy + namespace: default +spec: + ports: + - name: http + port: 4180 + protocol: TCP + targetPort: 4180 + selector: + k8s-app: ${deployname:?}-oauth-proxy +EOF + + +# ----------------------------------------------------- +# Deploy the OAuth proxy. 
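+# (once created below, the rollout can be followed with e.g.
+#  'kubectl rollout status deployment/iris-oauth-proxy --namespace default')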
+#[user@kubernator] + + kubectl create \ + --filename /tmp/${deployname:?}-oauth-proxy.yaml + + > deployment.apps/iris-oauth-proxy created + > service/iris-oauth-proxy created + + +# ----------------------------------------------------- +# Configure the oauth_proxy Ingress. +#[user@kubernator] + + # + # WARNING The auth-url and auth-signin URLs contain '$' values. + # WARNING If bash tries to fill them in, they will end up blank. + # https:///oauth2/auth + # https:///oauth2/start?rd + # WARNING This disables the authentication, leaving the protected resource exposed. + # + + cat > /tmp/${deployname:?}-oauth-ingress.yaml << EOF +--- +apiVersion: networking.k8s.io/v1beta1 +kind: Ingress +metadata: + annotations: + nginx.ingress.kubernetes.io/auth-url: "https://\$host/${deployauthpath:?}/auth" + nginx.ingress.kubernetes.io/auth-signin: "https://\$host/${deployauthpath:?}/start?rd=\$escaped_request_uri" + nginx.ingress.kubernetes.io/auth-response-headers: "x-auth-request-user, x-auth-request-email" + name: ${deployname:?}-oauth-protected + namespace: default +spec: + rules: + - host: ${deployhostname:?} + http: + paths: + - path: / + backend: + serviceName: http-svc + servicePort: 80 + tls: + - hosts: + - ${deployhostname:?} + secretName: ${deployname:?}-tls-secret + +--- +apiVersion: networking.k8s.io/v1beta1 +kind: Ingress +metadata: + name: ${deployname:?}-oauth-protector + namespace: default +spec: + rules: + - host: ${deployhostname:?} + http: + paths: + - path: /${deployauthpath:?} + backend: + serviceName: ${deployname:?}-oauth-proxy + servicePort: 4180 + tls: + - hosts: + - ${deployhostname:?} + secretName: ${deployname:?}-tls-secret +EOF + + +# ----------------------------------------------------- +# Deploy the OAuth Ingress connectors. +#[user@kubernator] + + kubectl apply \ + --filename /tmp/${deployname:?}-oauth-ingress.yaml + + > ingress.networking.k8s.io/iris-oauth-protected created + > ingress.networking.k8s.io/iris-oauth-protector created + + +# ----------------------------------------------------- +# ----------------------------------------------------- +# Test the deployment. +#[user@desktop] + + firefox "http://claire.metagrid.xyz/frog" & + + > + > Hostname: http-svc-66b7b8b4c6-9dgxg + > + > Pod Information: + > node name: aglais-20210127-cluster-bq7hhlqwjr57-node-3 + > pod name: http-svc-66b7b8b4c6-9dgxg + > pod namespace: default + > pod IP: 10.100.1.14 + > + > Server values: + > server_version=nginx: 1.12.2 - lua: 10010 + > + > Request Information: + > client_address=10.100.3.3 + > method=GET + > real path=/frog + > query= + > request_version=1.1 + > request_scheme=http + > request_uri=http://claire.metagrid.xyz:8080/frog + > + > Request Headers: + > accept=text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8 + > accept-encoding=gzip, deflate, br + > accept-language=en-GB,en;q=0.5 + > cookie=_oauth2_proxy=ivsL........PShs= + > dnt=1 + > host=claire.metagrid.xyz + > upgrade-insecure-requests=1 + > user-agent=Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:84.0) Gecko/20100101 Firefox/84.0 + > x-auth-request-email=........ + > x-auth-request-user=........ + > x-forwarded-for=10.100.4.0 + > x-forwarded-host=claire.metagrid.xyz + > x-forwarded-port=443 + > x-forwarded-proto=https + > x-real-ip=10.100.4.0 + > x-request-id=f246........fd8d + > x-scheme=https + > + > Request Body: + > -no body in request- + > + + + +
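+
+# -----------------------------------------------------
+# Check the OAuth proxy directly.
+# (untested sketch, not part of the run above; names follow the 'iris' settings)
+#[user@kubernator]
+
+    kubectl logs \
+        --namespace default \
+        "deployment/${deployname:?}-oauth-proxy" \
+        --tail 20
+
+    # An unauthenticated request to the auth endpoint behind the 'protector' Ingress
+    # should be rejected by oauth2-proxy (HTTP 401).
+
+    curl \
+        --head \
+        --insecure \
+        "https://${deployhostname:?}/${deployauthpath:?}/auth"
+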