-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Updating start-workers to support /public mounts, launch.sh from dsmlp-login
- Loading branch information
Showing 1 changed file with 114 additions and 88 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
#!/bin/bash
#
# start-workers.sh -- (re)start the Ray worker pods for a per-user Ray cluster.
#
# Flow:
#   1. If the user has an executable ~/start-workers.sh, exec that instead.
#   2. Delete any stale "deployment-ray-worker" Deployment.
#   3. Build the worker Deployment as JSON (so jq can patch it), graft in the
#      course-workspace /public volume copied from the current (head) pod,
#      and create it with kubectl.

# Modify this script by copying it to your home directory
# *** be sure to remove the "exec" line below from your copy! ***
USERSCRIPT=$HOME/start-workers.sh
if [ -x "$USERSCRIPT" ]; then
  exec "$USERSCRIPT" "$@"
fi

# Number of Ray worker replicas to start.
NUM_WORKERS=1

# Per-worker resource requests (scheduling floor) and limits (hard cap).
WORKER_CPU_REQUEST=1
WORKER_CPU_LIMIT=2
WORKER_MEM_REQUEST=2048M
WORKER_MEM_LIMIT=2048M
WORKER_GPU_COUNT=0   # NOTE(review): not referenced below -- confirm intent

# Worker image: prefer the spec JupyterHub launched this pod with, fall back
# to DOCKER_IMAGE.
# NOTE(review): the Deployment below pins "ghcr.io/ucsd-ets/ray-notebook:main"
# rather than using $IMAGE -- confirm whether that pin is intentional.
IMAGE=${JUPYTER_IMAGE_SPEC:-${DOCKER_IMAGE}}

# Tear down any previous worker deployment before recreating it.
if kubectl get deployment deployment-ray-worker 2>/dev/null > /dev/null; then
  kubectl delete deployment deployment-ray-worker
fi

# The Deployment template is held as JSON (not YAML) so jq can patch extra
# volumes/mounts into it below before it is sent to kubectl.
# read -d '' exits non-zero at EOF; that is expected and harmless here.
read -d '' DEPLOYMENT <<EOM
{
  "apiVersion": "apps/v1",
  "kind": "Deployment",
  "metadata": {
    "labels": {
      "app": "ray-cluster-worker"
    },
    "name": "deployment-ray-worker"
  },
  "spec": {
    "progressDeadlineSeconds": 600,
    "replicas": ${NUM_WORKERS},
    "selector": {
      "matchLabels": {
        "app": "ray-cluster-worker",
        "component": "ray-worker",
        "type": "ray"
      }
    },
    "template": {
      "metadata": {
        "creationTimestamp": null,
        "labels": {
          "app": "ray-cluster-worker",
          "component": "ray-worker",
          "type": "ray"
        }
      },
      "spec": {
        "containers": [
          {
            "args": [
              "ray start --num-cpus=1 --address=service-ray-cluster:6380 --object-manager-port=8076 --node-manager-port=8077 --dashboard-agent-grpc-port=8078 --dashboard-agent-listen-port=52365 --block"
            ],
            "command": [
              "/bin/bash",
              "-c",
              "--"
            ],
            "env": [
              {
                "name": "MY_CPU_REQUEST",
                "valueFrom": {
                  "resourceFieldRef": {
                    "divisor": "0",
                    "resource": "requests.cpu"
                  }
                }
              }
            ],
            "image": "ghcr.io/ucsd-ets/ray-notebook:main",
            "imagePullPolicy": "Always",
            "name": "ray-worker",
            "resources": {
              "limits": {
                "cpu": "${WORKER_CPU_LIMIT}",
                "memory": "${WORKER_MEM_LIMIT}"
              },
              "requests": {
                "cpu": "${WORKER_CPU_REQUEST}",
                "memory": "${WORKER_MEM_REQUEST}"
              }
            },
            "securityContext": {
              "allowPrivilegeEscalation": false
            },
            "terminationMessagePath": "/dev/termination-log",
            "terminationMessagePolicy": "File",
            "volumeMounts": [
              {
                "mountPath": "/dev/shm",
                "name": "dshm"
              }
            ]
          }
        ],
        "terminationGracePeriodSeconds": 30,
        "volumes": [
          {
            "emptyDir": {
              "medium": "Memory"
            },
            "name": "dshm"
          }
        ]
      }
    }
  }
}
EOM
# NOTE(review): the ray start line hard-codes --num-cpus=1 instead of
# \${WORKER_CPU_REQUEST} -- confirm whether it should track the request var.
# The /dev/shm emptyDir gives Ray shared memory for its plasma object store;
# without it Ray falls back to /tmp, which is slow.

# Copy the course-workspace volume definition from this (head) pod so the
# workers can mount the same storage.
VOL=$( kubectl get pod ${HOSTNAME} -o json | jq '.spec.volumes[] | select(.name=="course-workspace")' )

# Mount the "public" subPath of that volume at /public in each worker.
# Quoted delimiter: no expansion is wanted inside this literal JSON.
read -d '' VOLMNT <<"EOM"
{
  "mountPath": "/public",
  "name": "course-workspace",
  "subPath": "public"
}
EOM

# Patch the volume and its mount into the Deployment, then create it.
DEPLOYMENT=$( echo "$DEPLOYMENT" | jq --argjson v "$VOL" '.spec.template.spec.volumes += [ $v ]')
DEPLOYMENT=$( echo "$DEPLOYMENT" | jq --argjson v "$VOLMNT" '.spec.template.spec.containers[0].volumeMounts += [ $v ]')

echo "$DEPLOYMENT" | kubectl create -f -