new multi-node setup with shared private mount
SumanthRH committed Feb 19, 2024
1 parent 1b81708 commit ee9d583
Showing 2 changed files with 19 additions and 11 deletions.
9 changes: 5 additions & 4 deletions start-cluster.sh
@@ -10,14 +10,14 @@ fi
exec 2>>$HOME/ray-head.log 1>&2

# Datahub exposes limits/requests as floating point, but Ray wants an int
MY_CPU_REQUEST=$(printf "%.0f" "$CPU_GUARANTEE")

ray start --head --port=6380 --num-cpus=8 --dashboard-host=0.0.0.0 --object-manager-port=8076 --node-manager-port=8077 --dashboard-agent-grpc-port=8078 --dashboard-agent-listen-port=52365 --disable-usage-stats --object-store-memory 4294967296 --memory 10737418240
ray start --head --port=6380 --num-cpus=$MY_CPU_REQUEST --dashboard-host=0.0.0.0 --object-manager-port=8076 --node-manager-port=8077 --dashboard-agent-grpc-port=8078 --dashboard-agent-listen-port=52365 --disable-usage-stats --object-store-memory 4294967296

# Ray head node service, allowing worker pods to discover the head node to perform the bidirectional communication.
# More contexts can be found at [the Ports configurations doc](https://docs.ray.io/en/latest/ray-core/configure.html#ports-configurations).
if ! kubectl get svc service-ray-cluster 2>/dev/null > /dev/null; then
kubectl create -f - <<EOM
# Ray head node service, allowing worker pods to discover the head node to perform the bidirectional communication.
# More contexts can be found at [the Ports configurations doc](https://docs.ray.io/en/latest/ray-core/configure.html#ports-configurations).
apiVersion: v1
kind: Service
metadata:
@@ -47,4 +47,5 @@ fi
# Now fire up workers
/opt/ray-support/start-workers.sh "$@"

# Execution shouldn't reach here unless both start-workers scripts are missing; if so, assume that's intentional
exit 0
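
As a quick sanity check once start-cluster.sh has run, the head node, its service, and the log file it writes can be inspected from the Jupyter pod. This is a minimal sketch, assuming kubectl and ray are on the PATH and the names above are unchanged:

kubectl get svc service-ray-cluster              # head-node service created above
ray status --address service-ray-cluster:6380    # head and worker resources
tail -n 20 "$HOME/ray-head.log"                  # stdout/stderr redirected at the top of the script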
21 changes: 14 additions & 7 deletions start-workers.sh
100755 → 100644
@@ -7,15 +7,14 @@ if [ -x "$USERSCRIPT" ]; then
exec "$USERSCRIPT" "$@"
fi

NUM_WORKERS=1
NUM_WORKERS=2

WORKER_CPU_REQUEST=8
WORKER_CPU_LIMIT=8
WORKER_MEM_REQUEST=16384M
WORKER_MEM_LIMIT=16384M
WORKER_CPU_REQUEST=3 # CPUs requested per worker pod
WORKER_CPU_LIMIT=3 # upper bound on CPUs per worker pod
WORKER_MEM_REQUEST=8192M
WORKER_MEM_LIMIT=8192M
WORKER_GPU_COUNT=0

# Jupyter pod's stop hook _should_ delete the deployment, but check again just in case
if kubectl get deployment deployment-ray-worker 2>/dev/null > /dev/null; then
kubectl delete deployment deployment-ray-worker
fi
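
Each worker is now sized at 3 CPUs and 8192M instead of 8 CPUs and 16384M, and the worker's ray start command further down pins Ray's own accounting in bytes. A minimal sketch of the arithmetic behind those byte values, assuming a Bash-compatible shell:

echo $(( 4 * 1024**3 ))   # 4294967296 -> --object-store-memory, a 4 GiB plasma store
echo $(( 7 * 1024**3 ))   # 7516192768 -> --memory, a 7 GiB logical memory resource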
@@ -47,15 +46,19 @@ spec:
- name: dshm
emptyDir:
medium: Memory
- name: home
persistentVolumeClaim:
claimName: home
securityContext:
runAsUser: $(id -u)
containers:
- name: ray-worker
image: "${JUPYTER_IMAGE_SPEC}"
imagePullPolicy: Always
command: ["/bin/bash", "-c", "--"]
args:
- "ray start --num-cpus=${WORKER_CPU_REQUEST} --address=service-ray-cluster:6380 --object-manager-port=8076 --node-manager-port=8077 --dashboard-agent-grpc-port=8078 --dashboard-agent-listen-port=52365 --block"
- "ray start --num-cpus=${WORKER_CPU_REQUEST} --address=service-ray-cluster:6380 --object-manager-port=8076 --node-manager-port=8077 --dashboard-agent-grpc-port=8078 --dashboard-agent-listen-port=52365 --block --object-store-memory 4294967296 --memory 7516192768"
securityContext:
allowPrivilegeEscalation: false
# This volume allocates shared memory for Ray to use for its plasma
@@ -64,6 +67,10 @@ spec:
volumeMounts:
- mountPath: /dev/shm
name: dshm
- mountPath: "/home/${USER}/private"
name: home
env:
# This is used in the ray start command so that Ray can spawn the
# correct number of processes. Omitting this may lead to degraded
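
The new home volume and its mount at /home/${USER}/private are what give every worker pod the same persistent claim the Jupyter pod uses, i.e. the shared private mount named in the commit title. A minimal sketch for checking that the mount really is shared; the grep pattern assumes worker pods inherit the deployment-ray-worker name, so adjust it if your pod names differ:

# From the Jupyter/head pod, drop a marker file into the shared mount
touch "/home/${USER}/private/.ray-mount-check"
# Pick one worker pod spawned by the deployment and look for the same file
WORKER_POD=$(kubectl get pods -o name | grep deployment-ray-worker | head -n 1)
kubectl exec "$WORKER_POD" -- ls "/home/${USER}/private/.ray-mount-check"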
