new multi-node setup with shared private mount
SumanthRH committed Feb 19, 2024
1 parent 1b81708 commit ee9d583
Showing 2 changed files with 19 additions and 11 deletions.
9 changes: 5 additions & 4 deletions start-cluster.sh
@@ -10,14 +10,14 @@ fi
exec 2>>$HOME/ray-head.log 1>&2

# Datahub exposes limits/requests as floating point, but Ray wants an int
MY_CPU_REQUEST=$(printf "%.0f" "$CPU_GUARANTEE")

ray start --head --port=6380 --num-cpus=8 --dashboard-host=0.0.0.0 --object-manager-port=8076 --node-manager-port=8077 --dashboard-agent-grpc-port=8078 --dashboard-agent-listen-port=52365 --disable-usage-stats --object-store-memory 4294967296 --memory 10737418240
ray start --head --port=6380 --num-cpus=$MY_CPU_REQUEST --dashboard-host=0.0.0.0 --object-manager-port=8076 --node-manager-port=8077 --dashboard-agent-grpc-port=8078 --dashboard-agent-listen-port=52365 --disable-usage-stats --object-store-memory 4294967296

# Ray head node service, allowing worker pods to discover the head node to perform the bidirectional communication.
# More contexts can be found at [the Ports configurations doc](https://docs.ray.io/en/latest/ray-core/configure.html#ports-configurations).
if ! kubectl get svc service-ray-cluster 2>/dev/null > /dev/null; then
kubectl create -f - <<EOM
# Ray head node service, allowing worker pods to discover the head node to perform the bidirectional communication.
# More contexts can be found at [the Ports configurations doc](https://docs.ray.io/en/latest/ray-core/configure.html#ports-configurations).
apiVersion: v1
kind: Service
metadata:
@@ -47,4 +47,5 @@ fi
# Now fire up workers
/opt/ray-support/start-workers.sh "$@"

# Execution shouldn't reach here unless both start-workers scripts are missing; if so, assume that's intentional
exit 0
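
As a quick sanity check once start-cluster.sh has run, the head node, its service, and the log file it writes can be inspected from the Jupyter pod. This is a minimal sketch, assuming kubectl and ray are on the PATH and the names above are unchanged:

kubectl get svc service-ray-cluster              # head-node service created above
ray status --address service-ray-cluster:6380    # head and worker resources
tail -n 20 "$HOME/ray-head.log"                  # stdout/stderr redirected at the top of the script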
21 changes: 14 additions & 7 deletions start-workers.sh
100755 → 100644
@@ -7,15 +7,14 @@ if [ -x "$USERSCRIPT" ]; then
exec "$USERSCRIPT" "$@"
fi

NUM_WORKERS=1
NUM_WORKERS=2

WORKER_CPU_REQUEST=8
WORKER_CPU_LIMIT=8
WORKER_MEM_REQUEST=16384M
WORKER_MEM_LIMIT=16384M
WORKER_CPU_REQUEST=3 # CPUs requested per worker pod
WORKER_CPU_LIMIT=3 # upper bound on CPUs per worker pod
WORKER_MEM_REQUEST=8192M
WORKER_MEM_LIMIT=8192M
WORKER_GPU_COUNT=0

# Jupyter pod's stop hook _should_ delete the deployment, but check again just in case
if kubectl get deployment deployment-ray-worker 2>/dev/null > /dev/null; then
kubectl delete deployment deployment-ray-worker
fi
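
Each worker is now sized at 3 CPUs and 8192M instead of 8 CPUs and 16384M, and the worker's ray start command further down pins Ray's own accounting in bytes. A minimal sketch of the arithmetic behind those byte values, assuming a Bash-compatible shell:

echo $(( 4 * 1024**3 ))   # 4294967296 -> --object-store-memory, a 4 GiB plasma store
echo $(( 7 * 1024**3 ))   # 7516192768 -> --memory, a 7 GiB logical memory resource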
@@ -47,15 +46,19 @@ spec:
- name: dshm
emptyDir:
medium: Memory
- name: home
persistentVolumeClaim:
claimName: home
securityContext:
runAsUser: $(id -u)
containers:
- name: ray-worker
image: "${JUPYTER_IMAGE_SPEC}"
imagePullPolicy: Always
command: ["/bin/bash", "-c", "--"]
args:
- "ray start --num-cpus=${WORKER_CPU_REQUEST} --address=service-ray-cluster:6380 --object-manager-port=8076 --node-manager-port=8077 --dashboard-agent-grpc-port=8078 --dashboard-agent-listen-port=52365 --block"
- "ray start --num-cpus=${WORKER_CPU_REQUEST} --address=service-ray-cluster:6380 --object-manager-port=8076 --node-manager-port=8077 --dashboard-agent-grpc-port=8078 --dashboard-agent-listen-port=52365 --block --object-store-memory 4294967296 --memory 7516192768"
securityContext:
allowPrivilegeEscalation: false
# This volume allocates shared memory for Ray to use for its plasma
@@ -64,6 +67,10 @@ spec:
volumeMounts:
- mountPath: /dev/shm
name: dshm
- mountPath: "/home/${USER}/private"
name: home
env:
# This is used in the ray start command so that Ray can spawn the
# correct number of processes. Omitting this may lead to degraded
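
The new home volume and its mount at /home/${USER}/private are what give every worker pod the same persistent claim the Jupyter pod uses, i.e. the shared private mount named in the commit title. A minimal sketch for checking that the mount really is shared; the grep pattern assumes worker pods inherit the deployment-ray-worker name, so adjust it if your pod names differ:

# From the Jupyter/head pod, drop a marker file into the shared mount
touch "/home/${USER}/private/.ray-mount-check"
# Pick one worker pod spawned by the deployment and look for the same file
WORKER_POD=$(kubectl get pods -o name | grep deployment-ray-worker | head -n 1)
kubectl exec "$WORKER_POD" -- ls "/home/${USER}/private/.ray-mount-check"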
