
Commit

Updating start-workers to support /public mounts, launch-sh from dsmlp-login
agt committed Feb 23, 2024
1 parent ee9d583 commit ead67e1
Showing 1 changed file with 114 additions and 88 deletions.
202 changes: 114 additions & 88 deletions start-workers.sh
@@ -1,97 +1,123 @@
#!/bin/bash

# Modify this script by copying it to your home directory
# *** be sure to remove the "exec" line below from your copy! ***
USERSCRIPT=$HOME/start-workers.sh
if [ -x "$USERSCRIPT" ]; then
exec "$USERSCRIPT" "$@"
fi
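# For example (illustrative only; the source path depends on where this script
# is installed in your image):
#   cp /path/to/start-workers.sh "$HOME/start-workers.sh"
#   ${EDITOR:-nano} "$HOME/start-workers.sh"   # delete the exec "$USERSCRIPT" line from the copy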

NUM_WORKERS=2
NUM_WORKERS=1

WORKER_CPU_REQUEST=3 #set
WORKER_CPU_LIMIT=3 #upper bound
WORKER_MEM_REQUEST=8192M
WORKER_MEM_LIMIT=8192M
WORKER_CPU_REQUEST=1
WORKER_CPU_LIMIT=2
WORKER_MEM_REQUEST=2048M
WORKER_MEM_LIMIT=2048M
WORKER_GPU_COUNT=0

IMAGE=${JUPYTER_IMAGE_SPEC:-${DOCKER_IMAGE}}

if kubectl get deployment deployment-ray-worker 2>/dev/null > /dev/null; then
kubectl delete -f deployment deployment-ray-worker
kubectl delete deployment deployment-ray-worker
fi

kubectl create -f - <<EOM
apiVersion: apps/v1
kind: Deployment
metadata:
  name: deployment-ray-worker
  labels:
    app: ray-cluster-worker
spec:
  # Change this to scale the number of worker nodes started in the Ray cluster.
  replicas: ${NUM_WORKERS}
  selector:
    matchLabels:
      component: ray-worker
      type: ray
      app: ray-cluster-worker
  template:
    metadata:
      labels:
        component: ray-worker
        type: ray
        app: ray-cluster-worker
    spec:
      restartPolicy: Always
      volumes:
        - name: dshm
          emptyDir:
            medium: Memory
        - name: home
          persistentVolumeClaim:
            claimName: home
      securityContext:
        runAsUser: $(id -u)
      containers:
        - name: ray-worker
          image: "${JUPYTER_IMAGE_SPEC}"
          imagePullPolicy: Always
          command: ["/bin/bash", "-c", "--"]
          args:
            - "ray start --num-cpus=${WORKER_CPU_REQUEST} --address=service-ray-cluster:6380 --object-manager-port=8076 --node-manager-port=8077 --dashboard-agent-grpc-port=8078 --dashboard-agent-listen-port=52365 --block --object-store-memory 4294967296 --memory 7516192768"
          securityContext:
            allowPrivilegeEscalation: false
          # This volume allocates shared memory for Ray to use for its plasma
          # object store. If you do not provide this, Ray will fall back to
          # /tmp, which causes slowdowns if it's not a shared memory volume.
          volumeMounts:
            - mountPath: /dev/shm
              name: dshm
            - mountPath: "/home/${USER}/private"
              name: home
          env:
            # This is used in the ray start command so that Ray can spawn the
            # correct number of processes. Omitting this may lead to degraded
            # performance.
            - name: MY_CPU_REQUEST
              valueFrom:
                resourceFieldRef:
                  resource: requests.cpu
          # The resource requests and limits in this config are too small for production!
          # It is better to use a few large Ray pods than many small ones.
          # For production, it is ideal to size each Ray pod to take up the
          # entire Kubernetes node on which it is scheduled.
          resources:
            limits:
              cpu: "${WORKER_CPU_LIMIT}"
              memory: "${WORKER_MEM_LIMIT}"
            # For production use-cases, we recommend specifying integer CPU requests and limits.
            # We also recommend setting requests equal to limits for both CPU and memory.
            # For this example, we use a 500m CPU request to accommodate resource-constrained local
            # Kubernetes testing environments such as Kind and minikube.
            requests:
              cpu: "${WORKER_CPU_REQUEST}"
              memory: "${WORKER_MEM_REQUEST}"
EOM

read -d '' DEPLOYMENT <<EOM
{
  "apiVersion": "apps/v1",
  "kind": "Deployment",
  "metadata": {
    "labels": {
      "app": "ray-cluster-worker"
    },
    "name": "deployment-ray-worker"
  },
  "spec": {
    "progressDeadlineSeconds": 600,
    "replicas": ${NUM_WORKERS},
    "selector": {
      "matchLabels": {
        "app": "ray-cluster-worker",
        "component": "ray-worker",
        "type": "ray"
      }
    },
    "template": {
      "metadata": {
        "creationTimestamp": null,
        "labels": {
          "app": "ray-cluster-worker",
          "component": "ray-worker",
          "type": "ray"
        }
      },
      "spec": {
        "containers": [
          {
            "args": [
              "ray start --num-cpus=1 --address=service-ray-cluster:6380 --object-manager-port=8076 --node-manager-port=8077 --dashboard-agent-grpc-port=8078 --dashboard-agent-listen-port=52365 --block"
            ],
            "command": [
              "/bin/bash",
              "-c",
              "--"
            ],
            "env": [
              {
                "name": "MY_CPU_REQUEST",
                "valueFrom": {
                  "resourceFieldRef": {
                    "divisor": "0",
                    "resource": "requests.cpu"
                  }
                }
              }
            ],
            "image": "ghcr.io/ucsd-ets/ray-notebook:main",
            "imagePullPolicy": "Always",
            "name": "ray-worker",
            "resources": {
              "limits": {
                "cpu": "${WORKER_CPU_LIMIT}",
                "memory": "${WORKER_MEM_LIMIT}"
              },
              "requests": {
                "cpu": "${WORKER_CPU_REQUEST}",
                "memory": "${WORKER_MEM_REQUEST}"
              }
            },
            "securityContext": {
              "allowPrivilegeEscalation": false
            },
            "terminationMessagePath": "/dev/termination-log",
            "terminationMessagePolicy": "File",
            "volumeMounts": [
              {
                "mountPath": "/dev/shm",
                "name": "dshm"
              }
            ]
          }
        ],
        "terminationGracePeriodSeconds": 30,
        "volumes": [
          {
            "emptyDir": {
              "medium": "Memory"
            },
            "name": "dshm"
          }
        ]
      }
    }
  }
}
EOM

# Copy the "course-workspace" volume definition from this notebook pod so the workers can mount the same share.
VOL=$( kubectl get pod ${HOSTNAME} -o json | jq '.spec.volumes[] | select(.name=="course-workspace")' )

# Mount the share's "public" subdirectory at /public inside each worker container.
read -d '' VOLMNT <<"EOM"
{
  "mountPath": "/public",
  "name": "course-workspace",
  "subPath": "public"
}
EOM

# Append the extra volume and volumeMount to the worker Deployment spec before creating it.
DEPLOYMENT=$( echo "$DEPLOYMENT" | jq --argjson v "$VOL" '.spec.template.spec.volumes += [ $v ]')
DEPLOYMENT=$( echo "$DEPLOYMENT" | jq --argjson v "$VOLMNT" '.spec.template.spec.containers[0].volumeMounts += [ $v ]')

echo "$DEPLOYMENT" | kubectl create -f -
