-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Updating start-workers to support /public mounts, launch.sh from dsmlp-login
- Loading branch information
Showing 1 changed file with 114 additions and 88 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
#!/bin/bash
#
# start-workers.sh -- (re)start the Ray worker pods for a per-user Ray cluster.
#
# Flow:
#   1. If the user has an executable ~/start-workers.sh, exec that instead.
#   2. Delete any stale "deployment-ray-worker" Deployment.
#   3. Build the worker Deployment as JSON (so jq can patch it), graft in the
#      course-workspace /public volume copied from the current (head) pod,
#      and create it with kubectl.

# Modify this script by copying it to your home directory
# *** be sure to remove the "exec" line below from your copy! ***
USERSCRIPT=$HOME/start-workers.sh
if [ -x "$USERSCRIPT" ]; then
  exec "$USERSCRIPT" "$@"
fi

# Number of Ray worker replicas to start.
NUM_WORKERS=1

# Per-worker resource requests (scheduling floor) and limits (hard cap).
WORKER_CPU_REQUEST=1
WORKER_CPU_LIMIT=2
WORKER_MEM_REQUEST=2048M
WORKER_MEM_LIMIT=2048M
WORKER_GPU_COUNT=0   # NOTE(review): not referenced below -- confirm intent

# Worker image: prefer the spec JupyterHub launched this pod with, fall back
# to DOCKER_IMAGE.
# NOTE(review): the Deployment below pins "ghcr.io/ucsd-ets/ray-notebook:main"
# rather than using $IMAGE -- confirm whether that pin is intentional.
IMAGE=${JUPYTER_IMAGE_SPEC:-${DOCKER_IMAGE}}

# Tear down any previous worker deployment before recreating it.
if kubectl get deployment deployment-ray-worker 2>/dev/null > /dev/null; then
  kubectl delete deployment deployment-ray-worker
fi

# The Deployment template is held as JSON (not YAML) so jq can patch extra
# volumes/mounts into it below before it is sent to kubectl.
# read -d '' exits non-zero at EOF; that is expected and harmless here.
read -d '' DEPLOYMENT <<EOM
{
  "apiVersion": "apps/v1",
  "kind": "Deployment",
  "metadata": {
    "labels": {
      "app": "ray-cluster-worker"
    },
    "name": "deployment-ray-worker"
  },
  "spec": {
    "progressDeadlineSeconds": 600,
    "replicas": ${NUM_WORKERS},
    "selector": {
      "matchLabels": {
        "app": "ray-cluster-worker",
        "component": "ray-worker",
        "type": "ray"
      }
    },
    "template": {
      "metadata": {
        "creationTimestamp": null,
        "labels": {
          "app": "ray-cluster-worker",
          "component": "ray-worker",
          "type": "ray"
        }
      },
      "spec": {
        "containers": [
          {
            "args": [
              "ray start --num-cpus=1 --address=service-ray-cluster:6380 --object-manager-port=8076 --node-manager-port=8077 --dashboard-agent-grpc-port=8078 --dashboard-agent-listen-port=52365 --block"
            ],
            "command": [
              "/bin/bash",
              "-c",
              "--"
            ],
            "env": [
              {
                "name": "MY_CPU_REQUEST",
                "valueFrom": {
                  "resourceFieldRef": {
                    "divisor": "0",
                    "resource": "requests.cpu"
                  }
                }
              }
            ],
            "image": "ghcr.io/ucsd-ets/ray-notebook:main",
            "imagePullPolicy": "Always",
            "name": "ray-worker",
            "resources": {
              "limits": {
                "cpu": "${WORKER_CPU_LIMIT}",
                "memory": "${WORKER_MEM_LIMIT}"
              },
              "requests": {
                "cpu": "${WORKER_CPU_REQUEST}",
                "memory": "${WORKER_MEM_REQUEST}"
              }
            },
            "securityContext": {
              "allowPrivilegeEscalation": false
            },
            "terminationMessagePath": "/dev/termination-log",
            "terminationMessagePolicy": "File",
            "volumeMounts": [
              {
                "mountPath": "/dev/shm",
                "name": "dshm"
              }
            ]
          }
        ],
        "terminationGracePeriodSeconds": 30,
        "volumes": [
          {
            "emptyDir": {
              "medium": "Memory"
            },
            "name": "dshm"
          }
        ]
      }
    }
  }
}
EOM
# NOTE(review): the ray start line hard-codes --num-cpus=1 instead of
# \${WORKER_CPU_REQUEST} -- confirm whether it should track the request var.
# The /dev/shm emptyDir gives Ray shared memory for its plasma object store;
# without it Ray falls back to /tmp, which is slow.

# Copy the course-workspace volume definition from this (head) pod so the
# workers can mount the same storage.
VOL=$( kubectl get pod ${HOSTNAME} -o json | jq '.spec.volumes[] | select(.name=="course-workspace")' )

# Mount the "public" subPath of that volume at /public in each worker.
# Quoted delimiter: no expansion is wanted inside this literal JSON.
read -d '' VOLMNT <<"EOM"
{
  "mountPath": "/public",
  "name": "course-workspace",
  "subPath": "public"
}
EOM

# Patch the volume and its mount into the Deployment, then create it.
DEPLOYMENT=$( echo "$DEPLOYMENT" | jq --argjson v "$VOL" '.spec.template.spec.volumes += [ $v ]')
DEPLOYMENT=$( echo "$DEPLOYMENT" | jq --argjson v "$VOLMNT" '.spec.template.spec.containers[0].volumeMounts += [ $v ]')

echo "$DEPLOYMENT" | kubectl create -f -