Merge pull request #1 from ucsd-ets/pa3
New PA 3 startup files
SumanthRH authored Mar 5, 2024
2 parents c8fba2d + daa0746 commit 909f10f
Showing 3 changed files with 150 additions and 95 deletions.
16 changes: 3 additions & 13 deletions Dockerfile
@@ -1,5 +1,5 @@

FROM rayproject/ray-ml:latest-gpu
FROM rayproject/ray-ml:2.9.3

USER root

@@ -75,18 +75,8 @@ COPY jupyter_config.py start-workers.sh start-cluster.sh stop-cluster.sh /opt/ra
RUN chmod 0755 /opt/ray-support/*.sh
RUN mkdir -p /usr/local/etc/jupyter && cat /opt/ray-support/jupyter_config.py >> /usr/local/etc/jupyter/jupyter_config.py

# install bazel
RUN apt install apt-transport-https curl gnupg -y && \
curl -fsSL https://bazel.build/bazel-release.pub.gpg | gpg --dearmor >bazel-archive-keyring.gpg && \
mv bazel-archive-keyring.gpg /usr/share/keyrings && \
echo "deb [arch=amd64 signed-by=/usr/share/keyrings/bazel-archive-keyring.gpg] https://storage.googleapis.com/bazel-apt stable jdk1.8" | tee /etc/apt/sources.list.d/bazel.list
RUN apt update && apt install bazel-3.2.0
# downgrade protobuf and numpy for transformers and pygloo
RUN pip3 install protobuf==3.20.0 numpy==1.20.0 pandas==1.4.0
# install pygloo
RUN git clone https://github.com/ray-project/pygloo.git
RUN ln -s "/usr/bin/bazel-3.2.0" "/usr/bin/bazel"
RUN alias bazel="bazel-3.2.0" && cd pygloo && python setup.py install && cd ..
# add gensim: experimental
RUN pip3 install gensim

USER 1000
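
With the base image now pinned to rayproject/ray-ml:2.9.3, a quick sanity check after rebuilding is to confirm the Ray version and the gensim install inside a throwaway container. A minimal sketch, assuming the image is tagged ray-pa3 (an illustrative name, not part of this repo):

    docker build -t ray-pa3 .
    # Print the Ray and gensim versions baked into the image; expect 2.9.3 for Ray.
    docker run --rm ray-pa3 python -c "import ray, gensim; print(ray.__version__, gensim.__version__)"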

12 changes: 10 additions & 2 deletions start-cluster.sh
@@ -9,10 +9,14 @@ fi

exec 2>>$HOME/ray-head.log 1>&2

CPU_GUARANTEE=${CPU_GUARANTEE:-$(( $KUBERNETES_LIMIT_CPU))}
MEM_GUARANTEE=${MEM_GUARANTEE:-$(( $KUBERNETES_LIMIT_MEM ))}
OBJECT_STORE_MEM=$(( $MEM_GUARANTEE / 2))

# Datahub exposes limits/requests as floating point, Ray wants int
MY_CPU_REQUEST=$(printf "%.0f" "$CPU_GUARANTEE")

ray start --head --port=6380 --num-cpus=$MY_CPU_REQUEST --dashboard-host=0.0.0.0 --object-manager-port=8076 --node-manager-port=8077 --dashboard-agent-grpc-port=8078 --dashboard-agent-listen-port=52365 --disable-usage-stats --object-store-memory 4294967296
ray start --head --port=6380 --num-cpus=$MY_CPU_REQUEST --dashboard-host=0.0.0.0 --object-manager-port=8076 --node-manager-port=8077 --dashboard-agent-grpc-port=8078 --dashboard-agent-listen-port=52365 --disable-usage-stats --object-store-memory $OBJECT_STORE_MEM --memory $MEM_GUARANTEE --system-config='{"object_spilling_config":"{\"type\":\"filesystem\",\"params\":{\"directory_path\":\"/tmp/spill\"}}"}'

if ! kubectl get svc service-ray-cluster 2>/dev/null > /dev/null; then
kubectl create -f - <<EOM
@@ -45,7 +49,11 @@ EOM
fi

# Now fire up workers
/opt/ray-support/start-workers.sh "$@"
if [ -x "$HOME/start-workers.sh" ]; then
exec "$HOME/start-workers.sh" "$@"
else
exec /opt/ray-support/start-workers.sh "$@"
fi

# Execution shouldn't reach here unless both start-workers scripts are missing; if so, assume that's intentional
exit 0
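
The rewritten head-node start derives Ray's memory settings from the pod's guarantees instead of hard-coding 4 GiB, and rounds the (possibly fractional) CPU guarantee DataHub exposes to the integer Ray requires. A minimal sketch of that arithmetic with hypothetical values (8 GiB of memory, 3.5 CPUs), not taken from any real pod spec:

    # Hypothetical guarantees, for illustration only
    CPU_GUARANTEE=3.5
    MEM_GUARANTEE=$(( 8 * 1024 * 1024 * 1024 ))        # 8589934592 bytes
    OBJECT_STORE_MEM=$(( MEM_GUARANTEE / 2 ))          # 4294967296 bytes: half the guarantee goes to the object store
    MY_CPU_REQUEST=$(printf "%.0f" "$CPU_GUARANTEE")   # "4": --num-cpus must be an integer
    echo "--num-cpus=$MY_CPU_REQUEST --memory=$MEM_GUARANTEE --object-store-memory=$OBJECT_STORE_MEM"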
217 changes: 137 additions & 80 deletions start-workers.sh
@@ -9,89 +9,146 @@ fi

NUM_WORKERS=2

WORKER_CPU_REQUEST=3 #set
WORKER_CPU_LIMIT=3 #upper bound
WORKER_MEM_REQUEST=8192M
WORKER_MEM_LIMIT=8192M
WORKER_CPU_REQUEST=8
WORKER_CPU_LIMIT=8
WORKER_MEM_REQUEST=16384M
WORKER_MEM_LIMIT=16384M
WORKER_GPU_COUNT=0

IMAGE=${JUPYTER_IMAGE_SPEC:-${DOCKER_IMAGE}}
echo "STARTING WORKERS WITH WORKER_CPU_REQUEST=${WORKER_CPU_REQUEST}"
if kubectl get deployment deployment-ray-worker 2>/dev/null > /dev/null; then
kubectl delete -f deployment deployment-ray-worker
kubectl delete deployment deployment-ray-worker
fi

kubectl create -f - <<EOM
apiVersion: apps/v1
kind: Deployment
metadata:
  name: deployment-ray-worker
  labels:
    app: ray-cluster-worker
spec:
  # Change this to scale the number of worker nodes started in the Ray cluster.
  replicas: ${NUM_WORKERS}
  selector:
    matchLabels:
      component: ray-worker
      type: ray
      app: ray-cluster-worker
  template:
    metadata:
      labels:
        component: ray-worker
        type: ray
        app: ray-cluster-worker
    spec:
      restartPolicy: Always
      volumes:
      - name: dshm
        emptyDir:
          medium: Memory
      - name: home
        persistentVolumeClaim:
          claimName: home
      securityContext:
        runAsUser: $(id -u)
      containers:
      - name: ray-worker
        image: "${JUPYTER_IMAGE_SPEC}"
        imagePullPolicy: Always
        command: ["/bin/bash", "-c", "--"]
        args:
          - "ray start --num-cpus=${WORKER_CPU_REQUEST} --address=service-ray-cluster:6380 --object-manager-port=8076 --node-manager-port=8077 --dashboard-agent-grpc-port=8078 --dashboard-agent-listen-port=52365 --block --object-store-memory 4294967296 --memory 7516192768"
        securityContext:
          allowPrivilegeEscalation: false
        # This volume allocates shared memory for Ray to use for its plasma
        # object store. If you do not provide this, Ray will fall back to
        # /tmp, which causes slowdowns if it's not a shared memory volume.
        volumeMounts:
        - mountPath: /dev/shm
          name: dshm
        - mountPath: "/home/${USER}/private"
          name: home
        env:
        # This is used in the ray start command so that Ray can spawn the
        # correct number of processes. Omitting this may lead to degraded
        # performance.
        - name: MY_CPU_REQUEST
          valueFrom:
            resourceFieldRef:
              resource: requests.cpu
        # The resource requests and limits in this config are too small for production!
        # It is better to use a few large Ray pods than many small ones.
        # For production, it is ideal to size each Ray pod to take up the
        # entire Kubernetes node on which it is scheduled.
        resources:
          limits:
            cpu: "${WORKER_CPU_LIMIT}"
            memory: "${WORKER_MEM_LIMIT}"
          # For production use-cases, we recommend specifying integer CPU requests and limits.
          # We also recommend setting requests equal to limits for both CPU and memory.
          # For this example, we use a 500m CPU request to accommodate resource-constrained local
          # Kubernetes testing environments such as Kind and minikube.
          requests:
            cpu: "${WORKER_CPU_REQUEST}"
            memory: "${WORKER_MEM_REQUEST}"

read -d '' DEPLOYMENT <<EOM
{
"apiVersion": "apps/v1",
"kind": "Deployment",
"metadata": {
"labels": {
"app": "ray-cluster-worker"
},
"name": "deployment-ray-worker"
},
"spec": {
"progressDeadlineSeconds": 600,
"replicas": ${NUM_WORKERS},
"selector": {
"matchLabels": {
"app": "ray-cluster-worker",
"component": "ray-worker",
"type": "ray"
}
},
"template": {
"metadata": {
"creationTimestamp": null,
"labels": {
"app": "ray-cluster-worker",
"component": "ray-worker",
"type": "ray"
}
},
"spec": {
"containers": [
{
"args": [
"ray start --num-cpus=${WORKER_CPU_REQUEST} --address=service-ray-cluster:6380 --object-manager-port=8076 --node-manager-port=8077 --dashboard-agent-grpc-port=8078 --dashboard-agent-listen-port=52365 --block --object-store-memory=7516192768 --memory=17179869184"
],
"command": [
"/bin/bash",
"-c",
"--"
],
"env": [
{
"name": "MY_CPU_REQUEST",
"valueFrom": {
"resourceFieldRef": {
"divisor": "0",
"resource": "requests.cpu"
}
}
}
],
"image": "${IMAGE}",
"imagePullPolicy": "Always",
"name": "ray-worker",
"resources": {
"limits": {
"cpu": "${WORKER_CPU_LIMIT}",
"memory": "${WORKER_MEM_LIMIT}"
},
"requests": {
"cpu": "${WORKER_CPU_REQUEST}",
"memory": "${WORKER_MEM_REQUEST}"
}
},
"securityContext": {
"allowPrivilegeEscalation": false,
"runAsUser": ${UID}
},
"terminationMessagePath": "/dev/termination-log",
"terminationMessagePolicy": "File",
"volumeMounts": [
{
"mountPath": "/dev/shm",
"name": "dshm"
},
{
"mountPath": "/datasets",
"name": "datasets"
},
{
"mountPath": "/home/${USER}/private",
"name": "home"
},
{
"mountPath": "/home/${USER}",
"name": "course-workspace",
"subPath": "home/${USER}"
},
{
"mountPath": "/home/${USER}/public",
"name": "course-workspace",
"subPath": "public"
}
]
}
],
"terminationGracePeriodSeconds": 30,
"volumes": [
{
"emptyDir": {
"medium": "Memory"
},
"name": "dshm"
},
{
"persistentVolumeClaim": {
"claimName": "dsmlp-datasets"
},
"name": "datasets"
},
{
"persistentVolumeClaim": {
"claimName": "home"
},
"name": "home"
}
]
}
}
}
}
EOM

VOL=$( kubectl get pod ${HOSTNAME} -o json | jq '.spec.volumes[] | select(.name=="course-workspace")' )

DEPLOYMENT=$( echo "$DEPLOYMENT" | jq --argjson v "$VOL" '.spec.template.spec.volumes += [ $v ]')


# echo "$DEPLOYMENT"
echo "$DEPLOYMENT" | kubectl create -f -
