Merge pull request #1 from ucsd-ets/pa3
New PA 3 startup files
SumanthRH authored Mar 5, 2024
2 parents c8fba2d + daa0746 commit 909f10f
Showing 3 changed files with 150 additions and 95 deletions.
16 changes: 3 additions & 13 deletions Dockerfile
@@ -1,5 +1,5 @@

FROM rayproject/ray-ml:latest-gpu
FROM rayproject/ray-ml:2.9.3

USER root

@@ -75,18 +75,8 @@ COPY jupyter_config.py start-workers.sh start-cluster.sh stop-cluster.sh /opt/ra
RUN chmod 0755 /opt/ray-support/*.sh
RUN mkdir -p /usr/local/etc/jupyter && cat /opt/ray-support/jupyter_config.py >> /usr/local/etc/jupyter/jupyter_config.py

# install bazel
RUN apt install apt-transport-https curl gnupg -y && \
curl -fsSL https://bazel.build/bazel-release.pub.gpg | gpg --dearmor >bazel-archive-keyring.gpg && \
mv bazel-archive-keyring.gpg /usr/share/keyrings && \
echo "deb [arch=amd64 signed-by=/usr/share/keyrings/bazel-archive-keyring.gpg] https://storage.googleapis.com/bazel-apt stable jdk1.8" | tee /etc/apt/sources.list.d/bazel.list
RUN apt update && apt install bazel-3.2.0
# downgrade protobuf and numpy for transformers and pygloo
RUN pip3 install protobuf==3.20.0 numpy==1.20.0 pandas==1.4.0
# install pygloo
RUN git clone https://github.com/ray-project/pygloo.git
RUN ln -s "/usr/bin/bazel-3.2.0" "/usr/bin/bazel"
RUN alias bazel="bazel-3.2.0" && cd pygloo && python setup.py install && cd ..
# add gensim: experimental
RUN pip3 install gensim

USER 1000
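
With the base image now pinned to rayproject/ray-ml:2.9.3, a quick sanity check after rebuilding is to confirm the Ray version and the gensim install inside a throwaway container. A minimal sketch, assuming the image is tagged ray-pa3 (an illustrative name, not part of this repo):

    docker build -t ray-pa3 .
    # Print the Ray and gensim versions baked into the image; expect 2.9.3 for Ray.
    docker run --rm ray-pa3 python -c "import ray, gensim; print(ray.__version__, gensim.__version__)"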

12 changes: 10 additions & 2 deletions start-cluster.sh
@@ -9,10 +9,14 @@ fi

exec 2>>$HOME/ray-head.log 1>&2

CPU_GUARANTEE=${CPU_GUARANTEE:-$(( $KUBERNETES_LIMIT_CPU))}
MEM_GUARANTEE=${MEM_GUARANTEE:-$(( $KUBERNETES_LIMIT_MEM ))}
OBJECT_STORE_MEM=$(( $MEM_GUARANTEE / 2))

# Datahub exposes limits/requests as floating point, Ray wants int
MY_CPU_REQUEST=$(printf "%.0f" "$CPU_GUARANTEE")

ray start --head --port=6380 --num-cpus=$MY_CPU_REQUEST --dashboard-host=0.0.0.0 --object-manager-port=8076 --node-manager-port=8077 --dashboard-agent-grpc-port=8078 --dashboard-agent-listen-port=52365 --disable-usage-stats --object-store-memory 4294967296
ray start --head --port=6380 --num-cpus=$MY_CPU_REQUEST --dashboard-host=0.0.0.0 --object-manager-port=8076 --node-manager-port=8077 --dashboard-agent-grpc-port=8078 --dashboard-agent-listen-port=52365 --disable-usage-stats --object-store-memory $OBJECT_STORE_MEM --memory $MEM_GUARANTEE --system-config='{"object_spilling_config":"{\"type\":\"filesystem\",\"params\":{\"directory_path\":\"/tmp/spill\"}}"}'

if ! kubectl get svc service-ray-cluster 2>/dev/null > /dev/null; then
kubectl create -f - <<EOM
@@ -45,7 +49,11 @@ EOM
fi

# Now fire up workers
/opt/ray-support/start-workers.sh "$@"
if [ -x "$HOME/start-workers.sh" ]; then
exec "$HOME/start-workers.sh" "$@"
else
exec /opt/ray-support/start-workers.sh "$@"
fi

# Execution shouldn't reach here unless both start-workers scripts are missing; if so, assume that's intentional
exit 0
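
The rewritten head-node start derives Ray's memory settings from the pod's guarantees instead of hard-coding 4 GiB, and rounds the (possibly fractional) CPU guarantee DataHub exposes to the integer Ray requires. A minimal sketch of that arithmetic with hypothetical values (8 GiB of memory, 3.5 CPUs), not taken from any real pod spec:

    # Hypothetical guarantees, for illustration only
    CPU_GUARANTEE=3.5
    MEM_GUARANTEE=$(( 8 * 1024 * 1024 * 1024 ))        # 8589934592 bytes
    OBJECT_STORE_MEM=$(( MEM_GUARANTEE / 2 ))          # 4294967296 bytes: half the guarantee goes to the object store
    MY_CPU_REQUEST=$(printf "%.0f" "$CPU_GUARANTEE")   # "4": --num-cpus must be an integer
    echo "--num-cpus=$MY_CPU_REQUEST --memory=$MEM_GUARANTEE --object-store-memory=$OBJECT_STORE_MEM"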
217 changes: 137 additions & 80 deletions start-workers.sh
@@ -9,89 +9,146 @@ fi

NUM_WORKERS=2

WORKER_CPU_REQUEST=3 #set
WORKER_CPU_LIMIT=3 #upper bound
WORKER_MEM_REQUEST=8192M
WORKER_MEM_LIMIT=8192M
WORKER_CPU_REQUEST=8
WORKER_CPU_LIMIT=8
WORKER_MEM_REQUEST=16384M
WORKER_MEM_LIMIT=16384M
WORKER_GPU_COUNT=0

IMAGE=${JUPYTER_IMAGE_SPEC:-${DOCKER_IMAGE}}
echo "STARTING WORKERS WITH WORKER_CPU_REQUEST=${WORKER_CPU_REQUEST}"
if kubectl get deployment deployment-ray-worker 2>/dev/null > /dev/null; then
kubectl delete -f deployment deployment-ray-worker
kubectl delete deployment deployment-ray-worker
fi

kubectl create -f - <<EOM
apiVersion: apps/v1
kind: Deployment
metadata:
  name: deployment-ray-worker
  labels:
    app: ray-cluster-worker
spec:
  # Change this to scale the number of worker nodes started in the Ray cluster.
  replicas: ${NUM_WORKERS}
  selector:
    matchLabels:
      component: ray-worker
      type: ray
      app: ray-cluster-worker
  template:
    metadata:
      labels:
        component: ray-worker
        type: ray
        app: ray-cluster-worker
    spec:
      restartPolicy: Always
      volumes:
      - name: dshm
        emptyDir:
          medium: Memory
      - name: home
        persistentVolumeClaim:
          claimName: home
      securityContext:
        runAsUser: $(id -u)
      containers:
      - name: ray-worker
        image: "${JUPYTER_IMAGE_SPEC}"
        imagePullPolicy: Always
        command: ["/bin/bash", "-c", "--"]
        args:
          - "ray start --num-cpus=${WORKER_CPU_REQUEST} --address=service-ray-cluster:6380 --object-manager-port=8076 --node-manager-port=8077 --dashboard-agent-grpc-port=8078 --dashboard-agent-listen-port=52365 --block --object-store-memory 4294967296 --memory 7516192768"
        securityContext:
          allowPrivilegeEscalation: false
        # This volume allocates shared memory for Ray to use for its plasma
        # object store. If you do not provide this, Ray will fall back to
        # /tmp, which causes slowdowns if it's not a shared memory volume.
        volumeMounts:
        - mountPath: /dev/shm
          name: dshm
        - mountPath: "/home/${USER}/private"
          name: home
        env:
        # This is used in the ray start command so that Ray can spawn the
        # correct number of processes. Omitting this may lead to degraded
        # performance.
        - name: MY_CPU_REQUEST
          valueFrom:
            resourceFieldRef:
              resource: requests.cpu
        # The resource requests and limits in this config are too small for production!
        # It is better to use a few large Ray pods than many small ones.
        # For production, it is ideal to size each Ray pod to take up the
        # entire Kubernetes node on which it is scheduled.
        resources:
          limits:
            cpu: "${WORKER_CPU_LIMIT}"
            memory: "${WORKER_MEM_LIMIT}"
          # For production use-cases, we recommend specifying integer CPU requests and limits.
          # We also recommend setting requests equal to limits for both CPU and memory.
          # For this example, we use a 500m CPU request to accommodate resource-constrained local
          # Kubernetes testing environments such as Kind and minikube.
          requests:
            cpu: "${WORKER_CPU_REQUEST}"
            memory: "${WORKER_MEM_REQUEST}"

read -d '' DEPLOYMENT <<EOM
{
"apiVersion": "apps/v1",
"kind": "Deployment",
"metadata": {
"labels": {
"app": "ray-cluster-worker"
},
"name": "deployment-ray-worker"
},
"spec": {
"progressDeadlineSeconds": 600,
"replicas": ${NUM_WORKERS},
"selector": {
"matchLabels": {
"app": "ray-cluster-worker",
"component": "ray-worker",
"type": "ray"
}
},
"template": {
"metadata": {
"creationTimestamp": null,
"labels": {
"app": "ray-cluster-worker",
"component": "ray-worker",
"type": "ray"
}
},
"spec": {
"containers": [
{
"args": [
"ray start --num-cpus=${WORKER_CPU_REQUEST} --address=service-ray-cluster:6380 --object-manager-port=8076 --node-manager-port=8077 --dashboard-agent-grpc-port=8078 --dashboard-agent-listen-port=52365 --block --object-store-memory=7516192768 --memory=17179869184"
],
"command": [
"/bin/bash",
"-c",
"--"
],
"env": [
{
"name": "MY_CPU_REQUEST",
"valueFrom": {
"resourceFieldRef": {
"divisor": "0",
"resource": "requests.cpu"
}
}
}
],
"image": "${IMAGE}",
"imagePullPolicy": "Always",
"name": "ray-worker",
"resources": {
"limits": {
"cpu": "${WORKER_CPU_LIMIT}",
"memory": "${WORKER_MEM_LIMIT}"
},
"requests": {
"cpu": "${WORKER_CPU_REQUEST}",
"memory": "${WORKER_MEM_REQUEST}"
}
},
"securityContext": {
"allowPrivilegeEscalation": false,
"runAsUser": ${UID}
},
"terminationMessagePath": "/dev/termination-log",
"terminationMessagePolicy": "File",
"volumeMounts": [
{
"mountPath": "/dev/shm",
"name": "dshm"
},
{
"mountPath": "/datasets",
"name": "datasets"
},
{
"mountPath": "/home/${USER}/private",
"name": "home"
},
{
"mountPath": "/home/${USER}",
"name": "course-workspace",
"subPath": "home/${USER}"
},
{
"mountPath": "/home/${USER}/public",
"name": "course-workspace",
"subPath": "public"
}
]
}
],
"terminationGracePeriodSeconds": 30,
"volumes": [
{
"emptyDir": {
"medium": "Memory"
},
"name": "dshm"
},
{
"persistentVolumeClaim": {
"claimName": "dsmlp-datasets"
},
"name": "datasets"
},
{
"persistentVolumeClaim": {
"claimName": "home"
},
"name": "home"
}
]
}
}
}
}
EOM

VOL=$( kubectl get pod ${HOSTNAME} -o json | jq '.spec.volumes[] | select(.name=="course-workspace")' )

DEPLOYMENT=$( echo "$DEPLOYMENT" | jq --argjson v "$VOL" '.spec.template.spec.volumes += [ $v ]')


# echo "$DEPLOYMENT"
echo "$DEPLOYMENT" | kubectl create -f -
