Skip to content

Commit

Permalink
Update Ray version in Dockerfile and add v5 configs (#161)
Browse files Browse the repository at this point in the history
update ray; add v5 configs
  • Loading branch information
richardsliu authored Jul 31, 2024
1 parent c81767f commit ed1e853
Show file tree
Hide file tree
Showing 5 changed files with 289 additions and 5 deletions.
2 changes: 1 addition & 1 deletion kuberay/image/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM rayproject/ray:2.22.0-py310
FROM rayproject/ray:2.32.0-py310

RUN pip install flax==0.8.3
RUN pip install jax[tpu]==0.4.30 -f https://storage.googleapis.com/jax-releases/libtpu_releases.html
Expand Down
4 changes: 2 additions & 2 deletions kuberay/manifests/ray-cluster.tpu-v4-multihost.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ spec:
- mountPath: /tmp/ray
name: ray-logs
name: ray-head
image: gcr.io/tpu-vm-gke-testing/ricliu-jetstream:20240709
image: gcr.io/tpu-vm-gke-testing/ricliu-jetstream:20240729
imagePullPolicy: IfNotPresent
resources:
limits:
Expand Down Expand Up @@ -100,7 +100,7 @@ spec:
mountPath: /llama
readOnly: true
name: ray-worker
image: gcr.io/tpu-vm-gke-testing/ricliu-jetstream:20240709
image: gcr.io/tpu-vm-gke-testing/ricliu-jetstream:20240729
imagePullPolicy: IfNotPresent
resources:
limits:
Expand Down
4 changes: 2 additions & 2 deletions kuberay/manifests/ray-cluster.tpu-v4-singlehost.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ spec:
- mountPath: /tmp/ray
name: ray-logs
name: ray-head
image: gcr.io/tpu-vm-gke-testing/ricliu-jetstream:20240709
image: gcr.io/tpu-vm-gke-testing/ricliu-jetstream:20240729
imagePullPolicy: IfNotPresent
resources:
limits:
Expand Down Expand Up @@ -96,7 +96,7 @@ spec:
mountPath: /llama
readOnly: true
name: ray-worker
image: gcr.io/tpu-vm-gke-testing/ricliu-jetstream:20240709
image: gcr.io/tpu-vm-gke-testing/ricliu-jetstream:20240729
imagePullPolicy: IfNotPresent
resources:
limits:
Expand Down
144 changes: 144 additions & 0 deletions kuberay/manifests/ray-cluster.tpu-v5-multihost.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
# This template contains a KubeRay cluster using a multi-host 2x4 TPU v5e
# (tpu-v5-lite-podslice) PodSlice: 2 hosts with 4 TPU chips each.
# To get access to TPU resources, please follow instructions in this link:
# https://cloud.google.com/kubernetes-engine/docs/how-to/tpus
apiVersion: ray.io/v1
kind: RayCluster
metadata:
  name: example-cluster-kuberay
spec:
  # Head group: a single pod that requests no TPU resources.
  headGroupSpec:
    rayStartParams: {}
    template:
      spec:
        imagePullSecrets: []
        serviceAccountName: ray-ksa
        containers:
          - volumeMounts:
              # Model checkpoint bucket, mounted read-only at /llama.
              - name: gcs-fuse-checkpoint
                mountPath: /llama
                readOnly: true
              - mountPath: /tmp/ray
                name: ray-logs
            name: ray-head
            image: gcr.io/tpu-vm-gke-testing/ricliu-jetstream:20240729
            imagePullPolicy: IfNotPresent
            resources:
              limits:
                cpu: "4"
                ephemeral-storage: 30Gi
                memory: 40G
              requests:
                cpu: "4"
                ephemeral-storage: 30Gi
                memory: 40G
            securityContext: {}
            env:
              # The head pod has no TPU attached, so JAX is pinned to CPU.
              - name: JAX_PLATFORMS
                value: "cpu"
              # NOTE(review): per the Ray docs, 0 disables the Ray memory
              # monitor; confirm this is intended.
              - name: RAY_memory_monitor_refresh_ms
                value: "0"
              # ${grafana_host} is a placeholder that must be substituted
              # before this manifest is applied.
              - name: RAY_GRAFANA_IFRAME_HOST
                value: http://${grafana_host}
              - name: RAY_GRAFANA_HOST
                value: http://grafana:80
              - name: RAY_PROMETHEUS_HOST
                value: http://frontend:9090
            ports:
              - containerPort: 6379
                name: gcs
              - containerPort: 8265
                name: dashboard
              - containerPort: 10001
                name: client
              - containerPort: 8000
                name: serve
              - containerPort: 8471
                name: slicebuilder
              - containerPort: 8081
                name: mxla
              - containerPort: 8888
                name: grpc
        volumes:
          - emptyDir: {}
            name: ray-logs
          - name: gcs-fuse-checkpoint
            csi:
              driver: gcsfuse.csi.storage.gke.io
              readOnly: true
              volumeAttributes:
                bucketName: ricliu-llama2-70b-chat
                mountOptions: "implicit-dirs"
      metadata:
        annotations:
          # Required by the GKE Cloud Storage FUSE CSI driver for the
          # gcs-fuse-checkpoint volume above.
          gke-gcsfuse/volumes: "true"
        labels:
          cloud.google.com/gke-ray-node-type: head
          app.kubernetes.io/name: kuberay
          app.kubernetes.io/instance: example-cluster

  # Worker group: one replica (TPU slice) spanning numOfHosts pods.
  workerGroupSpecs:
    - rayStartParams: {}
      replicas: 1
      minReplicas: 1
      maxReplicas: 1
      # A 2x4 v5e slice with 4 chips per host spans 2 hosts.
      numOfHosts: 2
      groupName: workergroup
      template:
        spec:
          imagePullSecrets: []
          serviceAccountName: ray-ksa
          containers:
            - volumeMounts:
                - mountPath: /tmp/ray
                  name: ray-logs
                - name: gcs-fuse-checkpoint
                  mountPath: /llama
                  readOnly: true
              name: ray-worker
              image: gcr.io/tpu-vm-gke-testing/ricliu-jetstream:20240729
              imagePullPolicy: IfNotPresent
              resources:
                limits:
                  cpu: "8"
                  ephemeral-storage: 30Gi
                  # TPU chips per host.
                  google.com/tpu: "4"
                  memory: 180G
                requests:
                  cpu: "8"
                  ephemeral-storage: 30Gi
                  google.com/tpu: "4"
                  memory: 180G
              securityContext: {}
              env:
                # NOTE(review): this pod requests TPUs yet defaults JAX to
                # CPU; presumably the workload overrides it - confirm.
                - name: JAX_PLATFORMS
                  value: "cpu"
              ports: null
          volumes:
            - emptyDir: {}
              name: ray-logs
            - name: gcs-fuse-checkpoint
              csi:
                driver: gcsfuse.csi.storage.gke.io
                readOnly: true
                volumeAttributes:
                  bucketName: ricliu-llama2-70b-chat
                  mountOptions: "implicit-dirs"
          nodeSelector:
            iam.gke.io/gke-metadata-server-enabled: "true"
            cloud.google.com/gke-tpu-topology: 2x4
            cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice
        metadata:
          annotations:
            gke-gcsfuse/volumes: "true"
          labels:
            cloud.google.com/gke-ray-node-type: worker
            app.kubernetes.io/name: kuberay
            app.kubernetes.io/instance: example-cluster

140 changes: 140 additions & 0 deletions kuberay/manifests/ray-cluster.tpu-v5-singlehost.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
# This template contains a KubeRay cluster using a single-host 2x4 TPU v5e
# (tpu-v5-lite-podslice) PodSlice: 1 host with 8 TPU chips.
# To get access to TPU resources, please follow instructions in this link:
# https://cloud.google.com/kubernetes-engine/docs/how-to/tpus
apiVersion: ray.io/v1
kind: RayCluster
metadata:
  name: example-cluster-kuberay
spec:
  # Head group: a single pod that requests no TPU resources.
  headGroupSpec:
    rayStartParams: {}
    template:
      spec:
        imagePullSecrets: []
        serviceAccountName: ray-ksa
        containers:
          - volumeMounts:
              # Model checkpoint bucket, mounted read-only at /llama.
              - name: gcs-fuse-checkpoint
                mountPath: /llama
                readOnly: true
              - mountPath: /tmp/ray
                name: ray-logs
            name: ray-head
            image: gcr.io/tpu-vm-gke-testing/ricliu-jetstream:20240729
            imagePullPolicy: IfNotPresent
            resources:
              limits:
                cpu: "4"
                ephemeral-storage: 30Gi
                memory: 40G
              requests:
                cpu: "4"
                ephemeral-storage: 30Gi
                memory: 40G
            securityContext: {}
            env:
              # The head pod has no TPU attached, so JAX is pinned to CPU.
              - name: JAX_PLATFORMS
                value: "cpu"
              # NOTE(review): per the Ray docs, 0 disables the Ray memory
              # monitor; confirm this is intended.
              - name: RAY_memory_monitor_refresh_ms
                value: "0"
              # ${grafana_host} is a placeholder that must be substituted
              # before this manifest is applied.
              - name: RAY_GRAFANA_IFRAME_HOST
                value: http://${grafana_host}
              - name: RAY_GRAFANA_HOST
                value: http://grafana:80
              - name: RAY_PROMETHEUS_HOST
                value: http://frontend:9090
            ports:
              - containerPort: 6379
                name: gcs
              - containerPort: 8265
                name: dashboard
              - containerPort: 10001
                name: client
              - containerPort: 8000
                name: serve
              - containerPort: 8888
                name: grpc
        volumes:
          - emptyDir: {}
            name: ray-logs
          - name: gcs-fuse-checkpoint
            csi:
              driver: gcsfuse.csi.storage.gke.io
              readOnly: true
              volumeAttributes:
                bucketName: ricliu-llama2
                mountOptions: "implicit-dirs"
      metadata:
        annotations:
          # Required by the GKE Cloud Storage FUSE CSI driver for the
          # gcs-fuse-checkpoint volume above.
          gke-gcsfuse/volumes: "true"
        labels:
          cloud.google.com/gke-ray-node-type: head
          app.kubernetes.io/name: kuberay
          app.kubernetes.io/instance: example-cluster

  # Worker group: one replica consisting of a single TPU host.
  workerGroupSpecs:
    - rayStartParams: {}
      replicas: 1
      minReplicas: 1
      maxReplicas: 1
      # All 8 chips of the 2x4 slice sit on one host.
      numOfHosts: 1
      groupName: workergroup
      template:
        spec:
          imagePullSecrets: []
          serviceAccountName: ray-ksa
          containers:
            - volumeMounts:
                - mountPath: /tmp/ray
                  name: ray-logs
                - name: gcs-fuse-checkpoint
                  mountPath: /llama
                  readOnly: true
              name: ray-worker
              image: gcr.io/tpu-vm-gke-testing/ricliu-jetstream:20240729
              imagePullPolicy: IfNotPresent
              resources:
                limits:
                  cpu: "8"
                  ephemeral-storage: 30Gi
                  # TPU chips on the single host.
                  google.com/tpu: "8"
                  memory: 200G
                requests:
                  cpu: "8"
                  ephemeral-storage: 30Gi
                  google.com/tpu: "8"
                  memory: 200G
              securityContext: {}
              env:
                # NOTE(review): this pod requests TPUs yet defaults JAX to
                # CPU; presumably the workload overrides it - confirm.
                - name: JAX_PLATFORMS
                  value: "cpu"
              ports: null
          volumes:
            - emptyDir: {}
              name: ray-logs
            - name: gcs-fuse-checkpoint
              csi:
                driver: gcsfuse.csi.storage.gke.io
                readOnly: true
                volumeAttributes:
                  bucketName: ricliu-llama2
                  mountOptions: "implicit-dirs"
          nodeSelector:
            cloud.google.com/gke-tpu-topology: 2x4
            cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice
            iam.gke.io/gke-metadata-server-enabled: "true"
        metadata:
          annotations:
            gke-gcsfuse/volumes: "true"
          labels:
            cloud.google.com/gke-ray-node-type: worker
            app.kubernetes.io/name: kuberay
            app.kubernetes.io/instance: example-cluster

0 comments on commit ed1e853

Please sign in to comment.