Skip to content

Commit

Permalink
Add repos for gpu and mlx install to kickstart file in Rocky 8 (#2405)
Browse files Browse the repository at this point in the history
* Remove gpu and mlx install from kickstart file

We will be installing directly from CIQ repos once they're ready anyway

* Add CIQ repos for Nvidia and Mlx drivers

* Add dnf install commands for the repos added in the previous commit

* Replace preview Rocky 9 workflows with final workflows

Also cleaning up old workflow files for preview builds

* Fix missing EOM

* Fix naming of workflows

More concise, but still clear

* Add cuda repos for kickstart configs

* Fix missing EOM

* Install nvidia-accelerated-graphics-driver with kickstart
  • Loading branch information
jjerger authored Nov 14, 2024
1 parent 7f38213 commit 6d8c76a
Show file tree
Hide file tree
Showing 11 changed files with 365 additions and 128 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@
"build": {
"TimeOut": "60m",
"IncludeWorkflow": {
"Path": "${workflow_root}/image_build/enterprise_linux/rocky_linux_8_optimized_gcp_with_nvidia_latest.wf.json",
"Path": "${workflow_root}/image_build/enterprise_linux/rocky_linux_8_optimized_gcp_nvidia_latest.wf.json",
"Vars": {
"build_date": "${build_date}",
"installer_iso": "${installer_iso}"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@
Template to publish Rocky 9 optimized for GCP/Nvidia images.
By default this template is set up to publish to the 'gce-image-builder'
project; the 'environment' variable can be used to publish to 'test' or 'prod'.
DeleteAfter is set to 180 days for all environments other than prod where no
DeleteAfter is set to 190 days for all environments other than prod where no
time period is set.
*/}}
{
"Name": "rocky-linux-9-optimized-gcp-with-nvidia-550",
"Name": "rocky-linux-9-optimized-gcp-nvidia-latest",
{{$work_project := printf "%q" "gce-image-builder" -}}
{{$endpoint := `"https://www.googleapis.com/compute/alpha/projects/"` -}}
{{$delete_after := `"24h*30*4"` -}}
Expand All @@ -33,9 +33,9 @@
{{$time := trimPrefix .publish_version "v"}}
"Images": [
{
"Prefix": "rocky-linux-9-optimized-gcp-with-nvidia-550",
"Family": "rocky-linux-9-optimized-gcp-with-nvidia-550",
"Description": "Rocky Linux 9 optimized for GCP with Nvidia 550 built on {{$time}}",
"Prefix": "rocky-linux-9-optimized-gcp-nvidia-latest",
"Family": "rocky-linux-9-optimized-gcp-nvidia-latest",
"Description": "Rocky Linux 9 optimized for GCP with latest Nvidia driver built on {{$time}}",
"Architecture": "X86_64",
"Licenses": [
"https://www.googleapis.com/compute/v1/projects/accelerator-preview-images/global/licenses/accelerator-preview-image",
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"Name": "rocky-linux-9-optimized-gcp-with-nvidia-550",
"Name": "rocky-linux-9-optimized-gcp-nvidia-latest",
"Project": "gce-image-builder",
"Zone": "us-central1-b",
"GCSPath": "gs://gce-image-build-bucket/daisy/${USERNAME}",
Expand All @@ -24,6 +24,10 @@
"Value": "${OUTSPATH}/export-image.sbom.json",
"Description": "SBOM final export destination, copies in place by default"
},
"installer_iso": {
"Required": true,
"Description": "The Rocky Linux 9 installer ISO to build from."
},
"sbom_util_gcs_root": {
"Value": "",
"Description": "The root gcs bucket for sbomutil, if using sbomutil to generate the SBOM."
Expand All @@ -37,44 +41,34 @@
"build": {
"TimeOut": "60m",
"IncludeWorkflow": {
"Path": "${workflow_root}/image_build/enterprise_linux/rocky_linux_9_optimized_gcp_with_nvidia_550.wf.json",
"Path": "${workflow_root}/image_build/enterprise_linux/rocky_linux_9_optimized_gcp_nvidia_latest.wf.json",
"Vars": {
"build_date": "${build_date}"
"build_date": "${build_date}",
"installer_iso": "${installer_iso}"
}
}
},
"create-disk": {
"CreateDisks": [
{
"Name": "disk-rocky-linux-9-optimized-gcp-with-nvidia-550",
"SourceImage": "rocky-linux-9-optimized-gcp-with-nvidia-550-v${build_date}",
"SizeGb": "30",
"Type": "pd-ssd"
}
]
},
"export-image": {
"Timeout": "60m",
"IncludeWorkflow": {
"Path": "${workflow_root}/export/disk_export.wf.json",
"Vars": {
"destination": "${gcs_url}",
"sbom_destination": "${sbom_destination}",
"source_disk": "disk-rocky-linux-9-optimized-gcp-with-nvidia-550",
"source_disk": "el-install-disk",
"sbom_util_gcs_root": "${sbom_util_gcs_root}",
"sha256_txt": "${sha256_txt}"
}
}
},
"cleanup-image": {
"DeleteResources": {
"Images": ["rocky-linux-9-optimized-gcp-with-nvidia-550-v${build_date}"]
"Images": ["rocky-linux-9-optimized-gcp-nvidia-latest-v${build_date}"]
}
}
},
"Dependencies": {
"create-disk": ["build"],
"export-image": ["create-disk"],
"export-image": ["build"],
"cleanup-image": ["export-image"]
}
}

This file was deleted.

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -184,20 +184,33 @@ dnf install -y google-compute-engine google-osconfig-agent gce-disk-expand
dnf install -y google-cloud-cli

# Install Accelerator components: nvidia and mellanox drivers
tee -a /etc/yum.repos.d/Rocky-OpenGPU.repo << EOM
[open-gpu-kernel-modules-el8-x86_64]
name = Open gpu kernel modules (x86_64)
baseurl = https://depot.ciq.com/public/files/gce-accelerator/open-gpu-kernel-modules-el8-x86_64
metadata_expire = 5
priority = 50
repo_gpgcheck = false
gpgcheck = false
enabled = true
skip_if_unavailable = true
EOM
tee -a /etc/yum.repos.d/Rocky-Mlx.repo << EOM
[nvidia-mellanox-ofed-driver-el8-x86_64]
name = Nvidia Mellanox OFED Drivers
baseurl = https://depot.ciq.com/public/files/gce-accelerator/nvidia-mellanox-ofed-driver-el8-x86_64
metadata_expire = 5
priority = 50
repo_gpgcheck = false
gpgcheck = false
enabled = true
skip_if_unavailable = true
EOM

dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo
dnf install -y gcc make kernel-devel kernel
test -f /var/tmp/kernel-upgrade-done || sh -c 'touch /var/tmp/kernel-upgrade-done'
curl -L -o nvidia.run https://us.download.nvidia.com/tesla/550.90.12/NVIDIA-Linux-x86_64-550.90.12.run
chmod +x ./nvidia.run
# DKMS - not suitable for prod
./nvidia.run -s --kernel-source-path=/usr/src/kernels/$(uname -r)/
dnf install -y createrepo gdb-headless libtool autoconf rpm-build kernel-rpm-macros patch automake wget lsof tk gcc-gfortran tcl pciutils
wget https://content.mellanox.com/ofed/MLNX_OFED-23.10-3.2.2.0/MLNX_OFED_LINUX-23.10-3.2.2.0-rhel8.9-x86_64.tgz
tar xf MLNX_OFED_LINUX-23.10-3.2.2.0-rhel8.9-x86_64.tgz
cd MLNX_OFED_LINUX-23.10-3.2.2.0-rhel8.9-x86_64
./mlnxofedinstall --guest --force --skip-distro-check --add-kernel-support
cd ..
rm -rf MLNX_OFED_LINUX-23.10-3.2.2.0-rhel8.9-x86_64 MLNX_OFED_LINUX-23.10-3.2.2.0-rhel8.9-x86_64.tgz
dnf install -y open-gpu-kernel-modules-el8-x86_64
dnf install -y nvidia-accelerated-graphics-driver
dnf install -y nvidia-mellanox-ofed-driver-el8-x86_64

# Send /root/anaconda-ks.cfg to our logs.
cp /run/install/ks.cfg /tmp/anaconda-ks.cfg
Expand Down
Loading

0 comments on commit 6d8c76a

Please sign in to comment.