Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

nvidia-container-toolkit: only mount existing paths in the host #319772

Merged
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,14 @@
inherit hostPath containerPath;
options = mountOptions;
};
jqAddMountExpression = ".containerEdits.mounts[.containerEdits.mounts | length] |= . +";
allJqMounts = lib.concatMap
(mount:
["${lib.getExe jq} '${jqAddMountExpression} ${builtins.toJSON (mkMount mount)}'"])
mounts;
# Render one mount as a shell invocation of the `additionalMount` function:
#   additionalMount <hostPath> <containerPath> <mountOptions as JSON>
# Every argument goes through lib.escapeShellArgs so that paths or options
# containing quotes, `$`, backticks or whitespace reach the function verbatim
# instead of breaking (or injecting into) the generated script.
mountToCommand = mount:
  "additionalMount ${lib.escapeShellArgs [
    mount.hostPath
    mount.containerPath
    (builtins.toJSON mount.mountOptions)
  ]}";
# Turn the list of mounts into a shell pipeline with one `additionalMount`
# stage per mount, joined by " | \<newline>". With no mounts the pipeline
# degenerates to a plain `cat`, so the upstream output is passed through as is.
mountsToCommands = mounts:
  let
    pipelineStages = lib.strings.concatMapStringsSep " | \\\n" mountToCommand mounts;
  in
  if mounts == [ ] then "cat" else pipelineStages;
in
writeScriptBin "nvidia-cdi-generator"
''
Expand All @@ -32,6 +35,18 @@ function cdiGenerate {
--nvidia-ctk-path ${lib.getExe' nvidia-container-toolkit "nvidia-ctk"}
}
cdiGenerate | \
${lib.concatStringsSep " | " allJqMounts} > $RUNTIME_DIRECTORY/nvidia-container-toolkit.json
function additionalMount {
Copy link
Contributor

@SomeoneSerge SomeoneSerge Aug 16, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

One comment: this could be rewritten with pythonMinimal, or a similar language that doesn't make this so painful (you could drop concatMapStringsSep and just export a JSON file for the script to read).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But as commented before, if you say this is the current iteration I'll go with it

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I'll work on a reimplementation of this logic with Python in future PRs, if that's fine with you.

local hostPath="$1"
local containerPath="$2"
local mountOptions="$3"
# Append the mount to the CDI spec read from stdin only when the host path
# exists; otherwise warn on stderr and forward the spec unchanged.
if [ -e "$hostPath" ]; then
  # Hand the values to jq as variables instead of splicing them into the
  # program text, so paths containing quotes or backslashes cannot break
  # (or alter) the filter. $mountOptions is already JSON, hence --argjson.
  ${lib.getExe jq} \
    --arg hostPath "$hostPath" \
    --arg containerPath "$containerPath" \
    --argjson options "$mountOptions" \
    '.containerEdits.mounts[.containerEdits.mounts | length] = { "hostPath": $hostPath, "containerPath": $containerPath, "options": $options }'
else
  echo "Mount $hostPath ignored: could not find path in the host machine" >&2
  # Keep the pipeline flowing: pass stdin through untouched.
  cat
fi
}
cdiGenerate |
${mountsToCommands mounts} > $RUNTIME_DIRECTORY/nvidia-container-toolkit.json
''
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,8 @@
/usr/local/nvidia/lib64.
'';
};

package = lib.mkPackageOption pkgs "nvidia-container-toolkit" { };
};

};
Expand Down Expand Up @@ -129,6 +131,7 @@
let
script = pkgs.callPackage ./cdi-generate.nix {
inherit (config.hardware.nvidia-container-toolkit) mounts;
nvidia-container-toolkit = config.hardware.nvidia-container-toolkit.package;
nvidia-driver = config.hardware.nvidia.package;
deviceNameStrategy = config.hardware.nvidia-container-toolkit.device-name-strategy;
};
Expand Down
1 change: 1 addition & 0 deletions nixos/tests/all-tests.nix
Original file line number Diff line number Diff line change
Expand Up @@ -699,6 +699,7 @@ in {
ntfy-sh = handleTest ./ntfy-sh.nix {};
ntfy-sh-migration = handleTest ./ntfy-sh-migration.nix {};
ntpd-rs = handleTest ./ntpd-rs.nix {};
nvidia-container-toolkit = handleTest ./nvidia-container-toolkit.nix {};
nvmetcfg = handleTest ./nvmetcfg.nix {};
nzbget = handleTest ./nzbget.nix {};
nzbhydra2 = handleTest ./nzbhydra2.nix {};
Expand Down
177 changes: 177 additions & 0 deletions nixos/tests/nvidia-container-toolkit.nix
Original file line number Diff line number Diff line change
@@ -0,0 +1,177 @@
import ./make-test-python.nix (
{
pkgs,
lib,
system,
...
}:
let
testContainerImage =
let
# Script run as the container's command: it walks a few directories and checks
# with ldd that every ELF file found there can resolve all of its shared
# libraries. Exits non-zero as soon as one file has an unresolved dependency.
testCDIScript = pkgs.writeShellScriptBin "test-cdi" ''
  # Print a message and abort with a non-zero status.
  die() {
    echo "$1"
    exit 1
  }

  # Return 1 when ldd reports at least one "not found" library for "$1",
  # 0 otherwise.
  check_file_referential_integrity() {
    echo "checking $1 referential integrity"
    if ${pkgs.glibc.bin}/bin/ldd "$1" | ${lib.getExe pkgs.gnugrep} -q "not found"; then
      return 1
    fi
    return 0
  }

  # Check every ELF file below directory "$1"; non-ELF files are skipped.
  check_directory_referential_integrity() {
    # IFS= and -r keep file names containing leading/trailing blanks or
    # backslashes intact while reading the NUL-separated list.
    ${lib.getExe pkgs.findutils} "$1" -type f -print0 | while IFS= read -r -d $'\0' file; do
      # --brief leaves the file name out of the output, so only the detected
      # file type (and not a path that happens to contain "ELF") is matched.
      if ${lib.getExe pkgs.file} --brief "$file" | ${lib.getExe pkgs.gnugrep} -q ELF; then
        # The loop runs in a pipeline subshell; exiting it with status 1
        # makes the whole function fail, and the message names the culprit.
        check_file_referential_integrity "$file" || die "$file has unresolved shared library dependencies"
      else
        echo "skipping $file: not an ELF file"
      fi
    done
  }

  check_directory_referential_integrity "/usr/bin" || exit 1
  check_directory_referential_integrity "${pkgs.addDriverRunpath.driverLink}" || exit 1
  check_directory_referential_integrity "/usr/local/nvidia" || exit 1
'';
in
pkgs.dockerTools.buildImage {
  name = "cdi-test";
  tag = "latest";

  # The image's command is the referential-integrity check script.
  config.Cmd = [ "${testCDIScript}/bin/test-cdi" ];

  # dockerTools helpers copied into the image root.
  copyToRoot = [
    pkgs.dockerTools.usrBinEnv
    pkgs.dockerTools.binSh
  ];
};
# Text of a shell script that prints a fixed CDI document (kind
# "nvidia.com/gpu") with a single device named "all" whose only device node
# is /dev/urandom and which has no hooks or mounts. It is written to
# bin/nvidia-ctk of the dummy package defined below.
emptyCDISpec = ''
#! ${pkgs.runtimeShell}
cat <<CDI_DOCUMENT
{
"cdiVersion": "0.5.0",
"kind": "nvidia.com/gpu",
"devices": [
{
"name": "all",
"containerEdits": {
"deviceNodes": [
{
"path": "/dev/urandom"
}
],
"hooks": [],
"mounts": []
}
}
],
"containerEdits": {
"deviceNodes": [],
"hooks": [],
"mounts": []
}
}
CDI_DOCUMENT
'';
# hardware.nvidia-container-toolkit settings shared by the test nodes: the
# module is enabled and its package is replaced by a dummy.
nvidia-container-toolkit = {
  # Stand-in package: nothing is unpacked or built; the install phase only
  # writes the emptyCDISpec script to $out/bin/nvidia-ctk and marks it
  # executable.
  package = pkgs.stdenv.mkDerivation {
    name = "nvidia-ctk-dummy";
    version = "1.0.0";

    dontBuild = true;
    dontUnpack = true;

    installPhase = ''
      mkdir -p $out/bin
      cat <<EOF > $out/bin/nvidia-ctk
      ${emptyCDISpec}
      EOF
      chmod +x $out/bin/nvidia-ctk
    '';
  };

  enable = true;
};
in
{
name = "nvidia-container-toolkit";
meta = with lib.maintainers; {
maintainers = [ ereslibre ];
};
nodes = {
no-nvidia-gpus =
{ config, ... }:
{
environment.systemPackages = with pkgs; [ jq ];
hardware = {
inherit nvidia-container-toolkit;
nvidia = {
open = true;
package = config.boot.kernelPackages.nvidiaPackages.stable.open;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ehhh it's stupid that open doesn't change the default to... an open driver

};
};
};

nvidia-one-gpu =
{ config, pkgs, ... }:
{
virtualisation.diskSize = 10240;
environment.systemPackages = with pkgs; [
jq
podman
];
hardware = {
inherit nvidia-container-toolkit;
nvidia = {
open = true;
package = config.boot.kernelPackages.nvidiaPackages.stable.open;
};
# hardware.opengl.enable has been renamed to hardware.graphics.enable.
graphics.enable = true;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't understand why Ofborg isn't complaining; we have a mkRenamedOptionModule for this one

};
virtualisation.containers.enable = true;
};

# Node whose nvidia-container-toolkit configuration requests an extra mount
# from a host path that does not exist. The test script checks that this path
# does not show up in the generated CDI spec.
nvidia-one-gpu-invalid-host-paths =
  { config, pkgs, ... }:
  {
    virtualisation.diskSize = 10240;
    environment.systemPackages = with pkgs; [ jq ];
    hardware = {
      # Shared settings plus the bogus mount. The spelling of the path is
      # matched verbatim by the grep in the test script, so keep both in sync.
      nvidia-container-toolkit = nvidia-container-toolkit // {
        mounts = [
          {
            hostPath = "/non-existant-path";
            containerPath = "/some/path";
          }
        ];
      };
      nvidia = {
        open = true;
        package = config.boot.kernelPackages.nvidiaPackages.stable.open;
      };
      # hardware.opengl.enable has been renamed to hardware.graphics.enable.
      graphics.enable = true;
    };
    virtualisation.containers.enable = true;
  };
};
# Python test driver script. Each subtest waits for the CDI generator unit on
# one node and then inspects the spec it wrote to /var/run/cdi.
testScript = ''
  start_all()

  with subtest("Generate an empty CDI spec for a machine with no Nvidia GPUs"):
      no_nvidia_gpus.wait_for_unit("nvidia-container-toolkit-cdi-generator.service")
      # jq fails on a missing file or invalid JSON, so this validates the spec.
      no_nvidia_gpus.succeed("jq . /var/run/cdi/nvidia-container-toolkit.json")

  with subtest("Podman loads the generated CDI spec for a machine with an Nvidia GPU"):
      nvidia_one_gpu.wait_for_unit("nvidia-container-toolkit-cdi-generator.service")
      nvidia_one_gpu.succeed("jq . /var/run/cdi/nvidia-container-toolkit.json")
      nvidia_one_gpu.succeed("podman load < ${testContainerImage}")
      print(nvidia_one_gpu.succeed("podman run --pull=never --device=nvidia.com/gpu=all -v /run/opengl-driver:/run/opengl-driver:ro cdi-test:latest"))

  # Issue: https://github.com/NixOS/nixpkgs/issues/319201
  with subtest("The generated CDI spec skips specified non-existant paths in the host"):
      nvidia_one_gpu_invalid_host_paths.wait_for_unit("nvidia-container-toolkit-cdi-generator.service")
      # Make sure the spec exists and is valid JSON first: grep also exits
      # non-zero when the file is missing, which would let the negative check
      # below pass without proving anything.
      nvidia_one_gpu_invalid_host_paths.succeed("jq . /var/run/cdi/nvidia-container-toolkit.json")
      nvidia_one_gpu_invalid_host_paths.fail("grep 'non-existant-path' /var/run/cdi/nvidia-container-toolkit.json")
'';
}
)