Skip to content

Commit

Permalink
python3Packages.triton.tests.axpy-cuda: init
Browse files Browse the repository at this point in the history
  • Loading branch information
SomeoneSerge committed Oct 15, 2024
1 parent ae56006 commit 2aa951f
Show file tree
Hide file tree
Showing 4 changed files with 164 additions and 32 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
From c5d4087519eae6f41c80bbd8ffbcc9390db44c7f Mon Sep 17 00:00:00 2001
From: SomeoneSerge <else+aalto@someonex.net>
Date: Thu, 10 Oct 2024 19:19:18 +0000
Subject: [PATCH] cmake.py: propagate cmakeFlags from environment

---
tools/setup_helpers/cmake.py | 6 ++++++
1 file changed, 6 insertions(+)

diff --git a/tools/setup_helpers/cmake.py b/tools/setup_helpers/cmake.py
index 4b605fe5975..ea1d6a1ef46 100644
--- a/tools/setup_helpers/cmake.py
+++ b/tools/setup_helpers/cmake.py
@@ -332,6 +332,12 @@ class CMake:
file=sys.stderr,
)
print(e, file=sys.stderr)
+
+ # Nixpkgs compat:
+ if "cmakeFlags" in os.environ:
+ import shlex
+ args.extend(shlex.split(os.environ["cmakeFlags"]))
+
# According to the CMake manual, we should pass the arguments first,
# and put the directory as the last element. Otherwise, these flags
# may not be passed correctly.
--
2.46.0

56 changes: 46 additions & 10 deletions pkgs/development/python-modules/torch/default.nix
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,8 @@
removeReferencesTo,

# Build inputs
darwin,
numactl,
Accelerate,
CoreServices,
libobjc,

# Propagated build inputs
astunparse,
Expand All @@ -56,6 +54,17 @@
tritonSupport ? (!stdenv.hostPlatform.isDarwin),
triton,

# TODO: 1. callPackage needs to learn to distinguish between the task
# of "asking for an attribute from the parent scope" and
# the task of "exposing a formal parameter in .override".
# TODO: 2. We should probably abandon attributes such as `torchWithCuda` (etc.)
# as they routinely end up consuming the wrong arguments
# (dependencies without cuda support).
# Instead we should rely on overlays and nixpkgsFun.
# (@SomeoneSerge)
_tritonEffective ? if cudaSupport then triton-cuda else triton,
triton-cuda,

# Unit tests
hypothesis,
psutil,
Expand Down Expand Up @@ -95,6 +104,8 @@ let
;
inherit (cudaPackages) cudaFlags cudnn nccl;

triton = throw "python3Packages.torch: use _tritonEffective instead of triton to avoid divergence";

rocmPackages = rocmPackages_5;

setBool = v: if v then "1" else "0";
Expand Down Expand Up @@ -240,6 +251,7 @@ buildPythonPackage rec {
# Allow setting PYTHON_LIB_REL_PATH with an environment variable.
# https://github.com/pytorch/pytorch/pull/128419
./passthrough-python-lib-rel-path.patch
./0001-cmake.py-propagate-cmakeFlags-from-environment.patch
]
++ lib.optionals cudaSupport [ ./fix-cmake-cuda-toolkit.patch ]
++ lib.optionals (stdenv.hostPlatform.isDarwin && stdenv.hostPlatform.isx86_64) [
Expand All @@ -257,7 +269,18 @@ buildPythonPackage rec {
];

postPatch =
lib.optionalString rocmSupport ''
''
substituteInPlace cmake/public/cuda.cmake \
--replace-fail \
'message(FATAL_ERROR "Found two conflicting CUDA' \
'message(WARNING "Found two conflicting CUDA' \
--replace-warn \
"set(CUDAToolkit_ROOT" \
"# Upstream: set(CUDAToolkit_ROOT"
substituteInPlace third_party/gloo/cmake/Cuda.cmake \
--replace-warn "find_package(CUDAToolkit 7.0" "find_package(CUDAToolkit"
''
+ lib.optionalString rocmSupport ''
# https://github.com/facebookincubator/gloo/pull/297
substituteInPlace third_party/gloo/cmake/Hipify.cmake \
--replace "\''${HIPIFY_COMMAND}" "python \''${HIPIFY_COMMAND}"
Expand Down Expand Up @@ -351,6 +374,17 @@ buildPythonPackage rec {
# NB technical debt: building without NNPACK as workaround for missing `six`
USE_NNPACK = 0;

cmakeFlags =
[
# (lib.cmakeBool "CMAKE_FIND_DEBUG_MODE" true)
(lib.cmakeFeature "CUDAToolkit_VERSION" cudaPackages.cudaVersion)
]
++ lib.optionals cudaSupport [
# Unbreaks version discovery in enable_language(CUDA) when wrapping nvcc with ccache
# Cf. https://gitlab.kitware.com/cmake/cmake/-/issues/26363
(lib.cmakeFeature "CMAKE_CUDA_COMPILER_TOOLKIT_VERSION" cudaPackages.cudaVersion)
];

preBuild = ''
export MAX_JOBS=$NIX_BUILD_CORES
${python.pythonOnBuildForHost.interpreter} setup.py build --cmake-only
Expand Down Expand Up @@ -495,11 +529,11 @@ buildPythonPackage rec {
++ lib.optionals (cudaSupport || rocmSupport) [ effectiveMagma ]
++ lib.optionals stdenv.hostPlatform.isLinux [ numactl ]
++ lib.optionals stdenv.hostPlatform.isDarwin [
Accelerate
CoreServices
libobjc
darwin.apple_sdk.frameworks.Accelerate
darwin.apple_sdk.frameworks.CoreServices
darwin.libobjc
]
++ lib.optionals tritonSupport [ triton ]
++ lib.optionals tritonSupport [ _tritonEffective ]
++ lib.optionals MPISupport [ mpi ]
++ lib.optionals rocmSupport [ rocmtoolkit_joined ];

Expand Down Expand Up @@ -527,7 +561,7 @@ buildPythonPackage rec {

# torch/csrc requires `pybind11` at runtime
pybind11
] ++ lib.optionals tritonSupport [ triton ];
] ++ lib.optionals tritonSupport [ _tritonEffective ];

propagatedCxxBuildInputs =
[ ] ++ lib.optionals MPISupport [ mpi ] ++ lib.optionals rocmSupport [ rocmtoolkit_joined ];
Expand Down Expand Up @@ -662,7 +696,9 @@ buildPythonPackage rec {
thoughtpolice
tscholak
]; # tscholak esp. for darwin-related builds
platforms = with lib.platforms; linux ++ lib.optionals (!cudaSupport && !rocmSupport) darwin;
platforms =
lib.platforms.linux
++ lib.optionals (!cudaSupport && !rocmSupport) lib.platforms.darwin;
broken = builtins.any trivial.id (builtins.attrValues brokenConditions);
};
}
103 changes: 85 additions & 18 deletions pkgs/development/python-modules/triton/default.nix
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,16 @@
ninja,
pybind11,
python,
runCommand,
pytestCheckHook,
stdenv,
substituteAll,
setuptools,
torchWithRocm,
zlib,
cudaSupport ? config.cudaSupport,
rocmSupport ? config.rocmSupport,
rocmPackages,
triton,
}:

buildPythonPackage {
Expand Down Expand Up @@ -164,37 +166,102 @@ buildPythonPackage {
# CMake is run by setup.py instead
dontUseCmakeConfigure = true;

nativeCheckInputs = [
cmake
# Requires torch (circular dependency) and GPU access: pytestCheckHook
];
nativeCheckInputs = [ cmake ];
preCheck = ''
# build/temp* refers to build_ext.build_temp (looked up in the build logs)
(cd ./build/temp* ; ctest)
# For pytestCheckHook
cd test/unit
'';

pythonImportsCheck = [
"triton"
"triton.language"
];

# Ultimately, torch is our test suite:
passthru.gpuCheck = stdenv.mkDerivation {
pname = "triton-pytest";
inherit (triton) version src;

requiredSystemFeatures = [ "cuda" ];

nativeBuildInputs = [
(python.withPackages (ps: [
ps.scipy
ps.torchWithCuda
ps.triton-cuda
]))
];

dontBuild = true;
nativeCheckInputs = [ pytestCheckHook ];

doCheck = true;

preCheck = ''
cd python/test/unit
export HOME=$TMPDIR
'';
checkPhase = "pytestCheckPhase";

installPhase = "touch $out";
};

passthru.tests = {
# Ultimately, torch is our test suite:
inherit torchWithRocm;
# Implemented as an alternative to pythonImportsCheck, in case the circular
# dependency on torch occurs again and pythonImportsCheck has to be commented out.
import-triton =
runCommand "import-triton"
{ nativeBuildInputs = [ (python.withPackages (ps: [ ps.triton ])) ]; }

# Test as `nix run -f "<nixpkgs>" python3Packages.triton.tests.axpy-cuda`
# or, using `programs.nix-required-mounts`, as `nix build -f "<nixpkgs>" python3Packages.triton.tests.axpy-cuda.gpuCheck`
axpy-cuda =
cudaPackages.writeGpuTestPython
{
libraries = ps: [
ps.triton
ps.torch-no-triton
];
}
''
python << \EOF
# Adapted from Philippe Tillet https://triton-lang.org/main/getting-started/tutorials/01-vector-add.html
import triton
import triton.language
EOF
touch "$out"
import triton.language as tl
import torch
import os
@triton.jit
def axpy_kernel(n, a: tl.constexpr, x_ptr, y_ptr, out, BLOCK_SIZE: tl.constexpr):
pid = tl.program_id(axis=0)
block_start = pid * BLOCK_SIZE
offsets = block_start + tl.arange(0, BLOCK_SIZE)
mask = offsets < n
x = tl.load(x_ptr + offsets, mask=mask)
y = tl.load(y_ptr + offsets, mask=mask)
output = a * x + y
tl.store(out + offsets, output, mask=mask)
def axpy(a, x, y):
output = torch.empty_like(x)
assert x.is_cuda and y.is_cuda and output.is_cuda
n_elements = output.numel()
def grid(meta):
return (triton.cdiv(n_elements, meta['BLOCK_SIZE']), )
axpy_kernel[grid](n_elements, a, x, y, output, BLOCK_SIZE=1024)
return output
if __name__ == "__main__":
if os.environ.get("HOME", None) == "/homeless-shelter":
os.environ["HOME"] = os.environ.get("TMPDIR", "/tmp")
if "CC" not in os.environ:
os.environ["CC"] = "${lib.getExe' cudaPackages.backendStdenv.cc "cc"}"
torch.manual_seed(0)
size = 12345
x = torch.rand(size, device='cuda')
y = torch.rand(size, device='cuda')
output_torch = 3.14 * x + y
output_triton = axpy(3.14, x, y)
assert output_torch.sub(output_triton).abs().max().item() < 1e-6
print("Triton axpy: OK")
'';
};

Expand Down
8 changes: 4 additions & 4 deletions pkgs/top-level/python-packages.nix
Original file line number Diff line number Diff line change
Expand Up @@ -15717,10 +15717,10 @@ self: super: with self; {

toposort = callPackage ../development/python-modules/toposort { };

torch = callPackage ../development/python-modules/torch {
inherit (pkgs.darwin.apple_sdk.frameworks) Accelerate CoreServices;
inherit (pkgs.darwin) libobjc;
};
torch = callPackage ../development/python-modules/torch { };

# Required to test triton
torch-no-triton = self.torch.override { tritonSupport = false; };

torch-audiomentations = callPackage ../development/python-modules/torch-audiomentations { };

Expand Down

0 comments on commit 2aa951f

Please sign in to comment.