From 071392f3624578f01525e2e427a1a316f5a7ece5 Mon Sep 17 00:00:00 2001 From: machineko Date: Thu, 11 Jul 2024 22:32:56 +0200 Subject: [PATCH] remove __half (it was causing weird memory deallocation errors) remove Pythonkit and bring swift-testing with few more fixes --- Package.swift | 6 +- .../SwiftCUBLAS/CUBLASUtils/CUBlasUtils.swift | 42 ++-- Sources/SwiftCUBLAS/SwiftCUBLAS.swift | 1 - Sources/cxxCUBLAS/include/cublas_head.hpp | 1 - Tests/SwiftCUBLASTests/SwiftCUBLASTests.swift | 216 ++++++++++-------- Tests/SwiftCUBLASTests/TestHelpers.swift | 29 +++ 6 files changed, 162 insertions(+), 133 deletions(-) create mode 100644 Tests/SwiftCUBLASTests/TestHelpers.swift diff --git a/Package.swift b/Package.swift index a39a29e..63c9810 100644 --- a/Package.swift +++ b/Package.swift @@ -18,9 +18,7 @@ let package = Package( dependencies: [ .package(url: "https://github.com/machineko/SwiftCU", branch: "main"), - .package(url: "https://github.com/pvieito/PythonKit.git", branch: "master"), - .package(url: "https://github.com/apple/swift-docc-plugin", from: "1.3.0"), - + .package(url: "https://github.com/apple/swift-testing.git", from: "0.10.0"), ], targets: [ .target( @@ -48,7 +46,7 @@ let package = Package( dependencies: [ "SwiftCU", "cxxCUBLAS", "SwiftCUBLAS", - .product(name: "PythonKit", package: "PythonKit") + .product(name: "Testing", package: "swift-testing"), ], swiftSettings: [ .interoperabilityMode(.Cxx), diff --git a/Sources/SwiftCUBLAS/CUBLASUtils/CUBlasUtils.swift b/Sources/SwiftCUBLAS/CUBLASUtils/CUBlasUtils.swift index a84cdf9..4c167e9 100644 --- a/Sources/SwiftCUBLAS/CUBLASUtils/CUBlasUtils.swift +++ b/Sources/SwiftCUBLAS/CUBLASUtils/CUBlasUtils.swift @@ -35,10 +35,10 @@ extension CUBLASParamsMixed { return CUDA_R_8I case is Int32.Type: return CUDA_R_32I - case is __half.Type: + case is Float16.Type: return CUDA_R_16F default: - fatalError("Unsupported CUBLAS data type") + fatalError("\(inputType.self) not supported") } } @@ -49,14 +49,14 @@ extension CUBLASParamsMixed { return CUDA_R_32F case is Double.Type: return CUDA_R_64F - case is UInt8.Type: - return CUDA_R_8U + case is Int8.Type: + return CUDA_R_8I case is Int32.Type: return CUDA_R_32I - case is __half.Type: + case is Float16.Type: return CUDA_R_16F default: - fatalError("Unsupported CUBLAS data type") + fatalError("\(inputType.self) not supported") } } } @@ -223,24 +223,11 @@ extension CUBLASHandle { let status = cublasSgemm_v2( self.handle, transposeA.ascublas, transposeB.ascublas, params.m, params.n, params.k, ¶ms.alpha, params.A, params.lda, params.B, params.ldb, ¶ms.beta, params.C, params.ldc - ) - return status.asSwift - } - - /// Performs half-precision general matrix multiplication (HGEMM) using CUBLAS. - /// - Parameters: - /// - transposeA: Specifies whether to transpose matrix A. - /// - transposeB: Specifies whether to transpose matrix B. - /// - params: The parameters for the HGEMM operation. - /// - Returns: The status of the HGEMM operation. - public func hgemm( - transposeA: cublasOperation = .cublas_op_n, transposeB: cublasOperation = .cublas_op_n, params: inout CUBLASParams<__half> - ) -> cublasStatus { - let status = cublasHgemm( - self.handle, transposeA.ascublas, transposeB.ascublas, params.m, params.n, - params.k, ¶ms.alpha, params.A, params.lda, params.B, params.ldb, ¶ms.beta, params.C, params.ldc - ) - return status.asSwift + ).asSwift + #if safetyCheck + status.safetyCheckCondition(message: "Can't run sgemm cublasSgemm_v2 function \(status)") + #endif + return status } /// Performs mixed-precision general matrix multiplication (GEMM) using CUBLAS. @@ -269,7 +256,10 @@ extension CUBLASHandle { params.C, params.outputCUDAType, params.ldc, computeType.ascublas, cublasGemmAlgo.ascublas - ) - return status.asSwift + ).asSwift + #if safetyCheck + status.safetyCheckCondition(message: "Can't run cublasGemmEx function \(status)") + #endif + return status } } diff --git a/Sources/SwiftCUBLAS/SwiftCUBLAS.swift b/Sources/SwiftCUBLAS/SwiftCUBLAS.swift index a1f6097..157b888 100644 --- a/Sources/SwiftCUBLAS/SwiftCUBLAS.swift +++ b/Sources/SwiftCUBLAS/SwiftCUBLAS.swift @@ -8,7 +8,6 @@ extension Float: CUBLASDataType {} extension Double: CUBLASDataType {} extension Int8: CUBLASDataType {} extension Int32: CUBLASDataType {} -extension __half: CUBLASDataType {} /// A structure that manages a CUBLAS handle. public struct CUBLASHandle: ~Copyable { diff --git a/Sources/cxxCUBLAS/include/cublas_head.hpp b/Sources/cxxCUBLAS/include/cublas_head.hpp index e6b219b..e19ee7d 100644 --- a/Sources/cxxCUBLAS/include/cublas_head.hpp +++ b/Sources/cxxCUBLAS/include/cublas_head.hpp @@ -1,2 +1 @@ #include -#include diff --git a/Tests/SwiftCUBLASTests/SwiftCUBLASTests.swift b/Tests/SwiftCUBLASTests/SwiftCUBLASTests.swift index 6bfe700..21d0f88 100644 --- a/Tests/SwiftCUBLASTests/SwiftCUBLASTests.swift +++ b/Tests/SwiftCUBLASTests/SwiftCUBLASTests.swift @@ -1,15 +1,15 @@ -import PythonKit import SwiftCU -import XCTest +import Testing import cxxCU import cxxCUBLAS - @testable import SwiftCUBLAS -let npy = Python.import("numpy") +@Suite("Basic GEMM tests") +struct SwiftCUBLASGEMMTests { -final class SwiftCUBLASTests: XCTestCase { - func testSimpleMatmulRowMajor() throws { + @Test func testSimpleMatmulRowMajor() async throws { + let cuStatus = CUDevice(index: 0).setDevice() + #expect(cuStatus) let m = 2 let n = 2 let k = 4 @@ -51,23 +51,24 @@ final class SwiftCUBLASTests: XCTestCase { ) let status = handle.sgemm_v2(params: ¶ms) - XCTAssert(status.isSuccessful) + #expect(status.isSuccessful) C.withUnsafeMutableBytes { rawBufferPointer in var pointerAddress = rawBufferPointer.baseAddress let outStatus = pointerAddress.cudaMemoryCopy( fromMutableRawPointer: cPointer, numberOfBytes: m * n * f32Size, copyKind: .cudaMemcpyDeviceToHost) - XCTAssert(outStatus.isSuccessful) + #expect(outStatus.isSuccessful) } cudaDeviceSynchronize() - let npyMatmul: [Float32] = Array(npy.matmul(npy.array(A).reshape([2, 4]), npy.array(B).reshape([4, 2])).flatten())! - XCTAssert((0...stride + let f32Size = MemoryLayout.stride + _ = aPointer.cudaMemoryAllocate(m * k * f16Size) _ = bPointer.cudaMemoryAllocate(k * n * f16Size) - _ = cPointer.cudaMemoryAllocate(m * n * f16Size) + _ = cPointer.cudaMemoryAllocate(m * n * f32Size) _ = aPointer.cudaMemoryCopy(fromRawPointer: &A, numberOfBytes: A.count * f16Size, copyKind: .cudaMemcpyHostToDevice) _ = bPointer.cudaMemoryCopy(fromRawPointer: &B, numberOfBytes: B.count * f16Size, copyKind: .cudaMemcpyHostToDevice) let handle = CUBLASHandle() - var params = CUBLASParams<__half>( - fromRowMajor: aPointer!.assumingMemoryBound(to: __half.self), B: bPointer!.assumingMemoryBound(to: __half.self), - C: cPointer!.assumingMemoryBound(to: __half.self), m: Int32(m), n: Int32(n), k: Int32(k), alpha: __half(1.0), beta: __half(0.0) + // Input types => __half, Output type => F32, compute type => F32 + var params = CUBLASParamsMixed( + fromRowMajor: aPointer!.assumingMemoryBound(to: Float16.self), B: bPointer!.assumingMemoryBound(to: Float16.self), + C: cPointer!.assumingMemoryBound(to: Float32.self), m: Int32(m), n: Int32(n), k: Int32(k), alpha: 1.0, beta: 0.0 ) - let status = handle.hgemm(params: ¶ms) - XCTAssert(status.isSuccessful) + let status = handle.gemmEx(params: ¶ms, computeType: .cublas_compute_32f) + + #expect(status.isSuccessful) C.withUnsafeMutableBytes { rawBufferPointer in var pointerAddress = rawBufferPointer.baseAddress let outStatus = pointerAddress.cudaMemoryCopy( - fromMutableRawPointer: cPointer, numberOfBytes: m * n * f16Size, copyKind: .cudaMemcpyDeviceToHost) - XCTAssert(outStatus.isSuccessful) + fromMutableRawPointer: cPointer, numberOfBytes: m * n * f32Size, copyKind: .cudaMemcpyDeviceToHost) + #expect(outStatus.isSuccessful) } cudaDeviceSynchronize() - let npyMatmul = npy.matmul(npy.array(A.map { Float32($0) }).reshape([2, 4]), npy.array(B.map { Float32($0) }).reshape([4, 2])) - let cNpyArray = npy.array(C.map { Float32($0) }).reshape([2, 2]) - XCTAssert(Bool(npy.allclose(npyMatmul, cNpyArray))!) + + let cExpected = matrixMultiply(m, n, k, A, B, isRowMajor: true) + #expect(cExpected.map{Float32($0)} ~= C) } - func testSimpleMatmulColumnMajorHalf() throws { + @Test func testSimpleMatmulRowMajorI8F32() async throws { + let cuStatus = CUDevice(index: 0).setDevice() + #expect(cuStatus) let m = 2 let n = 2 let k = 4 - var A: [Float16] = [ - 1.0, 5.0, - 2.0, 6.0, - 3.0, 7.0, - 4.0, 8.0, + var A: [Int8] = [ + 1, 2, 3, 4, + 5, 6, 7, 8, ] - var B: [Float16] = [ - 8.0, 6.0, 4.0, 2.0, - 7.0, 5.0, 3.0, 1.0, + var B: [Int8] = [ + 8, 7, + 6, 5, + 4, 3, + 2, 1, ] - var C: [Float16] = [Float16](repeating: 0.0, count: m * n) + var C: [Float32] = [Float32](repeating: 0.0, count: m * n) var aPointer: UnsafeMutableRawPointer? var bPointer: UnsafeMutableRawPointer? @@ -200,40 +213,39 @@ final class SwiftCUBLASTests: XCTestCase { _ = bPointer.cudaAndHostDeallocate() _ = cPointer.cudaAndHostDeallocate() } + let i8Size = MemoryLayout.stride let f32Size = MemoryLayout.stride - _ = aPointer.cudaMemoryAllocate(m * k * f32Size) - _ = bPointer.cudaMemoryAllocate(k * n * f32Size) + + _ = aPointer.cudaMemoryAllocate(m * k * i8Size) + _ = bPointer.cudaMemoryAllocate(k * n * i8Size) _ = cPointer.cudaMemoryAllocate(m * n * f32Size) - _ = aPointer.cudaMemoryCopy(fromRawPointer: &A, numberOfBytes: A.count * f32Size, copyKind: .cudaMemcpyHostToDevice) - _ = bPointer.cudaMemoryCopy(fromRawPointer: &B, numberOfBytes: B.count * f32Size, copyKind: .cudaMemcpyHostToDevice) + _ = aPointer.cudaMemoryCopy(fromRawPointer: &A, numberOfBytes: A.count * i8Size, copyKind: .cudaMemcpyHostToDevice) + _ = bPointer.cudaMemoryCopy(fromRawPointer: &B, numberOfBytes: B.count * i8Size, copyKind: .cudaMemcpyHostToDevice) let handle = CUBLASHandle() - var params = CUBLASParams<__half>( - fromColumnMajor: aPointer!.assumingMemoryBound(to: __half.self), B: bPointer!.assumingMemoryBound(to: __half.self), - C: cPointer!.assumingMemoryBound(to: __half.self), m: Int32(m), n: Int32(n), k: Int32(k), alpha: __half(1.0), beta: __half(0.0) + // Input types => Int8, Output type => F32, compute type => F32 + var params = CUBLASParamsMixed( + fromRowMajor: aPointer!.assumingMemoryBound(to: Int8.self), B: bPointer!.assumingMemoryBound(to: Int8.self), + C: cPointer!.assumingMemoryBound(to: Float32.self), m: Int32(m), n: Int32(n), k: Int32(k), alpha: 1.0, beta: 0.0 ) + let status = handle.gemmEx(params: ¶ms, computeType: .cublas_compute_32f) - let status = handle.hgemm(params: ¶ms) - XCTAssert(status.isSuccessful) + #expect(status.isSuccessful) C.withUnsafeMutableBytes { rawBufferPointer in var pointerAddress = rawBufferPointer.baseAddress let outStatus = pointerAddress.cudaMemoryCopy( fromMutableRawPointer: cPointer, numberOfBytes: m * n * f32Size, copyKind: .cudaMemcpyDeviceToHost) - XCTAssert(outStatus.isSuccessful) + #expect(outStatus.isSuccessful) } cudaDeviceSynchronize() - let npyMatmul = npy.matmul( - npy.array(A.map { Float32($0) }).reshape([2, 4], order: "F"), npy.array(B.map { Float32($0) }).reshape([4, 2], order: "F")) - let cNpyArray = npy.array(C.map { Float32($0) }).reshape([2, 2], order: "F") - XCTAssert(Bool(npy.allclose(npyMatmul, cNpyArray))!) + let cExpected = matrixMultiply(m, n, k, A, B, isRowMajor: true) + #expect(cExpected.map{Float32($0)} ~= C) } -} - -final class SwiftCUBLASGenericTests: XCTestCase { - - func testSimpleMatmulRowMajorHalfF32() throws { + @Test func testSimpleMatmulRowMajorHalf() async throws { + let cuStatus = CUDevice(index: 0).setDevice() + #expect(cuStatus) let m = 2 let n = 2 let k = 4 @@ -250,7 +262,7 @@ final class SwiftCUBLASGenericTests: XCTestCase { 2.0, 1.0, ] - var C: [Float32] = [Float32](repeating: 0.0, count: m * n) + var C: [Float16] = [Float16](repeating: 0.0, count: m * n) var aPointer: UnsafeMutableRawPointer? var bPointer: UnsafeMutableRawPointer? @@ -261,54 +273,56 @@ final class SwiftCUBLASGenericTests: XCTestCase { _ = cPointer.cudaAndHostDeallocate() } let f16Size = MemoryLayout.stride - let f32Size = MemoryLayout.stride _ = aPointer.cudaMemoryAllocate(m * k * f16Size) _ = bPointer.cudaMemoryAllocate(k * n * f16Size) - _ = cPointer.cudaMemoryAllocate(m * n * f32Size) + _ = cPointer.cudaMemoryAllocate(m * n * f16Size) _ = aPointer.cudaMemoryCopy(fromRawPointer: &A, numberOfBytes: A.count * f16Size, copyKind: .cudaMemcpyHostToDevice) _ = bPointer.cudaMemoryCopy(fromRawPointer: &B, numberOfBytes: B.count * f16Size, copyKind: .cudaMemcpyHostToDevice) let handle = CUBLASHandle() - // Input types => __half, Output type => F32, compute type => F32 - var params = CUBLASParamsMixed<__half, Float32, Float32>( - fromRowMajor: aPointer!.assumingMemoryBound(to: __half.self), B: bPointer!.assumingMemoryBound(to: __half.self), - C: cPointer!.assumingMemoryBound(to: Float32.self), m: Int32(m), n: Int32(n), k: Int32(k), alpha: 1.0, beta: 0.0 + + var params = CUBLASParamsMixed( + fromRowMajor: aPointer!.assumingMemoryBound(to: Float16.self), B: bPointer!.assumingMemoryBound(to: Float16.self), + C: cPointer!.assumingMemoryBound(to: Float16.self), m: Int32(m), n: Int32(n), k: Int32(k), alpha: 1.0, beta: 0.0 ) - let status = handle.gemmEx(params: ¶ms, computeType: .cublas_compute_32f_fast_16bf) - XCTAssert(status.isSuccessful) + let status = handle.gemmEx(params: ¶ms, computeType: .cublas_compute_16f) + + #expect(status.isSuccessful) C.withUnsafeMutableBytes { rawBufferPointer in var pointerAddress = rawBufferPointer.baseAddress let outStatus = pointerAddress.cudaMemoryCopy( - fromMutableRawPointer: cPointer, numberOfBytes: m * n * f32Size, copyKind: .cudaMemcpyDeviceToHost) - XCTAssert(outStatus.isSuccessful) + fromMutableRawPointer: cPointer, numberOfBytes: m * n * f16Size, copyKind: .cudaMemcpyDeviceToHost) + #expect(outStatus.isSuccessful) } cudaDeviceSynchronize() - let npyMatmul = npy.matmul(npy.array(A.map { Float32($0) }).reshape([2, 4]), npy.array(B.map { Float32($0) }).reshape([4, 2])) - let cNpyArray = npy.array(C).reshape([2, 2]) - XCTAssert(Bool(npy.allclose(npyMatmul, cNpyArray))!) + + let cExpected = matrixMultiply(m, n, k, A, B, isRowMajor: true) + #expect(cExpected.map{Float16($0)} ~= C) } - func testSimpleMatmulRowMajorI8F32() throws { + @Test func testSimpleMatmulColumnMajorHalf() async throws { + let cuStatus = CUDevice(index: 0).setDevice() + #expect(cuStatus) let m = 2 let n = 2 let k = 4 - var A: [Int8] = [ - 1, 2, 3, 4, - 5, 6, 7, 8, + var A: [Float16] = [ + 1.0, 5.0, + 2.0, 6.0, + 3.0, 7.0, + 4.0, 8.0, ] - var B: [Int8] = [ - 8, 7, - 6, 5, - 4, 3, - 2, 1, + var B: [Float16] = [ + 8.0, 6.0, 4.0, 2.0, + 7.0, 5.0, 3.0, 1.0, ] - var C: [Float32] = [Float32](repeating: 0.0, count: m * n) + var C: [Float16] = [Float16](repeating: 0.0, count: m * n) var aPointer: UnsafeMutableRawPointer? var bPointer: UnsafeMutableRawPointer? @@ -318,34 +332,34 @@ final class SwiftCUBLASGenericTests: XCTestCase { _ = bPointer.cudaAndHostDeallocate() _ = cPointer.cudaAndHostDeallocate() } - let i8Size = MemoryLayout.stride - let f32Size = MemoryLayout.stride + let f16Size = MemoryLayout.stride - _ = aPointer.cudaMemoryAllocate(m * k * i8Size) - _ = bPointer.cudaMemoryAllocate(k * n * i8Size) - _ = cPointer.cudaMemoryAllocate(m * n * f32Size) + _ = aPointer.cudaMemoryAllocate(m * k * f16Size) + _ = bPointer.cudaMemoryAllocate(k * n * f16Size) + _ = cPointer.cudaMemoryAllocate(m * n * f16Size) - _ = aPointer.cudaMemoryCopy(fromRawPointer: &A, numberOfBytes: A.count * i8Size, copyKind: .cudaMemcpyHostToDevice) - _ = bPointer.cudaMemoryCopy(fromRawPointer: &B, numberOfBytes: B.count * i8Size, copyKind: .cudaMemcpyHostToDevice) + _ = aPointer.cudaMemoryCopy(fromRawPointer: &A, numberOfBytes: A.count * f16Size, copyKind: .cudaMemcpyHostToDevice) + _ = bPointer.cudaMemoryCopy(fromRawPointer: &B, numberOfBytes: B.count * f16Size, copyKind: .cudaMemcpyHostToDevice) let handle = CUBLASHandle() - // Input types => Int8, Output type => F32, compute type => F32 - var params = CUBLASParamsMixed( - fromRowMajor: aPointer!.assumingMemoryBound(to: Int8.self), B: bPointer!.assumingMemoryBound(to: Int8.self), - C: cPointer!.assumingMemoryBound(to: Float32.self), m: Int32(m), n: Int32(n), k: Int32(k), alpha: 1.0, beta: 0.0 + + var params = CUBLASParamsMixed( + fromColumnMajor: aPointer!.assumingMemoryBound(to: Float16.self), B: bPointer!.assumingMemoryBound(to: Float16.self), + C: cPointer!.assumingMemoryBound(to: Float16.self), m: Int32(m), n: Int32(n), k: Int32(k), alpha: 1.0, beta: 0.0 ) - let status = handle.gemmEx(params: ¶ms, computeType: .cublas_compute_32f) - XCTAssert(status.isSuccessful) + let status = handle.gemmEx(params: ¶ms, computeType: .cublas_compute_16f) + + #expect(status.isSuccessful) C.withUnsafeMutableBytes { rawBufferPointer in var pointerAddress = rawBufferPointer.baseAddress let outStatus = pointerAddress.cudaMemoryCopy( - fromMutableRawPointer: cPointer, numberOfBytes: m * n * f32Size, copyKind: .cudaMemcpyDeviceToHost) - XCTAssert(outStatus.isSuccessful) + fromMutableRawPointer: cPointer, numberOfBytes: m * n * f16Size, copyKind: .cudaMemcpyDeviceToHost) + #expect(outStatus.isSuccessful) } cudaDeviceSynchronize() - let npyMatmul = npy.matmul(npy.array(A).reshape([2, 4]), npy.array(B).reshape([4, 2])) - let cNpyArray = npy.array(C).reshape([2, 2]) - XCTAssert(Bool(npy.allclose(npyMatmul, cNpyArray))!) + + let cExpected = matrixMultiply(m, n, k, A, B, isRowMajor: false) + #expect(cExpected.map{Float16($0)} ~= C) } } diff --git a/Tests/SwiftCUBLASTests/TestHelpers.swift b/Tests/SwiftCUBLASTests/TestHelpers.swift new file mode 100644 index 0000000..5e8b05e --- /dev/null +++ b/Tests/SwiftCUBLASTests/TestHelpers.swift @@ -0,0 +1,29 @@ +import SwiftCUBLAS + +func getIndex(row: Int, col: Int, numRows: Int, numCols: Int, isRowMajor: Bool) -> Int { + return isRowMajor ? row * numCols + col : col * numRows + row +} + +func matrixMultiply( + _ m: Int, + _ n: Int, + _ k: Int, + _ A: [T], + _ B: [T], + isRowMajor: Bool +) -> [T] { + var C: [T] = [T](repeating: 0, count: m * n) + for i in 0..