Remove CUDA dependence in favor of extension (#318)
* cuda extension

* fix
CarloLucibello authored Jul 16, 2023
1 parent f59ce44 commit 92d3163
Showing 14 changed files with 84 additions and 94 deletions.
14 changes: 11 additions & 3 deletions Project.toml
@@ -3,9 +3,14 @@ uuid = "cffab07f-9bc2-4db1-8861-388f63bf7694"
authors = ["Carlo Lucibello and contributors"]
version = "0.6.8"

[weakdeps]
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"

[extensions]
GraphNeuralNetworksCUDAExt = "CUDA"

[deps]
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
@@ -22,7 +27,6 @@ Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd"

[compat]
Adapt = "3"
@@ -46,12 +50,16 @@ julia = "1.9"
[extras]
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
ChainRulesTestUtils = "cdddcdb0-9152-4a09-a978-84456f9df70a"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
FiniteDifferences = "26cc04aa-876d-5657-8c51-4c34ba976000"
InlineStrings = "842dd82b-1e85-43dc-bf29-5d0ee9dffc48"
MLDatasets = "eb30cadb-4394-5ae3-aed4-317e484a6458"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd"

[targets]
test = ["Test", "Adapt", "DataFrames", "InlineStrings", "Zygote", "FiniteDifferences", "ChainRulesTestUtils", "MLDatasets"]
test = ["Test", "Adapt", "DataFrames", "InlineStrings", "Zygote",
"FiniteDifferences", "ChainRulesTestUtils", "MLDatasets",
"CUDA", "cuDNN"]
2 changes: 2 additions & 0 deletions ext/GraphNeuralNetworksCUDAExt/GNNGraphs/query.jl
@@ -0,0 +1,2 @@

GNNGraphs._rand_dense_vector(A::CUMAT_T) = CUDA.randn(size(A, 1))
2 changes: 2 additions & 0 deletions ext/GraphNeuralNetworksCUDAExt/GNNGraphs/transform.jl
@@ -0,0 +1,2 @@

GNNGraphs.dense_zeros_like(a::CUMAT_T, T::Type, sz = size(a)) = CUDA.zeros(T, sz)
8 changes: 8 additions & 0 deletions ext/GraphNeuralNetworksCUDAExt/GNNGraphs/utils.jl
@@ -0,0 +1,8 @@

GNNGraphs.iscuarray(x::AnyCuArray) = true


function sort_edge_index(u::AnyCuArray, v::AnyCuArray)
    #TODO proper cuda friendly implementation
    sort_edge_index(u |> Flux.cpu, v |> Flux.cpu) |> Flux.gpu
end
17 changes: 17 additions & 0 deletions ext/GraphNeuralNetworksCUDAExt/GraphNeuralNetworksCUDAExt.jl
@@ -0,0 +1,17 @@
module GraphNeuralNetworksCUDAExt

using CUDA
using Random, Statistics, LinearAlgebra
using GraphNeuralNetworks
using GraphNeuralNetworks.GNNGraphs
using GraphNeuralNetworks.GNNGraphs: COO_T, ADJMAT_T, SPARSE_T
import GraphNeuralNetworks: propagate

const CUMAT_T = Union{CUDA.AnyCuMatrix, CUDA.CUSPARSE.CuSparseMatrix}

include("GNNGraphs/query.jl")
include("GNNGraphs/transform.jl")
include("GNNGraphs/utils.jl")
include("msgpass.jl")

end #module
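
The extension only adds methods to functions owned by GraphNeuralNetworks, so user code is unchanged apart from having to add CUDA (and cuDNN, for Flux's GPU kernels) to its own environment. A hedged usage sketch, relying on the package's documented `rand_graph` and `GCNConv` API (the graph and layer sizes are arbitrary):

```julia
using GraphNeuralNetworks, Flux
using CUDA  # loading CUDA activates GraphNeuralNetworksCUDAExt

g = rand_graph(10, 40, ndata = rand(Float32, 3, 10))  # random graph, 3 features per node
g_gpu = g |> Flux.gpu                                  # feature arrays become CuArrays

layer = GCNConv(3 => 8) |> Flux.gpu
y = layer(g_gpu, g_gpu.ndata.x)                        # forward pass runs on the GPU
```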
37 changes: 37 additions & 0 deletions ext/GraphNeuralNetworksCUDAExt/msgpass.jl
@@ -0,0 +1,37 @@

###### PROPAGATE SPECIALIZATIONS ####################

## COPY_XJ

## avoid the fast path on gpu until we have better cuda support
function propagate(::typeof(copy_xj), g::GNNGraph{<:Union{COO_T, SPARSE_T}}, ::typeof(+),
                   xi, xj::AnyCuMatrix, e)
    propagate((xi, xj, e) -> copy_xj(xi, xj, e), g, +, xi, xj, e)
end

## E_MUL_XJ

## avoid the fast path on gpu until we have better cuda support
function propagate(::typeof(e_mul_xj), g::GNNGraph{<:Union{COO_T, SPARSE_T}}, ::typeof(+),
                   xi, xj::AnyCuMatrix, e::AbstractVector)
    propagate((xi, xj, e) -> e_mul_xj(xi, xj, e), g, +, xi, xj, e)
end

## W_MUL_XJ

## avoid the fast path on gpu until we have better cuda support
function propagate(::typeof(w_mul_xj), g::GNNGraph{<:Union{COO_T, SPARSE_T}}, ::typeof(+),
                   xi, xj::AnyCuMatrix, e::Nothing)
    propagate((xi, xj, e) -> w_mul_xj(xi, xj, e), g, +, xi, xj, e)
end

# function propagate(::typeof(copy_xj), g::GNNGraph, ::typeof(mean), xi, xj::AbstractMatrix, e)
# A = adjacency_matrix(g, weighted=false)
# D = compute_degree(A)
# return xj * A * D
# end

# # Zygote bug. Error with sparse matrix without nograd
# compute_degree(A) = Diagonal(1f0 ./ vec(sum(A; dims=2)))

# Flux.Zygote.@nograd compute_degree
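
These overloads replicate the ones removed from `src/msgpass.jl` below. Their only purpose is to steer GPU inputs away from the sparse `xj * A` fast path: wrapping `copy_xj` (and the other builtins) in an anonymous closure changes the function's type, so the `::typeof(copy_xj)` specializations no longer match and the generic gather/scatter implementation runs instead. A small illustrative snippet of that dispatch trick (not part of the commit):

```julia
using GraphNeuralNetworks: copy_xj

direct  = copy_xj                            # matches methods dispatching on ::typeof(copy_xj)
wrapped = (xi, xj, e) -> copy_xj(xi, xj, e)  # a fresh anonymous type, so they no longer match

typeof(direct) === typeof(copy_xj)   # true  -> sparse fast path would be used
typeof(wrapped) === typeof(copy_xj)  # false -> generic gather/scatter fallback is used
```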
3 changes: 1 addition & 2 deletions src/GNNGraphs/GNNGraphs.jl
@@ -2,7 +2,6 @@ module GNNGraphs

using SparseArrays
using Functors: @functor
using CUDA
import Graphs
using Graphs: AbstractGraph, outneighbors, inneighbors, adjacency_matrix, degree,
has_self_loops, is_directed
@@ -15,7 +14,7 @@ import KrylovKit
using ChainRulesCore
using LinearAlgebra, Random, Statistics
import MLUtils
using MLUtils: getobs, numobs
using MLUtils: getobs, numobs, ones_like, zeros_like
import Functors

include("chainrules.jl") # hacks for differentiability
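
The `using MLUtils: getobs, numobs, ones_like, zeros_like` change pairs with the removal of the hand-rolled `ones_like` methods further down in `src/GNNGraphs/utils.jl`: the package now relies on the MLUtils implementations instead of defining its own (including the CUDA-specific one). A quick sketch of that MLUtils API, to the best of my understanding (the element-type and size arguments are optional):

```julia
using MLUtils

x = rand(Float32, 2, 3)
ones_like(x)            # 2×3 Matrix{Float32} of ones, same array type as x
zeros_like(x, Float64)  # 2×3 Matrix{Float64} of zeros
```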
1 change: 0 additions & 1 deletion src/GNNGraphs/abstracttypes.jl
@@ -3,7 +3,6 @@ const COO_T = Tuple{T, T, V} where {T <: AbstractVector{<:Integer}, V}
const ADJLIST_T = AbstractVector{T} where {T <: AbstractVector{<:Integer}}
const ADJMAT_T = AbstractMatrix
const SPARSE_T = AbstractSparseMatrix # subset of ADJMAT_T
const CUMAT_T = Union{CUDA.AnyCuMatrix, CUDA.CUSPARSE.CuSparseMatrix}

const AVecI = AbstractVector{<:Integer}

57 changes: 0 additions & 57 deletions src/GNNGraphs/gatherscatter.jl
@@ -16,60 +16,3 @@ function _scatter(aggr,
    dstsize = (size(src)[1:(end - 1)]..., n)
    return NNlib.scatter(aggr, src, idx; dstsize)
end

## TO MOVE TO NNlib ######################################################

### Considers the src a zero dimensional object.
### Useful for implementing `StatsBase.counts`, `degree`, etc...
### function NNlib.scatter!(op, dst::AbstractArray, src::Number, idx::AbstractArray)
### for k in CartesianIndices(idx)
### # dst_v = NNlib._view(dst, idx[k])
### # dst_v .= (op).(dst_v, src)
### dst[idx[k]] .= (op).(dst[idx[k]], src)
### end
### dst
### end

# 10 times faster than the generic version above.
# All the speedup comes from not broadcasting `op`, i dunno why.
# function NNlib.scatter!(op, dst::AbstractVector, src::Number, idx::AbstractVector{<:Integer})
# for i in idx
# dst[i] = op(dst[i], src)
# end
# end

## NNlib._view(X, k) = view(X, k...)
## NNlib._view(X, k::Union{Integer, CartesianIndex}) = view(X, k)
#
## Considers src as a zero dimensional object to be scattered
## function NNlib.scatter(op,
## src::Tsrc,
## idx::AbstractArray{Tidx,Nidx};
## init = nothing, dstsize = nothing) where {Tsrc<:Number,Tidx,Nidx}
## dstsz = isnothing(dstsize) ? maximum_dims(idx) : dstsize
## dst = similar(src, Tsrc, dstsz)
## xinit = isnothing(init) ? scatter_empty(op, Tsrc) : init
## fill!(dst, xinit)
## scatter!(op, dst, src, idx)
## end

# function scatter_scalar_kernel!(op, dst, src, idx)
# index = threadIdx().x + (blockIdx().x - 1) * blockDim().x

# @inbounds if index <= length(idx)
# CUDA.@atomic dst[idx[index]...] = op(dst[idx[index]...], src)
# end
# return nothing
# end

# function NNlib.scatter!(op, dst::AnyCuArray, src::Number, idx::AnyCuArray)
# max_idx = length(idx)
# args = op, dst, src, idx

# kernel = @cuda launch=false scatter_scalar_kernel!(args...)
# config = launch_configuration(kernel.fun; max_threads=256)
# threads = min(max_idx, config.threads)
# blocks = cld(max_idx, threads)
# kernel(args...; threads=threads, blocks=blocks)
# return dst
# end
3 changes: 1 addition & 2 deletions src/GNNGraphs/query.jl
@@ -181,7 +181,7 @@ If `weighted=true`, the `A` will contain the edge weights if any, otherwise the
"""
function Graphs.adjacency_matrix(g::GNNGraph{<:COO_T}, T::DataType = eltype(g); dir = :out,
                                 weighted = true)
    if g.graph[1] isa CuVector
    if iscuarray(g.graph[1])
        # Revisit after
        # https://github.com/JuliaGPU/CUDA.jl/issues/1113
        A, n, m = to_dense(g.graph, T; num_nodes = g.num_nodes, weighted)
@@ -448,7 +448,6 @@ function _eigmax(A)
end

_rand_dense_vector(A::AbstractMatrix{T}) where {T} = randn(float(T), size(A, 1))
_rand_dense_vector(A::CUMAT_T) = CUDA.randn(size(A, 1))

# Eigenvalues for cuarray don't seem to be well supported.
# https://github.com/JuliaGPU/CUDA.jl/issues/154
3 changes: 1 addition & 2 deletions src/GNNGraphs/transform.jl
@@ -756,7 +756,7 @@ function negative_sample(g::GNNGraph;

    s, t = edge_index(g)
    n = g.num_nodes
    if s isa CuArray
    if iscuarray(s)
        # Convert to gpu since set operations and sampling are not supported by CUDA.jl
        device = Flux.gpu
        s, t = Flux.cpu(s), Flux.cpu(t)
@@ -852,7 +852,6 @@ end

dense_zeros_like(a::SparseMatrixCSC, T::Type, sz = size(a)) = zeros(T, sz)
dense_zeros_like(a::AbstractArray, T::Type, sz = size(a)) = fill!(similar(a, T, sz), 0)
dense_zeros_like(a::CUMAT_T, T::Type, sz = size(a)) = CUDA.zeros(T, sz)
dense_zeros_like(x, sz = size(x)) = dense_zeros_like(x, eltype(x), sz)

# """
13 changes: 3 additions & 10 deletions src/GNNGraphs/utils.jl
@@ -55,11 +55,6 @@ function sort_edge_index(u, v)
    return u[p], v[p]
end

function sort_edge_index(u::AnyCuArray, v::AnyCuArray)
#TODO proper cuda friendly implementation
sort_edge_index(u |> Flux.cpu, v |> Flux.cpu) |> Flux.gpu
end

cat_features(x1::Nothing, x2::Nothing) = nothing
cat_features(x1::AbstractArray, x2::AbstractArray) = cat(x1, x2, dims = ndims(x1))
function cat_features(x1::Union{Number, AbstractVector}, x2::Union{Number, AbstractVector})
@@ -193,11 +188,6 @@ function normalize_heterographdata(data::Dict; default_name::Symbol, ns::Dict, k
for (k, n) in ns]...)
end

ones_like(x::AbstractArray, T::Type, sz = size(x)) = fill!(similar(x, T, sz), 1)
ones_like(x::SparseMatrixCSC, T::Type, sz = size(x)) = ones(T, sz)
ones_like(x::CUMAT_T, T::Type, sz = size(x)) = CUDA.ones(T, sz)
ones_like(x, sz = size(x)) = ones_like(x, eltype(x), sz)

numnonzeros(a::AbstractSparseMatrix) = nnz(a)
numnonzeros(a::AbstractMatrix) = count(!=(0), a)

@@ -303,3 +293,6 @@ end

@non_differentiable normalize_graphdata(::NamedTuple{(), Tuple{}})
@non_differentiable normalize_graphdata(::Nothing)

iscuarray(x::AbstractArray) = false
@non_differentiable iscuarray(::Any)
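
`iscuarray` is the small trait that lets GNNGraphs ask "is this a GPU array?" without referencing any CUDA type: the base package only ships the `false` fallback defined here, and the extension adds the `AnyCuArray` method shown earlier in `ext/GraphNeuralNetworksCUDAExt/GNNGraphs/utils.jl`. Marking it `@non_differentiable` keeps Zygote from trying to differentiate through the check. A condensed sketch of the pattern, with a hypothetical call site (`maybe_on_cpu` is not part of the commit):

```julia
using Flux

# Base package: generic fallback, no CUDA dependence.
iscuarray(x::AbstractArray) = false

# The CUDA extension would add (only when CUDA is loaded):
# iscuarray(x::CUDA.AnyCuArray) = true

# A call site can then branch at runtime without mentioning CuArray:
function maybe_on_cpu(s::AbstractArray)
    iscuarray(s) ? Flux.cpu(s) : s   # fall back to CPU for unsupported operations
end
```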
1 change: 0 additions & 1 deletion src/GraphNeuralNetworks.jl
@@ -3,7 +3,6 @@ module GraphNeuralNetworks
using Statistics: mean
using LinearAlgebra, Random
using Base: tail
using CUDA
using Flux
using Flux: glorot_uniform, leakyrelu, GRUCell, @functor, batch
using MacroTools: @forward
17 changes: 1 addition & 16 deletions src/msgpass.jl
@@ -242,6 +242,7 @@ function w_mul_xj(xi, xj::AbstractArray{Tj, Nj}, w::AbstractVector) where {Tj, N
end

###### PROPAGATE SPECIALIZATIONS ####################
## See also the methods defined in the package extensions.

## COPY_XJ

@@ -250,12 +251,6 @@ function propagate(::typeof(copy_xj), g::GNNGraph, ::typeof(+), xi, xj::Abstract
    return xj * A
end

## avoid the fast path on gpu until we have better cuda support
function propagate(::typeof(copy_xj), g::GNNGraph{<:Union{COO_T, SPARSE_T}}, ::typeof(+),
                   xi, xj::AnyCuMatrix, e)
    propagate((xi, xj, e) -> copy_xj(xi, xj, e), g, +, xi, xj, e)
end

## E_MUL_XJ

# for weighted convolution
@@ -266,11 +261,6 @@ function propagate(::typeof(e_mul_xj), g::GNNGraph, ::typeof(+), xi, xj::Abstrac
    return xj * A
end

## avoid the fast path on gpu until we have better cuda support
function propagate(::typeof(e_mul_xj), g::GNNGraph{<:Union{COO_T, SPARSE_T}}, ::typeof(+),
                   xi, xj::AnyCuMatrix, e::AbstractVector)
    propagate((xi, xj, e) -> e_mul_xj(xi, xj, e), g, +, xi, xj, e)
end

## W_MUL_XJ

@@ -281,11 +271,6 @@ function propagate(::typeof(w_mul_xj), g::GNNGraph, ::typeof(+), xi, xj::Abstrac
    return xj * A
end

## avoid the fast path on gpu until we have better cuda support
function propagate(::typeof(w_mul_xj), g::GNNGraph{<:Union{COO_T, SPARSE_T}}, ::typeof(+),
                   xi, xj::AnyCuMatrix, e::Nothing)
    propagate((xi, xj, e) -> w_mul_xj(xi, xj, e), g, +, xi, xj, e)
end

# function propagate(::typeof(copy_xj), g::GNNGraph, ::typeof(mean), xi, xj::AbstractMatrix, e)
# A = adjacency_matrix(g, weighted=false)
