
Test with sparse CUDA arrays #66

Closed · wants to merge 4 commits
4 changes: 3 additions & 1 deletion perf/Project.toml
@@ -1,6 +1,8 @@
[deps]
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
GraphNeuralNetworks = "cffab07f-9bc2-4db1-8861-388f63bf7694"
Graphs = "86223c79-3864-5bf0-83f7-82e725a168b6"
JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819"
Graphs = "093fc24a-ae57-5d10-9952-331d41423f4d"
Unitful = "1986cc42-f94f-5a68-af5c-568840ba703d"
Binary file added perf/master_2021_11_01_arrakis.jld2
53 changes: 35 additions & 18 deletions perf/perf.jl
@@ -1,17 +1,30 @@
using Flux, GraphNeuralNetworks, Graphs, BenchmarkTools, CUDA
using DataFrames, Statistics, JLD2, SparseArrays
CUDA.device!(2)
using Unitful
# CUDA.device!(2)
CUDA.allowscalar(false)

BenchmarkTools.ratio(::Missing, x) = Inf
BenchmarkTools.ratio(x, ::Missing) = 0.0
BenchmarkTools.ratio(::Missing, ::Missing) = missing
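# Convert the median benchmark time (reported by BenchmarkTools in ns) to a
# Unitful quantity at a human-readable scale; `missing` results propagate through.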
function getres(res, str)
ismissing(res[str]) && return missing
t = median(res[str]).time
if t < 1e3
t * u"ns"
elseif t < 1e6
t / 1e3 * u"μs"
elseif t < 1e9
t / 1e6 * u"ms"
else
t / 1e9 * u"s"
end
end

function run_single_benchmark(N, c, D, CONV; gtype=:lg)
data = erdos_renyi(N, c / (N-1), seed=17)
X = randn(Float32, D, N)


data = erdos_renyi(N, c / (N-1), seed=17)
g = GNNGraph(data; ndata=X, graph_type=gtype)

# g = rand_graph(N, c*N; ndata=X, graph_type=gtype)
g_gpu = g |> gpu

m = CONV(D => D)
@@ -58,11 +71,12 @@ function run_benchmarks(;
c = 6,
D = 100,
layers = [GCNConv, GATConv],
gtypes = [:coo, :sparse, :dense],
gtypes = [:coo],
)

df = DataFrame(N=Int[], c=Float64[], layer=String[], gtype=Symbol[],
time_cpu=Any[], time_gpu=Any[]) |> allowmissing
df = DataFrame(N=Int[], c=Int[], layer=String[], gtype=Symbol[],
time_fwd_cpu=Any[], time_fwd_gpu=Any[],
time_grad_cpu=Any[], time_grad_gpu=Any[])

for gtype in gtypes
for N in Ns
@@ -73,31 +87,34 @@
N = N,
c = c,
gtype = gtype,
time_cpu = ismissing(res["CPU"]) ? missing : median(res["CPU"]),
time_gpu = ismissing(res["GPU"]) ? missing : median(res["GPU"]),
time_fwd_cpu = getres(res, "CPU_FWD"),
time_fwd_gpu = getres(res, "GPU_FWD"),
time_grad_cpu = getres(res, "CPU_GRAD"),
time_grad_gpu = getres(res, "GPU_GRAD"),
)
push!(df, row)
println(row)
end
end
end

df.gpu_to_cpu = ratio.(df.time_gpu, df.time_cpu)
df.grad_gpu_to_cpu = NoUnits.(df.time_grad_gpu ./ df.time_grad_cpu)
sort!(df, [:layer, :N, :c, :gtype])
return df
end

# df = run_benchmarks()
# for g in groupby(df, :layer); println(g, "\n"); end
df = run_benchmarks()
for g in groupby(df, :layer); println(g, "\n"); end

# @save "perf/perf_master_20210803_carlo.jld2" dfmaster=df
# @save "master_2021_11_01_arrakis.jld2" dfmaster=df
## or
# @save "perf/perf_pr.jld2" dfpr=df
# @save "pr.jld2" dfpr=df


function compare(dfpr, dfmaster; on=[:N, :c, :gtype, :layer])
df = outerjoin(dfpr, dfmaster; on=on, makeunique=true, renamecols = :_pr => :_master)
df.pr_to_master_cpu = ratio.(df.time_cpu_pr, df.time_cpu_master)
df.pr_to_master_gpu = ratio.(df.time_gpu_pr, df.time_gpu_master)
df.pr_to_master_cpu = df.time_cpu_pr ./ df.time_cpu_master
df.pr_to_master_gpu = df.time_gpu_pr ./ df.time_gpu_master
return df[:,[:N, :c, :gtype, :layer, :pr_to_master_cpu, :pr_to_master_gpu]]
end

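For context, a minimal sketch of the intended two-branch workflow around `run_benchmarks` and `compare` (file names follow the comments above and are illustrative):

```julia
using JLD2, DataFrames

# On master:
df = run_benchmarks()
@save "master_2021_11_01_arrakis.jld2" dfmaster=df

# On the PR branch:
df = run_benchmarks()
@save "pr_2021_11_01_arrakis.jld2" dfpr=df

# With both files available:
@load "master_2021_11_01_arrakis.jld2" dfmaster
@load "pr_2021_11_01_arrakis.jld2" dfpr
compare(dfpr, dfmaster)   # ratios below 1 mean the PR is faster
```

Note that `compare` still references the old `time_cpu`/`time_gpu` column names, so it would need the same `time_fwd_*`/`time_grad_*` split that `run_benchmarks` received.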
Binary file added perf/pr_2021_11_01_arrakis.jld2
1 change: 1 addition & 0 deletions src/GNNGraphs/GNNGraphs.jl
@@ -3,6 +3,7 @@ module GNNGraphs
using SparseArrays
using Functors: @functor
using CUDA
using CUDA.CUSPARSE
import Graphs
using Graphs: AbstractGraph, outneighbors, inneighbors, adjacency_matrix, degree
import Flux
30 changes: 29 additions & 1 deletion src/GNNGraphs/convert.jl
@@ -137,11 +137,39 @@ function to_sparse(coo::COO_T, T::DataType=Int; dir=:out, num_nodes=nothing)
s, t, eweight = coo
eweight = isnothing(eweight) ? fill!(similar(s, T), 1) : eweight
num_nodes = isnothing(num_nodes) ? max(maximum(s), maximum(t)) : num_nodes
A = sparse(s, t, eweight, num_nodes, num_nodes)
A = _sparse(s, t, eweight, num_nodes, num_nodes)
num_edges = length(s)
return A, num_nodes, num_edges
end

_sparse(s, t, eweight, n, m) = sparse(s, t, eweight, n, m)

function _sparse(I::CuVector, J::CuVector, V::CuVector, m, n)
spcoo = CuSparseMatrixCOO{Float32, Int32}(Int32.(I), Int32.(J), Float32.(V), (m, n))
return CuSparseMatrixCSR(spcoo)
end

# function _sparse(I::CuVector, J::CuVector, V::CuVector, m, n; fmt=:csr)
# # Tv = Int32
# spcoo = CuSparseMatrixCOO{Float32, Int32}(Int32.(I), Int32.(J), Float32.(V), (m, n))
# if fmt == :csc
# return CuSparseMatrixCSC(spcoo)
# elseif fmt == :csr
# return CuSparseMatrixCSR(spcoo)
# elseif fmt == :coo
# return spcoo
# else
# error("Format :$fmt not available, use :csc, :csr, or :coo.")
# end
# end


# Workaround for https://github.com/JuliaGPU/CUDA.jl/issues/1113#issuecomment-955759875
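# CUSPARSE supports sparse*dense but not dense*sparse products, so compute A*B as (B'A')'.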
function Base.:*(A::CuMatrix, B::CuSparseMatrixCSR)
@assert size(A, 2) == size(B, 1)
return CuMatrix((B' * A')')
end


@non_differentiable to_coo(x...)
@non_differentiable to_dense(x...)
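A quick sanity-check sketch of the new GPU path, calling the internal `_sparse` directly on a hypothetical 3-node cycle (requires a functional CUDA device):

```julia
using CUDA, CUDA.CUSPARSE

s, t = cu([1, 2, 3]), cu([2, 3, 1])   # edges of a directed 3-cycle
w = CUDA.ones(Float32, 3)             # unit edge weights
A = _sparse(s, t, w, 3, 3)            # CuSparseMatrixCSR{Float32, Int32}

X = CUDA.rand(Float32, 4, 3)
Y = X * A                             # dense * sparse, via the workaround above
size(Y)                               # (4, 3)
```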
6 changes: 3 additions & 3 deletions src/GNNGraphs/generate.jl
@@ -1,5 +1,5 @@
"""
rand_graph(n, m; bidirected=true, kws...)
rand_graph(n, m; bidirected=true, seed=-1, kws...)

Generate a random (Erdős-Rényi) `GNNGraph` with `n` nodes
and `m` edges.
@@ -43,10 +43,10 @@ julia> edge_index(g)

```
"""
function rand_graph(n::Integer, m::Integer; bidirected=true, kws...)
function rand_graph(n::Integer, m::Integer; bidirected=true, seed=-1, kws...)
if bidirected
@assert iseven(m) "Need even number of edges for bidirected graphs, given m=$m."
end
m2 = bidirected ? m÷2 : m
return GNNGraph(Graphs.erdos_renyi(n, m2, is_directed=!bidirected); kws...)
return GNNGraph(Graphs.erdos_renyi(n, m2; is_directed=!bidirected, seed); kws...)
end
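With the threaded-through `seed` keyword, graph generation becomes reproducible; a small usage sketch:

```julia
g1 = rand_graph(10, 30; bidirected=true, seed=17)
g2 = rand_graph(10, 30; bidirected=true, seed=17)
edge_index(g1) == edge_index(g2)   # true: same seed, same edges

g1.num_edges   # 30, since each undirected edge is stored in both directions
```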
4 changes: 2 additions & 2 deletions src/GNNGraphs/gnngraph.jl
@@ -192,8 +192,8 @@ function GNNGraph(g::GNNGraph; ndata=g.ndata, edata=g.edata, gdata=g.gdata, grap
ndata, edata, gdata)
end

function Base.show(io::IO, g::GNNGraph)
println(io, "GNNGraph:
function Base.show(io::IO, g::GNNGraph{T}) where T
println(io, "GNNGraph{$T}:
num_nodes = $(g.num_nodes)
num_edges = $(g.num_edges)
num_graphs = $(g.num_graphs)")
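The type parameter now appears in the printed header, making the storage backend visible at a glance; illustratively, a COO-backed graph prints along these lines (the exact parameter depends on the storage):

```julia
julia> rand_graph(10, 30)
GNNGraph{Tuple{Vector{Int64}, Vector{Int64}, Nothing}}:
    num_nodes = 10
    num_edges = 30
    num_graphs = 1
```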
7 changes: 1 addition & 6 deletions src/GNNGraphs/query.jl
@@ -74,12 +74,7 @@ function adjacency_list(g::GNNGraph; dir=:out)
end

function Graphs.adjacency_matrix(g::GNNGraph{<:COO_T}, T::DataType=Int; dir=:out)
if g.graph[1] isa CuVector
# TODO revisit after https://github.com/JuliaGPU/CUDA.jl/pull/1152
A, n, m = to_dense(g.graph, T, num_nodes=g.num_nodes)
else
A, n, m = to_sparse(g.graph, T, num_nodes=g.num_nodes)
end
A, n, m = to_sparse(g.graph, T, num_nodes=g.num_nodes)
@assert size(A) == (n, n)
return dir == :out ? A : A'
end
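Since the CUSPARSE constructors and the dense*sparse workaround in `convert.jl` now cover the GPU case, the dense fallback for CuVector-backed graphs is no longer needed; a hedged sketch of the resulting behavior (requires a CUDA device):

```julia
using Flux, GraphNeuralNetworks, CUDA

g = rand_graph(6, 10) |> gpu
A = adjacency_matrix(g)   # now a CuSparseMatrixCSR rather than the previous dense CuMatrix
```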
27 changes: 24 additions & 3 deletions src/msgpass.jl
@@ -152,11 +152,32 @@ function propagate(::typeof(copyxj), g::GNNGraph, ::typeof(+), xi, xj::AbstractM
return xj * A
end

## avoid the fast path on gpu until we have better cuda support
function propagate(::typeof(copyxj), g::GNNGraph{<:Union{COO_T,SPARSE_T}}, ::typeof(+), xi, xj::AnyCuMatrix, e)
propagate((xi,xj,e)->copyxj(xi,xj,e), g, +, xi, xj, e)
# Have to define a custom rule since CUDA.jl has trouble with some sparse-dense multiplications
function ChainRulesCore.rrule(::typeof(propagate), ::typeof(copyxj), g::GNNGraph,
::typeof(+), xi, xj::AbstractMatrix, e)
A = adjacency_matrix(g)
y = xj * A
function propagate_pullback(ȳ)
Ȳ = unthunk(ȳ)
dxj = Ȳ * A'
return NoTangent(), NoTangent(), NoTangent(), NoTangent(), NoTangent(), dxj, NoTangent()
end

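# GPU method: Ȳ * A' would be dense*sparse, which CUSPARSE lacks; compute (A * Ȳ')' instead.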
function propagate_pullback(ȳ::CuMatrix)
Ȳ = unthunk(ȳ)
dxj = CuArray((A * Ȳ')')
return NoTangent(), NoTangent(), NoTangent(), NoTangent(), NoTangent(), dxj, NoTangent()
end

y, propagate_pullback
end


# ## avoid the fast path on gpu until we have better cuda support
# function propagate(::typeof(copyxj), g::GNNGraph{<:Union{COO_T,SPARSE_T}}, ::typeof(+), xi, xj::AnyCuMatrix, e)
# propagate((xi,xj,e) -> copyxj(xi,xj,e), g, +, xi, xj, e)
# end

# function propagate(::typeof(copyxj), g::GNNGraph, ::typeof(mean), xi, xj::AbstractMatrix, e)
# A = adjacency_matrix(g)
# D = compute_degree(A)
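A minimal sketch exercising the custom rule (the positional `propagate` call below matches the `rrule` signature above, assuming `copyxj` is in scope; gradients flow only to `xj`):

```julia
using Flux, GraphNeuralNetworks

g = rand_graph(8, 20)
x = randn(Float32, 5, 8)

# Differentiating the fused path y = xj * A invokes propagate_pullback,
# which returns dxj = Ȳ * A' (in its CUSPARSE-friendly form on the GPU).
grad = gradient(x -> sum(propagate(copyxj, g, +, nothing, x, nothing)), x)[1]
size(grad)   # (5, 8)
```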