[WIP] Initial AMDGPU support #26

Draft · wants to merge 5 commits into main
29 changes: 22 additions & 7 deletions .gitlab-ci.yml
@@ -2,32 +2,45 @@ stages:
- test
- documentation
variables:
SCHEDULER_PARAMETERS: "-N 1 -n 1 -c 32 -t 00:15:00 -A pc2-mitarbeiter -p dgx --qos=devel --gres=gpu:a100:2"
JULIA_DEPOT_PATH: "/scratch/pc2-mitarbeiter/bauerc/.julia-ci"
JULIA_NUM_THREADS: "10"
JULIA_EXCLUSIVE: "1"
JULIA_1_9: "lang/JuliaHPC/1.9.2-foss-2022a-CUDA-11.7.0"
MKL_DYNAMIC: "false"
MKL_NUM_THREADS: "1"
JULIAHPC_1_9: "lang/JuliaHPC/1.9.2-foss-2022a-CUDA-11.7.0"
JULIA_1_9: "lang/Julia/1.9.2-linux-x86_64"
default:
tags:
- bauerc-noctua2

# Generates code coverage
julia/1.9:
julia/1.9/NVIDIA:
stage: test
rules:
- changes:
- "README.md"
- when: on_success
variables:
SCHEDULER_PARAMETERS: "-N 1 -n 1 -c 32 -t 00:20:00 -A pc2-mitarbeiter -p dgx --qos=devel --gres=gpu:a100:2"
script:
- /bin/bash -l
- module load $JULIA_1_9
- module load $JULIAHPC_1_9
- julia --color=yes --project=. -e 'using Pkg; Pkg.build(verbose=true); Pkg.test(; coverage = true);'
- julia --color=yes --project=test/coverage -e 'import Pkg; Pkg.instantiate()'
- julia --color=yes --project=test/coverage test/coverage/coverage.jl
allow_failure: false

julia/1.9/AMD:
stage: test
rules:
- changes:
- "README.md"
- when: on_success
variables:
SCHEDULER_PARAMETERS: "-N 1 -n 1 -c 128 -t 00:20:00 -A pc2-mitarbeiter -p hacc --exclusive"
script:
- /bin/bash -l
- module load $JULIA_1_9
- julia --color=yes --project=. -e 'using Pkg; Pkg.build(verbose=true); Pkg.test(; coverage = false);'
allow_failure: true

# Documentation
build-and-deploy-docs:
@@ -37,9 +50,11 @@ build-and-deploy-docs:
- pushes
- tags
- external_pull_requests
variables:
SCHEDULER_PARAMETERS: "-N 1 -n 1 -c 32 -t 00:20:00 -A pc2-mitarbeiter -p dgx --qos=devel --gres=gpu:a100:2"
script:
- /bin/bash -l
- module load $JULIA_1_9
- module load $JULIAHPC_1_9
- cd docs
- julia --color=yes build_docs.jl
allow_failure: false
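
The new julia/1.9/AMD job loads the plain Julia module (no CUDA) and runs the package tests without coverage. As a rough sketch, the Julia part of its script corresponds to the snippet below; the Slurm parameters and module names above are specific to the Noctua 2 cluster, and a ROCm-capable node is assumed.

    using Pkg
    Pkg.activate(".")             # the GPUInspector.jl checkout
    Pkg.build(verbose=true)
    Pkg.test(; coverage=false)    # allow_failure is true while AMD support is WIP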
10 changes: 5 additions & 5 deletions Project.toml
@@ -14,39 +14,39 @@ Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
NVTX = "5da4648a-3479-48b8-97b9-01cb529c0a1f"
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
ThreadPinning = "811555cd-349b-4f26-b7bc-1f208b848042"
UnicodePlots = "b8865327-cd53-5732-bb35-84acbb429228"

[weakdeps]
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0"

[extensions]
CUDAExt = "CUDA"
AMDGPUExt = "AMDGPU"
CairoMakieExt = "CairoMakie"

[compat]
AMDGPU = "0.5.5"
CUDA = "3.8.4, 3.12, 4.4"
CairoMakie = "0.7, 0.10.7"
CpuId = "0.3"
DocStringExtensions = "0.9"
Glob = "1.3"
HDF5 = "0.16"
NVTX = "0.3"
Reexport = "1.2"
TestItemRunner = "0.2"
ThreadPinning = "0.3, 0.4, 0.5, 0.6, 0.7"
UnicodePlots = "2.8, 3"
julia = "1.9"

[extras]
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0"
InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
TestItemRunner = "f8b46487-2199-4994-9208-9a1283c18c0a"

[targets]
test = ["Test", "InteractiveUtils", "CairoMakie", "CUDA", "TestItemRunner"]
test = ["Test", "InteractiveUtils", "CairoMakie", "CUDA", "AMDGPU"]
76 changes: 76 additions & 0 deletions ext/AMDGPUExt/AMDGPUExt.jl
@@ -0,0 +1,76 @@
module AMDGPUExt

using GPUInspector
using AMDGPU
using AMDGPU: device, device!, devices

# stdlibs etc.
using Base: UUID
using Statistics
using Logging
using LinearAlgebra

# pkgs
using UnicodePlots
using ThreadPinning

# for usage in AMDGPUExt
using GPUInspector:
logspace,
ismonitoring,
_monitoring!,
_set_monitoring_task,
_get_monitoring_task,
MonitoringResults,
_defaultylims,
@unroll,
AMDBackend,
getstdout

include("utility.jl")
# include("stresstests.jl")
# include("peakflops_gpu_fmas.jl")
# include("peakflops_gpu_wmmas.jl")
# include("peakflops_gpu_matmul.jl")
include("implementations/general.jl")
include("implementations/gpuinfo.jl")
# include("implementations/p2p_bandwidth.jl")
include("implementations/host2device_bandwidth.jl")
include("implementations/membw.jl")
# include("implementations/stresstest.jl")
# include("implementations/monitoring.jl")
# include("implementations/peakflops_gpu.jl")

function __init__()
GPUInspector.AMDGPUJL_LOADED[] = true
GPUInspector.backend!(AMDBackend())
GPUInspector.AMDGPUExt = Base.get_extension(GPUInspector, :AMDGPUExt)
return nothing
end

function backendinfo(::AMDBackend)
# somewhat crude way to figure out which API functions are implemented :)
funcs = String[]
impl_dir = joinpath(@__DIR__, "implementations/")
for f in readdir(impl_dir)
lines = readlines(joinpath(impl_dir, f))
func_lines = filter(startswith("function"), lines)
for fl in func_lines
fname = strip(split(split(fl, "function")[2], "(")[1])
if startswith(fname, "_") || startswith(fname, "Base")
continue
end
if fname in funcs # avoid duplicates
continue
end
push!(funcs, fname)
end
end
println("Implementend API functions for AMDBackend:")
for f in funcs
println("\t", f)
end
return nothing
end

end # module
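
A short usage sketch of the module above: loading AMDGPU runs __init__, which flips the AMDGPUJL_LOADED flag and activates the AMD backend. Calling backendinfo through the extension module is safe regardless of whether backendinfo is also exported by GPUInspector (that is not shown in this diff).

    using GPUInspector
    using AMDGPU                                        # runs AMDGPUExt.__init__ shown above
    GPUInspector.AMDGPUJL_LOADED[]                      # true once the extension is initialized
    ext = Base.get_extension(GPUInspector, :AMDGPUExt)
    ext.backendinfo(GPUInspector.AMDBackend())          # prints the implemented API functions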
21 changes: 21 additions & 0 deletions ext/AMDGPUExt/implementations/general.jl
@@ -0,0 +1,21 @@
function GPUInspector.functional(::AMDBackend; verbose=true)
if AMDGPU.functional()
verbose && @info("AMDGPU.jl is functional.")
working = true
else
verbose && @info("AMDGPU.jl not functional.")
working = false
end
return working
end

function GPUInspector.clear_gpu_memory(::AMDBackend; device=AMDGPU.device(), gc=true)
device!(device) do
gc && GC.gc()
AMDGPU.HIP.reclaim()
end
return nothing
end

GPUInspector.device(::AMDBackend) = AMDGPU.device()
GPUInspector.devices(::AMDBackend) = AMDGPU.devices()
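
Minimal usage sketch for the general methods above. Whether GPUInspector also exports unqualified front-ends is not shown in this diff, so the calls are fully qualified; AMDBackend comes from GPUInspector, as in the extension's imports.

    using GPUInspector, AMDGPU

    amd = GPUInspector.AMDBackend()
    GPUInspector.functional(amd)          # logs and returns whether AMDGPU.jl is functional
    GPUInspector.clear_gpu_memory(amd)    # GC.gc() followed by AMDGPU.HIP.reclaim()
    GPUInspector.device(amd)              # the currently active HIP device
    GPUInspector.devices(amd)             # all visible HIP devices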
70 changes: 70 additions & 0 deletions ext/AMDGPUExt/implementations/gpuinfo.jl
@@ -0,0 +1,70 @@
function GPUInspector.ngpus(::AMDBackend)
return length(AMDGPU.devices())
end

function GPUInspector.gpus(::AMDBackend; io=getstdout())
# Based on https://github.com/JuliaGPU/CUDA.jl/blob/ca77d1828f3bc0df34501de848c7a13f1df0b1fe/src/utilities.jl#L69
devs = AMDGPU.devices()
if isempty(devs)
println(io, "No AMD devices found.")
elseif length(devs) == 1
println(io, "1 device:")
else
println(io, length(devs), " devices:")
end
for (i, dev) in enumerate(devs)
mem_free, mem_tot = AMDGPU.device!(dev) do
AMDGPU.Runtime.Mem.info()
end
println(
io,
" $(_gpuid(dev)): ",
repr(dev),
" ($(Base.format_bytes(mem_free)) / $(Base.format_bytes(mem_tot)) available)",
)
end
end

"""
gpuinfo(deviceid::Integer)

Print out detailed information about the AMD GPU with the given `deviceid`.

(This method is from the AMD backend.)
"""
function GPUInspector.gpuinfo(::AMDBackend, deviceid::Integer; io=getstdout())
0 <= deviceid <= ngpus(AMDBackend()) - 1 || throw(ArgumentError("Invalid device id."))
return gpuinfo(HIPDevice(deviceid); io)
end
function GPUInspector.gpuinfo(::AMDBackend, dev::HIPDevice=AMDGPU.device(); io=getstdout())
# printing
println(io, "Device: $dev \n")
show(io, AMDGPU.HIP.properties(dev))
return nothing
end

function GPUInspector.gpuinfo_p2p_access(::AMDBackend; io=getstdout())
# check p2p access
ndevs = ngpus(AMDBackend())
if ndevs <= 1
error("Only a single GPU available.")
else
devs = AMDGPU.devices()
mat_p2p_can_access = Matrix{Bool}(undef, ndevs, ndevs)
for i in 1:ndevs
for j in 1:ndevs
if i != j
mat_p2p_can_access[i, j] = Bool(AMDGPU.HIP.can_access_peer(devs[i], devs[j]))
else
mat_p2p_can_access[i, j] = false
end
end
end

printstyled(io, "P2P Can Access:\n"; bold=true)
show(io, "text/plain", mat_p2p_can_access)
println(io)
println(io)
end
return nothing
end
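
A sketch of how the three gpuinfo methods above can be exercised directly (fully qualified, same assumptions as before; device ids are 0-based, as enforced by the ArgumentError check):

    using GPUInspector, AMDGPU

    amd = GPUInspector.AMDBackend()
    GPUInspector.gpus(amd)                  # one line per HIP device with free/total memory
    GPUInspector.gpuinfo(amd, 0)            # HIP properties of the device with id 0
    GPUInspector.gpuinfo_p2p_access(amd)    # Bool matrix of peer-to-peer access (requires >= 2 GPUs)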
85 changes: 85 additions & 0 deletions ext/AMDGPUExt/implementations/host2device_bandwidth.jl
@@ -0,0 +1,85 @@
function GPUInspector.host2device_bandwidth(
::AMDBackend;
memsize::UnitPrefixedBytes=GiB(0.5),
dtype=Cchar,
DtoDfactor=true,
verbose=true,
io=getstdout(),
kwargs...,
)
N = Int(bytes(memsize) ÷ sizeof(dtype))
mem_host = rand(dtype, N)
# mem_host_pinned = Mem.pin(rand(dtype, N)) # TODO
mem_gpu = AMDGPU.rand(dtype, N)

_perform_memcpy(mem_host, mem_gpu; title="Host <-> Device", verbose, io=io, kwargs...)
verbose && println(io)
# _perform_memcpy(
# mem_host_pinned,
# mem_gpu;
# title="Host (pinned) <-> Device",
# verbose,
# io=io,
# kwargs...,
# )
# verbose && println()
# _perform_memcpy(mem_gpu, mem_gpu2; title="Device <-> Device (same device)", DtoDfactor, verbose, kwargs...)
return nothing
end

function _perform_memcpy(
mem1,
mem2;
title="",
nbench=10,
times=false,
stats=false,
DtoDfactor=false,
verbose=true,
io=getstdout(),
)
sizeof(mem1) == sizeof(mem2) || error("sizeof(mem1) != sizeof(mem2)")
ts = zeros(nbench)

@inbounds for i in 1:nbench
if i % 2 == 0
ts[i] = AMDGPU.@elapsed copyto!(mem1, mem2)
else
ts[i] = AMDGPU.@elapsed copyto!(mem2, mem1)
end
end

t_min = minimum(ts)
t_max = maximum(ts)
t_avg = mean(ts)

actual_memsize_GiB = sizeof(mem1) * 2^(-30)
if DtoDfactor
actual_memsize_GiB *= 2 # must count both the read and the write here (taken from p2pBandwidthLatencyTest cuda sample....)
end
bws = actual_memsize_GiB ./ ts
bw_min = minimum(bws)
bw_max = maximum(bws)
bw_avg = mean(bws)

if verbose
if times
println(io, "t_min: $t_min")
println(io, "t_max: $t_max")
println(io, "t_avg: $t_avg")
end
printstyled(io, "$(title) Bandwidth (GiB/s):\n"; bold=true)
if stats
print(io, " ├ max: ")
printstyled(io, round(bw_max; digits=2), "\n"; color=:green, bold=true)
println(io, " ├ min: ", round(bw_min; digits=2))
println(io, " ├ avg: ", round(bw_avg; digits=2))
print(io, " └ std_dev: ")
printstyled(io, round(std(bws); digits=2), "\n"; color=:yellow, bold=true)
else
print(io, " └ max: ")
printstyled(io, round(bw_max; digits=2), "\n"; color=:green, bold=true)
end
end
return bw_max
end
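
Usage sketch for the bandwidth test above: it times copyto! between a host Array and a device ROCArray in both directions over nbench runs and reports the maximum bandwidth. GiB is used unqualified in the extension, so it is assumed to be exported by GPUInspector; a ROCm GPU is assumed to be present.

    using GPUInspector, AMDGPU

    GPUInspector.host2device_bandwidth(
        GPUInspector.AMDBackend();
        memsize=GiB(1),
        dtype=Float32,
        stats=true,        # also print min/avg/std_dev, not just the maximum
    )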