Make Ahead-of-Time compilation feasible for packages using CUDA.jl #2998
base: master
Conversation
Codecov Report

✅ All modified and coverable lines are covered by tests.

@@            Coverage Diff             @@
##           master    #2998       +/-   ##
===========================================
- Coverage   89.15%   75.78%   -13.38%
===========================================
  Files         148      148
  Lines       12885    12844       -41
===========================================
- Hits        11488     9734     -1754
- Misses       1397     3110     +1713
Your PR requires formatting changes to meet the project's style guidelines. Suggested changes:

diff --git a/src/device/intrinsics/atomics.jl b/src/device/intrinsics/atomics.jl
index 0874adf77..1a6d64784 100644
--- a/src/device/intrinsics/atomics.jl
+++ b/src/device/intrinsics/atomics.jl
@@ -151,7 +151,7 @@ for A in (AS.Generic, AS.Global, AS.Shared), T in (:Int16, :UInt16)
end
intr = "atom$scope.cas.b16 \$0, [\$1], \$2, \$3;"
- @eval @device_function @inline atomic_cas!(ptr::LLVMPtr{$T,$A}, cmp::$T, val::$T) =
+ @eval @device_function @inline atomic_cas!(ptr::LLVMPtr{$T, $A}, cmp::$T, val::$T) =
@asmcall($intr, "=h,l,h,h", true, $T, Tuple{Core.LLVMPtr{$T,$A},$T,$T}, ptr, cmp, val)
end
@@ -172,7 +172,7 @@ for A in (AS.Generic, AS.Global, AS.Shared)
nb = sizeof(T)*8
fn = Symbol("atomic_$(op)!")
intr = "llvm.nvvm.atomic.load.$op.$nb.p$(convert(Int, A))i$nb"
- @eval @device_function @inline $fn(ptr::LLVMPtr{$T,$A}, val::$T) =
+ @eval @device_function @inline $fn(ptr::LLVMPtr{$T, $A}, val::$T) =
@typed_ccall($intr, llvmcall, $T, (LLVMPtr{$T,$A}, $T), ptr, val)
end
end
@@ -192,7 +192,7 @@ for A in (AS.Generic, AS.Global, AS.Shared), T in (:Float16,)
end
intr = "atom$scope.add.noftz.f16 \$0, [\$1], \$2;"
- @eval @device_function @inline atomic_add!(ptr::LLVMPtr{$T,$A}, val::$T) =
+ @eval @device_function @inline atomic_add!(ptr::LLVMPtr{$T, $A}, val::$T) =
@asmcall($intr, "=h,l,h", true, $T, Tuple{Core.LLVMPtr{$T,$A},$T}, ptr, val)
end
diff --git a/src/device/intrinsics/cooperative_groups.jl b/src/device/intrinsics/cooperative_groups.jl
index 0c07e3117..4bf553ce9 100644
--- a/src/device/intrinsics/cooperative_groups.jl
+++ b/src/device/intrinsics/cooperative_groups.jl
@@ -24,7 +24,7 @@ Noteworthy missing functionality:
module CG
using ..CUDA
-using ..CUDA: i32, Aligned, alignment, @device_function
+ using ..CUDA: i32, Aligned, alignment, @device_function
import ..LLVM
using ..LLVM.Interop
@@ -73,7 +73,7 @@ const grid_workspace = Ptr{grid_workspace_st}
end
end
-@device_function function get_grid_workspace()
+ @device_function function get_grid_workspace()
# interpret the address from envreg 1 and 2 as the driver's grid workspace
hi = ccall("llvm.nvvm.read.ptx.sreg.envreg1", llvmcall, UInt32, ())
lo = ccall("llvm.nvvm.read.ptx.sreg.envreg2", llvmcall, UInt32, ())
@@ -561,13 +561,13 @@ end
## pipeline operations
-@device_function pipeline_commit() =
- ccall("llvm.nvvm.cp.async.commit.group", llvmcall, Cvoid, ())
+ @device_function pipeline_commit() =
+ ccall("llvm.nvvm.cp.async.commit.group", llvmcall, Cvoid, ())
-@device_function pipeline_wait_prior(n) =
+ @device_function pipeline_wait_prior(n) =
ccall("llvm.nvvm.cp.async.wait.group", llvmcall, Cvoid, (Int32,), n)
-@device_function @generated function pipeline_memcpy_async(dst::LLVMPtr{T}, src::LLVMPtr{T}) where T
+ @device_function @generated function pipeline_memcpy_async(dst::LLVMPtr{T}, src::LLVMPtr{T}) where {T}
size_and_align = sizeof(T)
size_and_align in (4, 8, 16) || :(return error($"Unsupported size $size_and_align"))
intr = "llvm.nvvm.cp.async.ca.shared.global.$(sizeof(T))"
diff --git a/src/device/intrinsics/indexing.jl b/src/device/intrinsics/indexing.jl
index 6661591ec..b77da46cd 100644
--- a/src/device/intrinsics/indexing.jl
+++ b/src/device/intrinsics/indexing.jl
@@ -43,7 +43,7 @@ for dim in (:x, :y, :z)
# Thread index
fn = Symbol("threadIdx_$dim")
intr = Symbol("tid.$dim")
- @eval @device_function @inline $fn() = _index($(Val(intr)), $(Val(0:max_block_size[dim]-1))) + 1i32
+ @eval @device_function @inline $fn() = _index($(Val(intr)), $(Val(0:(max_block_size[dim] - 1)))) + 1i32
# Block size (#threads per block)
fn = Symbol("blockDim_$dim")
@@ -53,7 +53,7 @@ for dim in (:x, :y, :z)
# Block index
fn = Symbol("blockIdx_$dim")
intr = Symbol("ctaid.$dim")
- @eval @device_function @inline $fn() = _index($(Val(intr)), $(Val(0:max_grid_size[dim]-1))) + 1i32
+ @eval @device_function @inline $fn() = _index($(Val(intr)), $(Val(0:(max_grid_size[dim] - 1)))) + 1i32
# Grid size (#blocks per grid)
fn = Symbol("gridDim_$dim")
diff --git a/src/device/intrinsics/synchronization.jl b/src/device/intrinsics/synchronization.jl
index dcee17b10..241a26ba5 100644
--- a/src/device/intrinsics/synchronization.jl
+++ b/src/device/intrinsics/synchronization.jl
@@ -67,7 +67,7 @@ end # @device_functions
export barrier_sync
-@device_function barrier_sync(id=0) =
+@device_function barrier_sync(id = 0) =
ccall("llvm.nvvm.barrier.sync", llvmcall, Cvoid, (Int32,), id)
diff --git a/src/device/intrinsics/warp.jl b/src/device/intrinsics/warp.jl
index 63913c5aa..ed60d495f 100644
--- a/src/device/intrinsics/warp.jl
+++ b/src/device/intrinsics/warp.jl
@@ -26,7 +26,7 @@ for (name, mode, mask, offset) in (("_up", :up, UInt32(0x00), src->src),
for (T,typ) in ((Int32, "i32"), (UInt32, "i32"), (Float32, "f32"))
intrinsic = "llvm.nvvm.shfl.sync.$mode.$typ"
@eval begin
- @device_function @inline $fname(mask, val::$T, src, width=$ws) =
+ @device_function @inline $fname(mask, val::$T, src, width = $ws) =
ccall($intrinsic, llvmcall, $T,
(UInt32, $T, UInt32, UInt32),
mask, val, $(offset(:src)), pack(width, $mask))
diff --git a/src/device/intrinsics/wmma.jl b/src/device/intrinsics/wmma.jl
index 2998a5566..6faecfd55 100644
--- a/src/device/intrinsics/wmma.jl
+++ b/src/device/intrinsics/wmma.jl
@@ -2,7 +2,7 @@ export WMMA
module WMMA
import ..LLVM
-using ..CUDA: AS, @device_function
+ using ..CUDA: AS, @device_function
using Core: LLVMPtr
################################################################################
@@ -200,10 +200,10 @@ for ops in all_ldst_ops,
ptr_ty = :(LLVMPtr{$arr_ty, $addr_space_int})
if sz == 1
- @eval @device_function $func_name(src_addr, stride) = tuple(ccall($ccall_name, llvmcall, $frag_ty, ($ptr_ty, Int32), src_addr, stride))
+ @eval @device_function $func_name(src_addr, stride) = tuple(ccall($ccall_name, llvmcall, $frag_ty, ($ptr_ty, Int32), src_addr, stride))
else
struct_ty = Symbol("LLVMStruct$sz")
- @eval @device_function $func_name(src_addr, stride) = convert(NTuple{$sz, $frag_ty}, ccall($ccall_name, llvmcall, $struct_ty{$frag_ty}, ($ptr_ty, Int32), src_addr, stride))
+ @eval @device_function $func_name(src_addr, stride) = convert(NTuple{$sz, $frag_ty}, ccall($ccall_name, llvmcall, $struct_ty{$frag_ty}, ($ptr_ty, Int32), src_addr, stride))
end
@eval export $func_name
@eval @doc (@doc llvm_wmma_load) $func_name
@@ -270,7 +270,7 @@ export llvm_wmma_store
ptr_ty = :(LLVMPtr{$arr_ty, $addr_space_int})
- @eval @device_function $func_name(dst_addr, data, stride) = ccall($ccall_name, llvmcall, Nothing, ($ptr_ty, $(frag_types...), Int32), dst_addr, $(frag_vars...), stride)
+ @eval @device_function $func_name(dst_addr, data, stride) = ccall($ccall_name, llvmcall, Nothing, ($ptr_ty, $(frag_types...), Int32), dst_addr, $(frag_vars...), stride)
@eval export $func_name
@eval @doc (@doc llvm_wmma_store) $func_name
end
@@ -347,10 +347,10 @@ for ops in all_wmma_ops,
c_vars = ntuple(i -> :(c[$i]), c_sz)
if d_sz == 1
- @eval @device_function $func_name(a, b, c) = tuple(ccall($ccall_name, llvmcall, $d_frag_ty, ($(a_types...), $(b_types...), $(c_types...)), $(a_vars...), $(b_vars...), $(c_vars...)))
+ @eval @device_function $func_name(a, b, c) = tuple(ccall($ccall_name, llvmcall, $d_frag_ty, ($(a_types...), $(b_types...), $(c_types...)), $(a_vars...), $(b_vars...), $(c_vars...)))
else
struct_ty = Symbol("LLVMStruct$d_sz")
- @eval @device_function $func_name(a, b, c) = convert(NTuple{$d_sz, $d_frag_ty}, ccall($ccall_name, llvmcall, $struct_ty{$d_frag_ty}, ($(a_types...), $(b_types...), $(c_types...)), $(a_vars...), $(b_vars...), $(c_vars...)))
+ @eval @device_function $func_name(a, b, c) = convert(NTuple{$d_sz, $d_frag_ty}, ccall($ccall_name, llvmcall, $struct_ty{$d_frag_ty}, ($(a_types...), $(b_types...), $(c_types...)), $(a_vars...), $(b_vars...), $(c_vars...)))
end
@eval export $func_name
@eval @doc (@doc llvm_wmma_mma) $func_name
diff --git a/src/device/pointer.jl b/src/device/pointer.jl
index e38adf52e..df4852bb2 100644
--- a/src/device/pointer.jl
+++ b/src/device/pointer.jl
@@ -33,7 +33,8 @@ for T in LDGTypes
typ = Symbol(class, width)
intr = "llvm.nvvm.ldg.global.$class.$typ.p1$typ"
- @eval @device_function @inline function pointerref_ldg(base_ptr::LLVMPtr{$T,AS.Global}, i::Integer,
+ @eval @device_function @inline function pointerref_ldg(
+ base_ptr::LLVMPtr{$T, AS.Global}, i::Integer,
::Val{align}) where align
offset = i-one(i) # in elements
ptr = base_ptr + offset*sizeof($T)
@@ -52,7 +53,8 @@ for (N, T) in ((4, Float32), (2, Float64), (4, Int8), (4, Int16), (4, Int32), (2
typ = Symbol(class, width)
intr = "llvm.nvvm.ldg.global.$class.v$N$typ.p1v$N$typ"
- @eval @device_function @inline function pointerref_ldg(base_ptr::LLVMPtr{NTuple{$N, Base.VecElement{$T}},AS.Global}, i::Integer,
+ @eval @device_function @inline function pointerref_ldg(
+ base_ptr::LLVMPtr{NTuple{$N, Base.VecElement{$T}}, AS.Global}, i::Integer,
::Val{align}) where align
offset = i-one(i) # in elements
ptr = base_ptr + offset*$N*sizeof($T)
diff --git a/src/device/texture.jl b/src/device/texture.jl
index f675ff583..e28b60810 100644
--- a/src/device/texture.jl
+++ b/src/device/texture.jl
@@ -67,7 +67,7 @@ for (dispatch_rettyp, julia_rettyp, llvm_rettyp) in
NTuple{4,$dispatch_rettyp}})
# tex1D only supports array memory
- @eval @device_function tex(texObject::CuDeviceTexture{<:$eltyp,1,ArrayMemorySource}, x::Number) =
+ @eval @device_function tex(texObject::CuDeviceTexture{<:$eltyp, 1, ArrayMemorySource}, x::Number) =
Tuple(ccall($("llvm.nvvm.tex.unified.1d.$llvm_rettyp.f32"), llvmcall,
$julia_rettyp, (CUtexObject, Float32), texObject, x))
@@ -78,7 +78,7 @@ for (dispatch_rettyp, julia_rettyp, llvm_rettyp) in
julia_sig = ntuple(_->Float32, dims)
julia_params = ntuple(i->:($(julia_args[i])::Number), dims)
- @eval @device_function tex(texObject::CuDeviceTexture{<:$eltyp,$dims}, $(julia_params...)) =
+ @eval @device_function tex(texObject::CuDeviceTexture{<:$eltyp, $dims}, $(julia_params...)) =
Tuple(ccall($("llvm.nvvm.tex.unified.$llvm_dim.$llvm_rettyp.f32"), llvmcall,
$julia_rettyp, (CUtexObject, $(julia_sig...)), texObject, $(julia_args...)))
end
diff --git a/src/device/utils.jl b/src/device/utils.jl
index b899c6708..5245ae1c4 100644
--- a/src/device/utils.jl
+++ b/src/device/utils.jl
@@ -19,7 +19,7 @@ macro device_override(ex)
end)
else
esc(quote
- Base.Experimental.@overlay($(CUDA).method_table, $ex)
+ Base.Experimental.@overlay($(CUDA).method_table, $ex)
end)
end
end
@@ -37,7 +37,7 @@ macro device_function(ex)
$(combinedef(def))
# NOTE: no use of `@consistent_overlay` here because the regular function errors
- Base.Experimental.@overlay($(CUDA).method_table, $ex)
+ Base.Experimental.@overlay($(CUDA).method_table, $ex)
end)
end
Can confirm the error. The remaining issue is as you state at point 4. Previously it was observed in a comment when trying to AOT compile GPUCompiler. I assume
I started some digging into this yesterday and I think this is a bit of a red herring. If this were the core issue, I would expect the methods that leak through to be all of the ones that need to be specified by the target, or only the ones that are provided defaults. The two that are currently causing issues are
I've started to mark all the methods in CUDA.jl as device methods. I still have to clean it up. Using @apozharski's branch https://github.com/apozharski/GPUCompiler.jl/tree/ap/aot-compilation I can now get past the compilation. However, the next hurdle is:

Core.InitError(mod=:CUDA_Driver_jll, error=Base.TaskFailedException(task=Core.Task(next=nothing, queue=nothing, storage=nothing, donenotify=Base.GenericCondition{Base.Threads.SpinLock}(waitq=Base.IntrusiveLinkedList{Core.Task}(head=nothing, tail=nothing), lock=Base.Threads.SpinLock(owned=0)), result=Base.IOError(msg="could not spawn setenv(`/home/michel/git/ExaPF.jl/app/build/bin/julia -C native -g1 -O0 --compile=min -t1 --startup-file=no -e '...'
@michel2323 the workaround I found for this while trying to compile libMad is to bundle the julia executable (see the latest commit on my fork of JuliaC). Ideally
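For reference, a minimal sketch of that bundling idea, assuming the `app/build/bin` layout from the error message above; this is only an illustration of the concept, not what the JuliaC fork actually does:

```julia
# Hedged sketch: copy the host `julia` executable into the AOT-compiled app's
# bin directory, so that CUDA_Driver_jll's __init__ can spawn the `julia`
# subprocess it expects to find next to the app. The `app/build/bin` path is
# an assumption taken from the error message above.
app_bin = joinpath("app", "build", "bin")                 # assumed app layout
julia_exe = joinpath(Sys.BINDIR, Base.julia_exename())    # currently running julia
cp(julia_exe, joinpath(app_bin, Base.julia_exename()); force = true)
```

In practice the bundled executable presumably also needs Julia's `lib` directory next to it; the snippet only shows the basic idea.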
Force-pushed 3b831b0 to 784cad3.
Nice! Thank you! @apozharski |
Also `@device_function` decoration
Currently, Ahead-of-Time (AOT) compilation for any package using CUDA.jl is impossible (with or without `--trim`). This is due to several things as far as I can tell:

1. There are methods in CUDA.jl which are visible in the global method table and contain `llvmcall` LLVM IR with `llvm.nvvm` intrinsics, which therefore leak through to the AOT step which generates assembly, leading to compilation failures.
2. MethodInstances exist for the methods, i.e. the methods are concrete (unsure if this is the correct Julia terminology). This is because they either are defined to take concrete types (for example the `clock` methods) or no arguments at all.
3. `libcudadevrt.jl`: the "concrete" methods in that file contain `ccall`s to external functions which do not exist until they are dynamically loaded.
4. `gpu_malloc`, `gpu_signal_exception`, and `gpu_report_oom` appear in the produced LLVM IR. These functions don't exist, as they would come from the runtime compilation of the runtime. This is likely a problem more fixable in GPUCompiler.jl, but maybe @vchuravy or @maleadt you have a better understanding of that.

The relevant issues and other PRs:
This PR marks these methods with `@device_function` to provide erroring copies of the methods for the global method table.
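To make the intended mechanism concrete, here is a minimal, self-contained sketch of the erroring-copy approach. It assumes a short-form definition and a locally defined method table, and is an illustration of the idea only, not CUDA.jl's actual `@device_function` implementation:

```julia
# Simplified sketch of the `@device_function` idea: the real body goes only
# into an overlay method table consumed by the GPU compiler, while the copy
# visible in the global method table merely errors, so no `llvmcall` /
# `llvm.nvvm` IR leaks into an ahead-of-time CPU build.

Base.Experimental.@MethodTable(device_method_table)

macro device_function(ex)
    # Assumes a short-form definition `f(args...) = body`.
    sig = ex.args[1]   # the call signature, e.g. `clock()`
    esc(quote
        # Global-method-table copy: same (concrete) signature, but it only
        # errors, so no GPU intrinsics are visible to host codegen.
        $sig = error("this function is only intended for use on the device")
        # Overlay copy: the actual device implementation.
        Base.Experimental.@overlay(device_method_table, $ex)
    end)
end

# Example: a device intrinsic of the kind that currently leaks into AOT builds.
@device_function clock() = ccall("llvm.nvvm.read.ptx.sreg.clock", llvmcall, UInt32, ())

# clock()  # on the host this now throws instead of emitting llvm.nvvm IR
```

The host-visible method has an ordinary erroring body, so AOT compilation of the CPU code never sees the `llvm.nvvm` intrinsic; only a GPU compiler that consults the overlay method table does.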