From 9b8872c5c8baf1f59fae8fa0ede5cee93ec02d93 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 14 Jan 2024 07:51:56 +0000 Subject: [PATCH 01/47] Initialize CUDA extension --- Project.toml | 6 ++- ext/DynamicExpressionsCUDAExt.jl | 70 ++++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+), 1 deletion(-) create mode 100644 ext/DynamicExpressionsCUDAExt.jl diff --git a/Project.toml b/Project.toml index 1c3b9b98..3158f7b1 100644 --- a/Project.toml +++ b/Project.toml @@ -16,12 +16,14 @@ TOML = "fa267f1f-6049-4f14-aa54-33bafae1ed76" [weakdeps] Bumper = "8ce10254-0962-460f-a3d8-1f77fea1446e" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890" SymbolicUtils = "d1185830-fcd6-423d-90d6-eec64667417b" Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" [extensions] DynamicExpressionsBumperExt = "Bumper" +DynamicExpressionsCUDAExt = "CUDA" DynamicExpressionsLoopVectorizationExt = "LoopVectorization" DynamicExpressionsSymbolicUtilsExt = "SymbolicUtils" DynamicExpressionsZygoteExt = "Zygote" @@ -30,6 +32,7 @@ DynamicExpressionsZygoteExt = "Zygote" Aqua = "0.7" Bumper = "0.6" Compat = "3.37, 4" +CUDA = "5" Enzyme = "^0.11.12" LoopVectorization = "0.12" MacroTools = "0.4, 0.5" @@ -43,6 +46,7 @@ julia = "1.6" [extras] Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" Bumper = "8ce10254-0962-460f-a3d8-1f77fea1446e" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9" ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890" @@ -54,4 +58,4 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" [targets] -test = ["Test", "SafeTestsets", "Aqua", "Bumper", "Enzyme", "ForwardDiff", "LoopVectorization", "SpecialFunctions", "StaticArrays", "SymbolicUtils", "Zygote"] +test = ["Test", "SafeTestsets", "Aqua", "Bumper", "CUDA", "Enzyme", "ForwardDiff", "LoopVectorization", "SpecialFunctions", "StaticArrays", "SymbolicUtils", "Zygote"] diff --git a/ext/DynamicExpressionsCUDAExt.jl b/ext/DynamicExpressionsCUDAExt.jl new file mode 100644 index 00000000..80b0203d --- /dev/null +++ b/ext/DynamicExpressionsCUDAExt.jl @@ -0,0 +1,70 @@ +module DynamicExpressionsCUDAExt + +using CUDA: CuArray +using DynamicExpressions: OperatorEnum, AbstractExpressionNode, tree_mapreduce +using DynamicExpressions.UtilsModule: counttuple + +import DynamicExpressions.EvaluateEquationModule: eval_tree_array + +function eval_tree_array( + tree::AbstractExpressionNode{T}, cX::CuArray{T,2}, operators::OperatorEnum; _... 
+) where {T<:Number} + result = tree_mapreduce( + # Leaf nodes, we create an allocation and fill + # it with the value of the leaf: + leaf -> begin + if leaf.constant + v = leaf.val::T + ar = similar(cX, axes(cX, 2)) + ar .= v + return ar + else + return cX[leaf.feature, :] + end + end, + # Branch nodes, we simply pass them to the evaluation kernel: + branch -> branch, + # In the evaluation kernel, we combine the branch nodes + # with the arrays created by the leaf nodes: + ((branch, cumulators::Vararg{Any,M}) where {M}) -> begin + if M == 1 + return dispatch_kern1!(operators.unaops, branch.op, cumulators[1]) + else + return dispatch_kern2!( + operators.binops, branch.op, cumulators[1], cumulators[2] + ) + end + end, + tree; + break_sharing=Val(true), + ) + return (result.x, isfinite(sum(x .* zero(T)))) +end +@generated function dispatch_kern1!(unaops, op_idx, cumulator) + nuna = counttuple(unaops) + quote + Base.@nif($nuna, i -> i == op_idx, i -> let op = unaops[i] + return kern1!(op, cumulator) + end,) + end +end +@generated function dispatch_kern2!(binops, op_idx, cumulator1, cumulator2) + nbin = counttuple(binops) + quote + Base.@nif( + $nbin, i -> i == op_idx, i -> let op = binops[i] + return kern2!(op, cumulator1, cumulator2) + end, + ) + end +end +function kern1!(op::F, cumulator) where {F} + @. cumulator = op(cumulator) + return cumulator +end +function kern2!(op::F, cumulator1, cumulator2) where {F} + @. cumulator1 = op(cumulator1, cumulator2) + return cumulator1 +end + +end From 666708e8245be4ee329426a3ebe4fc1ba5d89751 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 14 Jan 2024 07:57:25 +0000 Subject: [PATCH 02/47] Fix output --- ext/DynamicExpressionsCUDAExt.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ext/DynamicExpressionsCUDAExt.jl b/ext/DynamicExpressionsCUDAExt.jl index 80b0203d..0989f891 100644 --- a/ext/DynamicExpressionsCUDAExt.jl +++ b/ext/DynamicExpressionsCUDAExt.jl @@ -38,7 +38,7 @@ function eval_tree_array( tree; break_sharing=Val(true), ) - return (result.x, isfinite(sum(x .* zero(T)))) + return (result, isfinite(sum(result .* zero(T)))) end @generated function dispatch_kern1!(unaops, op_idx, cumulator) nuna = counttuple(unaops) From 4484d180f08a5324e8f91d8c7f7db5ab0e9d653a Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 14 Jan 2024 20:57:17 +0000 Subject: [PATCH 03/47] Switch to GPUArrays rather than CUDA --- Project.toml | 10 +++++----- ext/DynamicExpressionsCUDAExt.jl | 9 ++++++--- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/Project.toml b/Project.toml index 3158f7b1..5b9c9e24 100644 --- a/Project.toml +++ b/Project.toml @@ -16,14 +16,14 @@ TOML = "fa267f1f-6049-4f14-aa54-33bafae1ed76" [weakdeps] Bumper = "8ce10254-0962-460f-a3d8-1f77fea1446e" -CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890" SymbolicUtils = "d1185830-fcd6-423d-90d6-eec64667417b" Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" [extensions] DynamicExpressionsBumperExt = "Bumper" -DynamicExpressionsCUDAExt = "CUDA" +DynamicExpressionsGPUArraysExt = "GPUArrays" DynamicExpressionsLoopVectorizationExt = "LoopVectorization" DynamicExpressionsSymbolicUtilsExt = "SymbolicUtils" DynamicExpressionsZygoteExt = "Zygote" @@ -32,8 +32,8 @@ DynamicExpressionsZygoteExt = "Zygote" Aqua = "0.7" Bumper = "0.6" Compat = "3.37, 4" -CUDA = "5" Enzyme = "^0.11.12" +GPUArrays = "8, 9, 10" LoopVectorization = "0.12" MacroTools = "0.4, 0.5" 
PackageExtensionCompat = "1" @@ -46,7 +46,7 @@ julia = "1.6" [extras] Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" Bumper = "8ce10254-0962-460f-a3d8-1f77fea1446e" -CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9" ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890" @@ -58,4 +58,4 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" [targets] -test = ["Test", "SafeTestsets", "Aqua", "Bumper", "CUDA", "Enzyme", "ForwardDiff", "LoopVectorization", "SpecialFunctions", "StaticArrays", "SymbolicUtils", "Zygote"] +test = ["Test", "SafeTestsets", "Aqua", "Bumper", "Enzyme", "ForwardDiff", "GPUArrays", "LoopVectorization", "SpecialFunctions", "StaticArrays", "SymbolicUtils", "Zygote"] diff --git a/ext/DynamicExpressionsCUDAExt.jl b/ext/DynamicExpressionsCUDAExt.jl index 0989f891..52d3fb93 100644 --- a/ext/DynamicExpressionsCUDAExt.jl +++ b/ext/DynamicExpressionsCUDAExt.jl @@ -1,13 +1,16 @@ -module DynamicExpressionsCUDAExt +module DynamicExpressionsGPUArraysExt -using CUDA: CuArray +using GPUArrays: AbstractDeviceArray using DynamicExpressions: OperatorEnum, AbstractExpressionNode, tree_mapreduce using DynamicExpressions.UtilsModule: counttuple import DynamicExpressions.EvaluateEquationModule: eval_tree_array function eval_tree_array( - tree::AbstractExpressionNode{T}, cX::CuArray{T,2}, operators::OperatorEnum; _... + tree::AbstractExpressionNode{T}, + cX::AbstractDeviceArray{T,2}, + operators::OperatorEnum; + _..., ) where {T<:Number} result = tree_mapreduce( # Leaf nodes, we create an allocation and fill From 644c3b886aba2f01c2bd5644efb11ce9dbf85c7d Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 14 Jan 2024 21:31:00 +0000 Subject: [PATCH 04/47] Switch to GPUArrays --- Project.toml | 4 ++-- ...xpressionsCUDAExt.jl => DynamicExpressionsGPUArraysExt.jl} | 0 2 files changed, 2 insertions(+), 2 deletions(-) rename ext/{DynamicExpressionsCUDAExt.jl => DynamicExpressionsGPUArraysExt.jl} (100%) diff --git a/Project.toml b/Project.toml index 5b9c9e24..a6fac437 100644 --- a/Project.toml +++ b/Project.toml @@ -46,7 +46,7 @@ julia = "1.6" [extras] Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" Bumper = "8ce10254-0962-460f-a3d8-1f77fea1446e" -GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9" ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890" @@ -58,4 +58,4 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" [targets] -test = ["Test", "SafeTestsets", "Aqua", "Bumper", "Enzyme", "ForwardDiff", "GPUArrays", "LoopVectorization", "SpecialFunctions", "StaticArrays", "SymbolicUtils", "Zygote"] +test = ["Test", "SafeTestsets", "Aqua", "Bumper", "CUDA", "Enzyme", "ForwardDiff", "LoopVectorization", "SpecialFunctions", "StaticArrays", "SymbolicUtils", "Zygote"] diff --git a/ext/DynamicExpressionsCUDAExt.jl b/ext/DynamicExpressionsGPUArraysExt.jl similarity index 100% rename from ext/DynamicExpressionsCUDAExt.jl rename to ext/DynamicExpressionsGPUArraysExt.jl From 0bfd8ee2025a0bb7eb7ab1d336eea02de4edf2f6 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Mon, 15 Jan 2024 02:23:41 +0000 Subject: [PATCH 05/47] Add file for converting to arrays --- src/AsArray.jl | 64 +++++++++++++++++++++++++++++++++++++++ 
src/DynamicExpressions.jl | 2 ++ 2 files changed, 66 insertions(+) create mode 100644 src/AsArray.jl diff --git a/src/AsArray.jl b/src/AsArray.jl new file mode 100644 index 00000000..f5f4a808 --- /dev/null +++ b/src/AsArray.jl @@ -0,0 +1,64 @@ +module AsArrayModule + +using .DynamicExpressions.EquationModule: AbstractExpressionNode, tree_mapreduce + +function as_array( + tree::N, index_type::Type{I}=Int64 +) where {T,N<:AbstractExpressionNode{T},I} + num_nodes = length(tree) + degree = Array{fieldtype(N, :degree)}(undef, num_nodes) + constant = Array{fieldtype(N, :constant)}(undef, num_nodes) + val = Array{T}(undef, num_nodes) + feature = Array{fieldtype(N, :feature)}(undef, num_nodes) + op = Array{fieldtype(N, :op)}(undef, num_nodes) + execution_order = Array{I}(undef, num_nodes) + idx_self = Array{I}(undef, num_nodes) + idx_l = Array{I}(undef, num_nodes) + idx_r = Array{I}(undef, num_nodes) + + cursor = Ref(zero(I)) + tree_mapreduce( + leaf -> begin + self = (cursor[] += one(I)) + idx_self[self] = self + degree[self] = 0 + execution_order[self] = one(I) + constant[self] = leaf.constant + if leaf.constant + val[self] = leaf.val::T + else + feature[self] = leaf.feature + end + + (id=self, order=one(I)) + end, + branch -> begin + self = (cursor[] += one(I)) + idx_self[self] = self + op[self] = branch.op + degree[self] = branch.degree + + (id=self, order=one(I)) # this order is unused + end, + ((parent, children::Vararg{Any,M}) where {M}) -> begin + idx_l[parent.id] = children[1].id + if M == 2 + idx_r[parent.id] = children[2].id + end + parent_execution_order = if M == 1 + children[1].order + one(I) + else + max(children[1].order, children[2].order) + one(I) + end + execution_order[parent.id] = parent_execution_order + + (id=parent.id, order=parent_execution_order) + end, + tree; + break_sharing=Val(true), + ) + + return (; degree, constant, val, feature, op, execution_order, idx_self, idx_l, idx_r) +end + +end diff --git a/src/DynamicExpressions.jl b/src/DynamicExpressions.jl index 09fee767..b664819f 100644 --- a/src/DynamicExpressions.jl +++ b/src/DynamicExpressions.jl @@ -11,6 +11,7 @@ include("EvaluationHelpers.jl") include("SimplifyEquation.jl") include("OperatorEnumConstruction.jl") include("Random.jl") +include("AsArray.jl") import PackageExtensionCompat: @require_extensions import Reexport: @reexport @@ -47,6 +48,7 @@ import .EquationModule: constructorof, preserve_sharing @reexport import .EvaluationHelpersModule @reexport import .ExtensionInterfaceModule: node_to_symbolic, symbolic_to_node @reexport import .RandomModule: NodeSampler +@reexport import .AsArrayModule: as_array function __init__() @require_extensions From 5c908964abddad666f9af830907038dcae86d911 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Mon, 15 Jan 2024 03:05:52 +0000 Subject: [PATCH 06/47] Add GPU kernel for evaluation --- Project.toml | 3 + ext/DynamicExpressionsCUDAExt.jl | 128 +++++++++++++++++++++++++++++++ src/AsArray.jl | 2 +- 3 files changed, 132 insertions(+), 1 deletion(-) create mode 100644 ext/DynamicExpressionsCUDAExt.jl diff --git a/Project.toml b/Project.toml index a6fac437..983e5ee0 100644 --- a/Project.toml +++ b/Project.toml @@ -16,6 +16,7 @@ TOML = "fa267f1f-6049-4f14-aa54-33bafae1ed76" [weakdeps] Bumper = "8ce10254-0962-460f-a3d8-1f77fea1446e" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890" SymbolicUtils = "d1185830-fcd6-423d-90d6-eec64667417b" @@ -23,6 +24,7 @@ Zygote = 
"e88e6eb3-aa80-5325-afca-941959d7151f" [extensions] DynamicExpressionsBumperExt = "Bumper" +DynamicExpressionsCUDAExt = "CUDA" DynamicExpressionsGPUArraysExt = "GPUArrays" DynamicExpressionsLoopVectorizationExt = "LoopVectorization" DynamicExpressionsSymbolicUtilsExt = "SymbolicUtils" @@ -32,6 +34,7 @@ DynamicExpressionsZygoteExt = "Zygote" Aqua = "0.7" Bumper = "0.6" Compat = "3.37, 4" +CUDA = "5" Enzyme = "^0.11.12" GPUArrays = "8, 9, 10" LoopVectorization = "0.12" diff --git a/ext/DynamicExpressionsCUDAExt.jl b/ext/DynamicExpressionsCUDAExt.jl new file mode 100644 index 00000000..284259cb --- /dev/null +++ b/ext/DynamicExpressionsCUDAExt.jl @@ -0,0 +1,128 @@ +module DynamicExpressionsCUDAExt +#! format: off + +using CUDA: threadIdx, blockIdx, @cuda, CuArray, cu +using DynamicExpressions: OperatorEnum, AbstractExpressionNode +using DynamicExpressions.EvaluateEquationModule: get_nbin, get_nuna +using DynamicExpressions.AsArrayModule: as_array + + +function eval_tree_gpu( + tree::AbstractExpressionNode{T}, gcX::CuArray{T,2}, operators::OperatorEnum +) + (; degree, constant, val, feature, op, execution_order, idx_self, idx_l, idx_r) = as_array(tree) + num_launches = maximum(execution_order) + num_elem = size(gcX, 2) + num_nodes = length(degree) + + # Convert everything to GPU: + gbuffer = CuArray{T}(undef, num_elem, num_nodes) + gexecution_order = cu(execution_order) + gdegree = cu(degree) + gconstant = cu(constant) + gval = cu(val) + gfeature = cu(feature) + gop = cu(op) + gidx_self = cu(idx_self) + gidx_l = cu(idx_l) + gidx_r = cu(idx_r) + + _launch_gpu_kernel!( + Val(nextpow(2, num_elem * num_nodes)), num_launches, gbuffer, + # Thread info: + num_elem, num_nodes, gexecution_order, + # Input data and tree + operators, gcX, gidx_self, gidx_l, gidx_r, + gdegree, gconstant, gval, gfeature, gop, + ) + + out = gbuffer[:, 1] + return (out, isfinite(sum(out .* zero(T)))) +end + +function _launch_gpu_kernel!( + ::Val{num_threads}, num_launches::Integer, buffer::CuArray{T,2}, + # Thread info: + num_elem::Integer, num_nodes::Integer, execution_order::CuArray{I}, + # Input data and tree + operators::OperatorEnum, cX::CuArray{T,2}, idx_self::CuArray, idx_l::CuArray, idx_r::CuArray, + degree::CuArray, constant::CuArray, val::CuArray{T,1}, feature::CuArray, op::CuArray, +) where {num_threads,I,T} + for launch in I(1):I(num_launches) + execute = execution_order .== launch + @cuda( + threads = num_threads, + _gpu_kernel!( + buffer, + num_elem, num_nodes, execute, + operators, cX, idx_self, idx_l, idx_r, + degree, constant, val, feature, op + ) + ) + end +end + +@generated function _gpu_kernel!( + # Storage: + buffer::CuArray{T,2}, + # Thread info: + num_elem::Integer, num_nodes::Integer, execute::CuArray{Bool}, + # Input data and tree + operators::OperatorEnum, cX::CuArray{T,2}, idx_self::CuArray, idx_l::CuArray, idx_r::CuArray, + degree::CuArray, constant::CuArray, val::CuArray{T,1}, feature::CuArray, op::CuArray, +) where {T} + nbin = get_nbin(operators) + nuna = get_nuna(operators) + quote + i = threadIdx().x + if i > num_elem * num_nodes + return nothing + end + + node = (i - 1) % num_nodes + 1 + elem = (i - node) ÷ num_nodes + 1 + + if !execute[node] + return nothing + end + + cur_degree = degree[node] + cur_idx = idx_self[node] + if cur_degree == 0 + if constant[node] + cur_val = val[node] + buffer[elem, cur_idx] = cur_val + else + cur_feature = feature[node] + buffer[elem, cur_idx] = cX[cur_feature, elem] + end + else + if cur_degree == 1 + cur_op = op[node] + l_idx = idx_l[node] + 
Base.Cartesian.@nif( + $nuna, + i -> i == cur_op, + i -> let op = operators.unaops[i] + buffer[elem, cur_idx] = op(buffer[elem, l_idx]) + end + ) + else + cur_op = op[node] + l_idx = idx_l[node] + r_idx = idx_r[node] + Base.Cartesian.@nif( + $nbin, + i -> i == cur_op, + i -> let op = operators.binops[i] + buffer[elem, cur_idx] = op(buffer[elem, l_idx], buffer[elem, r_idx]) + end + ) + end + end + return nothing + end +end + +#! format: on +end diff --git a/src/AsArray.jl b/src/AsArray.jl index f5f4a808..aecc4ac5 100644 --- a/src/AsArray.jl +++ b/src/AsArray.jl @@ -1,6 +1,6 @@ module AsArrayModule -using .DynamicExpressions.EquationModule: AbstractExpressionNode, tree_mapreduce +using ..EquationModule: AbstractExpressionNode, tree_mapreduce function as_array( tree::N, index_type::Type{I}=Int64 From 57fff36a166acc5c31b184bf90bc5ae9baa1376d Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Mon, 15 Jan 2024 00:04:47 -0500 Subject: [PATCH 07/47] Allow submitting multiple trees at once --- src/AsArray.jl | 89 ++++++++++++++++++++++------------------- src/EvaluateEquation.jl | 15 +++++-- 2 files changed, 60 insertions(+), 44 deletions(-) diff --git a/src/AsArray.jl b/src/AsArray.jl index aecc4ac5..b84a5aad 100644 --- a/src/AsArray.jl +++ b/src/AsArray.jl @@ -3,9 +3,11 @@ module AsArrayModule using ..EquationModule: AbstractExpressionNode, tree_mapreduce function as_array( - tree::N, index_type::Type{I}=Int64 -) where {T,N<:AbstractExpressionNode{T},I} - num_nodes = length(tree) + trees::Vararg{N,M}; index_type::Type{I}=Int64 +) where {T,N<:AbstractExpressionNode{T},I,M} + each_num_nodes = length.(trees) + num_nodes = sum(each_num_nodes) + degree = Array{fieldtype(N, :degree)}(undef, num_nodes) constant = Array{fieldtype(N, :constant)}(undef, num_nodes) val = Array{T}(undef, num_nodes) @@ -16,49 +18,54 @@ function as_array( idx_l = Array{I}(undef, num_nodes) idx_r = Array{I}(undef, num_nodes) + roots = Array{I}(undef, length(trees)) + cursor = Ref(zero(I)) - tree_mapreduce( - leaf -> begin - self = (cursor[] += one(I)) - idx_self[self] = self - degree[self] = 0 - execution_order[self] = one(I) - constant[self] = leaf.constant - if leaf.constant - val[self] = leaf.val::T - else - feature[self] = leaf.feature - end + for (i_tree, tree) in enumerate(trees) + roots[i_tree] = cursor[] + one(I) + tree_mapreduce( + leaf -> begin + self = (cursor[] += one(I)) + idx_self[self] = self + degree[self] = 0 + execution_order[self] = one(I) + constant[self] = leaf.constant + if leaf.constant + val[self] = leaf.val::T + else + feature[self] = leaf.feature + end - (id=self, order=one(I)) - end, - branch -> begin - self = (cursor[] += one(I)) - idx_self[self] = self - op[self] = branch.op - degree[self] = branch.degree + (id=self, order=one(I)) + end, + branch -> begin + self = (cursor[] += one(I)) + idx_self[self] = self + op[self] = branch.op + degree[self] = branch.degree - (id=self, order=one(I)) # this order is unused - end, - ((parent, children::Vararg{Any,M}) where {M}) -> begin - idx_l[parent.id] = children[1].id - if M == 2 - idx_r[parent.id] = children[2].id - end - parent_execution_order = if M == 1 - children[1].order + one(I) - else - max(children[1].order, children[2].order) + one(I) - end - execution_order[parent.id] = parent_execution_order + (id=self, order=one(I)) # this order is unused + end, + ((parent, children::Vararg{Any,C}) where {C}) -> begin + idx_l[parent.id] = children[1].id + if C == 2 + idx_r[parent.id] = children[2].id + end + parent_execution_order = if C == 1 + children[1].order + 
one(I) + else + max(children[1].order, children[2].order) + one(I) + end + execution_order[parent.id] = parent_execution_order - (id=parent.id, order=parent_execution_order) - end, - tree; - break_sharing=Val(true), - ) + (id=parent.id, order=parent_execution_order) + end, + tree; + break_sharing=Val(true), + ) + end - return (; degree, constant, val, feature, op, execution_order, idx_self, idx_l, idx_r) + return (; degree, constant, val, feature, op, execution_order, idx_self, idx_l, idx_r, roots) end end diff --git a/src/EvaluateEquation.jl b/src/EvaluateEquation.jl index 1848248a..e5a4cff8 100644 --- a/src/EvaluateEquation.jl +++ b/src/EvaluateEquation.jl @@ -1,6 +1,6 @@ module EvaluateEquationModule -import ..EquationModule: AbstractExpressionNode, constructorof, string_tree +import ..EquationModule: AbstractExpressionNode, with_type_parameters, string_tree import ..OperatorEnumModule: OperatorEnum, GenericOperatorEnum import ..UtilsModule: is_bad_array, fill_similar, counttuple, ResultOk import ..EquationUtilsModule: is_constant @@ -91,10 +91,19 @@ function eval_tree_array( ) where {T1<:Number,T2<:Number} T = promote_type(T1, T2) @warn "Warning: eval_tree_array received mixed types: tree=$(T1) and data=$(T2)." - tree = convert(constructorof(typeof(tree)){T}, tree) - cX = Base.Fix1(convert, T).(cX) + tree = convert(with_type_parameters(typeof(tree), T), tree) + cX = T.(cX) return eval_tree_array(tree, cX, operators; turbo, bumper) end +function eval_tree_array( + trees::NTuple{M,N}, + cX::AbstractMatrix{T}, + operators::OperatorEnum; + kws... +) where {T<:Number,N<:AbstractExpressionNode{T},M} + outs = ntuple(i -> eval_tree_array(trees[i], cX, operators; kws...)[1], Val(M)) + return ntuple(i -> first(outs[i]), Val(M)), ntuple(i -> last(outs[i]), Val(M)) +end get_nuna(::Type{<:OperatorEnum{B,U}}) where {B,U} = counttuple(U) get_nbin(::Type{<:OperatorEnum{B}}) where {B} = counttuple(B) From 81529af3c11f8368f9bdedbb8cd63b14d3edf4d5 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Mon, 15 Jan 2024 00:05:31 -0500 Subject: [PATCH 08/47] Working GPU kernel for expression evaluation! --- ext/DynamicExpressionsCUDAExt.jl | 166 ++++++++++++++++++------------- 1 file changed, 95 insertions(+), 71 deletions(-) diff --git a/ext/DynamicExpressionsCUDAExt.jl b/ext/DynamicExpressionsCUDAExt.jl index 284259cb..b798b5fc 100644 --- a/ext/DynamicExpressionsCUDAExt.jl +++ b/ext/DynamicExpressionsCUDAExt.jl @@ -1,16 +1,24 @@ module DynamicExpressionsCUDAExt #! format: off -using CUDA: threadIdx, blockIdx, @cuda, CuArray, cu +using CUDA using DynamicExpressions: OperatorEnum, AbstractExpressionNode using DynamicExpressions.EvaluateEquationModule: get_nbin, get_nuna using DynamicExpressions.AsArrayModule: as_array +import DynamicExpressions.EvaluateEquationModule: eval_tree_array -function eval_tree_gpu( - tree::AbstractExpressionNode{T}, gcX::CuArray{T,2}, operators::OperatorEnum -) - (; degree, constant, val, feature, op, execution_order, idx_self, idx_l, idx_r) = as_array(tree) +function eval_tree_array( + tree::AbstractExpressionNode{T}, gcX::CuArray{T,2}, operators::OperatorEnum; _... +) where {T<:Number} + (outs, is_good) = eval_tree_array((tree,), gcX, operators) + return only(outs), only(is_good) +end + +function eval_tree_array( + trees::NTuple{M,N}, gcX::CuArray{T,2}, operators::OperatorEnum; _... 
+) where {T<:Number,N<:AbstractExpressionNode{T},M} + (; degree, constant, val, feature, op, execution_order, idx_self, idx_l, idx_r, roots) = as_array(trees...; index_type=Int32) num_launches = maximum(execution_order) num_elem = size(gcX, 2) num_nodes = length(degree) @@ -20,15 +28,20 @@ function eval_tree_gpu( gexecution_order = cu(execution_order) gdegree = cu(degree) gconstant = cu(constant) - gval = cu(val) + gval = CuArray(val) gfeature = cu(feature) gop = cu(op) gidx_self = cu(idx_self) gidx_l = cu(idx_l) gidx_r = cu(idx_r) + groots = cu(roots) # Just for indexing output + + num_threads = 256 + num_blocks = nextpow(2, ceil(Int, num_elem * num_nodes / num_threads)) + _launch_gpu_kernel!( - Val(nextpow(2, num_elem * num_nodes)), num_launches, gbuffer, + num_blocks, num_launches, gbuffer, # Thread info: num_elem, num_nodes, gexecution_order, # Input data and tree @@ -36,91 +49,102 @@ function eval_tree_gpu( gdegree, gconstant, gval, gfeature, gop, ) - out = gbuffer[:, 1] - return (out, isfinite(sum(out .* zero(T)))) + out = ntuple(i -> gbuffer[:, groots[i]], Val(M)) + is_good = ntuple(i -> isfinite(sum(out[i] .* zero(T))), Val(M)) + return out, is_good end function _launch_gpu_kernel!( - ::Val{num_threads}, num_launches::Integer, buffer::CuArray{T,2}, + num_blocks, num_launches::Integer, buffer::CuArray{T,2}, # Thread info: num_elem::Integer, num_nodes::Integer, execution_order::CuArray{I}, # Input data and tree operators::OperatorEnum, cX::CuArray{T,2}, idx_self::CuArray, idx_l::CuArray, idx_r::CuArray, degree::CuArray, constant::CuArray, val::CuArray{T,1}, feature::CuArray, op::CuArray, -) where {num_threads,I,T} - for launch in I(1):I(num_launches) - execute = execution_order .== launch - @cuda( - threads = num_threads, - _gpu_kernel!( +) where {I,T} + nuna = get_nuna(typeof(operators)) + nbin = get_nbin(typeof(operators)) + (nuna > 10 || nbin > 10) && error("Too many operators. Kernels are only compiled up to 10.") + gpu_kernel! = create_gpu_kernel(operators, Val(nuna), Val(nbin)) + for launch in I.(1:num_launches) + CUDA.@sync begin + execute = execution_order .== launch + @cuda threads=256 blocks=num_blocks gpu_kernel!( buffer, num_elem, num_nodes, execute, - operators, cX, idx_self, idx_l, idx_r, + cX, idx_self, idx_l, idx_r, degree, constant, val, feature, op ) - ) + end end + return nothing end -@generated function _gpu_kernel!( - # Storage: - buffer::CuArray{T,2}, - # Thread info: - num_elem::Integer, num_nodes::Integer, execute::CuArray{Bool}, - # Input data and tree - operators::OperatorEnum, cX::CuArray{T,2}, idx_self::CuArray, idx_l::CuArray, idx_r::CuArray, - degree::CuArray, constant::CuArray, val::CuArray{T,1}, feature::CuArray, op::CuArray, -) where {T} - nbin = get_nbin(operators) - nuna = get_nuna(operators) - quote - i = threadIdx().x - if i > num_elem * num_nodes - return nothing - end - - node = (i - 1) % num_nodes + 1 - elem = (i - node) ÷ num_nodes + 1 +# Need to pre-compute the GPU kernels with an `@eval` for each number of operators +# 1. We need to use an `@nif` over operators, as GPU kernels +# can't index into arrays of operators. +# 2. `@nif` is evaluated at parse time and needs to know the number of +# ifs to generate at that time, so we can't simply use specialization. +# 3. We can't use `@generated` because we can't create closures in those. 
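+# (Editor's illustration, not part of the original commit; `ops` here is a
+# stand-in for `operators.unaops`/`operators.binops`.) With a literal `3`,
+# `Base.Cartesian.@nif(3, i -> i == cur_op, i -> ops[i](x))` expands to roughly
+# `cur_op == 1 ? ops[1](x) : cur_op == 2 ? ops[2](x) : ops[3](x)`,
+# so every tuple access uses a compile-time constant index and each operator's
+# concrete type is known inside the branch. That keeps the kernel type-stable
+# on the GPU, at the cost of generating one kernel body per operator count below.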
+for nuna in 0:10, nbin in 0:10 + @eval function create_gpu_kernel(operators::OperatorEnum, ::Val{$nuna}, ::Val{$nbin}) + function ( + # Storage: + buffer, + # Thread info: + num_elem::Integer, num_nodes::Integer, execute::AbstractArray, + # Input data and tree + cX::AbstractArray, idx_self::AbstractArray, idx_l::AbstractArray, idx_r::AbstractArray, + degree::AbstractArray, constant::AbstractArray, val::AbstractArray, feature::AbstractArray, op::AbstractArray, + ) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + if i > num_elem * num_nodes + return nothing + end - if !execute[node] - return nothing - end + node = (i - 1) % num_nodes + 1 + elem = (i - node) ÷ num_nodes + 1 - cur_degree = degree[node] - cur_idx = idx_self[node] - if cur_degree == 0 - if constant[node] - cur_val = val[node] - buffer[elem, cur_idx] = cur_val - else - cur_feature = feature[node] - buffer[elem, cur_idx] = cX[cur_feature, elem] + if !execute[node] + return nothing end - else - if cur_degree == 1 - cur_op = op[node] - l_idx = idx_l[node] - Base.Cartesian.@nif( - $nuna, - i -> i == cur_op, - i -> let op = operators.unaops[i] - buffer[elem, cur_idx] = op(buffer[elem, l_idx]) - end - ) + + cur_degree = degree[node] + cur_idx = idx_self[node] + if cur_degree == 0 + if constant[node] + cur_val = val[node] + buffer[elem, cur_idx] = cur_val + else + cur_feature = feature[node] + buffer[elem, cur_idx] = cX[cur_feature, elem] + end else - cur_op = op[node] - l_idx = idx_l[node] - r_idx = idx_r[node] - Base.Cartesian.@nif( - $nbin, - i -> i == cur_op, - i -> let op = operators.binops[i] - buffer[elem, cur_idx] = op(buffer[elem, l_idx], buffer[elem, r_idx]) - end - ) + if cur_degree == 1 + cur_op = op[node] + l_idx = idx_l[node] + Base.Cartesian.@nif( + $nuna, + i -> i == cur_op, + i -> let op = operators.unaops[i] + buffer[elem, cur_idx] = op(buffer[elem, l_idx]) + end + ) + else + cur_op = op[node] + l_idx = idx_l[node] + r_idx = idx_r[node] + Base.Cartesian.@nif( + $nbin, + i -> i == cur_op, + i -> let op = operators.binops[i] + buffer[elem, cur_idx] = op(buffer[elem, l_idx], buffer[elem, r_idx]) + end + ) + end end + return nothing end - return nothing end end From 07dec6bce9e1e9dea0e484b393938714a6f5e775 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Mon, 15 Jan 2024 00:07:47 -0500 Subject: [PATCH 09/47] Formatting --- src/AsArray.jl | 4 +++- src/EvaluateEquation.jl | 5 +---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/AsArray.jl b/src/AsArray.jl index b84a5aad..7c49ea71 100644 --- a/src/AsArray.jl +++ b/src/AsArray.jl @@ -65,7 +65,9 @@ function as_array( ) end - return (; degree, constant, val, feature, op, execution_order, idx_self, idx_l, idx_r, roots) + return (; + degree, constant, val, feature, op, execution_order, idx_self, idx_l, idx_r, roots + ) end end diff --git a/src/EvaluateEquation.jl b/src/EvaluateEquation.jl index e5a4cff8..4be61aaf 100644 --- a/src/EvaluateEquation.jl +++ b/src/EvaluateEquation.jl @@ -96,10 +96,7 @@ function eval_tree_array( return eval_tree_array(tree, cX, operators; turbo, bumper) end function eval_tree_array( - trees::NTuple{M,N}, - cX::AbstractMatrix{T}, - operators::OperatorEnum; - kws... + trees::NTuple{M,N}, cX::AbstractMatrix{T}, operators::OperatorEnum; kws... 
) where {T<:Number,N<:AbstractExpressionNode{T},M} outs = ntuple(i -> eval_tree_array(trees[i], cX, operators; kws...)[1], Val(M)) return ntuple(i -> first(outs[i]), Val(M)), ntuple(i -> last(outs[i]), Val(M)) From c19f37f7bd170a97685afdffda2cafa685cd2196 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Mon, 15 Jan 2024 00:08:37 -0500 Subject: [PATCH 10/47] Remove GPUArrays extension as redundant --- Project.toml | 3 -- ext/DynamicExpressionsGPUArraysExt.jl | 73 --------------------------- 2 files changed, 76 deletions(-) delete mode 100644 ext/DynamicExpressionsGPUArraysExt.jl diff --git a/Project.toml b/Project.toml index 983e5ee0..3158f7b1 100644 --- a/Project.toml +++ b/Project.toml @@ -17,7 +17,6 @@ TOML = "fa267f1f-6049-4f14-aa54-33bafae1ed76" [weakdeps] Bumper = "8ce10254-0962-460f-a3d8-1f77fea1446e" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" -GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890" SymbolicUtils = "d1185830-fcd6-423d-90d6-eec64667417b" Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" @@ -25,7 +24,6 @@ Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" [extensions] DynamicExpressionsBumperExt = "Bumper" DynamicExpressionsCUDAExt = "CUDA" -DynamicExpressionsGPUArraysExt = "GPUArrays" DynamicExpressionsLoopVectorizationExt = "LoopVectorization" DynamicExpressionsSymbolicUtilsExt = "SymbolicUtils" DynamicExpressionsZygoteExt = "Zygote" @@ -36,7 +34,6 @@ Bumper = "0.6" Compat = "3.37, 4" CUDA = "5" Enzyme = "^0.11.12" -GPUArrays = "8, 9, 10" LoopVectorization = "0.12" MacroTools = "0.4, 0.5" PackageExtensionCompat = "1" diff --git a/ext/DynamicExpressionsGPUArraysExt.jl b/ext/DynamicExpressionsGPUArraysExt.jl deleted file mode 100644 index 52d3fb93..00000000 --- a/ext/DynamicExpressionsGPUArraysExt.jl +++ /dev/null @@ -1,73 +0,0 @@ -module DynamicExpressionsGPUArraysExt - -using GPUArrays: AbstractDeviceArray -using DynamicExpressions: OperatorEnum, AbstractExpressionNode, tree_mapreduce -using DynamicExpressions.UtilsModule: counttuple - -import DynamicExpressions.EvaluateEquationModule: eval_tree_array - -function eval_tree_array( - tree::AbstractExpressionNode{T}, - cX::AbstractDeviceArray{T,2}, - operators::OperatorEnum; - _..., -) where {T<:Number} - result = tree_mapreduce( - # Leaf nodes, we create an allocation and fill - # it with the value of the leaf: - leaf -> begin - if leaf.constant - v = leaf.val::T - ar = similar(cX, axes(cX, 2)) - ar .= v - return ar - else - return cX[leaf.feature, :] - end - end, - # Branch nodes, we simply pass them to the evaluation kernel: - branch -> branch, - # In the evaluation kernel, we combine the branch nodes - # with the arrays created by the leaf nodes: - ((branch, cumulators::Vararg{Any,M}) where {M}) -> begin - if M == 1 - return dispatch_kern1!(operators.unaops, branch.op, cumulators[1]) - else - return dispatch_kern2!( - operators.binops, branch.op, cumulators[1], cumulators[2] - ) - end - end, - tree; - break_sharing=Val(true), - ) - return (result, isfinite(sum(result .* zero(T)))) -end -@generated function dispatch_kern1!(unaops, op_idx, cumulator) - nuna = counttuple(unaops) - quote - Base.@nif($nuna, i -> i == op_idx, i -> let op = unaops[i] - return kern1!(op, cumulator) - end,) - end -end -@generated function dispatch_kern2!(binops, op_idx, cumulator1, cumulator2) - nbin = counttuple(binops) - quote - Base.@nif( - $nbin, i -> i == op_idx, i -> let op = binops[i] - return kern2!(op, cumulator1, cumulator2) - end, - ) - end -end -function kern1!(op::F, 
cumulator) where {F} - @. cumulator = op(cumulator) - return cumulator -end -function kern2!(op::F, cumulator1, cumulator2) where {F} - @. cumulator1 = op(cumulator1, cumulator2) - return cumulator1 -end - -end From ade1613b1694105048ef7cbfd88b7e8e31f115f4 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Mon, 15 Jan 2024 00:59:51 -0500 Subject: [PATCH 11/47] Remove unneccessary sync call --- ext/DynamicExpressionsCUDAExt.jl | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/ext/DynamicExpressionsCUDAExt.jl b/ext/DynamicExpressionsCUDAExt.jl index b798b5fc..d50883a6 100644 --- a/ext/DynamicExpressionsCUDAExt.jl +++ b/ext/DynamicExpressionsCUDAExt.jl @@ -35,7 +35,6 @@ function eval_tree_array( gidx_l = cu(idx_l) gidx_r = cu(idx_r) - groots = cu(roots) # Just for indexing output num_threads = 256 num_blocks = nextpow(2, ceil(Int, num_elem * num_nodes / num_threads)) @@ -49,7 +48,7 @@ function eval_tree_array( gdegree, gconstant, gval, gfeature, gop, ) - out = ntuple(i -> gbuffer[:, groots[i]], Val(M)) + out = ntuple(i -> gbuffer[:, roots[i]], Val(M)) is_good = ntuple(i -> isfinite(sum(out[i] .* zero(T))), Val(M)) return out, is_good end @@ -67,15 +66,13 @@ function _launch_gpu_kernel!( (nuna > 10 || nbin > 10) && error("Too many operators. Kernels are only compiled up to 10.") gpu_kernel! = create_gpu_kernel(operators, Val(nuna), Val(nbin)) for launch in I.(1:num_launches) - CUDA.@sync begin - execute = execution_order .== launch - @cuda threads=256 blocks=num_blocks gpu_kernel!( - buffer, - num_elem, num_nodes, execute, - cX, idx_self, idx_l, idx_r, - degree, constant, val, feature, op - ) - end + execute = execution_order .== launch + @cuda threads=256 blocks=num_blocks gpu_kernel!( + buffer, + num_elem, num_nodes, execute, + cX, idx_self, idx_l, idx_r, + degree, constant, val, feature, op + ) end return nothing end From 53b42ceda3bc385ba784f6de8f1152327e3eb231 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Mon, 15 Jan 2024 01:40:29 -0500 Subject: [PATCH 12/47] Reduce number of allocations --- ext/DynamicExpressionsCUDAExt.jl | 35 ++++++++++++++++---------------- src/AsArray.jl | 4 +--- 2 files changed, 19 insertions(+), 20 deletions(-) diff --git a/ext/DynamicExpressionsCUDAExt.jl b/ext/DynamicExpressionsCUDAExt.jl index d50883a6..9f206e08 100644 --- a/ext/DynamicExpressionsCUDAExt.jl +++ b/ext/DynamicExpressionsCUDAExt.jl @@ -12,29 +12,30 @@ function eval_tree_array( tree::AbstractExpressionNode{T}, gcX::CuArray{T,2}, operators::OperatorEnum; _... ) where {T<:Number} (outs, is_good) = eval_tree_array((tree,), gcX, operators) - return only(outs), only(is_good) + return (only(outs), only(is_good)) end function eval_tree_array( trees::NTuple{M,N}, gcX::CuArray{T,2}, operators::OperatorEnum; _... ) where {T<:Number,N<:AbstractExpressionNode{T},M} - (; degree, constant, val, feature, op, execution_order, idx_self, idx_l, idx_r, roots) = as_array(trees...; index_type=Int32) + (; degree, constant, val, feature, op, execution_order, idx_self, idx_l, idx_r, roots) = as_array(Int32, trees...) 
num_launches = maximum(execution_order) num_elem = size(gcX, 2) num_nodes = length(degree) # Convert everything to GPU: gbuffer = CuArray{T}(undef, num_elem, num_nodes) - gexecution_order = cu(execution_order) - gdegree = cu(degree) - gconstant = cu(constant) + gexecution_order = CuArray(execution_order) + gdegree = CuArray(degree) + gconstant = CuArray(constant) gval = CuArray(val) - gfeature = cu(feature) - gop = cu(op) - gidx_self = cu(idx_self) - gidx_l = cu(idx_l) - gidx_r = cu(idx_r) + gfeature = CuArray(feature) + gop = CuArray(op) + gidx_self = CuArray(idx_self) + gidx_l = CuArray(idx_l) + gidx_r = CuArray(idx_r) + gexecute = CUDA.zeros(Bool, num_nodes) num_threads = 256 num_blocks = nextpow(2, ceil(Int, num_elem * num_nodes / num_threads)) @@ -42,21 +43,21 @@ function eval_tree_array( _launch_gpu_kernel!( num_blocks, num_launches, gbuffer, # Thread info: - num_elem, num_nodes, gexecution_order, + num_elem, num_nodes, gexecute, gexecution_order, # Input data and tree operators, gcX, gidx_self, gidx_l, gidx_r, gdegree, gconstant, gval, gfeature, gop, ) - out = ntuple(i -> gbuffer[:, roots[i]], Val(M)) - is_good = ntuple(i -> isfinite(sum(out[i] .* zero(T))), Val(M)) - return out, is_good + out = ntuple(i -> (@view gbuffer[:, roots[i]]), Val(M)) + is_good = ntuple(i -> true, Val(M)) # Up to user to find NaNs + return (out, is_good) end function _launch_gpu_kernel!( num_blocks, num_launches::Integer, buffer::CuArray{T,2}, # Thread info: - num_elem::Integer, num_nodes::Integer, execution_order::CuArray{I}, + num_elem::Integer, num_nodes::Integer, execute::CuArray{Bool}, execution_order::CuArray{I}, # Input data and tree operators::OperatorEnum, cX::CuArray{T,2}, idx_self::CuArray, idx_l::CuArray, idx_r::CuArray, degree::CuArray, constant::CuArray, val::CuArray{T,1}, feature::CuArray, op::CuArray, @@ -65,8 +66,8 @@ function _launch_gpu_kernel!( nbin = get_nbin(typeof(operators)) (nuna > 10 || nbin > 10) && error("Too many operators. Kernels are only compiled up to 10.") gpu_kernel! = create_gpu_kernel(operators, Val(nuna), Val(nbin)) - for launch in I.(1:num_launches) - execute = execution_order .== launch + for launch in one(I):I(num_launches) + @. execute = execution_order .== launch @cuda threads=256 blocks=num_blocks gpu_kernel!( buffer, num_elem, num_nodes, execute, diff --git a/src/AsArray.jl b/src/AsArray.jl index 7c49ea71..0c65897b 100644 --- a/src/AsArray.jl +++ b/src/AsArray.jl @@ -2,9 +2,7 @@ module AsArrayModule using ..EquationModule: AbstractExpressionNode, tree_mapreduce -function as_array( - trees::Vararg{N,M}; index_type::Type{I}=Int64 -) where {T,N<:AbstractExpressionNode{T},I,M} +function as_array(::Type{I}, trees::Vararg{N,M}) where {T,N<:AbstractExpressionNode{T},I,M} each_num_nodes = length.(trees) num_nodes = sum(each_num_nodes) From cdfcec2f4e2912ffb2a3a538e0832866761c527b Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Mon, 15 Jan 2024 02:02:55 -0500 Subject: [PATCH 13/47] Greatly improve allocations --- ext/DynamicExpressionsCUDAExt.jl | 60 ++++++++++++++++++-------------- src/AsArray.jl | 41 ++++++++++++++++------ 2 files changed, 65 insertions(+), 36 deletions(-) diff --git a/ext/DynamicExpressionsCUDAExt.jl b/ext/DynamicExpressionsCUDAExt.jl index 9f206e08..509375e3 100644 --- a/ext/DynamicExpressionsCUDAExt.jl +++ b/ext/DynamicExpressionsCUDAExt.jl @@ -18,59 +18,67 @@ end function eval_tree_array( trees::NTuple{M,N}, gcX::CuArray{T,2}, operators::OperatorEnum; _... 
) where {T<:Number,N<:AbstractExpressionNode{T},M} - (; degree, constant, val, feature, op, execution_order, idx_self, idx_l, idx_r, roots) = as_array(Int32, trees...) + (; constant, val, execution_order, roots, buffer) = as_array(UInt16, trees...) num_launches = maximum(execution_order) num_elem = size(gcX, 2) - num_nodes = length(degree) + num_nodes = size(buffer, 2) - # Convert everything to GPU: - gbuffer = CuArray{T}(undef, num_elem, num_nodes) - gexecution_order = CuArray(execution_order) - gdegree = CuArray(degree) - gconstant = CuArray(constant) + ## Floating point arrays: + workspace = CuArray{T}(undef, num_elem, num_nodes) gval = CuArray(val) - gfeature = CuArray(feature) - gop = CuArray(op) - gidx_self = CuArray(idx_self) - gidx_l = CuArray(idx_l) - gidx_r = CuArray(idx_r) - gexecute = CUDA.zeros(Bool, num_nodes) + ## Bool arrays + gconstant = CuArray(constant) + + ## Index arrays (much faster to have `@view` here) + _gbuffer = CuArray(buffer) + gdegree = @view _gbuffer[1, :] + gfeature = @view _gbuffer[2, :] + gop = @view _gbuffer[3, :] + gexecution_order = @view _gbuffer[4, :] + gidx_self = @view _gbuffer[5, :] + gidx_l = @view _gbuffer[6, :] + gidx_r = @view _gbuffer[7, :] num_threads = 256 num_blocks = nextpow(2, ceil(Int, num_elem * num_nodes / num_threads)) _launch_gpu_kernel!( - num_blocks, num_launches, gbuffer, + num_threads, num_blocks, num_launches, workspace, # Thread info: - num_elem, num_nodes, gexecute, gexecution_order, + num_elem, num_nodes, gexecution_order, # Input data and tree operators, gcX, gidx_self, gidx_l, gidx_r, gdegree, gconstant, gval, gfeature, gop, ) - out = ntuple(i -> (@view gbuffer[:, roots[i]]), Val(M)) - is_good = ntuple(i -> true, Val(M)) # Up to user to find NaNs + out = ntuple( + i -> (@view workspace[:, roots[i]]), + Val(M) + ) + is_good = ntuple( + i -> true, # Up to user to find NaNs + Val(M) + ) return (out, is_good) end function _launch_gpu_kernel!( - num_blocks, num_launches::Integer, buffer::CuArray{T,2}, + num_threads, num_blocks, num_launches::Integer, buffer::AbstractArray{T,2}, # Thread info: - num_elem::Integer, num_nodes::Integer, execute::CuArray{Bool}, execution_order::CuArray{I}, + num_elem::Integer, num_nodes::Integer, execution_order::AbstractArray{I}, # Input data and tree - operators::OperatorEnum, cX::CuArray{T,2}, idx_self::CuArray, idx_l::CuArray, idx_r::CuArray, - degree::CuArray, constant::CuArray, val::CuArray{T,1}, feature::CuArray, op::CuArray, + operators::OperatorEnum, cX::AbstractArray{T,2}, idx_self::AbstractArray, idx_l::AbstractArray, idx_r::AbstractArray, + degree::AbstractArray, constant::AbstractArray, val::AbstractArray{T,1}, feature::AbstractArray, op::AbstractArray, ) where {I,T} nuna = get_nuna(typeof(operators)) nbin = get_nbin(typeof(operators)) (nuna > 10 || nbin > 10) && error("Too many operators. Kernels are only compiled up to 10.") gpu_kernel! = create_gpu_kernel(operators, Val(nuna), Val(nbin)) for launch in one(I):I(num_launches) - @. 
execute = execution_order .== launch - @cuda threads=256 blocks=num_blocks gpu_kernel!( + @cuda threads=num_threads blocks=num_blocks gpu_kernel!( buffer, - num_elem, num_nodes, execute, + launch, num_elem, num_nodes, execution_order, cX, idx_self, idx_l, idx_r, degree, constant, val, feature, op ) @@ -90,7 +98,7 @@ for nuna in 0:10, nbin in 0:10 # Storage: buffer, # Thread info: - num_elem::Integer, num_nodes::Integer, execute::AbstractArray, + launch::Integer, num_elem::Integer, num_nodes::Integer, execution_order::AbstractArray, # Input data and tree cX::AbstractArray, idx_self::AbstractArray, idx_l::AbstractArray, idx_r::AbstractArray, degree::AbstractArray, constant::AbstractArray, val::AbstractArray, feature::AbstractArray, op::AbstractArray, @@ -103,7 +111,7 @@ for nuna in 0:10, nbin in 0:10 node = (i - 1) % num_nodes + 1 elem = (i - node) ÷ num_nodes + 1 - if !execute[node] + if execution_order[node] != launch return nothing end diff --git a/src/AsArray.jl b/src/AsArray.jl index 0c65897b..e687d572 100644 --- a/src/AsArray.jl +++ b/src/AsArray.jl @@ -6,17 +6,28 @@ function as_array(::Type{I}, trees::Vararg{N,M}) where {T,N<:AbstractExpressionN each_num_nodes = length.(trees) num_nodes = sum(each_num_nodes) - degree = Array{fieldtype(N, :degree)}(undef, num_nodes) - constant = Array{fieldtype(N, :constant)}(undef, num_nodes) + roots = Array{I}(undef, M) + + constant = Array{Bool}(undef, num_nodes) + val = Array{T}(undef, num_nodes) - feature = Array{fieldtype(N, :feature)}(undef, num_nodes) - op = Array{fieldtype(N, :op)}(undef, num_nodes) - execution_order = Array{I}(undef, num_nodes) - idx_self = Array{I}(undef, num_nodes) - idx_l = Array{I}(undef, num_nodes) - idx_r = Array{I}(undef, num_nodes) - roots = Array{I}(undef, length(trees)) + # degree = Array{I}(undef, num_nodes) + # feature = Array{I}(undef, num_nodes) + # op = Array{I}(undef, num_nodes) + # execution_order = Array{I}(undef, num_nodes) + # idx_self = Array{I}(undef, num_nodes) + # idx_l = Array{I}(undef, num_nodes) + # idx_r = Array{I}(undef, num_nodes) + ## Views of the same matrix: + buffer = Array{I}(undef, 7, num_nodes) + degree = @view buffer[1, :] + feature = @view buffer[2, :] + op = @view buffer[3, :] + execution_order = @view buffer[4, :] + idx_self = @view buffer[5, :] + idx_l = @view buffer[6, :] + idx_r = @view buffer[7, :] cursor = Ref(zero(I)) for (i_tree, tree) in enumerate(trees) @@ -64,7 +75,17 @@ function as_array(::Type{I}, trees::Vararg{N,M}) where {T,N<:AbstractExpressionN end return (; - degree, constant, val, feature, op, execution_order, idx_self, idx_l, idx_r, roots + degree, + constant, + val, + feature, + op, + execution_order, + idx_self, + idx_l, + idx_r, + roots, + buffer, ) end From 20a8f389cdff8628dafb13222fd66711ea45292d Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Mon, 15 Jan 2024 02:30:48 -0500 Subject: [PATCH 14/47] More memory allocation improvements --- ext/DynamicExpressionsCUDAExt.jl | 31 +++++++++++++++---------------- src/AsArray.jl | 12 ++---------- 2 files changed, 17 insertions(+), 26 deletions(-) diff --git a/ext/DynamicExpressionsCUDAExt.jl b/ext/DynamicExpressionsCUDAExt.jl index 509375e3..8ee215c7 100644 --- a/ext/DynamicExpressionsCUDAExt.jl +++ b/ext/DynamicExpressionsCUDAExt.jl @@ -18,33 +18,31 @@ end function eval_tree_array( trees::NTuple{M,N}, gcX::CuArray{T,2}, operators::OperatorEnum; _... ) where {T<:Number,N<:AbstractExpressionNode{T},M} - (; constant, val, execution_order, roots, buffer) = as_array(UInt16, trees...) 
+ (; val, execution_order, roots, buffer) = as_array(Int32, trees...) num_launches = maximum(execution_order) num_elem = size(gcX, 2) num_nodes = size(buffer, 2) ## Floating point arrays: - workspace = CuArray{T}(undef, num_elem, num_nodes) + gworkspace = CuArray{T}(undef, num_elem, num_nodes) gval = CuArray(val) - ## Bool arrays - gconstant = CuArray(constant) - ## Index arrays (much faster to have `@view` here) - _gbuffer = CuArray(buffer) - gdegree = @view _gbuffer[1, :] - gfeature = @view _gbuffer[2, :] - gop = @view _gbuffer[3, :] - gexecution_order = @view _gbuffer[4, :] - gidx_self = @view _gbuffer[5, :] - gidx_l = @view _gbuffer[6, :] - gidx_r = @view _gbuffer[7, :] + gbuffer = CuArray(buffer) + gdegree = @view gbuffer[1, :] + gfeature = @view gbuffer[2, :] + gop = @view gbuffer[3, :] + gexecution_order = @view gbuffer[4, :] + gidx_self = @view gbuffer[5, :] + gidx_l = @view gbuffer[6, :] + gidx_r = @view gbuffer[7, :] + gconstant = @view gbuffer[8, :] num_threads = 256 num_blocks = nextpow(2, ceil(Int, num_elem * num_nodes / num_threads)) _launch_gpu_kernel!( - num_threads, num_blocks, num_launches, workspace, + num_threads, num_blocks, num_launches, gworkspace, # Thread info: num_elem, num_nodes, gexecution_order, # Input data and tree @@ -53,13 +51,14 @@ function eval_tree_array( ) out = ntuple( - i -> (@view workspace[:, roots[i]]), + i -> @view(gworkspace[:, roots[i]]), Val(M) ) is_good = ntuple( i -> true, # Up to user to find NaNs Val(M) ) + return (out, is_good) end @@ -118,7 +117,7 @@ for nuna in 0:10, nbin in 0:10 cur_degree = degree[node] cur_idx = idx_self[node] if cur_degree == 0 - if constant[node] + if constant[node] == 1 cur_val = val[node] buffer[elem, cur_idx] = cur_val else diff --git a/src/AsArray.jl b/src/AsArray.jl index e687d572..836f180a 100644 --- a/src/AsArray.jl +++ b/src/AsArray.jl @@ -8,19 +8,10 @@ function as_array(::Type{I}, trees::Vararg{N,M}) where {T,N<:AbstractExpressionN roots = Array{I}(undef, M) - constant = Array{Bool}(undef, num_nodes) - val = Array{T}(undef, num_nodes) - # degree = Array{I}(undef, num_nodes) - # feature = Array{I}(undef, num_nodes) - # op = Array{I}(undef, num_nodes) - # execution_order = Array{I}(undef, num_nodes) - # idx_self = Array{I}(undef, num_nodes) - # idx_l = Array{I}(undef, num_nodes) - # idx_r = Array{I}(undef, num_nodes) ## Views of the same matrix: - buffer = Array{I}(undef, 7, num_nodes) + buffer = Array{I}(undef, 8, num_nodes) degree = @view buffer[1, :] feature = @view buffer[2, :] op = @view buffer[3, :] @@ -28,6 +19,7 @@ function as_array(::Type{I}, trees::Vararg{N,M}) where {T,N<:AbstractExpressionN idx_self = @view buffer[5, :] idx_l = @view buffer[6, :] idx_r = @view buffer[7, :] + constant = @view buffer[8, :] cursor = Ref(zero(I)) for (i_tree, tree) in enumerate(trees) From 6bbff73059c9334c926f8b3576b0c2bac146eb93 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Mon, 15 Jan 2024 11:37:17 -0500 Subject: [PATCH 15/47] Allow user to pass preallocated buffer --- Project.toml | 2 +- ext/DynamicExpressionsCUDAExt.jl | 25 +++++++++++++++++-------- src/AsArray.jl | 7 +++++-- 3 files changed, 23 insertions(+), 11 deletions(-) diff --git a/Project.toml b/Project.toml index 3158f7b1..fb3f8746 100644 --- a/Project.toml +++ b/Project.toml @@ -31,8 +31,8 @@ DynamicExpressionsZygoteExt = "Zygote" [compat] Aqua = "0.7" Bumper = "0.6" -Compat = "3.37, 4" CUDA = "5" +Compat = "3.37, 4" Enzyme = "^0.11.12" LoopVectorization = "0.12" MacroTools = "0.4, 0.5" diff --git a/ext/DynamicExpressionsCUDAExt.jl 
b/ext/DynamicExpressionsCUDAExt.jl index 8ee215c7..e873c8af 100644 --- a/ext/DynamicExpressionsCUDAExt.jl +++ b/ext/DynamicExpressionsCUDAExt.jl @@ -9,26 +9,35 @@ using DynamicExpressions.AsArrayModule: as_array import DynamicExpressions.EvaluateEquationModule: eval_tree_array function eval_tree_array( - tree::AbstractExpressionNode{T}, gcX::CuArray{T,2}, operators::OperatorEnum; _... + tree::AbstractExpressionNode{T}, + gcX::CuArray{T,2}, + operators::OperatorEnum; + kws... ) where {T<:Number} - (outs, is_good) = eval_tree_array((tree,), gcX, operators) + (outs, is_good) = eval_tree_array((tree,), gcX, operators; kws...) return (only(outs), only(is_good)) end function eval_tree_array( - trees::NTuple{M,N}, gcX::CuArray{T,2}, operators::OperatorEnum; _... + trees::NTuple{M,N}, + gcX::CuArray{T,2}, + operators::OperatorEnum; + buffer=nothing, + gpu_workspace=nothing, + gpu_buffer=nothing, ) where {T<:Number,N<:AbstractExpressionNode{T},M} - (; val, execution_order, roots, buffer) = as_array(Int32, trees...) + (; val, execution_order, roots, buffer, num_nodes) = as_array(Int32, trees...; buffer) num_launches = maximum(execution_order) num_elem = size(gcX, 2) - num_nodes = size(buffer, 2) ## Floating point arrays: - gworkspace = CuArray{T}(undef, num_elem, num_nodes) - gval = CuArray(val) + gworkspace = gpu_workspace === nothing ? CuArray{T}(undef, num_elem, num_nodes + 1) : gpu_workspace + # gval = CuArray(val) + gval = @view gworkspace[:, end] + copyto!(gval, val) ## Index arrays (much faster to have `@view` here) - gbuffer = CuArray(buffer) + gbuffer = gpu_buffer === nothing ? CuArray(buffer) : copyto!(gpu_buffer, buffer) gdegree = @view gbuffer[1, :] gfeature = @view gbuffer[2, :] gop = @view gbuffer[3, :] diff --git a/src/AsArray.jl b/src/AsArray.jl index 836f180a..df450424 100644 --- a/src/AsArray.jl +++ b/src/AsArray.jl @@ -2,7 +2,9 @@ module AsArrayModule using ..EquationModule: AbstractExpressionNode, tree_mapreduce -function as_array(::Type{I}, trees::Vararg{N,M}) where {T,N<:AbstractExpressionNode{T},I,M} +function as_array( + ::Type{I}, trees::Vararg{N,M}; buffer::Union{AbstractArray{I},Nothing}=nothing +) where {T,N<:AbstractExpressionNode{T},I,M} each_num_nodes = length.(trees) num_nodes = sum(each_num_nodes) @@ -11,7 +13,7 @@ function as_array(::Type{I}, trees::Vararg{N,M}) where {T,N<:AbstractExpressionN val = Array{T}(undef, num_nodes) ## Views of the same matrix: - buffer = Array{I}(undef, 8, num_nodes) + buffer = buffer === nothing ? 
Array{I}(undef, 8, num_nodes) : buffer degree = @view buffer[1, :] feature = @view buffer[2, :] op = @view buffer[3, :] @@ -78,6 +80,7 @@ function as_array(::Type{I}, trees::Vararg{N,M}) where {T,N<:AbstractExpressionN idx_r, roots, buffer, + num_nodes, ) end From 03d2d24d2aab52b946fc5e1eafe55196b39052e0 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Mon, 15 Jan 2024 14:44:23 -0500 Subject: [PATCH 16/47] Try to fix unbound type parameter --- src/AsArray.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/AsArray.jl b/src/AsArray.jl index df450424..cce00f88 100644 --- a/src/AsArray.jl +++ b/src/AsArray.jl @@ -3,7 +3,7 @@ module AsArrayModule using ..EquationModule: AbstractExpressionNode, tree_mapreduce function as_array( - ::Type{I}, trees::Vararg{N,M}; buffer::Union{AbstractArray{I},Nothing}=nothing + ::Type{I}, trees::Vararg{N,M}; buffer::Union{AbstractArray,Nothing}=nothing ) where {T,N<:AbstractExpressionNode{T},I,M} each_num_nodes = length.(trees) num_nodes = sum(each_num_nodes) From 2e128bb4bdc6ca9645d8f7388a7e44ed338ca9d3 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Mon, 15 Jan 2024 14:48:35 -0500 Subject: [PATCH 17/47] Expand CUDA compat to 4 --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index fb3f8746..d6724f76 100644 --- a/Project.toml +++ b/Project.toml @@ -31,7 +31,7 @@ DynamicExpressionsZygoteExt = "Zygote" [compat] Aqua = "0.7" Bumper = "0.6" -CUDA = "5" +CUDA = "4, 5" Compat = "3.37, 4" Enzyme = "^0.11.12" LoopVectorization = "0.12" From da31b8757ecd4bcd4dbc7d4981494c65d7050f0f Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Tue, 16 Jan 2024 04:14:48 -0500 Subject: [PATCH 18/47] Clean up formatting --- ext/DynamicExpressionsCUDAExt.jl | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/ext/DynamicExpressionsCUDAExt.jl b/ext/DynamicExpressionsCUDAExt.jl index e873c8af..b053b351 100644 --- a/ext/DynamicExpressionsCUDAExt.jl +++ b/ext/DynamicExpressionsCUDAExt.jl @@ -1,5 +1,4 @@ module DynamicExpressionsCUDAExt -#! format: off using CUDA using DynamicExpressions: OperatorEnum, AbstractExpressionNode @@ -9,10 +8,7 @@ using DynamicExpressions.AsArrayModule: as_array import DynamicExpressions.EvaluateEquationModule: eval_tree_array function eval_tree_array( - tree::AbstractExpressionNode{T}, - gcX::CuArray{T,2}, - operators::OperatorEnum; - kws... + tree::AbstractExpressionNode{T}, gcX::CuArray{T,2}, operators::OperatorEnum; kws... ) where {T<:Number} (outs, is_good) = eval_tree_array((tree,), gcX, operators; kws...) return (only(outs), only(is_good)) @@ -31,8 +27,11 @@ function eval_tree_array( num_elem = size(gcX, 2) ## Floating point arrays: - gworkspace = gpu_workspace === nothing ? CuArray{T}(undef, num_elem, num_nodes + 1) : gpu_workspace - # gval = CuArray(val) + gworkspace = if gpu_workspace === nothing + CuArray{T}(undef, num_elem, num_nodes + 1) + else + gpu_workspace + end gval = @view gworkspace[:, end] copyto!(gval, val) @@ -50,6 +49,7 @@ function eval_tree_array( num_threads = 256 num_blocks = nextpow(2, ceil(Int, num_elem * num_nodes / num_threads)) + #! format: off _launch_gpu_kernel!( num_threads, num_blocks, num_launches, gworkspace, # Thread info: @@ -58,19 +58,18 @@ function eval_tree_array( operators, gcX, gidx_self, gidx_l, gidx_r, gdegree, gconstant, gval, gfeature, gop, ) + #! 
format: on - out = ntuple( - i -> @view(gworkspace[:, roots[i]]), - Val(M) - ) + out = ntuple(i -> @view(gworkspace[:, roots[i]]), Val(M)) is_good = ntuple( i -> true, # Up to user to find NaNs - Val(M) + Val(M), ) return (out, is_good) end +#! format: off function _launch_gpu_kernel!( num_threads, num_blocks, num_launches::Integer, buffer::AbstractArray{T,2}, # Thread info: @@ -79,17 +78,21 @@ function _launch_gpu_kernel!( operators::OperatorEnum, cX::AbstractArray{T,2}, idx_self::AbstractArray, idx_l::AbstractArray, idx_r::AbstractArray, degree::AbstractArray, constant::AbstractArray, val::AbstractArray{T,1}, feature::AbstractArray, op::AbstractArray, ) where {I,T} + #! format: on nuna = get_nuna(typeof(operators)) nbin = get_nbin(typeof(operators)) - (nuna > 10 || nbin > 10) && error("Too many operators. Kernels are only compiled up to 10.") + (nuna > 10 || nbin > 10) && + error("Too many operators. Kernels are only compiled up to 10.") gpu_kernel! = create_gpu_kernel(operators, Val(nuna), Val(nbin)) for launch in one(I):I(num_launches) + #! format: off @cuda threads=num_threads blocks=num_blocks gpu_kernel!( buffer, launch, num_elem, num_nodes, execution_order, cX, idx_self, idx_l, idx_r, degree, constant, val, feature, op ) + #! format: on end return nothing end @@ -102,6 +105,7 @@ end # 3. We can't use `@generated` because we can't create closures in those. for nuna in 0:10, nbin in 0:10 @eval function create_gpu_kernel(operators::OperatorEnum, ::Val{$nuna}, ::Val{$nbin}) + #! format: off function ( # Storage: buffer, @@ -111,6 +115,7 @@ for nuna in 0:10, nbin in 0:10 cX::AbstractArray, idx_self::AbstractArray, idx_l::AbstractArray, idx_r::AbstractArray, degree::AbstractArray, constant::AbstractArray, val::AbstractArray, feature::AbstractArray, op::AbstractArray, ) + #! format: on i = (blockIdx().x - 1) * blockDim().x + threadIdx().x if i > num_elem * num_nodes return nothing @@ -162,5 +167,4 @@ for nuna in 0:10, nbin in 0:10 end end -#! format: on end From 4fdc025cbca493cfbd1c861b4130e1b57090317e Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Tue, 16 Jan 2024 04:25:20 -0500 Subject: [PATCH 19/47] Reduce allocations in `as_array` --- src/AsArray.jl | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/AsArray.jl b/src/AsArray.jl index cce00f88..aaad05b0 100644 --- a/src/AsArray.jl +++ b/src/AsArray.jl @@ -8,7 +8,7 @@ function as_array( each_num_nodes = length.(trees) num_nodes = sum(each_num_nodes) - roots = Array{I}(undef, M) + roots = tuple(one(I), cumsum(each_num_nodes[1:end-1])...) val = Array{T}(undef, num_nodes) @@ -24,8 +24,7 @@ function as_array( constant = @view buffer[8, :] cursor = Ref(zero(I)) - for (i_tree, tree) in enumerate(trees) - roots[i_tree] = cursor[] + one(I) + for tree in trees tree_mapreduce( leaf -> begin self = (cursor[] += one(I)) From c64a6cbb01664dac528b3146f202c24407b11e8f Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Tue, 16 Jan 2024 04:27:55 -0500 Subject: [PATCH 20/47] Fix indexing --- src/AsArray.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/AsArray.jl b/src/AsArray.jl index aaad05b0..d5a1884a 100644 --- a/src/AsArray.jl +++ b/src/AsArray.jl @@ -8,7 +8,7 @@ function as_array( each_num_nodes = length.(trees) num_nodes = sum(each_num_nodes) - roots = tuple(one(I), cumsum(each_num_nodes[1:end-1])...) 
+    roots = cumsum(tuple(one(I), each_num_nodes[1:end-1]...))

     val = Array{T}(undef, num_nodes)

From 88ee6cf8b6b00c41335d1c79c5aa8627071e3fdb Mon Sep 17 00:00:00 2001
From: MilesCranmer
Date: Tue, 16 Jan 2024 18:54:47 -0500
Subject: [PATCH 21/47] Compatible CUDA kernel with empty operators

---
 ext/DynamicExpressionsCUDAExt.jl | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/ext/DynamicExpressionsCUDAExt.jl b/ext/DynamicExpressionsCUDAExt.jl
index b053b351..1d6c7b19 100644
--- a/ext/DynamicExpressionsCUDAExt.jl
+++ b/ext/DynamicExpressionsCUDAExt.jl
@@ -21,6 +21,7 @@ function eval_tree_array(
     buffer=nothing,
     gpu_workspace=nothing,
     gpu_buffer=nothing,
+    kws...
 ) where {T<:Number,N<:AbstractExpressionNode{T},M}
     (; val, execution_order, roots, buffer, num_nodes) = as_array(Int32, trees...; buffer)
     num_launches = maximum(execution_order)
@@ -139,7 +140,7 @@ for nuna in 0:10, nbin in 0:10
                     buffer[elem, cur_idx] = cX[cur_feature, elem]
                 end
             else
-                if cur_degree == 1
+                if cur_degree == 1 && $nuna > 0
                     cur_op = op[node]
                     l_idx = idx_l[node]
                     Base.Cartesian.@nif(
@@ -149,7 +150,7 @@ for nuna in 0:10, nbin in 0:10
                             buffer[elem, cur_idx] = op(buffer[elem, l_idx])
                         end
                     )
-                else
+                elseif $nbin > 0 # Note this check is to avoid type inference issues when binops is empty
                     cur_op = op[node]
                     l_idx = idx_l[node]
                     r_idx = idx_r[node]

From 243bdda7c785d22d297f3c4b9de96fe37f39f0ca Mon Sep 17 00:00:00 2001
From: MilesCranmer
Date: Sun, 28 Jan 2024 20:38:12 +0000
Subject: [PATCH 22/47] Formatting

---
 ext/DynamicExpressionsCUDAExt.jl | 2 +-
 src/AsArray.jl                   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/ext/DynamicExpressionsCUDAExt.jl b/ext/DynamicExpressionsCUDAExt.jl
index 1d6c7b19..ddb97bf8 100644
--- a/ext/DynamicExpressionsCUDAExt.jl
+++ b/ext/DynamicExpressionsCUDAExt.jl
@@ -21,7 +21,7 @@ function eval_tree_array(
     buffer=nothing,
     gpu_workspace=nothing,
     gpu_buffer=nothing,
-    kws...
+    kws...,
 ) where {T<:Number,N<:AbstractExpressionNode{T},M}
     (; val, execution_order, roots, buffer, num_nodes) = as_array(Int32, trees...; buffer)
     num_launches = maximum(execution_order)
diff --git a/src/AsArray.jl b/src/AsArray.jl
index d5a1884a..dc5d9b2b 100644
--- a/src/AsArray.jl
+++ b/src/AsArray.jl
@@ -8,7 +8,7 @@ function as_array(
     each_num_nodes = length.(trees)
     num_nodes = sum(each_num_nodes)

-    roots = cumsum(tuple(one(I), each_num_nodes[1:end-1]...))
+    roots = cumsum(tuple(one(I), each_num_nodes[1:(end - 1)]...))

     val = Array{T}(undef, num_nodes)

From b1aca681879f6bb07730184e3f3b68d9a259c5bb Mon Sep 17 00:00:00 2001
From: MilesCranmer
Date: Sun, 25 Feb 2024 20:23:54 +0000
Subject: [PATCH 23/47] Ensure we always check root during array construction

---
 src/AsArray.jl | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/AsArray.jl b/src/AsArray.jl
index dc5d9b2b..13b59507 100644
--- a/src/AsArray.jl
+++ b/src/AsArray.jl
@@ -1,11 +1,11 @@
 module AsArrayModule

-using ..EquationModule: AbstractExpressionNode, tree_mapreduce
+using ..EquationModule: AbstractExpressionNode, tree_mapreduce, count_nodes

 function as_array(
     ::Type{I}, trees::Vararg{N,M}; buffer::Union{AbstractArray,Nothing}=nothing
 ) where {T,N<:AbstractExpressionNode{T},I,M}
-    each_num_nodes = length.(trees)
+    each_num_nodes = (t -> count_nodes(t; break_sharing=Val(true))).(trees)
     num_nodes = sum(each_num_nodes)
@@ -24,7 +24,8 @@ function as_array(
     constant = @view buffer[8, :]

     cursor = Ref(zero(I))
-    for tree in trees
+    for (root, tree) in zip(roots, trees)
+        @assert root == cursor[] + 1
         tree_mapreduce(
             leaf -> begin
                 self = (cursor[] += one(I))

From 9372150d8b857f629ede4879a69f50029b6158bd Mon Sep 17 00:00:00 2001
From: MilesCranmer
Date: Sun, 25 Feb 2024 20:43:38 +0000
Subject: [PATCH 24/47] Mechanism to unit-test CUDA kernel

---
 ext/DynamicExpressionsCUDAExt.jl | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/ext/DynamicExpressionsCUDAExt.jl b/ext/DynamicExpressionsCUDAExt.jl
index ddb97bf8..4a05ad48 100644
--- a/ext/DynamicExpressionsCUDAExt.jl
+++ b/ext/DynamicExpressionsCUDAExt.jl
@@ -115,9 +115,11 @@ for nuna in 0:10, nbin in 0:10
             # Input data and tree
             cX::AbstractArray, idx_self::AbstractArray, idx_l::AbstractArray, idx_r::AbstractArray,
             degree::AbstractArray, constant::AbstractArray, val::AbstractArray, feature::AbstractArray, op::AbstractArray,
+            # Override for unittesting:
+            i=nothing,
         )
             #! format: on
-            i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+            i = i === nothing ? (blockIdx().x - 1) * blockDim().x + threadIdx().x : i
             if i > num_elem * num_nodes
                 return nothing
             end

From 5befede43045565050b6aacddd5e6485e040fb34 Mon Sep 17 00:00:00 2001
From: MilesCranmer
Date: Sun, 25 Feb 2024 21:13:33 +0000
Subject: [PATCH 25/47] Fix unbound type issues

---
 src/AsArray.jl          | 6 +++++-
 src/EvaluateEquation.jl | 6 +++---
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/src/AsArray.jl b/src/AsArray.jl
index 13b59507..60239cda 100644
--- a/src/AsArray.jl
+++ b/src/AsArray.jl
@@ -3,8 +3,12 @@ module AsArrayModule
 using ..EquationModule: AbstractExpressionNode, tree_mapreduce, count_nodes

 function as_array(
-    ::Type{I}, trees::Vararg{N,M}; buffer::Union{AbstractArray,Nothing}=nothing
+    ::Type{I},
+    tree::N,
+    additional_trees::Vararg{N,M};
+    buffer::Union{AbstractArray,Nothing}=nothing,
 ) where {T,N<:AbstractExpressionNode{T},I,M}
+    trees = (tree, additional_trees...)
each_num_nodes = (t -> count_nodes(t; break_sharing=Val(true))).(trees) num_nodes = sum(each_num_nodes) diff --git a/src/EvaluateEquation.jl b/src/EvaluateEquation.jl index 0ced0631..764c307a 100644 --- a/src/EvaluateEquation.jl +++ b/src/EvaluateEquation.jl @@ -98,10 +98,10 @@ function eval_tree_array( return eval_tree_array(tree, cX, operators; turbo, bumper) end function eval_tree_array( - trees::NTuple{M,N}, cX::AbstractMatrix{T}, operators::OperatorEnum; kws... + trees::Tuple{N,Vararg{N,M}}, cX::AbstractMatrix{T}, operators::OperatorEnum; kws... ) where {T<:Number,N<:AbstractExpressionNode{T},M} - outs = ntuple(i -> eval_tree_array(trees[i], cX, operators; kws...)[1], Val(M)) - return ntuple(i -> first(outs[i]), Val(M)), ntuple(i -> last(outs[i]), Val(M)) + outs = ntuple(i -> eval_tree_array(trees[i], cX, operators; kws...)[1], Val(M + 1)) + return ntuple(i -> first(outs[i]), Val(M + 1)), ntuple(i -> last(outs[i]), Val(M + 1)) end get_nuna(::Type{<:OperatorEnum{B,U}}) where {B,U} = counttuple(U) From 74094e36e61c0e3b1a3c6823c898ec7b4889b51e Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 25 Feb 2024 22:15:47 +0000 Subject: [PATCH 26/47] Start working on CPU-based unittests --- ext/DynamicExpressionsCUDAExt.jl | 58 ++++++++++++++++++++++++-------- test/test_cuda.jl | 29 ++++++++++++++++ test/unittest.jl | 6 +++- 3 files changed, 78 insertions(+), 15 deletions(-) create mode 100644 test/test_cuda.jl diff --git a/ext/DynamicExpressionsCUDAExt.jl b/ext/DynamicExpressionsCUDAExt.jl index 4a05ad48..457dfa80 100644 --- a/ext/DynamicExpressionsCUDAExt.jl +++ b/ext/DynamicExpressionsCUDAExt.jl @@ -1,22 +1,36 @@ module DynamicExpressionsCUDAExt -using CUDA +using CUDA: @cuda, CuArray, blockDim, blockIdx, threadIdx using DynamicExpressions: OperatorEnum, AbstractExpressionNode using DynamicExpressions.EvaluateEquationModule: get_nbin, get_nuna using DynamicExpressions.AsArrayModule: as_array import DynamicExpressions.EvaluateEquationModule: eval_tree_array +# array type for exclusively testing purposes +struct FakeCuArray{T,N,A<:AbstractArray{T,N}} <: AbstractArray{T,N} + a::A +end +Base.similar(x::FakeCuArray, dims::Integer...) = FakeCuArray(similar(x.a, dims...)) +Base.getindex(x::FakeCuArray, i::Int...) = getindex(x.a, i...) +Base.setindex!(x::FakeCuArray, v, i::Int...) = setindex!(x.a, v, i...) +Base.size(x::FakeCuArray) = size(x.a) + +const MaybeCuArray{T,N} = Union{CuArray{T,2},FakeCuArray{T,N}} + +to_device(a, ::CuArray) = CuArray(a) +to_device(a, ::FakeCuArray) = FakeCuArray(a) + function eval_tree_array( - tree::AbstractExpressionNode{T}, gcX::CuArray{T,2}, operators::OperatorEnum; kws... + tree::AbstractExpressionNode{T}, gcX::MaybeCuArray{T,2}, operators::OperatorEnum; kws... ) where {T<:Number} (outs, is_good) = eval_tree_array((tree,), gcX, operators; kws...) return (only(outs), only(is_good)) end function eval_tree_array( - trees::NTuple{M,N}, - gcX::CuArray{T,2}, + trees::Tuple{N,Vararg{N,M}}, + gcX::MaybeCuArray{T,2}, operators::OperatorEnum; buffer=nothing, gpu_workspace=nothing, @@ -29,7 +43,7 @@ function eval_tree_array( ## Floating point arrays: gworkspace = if gpu_workspace === nothing - CuArray{T}(undef, num_elem, num_nodes + 1) + similar(gcX, num_elem, num_nodes + 1) else gpu_workspace end @@ -37,7 +51,11 @@ function eval_tree_array( copyto!(gval, val) ## Index arrays (much faster to have `@view` here) - gbuffer = gpu_buffer === nothing ? 
CuArray(buffer) : copyto!(gpu_buffer, buffer) + gbuffer = if gpu_buffer === nothing + to_device(buffer, gcX) + else + copyto!(gpu_buffer, buffer) + end gdegree = @view gbuffer[1, :] gfeature = @view gbuffer[2, :] gop = @view gbuffer[3, :] @@ -61,10 +79,10 @@ function eval_tree_array( ) #! format: on - out = ntuple(i -> @view(gworkspace[:, roots[i]]), Val(M)) + out = ntuple(i -> @view(gworkspace[:, roots[i]]), Val(M + 1)) is_good = ntuple( i -> true, # Up to user to find NaNs - Val(M), + Val(M + 1), ) return (out, is_good) @@ -87,12 +105,24 @@ function _launch_gpu_kernel!( gpu_kernel! = create_gpu_kernel(operators, Val(nuna), Val(nbin)) for launch in one(I):I(num_launches) #! format: off - @cuda threads=num_threads blocks=num_blocks gpu_kernel!( - buffer, - launch, num_elem, num_nodes, execution_order, - cX, idx_self, idx_l, idx_r, - degree, constant, val, feature, op - ) + if buffer isa CuArray + @cuda threads=num_threads blocks=num_blocks gpu_kernel!( + buffer, + launch, num_elem, num_nodes, execution_order, + cX, idx_self, idx_l, idx_r, + degree, constant, val, feature, op + ) + else + Threads.@threads for i in 1:(num_threads * num_blocks) + gpu_kernel!( + buffer, + launch, num_elem, num_nodes, execution_order, + cX, idx_self, idx_l, idx_r, + degree, constant, val, feature, op, + i + ) + end + end #! format: on end return nothing diff --git a/test/test_cuda.jl b/test/test_cuda.jl new file mode 100644 index 00000000..264db2a4 --- /dev/null +++ b/test/test_cuda.jl @@ -0,0 +1,29 @@ +using DynamicExpressions +using CUDA +using Random + +ext = Base.get_extension(DynamicExpressions, :DynamicExpressionsCUDAExt) +const FakeCuArray = ext.FakeCuArray + +include("tree_gen_utils.jl") + +let + operators = OperatorEnum(; binary_operators=[+, -, *, /], unary_operators=[cos, sin]); + x1, x2, x3 = (i -> Node(Float64; feature=i)).(1:3) + + for T in (Float32, Float64, ComplexF64), num_trees in (1, 2, 3), seed in 0:10 + Random.seed!(seed) + num_rows = rand(10:30) + nodes_per = rand(10:25, num_trees) + trees = ntuple(i -> gen_random_tree_fixed_size(nodes_per[i], operators, 3, T), num_trees) + @show trees + X = randn(T, 3, num_rows) + y, completed = eval_tree_array(trees, X, operators) + gpu_y, gpu_completed = eval_tree_array(trees, FakeCuArray(X), operators) + gpu_y = Array.(gpu_y) + + for i in eachindex(completed, gpu_completed) + @test ((completed[i] && gpu_completed[i]) && (y[i] ≈ gpu_y[i])) || (!completed[i] && !gpu_completed[i]) + end + end +end diff --git a/test/unittest.jl b/test/unittest.jl index 1f3528c0..8200c538 100644 --- a/test/unittest.jl +++ b/test/unittest.jl @@ -9,7 +9,7 @@ end end # Trigger extensions: -using Zygote, SymbolicUtils, LoopVectorization, Bumper, Optim +using Zygote, SymbolicUtils, LoopVectorization, Bumper, Optim, CUDA @safetestset "Test deprecations" begin include("test_deprecations.jl") @@ -110,3 +110,7 @@ end @safetestset "Test random sampling" begin include("test_random.jl") end + +@safetestset "Test CUDA" begin + include("test_cuda.jl") +end From 9d7500c67b51a832595a7c56fd0b28d63b2c0a2f Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 25 Feb 2024 22:16:06 +0000 Subject: [PATCH 27/47] Fix indexing bugs in GPU kernels --- ext/DynamicExpressionsCUDAExt.jl | 12 ++++++++---- src/EvaluateEquation.jl | 2 +- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/ext/DynamicExpressionsCUDAExt.jl b/ext/DynamicExpressionsCUDAExt.jl index 457dfa80..613ab8b2 100644 --- a/ext/DynamicExpressionsCUDAExt.jl +++ b/ext/DynamicExpressionsCUDAExt.jl @@ -15,6 +15,7 @@ 
Base.similar(x::FakeCuArray, dims::Integer...) = FakeCuArray(similar(x.a, dims.. Base.getindex(x::FakeCuArray, i::Int...) = getindex(x.a, i...) Base.setindex!(x::FakeCuArray, v, i::Int...) = setindex!(x.a, v, i...) Base.size(x::FakeCuArray) = size(x.a) +Base.Array(x::FakeCuArray) = Array(x.a) const MaybeCuArray{T,N} = Union{CuArray{T,2},FakeCuArray{T,N}} @@ -41,13 +42,16 @@ function eval_tree_array( num_launches = maximum(execution_order) num_elem = size(gcX, 2) - ## Floating point arrays: + ## The following array is our "workspace" for + ## the GPU kernel, with size equal to the number of rows + ## in the input data by the number of nodes in the tree. + ## It has one extra row to store the constant values. gworkspace = if gpu_workspace === nothing - similar(gcX, num_elem, num_nodes + 1) + similar(gcX, num_elem + 1, num_nodes) else gpu_workspace end - gval = @view gworkspace[:, end] + gval = @view gworkspace[end, :] copyto!(gval, val) ## Index arrays (much faster to have `@view` here) @@ -79,7 +83,7 @@ function eval_tree_array( ) #! format: on - out = ntuple(i -> @view(gworkspace[:, roots[i]]), Val(M + 1)) + out = ntuple(i -> @view(gworkspace[begin:end-1, roots[i]]), Val(M + 1)) is_good = ntuple( i -> true, # Up to user to find NaNs Val(M + 1), diff --git a/src/EvaluateEquation.jl b/src/EvaluateEquation.jl index 764c307a..0683405c 100644 --- a/src/EvaluateEquation.jl +++ b/src/EvaluateEquation.jl @@ -100,7 +100,7 @@ end function eval_tree_array( trees::Tuple{N,Vararg{N,M}}, cX::AbstractMatrix{T}, operators::OperatorEnum; kws... ) where {T<:Number,N<:AbstractExpressionNode{T},M} - outs = ntuple(i -> eval_tree_array(trees[i], cX, operators; kws...)[1], Val(M + 1)) + outs = ntuple(i -> eval_tree_array(trees[i], cX, operators; kws...), Val(M + 1)) return ntuple(i -> first(outs[i]), Val(M + 1)), ntuple(i -> last(outs[i]), Val(M + 1)) end From f6155badaf9d9a38c1ada861538d8912eecf4e5b Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 25 Feb 2024 22:21:14 +0000 Subject: [PATCH 28/47] Fix unsafe `sin` in CUDA kernel tests --- ext/DynamicExpressionsCUDAExt.jl | 2 +- test/test_cuda.jl | 16 ++++++++++++---- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/ext/DynamicExpressionsCUDAExt.jl b/ext/DynamicExpressionsCUDAExt.jl index 613ab8b2..f351dd82 100644 --- a/ext/DynamicExpressionsCUDAExt.jl +++ b/ext/DynamicExpressionsCUDAExt.jl @@ -83,7 +83,7 @@ function eval_tree_array( ) #! format: on - out = ntuple(i -> @view(gworkspace[begin:end-1, roots[i]]), Val(M + 1)) + out = ntuple(i -> @view(gworkspace[begin:(end - 1), roots[i]]), Val(M + 1)) is_good = ntuple( i -> true, # Up to user to find NaNs Val(M + 1), diff --git a/test/test_cuda.jl b/test/test_cuda.jl index 264db2a4..11f7567e 100644 --- a/test/test_cuda.jl +++ b/test/test_cuda.jl @@ -7,23 +7,31 @@ const FakeCuArray = ext.FakeCuArray include("tree_gen_utils.jl") +safe_sin(x) = isfinite(x) ? sin(x) : convert(eltype(x), NaN) +safe_cos(x) = isfinite(x) ? 
cos(x) : convert(eltype(x), NaN) + let - operators = OperatorEnum(; binary_operators=[+, -, *, /], unary_operators=[cos, sin]); + operators = OperatorEnum(; + binary_operators=[+, -, *, /], unary_operators=[safe_sin, safe_cos] + ) x1, x2, x3 = (i -> Node(Float64; feature=i)).(1:3) for T in (Float32, Float64, ComplexF64), num_trees in (1, 2, 3), seed in 0:10 Random.seed!(seed) num_rows = rand(10:30) nodes_per = rand(10:25, num_trees) - trees = ntuple(i -> gen_random_tree_fixed_size(nodes_per[i], operators, 3, T), num_trees) - @show trees + trees = ntuple( + i -> gen_random_tree_fixed_size(nodes_per[i], operators, 3, T), num_trees + ) X = randn(T, 3, num_rows) y, completed = eval_tree_array(trees, X, operators) gpu_y, gpu_completed = eval_tree_array(trees, FakeCuArray(X), operators) gpu_y = Array.(gpu_y) for i in eachindex(completed, gpu_completed) - @test ((completed[i] && gpu_completed[i]) && (y[i] ≈ gpu_y[i])) || (!completed[i] && !gpu_completed[i]) + if completed[i] + @test y[i] ≈ gpu_y[i] + end end end end From a7d7e795925fd0ca25f5b9bd2d4651a4f1b95184 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 25 Feb 2024 22:30:04 +0000 Subject: [PATCH 29/47] Fix `MaybeCuArray` definition --- ext/DynamicExpressionsCUDAExt.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ext/DynamicExpressionsCUDAExt.jl b/ext/DynamicExpressionsCUDAExt.jl index f351dd82..5dd08479 100644 --- a/ext/DynamicExpressionsCUDAExt.jl +++ b/ext/DynamicExpressionsCUDAExt.jl @@ -17,7 +17,7 @@ Base.setindex!(x::FakeCuArray, v, i::Int...) = setindex!(x.a, v, i...) Base.size(x::FakeCuArray) = size(x.a) Base.Array(x::FakeCuArray) = Array(x.a) -const MaybeCuArray{T,N} = Union{CuArray{T,2},FakeCuArray{T,N}} +const MaybeCuArray{T,N} = Union{CuArray{T,N},FakeCuArray{T,N}} to_device(a, ::CuArray) = CuArray(a) to_device(a, ::FakeCuArray) = FakeCuArray(a) From 46a6b48ae3a7e12a123ad06749b08f52387d0db9 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 25 Feb 2024 22:46:29 +0000 Subject: [PATCH 30/47] Test buffered-evaluation of CUDA kernels --- ext/DynamicExpressionsCUDAExt.jl | 1 - test/test_cuda.jl | 41 +++++++++++++++++++++----------- 2 files changed, 27 insertions(+), 15 deletions(-) diff --git a/ext/DynamicExpressionsCUDAExt.jl b/ext/DynamicExpressionsCUDAExt.jl index 5dd08479..07b72de7 100644 --- a/ext/DynamicExpressionsCUDAExt.jl +++ b/ext/DynamicExpressionsCUDAExt.jl @@ -15,7 +15,6 @@ Base.similar(x::FakeCuArray, dims::Integer...) = FakeCuArray(similar(x.a, dims.. Base.getindex(x::FakeCuArray, i::Int...) = getindex(x.a, i...) Base.setindex!(x::FakeCuArray, v, i::Int...) = setindex!(x.a, v, i...) 
Base.size(x::FakeCuArray) = size(x.a) -Base.Array(x::FakeCuArray) = Array(x.a) const MaybeCuArray{T,N} = Union{CuArray{T,N},FakeCuArray{T,N}} diff --git a/test/test_cuda.jl b/test/test_cuda.jl index 11f7567e..8f9f33d0 100644 --- a/test/test_cuda.jl +++ b/test/test_cuda.jl @@ -16,21 +16,34 @@ let ) x1, x2, x3 = (i -> Node(Float64; feature=i)).(1:3) - for T in (Float32, Float64, ComplexF64), num_trees in (1, 2, 3), seed in 0:10 + for T in (Float32, Float64, ComplexF64), ntrees in (1, 2, 3), seed in 0:10 Random.seed!(seed) - num_rows = rand(10:30) - nodes_per = rand(10:25, num_trees) - trees = ntuple( - i -> gen_random_tree_fixed_size(nodes_per[i], operators, 3, T), num_trees - ) - X = randn(T, 3, num_rows) - y, completed = eval_tree_array(trees, X, operators) - gpu_y, gpu_completed = eval_tree_array(trees, FakeCuArray(X), operators) - gpu_y = Array.(gpu_y) - - for i in eachindex(completed, gpu_completed) - if completed[i] - @test y[i] ≈ gpu_y[i] + + nrow = rand(10:30) + nnodes = rand(10:25, ntrees) + + buffer = rand(Bool) ? ones(Int32, 8, sum(nnodes)) : nothing + gpu_buffer = rand(Bool) ? FakeCuArray(ones(Int32, 8, sum(nnodes))) : nothing + gpu_workspace = rand(Bool) ? FakeCuArray(ones(T, nrow + 1, sum(nnodes))) : nothing + + trees = ntuple(i -> gen_random_tree_fixed_size(nnodes[i], operators, 3, T), ntrees) + X = randn(T, 3, nrow) + if ntrees > 1 + y, completed = eval_tree_array(trees, X, operators) + gpu_y, gpu_completed = eval_tree_array( + trees, FakeCuArray(X), operators; buffer, gpu_workspace, gpu_buffer + ) + + for i in eachindex(completed, gpu_completed) + if completed[i] + @test y[i] ≈ gpu_y[i] + end + end + else + y, completed = eval_tree_array(only(trees), X, operators) + gpu_y, gpu_completed = eval_tree_array(only(trees), FakeCuArray(X), operators) + if completed + @test y ≈ gpu_y end end end From 7ceacc077eba0513c29f25d32d39d24aa1fc2509 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 25 Feb 2024 22:47:47 +0000 Subject: [PATCH 31/47] Only test CUDA on 1.9+ --- test/unittest.jl | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/test/unittest.jl b/test/unittest.jl index 8200c538..48462ad2 100644 --- a/test/unittest.jl +++ b/test/unittest.jl @@ -9,7 +9,7 @@ end end # Trigger extensions: -using Zygote, SymbolicUtils, LoopVectorization, Bumper, Optim, CUDA +using Zygote, SymbolicUtils, LoopVectorization, Bumper, Optim @safetestset "Test deprecations" begin include("test_deprecations.jl") @@ -111,6 +111,8 @@ end include("test_random.jl") end -@safetestset "Test CUDA" begin - include("test_cuda.jl") +if VERSION >= v"1.9" + @eval @safetestset "Test CUDA" begin + include("test_cuda.jl") + end end From 08df9a31ce53540277d7c80ce1fffc1a69e95650 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 25 Feb 2024 23:13:10 +0000 Subject: [PATCH 32/47] Use vectors for vector input; tuple for tuple --- ext/DynamicExpressionsCUDAExt.jl | 13 +++++-------- src/AsArray.jl | 15 ++++++++++----- src/EvaluateEquation.jl | 11 +++++++---- test/test_cuda.jl | 22 ++++++++++++++++++---- 4 files changed, 40 insertions(+), 21 deletions(-) diff --git a/ext/DynamicExpressionsCUDAExt.jl b/ext/DynamicExpressionsCUDAExt.jl index 07b72de7..23c797b8 100644 --- a/ext/DynamicExpressionsCUDAExt.jl +++ b/ext/DynamicExpressionsCUDAExt.jl @@ -29,15 +29,15 @@ function eval_tree_array( end function eval_tree_array( - trees::Tuple{N,Vararg{N,M}}, + trees::Union{NTuple{M,N} where M,AbstractVector{N}}, gcX::MaybeCuArray{T,2}, operators::OperatorEnum; buffer=nothing, gpu_workspace=nothing, 
gpu_buffer=nothing, kws..., -) where {T<:Number,N<:AbstractExpressionNode{T},M} - (; val, execution_order, roots, buffer, num_nodes) = as_array(Int32, trees...; buffer) +) where {T<:Number,N<:AbstractExpressionNode{T}} + (; val, execution_order, roots, buffer, num_nodes) = as_array(Int32, trees; buffer) num_launches = maximum(execution_order) num_elem = size(gcX, 2) @@ -82,11 +82,8 @@ function eval_tree_array( ) #! format: on - out = ntuple(i -> @view(gworkspace[begin:(end - 1), roots[i]]), Val(M + 1)) - is_good = ntuple( - i -> true, # Up to user to find NaNs - Val(M + 1), - ) + out = (r -> @view(gworkspace[begin:(end - 1), r])).(roots) + is_good = (_ -> true).(trees) return (out, is_good) end diff --git a/src/AsArray.jl b/src/AsArray.jl index 60239cda..3d7bdde9 100644 --- a/src/AsArray.jl +++ b/src/AsArray.jl @@ -4,15 +4,20 @@ using ..EquationModule: AbstractExpressionNode, tree_mapreduce, count_nodes function as_array( ::Type{I}, - tree::N, - additional_trees::Vararg{N,M}; + trees::Union{NTuple{M,N} where M,AbstractVector{N}}; buffer::Union{AbstractArray,Nothing}=nothing, -) where {T,N<:AbstractExpressionNode{T},I,M} - trees = (tree, additional_trees...) +) where {T,N<:AbstractExpressionNode{T},I} each_num_nodes = (t -> count_nodes(t; break_sharing=Val(true))).(trees) num_nodes = sum(each_num_nodes) - roots = cumsum(tuple(one(I), each_num_nodes[1:(end - 1)]...)) + # Want `roots` to be tuple if `trees` is tuple and similar for vector + roots = cumsum( + if each_num_nodes isa Tuple + tuple(one(I), each_num_nodes[1:(end - 1)]...) + else + vcat(one(I), each_num_nodes[1:(end - 1)]) + end, + ) val = Array{T}(undef, num_nodes) diff --git a/src/EvaluateEquation.jl b/src/EvaluateEquation.jl index 0683405c..df52a0cb 100644 --- a/src/EvaluateEquation.jl +++ b/src/EvaluateEquation.jl @@ -98,10 +98,13 @@ function eval_tree_array( return eval_tree_array(tree, cX, operators; turbo, bumper) end function eval_tree_array( - trees::Tuple{N,Vararg{N,M}}, cX::AbstractMatrix{T}, operators::OperatorEnum; kws... -) where {T<:Number,N<:AbstractExpressionNode{T},M} - outs = ntuple(i -> eval_tree_array(trees[i], cX, operators; kws...), Val(M + 1)) - return ntuple(i -> first(outs[i]), Val(M + 1)), ntuple(i -> last(outs[i]), Val(M + 1)) + trees::Union{NTuple{M,N} where M,AbstractArray{N}}, + cX::AbstractMatrix{T}, + operators::OperatorEnum; + kws..., +) where {T<:Number,N<:AbstractExpressionNode{T}} + outs = (t -> eval_tree_array(t, cX, operators; kws...)).(trees) + return first.(outs), last.(outs) end get_nuna(::Type{<:OperatorEnum{B,U}}) where {B,U} = counttuple(U) diff --git a/test/test_cuda.jl b/test/test_cuda.jl index 8f9f33d0..443df746 100644 --- a/test/test_cuda.jl +++ b/test/test_cuda.jl @@ -21,27 +21,41 @@ let nrow = rand(10:30) nnodes = rand(10:25, ntrees) + use_tuple = rand(Bool) buffer = rand(Bool) ? ones(Int32, 8, sum(nnodes)) : nothing gpu_buffer = rand(Bool) ? FakeCuArray(ones(Int32, 8, sum(nnodes))) : nothing gpu_workspace = rand(Bool) ? FakeCuArray(ones(T, nrow + 1, sum(nnodes))) : nothing trees = ntuple(i -> gen_random_tree_fixed_size(nnodes[i], operators, 3, T), ntrees) + trees = use_tuple ? 
trees : collect(trees) X = randn(T, 3, nrow) if ntrees > 1 - y, completed = eval_tree_array(trees, X, operators) - gpu_y, gpu_completed = eval_tree_array( + y, completed = @inferred eval_tree_array(trees, X, operators) + gpu_y, gpu_completed = @inferred eval_tree_array( trees, FakeCuArray(X), operators; buffer, gpu_workspace, gpu_buffer ) + # Should give same result either way for i in eachindex(completed, gpu_completed) if completed[i] @test y[i] ≈ gpu_y[i] end end + + # Should return same type as input + if use_tuple + @test y isa Tuple + @test gpu_y isa Tuple + else + @test y isa Vector + @test gpu_y isa Vector + end else - y, completed = eval_tree_array(only(trees), X, operators) - gpu_y, gpu_completed = eval_tree_array(only(trees), FakeCuArray(X), operators) + y, completed = @inferred eval_tree_array(only(trees), X, operators) + gpu_y, gpu_completed = @inferred eval_tree_array( + only(trees), FakeCuArray(X), operators + ) if completed @test y ≈ gpu_y end From 8b4cbf379c173983bd4f8ed47280d19902cd1619 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 25 Feb 2024 23:20:48 +0000 Subject: [PATCH 33/47] Fix unbound args in multi-tree eval --- ext/DynamicExpressionsCUDAExt.jl | 2 +- src/AsArray.jl | 2 +- src/EvaluateEquation.jl | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ext/DynamicExpressionsCUDAExt.jl b/ext/DynamicExpressionsCUDAExt.jl index 23c797b8..929ba31f 100644 --- a/ext/DynamicExpressionsCUDAExt.jl +++ b/ext/DynamicExpressionsCUDAExt.jl @@ -29,7 +29,7 @@ function eval_tree_array( end function eval_tree_array( - trees::Union{NTuple{M,N} where M,AbstractVector{N}}, + trees::Union{Tuple{N,Vararg{N}},AbstractVector{N}}, gcX::MaybeCuArray{T,2}, operators::OperatorEnum; buffer=nothing, diff --git a/src/AsArray.jl b/src/AsArray.jl index 3d7bdde9..0939a11c 100644 --- a/src/AsArray.jl +++ b/src/AsArray.jl @@ -4,7 +4,7 @@ using ..EquationModule: AbstractExpressionNode, tree_mapreduce, count_nodes function as_array( ::Type{I}, - trees::Union{NTuple{M,N} where M,AbstractVector{N}}; + trees::Union{Tuple{N,Vararg{N}},AbstractVector{N}}; buffer::Union{AbstractArray,Nothing}=nothing, ) where {T,N<:AbstractExpressionNode{T},I} each_num_nodes = (t -> count_nodes(t; break_sharing=Val(true))).(trees) diff --git a/src/EvaluateEquation.jl b/src/EvaluateEquation.jl index df52a0cb..2749bbfc 100644 --- a/src/EvaluateEquation.jl +++ b/src/EvaluateEquation.jl @@ -98,7 +98,7 @@ function eval_tree_array( return eval_tree_array(tree, cX, operators; turbo, bumper) end function eval_tree_array( - trees::Union{NTuple{M,N} where M,AbstractArray{N}}, + trees::Union{Tuple{N,Vararg{N}},AbstractVector{N}}, cX::AbstractMatrix{T}, operators::OperatorEnum; kws..., From 676ad86e98795712e78c9667883903bed8c0b9c3 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Mon, 26 Feb 2024 00:35:43 +0000 Subject: [PATCH 34/47] Add test for no buffer recreation --- ext/DynamicExpressionsCUDAExt.jl | 19 ++-- src/AsArray.jl | 12 ++- test/test_cuda.jl | 148 ++++++++++++++++++++++--------- 3 files changed, 130 insertions(+), 49 deletions(-) diff --git a/ext/DynamicExpressionsCUDAExt.jl b/ext/DynamicExpressionsCUDAExt.jl index 929ba31f..bb965214 100644 --- a/ext/DynamicExpressionsCUDAExt.jl +++ b/ext/DynamicExpressionsCUDAExt.jl @@ -35,10 +35,15 @@ function eval_tree_array( buffer=nothing, gpu_workspace=nothing, gpu_buffer=nothing, + roots=nothing, + num_nodes=nothing, + num_launches=nothing, + update_buffers::Val{_update_buffers}=Val(true), kws..., -) where {T<:Number,N<:AbstractExpressionNode{T}} - (; val, 
execution_order, roots, buffer, num_nodes) = as_array(Int32, trees; buffer) - num_launches = maximum(execution_order) +) where {T<:Number,N<:AbstractExpressionNode{T},_update_buffers} + if _update_buffers + (; val, roots, buffer, num_nodes, num_launches) = as_array(Int32, trees; buffer) + end num_elem = size(gcX, 2) ## The following array is our "workspace" for @@ -51,10 +56,14 @@ function eval_tree_array( gpu_workspace end gval = @view gworkspace[end, :] - copyto!(gval, val) + if _update_buffers + copyto!(gval, val) + end ## Index arrays (much faster to have `@view` here) - gbuffer = if gpu_buffer === nothing + gbuffer = if !_update_buffers + gpu_buffer + elseif gpu_buffer === nothing to_device(buffer, gcX) else copyto!(gpu_buffer, buffer) diff --git a/src/AsArray.jl b/src/AsArray.jl index 0939a11c..75ff1ece 100644 --- a/src/AsArray.jl +++ b/src/AsArray.jl @@ -4,9 +4,12 @@ using ..EquationModule: AbstractExpressionNode, tree_mapreduce, count_nodes function as_array( ::Type{I}, - trees::Union{Tuple{N,Vararg{N}},AbstractVector{N}}; + trees::Union{N,Tuple{N,Vararg{N}},AbstractVector{N}}; buffer::Union{AbstractArray,Nothing}=nothing, ) where {T,N<:AbstractExpressionNode{T},I} + if trees isa N + return as_array(I, (trees,); buffer=buffer) + end each_num_nodes = (t -> count_nodes(t; break_sharing=Val(true))).(trees) num_nodes = sum(each_num_nodes) @@ -33,6 +36,7 @@ function as_array( constant = @view buffer[8, :] cursor = Ref(zero(I)) + num_launches = zero(I) for (root, tree) in zip(roots, trees) @assert root == cursor[] + 1 tree_mapreduce( @@ -70,6 +74,11 @@ function as_array( end execution_order[parent.id] = parent_execution_order + # Global number of launches equal to maximum execution order + if parent_execution_order > num_launches + num_launches = parent_execution_order + end + (id=parent.id, order=parent_execution_order) end, tree; @@ -84,6 +93,7 @@ function as_array( feature, op, execution_order, + num_launches, idx_self, idx_l, idx_r, diff --git a/test/test_cuda.jl b/test/test_cuda.jl index 443df746..2d46bce0 100644 --- a/test/test_cuda.jl +++ b/test/test_cuda.jl @@ -1,4 +1,5 @@ using DynamicExpressions +using DynamicExpressions.AsArrayModule: as_array using CUDA using Random @@ -10,55 +11,116 @@ include("tree_gen_utils.jl") safe_sin(x) = isfinite(x) ? sin(x) : convert(eltype(x), NaN) safe_cos(x) = isfinite(x) ? cos(x) : convert(eltype(x), NaN) -let - operators = OperatorEnum(; - binary_operators=[+, -, *, /], unary_operators=[safe_sin, safe_cos] - ) - x1, x2, x3 = (i -> Node(Float64; feature=i)).(1:3) - - for T in (Float32, Float64, ComplexF64), ntrees in (1, 2, 3), seed in 0:10 - Random.seed!(seed) - - nrow = rand(10:30) - nnodes = rand(10:25, ntrees) - use_tuple = rand(Bool) - - buffer = rand(Bool) ? ones(Int32, 8, sum(nnodes)) : nothing - gpu_buffer = rand(Bool) ? FakeCuArray(ones(Int32, 8, sum(nnodes))) : nothing - gpu_workspace = rand(Bool) ? FakeCuArray(ones(T, nrow + 1, sum(nnodes))) : nothing - - trees = ntuple(i -> gen_random_tree_fixed_size(nnodes[i], operators, 3, T), ntrees) - trees = use_tuple ? 
trees : collect(trees) - X = randn(T, 3, nrow) - if ntrees > 1 - y, completed = @inferred eval_tree_array(trees, X, operators) - gpu_y, gpu_completed = @inferred eval_tree_array( - trees, FakeCuArray(X), operators; buffer, gpu_workspace, gpu_buffer +@testset "Random evals" begin + let + operators = OperatorEnum(; + binary_operators=[+, -, *, /], unary_operators=[safe_sin, safe_cos] + ) + x1, x2, x3 = (i -> Node(Float64; feature=i)).(1:3) + + for T in (Float32, Float64, ComplexF64), ntrees in (1, 2, 3), seed in 0:10 + Random.seed!(seed) + + nrow = rand(10:30) + nnodes = rand(10:25, ntrees) + use_tuple = rand(Bool) + + buffer = rand(Bool) ? ones(Int32, 8, sum(nnodes)) : nothing + gpu_buffer = rand(Bool) ? FakeCuArray(ones(Int32, 8, sum(nnodes))) : nothing + gpu_workspace = + rand(Bool) ? FakeCuArray(ones(T, nrow + 1, sum(nnodes))) : nothing + + trees = ntuple( + i -> gen_random_tree_fixed_size(nnodes[i], operators, 3, T), ntrees ) + trees = use_tuple ? trees : collect(trees) + X = randn(T, 3, nrow) + if ntrees > 1 + y, completed = @inferred eval_tree_array(trees, X, operators) + gpu_y, gpu_completed = @inferred eval_tree_array( + trees, FakeCuArray(X), operators; buffer, gpu_workspace, gpu_buffer + ) - # Should give same result either way - for i in eachindex(completed, gpu_completed) - if completed[i] - @test y[i] ≈ gpu_y[i] + # Should give same result either way + for i in eachindex(completed, gpu_completed) + if completed[i] + @test y[i] ≈ gpu_y[i] + end end - end - # Should return same type as input - if use_tuple - @test y isa Tuple - @test gpu_y isa Tuple + # Should return same type as input + if use_tuple + @test y isa Tuple + @test gpu_y isa Tuple + else + @test y isa Vector + @test gpu_y isa Vector + end else - @test y isa Vector - @test gpu_y isa Vector - end - else - y, completed = @inferred eval_tree_array(only(trees), X, operators) - gpu_y, gpu_completed = @inferred eval_tree_array( - only(trees), FakeCuArray(X), operators - ) - if completed - @test y ≈ gpu_y + y, completed = @inferred eval_tree_array(only(trees), X, operators) + gpu_y, gpu_completed = @inferred eval_tree_array( + only(trees), FakeCuArray(X), operators + ) + if completed + @test y ≈ gpu_y + end end end end end + +@testset "Evaluation on pre-computed buffers" begin + let + operators = OperatorEnum(; + binary_operators=[+, -, *, /], unary_operators=[sin, cos] + ) + x1, x2, x3 = (i -> Node(Float64; feature=i)).(1:3) + Random.seed!(0) + tree = sin(x1 * 3.1 - x3 * 0.9 + 0.2) * x2 - x3 * x3 * 1.5 + X = randn(Float64, 3, 100) + + y1, _ = eval_tree_array(tree, X, operators) + y2, _ = eval_tree_array(tree, FakeCuArray(X), operators) + + @test y1 ≈ y2 + + (; val, roots, buffer, num_nodes, num_launches) = as_array(Int32, tree) + gpu_buffer = FakeCuArray(buffer) + gpu_workspace = FakeCuArray(zeros(Float64, 101, 50)) + copyto!((@view gpu_workspace[end, :]), val) + + # Now, with all buffers: + y3, _ = eval_tree_array( + tree, + FakeCuArray(X), + operators; + gpu_workspace, + gpu_buffer, + roots, + num_nodes, + num_launches, + update_buffers=Val(false), + ) + @test y1 ≈ y3 + + # Should be able to shift some of the values in this buffer: + i = findfirst(gpu_workspace[end, :] .== 0.9) + gpu_workspace[end, i] = 0.8 + + # And get the updated results: + tree_prime = sin(x1 * 3.1 - x3 * 0.8 + 0.2) * x2 - x3 * x3 * 1.5 + y1_prime, _ = eval_tree_array(tree_prime, X, operators) + y3_prime, _ = eval_tree_array( + x1, # Doesn't matter what we put here + FakeCuArray(X), + operators; + gpu_workspace, + gpu_buffer, + roots, + num_nodes, + 
num_launches, + update_buffers=Val(false), + ) + @test y1_prime ≈ y3_prime + end +end From c1e579a95c5facefb06d5e11a3bf27ed94f703e4 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Mon, 26 Feb 2024 01:15:34 +0000 Subject: [PATCH 35/47] Start switching to KernelAbstractions.jl --- Project.toml | 10 +-- ...ynamicExpressionsKernelAbstractionsExt.jl} | 78 +++++++------------ src/DynamicExpressions.jl | 3 +- src/ExtensionInterface.jl | 3 + 4 files changed, 36 insertions(+), 58 deletions(-) rename ext/{DynamicExpressionsCUDAExt.jl => DynamicExpressionsKernelAbstractionsExt.jl} (70%) diff --git a/Project.toml b/Project.toml index 097e0fd7..69d24f56 100644 --- a/Project.toml +++ b/Project.toml @@ -14,7 +14,7 @@ TOML = "fa267f1f-6049-4f14-aa54-33bafae1ed76" [weakdeps] Bumper = "8ce10254-0962-460f-a3d8-1f77fea1446e" -CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890" Optim = "429524aa-4258-5aef-a3af-852621145aeb" SymbolicUtils = "d1185830-fcd6-423d-90d6-eec64667417b" @@ -22,7 +22,7 @@ Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" [extensions] DynamicExpressionsBumperExt = "Bumper" -DynamicExpressionsCUDAExt = "CUDA" +DynamicExpressionsKernelAbstractionsExt = "KernelAbstractions" DynamicExpressionsLoopVectorizationExt = "LoopVectorization" DynamicExpressionsOptimExt = "Optim" DynamicExpressionsSymbolicUtilsExt = "SymbolicUtils" @@ -31,7 +31,7 @@ DynamicExpressionsZygoteExt = "Zygote" [compat] Aqua = "0.7" Bumper = "0.6" -CUDA = "4, 5" +KernelAbstractions = "0.9" Compat = "3.37, 4" Enzyme = "^0.11.12" LoopVectorization = "0.12" @@ -47,9 +47,9 @@ julia = "1.6" [extras] Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" Bumper = "8ce10254-0962-460f-a3d8-1f77fea1446e" -CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9" ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" +KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890" Optim = "429524aa-4258-5aef-a3af-852621145aeb" @@ -61,4 +61,4 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" [targets] -test = ["Test", "SafeTestsets", "Aqua", "Bumper", "CUDA", "Enzyme", "ForwardDiff", "LinearAlgebra", "LoopVectorization", "Optim", "SpecialFunctions", "StaticArrays", "SymbolicUtils", "Zygote"] +test = ["Test", "SafeTestsets", "Aqua", "Bumper", "KernelAbstractions", "Enzyme", "ForwardDiff", "LinearAlgebra", "LoopVectorization", "Optim", "SpecialFunctions", "StaticArrays", "SymbolicUtils", "Zygote"] diff --git a/ext/DynamicExpressionsCUDAExt.jl b/ext/DynamicExpressionsKernelAbstractionsExt.jl similarity index 70% rename from ext/DynamicExpressionsCUDAExt.jl rename to ext/DynamicExpressionsKernelAbstractionsExt.jl index bb965214..454fdbdb 100644 --- a/ext/DynamicExpressionsCUDAExt.jl +++ b/ext/DynamicExpressionsKernelAbstractionsExt.jl @@ -1,37 +1,24 @@ -module DynamicExpressionsCUDAExt +module DynamicExpressionsKernelAbstractionsExt -using CUDA: @cuda, CuArray, blockDim, blockIdx, threadIdx +using KernelAbstractions: @index, @kernel, @Const, get_backend using DynamicExpressions: OperatorEnum, AbstractExpressionNode using DynamicExpressions.EvaluateEquationModule: get_nbin, get_nuna using DynamicExpressions.AsArrayModule: as_array -import DynamicExpressions.EvaluateEquationModule: eval_tree_array +import DynamicExpressions.ExtensionInterfaceModule: 
gpu_eval_tree_array -# array type for exclusively testing purposes -struct FakeCuArray{T,N,A<:AbstractArray{T,N}} <: AbstractArray{T,N} - a::A -end -Base.similar(x::FakeCuArray, dims::Integer...) = FakeCuArray(similar(x.a, dims...)) -Base.getindex(x::FakeCuArray, i::Int...) = getindex(x.a, i...) -Base.setindex!(x::FakeCuArray, v, i::Int...) = setindex!(x.a, v, i...) -Base.size(x::FakeCuArray) = size(x.a) - -const MaybeCuArray{T,N} = Union{CuArray{T,N},FakeCuArray{T,N}} - -to_device(a, ::CuArray) = CuArray(a) -to_device(a, ::FakeCuArray) = FakeCuArray(a) - -function eval_tree_array( - tree::AbstractExpressionNode{T}, gcX::MaybeCuArray{T,2}, operators::OperatorEnum; kws... +function gpu_eval_tree_array( + tree::AbstractExpressionNode{T}, gcX, operators::OperatorEnum; kws... ) where {T<:Number} - (outs, is_good) = eval_tree_array((tree,), gcX, operators; kws...) + (outs, is_good) = gpu_eval_tree_array((tree,), gcX, operators; kws...) return (only(outs), only(is_good)) end -function eval_tree_array( +function gpu_eval_tree_array( trees::Union{Tuple{N,Vararg{N}},AbstractVector{N}}, - gcX::MaybeCuArray{T,2}, + gcX, operators::OperatorEnum; + backend=get_backend(gcX), buffer=nothing, gpu_workspace=nothing, gpu_buffer=nothing, @@ -39,7 +26,6 @@ function eval_tree_array( num_nodes=nothing, num_launches=nothing, update_buffers::Val{_update_buffers}=Val(true), - kws..., ) where {T<:Number,N<:AbstractExpressionNode{T},_update_buffers} if _update_buffers (; val, roots, buffer, num_nodes, num_launches) = as_array(Int32, trees; buffer) @@ -82,6 +68,7 @@ function eval_tree_array( #! format: off _launch_gpu_kernel!( + backend, num_threads, num_blocks, num_launches, gworkspace, # Thread info: num_elem, num_nodes, gexecution_order, @@ -99,6 +86,7 @@ end #! format: off function _launch_gpu_kernel!( + backend, num_threads, num_blocks, num_launches::Integer, buffer::AbstractArray{T,2}, # Thread info: num_elem::Integer, num_nodes::Integer, execution_order::AbstractArray{I}, @@ -114,24 +102,12 @@ function _launch_gpu_kernel!( gpu_kernel! = create_gpu_kernel(operators, Val(nuna), Val(nbin)) for launch in one(I):I(num_launches) #! format: off - if buffer isa CuArray - @cuda threads=num_threads blocks=num_blocks gpu_kernel!( - buffer, - launch, num_elem, num_nodes, execution_order, - cX, idx_self, idx_l, idx_r, - degree, constant, val, feature, op - ) - else - Threads.@threads for i in 1:(num_threads * num_blocks) - gpu_kernel!( - buffer, - launch, num_elem, num_nodes, execution_order, - cX, idx_self, idx_l, idx_r, - degree, constant, val, feature, op, - i - ) - end - end + gpu_kernel!(backend, num_threads * num_blocks)( + buffer, + launch, num_elem, num_nodes, execution_order, + cX, idx_self, idx_l, idx_r, + degree, constant, val, feature, op + ) #! format: on end return nothing @@ -146,19 +122,17 @@ end for nuna in 0:10, nbin in 0:10 @eval function create_gpu_kernel(operators::OperatorEnum, ::Val{$nuna}, ::Val{$nbin}) #! 
format: off - function ( + @kernel function k( # Storage: buffer, # Thread info: - launch::Integer, num_elem::Integer, num_nodes::Integer, execution_order::AbstractArray, + @Const(launch)::Integer, @Const(num_elem)::Integer, @Const(num_nodes)::Integer, @Const(execution_order)::AbstractArray{I}, # Input data and tree - cX::AbstractArray, idx_self::AbstractArray, idx_l::AbstractArray, idx_r::AbstractArray, - degree::AbstractArray, constant::AbstractArray, val::AbstractArray, feature::AbstractArray, op::AbstractArray, - # Override for unittesting: - i=nothing, + @Const(cX)::AbstractArray, @Const(idx_self)::AbstractArray, @Const(idx_l)::AbstractArray, @Const(idx_r)::AbstractArray, + @Const(degree)::AbstractArray, @Const(constant)::AbstractArray, @Const(val)::AbstractArray, @Const(feature)::AbstractArray, @Const(op)::AbstractArray, ) #! format: on - i = i === nothing ? (blockIdx().x - 1) * blockDim().x + threadIdx().x : i + i = @index(Global, Linear) if i > num_elem * num_nodes return nothing end @@ -186,8 +160,8 @@ for nuna in 0:10, nbin in 0:10 l_idx = idx_l[node] Base.Cartesian.@nif( $nuna, - i -> i == cur_op, - i -> let op = operators.unaops[i] + j -> j == cur_op, + j -> let op = operators.unaops[j] buffer[elem, cur_idx] = op(buffer[elem, l_idx]) end ) @@ -197,8 +171,8 @@ for nuna in 0:10, nbin in 0:10 r_idx = idx_r[node] Base.Cartesian.@nif( $nbin, - i -> i == cur_op, - i -> let op = operators.binops[i] + j -> j == cur_op, + j -> let op = operators.binops[j] buffer[elem, cur_idx] = op(buffer[elem, l_idx], buffer[elem, r_idx]) end ) diff --git a/src/DynamicExpressions.jl b/src/DynamicExpressions.jl index 7ab08835..d03bec08 100644 --- a/src/DynamicExpressions.jl +++ b/src/DynamicExpressions.jl @@ -48,7 +48,8 @@ import .EquationModule: constructorof, preserve_sharing eval_diff_tree_array, eval_grad_tree_array @reexport import .SimplifyEquationModule: combine_operators, simplify_tree! @reexport import .EvaluationHelpersModule -@reexport import .ExtensionInterfaceModule: node_to_symbolic, symbolic_to_node +@reexport import .ExtensionInterfaceModule: + node_to_symbolic, symbolic_to_node, gpu_eval_tree_array @reexport import .RandomModule: NodeSampler @reexport import .AsArrayModule: as_array diff --git a/src/ExtensionInterface.jl b/src/ExtensionInterface.jl index 521b0c88..48476819 100644 --- a/src/ExtensionInterface.jl +++ b/src/ExtensionInterface.jl @@ -14,6 +14,9 @@ end function bumper_eval_tree_array(args...) return error("Please load the Bumper.jl package to use this feature.") end +function gpu_eval_tree_array(args...) + return error("Please load a GPU backend such as CUDA.jl to use this feature.") +end function bumper_kern1! end function bumper_kern2! end From cb2d055423b415af9d2118bfcf2fc33ef67730c2 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Mon, 26 Feb 2024 01:15:37 +0000 Subject: [PATCH 36/47] Revert "Start switching to KernelAbstractions.jl" This reverts commit c1e579a95c5facefb06d5e11a3bf27ed94f703e4. 
--- Project.toml | 10 +-- ...onsExt.jl => DynamicExpressionsCUDAExt.jl} | 79 +++++++++++++------ src/DynamicExpressions.jl | 3 +- src/ExtensionInterface.jl | 3 - 4 files changed, 59 insertions(+), 36 deletions(-) rename ext/{DynamicExpressionsKernelAbstractionsExt.jl => DynamicExpressionsCUDAExt.jl} (69%) diff --git a/Project.toml b/Project.toml index 69d24f56..097e0fd7 100644 --- a/Project.toml +++ b/Project.toml @@ -14,7 +14,7 @@ TOML = "fa267f1f-6049-4f14-aa54-33bafae1ed76" [weakdeps] Bumper = "8ce10254-0962-460f-a3d8-1f77fea1446e" -KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890" Optim = "429524aa-4258-5aef-a3af-852621145aeb" SymbolicUtils = "d1185830-fcd6-423d-90d6-eec64667417b" @@ -22,7 +22,7 @@ Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" [extensions] DynamicExpressionsBumperExt = "Bumper" -DynamicExpressionsKernelAbstractionsExt = "KernelAbstractions" +DynamicExpressionsCUDAExt = "CUDA" DynamicExpressionsLoopVectorizationExt = "LoopVectorization" DynamicExpressionsOptimExt = "Optim" DynamicExpressionsSymbolicUtilsExt = "SymbolicUtils" @@ -31,7 +31,7 @@ DynamicExpressionsZygoteExt = "Zygote" [compat] Aqua = "0.7" Bumper = "0.6" -KernelAbstractions = "0.9" +CUDA = "4, 5" Compat = "3.37, 4" Enzyme = "^0.11.12" LoopVectorization = "0.12" @@ -47,9 +47,9 @@ julia = "1.6" [extras] Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" Bumper = "8ce10254-0962-460f-a3d8-1f77fea1446e" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9" ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" -KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890" Optim = "429524aa-4258-5aef-a3af-852621145aeb" @@ -61,4 +61,4 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" [targets] -test = ["Test", "SafeTestsets", "Aqua", "Bumper", "KernelAbstractions", "Enzyme", "ForwardDiff", "LinearAlgebra", "LoopVectorization", "Optim", "SpecialFunctions", "StaticArrays", "SymbolicUtils", "Zygote"] +test = ["Test", "SafeTestsets", "Aqua", "Bumper", "CUDA", "Enzyme", "ForwardDiff", "LinearAlgebra", "LoopVectorization", "Optim", "SpecialFunctions", "StaticArrays", "SymbolicUtils", "Zygote"] diff --git a/ext/DynamicExpressionsKernelAbstractionsExt.jl b/ext/DynamicExpressionsCUDAExt.jl similarity index 69% rename from ext/DynamicExpressionsKernelAbstractionsExt.jl rename to ext/DynamicExpressionsCUDAExt.jl index 454fdbdb..4cc381ca 100644 --- a/ext/DynamicExpressionsKernelAbstractionsExt.jl +++ b/ext/DynamicExpressionsCUDAExt.jl @@ -1,24 +1,38 @@ -module DynamicExpressionsKernelAbstractionsExt +module DynamicExpressionsCUDAExt -using KernelAbstractions: @index, @kernel, @Const, get_backend +# TODO: Switch to KernelAbstractions.jl (once they hit v1.0) +using CUDA: @cuda, CuArray, blockDim, blockIdx, threadIdx using DynamicExpressions: OperatorEnum, AbstractExpressionNode using DynamicExpressions.EvaluateEquationModule: get_nbin, get_nuna using DynamicExpressions.AsArrayModule: as_array -import DynamicExpressions.ExtensionInterfaceModule: gpu_eval_tree_array +import DynamicExpressions.EvaluateEquationModule: eval_tree_array -function gpu_eval_tree_array( - tree::AbstractExpressionNode{T}, gcX, operators::OperatorEnum; kws... 
+# array type for exclusively testing purposes +struct FakeCuArray{T,N,A<:AbstractArray{T,N}} <: AbstractArray{T,N} + a::A +end +Base.similar(x::FakeCuArray, dims::Integer...) = FakeCuArray(similar(x.a, dims...)) +Base.getindex(x::FakeCuArray, i::Int...) = getindex(x.a, i...) +Base.setindex!(x::FakeCuArray, v, i::Int...) = setindex!(x.a, v, i...) +Base.size(x::FakeCuArray) = size(x.a) + +const MaybeCuArray{T,N} = Union{CuArray{T,N},FakeCuArray{T,N}} + +to_device(a, ::CuArray) = CuArray(a) +to_device(a, ::FakeCuArray) = FakeCuArray(a) + +function eval_tree_array( + tree::AbstractExpressionNode{T}, gcX::MaybeCuArray{T,2}, operators::OperatorEnum; kws... ) where {T<:Number} - (outs, is_good) = gpu_eval_tree_array((tree,), gcX, operators; kws...) + (outs, is_good) = eval_tree_array((tree,), gcX, operators; kws...) return (only(outs), only(is_good)) end -function gpu_eval_tree_array( +function eval_tree_array( trees::Union{Tuple{N,Vararg{N}},AbstractVector{N}}, - gcX, + gcX::MaybeCuArray{T,2}, operators::OperatorEnum; - backend=get_backend(gcX), buffer=nothing, gpu_workspace=nothing, gpu_buffer=nothing, @@ -26,6 +40,7 @@ function gpu_eval_tree_array( num_nodes=nothing, num_launches=nothing, update_buffers::Val{_update_buffers}=Val(true), + kws..., ) where {T<:Number,N<:AbstractExpressionNode{T},_update_buffers} if _update_buffers (; val, roots, buffer, num_nodes, num_launches) = as_array(Int32, trees; buffer) @@ -68,7 +83,6 @@ function gpu_eval_tree_array( #! format: off _launch_gpu_kernel!( - backend, num_threads, num_blocks, num_launches, gworkspace, # Thread info: num_elem, num_nodes, gexecution_order, @@ -86,7 +100,6 @@ end #! format: off function _launch_gpu_kernel!( - backend, num_threads, num_blocks, num_launches::Integer, buffer::AbstractArray{T,2}, # Thread info: num_elem::Integer, num_nodes::Integer, execution_order::AbstractArray{I}, @@ -102,12 +115,24 @@ function _launch_gpu_kernel!( gpu_kernel! = create_gpu_kernel(operators, Val(nuna), Val(nbin)) for launch in one(I):I(num_launches) #! format: off - gpu_kernel!(backend, num_threads * num_blocks)( - buffer, - launch, num_elem, num_nodes, execution_order, - cX, idx_self, idx_l, idx_r, - degree, constant, val, feature, op - ) + if buffer isa CuArray + @cuda threads=num_threads blocks=num_blocks gpu_kernel!( + buffer, + launch, num_elem, num_nodes, execution_order, + cX, idx_self, idx_l, idx_r, + degree, constant, val, feature, op + ) + else + Threads.@threads for i in 1:(num_threads * num_blocks) + gpu_kernel!( + buffer, + launch, num_elem, num_nodes, execution_order, + cX, idx_self, idx_l, idx_r, + degree, constant, val, feature, op, + i + ) + end + end #! format: on end return nothing @@ -122,17 +147,19 @@ end for nuna in 0:10, nbin in 0:10 @eval function create_gpu_kernel(operators::OperatorEnum, ::Val{$nuna}, ::Val{$nbin}) #! 
format: off - @kernel function k( + function ( # Storage: buffer, # Thread info: - @Const(launch)::Integer, @Const(num_elem)::Integer, @Const(num_nodes)::Integer, @Const(execution_order)::AbstractArray{I}, + launch::Integer, num_elem::Integer, num_nodes::Integer, execution_order::AbstractArray, # Input data and tree - @Const(cX)::AbstractArray, @Const(idx_self)::AbstractArray, @Const(idx_l)::AbstractArray, @Const(idx_r)::AbstractArray, - @Const(degree)::AbstractArray, @Const(constant)::AbstractArray, @Const(val)::AbstractArray, @Const(feature)::AbstractArray, @Const(op)::AbstractArray, + cX::AbstractArray, idx_self::AbstractArray, idx_l::AbstractArray, idx_r::AbstractArray, + degree::AbstractArray, constant::AbstractArray, val::AbstractArray, feature::AbstractArray, op::AbstractArray, + # Override for unittesting: + i=nothing, ) #! format: on - i = @index(Global, Linear) + i = i === nothing ? (blockIdx().x - 1) * blockDim().x + threadIdx().x : i if i > num_elem * num_nodes return nothing end @@ -160,8 +187,8 @@ for nuna in 0:10, nbin in 0:10 l_idx = idx_l[node] Base.Cartesian.@nif( $nuna, - j -> j == cur_op, - j -> let op = operators.unaops[j] + i -> i == cur_op, + i -> let op = operators.unaops[i] buffer[elem, cur_idx] = op(buffer[elem, l_idx]) end ) @@ -171,8 +198,8 @@ for nuna in 0:10, nbin in 0:10 r_idx = idx_r[node] Base.Cartesian.@nif( $nbin, - j -> j == cur_op, - j -> let op = operators.binops[j] + i -> i == cur_op, + i -> let op = operators.binops[i] buffer[elem, cur_idx] = op(buffer[elem, l_idx], buffer[elem, r_idx]) end ) diff --git a/src/DynamicExpressions.jl b/src/DynamicExpressions.jl index d03bec08..7ab08835 100644 --- a/src/DynamicExpressions.jl +++ b/src/DynamicExpressions.jl @@ -48,8 +48,7 @@ import .EquationModule: constructorof, preserve_sharing eval_diff_tree_array, eval_grad_tree_array @reexport import .SimplifyEquationModule: combine_operators, simplify_tree! @reexport import .EvaluationHelpersModule -@reexport import .ExtensionInterfaceModule: - node_to_symbolic, symbolic_to_node, gpu_eval_tree_array +@reexport import .ExtensionInterfaceModule: node_to_symbolic, symbolic_to_node @reexport import .RandomModule: NodeSampler @reexport import .AsArrayModule: as_array diff --git a/src/ExtensionInterface.jl b/src/ExtensionInterface.jl index 48476819..521b0c88 100644 --- a/src/ExtensionInterface.jl +++ b/src/ExtensionInterface.jl @@ -14,9 +14,6 @@ end function bumper_eval_tree_array(args...) return error("Please load the Bumper.jl package to use this feature.") end -function gpu_eval_tree_array(args...) - return error("Please load a GPU backend such as CUDA.jl to use this feature.") -end function bumper_kern1! end function bumper_kern2! 
end From bfa9148f74e8866d1d8178d5857ee1cdf8af56cc Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Wed, 17 Jul 2024 18:49:54 -0400 Subject: [PATCH 37/47] fix: as array imports --- src/AsArray.jl | 2 +- test/Project.toml | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/AsArray.jl b/src/AsArray.jl index 75ff1ece..809638ca 100644 --- a/src/AsArray.jl +++ b/src/AsArray.jl @@ -1,6 +1,6 @@ module AsArrayModule -using ..EquationModule: AbstractExpressionNode, tree_mapreduce, count_nodes +using ..NodeModule: AbstractExpressionNode, tree_mapreduce, count_nodes function as_array( ::Type{I}, diff --git a/test/Project.toml b/test/Project.toml index 952e0370..3e47197d 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -1,6 +1,7 @@ [deps] Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" Bumper = "8ce10254-0962-460f-a3d8-1f77fea1446e" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" Compat = "34da2185-b29b-5c13-b0c7-acf172513d20" DifferentiationInterface = "a0c0ee7d-e4b9-4e03-894e-1c5f64a51d63" From 9f49619e6580536244c8b74bcadbdb1b2d237b1c Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Wed, 17 Jul 2024 19:48:41 -0400 Subject: [PATCH 38/47] hack: try implementing cooperative group --- ext/DynamicExpressionsCUDAExt.jl | 116 +++++++++++++++++-------------- 1 file changed, 62 insertions(+), 54 deletions(-) diff --git a/ext/DynamicExpressionsCUDAExt.jl b/ext/DynamicExpressionsCUDAExt.jl index 4cc381ca..2e1cb61e 100644 --- a/ext/DynamicExpressionsCUDAExt.jl +++ b/ext/DynamicExpressionsCUDAExt.jl @@ -1,7 +1,7 @@ module DynamicExpressionsCUDAExt # TODO: Switch to KernelAbstractions.jl (once they hit v1.0) -using CUDA: @cuda, CuArray, blockDim, blockIdx, threadIdx +using CUDA: @cuda, CuArray, blockDim, blockIdx, threadIdx, CG using DynamicExpressions: OperatorEnum, AbstractExpressionNode using DynamicExpressions.EvaluateEquationModule: get_nbin, get_nuna using DynamicExpressions.AsArrayModule: as_array @@ -78,7 +78,7 @@ function eval_tree_array( gidx_r = @view gbuffer[7, :] gconstant = @view gbuffer[8, :] - num_threads = 256 + num_threads = 1024 num_blocks = nextpow(2, ceil(Int, num_elem * num_nodes / num_threads)) #! format: off @@ -113,25 +113,23 @@ function _launch_gpu_kernel!( (nuna > 10 || nbin > 10) && error("Too many operators. Kernels are only compiled up to 10.") gpu_kernel! = create_gpu_kernel(operators, Val(nuna), Val(nbin)) - for launch in one(I):I(num_launches) - #! format: off - if buffer isa CuArray - @cuda threads=num_threads blocks=num_blocks gpu_kernel!( + #! format: off + if buffer isa CuArray + @cuda cooperative=true threads=num_threads blocks=num_blocks gpu_kernel!( + buffer, + num_launches, num_elem, num_nodes, execution_order, + cX, idx_self, idx_l, idx_r, + degree, constant, val, feature, op + ) + else + Threads.@threads for i in 1:(num_threads * num_blocks * num_launches) + gpu_kernel!( buffer, - launch, num_elem, num_nodes, execution_order, + num_launches, num_elem, num_nodes, execution_order, cX, idx_self, idx_l, idx_r, - degree, constant, val, feature, op + degree, constant, val, feature, op, + i ) - else - Threads.@threads for i in 1:(num_threads * num_blocks) - gpu_kernel!( - buffer, - launch, num_elem, num_nodes, execution_order, - cX, idx_self, idx_l, idx_r, - degree, constant, val, feature, op, - i - ) - end end #! 
format: on end @@ -151,7 +149,7 @@ for nuna in 0:10, nbin in 0:10 # Storage: buffer, # Thread info: - launch::Integer, num_elem::Integer, num_nodes::Integer, execution_order::AbstractArray, + num_launches::Integer, num_elem::Integer, num_nodes::Integer, execution_order::AbstractArray, # Input data and tree cX::AbstractArray, idx_self::AbstractArray, idx_l::AbstractArray, idx_r::AbstractArray, degree::AbstractArray, constant::AbstractArray, val::AbstractArray, feature::AbstractArray, op::AbstractArray, @@ -163,46 +161,56 @@ for nuna in 0:10, nbin in 0:10 if i > num_elem * num_nodes return nothing end - + # node = (i - 1) % num_nodes + 1 elem = (i - node) ÷ num_nodes + 1 + grid_group = CG.this_grid() - if execution_order[node] != launch - return nothing - end + for launch in 1:num_launches + if launch > 1 + # TODO: Investigate whether synchronizing within + # group instead of whole grid is better. + CG.sync(grid_group) + end + if execution_order[node] != launch + continue + end - cur_degree = degree[node] - cur_idx = idx_self[node] - if cur_degree == 0 - if constant[node] == 1 - cur_val = val[node] - buffer[elem, cur_idx] = cur_val + cur_degree = degree[node] + cur_idx = idx_self[node] + if cur_degree == 0 + if constant[node] == 1 + cur_val = val[node] + buffer[elem, cur_idx] = cur_val + else + cur_feature = feature[node] + buffer[elem, cur_idx] = cX[cur_feature, elem] + end else - cur_feature = feature[node] - buffer[elem, cur_idx] = cX[cur_feature, elem] - end - else - if cur_degree == 1 && $nuna > 0 - cur_op = op[node] - l_idx = idx_l[node] - Base.Cartesian.@nif( - $nuna, - i -> i == cur_op, - i -> let op = operators.unaops[i] - buffer[elem, cur_idx] = op(buffer[elem, l_idx]) - end - ) - elseif $nbin > 0 # Note this check is to avoid type inference issues when binops is empty - cur_op = op[node] - l_idx = idx_l[node] - r_idx = idx_r[node] - Base.Cartesian.@nif( - $nbin, - i -> i == cur_op, - i -> let op = operators.binops[i] - buffer[elem, cur_idx] = op(buffer[elem, l_idx], buffer[elem, r_idx]) - end - ) + if cur_degree == 1 && $nuna > 0 + cur_op = op[node] + l_idx = idx_l[node] + Base.Cartesian.@nif( + $nuna, + i -> i == cur_op, + i -> let op = operators.unaops[i] + buffer[elem, cur_idx] = op(buffer[elem, l_idx]) + end + ) + elseif $nbin > 0 # Note this check is to avoid type inference issues when binops is empty + cur_op = op[node] + l_idx = idx_l[node] + r_idx = idx_r[node] + Base.Cartesian.@nif( + $nbin, + i -> i == cur_op, + i -> let op = operators.binops[i] + buffer[elem, cur_idx] = op( + buffer[elem, l_idx], buffer[elem, r_idx] + ) + end + ) + end end end return nothing From 1422e363e97f0085ed14cab259b0dc755dbadb24 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Fri, 29 Nov 2024 14:07:21 +0000 Subject: [PATCH 39/47] refactor: clean up extensions --- ext/DynamicExpressionsCUDAExt.jl | 6 +----- src/AsArray.jl | 13 ++++++++----- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/ext/DynamicExpressionsCUDAExt.jl b/ext/DynamicExpressionsCUDAExt.jl index 2e1cb61e..dbdc0b39 100644 --- a/ext/DynamicExpressionsCUDAExt.jl +++ b/ext/DynamicExpressionsCUDAExt.jl @@ -51,11 +51,7 @@ function eval_tree_array( ## the GPU kernel, with size equal to the number of rows ## in the input data by the number of nodes in the tree. ## It has one extra row to store the constant values. 
- gworkspace = if gpu_workspace === nothing - similar(gcX, num_elem + 1, num_nodes) - else - gpu_workspace - end + gworkspace = @something(gpu_workspace, similar(gcX, num_elem + 1, num_nodes)) gval = @view gworkspace[end, :] if _update_buffers copyto!(gval, val) diff --git a/src/AsArray.jl b/src/AsArray.jl index 809638ca..83d32377 100644 --- a/src/AsArray.jl +++ b/src/AsArray.jl @@ -2,14 +2,17 @@ module AsArrayModule using ..NodeModule: AbstractExpressionNode, tree_mapreduce, count_nodes +function as_array( + ::Type{I}, trees::N; buffer=nothing +) where {T,N<:AbstractExpressionNode{T},I} + return as_array(I, (trees,); buffer=buffer) +end + function as_array( ::Type{I}, - trees::Union{N,Tuple{N,Vararg{N}},AbstractVector{N}}; + trees::Union{Tuple{N,Vararg{N}},AbstractVector{N}}; buffer::Union{AbstractArray,Nothing}=nothing, ) where {T,N<:AbstractExpressionNode{T},I} - if trees isa N - return as_array(I, (trees,); buffer=buffer) - end each_num_nodes = (t -> count_nodes(t; break_sharing=Val(true))).(trees) num_nodes = sum(each_num_nodes) @@ -25,7 +28,7 @@ function as_array( val = Array{T}(undef, num_nodes) ## Views of the same matrix: - buffer = buffer === nothing ? Array{I}(undef, 8, num_nodes) : buffer + buffer = @something(buffer, Array{I}(undef, 8, num_nodes)) degree = @view buffer[1, :] feature = @view buffer[2, :] op = @view buffer[3, :] From 74afd727d7df168904bc8e4b0e7736455633343f Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 15 Dec 2024 15:07:41 -0800 Subject: [PATCH 40/47] Revert "hack: try implementing cooperative group" This reverts commit 9f49619e6580536244c8b74bcadbdb1b2d237b1c. --- ext/DynamicExpressionsCUDAExt.jl | 116 ++++++++++++++----------------- 1 file changed, 54 insertions(+), 62 deletions(-) diff --git a/ext/DynamicExpressionsCUDAExt.jl b/ext/DynamicExpressionsCUDAExt.jl index dbdc0b39..6c89e333 100644 --- a/ext/DynamicExpressionsCUDAExt.jl +++ b/ext/DynamicExpressionsCUDAExt.jl @@ -1,7 +1,7 @@ module DynamicExpressionsCUDAExt # TODO: Switch to KernelAbstractions.jl (once they hit v1.0) -using CUDA: @cuda, CuArray, blockDim, blockIdx, threadIdx, CG +using CUDA: @cuda, CuArray, blockDim, blockIdx, threadIdx using DynamicExpressions: OperatorEnum, AbstractExpressionNode using DynamicExpressions.EvaluateEquationModule: get_nbin, get_nuna using DynamicExpressions.AsArrayModule: as_array @@ -74,7 +74,7 @@ function eval_tree_array( gidx_r = @view gbuffer[7, :] gconstant = @view gbuffer[8, :] - num_threads = 1024 + num_threads = 256 num_blocks = nextpow(2, ceil(Int, num_elem * num_nodes / num_threads)) #! format: off @@ -109,23 +109,25 @@ function _launch_gpu_kernel!( (nuna > 10 || nbin > 10) && error("Too many operators. Kernels are only compiled up to 10.") gpu_kernel! = create_gpu_kernel(operators, Val(nuna), Val(nbin)) - #! format: off - if buffer isa CuArray - @cuda cooperative=true threads=num_threads blocks=num_blocks gpu_kernel!( - buffer, - num_launches, num_elem, num_nodes, execution_order, - cX, idx_self, idx_l, idx_r, - degree, constant, val, feature, op - ) - else - Threads.@threads for i in 1:(num_threads * num_blocks * num_launches) - gpu_kernel!( + for launch in one(I):I(num_launches) + #! 
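# ---- [Illustrative sketch by the editor, not part of the patch] ----
# PATCH 39 above replaces the `x === nothing ? default : x` branches with Base's
# `@something`, which returns the first non-`nothing` argument and evaluates later
# arguments lazily, so the fallback allocation only runs when no buffer was passed:
buffer = nothing
ws1 = @something(buffer, zeros(Int32, 8, 16))    # allocates: buffer is nothing
buffer2 = ones(Int32, 8, 16)
ws2 = @something(buffer2, zeros(Int32, 8, 16))   # reuses buffer2; fallback never evaluated
# ---- [end of editor sketch] ----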
format: off + if buffer isa CuArray + @cuda threads=num_threads blocks=num_blocks gpu_kernel!( buffer, - num_launches, num_elem, num_nodes, execution_order, + launch, num_elem, num_nodes, execution_order, cX, idx_self, idx_l, idx_r, - degree, constant, val, feature, op, - i + degree, constant, val, feature, op ) + else + Threads.@threads for i in 1:(num_threads * num_blocks) + gpu_kernel!( + buffer, + launch, num_elem, num_nodes, execution_order, + cX, idx_self, idx_l, idx_r, + degree, constant, val, feature, op, + i + ) + end end #! format: on end @@ -145,7 +147,7 @@ for nuna in 0:10, nbin in 0:10 # Storage: buffer, # Thread info: - num_launches::Integer, num_elem::Integer, num_nodes::Integer, execution_order::AbstractArray, + launch::Integer, num_elem::Integer, num_nodes::Integer, execution_order::AbstractArray, # Input data and tree cX::AbstractArray, idx_self::AbstractArray, idx_l::AbstractArray, idx_r::AbstractArray, degree::AbstractArray, constant::AbstractArray, val::AbstractArray, feature::AbstractArray, op::AbstractArray, @@ -157,56 +159,46 @@ for nuna in 0:10, nbin in 0:10 if i > num_elem * num_nodes return nothing end - # + node = (i - 1) % num_nodes + 1 elem = (i - node) ÷ num_nodes + 1 - grid_group = CG.this_grid() - for launch in 1:num_launches - if launch > 1 - # TODO: Investigate whether synchronizing within - # group instead of whole grid is better. - CG.sync(grid_group) - end - if execution_order[node] != launch - continue - end + if execution_order[node] != launch + return nothing + end - cur_degree = degree[node] - cur_idx = idx_self[node] - if cur_degree == 0 - if constant[node] == 1 - cur_val = val[node] - buffer[elem, cur_idx] = cur_val - else - cur_feature = feature[node] - buffer[elem, cur_idx] = cX[cur_feature, elem] - end + cur_degree = degree[node] + cur_idx = idx_self[node] + if cur_degree == 0 + if constant[node] == 1 + cur_val = val[node] + buffer[elem, cur_idx] = cur_val else - if cur_degree == 1 && $nuna > 0 - cur_op = op[node] - l_idx = idx_l[node] - Base.Cartesian.@nif( - $nuna, - i -> i == cur_op, - i -> let op = operators.unaops[i] - buffer[elem, cur_idx] = op(buffer[elem, l_idx]) - end - ) - elseif $nbin > 0 # Note this check is to avoid type inference issues when binops is empty - cur_op = op[node] - l_idx = idx_l[node] - r_idx = idx_r[node] - Base.Cartesian.@nif( - $nbin, - i -> i == cur_op, - i -> let op = operators.binops[i] - buffer[elem, cur_idx] = op( - buffer[elem, l_idx], buffer[elem, r_idx] - ) - end - ) - end + cur_feature = feature[node] + buffer[elem, cur_idx] = cX[cur_feature, elem] + end + else + if cur_degree == 1 && $nuna > 0 + cur_op = op[node] + l_idx = idx_l[node] + Base.Cartesian.@nif( + $nuna, + i -> i == cur_op, + i -> let op = operators.unaops[i] + buffer[elem, cur_idx] = op(buffer[elem, l_idx]) + end + ) + elseif $nbin > 0 # Note this check is to avoid type inference issues when binops is empty + cur_op = op[node] + l_idx = idx_l[node] + r_idx = idx_r[node] + Base.Cartesian.@nif( + $nbin, + i -> i == cur_op, + i -> let op = operators.binops[i] + buffer[elem, cur_idx] = op(buffer[elem, l_idx], buffer[elem, r_idx]) + end + ) end end return nothing From 38c589b3e29de39c6df66fca025c4e29ed3f94bc Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 15 Dec 2024 15:15:04 -0800 Subject: [PATCH 41/47] refactor: clean up `map` operations --- ext/DynamicExpressionsCUDAExt.jl | 4 ++-- src/AsArray.jl | 2 +- src/Evaluate.jl | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/ext/DynamicExpressionsCUDAExt.jl 
b/ext/DynamicExpressionsCUDAExt.jl index 6c89e333..ddb1c3d9 100644 --- a/ext/DynamicExpressionsCUDAExt.jl +++ b/ext/DynamicExpressionsCUDAExt.jl @@ -88,8 +88,8 @@ function eval_tree_array( ) #! format: on - out = (r -> @view(gworkspace[begin:(end - 1), r])).(roots) - is_good = (_ -> true).(trees) + out = map(r -> @view(gworkspace[begin:(end - 1), r]), roots) + is_good = map(Returns(true), trees) return (out, is_good) end diff --git a/src/AsArray.jl b/src/AsArray.jl index 83d32377..410147e6 100644 --- a/src/AsArray.jl +++ b/src/AsArray.jl @@ -13,7 +13,7 @@ function as_array( trees::Union{Tuple{N,Vararg{N}},AbstractVector{N}}; buffer::Union{AbstractArray,Nothing}=nothing, ) where {T,N<:AbstractExpressionNode{T},I} - each_num_nodes = (t -> count_nodes(t; break_sharing=Val(true))).(trees) + each_num_nodes = map(t -> count_nodes(t; break_sharing=Val(true)), trees) num_nodes = sum(each_num_nodes) # Want `roots` to be tuple if `trees` is tuple and similar for vector diff --git a/src/Evaluate.jl b/src/Evaluate.jl index 569b1767..0c36947a 100644 --- a/src/Evaluate.jl +++ b/src/Evaluate.jl @@ -239,8 +239,8 @@ function eval_tree_array( operators::OperatorEnum; kws..., ) where {T<:Number,N<:AbstractExpressionNode{T}} - outs = (t -> eval_tree_array(t, cX, operators; kws...)).(trees) - return first.(outs), last.(outs) + outs = map(t -> eval_tree_array(t, cX, operators; kws...), trees) + return map(first, outs), map(last, outs) end # These are marked unstable due to issues discussed on From 4cce551a57507b2dc6909e3f69b0813e3341a6b7 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 15 Dec 2024 15:35:05 -0800 Subject: [PATCH 42/47] fix: stabilize CUDA ext --- ext/DynamicExpressionsCUDAExt.jl | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/ext/DynamicExpressionsCUDAExt.jl b/ext/DynamicExpressionsCUDAExt.jl index ddb1c3d9..9c10ebbf 100644 --- a/ext/DynamicExpressionsCUDAExt.jl +++ b/ext/DynamicExpressionsCUDAExt.jl @@ -5,6 +5,7 @@ using CUDA: @cuda, CuArray, blockDim, blockIdx, threadIdx using DynamicExpressions: OperatorEnum, AbstractExpressionNode using DynamicExpressions.EvaluateEquationModule: get_nbin, get_nuna using DynamicExpressions.AsArrayModule: as_array +using DispatchDoctor: @stable import DynamicExpressions.EvaluateEquationModule: eval_tree_array @@ -19,17 +20,19 @@ Base.size(x::FakeCuArray) = size(x.a) const MaybeCuArray{T,N} = Union{CuArray{T,N},FakeCuArray{T,N}} -to_device(a, ::CuArray) = CuArray(a) -to_device(a, ::FakeCuArray) = FakeCuArray(a) +@stable default_mode = "disable" begin + to_device(a, ::CuArray) = CuArray(a) + to_device(a, ::FakeCuArray) = FakeCuArray(a) +end -function eval_tree_array( +@stable default_mode = "disable" function eval_tree_array( tree::AbstractExpressionNode{T}, gcX::MaybeCuArray{T,2}, operators::OperatorEnum; kws... ) where {T<:Number} (outs, is_good) = eval_tree_array((tree,), gcX, operators; kws...) return (only(outs), only(is_good)) end -function eval_tree_array( +@stable default_mode = "disable" function eval_tree_array( trees::Union{Tuple{N,Vararg{N}},AbstractVector{N}}, gcX::MaybeCuArray{T,2}, operators::OperatorEnum; @@ -95,7 +98,7 @@ function eval_tree_array( end #! 
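# ---- [Illustrative sketch by the editor, not part of the patch] ----
# The `map` refactor in PATCH 41 above leans on `map` preserving the container type:
# a Tuple of trees stays a Tuple (good for inference) and a Vector stays a Vector,
# while `Returns(true)` is the purpose-built stand-in for the old `_ -> true` closure.
trees = (:a, :b, :c)
map(Returns(true), trees)           # (true, true, true) -- still a Tuple
outs = map(t -> (t, 1), trees)
map(first, outs), map(last, outs)   # ((:a, :b, :c), (1, 1, 1))
# ---- [end of editor sketch] ----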
format: off -function _launch_gpu_kernel!( +@stable default_mode = "disable" function _launch_gpu_kernel!( num_threads, num_blocks, num_launches::Integer, buffer::AbstractArray{T,2}, # Thread info: num_elem::Integer, num_nodes::Integer, execution_order::AbstractArray{I}, @@ -141,7 +144,9 @@ end # ifs to generate at that time, so we can't simply use specialization. # 3. We can't use `@generated` because we can't create closures in those. for nuna in 0:10, nbin in 0:10 - @eval function create_gpu_kernel(operators::OperatorEnum, ::Val{$nuna}, ::Val{$nbin}) + @eval @stable default_mode = "disable" function create_gpu_kernel( + operators::OperatorEnum, ::Val{$nuna}, ::Val{$nbin} + ) #! format: off function ( # Storage: From b7ecdd3b7e24f1166a68bdfe0b966fbfe23610c3 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 15 Dec 2024 15:37:04 -0800 Subject: [PATCH 43/47] test: modularize CUDA test --- test/test_cuda.jl | 283 ++++++++++++++++++++++++++++------------------ test/unittest.jl | 7 +- 2 files changed, 174 insertions(+), 116 deletions(-) diff --git a/test/test_cuda.jl b/test/test_cuda.jl index 2d46bce0..b281c024 100644 --- a/test/test_cuda.jl +++ b/test/test_cuda.jl @@ -1,126 +1,189 @@ -using DynamicExpressions -using DynamicExpressions.AsArrayModule: as_array -using CUDA -using Random +@testitem "Random Evals: Single Tree" begin + using DynamicExpressions, CUDA, Random + using DynamicExpressions.AsArrayModule: as_array -ext = Base.get_extension(DynamicExpressions, :DynamicExpressionsCUDAExt) -const FakeCuArray = ext.FakeCuArray + include("tree_gen_utils.jl") -include("tree_gen_utils.jl") + safe_sin(x) = isfinite(x) ? sin(x) : convert(eltype(x), NaN) + safe_cos(x) = isfinite(x) ? cos(x) : convert(eltype(x), NaN) -safe_sin(x) = isfinite(x) ? sin(x) : convert(eltype(x), NaN) -safe_cos(x) = isfinite(x) ? cos(x) : convert(eltype(x), NaN) + ext = Base.get_extension(DynamicExpressions, :DynamicExpressionsCUDAExt) + const FakeCuArray = ext.FakeCuArray -@testset "Random evals" begin - let - operators = OperatorEnum(; - binary_operators=[+, -, *, /], unary_operators=[safe_sin, safe_cos] - ) - x1, x2, x3 = (i -> Node(Float64; feature=i)).(1:3) + operators = OperatorEnum(; + binary_operators=[+, -, *, /], unary_operators=[safe_sin, safe_cos] + ) - for T in (Float32, Float64, ComplexF64), ntrees in (1, 2, 3), seed in 0:10 + for T in (Float32, Float64, ComplexF64) + for seed in 0:10 Random.seed!(seed) - nrow = rand(10:30) - nnodes = rand(10:25, ntrees) - use_tuple = rand(Bool) - - buffer = rand(Bool) ? ones(Int32, 8, sum(nnodes)) : nothing - gpu_buffer = rand(Bool) ? FakeCuArray(ones(Int32, 8, sum(nnodes))) : nothing - gpu_workspace = - rand(Bool) ? FakeCuArray(ones(T, nrow + 1, sum(nnodes))) : nothing - - trees = ntuple( - i -> gen_random_tree_fixed_size(nnodes[i], operators, 3, T), ntrees - ) - trees = use_tuple ? 
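# ---- [Illustrative sketch by the editor, not part of the patch] ----
# How the `@stable` annotations added in PATCH 42 behave: DispatchDoctor wraps the
# definition so that, when checking is enabled, a call whose return type is not
# concretely inferable raises an error; `default_mode = "disable"` keeps the check
# off unless the user opts in, so the GPU paths pay no cost by default. (Option
# names are taken from the patch itself; treat the details as a sketch.)
using DispatchDoctor: @stable

@stable default_mode = "disable" function relu(x)
    return x > 0 ? x : zero(x)
end

relu(-2.0)   # 0.0 -- concretely inferred, fine under any mode
# ---- [end of editor sketch] ----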
trees : collect(trees) + nnodes = rand(10:25) + tree = gen_random_tree_fixed_size(nnodes, operators, 3, T) X = randn(T, 3, nrow) - if ntrees > 1 - y, completed = @inferred eval_tree_array(trees, X, operators) - gpu_y, gpu_completed = @inferred eval_tree_array( - trees, FakeCuArray(X), operators; buffer, gpu_workspace, gpu_buffer - ) - - # Should give same result either way - for i in eachindex(completed, gpu_completed) - if completed[i] - @test y[i] ≈ gpu_y[i] - end - end - - # Should return same type as input - if use_tuple - @test y isa Tuple - @test gpu_y isa Tuple - else - @test y isa Vector - @test gpu_y isa Vector - end - else - y, completed = @inferred eval_tree_array(only(trees), X, operators) - gpu_y, gpu_completed = @inferred eval_tree_array( - only(trees), FakeCuArray(X), operators - ) - if completed - @test y ≈ gpu_y - end + + y, completed = eval_tree_array(tree, X, operators) + y_gpu, completed_gpu = eval_tree_array(tree, FakeCuArray(X), operators) + + # TODO: Fix this + # @test completed == completed_gpu + if completed + @test y ≈ y_gpu end end end end -@testset "Evaluation on pre-computed buffers" begin - let - operators = OperatorEnum(; - binary_operators=[+, -, *, /], unary_operators=[sin, cos] - ) - x1, x2, x3 = (i -> Node(Float64; feature=i)).(1:3) - Random.seed!(0) - tree = sin(x1 * 3.1 - x3 * 0.9 + 0.2) * x2 - x3 * x3 * 1.5 - X = randn(Float64, 3, 100) - - y1, _ = eval_tree_array(tree, X, operators) - y2, _ = eval_tree_array(tree, FakeCuArray(X), operators) - - @test y1 ≈ y2 - - (; val, roots, buffer, num_nodes, num_launches) = as_array(Int32, tree) - gpu_buffer = FakeCuArray(buffer) - gpu_workspace = FakeCuArray(zeros(Float64, 101, 50)) - copyto!((@view gpu_workspace[end, :]), val) - - # Now, with all buffers: - y3, _ = eval_tree_array( - tree, - FakeCuArray(X), - operators; - gpu_workspace, - gpu_buffer, - roots, - num_nodes, - num_launches, - update_buffers=Val(false), - ) - @test y1 ≈ y3 - - # Should be able to shift some of the values in this buffer: - i = findfirst(gpu_workspace[end, :] .== 0.9) - gpu_workspace[end, i] = 0.8 - - # And get the updated results: - tree_prime = sin(x1 * 3.1 - x3 * 0.8 + 0.2) * x2 - x3 * x3 * 1.5 - y1_prime, _ = eval_tree_array(tree_prime, X, operators) - y3_prime, _ = eval_tree_array( - x1, # Doesn't matter what we put here - FakeCuArray(X), - operators; - gpu_workspace, - gpu_buffer, - roots, - num_nodes, - num_launches, - update_buffers=Val(false), +@testitem "Random Evals: Multiple Trees" begin + using DynamicExpressions, CUDA, Random + using DynamicExpressions.AsArrayModule: as_array + + include("tree_gen_utils.jl") + + safe_sin(x) = isfinite(x) ? sin(x) : convert(eltype(x), NaN) + safe_cos(x) = isfinite(x) ? cos(x) : convert(eltype(x), NaN) + + operators = OperatorEnum(; + binary_operators=[+, -, *, /], unary_operators=[safe_sin, safe_cos] + ) + + ext = Base.get_extension(DynamicExpressions, :DynamicExpressionsCUDAExt) + const FakeCuArray = ext.FakeCuArray + + for T in (Float32, Float64, ComplexF64), ntrees in (2, 3), seed in 0:10 + Random.seed!(seed) + + nrow = rand(10:30) + nnodes = rand(10:25, ntrees) + use_tuple = rand(Bool) + + buffer = rand(Bool) ? ones(Int32, 8, sum(nnodes)) : nothing + gpu_buffer = rand(Bool) ? FakeCuArray(ones(Int32, 8, sum(nnodes))) : nothing + gpu_workspace = rand(Bool) ? FakeCuArray(ones(T, nrow + 1, sum(nnodes))) : nothing + + trees = ntuple(i -> gen_random_tree_fixed_size(nnodes[i], operators, 3, T), ntrees) + trees = use_tuple ? 
trees : collect(trees) + + X = randn(T, 3, nrow) + + y, completed = eval_tree_array(trees, X, operators) + gpu_y, gpu_completed = eval_tree_array( + trees, FakeCuArray(X), operators; buffer, gpu_workspace, gpu_buffer ) - @test y1_prime ≈ y3_prime + + # TODO: Fix this + # @test completed == gpu_completed + + for i in eachindex(completed, gpu_completed) + if completed[i] + @test y[i] ≈ gpu_y[i] + end + end + + # Check return type matches input type (tuple or vector) + if use_tuple + @test y isa Tuple + @test gpu_y isa Tuple + else + @test y isa Vector + @test gpu_y isa Vector + end end end + +@testitem "Pre-Computed Buffers: Basic Equivalence" begin + using DynamicExpressions, CUDA, Random + using DynamicExpressions.AsArrayModule: as_array + + ext = Base.get_extension(DynamicExpressions, :DynamicExpressionsCUDAExt) + const FakeCuArray = ext.FakeCuArray + + # No random trees here, we define a fixed tree + x1, x2, x3 = (i -> Node(Float64; feature=i)).(1:3) + operators = OperatorEnum(; binary_operators=[+, -, *, /], unary_operators=[sin, cos]) + + Random.seed!(0) + tree = sin(x1 * 3.1 - x3 * 0.9 + 0.2) * x2 - x3 * x3 * 1.5 + X = randn(Float64, 3, 100) + + y1, _ = eval_tree_array(tree, X, operators) + y2, _ = eval_tree_array(tree, FakeCuArray(X), operators) + @test y1 ≈ y2 +end + +@testitem "Pre-Computed Buffers: Using Provided Buffers" begin + using DynamicExpressions, CUDA, Random + using DynamicExpressions.AsArrayModule: as_array + + ext = Base.get_extension(DynamicExpressions, :DynamicExpressionsCUDAExt) + const FakeCuArray = ext.FakeCuArray + + x1, x2, x3 = (i -> Node(Float64; feature=i)).(1:3) + operators = OperatorEnum(; binary_operators=[+, -, *, /], unary_operators=[sin, cos]) + + Random.seed!(0) + tree = sin(x1 * 3.1 - x3 * 0.9 + 0.2) * x2 - x3 * x3 * 1.5 + X = randn(Float64, 3, 100) + + y1, _ = eval_tree_array(tree, X, operators) + + # Extract arrays + (; val, roots, buffer, num_nodes, num_launches) = as_array(Int32, tree) + gpu_buffer = FakeCuArray(buffer) + gpu_workspace = FakeCuArray(zeros(Float64, size(X, 2) + 1, num_nodes)) + copyto!((@view gpu_workspace[end, :]), val) + + y3, _ = eval_tree_array( + tree, + FakeCuArray(X), + operators; + gpu_workspace, + gpu_buffer, + roots, + num_nodes, + num_launches, + update_buffers=Val(false), + ) + @test y1 ≈ y3 +end + +@testitem "Pre-Computed Buffers: Modified Values" begin + using DynamicExpressions, CUDA, Random + using DynamicExpressions.AsArrayModule: as_array + + ext = Base.get_extension(DynamicExpressions, :DynamicExpressionsCUDAExt) + const FakeCuArray = ext.FakeCuArray + + x1, x2, x3 = (i -> Node(Float64; feature=i)).(1:3) + operators = OperatorEnum(; binary_operators=[+, -, *, /], unary_operators=[sin, cos]) + + Random.seed!(0) + tree = sin(x1 * 3.1 - x3 * 0.9 + 0.2) * x2 - x3 * x3 * 1.5 + X = randn(Float64, 3, 100) + + y1, _ = eval_tree_array(tree, X, operators) + + (; val, roots, buffer, num_nodes, num_launches) = as_array(Int32, tree) + gpu_buffer = FakeCuArray(buffer) + gpu_workspace = FakeCuArray(zeros(Float64, size(X, 2) + 1, num_nodes)) + gpu_workspace[end, :] .= val + + # Change a constant (0.9 to 0.8) + i = findfirst(gpu_workspace[end, :] .== 0.9) + gpu_workspace[end, i] = 0.8 + + tree_prime = sin(x1 * 3.1 - x3 * 0.8 + 0.2) * x2 - x3 * x3 * 1.5 + y1_prime, _ = eval_tree_array(tree_prime, X, operators) + y3_prime, _ = eval_tree_array( + x1, # dummy tree + FakeCuArray(X), + operators; + gpu_workspace, + gpu_buffer, + roots, + num_nodes, + num_launches, + update_buffers=Val(false), + ) + @test y1_prime ≈ y3_prime +end diff 
--git a/test/unittest.jl b/test/unittest.jl index 57015a89..4685c119 100644 --- a/test/unittest.jl +++ b/test/unittest.jl @@ -123,12 +123,6 @@ end include("test_random.jl") end -@testitem "Test CUDA" begin - if VERSION >= v"1.9" - include("test_cuda.jl") - end -end - include("test_expressions.jl") include("test_multi_expression.jl") include("test_parse.jl") @@ -139,3 +133,4 @@ include("test_expression_math.jl") include("test_structured_expression.jl") include("test_readonlynode.jl") include("test_zygote_gradient_wrapper.jl") +include("test_cuda.jl") From 8fe652cd6a9f8127a3b238c17c1b7904252ae75c Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 15 Dec 2024 15:45:15 -0800 Subject: [PATCH 44/47] fix: potential type instability in as_array --- src/AsArray.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/AsArray.jl b/src/AsArray.jl index 410147e6..040766f4 100644 --- a/src/AsArray.jl +++ b/src/AsArray.jl @@ -39,7 +39,7 @@ function as_array( constant = @view buffer[8, :] cursor = Ref(zero(I)) - num_launches = zero(I) + num_launches = Ref(zero(I)) for (root, tree) in zip(roots, trees) @assert root == cursor[] + 1 tree_mapreduce( @@ -78,8 +78,8 @@ function as_array( execution_order[parent.id] = parent_execution_order # Global number of launches equal to maximum execution order - if parent_execution_order > num_launches - num_launches = parent_execution_order + if parent_execution_order > num_launches[] + num_launches[] = parent_execution_order end (id=parent.id, order=parent_execution_order) @@ -96,7 +96,7 @@ function as_array( feature, op, execution_order, - num_launches, + num_launches=num_launches[], idx_self, idx_l, idx_r, From 1d06bd0ae178bc411c3ef548cc557071fb68f68b Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 15 Dec 2024 17:43:32 -0800 Subject: [PATCH 45/47] refactor: as array into modular functions --- Project.toml | 2 + ext/DynamicExpressionsCUDAExt.jl | 30 +++-- src/AsArray.jl | 217 ++++++++++++++++++++----------- 3 files changed, 159 insertions(+), 90 deletions(-) diff --git a/Project.toml b/Project.toml index 892b8c71..63b5bbee 100644 --- a/Project.toml +++ b/Project.toml @@ -5,6 +5,7 @@ version = "1.8.0" [deps] ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" +Compat = "34da2185-b29b-5c13-b0c7-acf172513d20" DispatchDoctor = "8d63f2c5-f18a-4cf2-ba9d-b3f60fc568c8" Interfaces = "85a1e053-f937-4924-92a5-1367d23b7b87" MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" @@ -33,6 +34,7 @@ DynamicExpressionsZygoteExt = "Zygote" Bumper = "0.6" CUDA = "4, 5" ChainRulesCore = "1" +Compat = "4.16" DispatchDoctor = "0.4" Interfaces = "0.3" LoopVectorization = "0.12" diff --git a/ext/DynamicExpressionsCUDAExt.jl b/ext/DynamicExpressionsCUDAExt.jl index 9c10ebbf..3127e260 100644 --- a/ext/DynamicExpressionsCUDAExt.jl +++ b/ext/DynamicExpressionsCUDAExt.jl @@ -3,11 +3,11 @@ module DynamicExpressionsCUDAExt # TODO: Switch to KernelAbstractions.jl (once they hit v1.0) using CUDA: @cuda, CuArray, blockDim, blockIdx, threadIdx using DynamicExpressions: OperatorEnum, AbstractExpressionNode -using DynamicExpressions.EvaluateEquationModule: get_nbin, get_nuna +using DynamicExpressions.EvaluateModule: get_nbin, get_nuna using DynamicExpressions.AsArrayModule: as_array using DispatchDoctor: @stable -import DynamicExpressions.EvaluateEquationModule: eval_tree_array +import DynamicExpressions.EvaluateModule: eval_tree_array # array type for exclusively testing purposes struct FakeCuArray{T,N,A<:AbstractArray{T,N}} <: AbstractArray{T,N} @@ -45,11 +45,15 @@ 
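# ---- [Illustrative sketch by the editor, not part of the patch] ----
# Why PATCH 44 turns `num_launches` into a `Ref`: a local that is *reassigned* inside
# a closure (here, the reducer passed to `tree_mapreduce`) gets boxed and loses its
# concrete type; mutating a `Ref` instead keeps everything concretely typed.
function count_big(xs)
    counter = Ref(0)   # rather than `counter = 0` reassigned inside the closure
    foreach(x -> x > 10 && (counter[] += 1), xs)
    return counter[]
end

count_big([3, 14, 15, 9, 26])   # 3
# ---- [end of editor sketch] ----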
end update_buffers::Val{_update_buffers}=Val(true), kws..., ) where {T<:Number,N<:AbstractExpressionNode{T},_update_buffers} + local val if _update_buffers (; val, roots, buffer, num_nodes, num_launches) = as_array(Int32, trees; buffer) end + # TODO: Fix this type instability? num_elem = size(gcX, 2) + num_launches = num_launches isa Integer ? num_launches : num_launches[] + ## The following array is our "workspace" for ## the GPU kernel, with size equal to the number of rows ## in the input data by the number of nodes in the tree. @@ -68,14 +72,18 @@ end else copyto!(gpu_buffer, buffer) end - gdegree = @view gbuffer[1, :] - gfeature = @view gbuffer[2, :] - gop = @view gbuffer[3, :] + + #! format: off + gdegree = @view gbuffer[1, :] + gfeature = @view gbuffer[2, :] + gop = @view gbuffer[3, :] gexecution_order = @view gbuffer[4, :] - gidx_self = @view gbuffer[5, :] - gidx_l = @view gbuffer[6, :] - gidx_r = @view gbuffer[7, :] - gconstant = @view gbuffer[8, :] + gidx_self = @view gbuffer[5, :] + gidx_l = @view gbuffer[6, :] + gidx_r = @view gbuffer[7, :] + gconstant = @view gbuffer[8, :] + #! format: on + # TODO: This is a bit dangerous as we're assuming exact indices num_threads = 256 num_blocks = nextpow(2, ceil(Int, num_elem * num_nodes / num_threads)) @@ -144,9 +152,7 @@ end # ifs to generate at that time, so we can't simply use specialization. # 3. We can't use `@generated` because we can't create closures in those. for nuna in 0:10, nbin in 0:10 - @eval @stable default_mode = "disable" function create_gpu_kernel( - operators::OperatorEnum, ::Val{$nuna}, ::Val{$nbin} - ) + @eval function create_gpu_kernel(operators::OperatorEnum, ::Val{$nuna}, ::Val{$nbin}) #! format: off function ( # Storage: diff --git a/src/AsArray.jl b/src/AsArray.jl index 040766f4..6d9f19a1 100644 --- a/src/AsArray.jl +++ b/src/AsArray.jl @@ -1,109 +1,170 @@ module AsArrayModule +using Compat: Fix + using ..NodeModule: AbstractExpressionNode, tree_mapreduce, count_nodes +using ..EvaluateModule: ArrayBuffer, get_array, get_filled_array function as_array( - ::Type{I}, trees::N; buffer=nothing + ::Type{I}, trees::N; buffer::Union{ArrayBuffer,Nothing}=nothing ) where {T,N<:AbstractExpressionNode{T},I} return as_array(I, (trees,); buffer=buffer) end +Base.@kwdef struct TreeBuffer{ + T,I,A<:AbstractArray{I},B<:AbstractArray{T},C,D<:AbstractArray{I} +} + # Corresponds to the `Node` fields + degree::A + constant::A + val::B + feature::A + op::A + idx_l::A + idx_r::A + + # Indexing information + execution_order::A + idx_self::A + num_launches::Base.RefValue{I} + cursor::Base.RefValue{I} + + # Segment information + roots::C + num_nodes::I + + # Original buffer + buffer::D +end + function as_array( ::Type{I}, trees::Union{Tuple{N,Vararg{N}},AbstractVector{N}}; - buffer::Union{AbstractArray,Nothing}=nothing, + buffer::Union{AbstractArray{I},Nothing}=nothing, ) where {T,N<:AbstractExpressionNode{T},I} each_num_nodes = map(t -> count_nodes(t; break_sharing=Val(true)), trees) num_nodes = sum(each_num_nodes) - # Want `roots` to be tuple if `trees` is tuple and similar for vector + # Compute the roots array for indexing. roots = cumsum( if each_num_nodes isa Tuple - tuple(one(I), each_num_nodes[1:(end - 1)]...) + tuple(one(I), each_num_nodes[begin:(end - 1)]...) 
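# ---- [Illustrative sketch by the editor, not part of the patch] ----
# `Base.@kwdef`, used for the `TreeBuffer` struct above, generates a keyword
# constructor so the many same-named fields can be filled as `TreeBuffer(; degree, ...)`.
# A condensed, non-parametric sketch of the mechanics:
Base.@kwdef struct Segment
    lo::Int32
    hi::Int32
    stride::Int32 = 1      # fields may carry defaults
end

Segment(; lo=1, hi=8)      # Segment(1, 8, 1)
# ---- [end of editor sketch] ----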
else - vcat(one(I), each_num_nodes[1:(end - 1)]) + vcat(one(I), @view(each_num_nodes[begin:(end - 1)])) end, ) val = Array{T}(undef, num_nodes) - ## Views of the same matrix: + # If no buffer is provided, create a new ArrayBuffer from scratch buffer = @something(buffer, Array{I}(undef, 8, num_nodes)) - degree = @view buffer[1, :] - feature = @view buffer[2, :] - op = @view buffer[3, :] + + # Obtain arrays from the buffer. Each call to get_array consumes one "slot". + #! format: off + degree = @view buffer[1, :] + feature = @view buffer[2, :] + op = @view buffer[3, :] execution_order = @view buffer[4, :] - idx_self = @view buffer[5, :] - idx_l = @view buffer[6, :] - idx_r = @view buffer[7, :] - constant = @view buffer[8, :] - - cursor = Ref(zero(I)) - num_launches = Ref(zero(I)) - for (root, tree) in zip(roots, trees) - @assert root == cursor[] + 1 - tree_mapreduce( - leaf -> begin - self = (cursor[] += one(I)) - idx_self[self] = self - degree[self] = 0 - execution_order[self] = one(I) - constant[self] = leaf.constant - if leaf.constant - val[self] = leaf.val::T - else - feature[self] = leaf.feature - end - - (id=self, order=one(I)) - end, - branch -> begin - self = (cursor[] += one(I)) - idx_self[self] = self - op[self] = branch.op - degree[self] = branch.degree - - (id=self, order=one(I)) # this order is unused - end, - ((parent, children::Vararg{Any,C}) where {C}) -> begin - idx_l[parent.id] = children[1].id - if C == 2 - idx_r[parent.id] = children[2].id - end - parent_execution_order = if C == 1 - children[1].order + one(I) - else - max(children[1].order, children[2].order) + one(I) - end - execution_order[parent.id] = parent_execution_order - - # Global number of launches equal to maximum execution order - if parent_execution_order > num_launches[] - num_launches[] = parent_execution_order - end - - (id=parent.id, order=parent_execution_order) - end, - tree; - break_sharing=Val(true), - ) - end + idx_self = @view buffer[5, :] + idx_l = @view buffer[6, :] + idx_r = @view buffer[7, :] + constant = @view buffer[8, :] + #! 
format: on + + tree_buffers = TreeBuffer(; + degree=degree, + constant=constant, + val=val, + feature=feature, + op=op, + idx_l=idx_l, + idx_r=idx_r, - return (; - degree, - constant, - val, - feature, - op, - execution_order, - num_launches=num_launches[], - idx_self, - idx_l, - idx_r, - roots, - buffer, - num_nodes, + # Indexing information + execution_order=execution_order, + idx_self=idx_self, + num_launches=Ref(zero(I)), + cursor=Ref(zero(I)), + + # Segment information + roots=roots, + num_nodes=I(num_nodes), + + # Original buffer + buffer=buffer, ) + + fill_tree_buffer!(tree_buffers, trees) + + return tree_buffers +end + +function fill_tree_buffer!( + tree_buffers::TreeBuffer{T,I}, trees::Union{Tuple{N,Vararg{N}},AbstractVector{N}} +) where {T,I,N<:AbstractExpressionNode{T}} + return foreach(Fix{1}(fill_single_tree!, tree_buffers), trees) +end + +function fill_single_tree!( + tree_buffers::TreeBuffer{T,I}, tree::N +) where {T,I,N<:AbstractExpressionNode{T}} + return tree_mapreduce( + Fix{1}(fill_single_leaf!, tree_buffers), + Fix{1}(fill_single_branch!, tree_buffers), + Fix{1}(link_parent_and_children!, tree_buffers), + tree; + break_sharing=Val(true), + ) +end + +function fill_single_leaf!( + tree_buffers::TreeBuffer{T,I}, leaf::N +) where {T,I,N<:AbstractExpressionNode{T}} + self = (tree_buffers.cursor[] += one(I)) + tree_buffers.idx_self[self] = self + tree_buffers.degree[self] = 0 + tree_buffers.execution_order[self] = one(I) + tree_buffers.constant[self] = leaf.constant + if leaf.constant + tree_buffers.val[self] = leaf.val::T + else + tree_buffers.feature[self] = leaf.feature + end + + return (id=self, order=one(I)) +end + +function fill_single_branch!( + tree_buffers::TreeBuffer{T,I}, branch::N +) where {T,I,N<:AbstractExpressionNode{T}} + self = (tree_buffers.cursor[] += one(I)) + tree_buffers.idx_self[self] = self + tree_buffers.op[self] = branch.op + tree_buffers.degree[self] = branch.degree + + return (id=self, order=one(I)) +end + +function link_parent_and_children!( + tree_buffers::TreeBuffer{T,I}, parent, children::Vararg{Any,C} +) where {T,I,C} + tree_buffers.idx_l[parent.id] = children[1].id + if C == 2 + tree_buffers.idx_r[parent.id] = children[2].id + end + parent_execution_order = if C == 1 + children[1].order + one(I) + else + max(children[1].order, children[2].order) + one(I) + end + + tree_buffers.execution_order[parent.id] = parent_execution_order + + if parent_execution_order > tree_buffers.num_launches[] + tree_buffers.num_launches[] = parent_execution_order + end + + return (id=parent.id, order=parent_execution_order) end end From 74f56d46bca33f095d099988d6136f5c1eee0c78 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 15 Dec 2024 21:12:40 -0800 Subject: [PATCH 46/47] refactor: ensure `@inbounds` for gpu kernel --- ext/DynamicExpressionsCUDAExt.jl | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ext/DynamicExpressionsCUDAExt.jl b/ext/DynamicExpressionsCUDAExt.jl index 3127e260..6e0fbc1d 100644 --- a/ext/DynamicExpressionsCUDAExt.jl +++ b/ext/DynamicExpressionsCUDAExt.jl @@ -165,7 +165,6 @@ for nuna in 0:10, nbin in 0:10 # Override for unittesting: i=nothing, ) - #! format: on i = i === nothing ? (blockIdx().x - 1) * blockDim().x + threadIdx().x : i if i > num_elem * num_nodes return nothing @@ -174,6 +173,8 @@ for nuna in 0:10, nbin in 0:10 node = (i - 1) % num_nodes + 1 elem = (i - node) ÷ num_nodes + 1 + #! 
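# ---- [Illustrative sketch by the editor, not part of the patch] ----
# The `Fix{1}(fill_single_tree!, tree_buffers)` calls above are partial application:
# they pin the first argument so the remaining arity matches what `foreach` and
# `tree_mapreduce` expect, as a concrete, nameable callable rather than a fresh
# anonymous function. `Base.Fix1` is the long-standing two-argument case; `Fix{N}`
# (here via Compat on older Julia versions) generalizes the fixed position.
buf = Int[]
push_into! = Base.Fix1(push!, buf)   # behaves like x -> push!(buf, x)
foreach(push_into!, (1, 2, 3))
buf                                  # [1, 2, 3]
# ---- [end of editor sketch] ----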
format: off + @inbounds begin if execution_order[node] != launch return nothing end @@ -212,6 +213,8 @@ for nuna in 0:10, nbin in 0:10 ) end end + end + #! format: on return nothing end end From 3da1d38b1da79b9bfbfa76d675063fe868ad1b30 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Mon, 16 Dec 2024 03:41:20 -0500 Subject: [PATCH 47/47] refactor: avoid SubArray in CUDA kernel --- ext/DynamicExpressionsCUDAExt.jl | 98 +++++++++++++++----------------- src/AsArray.jl | 25 +++++--- 2 files changed, 62 insertions(+), 61 deletions(-) diff --git a/ext/DynamicExpressionsCUDAExt.jl b/ext/DynamicExpressionsCUDAExt.jl index 6e0fbc1d..83cea775 100644 --- a/ext/DynamicExpressionsCUDAExt.jl +++ b/ext/DynamicExpressionsCUDAExt.jl @@ -4,7 +4,16 @@ module DynamicExpressionsCUDAExt using CUDA: @cuda, CuArray, blockDim, blockIdx, threadIdx using DynamicExpressions: OperatorEnum, AbstractExpressionNode using DynamicExpressions.EvaluateModule: get_nbin, get_nuna -using DynamicExpressions.AsArrayModule: as_array +using DynamicExpressions.AsArrayModule: + as_array, + IDX_DEGREE, + IDX_FEATURE, + IDX_OP, + IDX_EXECUTION_ORDER, + IDX_SELF, + IDX_L, + IDX_R, + IDX_CONSTANT using DispatchDoctor: @stable import DynamicExpressions.EvaluateModule: eval_tree_array @@ -59,12 +68,11 @@ end ## in the input data by the number of nodes in the tree. ## It has one extra row to store the constant values. gworkspace = @something(gpu_workspace, similar(gcX, num_elem + 1, num_nodes)) - gval = @view gworkspace[end, :] if _update_buffers - copyto!(gval, val) + copyto!(@view(gworkspace[end, :]), val) end + val_idx = size(gworkspace, 1) - ## Index arrays (much faster to have `@view` here) gbuffer = if !_update_buffers gpu_buffer elseif gpu_buffer === nothing @@ -73,17 +81,8 @@ end copyto!(gpu_buffer, buffer) end - #! format: off - gdegree = @view gbuffer[1, :] - gfeature = @view gbuffer[2, :] - gop = @view gbuffer[3, :] - gexecution_order = @view gbuffer[4, :] - gidx_self = @view gbuffer[5, :] - gidx_l = @view gbuffer[6, :] - gidx_r = @view gbuffer[7, :] - gconstant = @view gbuffer[8, :] - #! format: on - # TODO: This is a bit dangerous as we're assuming exact indices + # Removed @view definitions of gdegree, gfeature, etc. + # We'll index directly into gbuffer using the constants above. num_threads = 256 num_blocks = nextpow(2, ceil(Int, num_elem * num_nodes / num_threads)) @@ -92,10 +91,9 @@ end _launch_gpu_kernel!( num_threads, num_blocks, num_launches, gworkspace, # Thread info: - num_elem, num_nodes, gexecution_order, - # Input data and tree - operators, gcX, gidx_self, gidx_l, gidx_r, - gdegree, gconstant, gval, gfeature, gop, + num_elem, num_nodes, + # We'll pass gbuffer directly to the kernel now: + operators, gcX, gbuffer, val_idx, ) #! format: on @@ -109,34 +107,30 @@ end @stable default_mode = "disable" function _launch_gpu_kernel!( num_threads, num_blocks, num_launches::Integer, buffer::AbstractArray{T,2}, # Thread info: - num_elem::Integer, num_nodes::Integer, execution_order::AbstractArray{I}, - # Input data and tree - operators::OperatorEnum, cX::AbstractArray{T,2}, idx_self::AbstractArray, idx_l::AbstractArray, idx_r::AbstractArray, - degree::AbstractArray, constant::AbstractArray, val::AbstractArray{T,1}, feature::AbstractArray, op::AbstractArray, -) where {I,T} + num_elem::Integer, num_nodes::Integer, + operators::OperatorEnum, cX::AbstractArray{T,2}, gbuffer::AbstractArray{Int32,2}, + val_idx::Integer +) where {T} #! 
format: on nuna = get_nuna(typeof(operators)) nbin = get_nbin(typeof(operators)) (nuna > 10 || nbin > 10) && error("Too many operators. Kernels are only compiled up to 10.") gpu_kernel! = create_gpu_kernel(operators, Val(nuna), Val(nbin)) - for launch in one(I):I(num_launches) + for launch in one(Int32):Int32(num_launches) #! format: off if buffer isa CuArray @cuda threads=num_threads blocks=num_blocks gpu_kernel!( buffer, - launch, num_elem, num_nodes, execution_order, - cX, idx_self, idx_l, idx_r, - degree, constant, val, feature, op + launch, num_elem, num_nodes, + cX, gbuffer, val_idx ) else Threads.@threads for i in 1:(num_threads * num_blocks) gpu_kernel!( buffer, - launch, num_elem, num_nodes, execution_order, - cX, idx_self, idx_l, idx_r, - degree, constant, val, feature, op, - i + launch, num_elem, num_nodes, + cX, gbuffer, val_idx, i ) end end @@ -155,17 +149,13 @@ for nuna in 0:10, nbin in 0:10 @eval function create_gpu_kernel(operators::OperatorEnum, ::Val{$nuna}, ::Val{$nbin}) #! format: off function ( - # Storage: buffer, - # Thread info: - launch::Integer, num_elem::Integer, num_nodes::Integer, execution_order::AbstractArray, - # Input data and tree - cX::AbstractArray, idx_self::AbstractArray, idx_l::AbstractArray, idx_r::AbstractArray, - degree::AbstractArray, constant::AbstractArray, val::AbstractArray, feature::AbstractArray, op::AbstractArray, - # Override for unittesting: + launch::Integer, num_elem::Integer, num_nodes::Integer, + cX::AbstractArray, gbuffer::AbstractArray{Int32,2}, + val_idx::Integer, i=nothing, ) - i = i === nothing ? (blockIdx().x - 1) * blockDim().x + threadIdx().x : i + i = @something(i, (blockIdx().x - 1) * blockDim().x + threadIdx().x) if i > num_elem * num_nodes return nothing end @@ -173,26 +163,28 @@ for nuna in 0:10, nbin in 0:10 node = (i - 1) % num_nodes + 1 elem = (i - node) ÷ num_nodes + 1 - #! 
format: off + @inbounds begin - if execution_order[node] != launch + if gbuffer[IDX_EXECUTION_ORDER, node] != launch return nothing end - cur_degree = degree[node] - cur_idx = idx_self[node] + # Use constants to index gbuffer: + cur_degree = gbuffer[IDX_DEGREE, node] + cur_idx = gbuffer[IDX_SELF, node] + if cur_degree == 0 - if constant[node] == 1 - cur_val = val[node] + if gbuffer[IDX_CONSTANT, node] == 1 + cur_val = buffer[val_idx, node] buffer[elem, cur_idx] = cur_val else - cur_feature = feature[node] + cur_feature = gbuffer[IDX_FEATURE, node] buffer[elem, cur_idx] = cX[cur_feature, elem] end else if cur_degree == 1 && $nuna > 0 - cur_op = op[node] - l_idx = idx_l[node] + cur_op = gbuffer[IDX_OP, node] + l_idx = gbuffer[IDX_L, node] Base.Cartesian.@nif( $nuna, i -> i == cur_op, @@ -200,10 +192,10 @@ for nuna in 0:10, nbin in 0:10 buffer[elem, cur_idx] = op(buffer[elem, l_idx]) end ) - elseif $nbin > 0 # Note this check is to avoid type inference issues when binops is empty - cur_op = op[node] - l_idx = idx_l[node] - r_idx = idx_r[node] + elseif $nbin > 0 + cur_op = gbuffer[IDX_OP, node] + l_idx = gbuffer[IDX_L, node] + r_idx = gbuffer[IDX_R, node] Base.Cartesian.@nif( $nbin, i -> i == cur_op, diff --git a/src/AsArray.jl b/src/AsArray.jl index 6d9f19a1..c63f79da 100644 --- a/src/AsArray.jl +++ b/src/AsArray.jl @@ -37,6 +37,15 @@ Base.@kwdef struct TreeBuffer{ buffer::D end +const IDX_DEGREE = 1 +const IDX_FEATURE = 2 +const IDX_OP = 3 +const IDX_EXECUTION_ORDER = 4 +const IDX_SELF = 5 +const IDX_L = 6 +const IDX_R = 7 +const IDX_CONSTANT = 8 + function as_array( ::Type{I}, trees::Union{Tuple{N,Vararg{N}},AbstractVector{N}}; @@ -61,14 +70,14 @@ function as_array( # Obtain arrays from the buffer. Each call to get_array consumes one "slot". #! format: off - degree = @view buffer[1, :] - feature = @view buffer[2, :] - op = @view buffer[3, :] - execution_order = @view buffer[4, :] - idx_self = @view buffer[5, :] - idx_l = @view buffer[6, :] - idx_r = @view buffer[7, :] - constant = @view buffer[8, :] + degree = @view buffer[IDX_DEGREE, :] + feature = @view buffer[IDX_FEATURE, :] + op = @view buffer[IDX_OP, :] + execution_order = @view buffer[IDX_EXECUTION_ORDER, :] + idx_self = @view buffer[IDX_SELF, :] + idx_l = @view buffer[IDX_L, :] + idx_r = @view buffer[IDX_R, :] + constant = @view buffer[IDX_CONSTANT, :] #! format: on tree_buffers = TreeBuffer(;
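# ---- [Illustrative sketch by the editor, not part of the patch] ----
# The IDX_* constants introduced in PATCH 47 replace eight `@view` SubArrays with
# direct row indexing into a single Int32 matrix, which the CUDA kernel handles more
# gracefully (a plain device array plus integer constants, no SubArray wrappers).
# A condensed sketch of the layout, with hypothetical node values:
const IDX_DEGREE, IDX_FEATURE = 1, 2    # ...and so on for the other six rows
num_nodes = 3
gbuffer = zeros(Int32, 8, num_nodes)
gbuffer[IDX_DEGREE, 1] = 0              # node 1 is a leaf...
gbuffer[IDX_FEATURE, 1] = 2             # ...that reads feature x2
gbuffer[IDX_DEGREE, 1] == 0             # true: same fact the old `degree` view exposed
# ---- [end of editor sketch] ----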