From 9b8872c5c8baf1f59fae8fa0ede5cee93ec02d93 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 14 Jan 2024 07:51:56 +0000 Subject: [PATCH 01/47] Initialize CUDA extension --- Project.toml | 6 ++- ext/DynamicExpressionsCUDAExt.jl | 70 ++++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+), 1 deletion(-) create mode 100644 ext/DynamicExpressionsCUDAExt.jl diff --git a/Project.toml b/Project.toml index 1c3b9b98..3158f7b1 100644 --- a/Project.toml +++ b/Project.toml @@ -16,12 +16,14 @@ TOML = "fa267f1f-6049-4f14-aa54-33bafae1ed76" [weakdeps] Bumper = "8ce10254-0962-460f-a3d8-1f77fea1446e" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890" SymbolicUtils = "d1185830-fcd6-423d-90d6-eec64667417b" Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" [extensions] DynamicExpressionsBumperExt = "Bumper" +DynamicExpressionsCUDAExt = "CUDA" DynamicExpressionsLoopVectorizationExt = "LoopVectorization" DynamicExpressionsSymbolicUtilsExt = "SymbolicUtils" DynamicExpressionsZygoteExt = "Zygote" @@ -30,6 +32,7 @@ DynamicExpressionsZygoteExt = "Zygote" Aqua = "0.7" Bumper = "0.6" Compat = "3.37, 4" +CUDA = "5" Enzyme = "^0.11.12" LoopVectorization = "0.12" MacroTools = "0.4, 0.5" @@ -43,6 +46,7 @@ julia = "1.6" [extras] Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" Bumper = "8ce10254-0962-460f-a3d8-1f77fea1446e" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9" ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890" @@ -54,4 +58,4 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" [targets] -test = ["Test", "SafeTestsets", "Aqua", "Bumper", "Enzyme", "ForwardDiff", "LoopVectorization", "SpecialFunctions", "StaticArrays", "SymbolicUtils", "Zygote"] +test = ["Test", "SafeTestsets", "Aqua", "Bumper", "CUDA", "Enzyme", "ForwardDiff", "LoopVectorization", "SpecialFunctions", "StaticArrays", "SymbolicUtils", "Zygote"] diff --git a/ext/DynamicExpressionsCUDAExt.jl b/ext/DynamicExpressionsCUDAExt.jl new file mode 100644 index 00000000..80b0203d --- /dev/null +++ b/ext/DynamicExpressionsCUDAExt.jl @@ -0,0 +1,70 @@ +module DynamicExpressionsCUDAExt + +using CUDA: CuArray +using DynamicExpressions: OperatorEnum, AbstractExpressionNode, tree_mapreduce +using DynamicExpressions.UtilsModule: counttuple + +import DynamicExpressions.EvaluateEquationModule: eval_tree_array + +function eval_tree_array( + tree::AbstractExpressionNode{T}, cX::CuArray{T,2}, operators::OperatorEnum; _... 
+) where {T<:Number} + result = tree_mapreduce( + # Leaf nodes, we create an allocation and fill + # it with the value of the leaf: + leaf -> begin + if leaf.constant + v = leaf.val::T + ar = similar(cX, axes(cX, 2)) + ar .= v + return ar + else + return cX[leaf.feature, :] + end + end, + # Branch nodes, we simply pass them to the evaluation kernel: + branch -> branch, + # In the evaluation kernel, we combine the branch nodes + # with the arrays created by the leaf nodes: + ((branch, cumulators::Vararg{Any,M}) where {M}) -> begin + if M == 1 + return dispatch_kern1!(operators.unaops, branch.op, cumulators[1]) + else + return dispatch_kern2!( + operators.binops, branch.op, cumulators[1], cumulators[2] + ) + end + end, + tree; + break_sharing=Val(true), + ) + return (result.x, isfinite(sum(x .* zero(T)))) +end +@generated function dispatch_kern1!(unaops, op_idx, cumulator) + nuna = counttuple(unaops) + quote + Base.@nif($nuna, i -> i == op_idx, i -> let op = unaops[i] + return kern1!(op, cumulator) + end,) + end +end +@generated function dispatch_kern2!(binops, op_idx, cumulator1, cumulator2) + nbin = counttuple(binops) + quote + Base.@nif( + $nbin, i -> i == op_idx, i -> let op = binops[i] + return kern2!(op, cumulator1, cumulator2) + end, + ) + end +end +function kern1!(op::F, cumulator) where {F} + @. cumulator = op(cumulator) + return cumulator +end +function kern2!(op::F, cumulator1, cumulator2) where {F} + @. cumulator1 = op(cumulator1, cumulator2) + return cumulator1 +end + +end From 666708e8245be4ee329426a3ebe4fc1ba5d89751 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 14 Jan 2024 07:57:25 +0000 Subject: [PATCH 02/47] Fix output --- ext/DynamicExpressionsCUDAExt.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ext/DynamicExpressionsCUDAExt.jl b/ext/DynamicExpressionsCUDAExt.jl index 80b0203d..0989f891 100644 --- a/ext/DynamicExpressionsCUDAExt.jl +++ b/ext/DynamicExpressionsCUDAExt.jl @@ -38,7 +38,7 @@ function eval_tree_array( tree; break_sharing=Val(true), ) - return (result.x, isfinite(sum(x .* zero(T)))) + return (result, isfinite(sum(result .* zero(T)))) end @generated function dispatch_kern1!(unaops, op_idx, cumulator) nuna = counttuple(unaops) From 4484d180f08a5324e8f91d8c7f7db5ab0e9d653a Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 14 Jan 2024 20:57:17 +0000 Subject: [PATCH 03/47] Switch to GPUArrays rather than CUDA --- Project.toml | 10 +++++----- ext/DynamicExpressionsCUDAExt.jl | 9 ++++++--- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/Project.toml b/Project.toml index 3158f7b1..5b9c9e24 100644 --- a/Project.toml +++ b/Project.toml @@ -16,14 +16,14 @@ TOML = "fa267f1f-6049-4f14-aa54-33bafae1ed76" [weakdeps] Bumper = "8ce10254-0962-460f-a3d8-1f77fea1446e" -CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890" SymbolicUtils = "d1185830-fcd6-423d-90d6-eec64667417b" Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" [extensions] DynamicExpressionsBumperExt = "Bumper" -DynamicExpressionsCUDAExt = "CUDA" +DynamicExpressionsGPUArraysExt = "GPUArrays" DynamicExpressionsLoopVectorizationExt = "LoopVectorization" DynamicExpressionsSymbolicUtilsExt = "SymbolicUtils" DynamicExpressionsZygoteExt = "Zygote" @@ -32,8 +32,8 @@ DynamicExpressionsZygoteExt = "Zygote" Aqua = "0.7" Bumper = "0.6" Compat = "3.37, 4" -CUDA = "5" Enzyme = "^0.11.12" +GPUArrays = "8, 9, 10" LoopVectorization = "0.12" MacroTools = "0.4, 0.5" 
PackageExtensionCompat = "1" @@ -46,7 +46,7 @@ julia = "1.6" [extras] Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" Bumper = "8ce10254-0962-460f-a3d8-1f77fea1446e" -CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9" ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890" @@ -58,4 +58,4 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" [targets] -test = ["Test", "SafeTestsets", "Aqua", "Bumper", "CUDA", "Enzyme", "ForwardDiff", "LoopVectorization", "SpecialFunctions", "StaticArrays", "SymbolicUtils", "Zygote"] +test = ["Test", "SafeTestsets", "Aqua", "Bumper", "Enzyme", "ForwardDiff", "GPUArrays", "LoopVectorization", "SpecialFunctions", "StaticArrays", "SymbolicUtils", "Zygote"] diff --git a/ext/DynamicExpressionsCUDAExt.jl b/ext/DynamicExpressionsCUDAExt.jl index 0989f891..52d3fb93 100644 --- a/ext/DynamicExpressionsCUDAExt.jl +++ b/ext/DynamicExpressionsCUDAExt.jl @@ -1,13 +1,16 @@ -module DynamicExpressionsCUDAExt +module DynamicExpressionsGPUArraysExt -using CUDA: CuArray +using GPUArrays: AbstractDeviceArray using DynamicExpressions: OperatorEnum, AbstractExpressionNode, tree_mapreduce using DynamicExpressions.UtilsModule: counttuple import DynamicExpressions.EvaluateEquationModule: eval_tree_array function eval_tree_array( - tree::AbstractExpressionNode{T}, cX::CuArray{T,2}, operators::OperatorEnum; _... + tree::AbstractExpressionNode{T}, + cX::AbstractDeviceArray{T,2}, + operators::OperatorEnum; + _..., ) where {T<:Number} result = tree_mapreduce( # Leaf nodes, we create an allocation and fill From 644c3b886aba2f01c2bd5644efb11ce9dbf85c7d Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 14 Jan 2024 21:31:00 +0000 Subject: [PATCH 04/47] Switch to GPUArrays --- Project.toml | 4 ++-- ...xpressionsCUDAExt.jl => DynamicExpressionsGPUArraysExt.jl} | 0 2 files changed, 2 insertions(+), 2 deletions(-) rename ext/{DynamicExpressionsCUDAExt.jl => DynamicExpressionsGPUArraysExt.jl} (100%) diff --git a/Project.toml b/Project.toml index 5b9c9e24..a6fac437 100644 --- a/Project.toml +++ b/Project.toml @@ -46,7 +46,7 @@ julia = "1.6" [extras] Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" Bumper = "8ce10254-0962-460f-a3d8-1f77fea1446e" -GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9" ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890" @@ -58,4 +58,4 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" [targets] -test = ["Test", "SafeTestsets", "Aqua", "Bumper", "Enzyme", "ForwardDiff", "GPUArrays", "LoopVectorization", "SpecialFunctions", "StaticArrays", "SymbolicUtils", "Zygote"] +test = ["Test", "SafeTestsets", "Aqua", "Bumper", "CUDA", "Enzyme", "ForwardDiff", "LoopVectorization", "SpecialFunctions", "StaticArrays", "SymbolicUtils", "Zygote"] diff --git a/ext/DynamicExpressionsCUDAExt.jl b/ext/DynamicExpressionsGPUArraysExt.jl similarity index 100% rename from ext/DynamicExpressionsCUDAExt.jl rename to ext/DynamicExpressionsGPUArraysExt.jl From 0bfd8ee2025a0bb7eb7ab1d336eea02de4edf2f6 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Mon, 15 Jan 2024 02:23:41 +0000 Subject: [PATCH 05/47] Add file for converting to arrays --- src/AsArray.jl | 64 +++++++++++++++++++++++++++++++++++++++ 
src/DynamicExpressions.jl | 2 ++ 2 files changed, 66 insertions(+) create mode 100644 src/AsArray.jl diff --git a/src/AsArray.jl b/src/AsArray.jl new file mode 100644 index 00000000..f5f4a808 --- /dev/null +++ b/src/AsArray.jl @@ -0,0 +1,64 @@ +module AsArrayModule + +using .DynamicExpressions.EquationModule: AbstractExpressionNode, tree_mapreduce + +function as_array( + tree::N, index_type::Type{I}=Int64 +) where {T,N<:AbstractExpressionNode{T},I} + num_nodes = length(tree) + degree = Array{fieldtype(N, :degree)}(undef, num_nodes) + constant = Array{fieldtype(N, :constant)}(undef, num_nodes) + val = Array{T}(undef, num_nodes) + feature = Array{fieldtype(N, :feature)}(undef, num_nodes) + op = Array{fieldtype(N, :op)}(undef, num_nodes) + execution_order = Array{I}(undef, num_nodes) + idx_self = Array{I}(undef, num_nodes) + idx_l = Array{I}(undef, num_nodes) + idx_r = Array{I}(undef, num_nodes) + + cursor = Ref(zero(I)) + tree_mapreduce( + leaf -> begin + self = (cursor[] += one(I)) + idx_self[self] = self + degree[self] = 0 + execution_order[self] = one(I) + constant[self] = leaf.constant + if leaf.constant + val[self] = leaf.val::T + else + feature[self] = leaf.feature + end + + (id=self, order=one(I)) + end, + branch -> begin + self = (cursor[] += one(I)) + idx_self[self] = self + op[self] = branch.op + degree[self] = branch.degree + + (id=self, order=one(I)) # this order is unused + end, + ((parent, children::Vararg{Any,M}) where {M}) -> begin + idx_l[parent.id] = children[1].id + if M == 2 + idx_r[parent.id] = children[2].id + end + parent_execution_order = if M == 1 + children[1].order + one(I) + else + max(children[1].order, children[2].order) + one(I) + end + execution_order[parent.id] = parent_execution_order + + (id=parent.id, order=parent_execution_order) + end, + tree; + break_sharing=Val(true), + ) + + return (; degree, constant, val, feature, op, execution_order, idx_self, idx_l, idx_r) +end + +end diff --git a/src/DynamicExpressions.jl b/src/DynamicExpressions.jl index 09fee767..b664819f 100644 --- a/src/DynamicExpressions.jl +++ b/src/DynamicExpressions.jl @@ -11,6 +11,7 @@ include("EvaluationHelpers.jl") include("SimplifyEquation.jl") include("OperatorEnumConstruction.jl") include("Random.jl") +include("AsArray.jl") import PackageExtensionCompat: @require_extensions import Reexport: @reexport @@ -47,6 +48,7 @@ import .EquationModule: constructorof, preserve_sharing @reexport import .EvaluationHelpersModule @reexport import .ExtensionInterfaceModule: node_to_symbolic, symbolic_to_node @reexport import .RandomModule: NodeSampler +@reexport import .AsArrayModule: as_array function __init__() @require_extensions From 5c908964abddad666f9af830907038dcae86d911 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Mon, 15 Jan 2024 03:05:52 +0000 Subject: [PATCH 06/47] Add GPU kernel for evaluation --- Project.toml | 3 + ext/DynamicExpressionsCUDAExt.jl | 128 +++++++++++++++++++++++++++++++ src/AsArray.jl | 2 +- 3 files changed, 132 insertions(+), 1 deletion(-) create mode 100644 ext/DynamicExpressionsCUDAExt.jl diff --git a/Project.toml b/Project.toml index a6fac437..983e5ee0 100644 --- a/Project.toml +++ b/Project.toml @@ -16,6 +16,7 @@ TOML = "fa267f1f-6049-4f14-aa54-33bafae1ed76" [weakdeps] Bumper = "8ce10254-0962-460f-a3d8-1f77fea1446e" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890" SymbolicUtils = "d1185830-fcd6-423d-90d6-eec64667417b" @@ -23,6 +24,7 @@ Zygote = 
"e88e6eb3-aa80-5325-afca-941959d7151f" [extensions] DynamicExpressionsBumperExt = "Bumper" +DynamicExpressionsCUDAExt = "CUDA" DynamicExpressionsGPUArraysExt = "GPUArrays" DynamicExpressionsLoopVectorizationExt = "LoopVectorization" DynamicExpressionsSymbolicUtilsExt = "SymbolicUtils" @@ -32,6 +34,7 @@ DynamicExpressionsZygoteExt = "Zygote" Aqua = "0.7" Bumper = "0.6" Compat = "3.37, 4" +CUDA = "5" Enzyme = "^0.11.12" GPUArrays = "8, 9, 10" LoopVectorization = "0.12" diff --git a/ext/DynamicExpressionsCUDAExt.jl b/ext/DynamicExpressionsCUDAExt.jl new file mode 100644 index 00000000..284259cb --- /dev/null +++ b/ext/DynamicExpressionsCUDAExt.jl @@ -0,0 +1,128 @@ +module DynamicExpressionsCUDAExt +#! format: off + +using CUDA: threadIdx, blockIdx, @cuda, CuArray, cu +using DynamicExpressions: OperatorEnum, AbstractExpressionNode +using DynamicExpressions.EvaluateEquationModule: get_nbin, get_nuna +using DynamicExpressions.AsArrayModule: as_array + + +function eval_tree_gpu( + tree::AbstractExpressionNode{T}, gcX::CuArray{T,2}, operators::OperatorEnum +) + (; degree, constant, val, feature, op, execution_order, idx_self, idx_l, idx_r) = as_array(tree) + num_launches = maximum(execution_order) + num_elem = size(gcX, 2) + num_nodes = length(degree) + + # Convert everything to GPU: + gbuffer = CuArray{T}(undef, num_elem, num_nodes) + gexecution_order = cu(execution_order) + gdegree = cu(degree) + gconstant = cu(constant) + gval = cu(val) + gfeature = cu(feature) + gop = cu(op) + gidx_self = cu(idx_self) + gidx_l = cu(idx_l) + gidx_r = cu(idx_r) + + _launch_gpu_kernel!( + Val(nextpow(2, num_elem * num_nodes)), num_launches, gbuffer, + # Thread info: + num_elem, num_nodes, gexecution_order, + # Input data and tree + operators, gcX, gidx_self, gidx_l, gidx_r, + gdegree, gconstant, gval, gfeature, gop, + ) + + out = gbuffer[:, 1] + return (out, isfinite(sum(out .* zero(T)))) +end + +function _launch_gpu_kernel!( + ::Val{num_threads}, num_launches::Integer, buffer::CuArray{T,2}, + # Thread info: + num_elem::Integer, num_nodes::Integer, execution_order::CuArray{I}, + # Input data and tree + operators::OperatorEnum, cX::CuArray{T,2}, idx_self::CuArray, idx_l::CuArray, idx_r::CuArray, + degree::CuArray, constant::CuArray, val::CuArray{T,1}, feature::CuArray, op::CuArray, +) where {num_threads,I,T} + for launch in I(1):I(num_launches) + execute = execution_order .== launch + @cuda( + threads = num_threads, + _gpu_kernel!( + buffer, + num_elem, num_nodes, execute, + operators, cX, idx_self, idx_l, idx_r, + degree, constant, val, feature, op + ) + ) + end +end + +@generated function _gpu_kernel!( + # Storage: + buffer::CuArray{T,2}, + # Thread info: + num_elem::Integer, num_nodes::Integer, execute::CuArray{Bool}, + # Input data and tree + operators::OperatorEnum, cX::CuArray{T,2}, idx_self::CuArray, idx_l::CuArray, idx_r::CuArray, + degree::CuArray, constant::CuArray, val::CuArray{T,1}, feature::CuArray, op::CuArray, +) where {T} + nbin = get_nbin(operators) + nuna = get_nuna(operators) + quote + i = threadIdx().x + if i > num_elem * num_nodes + return nothing + end + + node = (i - 1) % num_nodes + 1 + elem = (i - node) ÷ num_nodes + 1 + + if !execute[node] + return nothing + end + + cur_degree = degree[node] + cur_idx = idx_self[node] + if cur_degree == 0 + if constant[node] + cur_val = val[node] + buffer[elem, cur_idx] = cur_val + else + cur_feature = feature[node] + buffer[elem, cur_idx] = cX[cur_feature, elem] + end + else + if cur_degree == 1 + cur_op = op[node] + l_idx = idx_l[node] + 
Base.Cartesian.@nif( + $nuna, + i -> i == cur_op, + i -> let op = operators.unaops[i] + buffer[elem, cur_idx] = op(buffer[elem, l_idx]) + end + ) + else + cur_op = op[node] + l_idx = idx_l[node] + r_idx = idx_r[node] + Base.Cartesian.@nif( + $nbin, + i -> i == cur_op, + i -> let op = operators.binops[i] + buffer[elem, cur_idx] = op(buffer[elem, l_idx], buffer[elem, r_idx]) + end + ) + end + end + return nothing + end +end + +#! format: on +end diff --git a/src/AsArray.jl b/src/AsArray.jl index f5f4a808..aecc4ac5 100644 --- a/src/AsArray.jl +++ b/src/AsArray.jl @@ -1,6 +1,6 @@ module AsArrayModule -using .DynamicExpressions.EquationModule: AbstractExpressionNode, tree_mapreduce +using ..EquationModule: AbstractExpressionNode, tree_mapreduce function as_array( tree::N, index_type::Type{I}=Int64 From 57fff36a166acc5c31b184bf90bc5ae9baa1376d Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Mon, 15 Jan 2024 00:04:47 -0500 Subject: [PATCH 07/47] Allow submitting multiple trees at once --- src/AsArray.jl | 89 ++++++++++++++++++++++------------------- src/EvaluateEquation.jl | 15 +++++-- 2 files changed, 60 insertions(+), 44 deletions(-) diff --git a/src/AsArray.jl b/src/AsArray.jl index aecc4ac5..b84a5aad 100644 --- a/src/AsArray.jl +++ b/src/AsArray.jl @@ -3,9 +3,11 @@ module AsArrayModule using ..EquationModule: AbstractExpressionNode, tree_mapreduce function as_array( - tree::N, index_type::Type{I}=Int64 -) where {T,N<:AbstractExpressionNode{T},I} - num_nodes = length(tree) + trees::Vararg{N,M}; index_type::Type{I}=Int64 +) where {T,N<:AbstractExpressionNode{T},I,M} + each_num_nodes = length.(trees) + num_nodes = sum(each_num_nodes) + degree = Array{fieldtype(N, :degree)}(undef, num_nodes) constant = Array{fieldtype(N, :constant)}(undef, num_nodes) val = Array{T}(undef, num_nodes) @@ -16,49 +18,54 @@ function as_array( idx_l = Array{I}(undef, num_nodes) idx_r = Array{I}(undef, num_nodes) + roots = Array{I}(undef, length(trees)) + cursor = Ref(zero(I)) - tree_mapreduce( - leaf -> begin - self = (cursor[] += one(I)) - idx_self[self] = self - degree[self] = 0 - execution_order[self] = one(I) - constant[self] = leaf.constant - if leaf.constant - val[self] = leaf.val::T - else - feature[self] = leaf.feature - end + for (i_tree, tree) in enumerate(trees) + roots[i_tree] = cursor[] + one(I) + tree_mapreduce( + leaf -> begin + self = (cursor[] += one(I)) + idx_self[self] = self + degree[self] = 0 + execution_order[self] = one(I) + constant[self] = leaf.constant + if leaf.constant + val[self] = leaf.val::T + else + feature[self] = leaf.feature + end - (id=self, order=one(I)) - end, - branch -> begin - self = (cursor[] += one(I)) - idx_self[self] = self - op[self] = branch.op - degree[self] = branch.degree + (id=self, order=one(I)) + end, + branch -> begin + self = (cursor[] += one(I)) + idx_self[self] = self + op[self] = branch.op + degree[self] = branch.degree - (id=self, order=one(I)) # this order is unused - end, - ((parent, children::Vararg{Any,M}) where {M}) -> begin - idx_l[parent.id] = children[1].id - if M == 2 - idx_r[parent.id] = children[2].id - end - parent_execution_order = if M == 1 - children[1].order + one(I) - else - max(children[1].order, children[2].order) + one(I) - end - execution_order[parent.id] = parent_execution_order + (id=self, order=one(I)) # this order is unused + end, + ((parent, children::Vararg{Any,C}) where {C}) -> begin + idx_l[parent.id] = children[1].id + if C == 2 + idx_r[parent.id] = children[2].id + end + parent_execution_order = if C == 1 + children[1].order + 
one(I) + else + max(children[1].order, children[2].order) + one(I) + end + execution_order[parent.id] = parent_execution_order - (id=parent.id, order=parent_execution_order) - end, - tree; - break_sharing=Val(true), - ) + (id=parent.id, order=parent_execution_order) + end, + tree; + break_sharing=Val(true), + ) + end - return (; degree, constant, val, feature, op, execution_order, idx_self, idx_l, idx_r) + return (; degree, constant, val, feature, op, execution_order, idx_self, idx_l, idx_r, roots) end end diff --git a/src/EvaluateEquation.jl b/src/EvaluateEquation.jl index 1848248a..e5a4cff8 100644 --- a/src/EvaluateEquation.jl +++ b/src/EvaluateEquation.jl @@ -1,6 +1,6 @@ module EvaluateEquationModule -import ..EquationModule: AbstractExpressionNode, constructorof, string_tree +import ..EquationModule: AbstractExpressionNode, with_type_parameters, string_tree import ..OperatorEnumModule: OperatorEnum, GenericOperatorEnum import ..UtilsModule: is_bad_array, fill_similar, counttuple, ResultOk import ..EquationUtilsModule: is_constant @@ -91,10 +91,19 @@ function eval_tree_array( ) where {T1<:Number,T2<:Number} T = promote_type(T1, T2) @warn "Warning: eval_tree_array received mixed types: tree=$(T1) and data=$(T2)." - tree = convert(constructorof(typeof(tree)){T}, tree) - cX = Base.Fix1(convert, T).(cX) + tree = convert(with_type_parameters(typeof(tree), T), tree) + cX = T.(cX) return eval_tree_array(tree, cX, operators; turbo, bumper) end +function eval_tree_array( + trees::NTuple{M,N}, + cX::AbstractMatrix{T}, + operators::OperatorEnum; + kws... +) where {T<:Number,N<:AbstractExpressionNode{T},M} + outs = ntuple(i -> eval_tree_array(trees[i], cX, operators; kws...)[1], Val(M)) + return ntuple(i -> first(outs[i]), Val(M)), ntuple(i -> last(outs[i]), Val(M)) +end get_nuna(::Type{<:OperatorEnum{B,U}}) where {B,U} = counttuple(U) get_nbin(::Type{<:OperatorEnum{B}}) where {B} = counttuple(B) From 81529af3c11f8368f9bdedbb8cd63b14d3edf4d5 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Mon, 15 Jan 2024 00:05:31 -0500 Subject: [PATCH 08/47] Working GPU kernel for expression evaluation! --- ext/DynamicExpressionsCUDAExt.jl | 166 ++++++++++++++++++------------- 1 file changed, 95 insertions(+), 71 deletions(-) diff --git a/ext/DynamicExpressionsCUDAExt.jl b/ext/DynamicExpressionsCUDAExt.jl index 284259cb..b798b5fc 100644 --- a/ext/DynamicExpressionsCUDAExt.jl +++ b/ext/DynamicExpressionsCUDAExt.jl @@ -1,16 +1,24 @@ module DynamicExpressionsCUDAExt #! format: off -using CUDA: threadIdx, blockIdx, @cuda, CuArray, cu +using CUDA using DynamicExpressions: OperatorEnum, AbstractExpressionNode using DynamicExpressions.EvaluateEquationModule: get_nbin, get_nuna using DynamicExpressions.AsArrayModule: as_array +import DynamicExpressions.EvaluateEquationModule: eval_tree_array -function eval_tree_gpu( - tree::AbstractExpressionNode{T}, gcX::CuArray{T,2}, operators::OperatorEnum -) - (; degree, constant, val, feature, op, execution_order, idx_self, idx_l, idx_r) = as_array(tree) +function eval_tree_array( + tree::AbstractExpressionNode{T}, gcX::CuArray{T,2}, operators::OperatorEnum; _... +) where {T<:Number} + (outs, is_good) = eval_tree_array((tree,), gcX, operators) + return only(outs), only(is_good) +end + +function eval_tree_array( + trees::NTuple{M,N}, gcX::CuArray{T,2}, operators::OperatorEnum; _... 
+) where {T<:Number,N<:AbstractExpressionNode{T},M} + (; degree, constant, val, feature, op, execution_order, idx_self, idx_l, idx_r, roots) = as_array(trees...; index_type=Int32) num_launches = maximum(execution_order) num_elem = size(gcX, 2) num_nodes = length(degree) @@ -20,15 +28,20 @@ function eval_tree_gpu( gexecution_order = cu(execution_order) gdegree = cu(degree) gconstant = cu(constant) - gval = cu(val) + gval = CuArray(val) gfeature = cu(feature) gop = cu(op) gidx_self = cu(idx_self) gidx_l = cu(idx_l) gidx_r = cu(idx_r) + groots = cu(roots) # Just for indexing output + + num_threads = 256 + num_blocks = nextpow(2, ceil(Int, num_elem * num_nodes / num_threads)) + _launch_gpu_kernel!( - Val(nextpow(2, num_elem * num_nodes)), num_launches, gbuffer, + num_blocks, num_launches, gbuffer, # Thread info: num_elem, num_nodes, gexecution_order, # Input data and tree @@ -36,91 +49,102 @@ function eval_tree_gpu( gdegree, gconstant, gval, gfeature, gop, ) - out = gbuffer[:, 1] - return (out, isfinite(sum(out .* zero(T)))) + out = ntuple(i -> gbuffer[:, groots[i]], Val(M)) + is_good = ntuple(i -> isfinite(sum(out[i] .* zero(T))), Val(M)) + return out, is_good end function _launch_gpu_kernel!( - ::Val{num_threads}, num_launches::Integer, buffer::CuArray{T,2}, + num_blocks, num_launches::Integer, buffer::CuArray{T,2}, # Thread info: num_elem::Integer, num_nodes::Integer, execution_order::CuArray{I}, # Input data and tree operators::OperatorEnum, cX::CuArray{T,2}, idx_self::CuArray, idx_l::CuArray, idx_r::CuArray, degree::CuArray, constant::CuArray, val::CuArray{T,1}, feature::CuArray, op::CuArray, -) where {num_threads,I,T} - for launch in I(1):I(num_launches) - execute = execution_order .== launch - @cuda( - threads = num_threads, - _gpu_kernel!( +) where {I,T} + nuna = get_nuna(typeof(operators)) + nbin = get_nbin(typeof(operators)) + (nuna > 10 || nbin > 10) && error("Too many operators. Kernels are only compiled up to 10.") + gpu_kernel! = create_gpu_kernel(operators, Val(nuna), Val(nbin)) + for launch in I.(1:num_launches) + CUDA.@sync begin + execute = execution_order .== launch + @cuda threads=256 blocks=num_blocks gpu_kernel!( buffer, num_elem, num_nodes, execute, - operators, cX, idx_self, idx_l, idx_r, + cX, idx_self, idx_l, idx_r, degree, constant, val, feature, op ) - ) + end end + return nothing end -@generated function _gpu_kernel!( - # Storage: - buffer::CuArray{T,2}, - # Thread info: - num_elem::Integer, num_nodes::Integer, execute::CuArray{Bool}, - # Input data and tree - operators::OperatorEnum, cX::CuArray{T,2}, idx_self::CuArray, idx_l::CuArray, idx_r::CuArray, - degree::CuArray, constant::CuArray, val::CuArray{T,1}, feature::CuArray, op::CuArray, -) where {T} - nbin = get_nbin(operators) - nuna = get_nuna(operators) - quote - i = threadIdx().x - if i > num_elem * num_nodes - return nothing - end - - node = (i - 1) % num_nodes + 1 - elem = (i - node) ÷ num_nodes + 1 +# Need to pre-compute the GPU kernels with an `@eval` for each number of operators +# 1. We need to use an `@nif` over operators, as GPU kernels +# can't index into arrays of operators. +# 2. `@nif` is evaluated at parse time and needs to know the number of +# ifs to generate at that time, so we can't simply use specialization. +# 3. We can't use `@generated` because we can't create closures in those. 
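+# (Editor's illustration, not part of the original commit; `ops` here is a
+# stand-in for `operators.unaops`/`operators.binops`.) With a literal `3`,
+# `Base.Cartesian.@nif(3, i -> i == cur_op, i -> ops[i](x))` expands to roughly
+# `cur_op == 1 ? ops[1](x) : cur_op == 2 ? ops[2](x) : ops[3](x)`,
+# so every tuple access uses a compile-time constant index and each operator's
+# concrete type is known inside the branch. That keeps the kernel type-stable
+# on the GPU, at the cost of generating one kernel body per operator count below.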
+for nuna in 0:10, nbin in 0:10 + @eval function create_gpu_kernel(operators::OperatorEnum, ::Val{$nuna}, ::Val{$nbin}) + function ( + # Storage: + buffer, + # Thread info: + num_elem::Integer, num_nodes::Integer, execute::AbstractArray, + # Input data and tree + cX::AbstractArray, idx_self::AbstractArray, idx_l::AbstractArray, idx_r::AbstractArray, + degree::AbstractArray, constant::AbstractArray, val::AbstractArray, feature::AbstractArray, op::AbstractArray, + ) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + if i > num_elem * num_nodes + return nothing + end - if !execute[node] - return nothing - end + node = (i - 1) % num_nodes + 1 + elem = (i - node) ÷ num_nodes + 1 - cur_degree = degree[node] - cur_idx = idx_self[node] - if cur_degree == 0 - if constant[node] - cur_val = val[node] - buffer[elem, cur_idx] = cur_val - else - cur_feature = feature[node] - buffer[elem, cur_idx] = cX[cur_feature, elem] + if !execute[node] + return nothing end - else - if cur_degree == 1 - cur_op = op[node] - l_idx = idx_l[node] - Base.Cartesian.@nif( - $nuna, - i -> i == cur_op, - i -> let op = operators.unaops[i] - buffer[elem, cur_idx] = op(buffer[elem, l_idx]) - end - ) + + cur_degree = degree[node] + cur_idx = idx_self[node] + if cur_degree == 0 + if constant[node] + cur_val = val[node] + buffer[elem, cur_idx] = cur_val + else + cur_feature = feature[node] + buffer[elem, cur_idx] = cX[cur_feature, elem] + end else - cur_op = op[node] - l_idx = idx_l[node] - r_idx = idx_r[node] - Base.Cartesian.@nif( - $nbin, - i -> i == cur_op, - i -> let op = operators.binops[i] - buffer[elem, cur_idx] = op(buffer[elem, l_idx], buffer[elem, r_idx]) - end - ) + if cur_degree == 1 + cur_op = op[node] + l_idx = idx_l[node] + Base.Cartesian.@nif( + $nuna, + i -> i == cur_op, + i -> let op = operators.unaops[i] + buffer[elem, cur_idx] = op(buffer[elem, l_idx]) + end + ) + else + cur_op = op[node] + l_idx = idx_l[node] + r_idx = idx_r[node] + Base.Cartesian.@nif( + $nbin, + i -> i == cur_op, + i -> let op = operators.binops[i] + buffer[elem, cur_idx] = op(buffer[elem, l_idx], buffer[elem, r_idx]) + end + ) + end end + return nothing end - return nothing end end From 07dec6bce9e1e9dea0e484b393938714a6f5e775 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Mon, 15 Jan 2024 00:07:47 -0500 Subject: [PATCH 09/47] Formatting --- src/AsArray.jl | 4 +++- src/EvaluateEquation.jl | 5 +---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/AsArray.jl b/src/AsArray.jl index b84a5aad..7c49ea71 100644 --- a/src/AsArray.jl +++ b/src/AsArray.jl @@ -65,7 +65,9 @@ function as_array( ) end - return (; degree, constant, val, feature, op, execution_order, idx_self, idx_l, idx_r, roots) + return (; + degree, constant, val, feature, op, execution_order, idx_self, idx_l, idx_r, roots + ) end end diff --git a/src/EvaluateEquation.jl b/src/EvaluateEquation.jl index e5a4cff8..4be61aaf 100644 --- a/src/EvaluateEquation.jl +++ b/src/EvaluateEquation.jl @@ -96,10 +96,7 @@ function eval_tree_array( return eval_tree_array(tree, cX, operators; turbo, bumper) end function eval_tree_array( - trees::NTuple{M,N}, - cX::AbstractMatrix{T}, - operators::OperatorEnum; - kws... + trees::NTuple{M,N}, cX::AbstractMatrix{T}, operators::OperatorEnum; kws... 
) where {T<:Number,N<:AbstractExpressionNode{T},M} outs = ntuple(i -> eval_tree_array(trees[i], cX, operators; kws...)[1], Val(M)) return ntuple(i -> first(outs[i]), Val(M)), ntuple(i -> last(outs[i]), Val(M)) From c19f37f7bd170a97685afdffda2cafa685cd2196 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Mon, 15 Jan 2024 00:08:37 -0500 Subject: [PATCH 10/47] Remove GPUArrays extension as redundant --- Project.toml | 3 -- ext/DynamicExpressionsGPUArraysExt.jl | 73 --------------------------- 2 files changed, 76 deletions(-) delete mode 100644 ext/DynamicExpressionsGPUArraysExt.jl diff --git a/Project.toml b/Project.toml index 983e5ee0..3158f7b1 100644 --- a/Project.toml +++ b/Project.toml @@ -17,7 +17,6 @@ TOML = "fa267f1f-6049-4f14-aa54-33bafae1ed76" [weakdeps] Bumper = "8ce10254-0962-460f-a3d8-1f77fea1446e" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" -GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890" SymbolicUtils = "d1185830-fcd6-423d-90d6-eec64667417b" Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" @@ -25,7 +24,6 @@ Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" [extensions] DynamicExpressionsBumperExt = "Bumper" DynamicExpressionsCUDAExt = "CUDA" -DynamicExpressionsGPUArraysExt = "GPUArrays" DynamicExpressionsLoopVectorizationExt = "LoopVectorization" DynamicExpressionsSymbolicUtilsExt = "SymbolicUtils" DynamicExpressionsZygoteExt = "Zygote" @@ -36,7 +34,6 @@ Bumper = "0.6" Compat = "3.37, 4" CUDA = "5" Enzyme = "^0.11.12" -GPUArrays = "8, 9, 10" LoopVectorization = "0.12" MacroTools = "0.4, 0.5" PackageExtensionCompat = "1" diff --git a/ext/DynamicExpressionsGPUArraysExt.jl b/ext/DynamicExpressionsGPUArraysExt.jl deleted file mode 100644 index 52d3fb93..00000000 --- a/ext/DynamicExpressionsGPUArraysExt.jl +++ /dev/null @@ -1,73 +0,0 @@ -module DynamicExpressionsGPUArraysExt - -using GPUArrays: AbstractDeviceArray -using DynamicExpressions: OperatorEnum, AbstractExpressionNode, tree_mapreduce -using DynamicExpressions.UtilsModule: counttuple - -import DynamicExpressions.EvaluateEquationModule: eval_tree_array - -function eval_tree_array( - tree::AbstractExpressionNode{T}, - cX::AbstractDeviceArray{T,2}, - operators::OperatorEnum; - _..., -) where {T<:Number} - result = tree_mapreduce( - # Leaf nodes, we create an allocation and fill - # it with the value of the leaf: - leaf -> begin - if leaf.constant - v = leaf.val::T - ar = similar(cX, axes(cX, 2)) - ar .= v - return ar - else - return cX[leaf.feature, :] - end - end, - # Branch nodes, we simply pass them to the evaluation kernel: - branch -> branch, - # In the evaluation kernel, we combine the branch nodes - # with the arrays created by the leaf nodes: - ((branch, cumulators::Vararg{Any,M}) where {M}) -> begin - if M == 1 - return dispatch_kern1!(operators.unaops, branch.op, cumulators[1]) - else - return dispatch_kern2!( - operators.binops, branch.op, cumulators[1], cumulators[2] - ) - end - end, - tree; - break_sharing=Val(true), - ) - return (result, isfinite(sum(result .* zero(T)))) -end -@generated function dispatch_kern1!(unaops, op_idx, cumulator) - nuna = counttuple(unaops) - quote - Base.@nif($nuna, i -> i == op_idx, i -> let op = unaops[i] - return kern1!(op, cumulator) - end,) - end -end -@generated function dispatch_kern2!(binops, op_idx, cumulator1, cumulator2) - nbin = counttuple(binops) - quote - Base.@nif( - $nbin, i -> i == op_idx, i -> let op = binops[i] - return kern2!(op, cumulator1, cumulator2) - end, - ) - end -end -function kern1!(op::F, 
cumulator) where {F} - @. cumulator = op(cumulator) - return cumulator -end -function kern2!(op::F, cumulator1, cumulator2) where {F} - @. cumulator1 = op(cumulator1, cumulator2) - return cumulator1 -end - -end From ade1613b1694105048ef7cbfd88b7e8e31f115f4 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Mon, 15 Jan 2024 00:59:51 -0500 Subject: [PATCH 11/47] Remove unneccessary sync call --- ext/DynamicExpressionsCUDAExt.jl | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/ext/DynamicExpressionsCUDAExt.jl b/ext/DynamicExpressionsCUDAExt.jl index b798b5fc..d50883a6 100644 --- a/ext/DynamicExpressionsCUDAExt.jl +++ b/ext/DynamicExpressionsCUDAExt.jl @@ -35,7 +35,6 @@ function eval_tree_array( gidx_l = cu(idx_l) gidx_r = cu(idx_r) - groots = cu(roots) # Just for indexing output num_threads = 256 num_blocks = nextpow(2, ceil(Int, num_elem * num_nodes / num_threads)) @@ -49,7 +48,7 @@ function eval_tree_array( gdegree, gconstant, gval, gfeature, gop, ) - out = ntuple(i -> gbuffer[:, groots[i]], Val(M)) + out = ntuple(i -> gbuffer[:, roots[i]], Val(M)) is_good = ntuple(i -> isfinite(sum(out[i] .* zero(T))), Val(M)) return out, is_good end @@ -67,15 +66,13 @@ function _launch_gpu_kernel!( (nuna > 10 || nbin > 10) && error("Too many operators. Kernels are only compiled up to 10.") gpu_kernel! = create_gpu_kernel(operators, Val(nuna), Val(nbin)) for launch in I.(1:num_launches) - CUDA.@sync begin - execute = execution_order .== launch - @cuda threads=256 blocks=num_blocks gpu_kernel!( - buffer, - num_elem, num_nodes, execute, - cX, idx_self, idx_l, idx_r, - degree, constant, val, feature, op - ) - end + execute = execution_order .== launch + @cuda threads=256 blocks=num_blocks gpu_kernel!( + buffer, + num_elem, num_nodes, execute, + cX, idx_self, idx_l, idx_r, + degree, constant, val, feature, op + ) end return nothing end From 53b42ceda3bc385ba784f6de8f1152327e3eb231 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Mon, 15 Jan 2024 01:40:29 -0500 Subject: [PATCH 12/47] Reduce number of allocations --- ext/DynamicExpressionsCUDAExt.jl | 35 ++++++++++++++++---------------- src/AsArray.jl | 4 +--- 2 files changed, 19 insertions(+), 20 deletions(-) diff --git a/ext/DynamicExpressionsCUDAExt.jl b/ext/DynamicExpressionsCUDAExt.jl index d50883a6..9f206e08 100644 --- a/ext/DynamicExpressionsCUDAExt.jl +++ b/ext/DynamicExpressionsCUDAExt.jl @@ -12,29 +12,30 @@ function eval_tree_array( tree::AbstractExpressionNode{T}, gcX::CuArray{T,2}, operators::OperatorEnum; _... ) where {T<:Number} (outs, is_good) = eval_tree_array((tree,), gcX, operators) - return only(outs), only(is_good) + return (only(outs), only(is_good)) end function eval_tree_array( trees::NTuple{M,N}, gcX::CuArray{T,2}, operators::OperatorEnum; _... ) where {T<:Number,N<:AbstractExpressionNode{T},M} - (; degree, constant, val, feature, op, execution_order, idx_self, idx_l, idx_r, roots) = as_array(trees...; index_type=Int32) + (; degree, constant, val, feature, op, execution_order, idx_self, idx_l, idx_r, roots) = as_array(Int32, trees...) 
num_launches = maximum(execution_order) num_elem = size(gcX, 2) num_nodes = length(degree) # Convert everything to GPU: gbuffer = CuArray{T}(undef, num_elem, num_nodes) - gexecution_order = cu(execution_order) - gdegree = cu(degree) - gconstant = cu(constant) + gexecution_order = CuArray(execution_order) + gdegree = CuArray(degree) + gconstant = CuArray(constant) gval = CuArray(val) - gfeature = cu(feature) - gop = cu(op) - gidx_self = cu(idx_self) - gidx_l = cu(idx_l) - gidx_r = cu(idx_r) + gfeature = CuArray(feature) + gop = CuArray(op) + gidx_self = CuArray(idx_self) + gidx_l = CuArray(idx_l) + gidx_r = CuArray(idx_r) + gexecute = CUDA.zeros(Bool, num_nodes) num_threads = 256 num_blocks = nextpow(2, ceil(Int, num_elem * num_nodes / num_threads)) @@ -42,21 +43,21 @@ function eval_tree_array( _launch_gpu_kernel!( num_blocks, num_launches, gbuffer, # Thread info: - num_elem, num_nodes, gexecution_order, + num_elem, num_nodes, gexecute, gexecution_order, # Input data and tree operators, gcX, gidx_self, gidx_l, gidx_r, gdegree, gconstant, gval, gfeature, gop, ) - out = ntuple(i -> gbuffer[:, roots[i]], Val(M)) - is_good = ntuple(i -> isfinite(sum(out[i] .* zero(T))), Val(M)) - return out, is_good + out = ntuple(i -> (@view gbuffer[:, roots[i]]), Val(M)) + is_good = ntuple(i -> true, Val(M)) # Up to user to find NaNs + return (out, is_good) end function _launch_gpu_kernel!( num_blocks, num_launches::Integer, buffer::CuArray{T,2}, # Thread info: - num_elem::Integer, num_nodes::Integer, execution_order::CuArray{I}, + num_elem::Integer, num_nodes::Integer, execute::CuArray{Bool}, execution_order::CuArray{I}, # Input data and tree operators::OperatorEnum, cX::CuArray{T,2}, idx_self::CuArray, idx_l::CuArray, idx_r::CuArray, degree::CuArray, constant::CuArray, val::CuArray{T,1}, feature::CuArray, op::CuArray, @@ -65,8 +66,8 @@ function _launch_gpu_kernel!( nbin = get_nbin(typeof(operators)) (nuna > 10 || nbin > 10) && error("Too many operators. Kernels are only compiled up to 10.") gpu_kernel! = create_gpu_kernel(operators, Val(nuna), Val(nbin)) - for launch in I.(1:num_launches) - execute = execution_order .== launch + for launch in one(I):I(num_launches) + @. execute = execution_order .== launch @cuda threads=256 blocks=num_blocks gpu_kernel!( buffer, num_elem, num_nodes, execute, diff --git a/src/AsArray.jl b/src/AsArray.jl index 7c49ea71..0c65897b 100644 --- a/src/AsArray.jl +++ b/src/AsArray.jl @@ -2,9 +2,7 @@ module AsArrayModule using ..EquationModule: AbstractExpressionNode, tree_mapreduce -function as_array( - trees::Vararg{N,M}; index_type::Type{I}=Int64 -) where {T,N<:AbstractExpressionNode{T},I,M} +function as_array(::Type{I}, trees::Vararg{N,M}) where {T,N<:AbstractExpressionNode{T},I,M} each_num_nodes = length.(trees) num_nodes = sum(each_num_nodes) From cdfcec2f4e2912ffb2a3a538e0832866761c527b Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Mon, 15 Jan 2024 02:02:55 -0500 Subject: [PATCH 13/47] Greatly improve allocations --- ext/DynamicExpressionsCUDAExt.jl | 60 ++++++++++++++++++-------------- src/AsArray.jl | 41 ++++++++++++++++------ 2 files changed, 65 insertions(+), 36 deletions(-) diff --git a/ext/DynamicExpressionsCUDAExt.jl b/ext/DynamicExpressionsCUDAExt.jl index 9f206e08..509375e3 100644 --- a/ext/DynamicExpressionsCUDAExt.jl +++ b/ext/DynamicExpressionsCUDAExt.jl @@ -18,59 +18,67 @@ end function eval_tree_array( trees::NTuple{M,N}, gcX::CuArray{T,2}, operators::OperatorEnum; _... 
) where {T<:Number,N<:AbstractExpressionNode{T},M} - (; degree, constant, val, feature, op, execution_order, idx_self, idx_l, idx_r, roots) = as_array(Int32, trees...) + (; constant, val, execution_order, roots, buffer) = as_array(UInt16, trees...) num_launches = maximum(execution_order) num_elem = size(gcX, 2) - num_nodes = length(degree) + num_nodes = size(buffer, 2) - # Convert everything to GPU: - gbuffer = CuArray{T}(undef, num_elem, num_nodes) - gexecution_order = CuArray(execution_order) - gdegree = CuArray(degree) - gconstant = CuArray(constant) + ## Floating point arrays: + workspace = CuArray{T}(undef, num_elem, num_nodes) gval = CuArray(val) - gfeature = CuArray(feature) - gop = CuArray(op) - gidx_self = CuArray(idx_self) - gidx_l = CuArray(idx_l) - gidx_r = CuArray(idx_r) - gexecute = CUDA.zeros(Bool, num_nodes) + ## Bool arrays + gconstant = CuArray(constant) + + ## Index arrays (much faster to have `@view` here) + _gbuffer = CuArray(buffer) + gdegree = @view _gbuffer[1, :] + gfeature = @view _gbuffer[2, :] + gop = @view _gbuffer[3, :] + gexecution_order = @view _gbuffer[4, :] + gidx_self = @view _gbuffer[5, :] + gidx_l = @view _gbuffer[6, :] + gidx_r = @view _gbuffer[7, :] num_threads = 256 num_blocks = nextpow(2, ceil(Int, num_elem * num_nodes / num_threads)) _launch_gpu_kernel!( - num_blocks, num_launches, gbuffer, + num_threads, num_blocks, num_launches, workspace, # Thread info: - num_elem, num_nodes, gexecute, gexecution_order, + num_elem, num_nodes, gexecution_order, # Input data and tree operators, gcX, gidx_self, gidx_l, gidx_r, gdegree, gconstant, gval, gfeature, gop, ) - out = ntuple(i -> (@view gbuffer[:, roots[i]]), Val(M)) - is_good = ntuple(i -> true, Val(M)) # Up to user to find NaNs + out = ntuple( + i -> (@view workspace[:, roots[i]]), + Val(M) + ) + is_good = ntuple( + i -> true, # Up to user to find NaNs + Val(M) + ) return (out, is_good) end function _launch_gpu_kernel!( - num_blocks, num_launches::Integer, buffer::CuArray{T,2}, + num_threads, num_blocks, num_launches::Integer, buffer::AbstractArray{T,2}, # Thread info: - num_elem::Integer, num_nodes::Integer, execute::CuArray{Bool}, execution_order::CuArray{I}, + num_elem::Integer, num_nodes::Integer, execution_order::AbstractArray{I}, # Input data and tree - operators::OperatorEnum, cX::CuArray{T,2}, idx_self::CuArray, idx_l::CuArray, idx_r::CuArray, - degree::CuArray, constant::CuArray, val::CuArray{T,1}, feature::CuArray, op::CuArray, + operators::OperatorEnum, cX::AbstractArray{T,2}, idx_self::AbstractArray, idx_l::AbstractArray, idx_r::AbstractArray, + degree::AbstractArray, constant::AbstractArray, val::AbstractArray{T,1}, feature::AbstractArray, op::AbstractArray, ) where {I,T} nuna = get_nuna(typeof(operators)) nbin = get_nbin(typeof(operators)) (nuna > 10 || nbin > 10) && error("Too many operators. Kernels are only compiled up to 10.") gpu_kernel! = create_gpu_kernel(operators, Val(nuna), Val(nbin)) for launch in one(I):I(num_launches) - @. 
execute = execution_order .== launch - @cuda threads=256 blocks=num_blocks gpu_kernel!( + @cuda threads=num_threads blocks=num_blocks gpu_kernel!( buffer, - num_elem, num_nodes, execute, + launch, num_elem, num_nodes, execution_order, cX, idx_self, idx_l, idx_r, degree, constant, val, feature, op ) @@ -90,7 +98,7 @@ for nuna in 0:10, nbin in 0:10 # Storage: buffer, # Thread info: - num_elem::Integer, num_nodes::Integer, execute::AbstractArray, + launch::Integer, num_elem::Integer, num_nodes::Integer, execution_order::AbstractArray, # Input data and tree cX::AbstractArray, idx_self::AbstractArray, idx_l::AbstractArray, idx_r::AbstractArray, degree::AbstractArray, constant::AbstractArray, val::AbstractArray, feature::AbstractArray, op::AbstractArray, @@ -103,7 +111,7 @@ for nuna in 0:10, nbin in 0:10 node = (i - 1) % num_nodes + 1 elem = (i - node) ÷ num_nodes + 1 - if !execute[node] + if execution_order[node] != launch return nothing end diff --git a/src/AsArray.jl b/src/AsArray.jl index 0c65897b..e687d572 100644 --- a/src/AsArray.jl +++ b/src/AsArray.jl @@ -6,17 +6,28 @@ function as_array(::Type{I}, trees::Vararg{N,M}) where {T,N<:AbstractExpressionN each_num_nodes = length.(trees) num_nodes = sum(each_num_nodes) - degree = Array{fieldtype(N, :degree)}(undef, num_nodes) - constant = Array{fieldtype(N, :constant)}(undef, num_nodes) + roots = Array{I}(undef, M) + + constant = Array{Bool}(undef, num_nodes) + val = Array{T}(undef, num_nodes) - feature = Array{fieldtype(N, :feature)}(undef, num_nodes) - op = Array{fieldtype(N, :op)}(undef, num_nodes) - execution_order = Array{I}(undef, num_nodes) - idx_self = Array{I}(undef, num_nodes) - idx_l = Array{I}(undef, num_nodes) - idx_r = Array{I}(undef, num_nodes) - roots = Array{I}(undef, length(trees)) + # degree = Array{I}(undef, num_nodes) + # feature = Array{I}(undef, num_nodes) + # op = Array{I}(undef, num_nodes) + # execution_order = Array{I}(undef, num_nodes) + # idx_self = Array{I}(undef, num_nodes) + # idx_l = Array{I}(undef, num_nodes) + # idx_r = Array{I}(undef, num_nodes) + ## Views of the same matrix: + buffer = Array{I}(undef, 7, num_nodes) + degree = @view buffer[1, :] + feature = @view buffer[2, :] + op = @view buffer[3, :] + execution_order = @view buffer[4, :] + idx_self = @view buffer[5, :] + idx_l = @view buffer[6, :] + idx_r = @view buffer[7, :] cursor = Ref(zero(I)) for (i_tree, tree) in enumerate(trees) @@ -64,7 +75,17 @@ function as_array(::Type{I}, trees::Vararg{N,M}) where {T,N<:AbstractExpressionN end return (; - degree, constant, val, feature, op, execution_order, idx_self, idx_l, idx_r, roots + degree, + constant, + val, + feature, + op, + execution_order, + idx_self, + idx_l, + idx_r, + roots, + buffer, ) end From 20a8f389cdff8628dafb13222fd66711ea45292d Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Mon, 15 Jan 2024 02:30:48 -0500 Subject: [PATCH 14/47] More memory allocation improvements --- ext/DynamicExpressionsCUDAExt.jl | 31 +++++++++++++++---------------- src/AsArray.jl | 12 ++---------- 2 files changed, 17 insertions(+), 26 deletions(-) diff --git a/ext/DynamicExpressionsCUDAExt.jl b/ext/DynamicExpressionsCUDAExt.jl index 509375e3..8ee215c7 100644 --- a/ext/DynamicExpressionsCUDAExt.jl +++ b/ext/DynamicExpressionsCUDAExt.jl @@ -18,33 +18,31 @@ end function eval_tree_array( trees::NTuple{M,N}, gcX::CuArray{T,2}, operators::OperatorEnum; _... ) where {T<:Number,N<:AbstractExpressionNode{T},M} - (; constant, val, execution_order, roots, buffer) = as_array(UInt16, trees...) 
+ (; val, execution_order, roots, buffer) = as_array(Int32, trees...) num_launches = maximum(execution_order) num_elem = size(gcX, 2) num_nodes = size(buffer, 2) ## Floating point arrays: - workspace = CuArray{T}(undef, num_elem, num_nodes) + gworkspace = CuArray{T}(undef, num_elem, num_nodes) gval = CuArray(val) - ## Bool arrays - gconstant = CuArray(constant) - ## Index arrays (much faster to have `@view` here) - _gbuffer = CuArray(buffer) - gdegree = @view _gbuffer[1, :] - gfeature = @view _gbuffer[2, :] - gop = @view _gbuffer[3, :] - gexecution_order = @view _gbuffer[4, :] - gidx_self = @view _gbuffer[5, :] - gidx_l = @view _gbuffer[6, :] - gidx_r = @view _gbuffer[7, :] + gbuffer = CuArray(buffer) + gdegree = @view gbuffer[1, :] + gfeature = @view gbuffer[2, :] + gop = @view gbuffer[3, :] + gexecution_order = @view gbuffer[4, :] + gidx_self = @view gbuffer[5, :] + gidx_l = @view gbuffer[6, :] + gidx_r = @view gbuffer[7, :] + gconstant = @view gbuffer[8, :] num_threads = 256 num_blocks = nextpow(2, ceil(Int, num_elem * num_nodes / num_threads)) _launch_gpu_kernel!( - num_threads, num_blocks, num_launches, workspace, + num_threads, num_blocks, num_launches, gworkspace, # Thread info: num_elem, num_nodes, gexecution_order, # Input data and tree @@ -53,13 +51,14 @@ function eval_tree_array( ) out = ntuple( - i -> (@view workspace[:, roots[i]]), + i -> @view(gworkspace[:, roots[i]]), Val(M) ) is_good = ntuple( i -> true, # Up to user to find NaNs Val(M) ) + return (out, is_good) end @@ -118,7 +117,7 @@ for nuna in 0:10, nbin in 0:10 cur_degree = degree[node] cur_idx = idx_self[node] if cur_degree == 0 - if constant[node] + if constant[node] == 1 cur_val = val[node] buffer[elem, cur_idx] = cur_val else diff --git a/src/AsArray.jl b/src/AsArray.jl index e687d572..836f180a 100644 --- a/src/AsArray.jl +++ b/src/AsArray.jl @@ -8,19 +8,10 @@ function as_array(::Type{I}, trees::Vararg{N,M}) where {T,N<:AbstractExpressionN roots = Array{I}(undef, M) - constant = Array{Bool}(undef, num_nodes) - val = Array{T}(undef, num_nodes) - # degree = Array{I}(undef, num_nodes) - # feature = Array{I}(undef, num_nodes) - # op = Array{I}(undef, num_nodes) - # execution_order = Array{I}(undef, num_nodes) - # idx_self = Array{I}(undef, num_nodes) - # idx_l = Array{I}(undef, num_nodes) - # idx_r = Array{I}(undef, num_nodes) ## Views of the same matrix: - buffer = Array{I}(undef, 7, num_nodes) + buffer = Array{I}(undef, 8, num_nodes) degree = @view buffer[1, :] feature = @view buffer[2, :] op = @view buffer[3, :] @@ -28,6 +19,7 @@ function as_array(::Type{I}, trees::Vararg{N,M}) where {T,N<:AbstractExpressionN idx_self = @view buffer[5, :] idx_l = @view buffer[6, :] idx_r = @view buffer[7, :] + constant = @view buffer[8, :] cursor = Ref(zero(I)) for (i_tree, tree) in enumerate(trees) From 6bbff73059c9334c926f8b3576b0c2bac146eb93 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Mon, 15 Jan 2024 11:37:17 -0500 Subject: [PATCH 15/47] Allow user to pass preallocated buffer --- Project.toml | 2 +- ext/DynamicExpressionsCUDAExt.jl | 25 +++++++++++++++++-------- src/AsArray.jl | 7 +++++-- 3 files changed, 23 insertions(+), 11 deletions(-) diff --git a/Project.toml b/Project.toml index 3158f7b1..fb3f8746 100644 --- a/Project.toml +++ b/Project.toml @@ -31,8 +31,8 @@ DynamicExpressionsZygoteExt = "Zygote" [compat] Aqua = "0.7" Bumper = "0.6" -Compat = "3.37, 4" CUDA = "5" +Compat = "3.37, 4" Enzyme = "^0.11.12" LoopVectorization = "0.12" MacroTools = "0.4, 0.5" diff --git a/ext/DynamicExpressionsCUDAExt.jl 
b/ext/DynamicExpressionsCUDAExt.jl index 8ee215c7..e873c8af 100644 --- a/ext/DynamicExpressionsCUDAExt.jl +++ b/ext/DynamicExpressionsCUDAExt.jl @@ -9,26 +9,35 @@ using DynamicExpressions.AsArrayModule: as_array import DynamicExpressions.EvaluateEquationModule: eval_tree_array function eval_tree_array( - tree::AbstractExpressionNode{T}, gcX::CuArray{T,2}, operators::OperatorEnum; _... + tree::AbstractExpressionNode{T}, + gcX::CuArray{T,2}, + operators::OperatorEnum; + kws... ) where {T<:Number} - (outs, is_good) = eval_tree_array((tree,), gcX, operators) + (outs, is_good) = eval_tree_array((tree,), gcX, operators; kws...) return (only(outs), only(is_good)) end function eval_tree_array( - trees::NTuple{M,N}, gcX::CuArray{T,2}, operators::OperatorEnum; _... + trees::NTuple{M,N}, + gcX::CuArray{T,2}, + operators::OperatorEnum; + buffer=nothing, + gpu_workspace=nothing, + gpu_buffer=nothing, ) where {T<:Number,N<:AbstractExpressionNode{T},M} - (; val, execution_order, roots, buffer) = as_array(Int32, trees...) + (; val, execution_order, roots, buffer, num_nodes) = as_array(Int32, trees...; buffer) num_launches = maximum(execution_order) num_elem = size(gcX, 2) - num_nodes = size(buffer, 2) ## Floating point arrays: - gworkspace = CuArray{T}(undef, num_elem, num_nodes) - gval = CuArray(val) + gworkspace = gpu_workspace === nothing ? CuArray{T}(undef, num_elem, num_nodes + 1) : gpu_workspace + # gval = CuArray(val) + gval = @view gworkspace[:, end] + copyto!(gval, val) ## Index arrays (much faster to have `@view` here) - gbuffer = CuArray(buffer) + gbuffer = gpu_buffer === nothing ? CuArray(buffer) : copyto!(gpu_buffer, buffer) gdegree = @view gbuffer[1, :] gfeature = @view gbuffer[2, :] gop = @view gbuffer[3, :] diff --git a/src/AsArray.jl b/src/AsArray.jl index 836f180a..df450424 100644 --- a/src/AsArray.jl +++ b/src/AsArray.jl @@ -2,7 +2,9 @@ module AsArrayModule using ..EquationModule: AbstractExpressionNode, tree_mapreduce -function as_array(::Type{I}, trees::Vararg{N,M}) where {T,N<:AbstractExpressionNode{T},I,M} +function as_array( + ::Type{I}, trees::Vararg{N,M}; buffer::Union{AbstractArray{I},Nothing}=nothing +) where {T,N<:AbstractExpressionNode{T},I,M} each_num_nodes = length.(trees) num_nodes = sum(each_num_nodes) @@ -11,7 +13,7 @@ function as_array(::Type{I}, trees::Vararg{N,M}) where {T,N<:AbstractExpressionN val = Array{T}(undef, num_nodes) ## Views of the same matrix: - buffer = Array{I}(undef, 8, num_nodes) + buffer = buffer === nothing ? 
Array{I}(undef, 8, num_nodes) : buffer degree = @view buffer[1, :] feature = @view buffer[2, :] op = @view buffer[3, :] @@ -78,6 +80,7 @@ function as_array(::Type{I}, trees::Vararg{N,M}) where {T,N<:AbstractExpressionN idx_r, roots, buffer, + num_nodes, ) end From 03d2d24d2aab52b946fc5e1eafe55196b39052e0 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Mon, 15 Jan 2024 14:44:23 -0500 Subject: [PATCH 16/47] Try to fix unbound type parameter --- src/AsArray.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/AsArray.jl b/src/AsArray.jl index df450424..cce00f88 100644 --- a/src/AsArray.jl +++ b/src/AsArray.jl @@ -3,7 +3,7 @@ module AsArrayModule using ..EquationModule: AbstractExpressionNode, tree_mapreduce function as_array( - ::Type{I}, trees::Vararg{N,M}; buffer::Union{AbstractArray{I},Nothing}=nothing + ::Type{I}, trees::Vararg{N,M}; buffer::Union{AbstractArray,Nothing}=nothing ) where {T,N<:AbstractExpressionNode{T},I,M} each_num_nodes = length.(trees) num_nodes = sum(each_num_nodes) From 2e128bb4bdc6ca9645d8f7388a7e44ed338ca9d3 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Mon, 15 Jan 2024 14:48:35 -0500 Subject: [PATCH 17/47] Expand CUDA compat to 4 --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index fb3f8746..d6724f76 100644 --- a/Project.toml +++ b/Project.toml @@ -31,7 +31,7 @@ DynamicExpressionsZygoteExt = "Zygote" [compat] Aqua = "0.7" Bumper = "0.6" -CUDA = "5" +CUDA = "4, 5" Compat = "3.37, 4" Enzyme = "^0.11.12" LoopVectorization = "0.12" From da31b8757ecd4bcd4dbc7d4981494c65d7050f0f Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Tue, 16 Jan 2024 04:14:48 -0500 Subject: [PATCH 18/47] Clean up formatting --- ext/DynamicExpressionsCUDAExt.jl | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/ext/DynamicExpressionsCUDAExt.jl b/ext/DynamicExpressionsCUDAExt.jl index e873c8af..b053b351 100644 --- a/ext/DynamicExpressionsCUDAExt.jl +++ b/ext/DynamicExpressionsCUDAExt.jl @@ -1,5 +1,4 @@ module DynamicExpressionsCUDAExt -#! format: off using CUDA using DynamicExpressions: OperatorEnum, AbstractExpressionNode @@ -9,10 +8,7 @@ using DynamicExpressions.AsArrayModule: as_array import DynamicExpressions.EvaluateEquationModule: eval_tree_array function eval_tree_array( - tree::AbstractExpressionNode{T}, - gcX::CuArray{T,2}, - operators::OperatorEnum; - kws... + tree::AbstractExpressionNode{T}, gcX::CuArray{T,2}, operators::OperatorEnum; kws... ) where {T<:Number} (outs, is_good) = eval_tree_array((tree,), gcX, operators; kws...) return (only(outs), only(is_good)) @@ -31,8 +27,11 @@ function eval_tree_array( num_elem = size(gcX, 2) ## Floating point arrays: - gworkspace = gpu_workspace === nothing ? CuArray{T}(undef, num_elem, num_nodes + 1) : gpu_workspace - # gval = CuArray(val) + gworkspace = if gpu_workspace === nothing + CuArray{T}(undef, num_elem, num_nodes + 1) + else + gpu_workspace + end gval = @view gworkspace[:, end] copyto!(gval, val) @@ -50,6 +49,7 @@ function eval_tree_array( num_threads = 256 num_blocks = nextpow(2, ceil(Int, num_elem * num_nodes / num_threads)) + #! format: off _launch_gpu_kernel!( num_threads, num_blocks, num_launches, gworkspace, # Thread info: @@ -58,19 +58,18 @@ function eval_tree_array( operators, gcX, gidx_self, gidx_l, gidx_r, gdegree, gconstant, gval, gfeature, gop, ) + #! 
format: on - out = ntuple( - i -> @view(gworkspace[:, roots[i]]), - Val(M) - ) + out = ntuple(i -> @view(gworkspace[:, roots[i]]), Val(M)) is_good = ntuple( i -> true, # Up to user to find NaNs - Val(M) + Val(M), ) return (out, is_good) end +#! format: off function _launch_gpu_kernel!( num_threads, num_blocks, num_launches::Integer, buffer::AbstractArray{T,2}, # Thread info: @@ -79,17 +78,21 @@ function _launch_gpu_kernel!( operators::OperatorEnum, cX::AbstractArray{T,2}, idx_self::AbstractArray, idx_l::AbstractArray, idx_r::AbstractArray, degree::AbstractArray, constant::AbstractArray, val::AbstractArray{T,1}, feature::AbstractArray, op::AbstractArray, ) where {I,T} + #! format: on nuna = get_nuna(typeof(operators)) nbin = get_nbin(typeof(operators)) - (nuna > 10 || nbin > 10) && error("Too many operators. Kernels are only compiled up to 10.") + (nuna > 10 || nbin > 10) && + error("Too many operators. Kernels are only compiled up to 10.") gpu_kernel! = create_gpu_kernel(operators, Val(nuna), Val(nbin)) for launch in one(I):I(num_launches) + #! format: off @cuda threads=num_threads blocks=num_blocks gpu_kernel!( buffer, launch, num_elem, num_nodes, execution_order, cX, idx_self, idx_l, idx_r, degree, constant, val, feature, op ) + #! format: on end return nothing end @@ -102,6 +105,7 @@ end # 3. We can't use `@generated` because we can't create closures in those. for nuna in 0:10, nbin in 0:10 @eval function create_gpu_kernel(operators::OperatorEnum, ::Val{$nuna}, ::Val{$nbin}) + #! format: off function ( # Storage: buffer, @@ -111,6 +115,7 @@ for nuna in 0:10, nbin in 0:10 cX::AbstractArray, idx_self::AbstractArray, idx_l::AbstractArray, idx_r::AbstractArray, degree::AbstractArray, constant::AbstractArray, val::AbstractArray, feature::AbstractArray, op::AbstractArray, ) + #! format: on i = (blockIdx().x - 1) * blockDim().x + threadIdx().x if i > num_elem * num_nodes return nothing @@ -162,5 +167,4 @@ for nuna in 0:10, nbin in 0:10 end end -#! format: on end From 4fdc025cbca493cfbd1c861b4130e1b57090317e Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Tue, 16 Jan 2024 04:25:20 -0500 Subject: [PATCH 19/47] Reduce allocations in `as_array` --- src/AsArray.jl | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/AsArray.jl b/src/AsArray.jl index cce00f88..aaad05b0 100644 --- a/src/AsArray.jl +++ b/src/AsArray.jl @@ -8,7 +8,7 @@ function as_array( each_num_nodes = length.(trees) num_nodes = sum(each_num_nodes) - roots = Array{I}(undef, M) + roots = tuple(one(I), cumsum(each_num_nodes[1:end-1])...) val = Array{T}(undef, num_nodes) @@ -24,8 +24,7 @@ function as_array( constant = @view buffer[8, :] cursor = Ref(zero(I)) - for (i_tree, tree) in enumerate(trees) - roots[i_tree] = cursor[] + one(I) + for tree in trees tree_mapreduce( leaf -> begin self = (cursor[] += one(I)) From c64a6cbb01664dac528b3146f202c24407b11e8f Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Tue, 16 Jan 2024 04:27:55 -0500 Subject: [PATCH 20/47] Fix indexing --- src/AsArray.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/AsArray.jl b/src/AsArray.jl index aaad05b0..d5a1884a 100644 --- a/src/AsArray.jl +++ b/src/AsArray.jl @@ -8,7 +8,7 @@ function as_array( each_num_nodes = length.(trees) num_nodes = sum(each_num_nodes) - roots = tuple(one(I), cumsum(each_num_nodes[1:end-1])...) 
+    roots = cumsum(tuple(one(I), each_num_nodes[1:end-1]...))

     val = Array{T}(undef, num_nodes)

From 88ee6cf8b6b00c41335d1c79c5aa8627071e3fdb Mon Sep 17 00:00:00 2001
From: MilesCranmer
Date: Tue, 16 Jan 2024 18:54:47 -0500
Subject: [PATCH 21/47] Compatible CUDA kernel with empty operators

---
 ext/DynamicExpressionsCUDAExt.jl | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/ext/DynamicExpressionsCUDAExt.jl b/ext/DynamicExpressionsCUDAExt.jl
index b053b351..1d6c7b19 100644
--- a/ext/DynamicExpressionsCUDAExt.jl
+++ b/ext/DynamicExpressionsCUDAExt.jl
@@ -21,6 +21,7 @@ function eval_tree_array(
     buffer=nothing,
     gpu_workspace=nothing,
     gpu_buffer=nothing,
+    kws...
 ) where {T<:Number,N<:AbstractExpressionNode{T},M}
     (; val, execution_order, roots, buffer, num_nodes) = as_array(Int32, trees...; buffer)
     num_launches = maximum(execution_order)
@@ -139,7 +140,7 @@ for nuna in 0:10, nbin in 0:10
                     buffer[elem, cur_idx] = cX[cur_feature, elem]
                 end
             else
-                if cur_degree == 1
+                if cur_degree == 1 && $nuna > 0
                     cur_op = op[node]
                     l_idx = idx_l[node]
                     Base.Cartesian.@nif(
@@ -149,7 +150,7 @@ for nuna in 0:10, nbin in 0:10
                             buffer[elem, cur_idx] = op(buffer[elem, l_idx])
                         end
                     )
-                else
+                elseif $nbin > 0 # Note this check is to avoid type inference issues when binops is empty
                     cur_op = op[node]
                     l_idx = idx_l[node]
                     r_idx = idx_r[node]

From 243bdda7c785d22d297f3c4b9de96fe37f39f0ca Mon Sep 17 00:00:00 2001
From: MilesCranmer
Date: Sun, 28 Jan 2024 20:38:12 +0000
Subject: [PATCH 22/47] Formatting

---
 ext/DynamicExpressionsCUDAExt.jl | 2 +-
 src/AsArray.jl                   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/ext/DynamicExpressionsCUDAExt.jl b/ext/DynamicExpressionsCUDAExt.jl
index 1d6c7b19..ddb97bf8 100644
--- a/ext/DynamicExpressionsCUDAExt.jl
+++ b/ext/DynamicExpressionsCUDAExt.jl
@@ -21,7 +21,7 @@ function eval_tree_array(
     buffer=nothing,
     gpu_workspace=nothing,
     gpu_buffer=nothing,
-    kws...
+    kws...,
 ) where {T<:Number,N<:AbstractExpressionNode{T},M}
     (; val, execution_order, roots, buffer, num_nodes) = as_array(Int32, trees...; buffer)
     num_launches = maximum(execution_order)
diff --git a/src/AsArray.jl b/src/AsArray.jl
index d5a1884a..dc5d9b2b 100644
--- a/src/AsArray.jl
+++ b/src/AsArray.jl
@@ -8,7 +8,7 @@ function as_array(
     each_num_nodes = length.(trees)
     num_nodes = sum(each_num_nodes)

-    roots = cumsum(tuple(one(I), each_num_nodes[1:end-1]...))
+    roots = cumsum(tuple(one(I), each_num_nodes[1:(end - 1)]...))

     val = Array{T}(undef, num_nodes)

From b1aca681879f6bb07730184e3f3b68d9a259c5bb Mon Sep 17 00:00:00 2001
From: MilesCranmer
Date: Sun, 25 Feb 2024 20:23:54 +0000
Subject: [PATCH 23/47] Ensure we always check root during array construction

---
 src/AsArray.jl | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/AsArray.jl b/src/AsArray.jl
index dc5d9b2b..13b59507 100644
--- a/src/AsArray.jl
+++ b/src/AsArray.jl
@@ -1,11 +1,11 @@
 module AsArrayModule

-using ..EquationModule: AbstractExpressionNode, tree_mapreduce
+using ..EquationModule: AbstractExpressionNode, tree_mapreduce, count_nodes

 function as_array(
     ::Type{I}, trees::Vararg{N,M}; buffer::Union{AbstractArray,Nothing}=nothing
 ) where {T,N<:AbstractExpressionNode{T},I,M}
-    each_num_nodes = length.(trees)
+    each_num_nodes = (t -> count_nodes(t; break_sharing=Val(true))).(trees)
     num_nodes = sum(each_num_nodes)
@@ -24,7 +24,8 @@ function as_array(
     constant = @view buffer[8, :]

     cursor = Ref(zero(I))
-    for tree in trees
+    for (root, tree) in zip(roots, trees)
+        @assert root == cursor[] + 1
         tree_mapreduce(
             leaf -> begin
                 self = (cursor[] += one(I))

From 9372150d8b857f629ede4879a69f50029b6158bd Mon Sep 17 00:00:00 2001
From: MilesCranmer
Date: Sun, 25 Feb 2024 20:43:38 +0000
Subject: [PATCH 24/47] Mechanism to unit-test CUDA kernel

---
 ext/DynamicExpressionsCUDAExt.jl | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/ext/DynamicExpressionsCUDAExt.jl b/ext/DynamicExpressionsCUDAExt.jl
index ddb97bf8..4a05ad48 100644
--- a/ext/DynamicExpressionsCUDAExt.jl
+++ b/ext/DynamicExpressionsCUDAExt.jl
@@ -115,9 +115,11 @@ for nuna in 0:10, nbin in 0:10
             # Input data and tree
             cX::AbstractArray, idx_self::AbstractArray, idx_l::AbstractArray, idx_r::AbstractArray,
             degree::AbstractArray, constant::AbstractArray, val::AbstractArray, feature::AbstractArray, op::AbstractArray,
+            # Override for unittesting:
+            i=nothing,
         )
             #! format: on
-            i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+            i = i === nothing ? (blockIdx().x - 1) * blockDim().x + threadIdx().x : i
             if i > num_elem * num_nodes
                 return nothing
             end

From 5befede43045565050b6aacddd5e6485e040fb34 Mon Sep 17 00:00:00 2001
From: MilesCranmer
Date: Sun, 25 Feb 2024 21:13:33 +0000
Subject: [PATCH 25/47] Fix unbound type issues

---
 src/AsArray.jl          | 6 +++++-
 src/EvaluateEquation.jl | 6 +++---
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/src/AsArray.jl b/src/AsArray.jl
index 13b59507..60239cda 100644
--- a/src/AsArray.jl
+++ b/src/AsArray.jl
@@ -3,8 +3,12 @@ module AsArrayModule
 using ..EquationModule: AbstractExpressionNode, tree_mapreduce, count_nodes

 function as_array(
-    ::Type{I}, trees::Vararg{N,M}; buffer::Union{AbstractArray,Nothing}=nothing
+    ::Type{I},
+    tree::N,
+    additional_trees::Vararg{N,M};
+    buffer::Union{AbstractArray,Nothing}=nothing,
 ) where {T,N<:AbstractExpressionNode{T},I,M}
+    trees = (tree, additional_trees...)
each_num_nodes = (t -> count_nodes(t; break_sharing=Val(true))).(trees) num_nodes = sum(each_num_nodes) diff --git a/src/EvaluateEquation.jl b/src/EvaluateEquation.jl index 0ced0631..764c307a 100644 --- a/src/EvaluateEquation.jl +++ b/src/EvaluateEquation.jl @@ -98,10 +98,10 @@ function eval_tree_array( return eval_tree_array(tree, cX, operators; turbo, bumper) end function eval_tree_array( - trees::NTuple{M,N}, cX::AbstractMatrix{T}, operators::OperatorEnum; kws... + trees::Tuple{N,Vararg{N,M}}, cX::AbstractMatrix{T}, operators::OperatorEnum; kws... ) where {T<:Number,N<:AbstractExpressionNode{T},M} - outs = ntuple(i -> eval_tree_array(trees[i], cX, operators; kws...)[1], Val(M)) - return ntuple(i -> first(outs[i]), Val(M)), ntuple(i -> last(outs[i]), Val(M)) + outs = ntuple(i -> eval_tree_array(trees[i], cX, operators; kws...)[1], Val(M + 1)) + return ntuple(i -> first(outs[i]), Val(M + 1)), ntuple(i -> last(outs[i]), Val(M + 1)) end get_nuna(::Type{<:OperatorEnum{B,U}}) where {B,U} = counttuple(U) From 74094e36e61c0e3b1a3c6823c898ec7b4889b51e Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 25 Feb 2024 22:15:47 +0000 Subject: [PATCH 26/47] Start working on CPU-based unittests --- ext/DynamicExpressionsCUDAExt.jl | 58 ++++++++++++++++++++++++-------- test/test_cuda.jl | 29 ++++++++++++++++ test/unittest.jl | 6 +++- 3 files changed, 78 insertions(+), 15 deletions(-) create mode 100644 test/test_cuda.jl diff --git a/ext/DynamicExpressionsCUDAExt.jl b/ext/DynamicExpressionsCUDAExt.jl index 4a05ad48..457dfa80 100644 --- a/ext/DynamicExpressionsCUDAExt.jl +++ b/ext/DynamicExpressionsCUDAExt.jl @@ -1,22 +1,36 @@ module DynamicExpressionsCUDAExt -using CUDA +using CUDA: @cuda, CuArray, blockDim, blockIdx, threadIdx using DynamicExpressions: OperatorEnum, AbstractExpressionNode using DynamicExpressions.EvaluateEquationModule: get_nbin, get_nuna using DynamicExpressions.AsArrayModule: as_array import DynamicExpressions.EvaluateEquationModule: eval_tree_array +# array type for exclusively testing purposes +struct FakeCuArray{T,N,A<:AbstractArray{T,N}} <: AbstractArray{T,N} + a::A +end +Base.similar(x::FakeCuArray, dims::Integer...) = FakeCuArray(similar(x.a, dims...)) +Base.getindex(x::FakeCuArray, i::Int...) = getindex(x.a, i...) +Base.setindex!(x::FakeCuArray, v, i::Int...) = setindex!(x.a, v, i...) +Base.size(x::FakeCuArray) = size(x.a) + +const MaybeCuArray{T,N} = Union{CuArray{T,2},FakeCuArray{T,N}} + +to_device(a, ::CuArray) = CuArray(a) +to_device(a, ::FakeCuArray) = FakeCuArray(a) + function eval_tree_array( - tree::AbstractExpressionNode{T}, gcX::CuArray{T,2}, operators::OperatorEnum; kws... + tree::AbstractExpressionNode{T}, gcX::MaybeCuArray{T,2}, operators::OperatorEnum; kws... ) where {T<:Number} (outs, is_good) = eval_tree_array((tree,), gcX, operators; kws...) return (only(outs), only(is_good)) end function eval_tree_array( - trees::NTuple{M,N}, - gcX::CuArray{T,2}, + trees::Tuple{N,Vararg{N,M}}, + gcX::MaybeCuArray{T,2}, operators::OperatorEnum; buffer=nothing, gpu_workspace=nothing, @@ -29,7 +43,7 @@ function eval_tree_array( ## Floating point arrays: gworkspace = if gpu_workspace === nothing - CuArray{T}(undef, num_elem, num_nodes + 1) + similar(gcX, num_elem, num_nodes + 1) else gpu_workspace end @@ -37,7 +51,11 @@ function eval_tree_array( copyto!(gval, val) ## Index arrays (much faster to have `@view` here) - gbuffer = gpu_buffer === nothing ? 
CuArray(buffer) : copyto!(gpu_buffer, buffer) + gbuffer = if gpu_buffer === nothing + to_device(buffer, gcX) + else + copyto!(gpu_buffer, buffer) + end gdegree = @view gbuffer[1, :] gfeature = @view gbuffer[2, :] gop = @view gbuffer[3, :] @@ -61,10 +79,10 @@ function eval_tree_array( ) #! format: on - out = ntuple(i -> @view(gworkspace[:, roots[i]]), Val(M)) + out = ntuple(i -> @view(gworkspace[:, roots[i]]), Val(M + 1)) is_good = ntuple( i -> true, # Up to user to find NaNs - Val(M), + Val(M + 1), ) return (out, is_good) @@ -87,12 +105,24 @@ function _launch_gpu_kernel!( gpu_kernel! = create_gpu_kernel(operators, Val(nuna), Val(nbin)) for launch in one(I):I(num_launches) #! format: off - @cuda threads=num_threads blocks=num_blocks gpu_kernel!( - buffer, - launch, num_elem, num_nodes, execution_order, - cX, idx_self, idx_l, idx_r, - degree, constant, val, feature, op - ) + if buffer isa CuArray + @cuda threads=num_threads blocks=num_blocks gpu_kernel!( + buffer, + launch, num_elem, num_nodes, execution_order, + cX, idx_self, idx_l, idx_r, + degree, constant, val, feature, op + ) + else + Threads.@threads for i in 1:(num_threads * num_blocks) + gpu_kernel!( + buffer, + launch, num_elem, num_nodes, execution_order, + cX, idx_self, idx_l, idx_r, + degree, constant, val, feature, op, + i + ) + end + end #! format: on end return nothing diff --git a/test/test_cuda.jl b/test/test_cuda.jl new file mode 100644 index 00000000..264db2a4 --- /dev/null +++ b/test/test_cuda.jl @@ -0,0 +1,29 @@ +using DynamicExpressions +using CUDA +using Random + +ext = Base.get_extension(DynamicExpressions, :DynamicExpressionsCUDAExt) +const FakeCuArray = ext.FakeCuArray + +include("tree_gen_utils.jl") + +let + operators = OperatorEnum(; binary_operators=[+, -, *, /], unary_operators=[cos, sin]); + x1, x2, x3 = (i -> Node(Float64; feature=i)).(1:3) + + for T in (Float32, Float64, ComplexF64), num_trees in (1, 2, 3), seed in 0:10 + Random.seed!(seed) + num_rows = rand(10:30) + nodes_per = rand(10:25, num_trees) + trees = ntuple(i -> gen_random_tree_fixed_size(nodes_per[i], operators, 3, T), num_trees) + @show trees + X = randn(T, 3, num_rows) + y, completed = eval_tree_array(trees, X, operators) + gpu_y, gpu_completed = eval_tree_array(trees, FakeCuArray(X), operators) + gpu_y = Array.(gpu_y) + + for i in eachindex(completed, gpu_completed) + @test ((completed[i] && gpu_completed[i]) && (y[i] ≈ gpu_y[i])) || (!completed[i] && !gpu_completed[i]) + end + end +end diff --git a/test/unittest.jl b/test/unittest.jl index 1f3528c0..8200c538 100644 --- a/test/unittest.jl +++ b/test/unittest.jl @@ -9,7 +9,7 @@ end end # Trigger extensions: -using Zygote, SymbolicUtils, LoopVectorization, Bumper, Optim +using Zygote, SymbolicUtils, LoopVectorization, Bumper, Optim, CUDA @safetestset "Test deprecations" begin include("test_deprecations.jl") @@ -110,3 +110,7 @@ end @safetestset "Test random sampling" begin include("test_random.jl") end + +@safetestset "Test CUDA" begin + include("test_cuda.jl") +end From 9d7500c67b51a832595a7c56fd0b28d63b2c0a2f Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 25 Feb 2024 22:16:06 +0000 Subject: [PATCH 27/47] Fix indexing bugs in GPU kernels --- ext/DynamicExpressionsCUDAExt.jl | 12 ++++++++---- src/EvaluateEquation.jl | 2 +- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/ext/DynamicExpressionsCUDAExt.jl b/ext/DynamicExpressionsCUDAExt.jl index 457dfa80..613ab8b2 100644 --- a/ext/DynamicExpressionsCUDAExt.jl +++ b/ext/DynamicExpressionsCUDAExt.jl @@ -15,6 +15,7 @@ 
Base.similar(x::FakeCuArray, dims::Integer...) = FakeCuArray(similar(x.a, dims.. Base.getindex(x::FakeCuArray, i::Int...) = getindex(x.a, i...) Base.setindex!(x::FakeCuArray, v, i::Int...) = setindex!(x.a, v, i...) Base.size(x::FakeCuArray) = size(x.a) +Base.Array(x::FakeCuArray) = Array(x.a) const MaybeCuArray{T,N} = Union{CuArray{T,2},FakeCuArray{T,N}} @@ -41,13 +42,16 @@ function eval_tree_array( num_launches = maximum(execution_order) num_elem = size(gcX, 2) - ## Floating point arrays: + ## The following array is our "workspace" for + ## the GPU kernel, with size equal to the number of rows + ## in the input data by the number of nodes in the tree. + ## It has one extra row to store the constant values. gworkspace = if gpu_workspace === nothing - similar(gcX, num_elem, num_nodes + 1) + similar(gcX, num_elem + 1, num_nodes) else gpu_workspace end - gval = @view gworkspace[:, end] + gval = @view gworkspace[end, :] copyto!(gval, val) ## Index arrays (much faster to have `@view` here) @@ -79,7 +83,7 @@ function eval_tree_array( ) #! format: on - out = ntuple(i -> @view(gworkspace[:, roots[i]]), Val(M + 1)) + out = ntuple(i -> @view(gworkspace[begin:end-1, roots[i]]), Val(M + 1)) is_good = ntuple( i -> true, # Up to user to find NaNs Val(M + 1), diff --git a/src/EvaluateEquation.jl b/src/EvaluateEquation.jl index 764c307a..0683405c 100644 --- a/src/EvaluateEquation.jl +++ b/src/EvaluateEquation.jl @@ -100,7 +100,7 @@ end function eval_tree_array( trees::Tuple{N,Vararg{N,M}}, cX::AbstractMatrix{T}, operators::OperatorEnum; kws... ) where {T<:Number,N<:AbstractExpressionNode{T},M} - outs = ntuple(i -> eval_tree_array(trees[i], cX, operators; kws...)[1], Val(M + 1)) + outs = ntuple(i -> eval_tree_array(trees[i], cX, operators; kws...), Val(M + 1)) return ntuple(i -> first(outs[i]), Val(M + 1)), ntuple(i -> last(outs[i]), Val(M + 1)) end From f6155badaf9d9a38c1ada861538d8912eecf4e5b Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 25 Feb 2024 22:21:14 +0000 Subject: [PATCH 28/47] Fix unsafe `sin` in CUDA kernel tests --- ext/DynamicExpressionsCUDAExt.jl | 2 +- test/test_cuda.jl | 16 ++++++++++++---- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/ext/DynamicExpressionsCUDAExt.jl b/ext/DynamicExpressionsCUDAExt.jl index 613ab8b2..f351dd82 100644 --- a/ext/DynamicExpressionsCUDAExt.jl +++ b/ext/DynamicExpressionsCUDAExt.jl @@ -83,7 +83,7 @@ function eval_tree_array( ) #! format: on - out = ntuple(i -> @view(gworkspace[begin:end-1, roots[i]]), Val(M + 1)) + out = ntuple(i -> @view(gworkspace[begin:(end - 1), roots[i]]), Val(M + 1)) is_good = ntuple( i -> true, # Up to user to find NaNs Val(M + 1), diff --git a/test/test_cuda.jl b/test/test_cuda.jl index 264db2a4..11f7567e 100644 --- a/test/test_cuda.jl +++ b/test/test_cuda.jl @@ -7,23 +7,31 @@ const FakeCuArray = ext.FakeCuArray include("tree_gen_utils.jl") +safe_sin(x) = isfinite(x) ? sin(x) : convert(eltype(x), NaN) +safe_cos(x) = isfinite(x) ? 
cos(x) : convert(eltype(x), NaN) + let - operators = OperatorEnum(; binary_operators=[+, -, *, /], unary_operators=[cos, sin]); + operators = OperatorEnum(; + binary_operators=[+, -, *, /], unary_operators=[safe_sin, safe_cos] + ) x1, x2, x3 = (i -> Node(Float64; feature=i)).(1:3) for T in (Float32, Float64, ComplexF64), num_trees in (1, 2, 3), seed in 0:10 Random.seed!(seed) num_rows = rand(10:30) nodes_per = rand(10:25, num_trees) - trees = ntuple(i -> gen_random_tree_fixed_size(nodes_per[i], operators, 3, T), num_trees) - @show trees + trees = ntuple( + i -> gen_random_tree_fixed_size(nodes_per[i], operators, 3, T), num_trees + ) X = randn(T, 3, num_rows) y, completed = eval_tree_array(trees, X, operators) gpu_y, gpu_completed = eval_tree_array(trees, FakeCuArray(X), operators) gpu_y = Array.(gpu_y) for i in eachindex(completed, gpu_completed) - @test ((completed[i] && gpu_completed[i]) && (y[i] ≈ gpu_y[i])) || (!completed[i] && !gpu_completed[i]) + if completed[i] + @test y[i] ≈ gpu_y[i] + end end end end From a7d7e795925fd0ca25f5b9bd2d4651a4f1b95184 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 25 Feb 2024 22:30:04 +0000 Subject: [PATCH 29/47] Fix `MaybeCuArray` definition --- ext/DynamicExpressionsCUDAExt.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ext/DynamicExpressionsCUDAExt.jl b/ext/DynamicExpressionsCUDAExt.jl index f351dd82..5dd08479 100644 --- a/ext/DynamicExpressionsCUDAExt.jl +++ b/ext/DynamicExpressionsCUDAExt.jl @@ -17,7 +17,7 @@ Base.setindex!(x::FakeCuArray, v, i::Int...) = setindex!(x.a, v, i...) Base.size(x::FakeCuArray) = size(x.a) Base.Array(x::FakeCuArray) = Array(x.a) -const MaybeCuArray{T,N} = Union{CuArray{T,2},FakeCuArray{T,N}} +const MaybeCuArray{T,N} = Union{CuArray{T,N},FakeCuArray{T,N}} to_device(a, ::CuArray) = CuArray(a) to_device(a, ::FakeCuArray) = FakeCuArray(a) From 46a6b48ae3a7e12a123ad06749b08f52387d0db9 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 25 Feb 2024 22:46:29 +0000 Subject: [PATCH 30/47] Test buffered-evaluation of CUDA kernels --- ext/DynamicExpressionsCUDAExt.jl | 1 - test/test_cuda.jl | 41 +++++++++++++++++++++----------- 2 files changed, 27 insertions(+), 15 deletions(-) diff --git a/ext/DynamicExpressionsCUDAExt.jl b/ext/DynamicExpressionsCUDAExt.jl index 5dd08479..07b72de7 100644 --- a/ext/DynamicExpressionsCUDAExt.jl +++ b/ext/DynamicExpressionsCUDAExt.jl @@ -15,7 +15,6 @@ Base.similar(x::FakeCuArray, dims::Integer...) = FakeCuArray(similar(x.a, dims.. Base.getindex(x::FakeCuArray, i::Int...) = getindex(x.a, i...) Base.setindex!(x::FakeCuArray, v, i::Int...) = setindex!(x.a, v, i...) 
Base.size(x::FakeCuArray) = size(x.a) -Base.Array(x::FakeCuArray) = Array(x.a) const MaybeCuArray{T,N} = Union{CuArray{T,N},FakeCuArray{T,N}} diff --git a/test/test_cuda.jl b/test/test_cuda.jl index 11f7567e..8f9f33d0 100644 --- a/test/test_cuda.jl +++ b/test/test_cuda.jl @@ -16,21 +16,34 @@ let ) x1, x2, x3 = (i -> Node(Float64; feature=i)).(1:3) - for T in (Float32, Float64, ComplexF64), num_trees in (1, 2, 3), seed in 0:10 + for T in (Float32, Float64, ComplexF64), ntrees in (1, 2, 3), seed in 0:10 Random.seed!(seed) - num_rows = rand(10:30) - nodes_per = rand(10:25, num_trees) - trees = ntuple( - i -> gen_random_tree_fixed_size(nodes_per[i], operators, 3, T), num_trees - ) - X = randn(T, 3, num_rows) - y, completed = eval_tree_array(trees, X, operators) - gpu_y, gpu_completed = eval_tree_array(trees, FakeCuArray(X), operators) - gpu_y = Array.(gpu_y) - - for i in eachindex(completed, gpu_completed) - if completed[i] - @test y[i] ≈ gpu_y[i] + + nrow = rand(10:30) + nnodes = rand(10:25, ntrees) + + buffer = rand(Bool) ? ones(Int32, 8, sum(nnodes)) : nothing + gpu_buffer = rand(Bool) ? FakeCuArray(ones(Int32, 8, sum(nnodes))) : nothing + gpu_workspace = rand(Bool) ? FakeCuArray(ones(T, nrow + 1, sum(nnodes))) : nothing + + trees = ntuple(i -> gen_random_tree_fixed_size(nnodes[i], operators, 3, T), ntrees) + X = randn(T, 3, nrow) + if ntrees > 1 + y, completed = eval_tree_array(trees, X, operators) + gpu_y, gpu_completed = eval_tree_array( + trees, FakeCuArray(X), operators; buffer, gpu_workspace, gpu_buffer + ) + + for i in eachindex(completed, gpu_completed) + if completed[i] + @test y[i] ≈ gpu_y[i] + end + end + else + y, completed = eval_tree_array(only(trees), X, operators) + gpu_y, gpu_completed = eval_tree_array(only(trees), FakeCuArray(X), operators) + if completed + @test y ≈ gpu_y end end end From 7ceacc077eba0513c29f25d32d39d24aa1fc2509 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 25 Feb 2024 22:47:47 +0000 Subject: [PATCH 31/47] Only test CUDA on 1.9+ --- test/unittest.jl | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/test/unittest.jl b/test/unittest.jl index 8200c538..48462ad2 100644 --- a/test/unittest.jl +++ b/test/unittest.jl @@ -9,7 +9,7 @@ end end # Trigger extensions: -using Zygote, SymbolicUtils, LoopVectorization, Bumper, Optim, CUDA +using Zygote, SymbolicUtils, LoopVectorization, Bumper, Optim @safetestset "Test deprecations" begin include("test_deprecations.jl") @@ -111,6 +111,8 @@ end include("test_random.jl") end -@safetestset "Test CUDA" begin - include("test_cuda.jl") +if VERSION >= v"1.9" + @eval @safetestset "Test CUDA" begin + include("test_cuda.jl") + end end From 08df9a31ce53540277d7c80ce1fffc1a69e95650 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 25 Feb 2024 23:13:10 +0000 Subject: [PATCH 32/47] Use vectors for vector input; tuple for tuple --- ext/DynamicExpressionsCUDAExt.jl | 13 +++++-------- src/AsArray.jl | 15 ++++++++++----- src/EvaluateEquation.jl | 11 +++++++---- test/test_cuda.jl | 22 ++++++++++++++++++---- 4 files changed, 40 insertions(+), 21 deletions(-) diff --git a/ext/DynamicExpressionsCUDAExt.jl b/ext/DynamicExpressionsCUDAExt.jl index 07b72de7..23c797b8 100644 --- a/ext/DynamicExpressionsCUDAExt.jl +++ b/ext/DynamicExpressionsCUDAExt.jl @@ -29,15 +29,15 @@ function eval_tree_array( end function eval_tree_array( - trees::Tuple{N,Vararg{N,M}}, + trees::Union{NTuple{M,N} where M,AbstractVector{N}}, gcX::MaybeCuArray{T,2}, operators::OperatorEnum; buffer=nothing, gpu_workspace=nothing, 
gpu_buffer=nothing, kws..., -) where {T<:Number,N<:AbstractExpressionNode{T},M} - (; val, execution_order, roots, buffer, num_nodes) = as_array(Int32, trees...; buffer) +) where {T<:Number,N<:AbstractExpressionNode{T}} + (; val, execution_order, roots, buffer, num_nodes) = as_array(Int32, trees; buffer) num_launches = maximum(execution_order) num_elem = size(gcX, 2) @@ -82,11 +82,8 @@ function eval_tree_array( ) #! format: on - out = ntuple(i -> @view(gworkspace[begin:(end - 1), roots[i]]), Val(M + 1)) - is_good = ntuple( - i -> true, # Up to user to find NaNs - Val(M + 1), - ) + out = (r -> @view(gworkspace[begin:(end - 1), r])).(roots) + is_good = (_ -> true).(trees) return (out, is_good) end diff --git a/src/AsArray.jl b/src/AsArray.jl index 60239cda..3d7bdde9 100644 --- a/src/AsArray.jl +++ b/src/AsArray.jl @@ -4,15 +4,20 @@ using ..EquationModule: AbstractExpressionNode, tree_mapreduce, count_nodes function as_array( ::Type{I}, - tree::N, - additional_trees::Vararg{N,M}; + trees::Union{NTuple{M,N} where M,AbstractVector{N}}; buffer::Union{AbstractArray,Nothing}=nothing, -) where {T,N<:AbstractExpressionNode{T},I,M} - trees = (tree, additional_trees...) +) where {T,N<:AbstractExpressionNode{T},I} each_num_nodes = (t -> count_nodes(t; break_sharing=Val(true))).(trees) num_nodes = sum(each_num_nodes) - roots = cumsum(tuple(one(I), each_num_nodes[1:(end - 1)]...)) + # Want `roots` to be tuple if `trees` is tuple and similar for vector + roots = cumsum( + if each_num_nodes isa Tuple + tuple(one(I), each_num_nodes[1:(end - 1)]...) + else + vcat(one(I), each_num_nodes[1:(end - 1)]) + end, + ) val = Array{T}(undef, num_nodes) diff --git a/src/EvaluateEquation.jl b/src/EvaluateEquation.jl index 0683405c..df52a0cb 100644 --- a/src/EvaluateEquation.jl +++ b/src/EvaluateEquation.jl @@ -98,10 +98,13 @@ function eval_tree_array( return eval_tree_array(tree, cX, operators; turbo, bumper) end function eval_tree_array( - trees::Tuple{N,Vararg{N,M}}, cX::AbstractMatrix{T}, operators::OperatorEnum; kws... -) where {T<:Number,N<:AbstractExpressionNode{T},M} - outs = ntuple(i -> eval_tree_array(trees[i], cX, operators; kws...), Val(M + 1)) - return ntuple(i -> first(outs[i]), Val(M + 1)), ntuple(i -> last(outs[i]), Val(M + 1)) + trees::Union{NTuple{M,N} where M,AbstractArray{N}}, + cX::AbstractMatrix{T}, + operators::OperatorEnum; + kws..., +) where {T<:Number,N<:AbstractExpressionNode{T}} + outs = (t -> eval_tree_array(t, cX, operators; kws...)).(trees) + return first.(outs), last.(outs) end get_nuna(::Type{<:OperatorEnum{B,U}}) where {B,U} = counttuple(U) diff --git a/test/test_cuda.jl b/test/test_cuda.jl index 8f9f33d0..443df746 100644 --- a/test/test_cuda.jl +++ b/test/test_cuda.jl @@ -21,27 +21,41 @@ let nrow = rand(10:30) nnodes = rand(10:25, ntrees) + use_tuple = rand(Bool) buffer = rand(Bool) ? ones(Int32, 8, sum(nnodes)) : nothing gpu_buffer = rand(Bool) ? FakeCuArray(ones(Int32, 8, sum(nnodes))) : nothing gpu_workspace = rand(Bool) ? FakeCuArray(ones(T, nrow + 1, sum(nnodes))) : nothing trees = ntuple(i -> gen_random_tree_fixed_size(nnodes[i], operators, 3, T), ntrees) + trees = use_tuple ? 
trees : collect(trees) X = randn(T, 3, nrow) if ntrees > 1 - y, completed = eval_tree_array(trees, X, operators) - gpu_y, gpu_completed = eval_tree_array( + y, completed = @inferred eval_tree_array(trees, X, operators) + gpu_y, gpu_completed = @inferred eval_tree_array( trees, FakeCuArray(X), operators; buffer, gpu_workspace, gpu_buffer ) + # Should give same result either way for i in eachindex(completed, gpu_completed) if completed[i] @test y[i] ≈ gpu_y[i] end end + + # Should return same type as input + if use_tuple + @test y isa Tuple + @test gpu_y isa Tuple + else + @test y isa Vector + @test gpu_y isa Vector + end else - y, completed = eval_tree_array(only(trees), X, operators) - gpu_y, gpu_completed = eval_tree_array(only(trees), FakeCuArray(X), operators) + y, completed = @inferred eval_tree_array(only(trees), X, operators) + gpu_y, gpu_completed = @inferred eval_tree_array( + only(trees), FakeCuArray(X), operators + ) if completed @test y ≈ gpu_y end From 8b4cbf379c173983bd4f8ed47280d19902cd1619 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 25 Feb 2024 23:20:48 +0000 Subject: [PATCH 33/47] Fix unbound args in multi-tree eval --- ext/DynamicExpressionsCUDAExt.jl | 2 +- src/AsArray.jl | 2 +- src/EvaluateEquation.jl | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ext/DynamicExpressionsCUDAExt.jl b/ext/DynamicExpressionsCUDAExt.jl index 23c797b8..929ba31f 100644 --- a/ext/DynamicExpressionsCUDAExt.jl +++ b/ext/DynamicExpressionsCUDAExt.jl @@ -29,7 +29,7 @@ function eval_tree_array( end function eval_tree_array( - trees::Union{NTuple{M,N} where M,AbstractVector{N}}, + trees::Union{Tuple{N,Vararg{N}},AbstractVector{N}}, gcX::MaybeCuArray{T,2}, operators::OperatorEnum; buffer=nothing, diff --git a/src/AsArray.jl b/src/AsArray.jl index 3d7bdde9..0939a11c 100644 --- a/src/AsArray.jl +++ b/src/AsArray.jl @@ -4,7 +4,7 @@ using ..EquationModule: AbstractExpressionNode, tree_mapreduce, count_nodes function as_array( ::Type{I}, - trees::Union{NTuple{M,N} where M,AbstractVector{N}}; + trees::Union{Tuple{N,Vararg{N}},AbstractVector{N}}; buffer::Union{AbstractArray,Nothing}=nothing, ) where {T,N<:AbstractExpressionNode{T},I} each_num_nodes = (t -> count_nodes(t; break_sharing=Val(true))).(trees) diff --git a/src/EvaluateEquation.jl b/src/EvaluateEquation.jl index df52a0cb..2749bbfc 100644 --- a/src/EvaluateEquation.jl +++ b/src/EvaluateEquation.jl @@ -98,7 +98,7 @@ function eval_tree_array( return eval_tree_array(tree, cX, operators; turbo, bumper) end function eval_tree_array( - trees::Union{NTuple{M,N} where M,AbstractArray{N}}, + trees::Union{Tuple{N,Vararg{N}},AbstractVector{N}}, cX::AbstractMatrix{T}, operators::OperatorEnum; kws..., From 676ad86e98795712e78c9667883903bed8c0b9c3 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Mon, 26 Feb 2024 00:35:43 +0000 Subject: [PATCH 34/47] Add test for no buffer recreation --- ext/DynamicExpressionsCUDAExt.jl | 19 ++-- src/AsArray.jl | 12 ++- test/test_cuda.jl | 148 ++++++++++++++++++++++--------- 3 files changed, 130 insertions(+), 49 deletions(-) diff --git a/ext/DynamicExpressionsCUDAExt.jl b/ext/DynamicExpressionsCUDAExt.jl index 929ba31f..bb965214 100644 --- a/ext/DynamicExpressionsCUDAExt.jl +++ b/ext/DynamicExpressionsCUDAExt.jl @@ -35,10 +35,15 @@ function eval_tree_array( buffer=nothing, gpu_workspace=nothing, gpu_buffer=nothing, + roots=nothing, + num_nodes=nothing, + num_launches=nothing, + update_buffers::Val{_update_buffers}=Val(true), kws..., -) where {T<:Number,N<:AbstractExpressionNode{T}} - (; val, 
execution_order, roots, buffer, num_nodes) = as_array(Int32, trees; buffer) - num_launches = maximum(execution_order) +) where {T<:Number,N<:AbstractExpressionNode{T},_update_buffers} + if _update_buffers + (; val, roots, buffer, num_nodes, num_launches) = as_array(Int32, trees; buffer) + end num_elem = size(gcX, 2) ## The following array is our "workspace" for @@ -51,10 +56,14 @@ function eval_tree_array( gpu_workspace end gval = @view gworkspace[end, :] - copyto!(gval, val) + if _update_buffers + copyto!(gval, val) + end ## Index arrays (much faster to have `@view` here) - gbuffer = if gpu_buffer === nothing + gbuffer = if !_update_buffers + gpu_buffer + elseif gpu_buffer === nothing to_device(buffer, gcX) else copyto!(gpu_buffer, buffer) diff --git a/src/AsArray.jl b/src/AsArray.jl index 0939a11c..75ff1ece 100644 --- a/src/AsArray.jl +++ b/src/AsArray.jl @@ -4,9 +4,12 @@ using ..EquationModule: AbstractExpressionNode, tree_mapreduce, count_nodes function as_array( ::Type{I}, - trees::Union{Tuple{N,Vararg{N}},AbstractVector{N}}; + trees::Union{N,Tuple{N,Vararg{N}},AbstractVector{N}}; buffer::Union{AbstractArray,Nothing}=nothing, ) where {T,N<:AbstractExpressionNode{T},I} + if trees isa N + return as_array(I, (trees,); buffer=buffer) + end each_num_nodes = (t -> count_nodes(t; break_sharing=Val(true))).(trees) num_nodes = sum(each_num_nodes) @@ -33,6 +36,7 @@ function as_array( constant = @view buffer[8, :] cursor = Ref(zero(I)) + num_launches = zero(I) for (root, tree) in zip(roots, trees) @assert root == cursor[] + 1 tree_mapreduce( @@ -70,6 +74,11 @@ function as_array( end execution_order[parent.id] = parent_execution_order + # Global number of launches equal to maximum execution order + if parent_execution_order > num_launches + num_launches = parent_execution_order + end + (id=parent.id, order=parent_execution_order) end, tree; @@ -84,6 +93,7 @@ function as_array( feature, op, execution_order, + num_launches, idx_self, idx_l, idx_r, diff --git a/test/test_cuda.jl b/test/test_cuda.jl index 443df746..2d46bce0 100644 --- a/test/test_cuda.jl +++ b/test/test_cuda.jl @@ -1,4 +1,5 @@ using DynamicExpressions +using DynamicExpressions.AsArrayModule: as_array using CUDA using Random @@ -10,55 +11,116 @@ include("tree_gen_utils.jl") safe_sin(x) = isfinite(x) ? sin(x) : convert(eltype(x), NaN) safe_cos(x) = isfinite(x) ? cos(x) : convert(eltype(x), NaN) -let - operators = OperatorEnum(; - binary_operators=[+, -, *, /], unary_operators=[safe_sin, safe_cos] - ) - x1, x2, x3 = (i -> Node(Float64; feature=i)).(1:3) - - for T in (Float32, Float64, ComplexF64), ntrees in (1, 2, 3), seed in 0:10 - Random.seed!(seed) - - nrow = rand(10:30) - nnodes = rand(10:25, ntrees) - use_tuple = rand(Bool) - - buffer = rand(Bool) ? ones(Int32, 8, sum(nnodes)) : nothing - gpu_buffer = rand(Bool) ? FakeCuArray(ones(Int32, 8, sum(nnodes))) : nothing - gpu_workspace = rand(Bool) ? FakeCuArray(ones(T, nrow + 1, sum(nnodes))) : nothing - - trees = ntuple(i -> gen_random_tree_fixed_size(nnodes[i], operators, 3, T), ntrees) - trees = use_tuple ? 
trees : collect(trees) - X = randn(T, 3, nrow) - if ntrees > 1 - y, completed = @inferred eval_tree_array(trees, X, operators) - gpu_y, gpu_completed = @inferred eval_tree_array( - trees, FakeCuArray(X), operators; buffer, gpu_workspace, gpu_buffer +@testset "Random evals" begin + let + operators = OperatorEnum(; + binary_operators=[+, -, *, /], unary_operators=[safe_sin, safe_cos] + ) + x1, x2, x3 = (i -> Node(Float64; feature=i)).(1:3) + + for T in (Float32, Float64, ComplexF64), ntrees in (1, 2, 3), seed in 0:10 + Random.seed!(seed) + + nrow = rand(10:30) + nnodes = rand(10:25, ntrees) + use_tuple = rand(Bool) + + buffer = rand(Bool) ? ones(Int32, 8, sum(nnodes)) : nothing + gpu_buffer = rand(Bool) ? FakeCuArray(ones(Int32, 8, sum(nnodes))) : nothing + gpu_workspace = + rand(Bool) ? FakeCuArray(ones(T, nrow + 1, sum(nnodes))) : nothing + + trees = ntuple( + i -> gen_random_tree_fixed_size(nnodes[i], operators, 3, T), ntrees ) + trees = use_tuple ? trees : collect(trees) + X = randn(T, 3, nrow) + if ntrees > 1 + y, completed = @inferred eval_tree_array(trees, X, operators) + gpu_y, gpu_completed = @inferred eval_tree_array( + trees, FakeCuArray(X), operators; buffer, gpu_workspace, gpu_buffer + ) - # Should give same result either way - for i in eachindex(completed, gpu_completed) - if completed[i] - @test y[i] ≈ gpu_y[i] + # Should give same result either way + for i in eachindex(completed, gpu_completed) + if completed[i] + @test y[i] ≈ gpu_y[i] + end end - end - # Should return same type as input - if use_tuple - @test y isa Tuple - @test gpu_y isa Tuple + # Should return same type as input + if use_tuple + @test y isa Tuple + @test gpu_y isa Tuple + else + @test y isa Vector + @test gpu_y isa Vector + end else - @test y isa Vector - @test gpu_y isa Vector - end - else - y, completed = @inferred eval_tree_array(only(trees), X, operators) - gpu_y, gpu_completed = @inferred eval_tree_array( - only(trees), FakeCuArray(X), operators - ) - if completed - @test y ≈ gpu_y + y, completed = @inferred eval_tree_array(only(trees), X, operators) + gpu_y, gpu_completed = @inferred eval_tree_array( + only(trees), FakeCuArray(X), operators + ) + if completed + @test y ≈ gpu_y + end end end end end + +@testset "Evaluation on pre-computed buffers" begin + let + operators = OperatorEnum(; + binary_operators=[+, -, *, /], unary_operators=[sin, cos] + ) + x1, x2, x3 = (i -> Node(Float64; feature=i)).(1:3) + Random.seed!(0) + tree = sin(x1 * 3.1 - x3 * 0.9 + 0.2) * x2 - x3 * x3 * 1.5 + X = randn(Float64, 3, 100) + + y1, _ = eval_tree_array(tree, X, operators) + y2, _ = eval_tree_array(tree, FakeCuArray(X), operators) + + @test y1 ≈ y2 + + (; val, roots, buffer, num_nodes, num_launches) = as_array(Int32, tree) + gpu_buffer = FakeCuArray(buffer) + gpu_workspace = FakeCuArray(zeros(Float64, 101, 50)) + copyto!((@view gpu_workspace[end, :]), val) + + # Now, with all buffers: + y3, _ = eval_tree_array( + tree, + FakeCuArray(X), + operators; + gpu_workspace, + gpu_buffer, + roots, + num_nodes, + num_launches, + update_buffers=Val(false), + ) + @test y1 ≈ y3 + + # Should be able to shift some of the values in this buffer: + i = findfirst(gpu_workspace[end, :] .== 0.9) + gpu_workspace[end, i] = 0.8 + + # And get the updated results: + tree_prime = sin(x1 * 3.1 - x3 * 0.8 + 0.2) * x2 - x3 * x3 * 1.5 + y1_prime, _ = eval_tree_array(tree_prime, X, operators) + y3_prime, _ = eval_tree_array( + x1, # Doesn't matter what we put here + FakeCuArray(X), + operators; + gpu_workspace, + gpu_buffer, + roots, + num_nodes, + 
num_launches, + update_buffers=Val(false), + ) + @test y1_prime ≈ y3_prime + end +end From c1e579a95c5facefb06d5e11a3bf27ed94f703e4 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Mon, 26 Feb 2024 01:15:34 +0000 Subject: [PATCH 35/47] Start switching to KernelAbstractions.jl --- Project.toml | 10 +-- ...ynamicExpressionsKernelAbstractionsExt.jl} | 78 +++++++------------ src/DynamicExpressions.jl | 3 +- src/ExtensionInterface.jl | 3 + 4 files changed, 36 insertions(+), 58 deletions(-) rename ext/{DynamicExpressionsCUDAExt.jl => DynamicExpressionsKernelAbstractionsExt.jl} (70%) diff --git a/Project.toml b/Project.toml index 097e0fd7..69d24f56 100644 --- a/Project.toml +++ b/Project.toml @@ -14,7 +14,7 @@ TOML = "fa267f1f-6049-4f14-aa54-33bafae1ed76" [weakdeps] Bumper = "8ce10254-0962-460f-a3d8-1f77fea1446e" -CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890" Optim = "429524aa-4258-5aef-a3af-852621145aeb" SymbolicUtils = "d1185830-fcd6-423d-90d6-eec64667417b" @@ -22,7 +22,7 @@ Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" [extensions] DynamicExpressionsBumperExt = "Bumper" -DynamicExpressionsCUDAExt = "CUDA" +DynamicExpressionsKernelAbstractionsExt = "KernelAbstractions" DynamicExpressionsLoopVectorizationExt = "LoopVectorization" DynamicExpressionsOptimExt = "Optim" DynamicExpressionsSymbolicUtilsExt = "SymbolicUtils" @@ -31,7 +31,7 @@ DynamicExpressionsZygoteExt = "Zygote" [compat] Aqua = "0.7" Bumper = "0.6" -CUDA = "4, 5" +KernelAbstractions = "0.9" Compat = "3.37, 4" Enzyme = "^0.11.12" LoopVectorization = "0.12" @@ -47,9 +47,9 @@ julia = "1.6" [extras] Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" Bumper = "8ce10254-0962-460f-a3d8-1f77fea1446e" -CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9" ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" +KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890" Optim = "429524aa-4258-5aef-a3af-852621145aeb" @@ -61,4 +61,4 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" [targets] -test = ["Test", "SafeTestsets", "Aqua", "Bumper", "CUDA", "Enzyme", "ForwardDiff", "LinearAlgebra", "LoopVectorization", "Optim", "SpecialFunctions", "StaticArrays", "SymbolicUtils", "Zygote"] +test = ["Test", "SafeTestsets", "Aqua", "Bumper", "KernelAbstractions", "Enzyme", "ForwardDiff", "LinearAlgebra", "LoopVectorization", "Optim", "SpecialFunctions", "StaticArrays", "SymbolicUtils", "Zygote"] diff --git a/ext/DynamicExpressionsCUDAExt.jl b/ext/DynamicExpressionsKernelAbstractionsExt.jl similarity index 70% rename from ext/DynamicExpressionsCUDAExt.jl rename to ext/DynamicExpressionsKernelAbstractionsExt.jl index bb965214..454fdbdb 100644 --- a/ext/DynamicExpressionsCUDAExt.jl +++ b/ext/DynamicExpressionsKernelAbstractionsExt.jl @@ -1,37 +1,24 @@ -module DynamicExpressionsCUDAExt +module DynamicExpressionsKernelAbstractionsExt -using CUDA: @cuda, CuArray, blockDim, blockIdx, threadIdx +using KernelAbstractions: @index, @kernel, @Const, get_backend using DynamicExpressions: OperatorEnum, AbstractExpressionNode using DynamicExpressions.EvaluateEquationModule: get_nbin, get_nuna using DynamicExpressions.AsArrayModule: as_array -import DynamicExpressions.EvaluateEquationModule: eval_tree_array +import DynamicExpressions.ExtensionInterfaceModule: 
gpu_eval_tree_array -# array type for exclusively testing purposes -struct FakeCuArray{T,N,A<:AbstractArray{T,N}} <: AbstractArray{T,N} - a::A -end -Base.similar(x::FakeCuArray, dims::Integer...) = FakeCuArray(similar(x.a, dims...)) -Base.getindex(x::FakeCuArray, i::Int...) = getindex(x.a, i...) -Base.setindex!(x::FakeCuArray, v, i::Int...) = setindex!(x.a, v, i...) -Base.size(x::FakeCuArray) = size(x.a) - -const MaybeCuArray{T,N} = Union{CuArray{T,N},FakeCuArray{T,N}} - -to_device(a, ::CuArray) = CuArray(a) -to_device(a, ::FakeCuArray) = FakeCuArray(a) - -function eval_tree_array( - tree::AbstractExpressionNode{T}, gcX::MaybeCuArray{T,2}, operators::OperatorEnum; kws... +function gpu_eval_tree_array( + tree::AbstractExpressionNode{T}, gcX, operators::OperatorEnum; kws... ) where {T<:Number} - (outs, is_good) = eval_tree_array((tree,), gcX, operators; kws...) + (outs, is_good) = gpu_eval_tree_array((tree,), gcX, operators; kws...) return (only(outs), only(is_good)) end -function eval_tree_array( +function gpu_eval_tree_array( trees::Union{Tuple{N,Vararg{N}},AbstractVector{N}}, - gcX::MaybeCuArray{T,2}, + gcX, operators::OperatorEnum; + backend=get_backend(gcX), buffer=nothing, gpu_workspace=nothing, gpu_buffer=nothing, @@ -39,7 +26,6 @@ function eval_tree_array( num_nodes=nothing, num_launches=nothing, update_buffers::Val{_update_buffers}=Val(true), - kws..., ) where {T<:Number,N<:AbstractExpressionNode{T},_update_buffers} if _update_buffers (; val, roots, buffer, num_nodes, num_launches) = as_array(Int32, trees; buffer) @@ -82,6 +68,7 @@ function eval_tree_array( #! format: off _launch_gpu_kernel!( + backend, num_threads, num_blocks, num_launches, gworkspace, # Thread info: num_elem, num_nodes, gexecution_order, @@ -99,6 +86,7 @@ end #! format: off function _launch_gpu_kernel!( + backend, num_threads, num_blocks, num_launches::Integer, buffer::AbstractArray{T,2}, # Thread info: num_elem::Integer, num_nodes::Integer, execution_order::AbstractArray{I}, @@ -114,24 +102,12 @@ function _launch_gpu_kernel!( gpu_kernel! = create_gpu_kernel(operators, Val(nuna), Val(nbin)) for launch in one(I):I(num_launches) #! format: off - if buffer isa CuArray - @cuda threads=num_threads blocks=num_blocks gpu_kernel!( - buffer, - launch, num_elem, num_nodes, execution_order, - cX, idx_self, idx_l, idx_r, - degree, constant, val, feature, op - ) - else - Threads.@threads for i in 1:(num_threads * num_blocks) - gpu_kernel!( - buffer, - launch, num_elem, num_nodes, execution_order, - cX, idx_self, idx_l, idx_r, - degree, constant, val, feature, op, - i - ) - end - end + gpu_kernel!(backend, num_threads * num_blocks)( + buffer, + launch, num_elem, num_nodes, execution_order, + cX, idx_self, idx_l, idx_r, + degree, constant, val, feature, op + ) #! format: on end return nothing @@ -146,19 +122,17 @@ end for nuna in 0:10, nbin in 0:10 @eval function create_gpu_kernel(operators::OperatorEnum, ::Val{$nuna}, ::Val{$nbin}) #! 
format: off - function ( + @kernel function k( # Storage: buffer, # Thread info: - launch::Integer, num_elem::Integer, num_nodes::Integer, execution_order::AbstractArray, + @Const(launch)::Integer, @Const(num_elem)::Integer, @Const(num_nodes)::Integer, @Const(execution_order)::AbstractArray{I}, # Input data and tree - cX::AbstractArray, idx_self::AbstractArray, idx_l::AbstractArray, idx_r::AbstractArray, - degree::AbstractArray, constant::AbstractArray, val::AbstractArray, feature::AbstractArray, op::AbstractArray, - # Override for unittesting: - i=nothing, + @Const(cX)::AbstractArray, @Const(idx_self)::AbstractArray, @Const(idx_l)::AbstractArray, @Const(idx_r)::AbstractArray, + @Const(degree)::AbstractArray, @Const(constant)::AbstractArray, @Const(val)::AbstractArray, @Const(feature)::AbstractArray, @Const(op)::AbstractArray, ) #! format: on - i = i === nothing ? (blockIdx().x - 1) * blockDim().x + threadIdx().x : i + i = @index(Global, Linear) if i > num_elem * num_nodes return nothing end @@ -186,8 +160,8 @@ for nuna in 0:10, nbin in 0:10 l_idx = idx_l[node] Base.Cartesian.@nif( $nuna, - i -> i == cur_op, - i -> let op = operators.unaops[i] + j -> j == cur_op, + j -> let op = operators.unaops[j] buffer[elem, cur_idx] = op(buffer[elem, l_idx]) end ) @@ -197,8 +171,8 @@ for nuna in 0:10, nbin in 0:10 r_idx = idx_r[node] Base.Cartesian.@nif( $nbin, - i -> i == cur_op, - i -> let op = operators.binops[i] + j -> j == cur_op, + j -> let op = operators.binops[j] buffer[elem, cur_idx] = op(buffer[elem, l_idx], buffer[elem, r_idx]) end ) diff --git a/src/DynamicExpressions.jl b/src/DynamicExpressions.jl index 7ab08835..d03bec08 100644 --- a/src/DynamicExpressions.jl +++ b/src/DynamicExpressions.jl @@ -48,7 +48,8 @@ import .EquationModule: constructorof, preserve_sharing eval_diff_tree_array, eval_grad_tree_array @reexport import .SimplifyEquationModule: combine_operators, simplify_tree! @reexport import .EvaluationHelpersModule -@reexport import .ExtensionInterfaceModule: node_to_symbolic, symbolic_to_node +@reexport import .ExtensionInterfaceModule: + node_to_symbolic, symbolic_to_node, gpu_eval_tree_array @reexport import .RandomModule: NodeSampler @reexport import .AsArrayModule: as_array diff --git a/src/ExtensionInterface.jl b/src/ExtensionInterface.jl index 521b0c88..48476819 100644 --- a/src/ExtensionInterface.jl +++ b/src/ExtensionInterface.jl @@ -14,6 +14,9 @@ end function bumper_eval_tree_array(args...) return error("Please load the Bumper.jl package to use this feature.") end +function gpu_eval_tree_array(args...) + return error("Please load a GPU backend such as CUDA.jl to use this feature.") +end function bumper_kern1! end function bumper_kern2! end From cb2d055423b415af9d2118bfcf2fc33ef67730c2 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Mon, 26 Feb 2024 01:15:37 +0000 Subject: [PATCH 36/47] Revert "Start switching to KernelAbstractions.jl" This reverts commit c1e579a95c5facefb06d5e11a3bf27ed94f703e4. 
--- Project.toml | 10 +-- ...onsExt.jl => DynamicExpressionsCUDAExt.jl} | 79 +++++++++++++------ src/DynamicExpressions.jl | 3 +- src/ExtensionInterface.jl | 3 - 4 files changed, 59 insertions(+), 36 deletions(-) rename ext/{DynamicExpressionsKernelAbstractionsExt.jl => DynamicExpressionsCUDAExt.jl} (69%) diff --git a/Project.toml b/Project.toml index 69d24f56..097e0fd7 100644 --- a/Project.toml +++ b/Project.toml @@ -14,7 +14,7 @@ TOML = "fa267f1f-6049-4f14-aa54-33bafae1ed76" [weakdeps] Bumper = "8ce10254-0962-460f-a3d8-1f77fea1446e" -KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890" Optim = "429524aa-4258-5aef-a3af-852621145aeb" SymbolicUtils = "d1185830-fcd6-423d-90d6-eec64667417b" @@ -22,7 +22,7 @@ Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" [extensions] DynamicExpressionsBumperExt = "Bumper" -DynamicExpressionsKernelAbstractionsExt = "KernelAbstractions" +DynamicExpressionsCUDAExt = "CUDA" DynamicExpressionsLoopVectorizationExt = "LoopVectorization" DynamicExpressionsOptimExt = "Optim" DynamicExpressionsSymbolicUtilsExt = "SymbolicUtils" @@ -31,7 +31,7 @@ DynamicExpressionsZygoteExt = "Zygote" [compat] Aqua = "0.7" Bumper = "0.6" -KernelAbstractions = "0.9" +CUDA = "4, 5" Compat = "3.37, 4" Enzyme = "^0.11.12" LoopVectorization = "0.12" @@ -47,9 +47,9 @@ julia = "1.6" [extras] Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" Bumper = "8ce10254-0962-460f-a3d8-1f77fea1446e" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9" ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" -KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890" Optim = "429524aa-4258-5aef-a3af-852621145aeb" @@ -61,4 +61,4 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" [targets] -test = ["Test", "SafeTestsets", "Aqua", "Bumper", "KernelAbstractions", "Enzyme", "ForwardDiff", "LinearAlgebra", "LoopVectorization", "Optim", "SpecialFunctions", "StaticArrays", "SymbolicUtils", "Zygote"] +test = ["Test", "SafeTestsets", "Aqua", "Bumper", "CUDA", "Enzyme", "ForwardDiff", "LinearAlgebra", "LoopVectorization", "Optim", "SpecialFunctions", "StaticArrays", "SymbolicUtils", "Zygote"] diff --git a/ext/DynamicExpressionsKernelAbstractionsExt.jl b/ext/DynamicExpressionsCUDAExt.jl similarity index 69% rename from ext/DynamicExpressionsKernelAbstractionsExt.jl rename to ext/DynamicExpressionsCUDAExt.jl index 454fdbdb..4cc381ca 100644 --- a/ext/DynamicExpressionsKernelAbstractionsExt.jl +++ b/ext/DynamicExpressionsCUDAExt.jl @@ -1,24 +1,38 @@ -module DynamicExpressionsKernelAbstractionsExt +module DynamicExpressionsCUDAExt -using KernelAbstractions: @index, @kernel, @Const, get_backend +# TODO: Switch to KernelAbstractions.jl (once they hit v1.0) +using CUDA: @cuda, CuArray, blockDim, blockIdx, threadIdx using DynamicExpressions: OperatorEnum, AbstractExpressionNode using DynamicExpressions.EvaluateEquationModule: get_nbin, get_nuna using DynamicExpressions.AsArrayModule: as_array -import DynamicExpressions.ExtensionInterfaceModule: gpu_eval_tree_array +import DynamicExpressions.EvaluateEquationModule: eval_tree_array -function gpu_eval_tree_array( - tree::AbstractExpressionNode{T}, gcX, operators::OperatorEnum; kws... 
+# array type for exclusively testing purposes +struct FakeCuArray{T,N,A<:AbstractArray{T,N}} <: AbstractArray{T,N} + a::A +end +Base.similar(x::FakeCuArray, dims::Integer...) = FakeCuArray(similar(x.a, dims...)) +Base.getindex(x::FakeCuArray, i::Int...) = getindex(x.a, i...) +Base.setindex!(x::FakeCuArray, v, i::Int...) = setindex!(x.a, v, i...) +Base.size(x::FakeCuArray) = size(x.a) + +const MaybeCuArray{T,N} = Union{CuArray{T,N},FakeCuArray{T,N}} + +to_device(a, ::CuArray) = CuArray(a) +to_device(a, ::FakeCuArray) = FakeCuArray(a) + +function eval_tree_array( + tree::AbstractExpressionNode{T}, gcX::MaybeCuArray{T,2}, operators::OperatorEnum; kws... ) where {T<:Number} - (outs, is_good) = gpu_eval_tree_array((tree,), gcX, operators; kws...) + (outs, is_good) = eval_tree_array((tree,), gcX, operators; kws...) return (only(outs), only(is_good)) end -function gpu_eval_tree_array( +function eval_tree_array( trees::Union{Tuple{N,Vararg{N}},AbstractVector{N}}, - gcX, + gcX::MaybeCuArray{T,2}, operators::OperatorEnum; - backend=get_backend(gcX), buffer=nothing, gpu_workspace=nothing, gpu_buffer=nothing, @@ -26,6 +40,7 @@ function gpu_eval_tree_array( num_nodes=nothing, num_launches=nothing, update_buffers::Val{_update_buffers}=Val(true), + kws..., ) where {T<:Number,N<:AbstractExpressionNode{T},_update_buffers} if _update_buffers (; val, roots, buffer, num_nodes, num_launches) = as_array(Int32, trees; buffer) @@ -68,7 +83,6 @@ function gpu_eval_tree_array( #! format: off _launch_gpu_kernel!( - backend, num_threads, num_blocks, num_launches, gworkspace, # Thread info: num_elem, num_nodes, gexecution_order, @@ -86,7 +100,6 @@ end #! format: off function _launch_gpu_kernel!( - backend, num_threads, num_blocks, num_launches::Integer, buffer::AbstractArray{T,2}, # Thread info: num_elem::Integer, num_nodes::Integer, execution_order::AbstractArray{I}, @@ -102,12 +115,24 @@ function _launch_gpu_kernel!( gpu_kernel! = create_gpu_kernel(operators, Val(nuna), Val(nbin)) for launch in one(I):I(num_launches) #! format: off - gpu_kernel!(backend, num_threads * num_blocks)( - buffer, - launch, num_elem, num_nodes, execution_order, - cX, idx_self, idx_l, idx_r, - degree, constant, val, feature, op - ) + if buffer isa CuArray + @cuda threads=num_threads blocks=num_blocks gpu_kernel!( + buffer, + launch, num_elem, num_nodes, execution_order, + cX, idx_self, idx_l, idx_r, + degree, constant, val, feature, op + ) + else + Threads.@threads for i in 1:(num_threads * num_blocks) + gpu_kernel!( + buffer, + launch, num_elem, num_nodes, execution_order, + cX, idx_self, idx_l, idx_r, + degree, constant, val, feature, op, + i + ) + end + end #! format: on end return nothing @@ -122,17 +147,19 @@ end for nuna in 0:10, nbin in 0:10 @eval function create_gpu_kernel(operators::OperatorEnum, ::Val{$nuna}, ::Val{$nbin}) #! 
format: off - @kernel function k( + function ( # Storage: buffer, # Thread info: - @Const(launch)::Integer, @Const(num_elem)::Integer, @Const(num_nodes)::Integer, @Const(execution_order)::AbstractArray{I}, + launch::Integer, num_elem::Integer, num_nodes::Integer, execution_order::AbstractArray, # Input data and tree - @Const(cX)::AbstractArray, @Const(idx_self)::AbstractArray, @Const(idx_l)::AbstractArray, @Const(idx_r)::AbstractArray, - @Const(degree)::AbstractArray, @Const(constant)::AbstractArray, @Const(val)::AbstractArray, @Const(feature)::AbstractArray, @Const(op)::AbstractArray, + cX::AbstractArray, idx_self::AbstractArray, idx_l::AbstractArray, idx_r::AbstractArray, + degree::AbstractArray, constant::AbstractArray, val::AbstractArray, feature::AbstractArray, op::AbstractArray, + # Override for unittesting: + i=nothing, ) #! format: on - i = @index(Global, Linear) + i = i === nothing ? (blockIdx().x - 1) * blockDim().x + threadIdx().x : i if i > num_elem * num_nodes return nothing end @@ -160,8 +187,8 @@ for nuna in 0:10, nbin in 0:10 l_idx = idx_l[node] Base.Cartesian.@nif( $nuna, - j -> j == cur_op, - j -> let op = operators.unaops[j] + i -> i == cur_op, + i -> let op = operators.unaops[i] buffer[elem, cur_idx] = op(buffer[elem, l_idx]) end ) @@ -171,8 +198,8 @@ for nuna in 0:10, nbin in 0:10 r_idx = idx_r[node] Base.Cartesian.@nif( $nbin, - j -> j == cur_op, - j -> let op = operators.binops[j] + i -> i == cur_op, + i -> let op = operators.binops[i] buffer[elem, cur_idx] = op(buffer[elem, l_idx], buffer[elem, r_idx]) end ) diff --git a/src/DynamicExpressions.jl b/src/DynamicExpressions.jl index d03bec08..7ab08835 100644 --- a/src/DynamicExpressions.jl +++ b/src/DynamicExpressions.jl @@ -48,8 +48,7 @@ import .EquationModule: constructorof, preserve_sharing eval_diff_tree_array, eval_grad_tree_array @reexport import .SimplifyEquationModule: combine_operators, simplify_tree! @reexport import .EvaluationHelpersModule -@reexport import .ExtensionInterfaceModule: - node_to_symbolic, symbolic_to_node, gpu_eval_tree_array +@reexport import .ExtensionInterfaceModule: node_to_symbolic, symbolic_to_node @reexport import .RandomModule: NodeSampler @reexport import .AsArrayModule: as_array diff --git a/src/ExtensionInterface.jl b/src/ExtensionInterface.jl index 48476819..521b0c88 100644 --- a/src/ExtensionInterface.jl +++ b/src/ExtensionInterface.jl @@ -14,9 +14,6 @@ end function bumper_eval_tree_array(args...) return error("Please load the Bumper.jl package to use this feature.") end -function gpu_eval_tree_array(args...) - return error("Please load a GPU backend such as CUDA.jl to use this feature.") -end function bumper_kern1! end function bumper_kern2! 
end From bfa9148f74e8866d1d8178d5857ee1cdf8af56cc Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Wed, 17 Jul 2024 18:49:54 -0400 Subject: [PATCH 37/47] fix: as array imports --- src/AsArray.jl | 2 +- test/Project.toml | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/AsArray.jl b/src/AsArray.jl index 75ff1ece..809638ca 100644 --- a/src/AsArray.jl +++ b/src/AsArray.jl @@ -1,6 +1,6 @@ module AsArrayModule -using ..EquationModule: AbstractExpressionNode, tree_mapreduce, count_nodes +using ..NodeModule: AbstractExpressionNode, tree_mapreduce, count_nodes function as_array( ::Type{I}, diff --git a/test/Project.toml b/test/Project.toml index 952e0370..3e47197d 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -1,6 +1,7 @@ [deps] Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" Bumper = "8ce10254-0962-460f-a3d8-1f77fea1446e" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" Compat = "34da2185-b29b-5c13-b0c7-acf172513d20" DifferentiationInterface = "a0c0ee7d-e4b9-4e03-894e-1c5f64a51d63" From 9f49619e6580536244c8b74bcadbdb1b2d237b1c Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Wed, 17 Jul 2024 19:48:41 -0400 Subject: [PATCH 38/47] hack: try implementing cooperative group --- ext/DynamicExpressionsCUDAExt.jl | 116 +++++++++++++++++-------------- 1 file changed, 62 insertions(+), 54 deletions(-) diff --git a/ext/DynamicExpressionsCUDAExt.jl b/ext/DynamicExpressionsCUDAExt.jl index 4cc381ca..2e1cb61e 100644 --- a/ext/DynamicExpressionsCUDAExt.jl +++ b/ext/DynamicExpressionsCUDAExt.jl @@ -1,7 +1,7 @@ module DynamicExpressionsCUDAExt # TODO: Switch to KernelAbstractions.jl (once they hit v1.0) -using CUDA: @cuda, CuArray, blockDim, blockIdx, threadIdx +using CUDA: @cuda, CuArray, blockDim, blockIdx, threadIdx, CG using DynamicExpressions: OperatorEnum, AbstractExpressionNode using DynamicExpressions.EvaluateEquationModule: get_nbin, get_nuna using DynamicExpressions.AsArrayModule: as_array @@ -78,7 +78,7 @@ function eval_tree_array( gidx_r = @view gbuffer[7, :] gconstant = @view gbuffer[8, :] - num_threads = 256 + num_threads = 1024 num_blocks = nextpow(2, ceil(Int, num_elem * num_nodes / num_threads)) #! format: off @@ -113,25 +113,23 @@ function _launch_gpu_kernel!( (nuna > 10 || nbin > 10) && error("Too many operators. Kernels are only compiled up to 10.") gpu_kernel! = create_gpu_kernel(operators, Val(nuna), Val(nbin)) - for launch in one(I):I(num_launches) - #! format: off - if buffer isa CuArray - @cuda threads=num_threads blocks=num_blocks gpu_kernel!( + #! format: off + if buffer isa CuArray + @cuda cooperative=true threads=num_threads blocks=num_blocks gpu_kernel!( + buffer, + num_launches, num_elem, num_nodes, execution_order, + cX, idx_self, idx_l, idx_r, + degree, constant, val, feature, op + ) + else + Threads.@threads for i in 1:(num_threads * num_blocks * num_launches) + gpu_kernel!( buffer, - launch, num_elem, num_nodes, execution_order, + num_launches, num_elem, num_nodes, execution_order, cX, idx_self, idx_l, idx_r, - degree, constant, val, feature, op + degree, constant, val, feature, op, + i ) - else - Threads.@threads for i in 1:(num_threads * num_blocks) - gpu_kernel!( - buffer, - launch, num_elem, num_nodes, execution_order, - cX, idx_self, idx_l, idx_r, - degree, constant, val, feature, op, - i - ) - end end #! 
format: on end @@ -151,7 +149,7 @@ for nuna in 0:10, nbin in 0:10 # Storage: buffer, # Thread info: - launch::Integer, num_elem::Integer, num_nodes::Integer, execution_order::AbstractArray, + num_launches::Integer, num_elem::Integer, num_nodes::Integer, execution_order::AbstractArray, # Input data and tree cX::AbstractArray, idx_self::AbstractArray, idx_l::AbstractArray, idx_r::AbstractArray, degree::AbstractArray, constant::AbstractArray, val::AbstractArray, feature::AbstractArray, op::AbstractArray, @@ -163,46 +161,56 @@ for nuna in 0:10, nbin in 0:10 if i > num_elem * num_nodes return nothing end - + # node = (i - 1) % num_nodes + 1 elem = (i - node) ÷ num_nodes + 1 + grid_group = CG.this_grid() - if execution_order[node] != launch - return nothing - end + for launch in 1:num_launches + if launch > 1 + # TODO: Investigate whether synchronizing within + # group instead of whole grid is better. + CG.sync(grid_group) + end + if execution_order[node] != launch + continue + end - cur_degree = degree[node] - cur_idx = idx_self[node] - if cur_degree == 0 - if constant[node] == 1 - cur_val = val[node] - buffer[elem, cur_idx] = cur_val + cur_degree = degree[node] + cur_idx = idx_self[node] + if cur_degree == 0 + if constant[node] == 1 + cur_val = val[node] + buffer[elem, cur_idx] = cur_val + else + cur_feature = feature[node] + buffer[elem, cur_idx] = cX[cur_feature, elem] + end else - cur_feature = feature[node] - buffer[elem, cur_idx] = cX[cur_feature, elem] - end - else - if cur_degree == 1 && $nuna > 0 - cur_op = op[node] - l_idx = idx_l[node] - Base.Cartesian.@nif( - $nuna, - i -> i == cur_op, - i -> let op = operators.unaops[i] - buffer[elem, cur_idx] = op(buffer[elem, l_idx]) - end - ) - elseif $nbin > 0 # Note this check is to avoid type inference issues when binops is empty - cur_op = op[node] - l_idx = idx_l[node] - r_idx = idx_r[node] - Base.Cartesian.@nif( - $nbin, - i -> i == cur_op, - i -> let op = operators.binops[i] - buffer[elem, cur_idx] = op(buffer[elem, l_idx], buffer[elem, r_idx]) - end - ) + if cur_degree == 1 && $nuna > 0 + cur_op = op[node] + l_idx = idx_l[node] + Base.Cartesian.@nif( + $nuna, + i -> i == cur_op, + i -> let op = operators.unaops[i] + buffer[elem, cur_idx] = op(buffer[elem, l_idx]) + end + ) + elseif $nbin > 0 # Note this check is to avoid type inference issues when binops is empty + cur_op = op[node] + l_idx = idx_l[node] + r_idx = idx_r[node] + Base.Cartesian.@nif( + $nbin, + i -> i == cur_op, + i -> let op = operators.binops[i] + buffer[elem, cur_idx] = op( + buffer[elem, l_idx], buffer[elem, r_idx] + ) + end + ) + end end end return nothing From 1422e363e97f0085ed14cab259b0dc755dbadb24 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Fri, 29 Nov 2024 14:07:21 +0000 Subject: [PATCH 39/47] refactor: clean up extensions --- ext/DynamicExpressionsCUDAExt.jl | 6 +----- src/AsArray.jl | 13 ++++++++----- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/ext/DynamicExpressionsCUDAExt.jl b/ext/DynamicExpressionsCUDAExt.jl index 2e1cb61e..dbdc0b39 100644 --- a/ext/DynamicExpressionsCUDAExt.jl +++ b/ext/DynamicExpressionsCUDAExt.jl @@ -51,11 +51,7 @@ function eval_tree_array( ## the GPU kernel, with size equal to the number of rows ## in the input data by the number of nodes in the tree. ## It has one extra row to store the constant values. 
- gworkspace = if gpu_workspace === nothing - similar(gcX, num_elem + 1, num_nodes) - else - gpu_workspace - end + gworkspace = @something(gpu_workspace, similar(gcX, num_elem + 1, num_nodes)) gval = @view gworkspace[end, :] if _update_buffers copyto!(gval, val) diff --git a/src/AsArray.jl b/src/AsArray.jl index 809638ca..83d32377 100644 --- a/src/AsArray.jl +++ b/src/AsArray.jl @@ -2,14 +2,17 @@ module AsArrayModule using ..NodeModule: AbstractExpressionNode, tree_mapreduce, count_nodes +function as_array( + ::Type{I}, trees::N; buffer=nothing +) where {T,N<:AbstractExpressionNode{T},I} + return as_array(I, (trees,); buffer=buffer) +end + function as_array( ::Type{I}, - trees::Union{N,Tuple{N,Vararg{N}},AbstractVector{N}}; + trees::Union{Tuple{N,Vararg{N}},AbstractVector{N}}; buffer::Union{AbstractArray,Nothing}=nothing, ) where {T,N<:AbstractExpressionNode{T},I} - if trees isa N - return as_array(I, (trees,); buffer=buffer) - end each_num_nodes = (t -> count_nodes(t; break_sharing=Val(true))).(trees) num_nodes = sum(each_num_nodes) @@ -25,7 +28,7 @@ function as_array( val = Array{T}(undef, num_nodes) ## Views of the same matrix: - buffer = buffer === nothing ? Array{I}(undef, 8, num_nodes) : buffer + buffer = @something(buffer, Array{I}(undef, 8, num_nodes)) degree = @view buffer[1, :] feature = @view buffer[2, :] op = @view buffer[3, :] From 74afd727d7df168904bc8e4b0e7736455633343f Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 15 Dec 2024 15:07:41 -0800 Subject: [PATCH 40/47] Revert "hack: try implementing cooperative group" This reverts commit 9f49619e6580536244c8b74bcadbdb1b2d237b1c. --- ext/DynamicExpressionsCUDAExt.jl | 116 ++++++++++++++----------------- 1 file changed, 54 insertions(+), 62 deletions(-) diff --git a/ext/DynamicExpressionsCUDAExt.jl b/ext/DynamicExpressionsCUDAExt.jl index dbdc0b39..6c89e333 100644 --- a/ext/DynamicExpressionsCUDAExt.jl +++ b/ext/DynamicExpressionsCUDAExt.jl @@ -1,7 +1,7 @@ module DynamicExpressionsCUDAExt # TODO: Switch to KernelAbstractions.jl (once they hit v1.0) -using CUDA: @cuda, CuArray, blockDim, blockIdx, threadIdx, CG +using CUDA: @cuda, CuArray, blockDim, blockIdx, threadIdx using DynamicExpressions: OperatorEnum, AbstractExpressionNode using DynamicExpressions.EvaluateEquationModule: get_nbin, get_nuna using DynamicExpressions.AsArrayModule: as_array @@ -74,7 +74,7 @@ function eval_tree_array( gidx_r = @view gbuffer[7, :] gconstant = @view gbuffer[8, :] - num_threads = 1024 + num_threads = 256 num_blocks = nextpow(2, ceil(Int, num_elem * num_nodes / num_threads)) #! format: off @@ -109,23 +109,25 @@ function _launch_gpu_kernel!( (nuna > 10 || nbin > 10) && error("Too many operators. Kernels are only compiled up to 10.") gpu_kernel! = create_gpu_kernel(operators, Val(nuna), Val(nbin)) - #! format: off - if buffer isa CuArray - @cuda cooperative=true threads=num_threads blocks=num_blocks gpu_kernel!( - buffer, - num_launches, num_elem, num_nodes, execution_order, - cX, idx_self, idx_l, idx_r, - degree, constant, val, feature, op - ) - else - Threads.@threads for i in 1:(num_threads * num_blocks * num_launches) - gpu_kernel!( + for launch in one(I):I(num_launches) + #! 
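# ---- [Illustrative sketch by the editor, not part of the patch] ----
# PATCH 39 above replaces the `x === nothing ? default : x` branches with Base's
# `@something`, which returns the first non-`nothing` argument and evaluates later
# arguments lazily, so the fallback allocation only runs when no buffer was passed:
buffer = nothing
ws1 = @something(buffer, zeros(Int32, 8, 16))    # allocates: buffer is nothing
buffer2 = ones(Int32, 8, 16)
ws2 = @something(buffer2, zeros(Int32, 8, 16))   # reuses buffer2; fallback never evaluated
# ---- [end of editor sketch] ----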
format: off + if buffer isa CuArray + @cuda threads=num_threads blocks=num_blocks gpu_kernel!( buffer, - num_launches, num_elem, num_nodes, execution_order, + launch, num_elem, num_nodes, execution_order, cX, idx_self, idx_l, idx_r, - degree, constant, val, feature, op, - i + degree, constant, val, feature, op ) + else + Threads.@threads for i in 1:(num_threads * num_blocks) + gpu_kernel!( + buffer, + launch, num_elem, num_nodes, execution_order, + cX, idx_self, idx_l, idx_r, + degree, constant, val, feature, op, + i + ) + end end #! format: on end @@ -145,7 +147,7 @@ for nuna in 0:10, nbin in 0:10 # Storage: buffer, # Thread info: - num_launches::Integer, num_elem::Integer, num_nodes::Integer, execution_order::AbstractArray, + launch::Integer, num_elem::Integer, num_nodes::Integer, execution_order::AbstractArray, # Input data and tree cX::AbstractArray, idx_self::AbstractArray, idx_l::AbstractArray, idx_r::AbstractArray, degree::AbstractArray, constant::AbstractArray, val::AbstractArray, feature::AbstractArray, op::AbstractArray, @@ -157,56 +159,46 @@ for nuna in 0:10, nbin in 0:10 if i > num_elem * num_nodes return nothing end - # + node = (i - 1) % num_nodes + 1 elem = (i - node) ÷ num_nodes + 1 - grid_group = CG.this_grid() - for launch in 1:num_launches - if launch > 1 - # TODO: Investigate whether synchronizing within - # group instead of whole grid is better. - CG.sync(grid_group) - end - if execution_order[node] != launch - continue - end + if execution_order[node] != launch + return nothing + end - cur_degree = degree[node] - cur_idx = idx_self[node] - if cur_degree == 0 - if constant[node] == 1 - cur_val = val[node] - buffer[elem, cur_idx] = cur_val - else - cur_feature = feature[node] - buffer[elem, cur_idx] = cX[cur_feature, elem] - end + cur_degree = degree[node] + cur_idx = idx_self[node] + if cur_degree == 0 + if constant[node] == 1 + cur_val = val[node] + buffer[elem, cur_idx] = cur_val else - if cur_degree == 1 && $nuna > 0 - cur_op = op[node] - l_idx = idx_l[node] - Base.Cartesian.@nif( - $nuna, - i -> i == cur_op, - i -> let op = operators.unaops[i] - buffer[elem, cur_idx] = op(buffer[elem, l_idx]) - end - ) - elseif $nbin > 0 # Note this check is to avoid type inference issues when binops is empty - cur_op = op[node] - l_idx = idx_l[node] - r_idx = idx_r[node] - Base.Cartesian.@nif( - $nbin, - i -> i == cur_op, - i -> let op = operators.binops[i] - buffer[elem, cur_idx] = op( - buffer[elem, l_idx], buffer[elem, r_idx] - ) - end - ) - end + cur_feature = feature[node] + buffer[elem, cur_idx] = cX[cur_feature, elem] + end + else + if cur_degree == 1 && $nuna > 0 + cur_op = op[node] + l_idx = idx_l[node] + Base.Cartesian.@nif( + $nuna, + i -> i == cur_op, + i -> let op = operators.unaops[i] + buffer[elem, cur_idx] = op(buffer[elem, l_idx]) + end + ) + elseif $nbin > 0 # Note this check is to avoid type inference issues when binops is empty + cur_op = op[node] + l_idx = idx_l[node] + r_idx = idx_r[node] + Base.Cartesian.@nif( + $nbin, + i -> i == cur_op, + i -> let op = operators.binops[i] + buffer[elem, cur_idx] = op(buffer[elem, l_idx], buffer[elem, r_idx]) + end + ) end end return nothing From 38c589b3e29de39c6df66fca025c4e29ed3f94bc Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 15 Dec 2024 15:15:04 -0800 Subject: [PATCH 41/47] refactor: clean up `map` operations --- ext/DynamicExpressionsCUDAExt.jl | 4 ++-- src/AsArray.jl | 2 +- src/Evaluate.jl | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/ext/DynamicExpressionsCUDAExt.jl 
b/ext/DynamicExpressionsCUDAExt.jl index 6c89e333..ddb1c3d9 100644 --- a/ext/DynamicExpressionsCUDAExt.jl +++ b/ext/DynamicExpressionsCUDAExt.jl @@ -88,8 +88,8 @@ function eval_tree_array( ) #! format: on - out = (r -> @view(gworkspace[begin:(end - 1), r])).(roots) - is_good = (_ -> true).(trees) + out = map(r -> @view(gworkspace[begin:(end - 1), r]), roots) + is_good = map(Returns(true), trees) return (out, is_good) end diff --git a/src/AsArray.jl b/src/AsArray.jl index 83d32377..410147e6 100644 --- a/src/AsArray.jl +++ b/src/AsArray.jl @@ -13,7 +13,7 @@ function as_array( trees::Union{Tuple{N,Vararg{N}},AbstractVector{N}}; buffer::Union{AbstractArray,Nothing}=nothing, ) where {T,N<:AbstractExpressionNode{T},I} - each_num_nodes = (t -> count_nodes(t; break_sharing=Val(true))).(trees) + each_num_nodes = map(t -> count_nodes(t; break_sharing=Val(true)), trees) num_nodes = sum(each_num_nodes) # Want `roots` to be tuple if `trees` is tuple and similar for vector diff --git a/src/Evaluate.jl b/src/Evaluate.jl index 569b1767..0c36947a 100644 --- a/src/Evaluate.jl +++ b/src/Evaluate.jl @@ -239,8 +239,8 @@ function eval_tree_array( operators::OperatorEnum; kws..., ) where {T<:Number,N<:AbstractExpressionNode{T}} - outs = (t -> eval_tree_array(t, cX, operators; kws...)).(trees) - return first.(outs), last.(outs) + outs = map(t -> eval_tree_array(t, cX, operators; kws...), trees) + return map(first, outs), map(last, outs) end # These are marked unstable due to issues discussed on From 4cce551a57507b2dc6909e3f69b0813e3341a6b7 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 15 Dec 2024 15:35:05 -0800 Subject: [PATCH 42/47] fix: stabilize CUDA ext --- ext/DynamicExpressionsCUDAExt.jl | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/ext/DynamicExpressionsCUDAExt.jl b/ext/DynamicExpressionsCUDAExt.jl index ddb1c3d9..9c10ebbf 100644 --- a/ext/DynamicExpressionsCUDAExt.jl +++ b/ext/DynamicExpressionsCUDAExt.jl @@ -5,6 +5,7 @@ using CUDA: @cuda, CuArray, blockDim, blockIdx, threadIdx using DynamicExpressions: OperatorEnum, AbstractExpressionNode using DynamicExpressions.EvaluateEquationModule: get_nbin, get_nuna using DynamicExpressions.AsArrayModule: as_array +using DispatchDoctor: @stable import DynamicExpressions.EvaluateEquationModule: eval_tree_array @@ -19,17 +20,19 @@ Base.size(x::FakeCuArray) = size(x.a) const MaybeCuArray{T,N} = Union{CuArray{T,N},FakeCuArray{T,N}} -to_device(a, ::CuArray) = CuArray(a) -to_device(a, ::FakeCuArray) = FakeCuArray(a) +@stable default_mode = "disable" begin + to_device(a, ::CuArray) = CuArray(a) + to_device(a, ::FakeCuArray) = FakeCuArray(a) +end -function eval_tree_array( +@stable default_mode = "disable" function eval_tree_array( tree::AbstractExpressionNode{T}, gcX::MaybeCuArray{T,2}, operators::OperatorEnum; kws... ) where {T<:Number} (outs, is_good) = eval_tree_array((tree,), gcX, operators; kws...) return (only(outs), only(is_good)) end -function eval_tree_array( +@stable default_mode = "disable" function eval_tree_array( trees::Union{Tuple{N,Vararg{N}},AbstractVector{N}}, gcX::MaybeCuArray{T,2}, operators::OperatorEnum; @@ -95,7 +98,7 @@ function eval_tree_array( end #! 
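# ---- [Illustrative sketch by the editor, not part of the patch] ----
# The `map` refactor in PATCH 41 above leans on `map` preserving the container type:
# a Tuple of trees stays a Tuple (good for inference) and a Vector stays a Vector,
# while `Returns(true)` is the purpose-built stand-in for the old `_ -> true` closure.
trees = (:a, :b, :c)
map(Returns(true), trees)           # (true, true, true) -- still a Tuple
outs = map(t -> (t, 1), trees)
map(first, outs), map(last, outs)   # ((:a, :b, :c), (1, 1, 1))
# ---- [end of editor sketch] ----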
format: off -function _launch_gpu_kernel!( +@stable default_mode = "disable" function _launch_gpu_kernel!( num_threads, num_blocks, num_launches::Integer, buffer::AbstractArray{T,2}, # Thread info: num_elem::Integer, num_nodes::Integer, execution_order::AbstractArray{I}, @@ -141,7 +144,9 @@ end # ifs to generate at that time, so we can't simply use specialization. # 3. We can't use `@generated` because we can't create closures in those. for nuna in 0:10, nbin in 0:10 - @eval function create_gpu_kernel(operators::OperatorEnum, ::Val{$nuna}, ::Val{$nbin}) + @eval @stable default_mode = "disable" function create_gpu_kernel( + operators::OperatorEnum, ::Val{$nuna}, ::Val{$nbin} + ) #! format: off function ( # Storage: From b7ecdd3b7e24f1166a68bdfe0b966fbfe23610c3 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 15 Dec 2024 15:37:04 -0800 Subject: [PATCH 43/47] test: modularize CUDA test --- test/test_cuda.jl | 283 ++++++++++++++++++++++++++++------------------ test/unittest.jl | 7 +- 2 files changed, 174 insertions(+), 116 deletions(-) diff --git a/test/test_cuda.jl b/test/test_cuda.jl index 2d46bce0..b281c024 100644 --- a/test/test_cuda.jl +++ b/test/test_cuda.jl @@ -1,126 +1,189 @@ -using DynamicExpressions -using DynamicExpressions.AsArrayModule: as_array -using CUDA -using Random +@testitem "Random Evals: Single Tree" begin + using DynamicExpressions, CUDA, Random + using DynamicExpressions.AsArrayModule: as_array -ext = Base.get_extension(DynamicExpressions, :DynamicExpressionsCUDAExt) -const FakeCuArray = ext.FakeCuArray + include("tree_gen_utils.jl") -include("tree_gen_utils.jl") + safe_sin(x) = isfinite(x) ? sin(x) : convert(eltype(x), NaN) + safe_cos(x) = isfinite(x) ? cos(x) : convert(eltype(x), NaN) -safe_sin(x) = isfinite(x) ? sin(x) : convert(eltype(x), NaN) -safe_cos(x) = isfinite(x) ? cos(x) : convert(eltype(x), NaN) + ext = Base.get_extension(DynamicExpressions, :DynamicExpressionsCUDAExt) + const FakeCuArray = ext.FakeCuArray -@testset "Random evals" begin - let - operators = OperatorEnum(; - binary_operators=[+, -, *, /], unary_operators=[safe_sin, safe_cos] - ) - x1, x2, x3 = (i -> Node(Float64; feature=i)).(1:3) + operators = OperatorEnum(; + binary_operators=[+, -, *, /], unary_operators=[safe_sin, safe_cos] + ) - for T in (Float32, Float64, ComplexF64), ntrees in (1, 2, 3), seed in 0:10 + for T in (Float32, Float64, ComplexF64) + for seed in 0:10 Random.seed!(seed) - nrow = rand(10:30) - nnodes = rand(10:25, ntrees) - use_tuple = rand(Bool) - - buffer = rand(Bool) ? ones(Int32, 8, sum(nnodes)) : nothing - gpu_buffer = rand(Bool) ? FakeCuArray(ones(Int32, 8, sum(nnodes))) : nothing - gpu_workspace = - rand(Bool) ? FakeCuArray(ones(T, nrow + 1, sum(nnodes))) : nothing - - trees = ntuple( - i -> gen_random_tree_fixed_size(nnodes[i], operators, 3, T), ntrees - ) - trees = use_tuple ? 
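# ---- [Illustrative sketch by the editor, not part of the patch] ----
# How the `@stable` annotations added in PATCH 42 behave: DispatchDoctor wraps the
# definition so that, when checking is enabled, a call whose return type is not
# concretely inferable raises an error; `default_mode = "disable"` keeps the check
# off unless the user opts in, so the GPU paths pay no cost by default. (Option
# names are taken from the patch itself; treat the details as a sketch.)
using DispatchDoctor: @stable

@stable default_mode = "disable" function relu(x)
    return x > 0 ? x : zero(x)
end

relu(-2.0)   # 0.0 -- concretely inferred, fine under any mode
# ---- [end of editor sketch] ----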
trees : collect(trees) + nnodes = rand(10:25) + tree = gen_random_tree_fixed_size(nnodes, operators, 3, T) X = randn(T, 3, nrow) - if ntrees > 1 - y, completed = @inferred eval_tree_array(trees, X, operators) - gpu_y, gpu_completed = @inferred eval_tree_array( - trees, FakeCuArray(X), operators; buffer, gpu_workspace, gpu_buffer - ) - - # Should give same result either way - for i in eachindex(completed, gpu_completed) - if completed[i] - @test y[i] ≈ gpu_y[i] - end - end - - # Should return same type as input - if use_tuple - @test y isa Tuple - @test gpu_y isa Tuple - else - @test y isa Vector - @test gpu_y isa Vector - end - else - y, completed = @inferred eval_tree_array(only(trees), X, operators) - gpu_y, gpu_completed = @inferred eval_tree_array( - only(trees), FakeCuArray(X), operators - ) - if completed - @test y ≈ gpu_y - end + + y, completed = eval_tree_array(tree, X, operators) + y_gpu, completed_gpu = eval_tree_array(tree, FakeCuArray(X), operators) + + # TODO: Fix this + # @test completed == completed_gpu + if completed + @test y ≈ y_gpu end end end end -@testset "Evaluation on pre-computed buffers" begin - let - operators = OperatorEnum(; - binary_operators=[+, -, *, /], unary_operators=[sin, cos] - ) - x1, x2, x3 = (i -> Node(Float64; feature=i)).(1:3) - Random.seed!(0) - tree = sin(x1 * 3.1 - x3 * 0.9 + 0.2) * x2 - x3 * x3 * 1.5 - X = randn(Float64, 3, 100) - - y1, _ = eval_tree_array(tree, X, operators) - y2, _ = eval_tree_array(tree, FakeCuArray(X), operators) - - @test y1 ≈ y2 - - (; val, roots, buffer, num_nodes, num_launches) = as_array(Int32, tree) - gpu_buffer = FakeCuArray(buffer) - gpu_workspace = FakeCuArray(zeros(Float64, 101, 50)) - copyto!((@view gpu_workspace[end, :]), val) - - # Now, with all buffers: - y3, _ = eval_tree_array( - tree, - FakeCuArray(X), - operators; - gpu_workspace, - gpu_buffer, - roots, - num_nodes, - num_launches, - update_buffers=Val(false), - ) - @test y1 ≈ y3 - - # Should be able to shift some of the values in this buffer: - i = findfirst(gpu_workspace[end, :] .== 0.9) - gpu_workspace[end, i] = 0.8 - - # And get the updated results: - tree_prime = sin(x1 * 3.1 - x3 * 0.8 + 0.2) * x2 - x3 * x3 * 1.5 - y1_prime, _ = eval_tree_array(tree_prime, X, operators) - y3_prime, _ = eval_tree_array( - x1, # Doesn't matter what we put here - FakeCuArray(X), - operators; - gpu_workspace, - gpu_buffer, - roots, - num_nodes, - num_launches, - update_buffers=Val(false), +@testitem "Random Evals: Multiple Trees" begin + using DynamicExpressions, CUDA, Random + using DynamicExpressions.AsArrayModule: as_array + + include("tree_gen_utils.jl") + + safe_sin(x) = isfinite(x) ? sin(x) : convert(eltype(x), NaN) + safe_cos(x) = isfinite(x) ? cos(x) : convert(eltype(x), NaN) + + operators = OperatorEnum(; + binary_operators=[+, -, *, /], unary_operators=[safe_sin, safe_cos] + ) + + ext = Base.get_extension(DynamicExpressions, :DynamicExpressionsCUDAExt) + const FakeCuArray = ext.FakeCuArray + + for T in (Float32, Float64, ComplexF64), ntrees in (2, 3), seed in 0:10 + Random.seed!(seed) + + nrow = rand(10:30) + nnodes = rand(10:25, ntrees) + use_tuple = rand(Bool) + + buffer = rand(Bool) ? ones(Int32, 8, sum(nnodes)) : nothing + gpu_buffer = rand(Bool) ? FakeCuArray(ones(Int32, 8, sum(nnodes))) : nothing + gpu_workspace = rand(Bool) ? FakeCuArray(ones(T, nrow + 1, sum(nnodes))) : nothing + + trees = ntuple(i -> gen_random_tree_fixed_size(nnodes[i], operators, 3, T), ntrees) + trees = use_tuple ? 
trees : collect(trees) + + X = randn(T, 3, nrow) + + y, completed = eval_tree_array(trees, X, operators) + gpu_y, gpu_completed = eval_tree_array( + trees, FakeCuArray(X), operators; buffer, gpu_workspace, gpu_buffer ) - @test y1_prime ≈ y3_prime + + # TODO: Fix this + # @test completed == gpu_completed + + for i in eachindex(completed, gpu_completed) + if completed[i] + @test y[i] ≈ gpu_y[i] + end + end + + # Check return type matches input type (tuple or vector) + if use_tuple + @test y isa Tuple + @test gpu_y isa Tuple + else + @test y isa Vector + @test gpu_y isa Vector + end end end + +@testitem "Pre-Computed Buffers: Basic Equivalence" begin + using DynamicExpressions, CUDA, Random + using DynamicExpressions.AsArrayModule: as_array + + ext = Base.get_extension(DynamicExpressions, :DynamicExpressionsCUDAExt) + const FakeCuArray = ext.FakeCuArray + + # No random trees here, we define a fixed tree + x1, x2, x3 = (i -> Node(Float64; feature=i)).(1:3) + operators = OperatorEnum(; binary_operators=[+, -, *, /], unary_operators=[sin, cos]) + + Random.seed!(0) + tree = sin(x1 * 3.1 - x3 * 0.9 + 0.2) * x2 - x3 * x3 * 1.5 + X = randn(Float64, 3, 100) + + y1, _ = eval_tree_array(tree, X, operators) + y2, _ = eval_tree_array(tree, FakeCuArray(X), operators) + @test y1 ≈ y2 +end + +@testitem "Pre-Computed Buffers: Using Provided Buffers" begin + using DynamicExpressions, CUDA, Random + using DynamicExpressions.AsArrayModule: as_array + + ext = Base.get_extension(DynamicExpressions, :DynamicExpressionsCUDAExt) + const FakeCuArray = ext.FakeCuArray + + x1, x2, x3 = (i -> Node(Float64; feature=i)).(1:3) + operators = OperatorEnum(; binary_operators=[+, -, *, /], unary_operators=[sin, cos]) + + Random.seed!(0) + tree = sin(x1 * 3.1 - x3 * 0.9 + 0.2) * x2 - x3 * x3 * 1.5 + X = randn(Float64, 3, 100) + + y1, _ = eval_tree_array(tree, X, operators) + + # Extract arrays + (; val, roots, buffer, num_nodes, num_launches) = as_array(Int32, tree) + gpu_buffer = FakeCuArray(buffer) + gpu_workspace = FakeCuArray(zeros(Float64, size(X, 2) + 1, num_nodes)) + copyto!((@view gpu_workspace[end, :]), val) + + y3, _ = eval_tree_array( + tree, + FakeCuArray(X), + operators; + gpu_workspace, + gpu_buffer, + roots, + num_nodes, + num_launches, + update_buffers=Val(false), + ) + @test y1 ≈ y3 +end + +@testitem "Pre-Computed Buffers: Modified Values" begin + using DynamicExpressions, CUDA, Random + using DynamicExpressions.AsArrayModule: as_array + + ext = Base.get_extension(DynamicExpressions, :DynamicExpressionsCUDAExt) + const FakeCuArray = ext.FakeCuArray + + x1, x2, x3 = (i -> Node(Float64; feature=i)).(1:3) + operators = OperatorEnum(; binary_operators=[+, -, *, /], unary_operators=[sin, cos]) + + Random.seed!(0) + tree = sin(x1 * 3.1 - x3 * 0.9 + 0.2) * x2 - x3 * x3 * 1.5 + X = randn(Float64, 3, 100) + + y1, _ = eval_tree_array(tree, X, operators) + + (; val, roots, buffer, num_nodes, num_launches) = as_array(Int32, tree) + gpu_buffer = FakeCuArray(buffer) + gpu_workspace = FakeCuArray(zeros(Float64, size(X, 2) + 1, num_nodes)) + gpu_workspace[end, :] .= val + + # Change a constant (0.9 to 0.8) + i = findfirst(gpu_workspace[end, :] .== 0.9) + gpu_workspace[end, i] = 0.8 + + tree_prime = sin(x1 * 3.1 - x3 * 0.8 + 0.2) * x2 - x3 * x3 * 1.5 + y1_prime, _ = eval_tree_array(tree_prime, X, operators) + y3_prime, _ = eval_tree_array( + x1, # dummy tree + FakeCuArray(X), + operators; + gpu_workspace, + gpu_buffer, + roots, + num_nodes, + num_launches, + update_buffers=Val(false), + ) + @test y1_prime ≈ y3_prime +end diff 
--git a/test/unittest.jl b/test/unittest.jl index 57015a89..4685c119 100644 --- a/test/unittest.jl +++ b/test/unittest.jl @@ -123,12 +123,6 @@ end include("test_random.jl") end -@testitem "Test CUDA" begin - if VERSION >= v"1.9" - include("test_cuda.jl") - end -end - include("test_expressions.jl") include("test_multi_expression.jl") include("test_parse.jl") @@ -139,3 +133,4 @@ include("test_expression_math.jl") include("test_structured_expression.jl") include("test_readonlynode.jl") include("test_zygote_gradient_wrapper.jl") +include("test_cuda.jl") From 8fe652cd6a9f8127a3b238c17c1b7904252ae75c Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 15 Dec 2024 15:45:15 -0800 Subject: [PATCH 44/47] fix: potential type instability in as_array --- src/AsArray.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/AsArray.jl b/src/AsArray.jl index 410147e6..040766f4 100644 --- a/src/AsArray.jl +++ b/src/AsArray.jl @@ -39,7 +39,7 @@ function as_array( constant = @view buffer[8, :] cursor = Ref(zero(I)) - num_launches = zero(I) + num_launches = Ref(zero(I)) for (root, tree) in zip(roots, trees) @assert root == cursor[] + 1 tree_mapreduce( @@ -78,8 +78,8 @@ function as_array( execution_order[parent.id] = parent_execution_order # Global number of launches equal to maximum execution order - if parent_execution_order > num_launches - num_launches = parent_execution_order + if parent_execution_order > num_launches[] + num_launches[] = parent_execution_order end (id=parent.id, order=parent_execution_order) @@ -96,7 +96,7 @@ function as_array( feature, op, execution_order, - num_launches, + num_launches=num_launches[], idx_self, idx_l, idx_r, From 1d06bd0ae178bc411c3ef548cc557071fb68f68b Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 15 Dec 2024 17:43:32 -0800 Subject: [PATCH 45/47] refactor: as array into modular functions --- Project.toml | 2 + ext/DynamicExpressionsCUDAExt.jl | 30 +++-- src/AsArray.jl | 217 ++++++++++++++++++++----------- 3 files changed, 159 insertions(+), 90 deletions(-) diff --git a/Project.toml b/Project.toml index 892b8c71..63b5bbee 100644 --- a/Project.toml +++ b/Project.toml @@ -5,6 +5,7 @@ version = "1.8.0" [deps] ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" +Compat = "34da2185-b29b-5c13-b0c7-acf172513d20" DispatchDoctor = "8d63f2c5-f18a-4cf2-ba9d-b3f60fc568c8" Interfaces = "85a1e053-f937-4924-92a5-1367d23b7b87" MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" @@ -33,6 +34,7 @@ DynamicExpressionsZygoteExt = "Zygote" Bumper = "0.6" CUDA = "4, 5" ChainRulesCore = "1" +Compat = "4.16" DispatchDoctor = "0.4" Interfaces = "0.3" LoopVectorization = "0.12" diff --git a/ext/DynamicExpressionsCUDAExt.jl b/ext/DynamicExpressionsCUDAExt.jl index 9c10ebbf..3127e260 100644 --- a/ext/DynamicExpressionsCUDAExt.jl +++ b/ext/DynamicExpressionsCUDAExt.jl @@ -3,11 +3,11 @@ module DynamicExpressionsCUDAExt # TODO: Switch to KernelAbstractions.jl (once they hit v1.0) using CUDA: @cuda, CuArray, blockDim, blockIdx, threadIdx using DynamicExpressions: OperatorEnum, AbstractExpressionNode -using DynamicExpressions.EvaluateEquationModule: get_nbin, get_nuna +using DynamicExpressions.EvaluateModule: get_nbin, get_nuna using DynamicExpressions.AsArrayModule: as_array using DispatchDoctor: @stable -import DynamicExpressions.EvaluateEquationModule: eval_tree_array +import DynamicExpressions.EvaluateModule: eval_tree_array # array type for exclusively testing purposes struct FakeCuArray{T,N,A<:AbstractArray{T,N}} <: AbstractArray{T,N} @@ -45,11 +45,15 @@ 
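# ---- [Illustrative sketch by the editor, not part of the patch] ----
# Why PATCH 44 turns `num_launches` into a `Ref`: a local that is *reassigned* inside
# a closure (here, the reducer passed to `tree_mapreduce`) gets boxed and loses its
# concrete type; mutating a `Ref` instead keeps everything concretely typed.
function count_big(xs)
    counter = Ref(0)   # rather than `counter = 0` reassigned inside the closure
    foreach(x -> x > 10 && (counter[] += 1), xs)
    return counter[]
end

count_big([3, 14, 15, 9, 26])   # 3
# ---- [end of editor sketch] ----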
end update_buffers::Val{_update_buffers}=Val(true), kws..., ) where {T<:Number,N<:AbstractExpressionNode{T},_update_buffers} + local val if _update_buffers (; val, roots, buffer, num_nodes, num_launches) = as_array(Int32, trees; buffer) end + # TODO: Fix this type instability? num_elem = size(gcX, 2) + num_launches = num_launches isa Integer ? num_launches : num_launches[] + ## The following array is our "workspace" for ## the GPU kernel, with size equal to the number of rows ## in the input data by the number of nodes in the tree. @@ -68,14 +72,18 @@ end else copyto!(gpu_buffer, buffer) end - gdegree = @view gbuffer[1, :] - gfeature = @view gbuffer[2, :] - gop = @view gbuffer[3, :] + + #! format: off + gdegree = @view gbuffer[1, :] + gfeature = @view gbuffer[2, :] + gop = @view gbuffer[3, :] gexecution_order = @view gbuffer[4, :] - gidx_self = @view gbuffer[5, :] - gidx_l = @view gbuffer[6, :] - gidx_r = @view gbuffer[7, :] - gconstant = @view gbuffer[8, :] + gidx_self = @view gbuffer[5, :] + gidx_l = @view gbuffer[6, :] + gidx_r = @view gbuffer[7, :] + gconstant = @view gbuffer[8, :] + #! format: on + # TODO: This is a bit dangerous as we're assuming exact indices num_threads = 256 num_blocks = nextpow(2, ceil(Int, num_elem * num_nodes / num_threads)) @@ -144,9 +152,7 @@ end # ifs to generate at that time, so we can't simply use specialization. # 3. We can't use `@generated` because we can't create closures in those. for nuna in 0:10, nbin in 0:10 - @eval @stable default_mode = "disable" function create_gpu_kernel( - operators::OperatorEnum, ::Val{$nuna}, ::Val{$nbin} - ) + @eval function create_gpu_kernel(operators::OperatorEnum, ::Val{$nuna}, ::Val{$nbin}) #! format: off function ( # Storage: diff --git a/src/AsArray.jl b/src/AsArray.jl index 040766f4..6d9f19a1 100644 --- a/src/AsArray.jl +++ b/src/AsArray.jl @@ -1,109 +1,170 @@ module AsArrayModule +using Compat: Fix + using ..NodeModule: AbstractExpressionNode, tree_mapreduce, count_nodes +using ..EvaluateModule: ArrayBuffer, get_array, get_filled_array function as_array( - ::Type{I}, trees::N; buffer=nothing + ::Type{I}, trees::N; buffer::Union{ArrayBuffer,Nothing}=nothing ) where {T,N<:AbstractExpressionNode{T},I} return as_array(I, (trees,); buffer=buffer) end +Base.@kwdef struct TreeBuffer{ + T,I,A<:AbstractArray{I},B<:AbstractArray{T},C,D<:AbstractArray{I} +} + # Corresponds to the `Node` fields + degree::A + constant::A + val::B + feature::A + op::A + idx_l::A + idx_r::A + + # Indexing information + execution_order::A + idx_self::A + num_launches::Base.RefValue{I} + cursor::Base.RefValue{I} + + # Segment information + roots::C + num_nodes::I + + # Original buffer + buffer::D +end + function as_array( ::Type{I}, trees::Union{Tuple{N,Vararg{N}},AbstractVector{N}}; - buffer::Union{AbstractArray,Nothing}=nothing, + buffer::Union{AbstractArray{I},Nothing}=nothing, ) where {T,N<:AbstractExpressionNode{T},I} each_num_nodes = map(t -> count_nodes(t; break_sharing=Val(true)), trees) num_nodes = sum(each_num_nodes) - # Want `roots` to be tuple if `trees` is tuple and similar for vector + # Compute the roots array for indexing. roots = cumsum( if each_num_nodes isa Tuple - tuple(one(I), each_num_nodes[1:(end - 1)]...) + tuple(one(I), each_num_nodes[begin:(end - 1)]...) 
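# ---- [Illustrative sketch by the editor, not part of the patch] ----
# `Base.@kwdef`, used for the `TreeBuffer` struct above, generates a keyword
# constructor so the many same-named fields can be filled as `TreeBuffer(; degree, ...)`.
# A condensed, non-parametric sketch of the mechanics:
Base.@kwdef struct Segment
    lo::Int32
    hi::Int32
    stride::Int32 = 1      # fields may carry defaults
end

Segment(; lo=1, hi=8)      # Segment(1, 8, 1)
# ---- [end of editor sketch] ----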
else - vcat(one(I), each_num_nodes[1:(end - 1)]) + vcat(one(I), @view(each_num_nodes[begin:(end - 1)])) end, ) val = Array{T}(undef, num_nodes) - ## Views of the same matrix: + # If no buffer is provided, create a new ArrayBuffer from scratch buffer = @something(buffer, Array{I}(undef, 8, num_nodes)) - degree = @view buffer[1, :] - feature = @view buffer[2, :] - op = @view buffer[3, :] + + # Obtain arrays from the buffer. Each call to get_array consumes one "slot". + #! format: off + degree = @view buffer[1, :] + feature = @view buffer[2, :] + op = @view buffer[3, :] execution_order = @view buffer[4, :] - idx_self = @view buffer[5, :] - idx_l = @view buffer[6, :] - idx_r = @view buffer[7, :] - constant = @view buffer[8, :] - - cursor = Ref(zero(I)) - num_launches = Ref(zero(I)) - for (root, tree) in zip(roots, trees) - @assert root == cursor[] + 1 - tree_mapreduce( - leaf -> begin - self = (cursor[] += one(I)) - idx_self[self] = self - degree[self] = 0 - execution_order[self] = one(I) - constant[self] = leaf.constant - if leaf.constant - val[self] = leaf.val::T - else - feature[self] = leaf.feature - end - - (id=self, order=one(I)) - end, - branch -> begin - self = (cursor[] += one(I)) - idx_self[self] = self - op[self] = branch.op - degree[self] = branch.degree - - (id=self, order=one(I)) # this order is unused - end, - ((parent, children::Vararg{Any,C}) where {C}) -> begin - idx_l[parent.id] = children[1].id - if C == 2 - idx_r[parent.id] = children[2].id - end - parent_execution_order = if C == 1 - children[1].order + one(I) - else - max(children[1].order, children[2].order) + one(I) - end - execution_order[parent.id] = parent_execution_order - - # Global number of launches equal to maximum execution order - if parent_execution_order > num_launches[] - num_launches[] = parent_execution_order - end - - (id=parent.id, order=parent_execution_order) - end, - tree; - break_sharing=Val(true), - ) - end + idx_self = @view buffer[5, :] + idx_l = @view buffer[6, :] + idx_r = @view buffer[7, :] + constant = @view buffer[8, :] + #! 
format: on + + tree_buffers = TreeBuffer(; + degree=degree, + constant=constant, + val=val, + feature=feature, + op=op, + idx_l=idx_l, + idx_r=idx_r, - return (; - degree, - constant, - val, - feature, - op, - execution_order, - num_launches=num_launches[], - idx_self, - idx_l, - idx_r, - roots, - buffer, - num_nodes, + # Indexing information + execution_order=execution_order, + idx_self=idx_self, + num_launches=Ref(zero(I)), + cursor=Ref(zero(I)), + + # Segment information + roots=roots, + num_nodes=I(num_nodes), + + # Original buffer + buffer=buffer, ) + + fill_tree_buffer!(tree_buffers, trees) + + return tree_buffers +end + +function fill_tree_buffer!( + tree_buffers::TreeBuffer{T,I}, trees::Union{Tuple{N,Vararg{N}},AbstractVector{N}} +) where {T,I,N<:AbstractExpressionNode{T}} + return foreach(Fix{1}(fill_single_tree!, tree_buffers), trees) +end + +function fill_single_tree!( + tree_buffers::TreeBuffer{T,I}, tree::N +) where {T,I,N<:AbstractExpressionNode{T}} + return tree_mapreduce( + Fix{1}(fill_single_leaf!, tree_buffers), + Fix{1}(fill_single_branch!, tree_buffers), + Fix{1}(link_parent_and_children!, tree_buffers), + tree; + break_sharing=Val(true), + ) +end + +function fill_single_leaf!( + tree_buffers::TreeBuffer{T,I}, leaf::N +) where {T,I,N<:AbstractExpressionNode{T}} + self = (tree_buffers.cursor[] += one(I)) + tree_buffers.idx_self[self] = self + tree_buffers.degree[self] = 0 + tree_buffers.execution_order[self] = one(I) + tree_buffers.constant[self] = leaf.constant + if leaf.constant + tree_buffers.val[self] = leaf.val::T + else + tree_buffers.feature[self] = leaf.feature + end + + return (id=self, order=one(I)) +end + +function fill_single_branch!( + tree_buffers::TreeBuffer{T,I}, branch::N +) where {T,I,N<:AbstractExpressionNode{T}} + self = (tree_buffers.cursor[] += one(I)) + tree_buffers.idx_self[self] = self + tree_buffers.op[self] = branch.op + tree_buffers.degree[self] = branch.degree + + return (id=self, order=one(I)) +end + +function link_parent_and_children!( + tree_buffers::TreeBuffer{T,I}, parent, children::Vararg{Any,C} +) where {T,I,C} + tree_buffers.idx_l[parent.id] = children[1].id + if C == 2 + tree_buffers.idx_r[parent.id] = children[2].id + end + parent_execution_order = if C == 1 + children[1].order + one(I) + else + max(children[1].order, children[2].order) + one(I) + end + + tree_buffers.execution_order[parent.id] = parent_execution_order + + if parent_execution_order > tree_buffers.num_launches[] + tree_buffers.num_launches[] = parent_execution_order + end + + return (id=parent.id, order=parent_execution_order) end end From 74f56d46bca33f095d099988d6136f5c1eee0c78 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 15 Dec 2024 21:12:40 -0800 Subject: [PATCH 46/47] refactor: ensure `@inbounds` for gpu kernel --- ext/DynamicExpressionsCUDAExt.jl | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ext/DynamicExpressionsCUDAExt.jl b/ext/DynamicExpressionsCUDAExt.jl index 3127e260..6e0fbc1d 100644 --- a/ext/DynamicExpressionsCUDAExt.jl +++ b/ext/DynamicExpressionsCUDAExt.jl @@ -165,7 +165,6 @@ for nuna in 0:10, nbin in 0:10 # Override for unittesting: i=nothing, ) - #! format: on i = i === nothing ? (blockIdx().x - 1) * blockDim().x + threadIdx().x : i if i > num_elem * num_nodes return nothing @@ -174,6 +173,8 @@ for nuna in 0:10, nbin in 0:10 node = (i - 1) % num_nodes + 1 elem = (i - node) ÷ num_nodes + 1 + #! 
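# ---- [Illustrative sketch by the editor, not part of the patch] ----
# The `Fix{1}(fill_single_tree!, tree_buffers)` calls above are partial application:
# they pin the first argument so the remaining arity matches what `foreach` and
# `tree_mapreduce` expect, as a concrete, nameable callable rather than a fresh
# anonymous function. `Base.Fix1` is the long-standing two-argument case; `Fix{N}`
# (here via Compat on older Julia versions) generalizes the fixed position.
buf = Int[]
push_into! = Base.Fix1(push!, buf)   # behaves like x -> push!(buf, x)
foreach(push_into!, (1, 2, 3))
buf                                  # [1, 2, 3]
# ---- [end of editor sketch] ----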
format: off + @inbounds begin if execution_order[node] != launch return nothing end @@ -212,6 +213,8 @@ for nuna in 0:10, nbin in 0:10 ) end end + end + #! format: on return nothing end end From 3da1d38b1da79b9bfbfa76d675063fe868ad1b30 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Mon, 16 Dec 2024 03:41:20 -0500 Subject: [PATCH 47/47] refactor: avoid SubArray in CUDA kernel --- ext/DynamicExpressionsCUDAExt.jl | 98 +++++++++++++++----------------- src/AsArray.jl | 25 +++++--- 2 files changed, 62 insertions(+), 61 deletions(-) diff --git a/ext/DynamicExpressionsCUDAExt.jl b/ext/DynamicExpressionsCUDAExt.jl index 6e0fbc1d..83cea775 100644 --- a/ext/DynamicExpressionsCUDAExt.jl +++ b/ext/DynamicExpressionsCUDAExt.jl @@ -4,7 +4,16 @@ module DynamicExpressionsCUDAExt using CUDA: @cuda, CuArray, blockDim, blockIdx, threadIdx using DynamicExpressions: OperatorEnum, AbstractExpressionNode using DynamicExpressions.EvaluateModule: get_nbin, get_nuna -using DynamicExpressions.AsArrayModule: as_array +using DynamicExpressions.AsArrayModule: + as_array, + IDX_DEGREE, + IDX_FEATURE, + IDX_OP, + IDX_EXECUTION_ORDER, + IDX_SELF, + IDX_L, + IDX_R, + IDX_CONSTANT using DispatchDoctor: @stable import DynamicExpressions.EvaluateModule: eval_tree_array @@ -59,12 +68,11 @@ end ## in the input data by the number of nodes in the tree. ## It has one extra row to store the constant values. gworkspace = @something(gpu_workspace, similar(gcX, num_elem + 1, num_nodes)) - gval = @view gworkspace[end, :] if _update_buffers - copyto!(gval, val) + copyto!(@view(gworkspace[end, :]), val) end + val_idx = size(gworkspace, 1) - ## Index arrays (much faster to have `@view` here) gbuffer = if !_update_buffers gpu_buffer elseif gpu_buffer === nothing @@ -73,17 +81,8 @@ end copyto!(gpu_buffer, buffer) end - #! format: off - gdegree = @view gbuffer[1, :] - gfeature = @view gbuffer[2, :] - gop = @view gbuffer[3, :] - gexecution_order = @view gbuffer[4, :] - gidx_self = @view gbuffer[5, :] - gidx_l = @view gbuffer[6, :] - gidx_r = @view gbuffer[7, :] - gconstant = @view gbuffer[8, :] - #! format: on - # TODO: This is a bit dangerous as we're assuming exact indices + # Removed @view definitions of gdegree, gfeature, etc. + # We'll index directly into gbuffer using the constants above. num_threads = 256 num_blocks = nextpow(2, ceil(Int, num_elem * num_nodes / num_threads)) @@ -92,10 +91,9 @@ end _launch_gpu_kernel!( num_threads, num_blocks, num_launches, gworkspace, # Thread info: - num_elem, num_nodes, gexecution_order, - # Input data and tree - operators, gcX, gidx_self, gidx_l, gidx_r, - gdegree, gconstant, gval, gfeature, gop, + num_elem, num_nodes, + # We'll pass gbuffer directly to the kernel now: + operators, gcX, gbuffer, val_idx, ) #! format: on @@ -109,34 +107,30 @@ end @stable default_mode = "disable" function _launch_gpu_kernel!( num_threads, num_blocks, num_launches::Integer, buffer::AbstractArray{T,2}, # Thread info: - num_elem::Integer, num_nodes::Integer, execution_order::AbstractArray{I}, - # Input data and tree - operators::OperatorEnum, cX::AbstractArray{T,2}, idx_self::AbstractArray, idx_l::AbstractArray, idx_r::AbstractArray, - degree::AbstractArray, constant::AbstractArray, val::AbstractArray{T,1}, feature::AbstractArray, op::AbstractArray, -) where {I,T} + num_elem::Integer, num_nodes::Integer, + operators::OperatorEnum, cX::AbstractArray{T,2}, gbuffer::AbstractArray{Int32,2}, + val_idx::Integer +) where {T} #! 
format: on nuna = get_nuna(typeof(operators)) nbin = get_nbin(typeof(operators)) (nuna > 10 || nbin > 10) && error("Too many operators. Kernels are only compiled up to 10.") gpu_kernel! = create_gpu_kernel(operators, Val(nuna), Val(nbin)) - for launch in one(I):I(num_launches) + for launch in one(Int32):Int32(num_launches) #! format: off if buffer isa CuArray @cuda threads=num_threads blocks=num_blocks gpu_kernel!( buffer, - launch, num_elem, num_nodes, execution_order, - cX, idx_self, idx_l, idx_r, - degree, constant, val, feature, op + launch, num_elem, num_nodes, + cX, gbuffer, val_idx ) else Threads.@threads for i in 1:(num_threads * num_blocks) gpu_kernel!( buffer, - launch, num_elem, num_nodes, execution_order, - cX, idx_self, idx_l, idx_r, - degree, constant, val, feature, op, - i + launch, num_elem, num_nodes, + cX, gbuffer, val_idx, i ) end end @@ -155,17 +149,13 @@ for nuna in 0:10, nbin in 0:10 @eval function create_gpu_kernel(operators::OperatorEnum, ::Val{$nuna}, ::Val{$nbin}) #! format: off function ( - # Storage: buffer, - # Thread info: - launch::Integer, num_elem::Integer, num_nodes::Integer, execution_order::AbstractArray, - # Input data and tree - cX::AbstractArray, idx_self::AbstractArray, idx_l::AbstractArray, idx_r::AbstractArray, - degree::AbstractArray, constant::AbstractArray, val::AbstractArray, feature::AbstractArray, op::AbstractArray, - # Override for unittesting: + launch::Integer, num_elem::Integer, num_nodes::Integer, + cX::AbstractArray, gbuffer::AbstractArray{Int32,2}, + val_idx::Integer, i=nothing, ) - i = i === nothing ? (blockIdx().x - 1) * blockDim().x + threadIdx().x : i + i = @something(i, (blockIdx().x - 1) * blockDim().x + threadIdx().x) if i > num_elem * num_nodes return nothing end @@ -173,26 +163,28 @@ for nuna in 0:10, nbin in 0:10 node = (i - 1) % num_nodes + 1 elem = (i - node) ÷ num_nodes + 1 - #! 
format: off + @inbounds begin - if execution_order[node] != launch + if gbuffer[IDX_EXECUTION_ORDER, node] != launch return nothing end - cur_degree = degree[node] - cur_idx = idx_self[node] + # Use constants to index gbuffer: + cur_degree = gbuffer[IDX_DEGREE, node] + cur_idx = gbuffer[IDX_SELF, node] + if cur_degree == 0 - if constant[node] == 1 - cur_val = val[node] + if gbuffer[IDX_CONSTANT, node] == 1 + cur_val = buffer[val_idx, node] buffer[elem, cur_idx] = cur_val else - cur_feature = feature[node] + cur_feature = gbuffer[IDX_FEATURE, node] buffer[elem, cur_idx] = cX[cur_feature, elem] end else if cur_degree == 1 && $nuna > 0 - cur_op = op[node] - l_idx = idx_l[node] + cur_op = gbuffer[IDX_OP, node] + l_idx = gbuffer[IDX_L, node] Base.Cartesian.@nif( $nuna, i -> i == cur_op, @@ -200,10 +192,10 @@ for nuna in 0:10, nbin in 0:10 buffer[elem, cur_idx] = op(buffer[elem, l_idx]) end ) - elseif $nbin > 0 # Note this check is to avoid type inference issues when binops is empty - cur_op = op[node] - l_idx = idx_l[node] - r_idx = idx_r[node] + elseif $nbin > 0 + cur_op = gbuffer[IDX_OP, node] + l_idx = gbuffer[IDX_L, node] + r_idx = gbuffer[IDX_R, node] Base.Cartesian.@nif( $nbin, i -> i == cur_op, diff --git a/src/AsArray.jl b/src/AsArray.jl index 6d9f19a1..c63f79da 100644 --- a/src/AsArray.jl +++ b/src/AsArray.jl @@ -37,6 +37,15 @@ Base.@kwdef struct TreeBuffer{ buffer::D end +const IDX_DEGREE = 1 +const IDX_FEATURE = 2 +const IDX_OP = 3 +const IDX_EXECUTION_ORDER = 4 +const IDX_SELF = 5 +const IDX_L = 6 +const IDX_R = 7 +const IDX_CONSTANT = 8 + function as_array( ::Type{I}, trees::Union{Tuple{N,Vararg{N}},AbstractVector{N}}; @@ -61,14 +70,14 @@ function as_array( # Obtain arrays from the buffer. Each call to get_array consumes one "slot". #! format: off - degree = @view buffer[1, :] - feature = @view buffer[2, :] - op = @view buffer[3, :] - execution_order = @view buffer[4, :] - idx_self = @view buffer[5, :] - idx_l = @view buffer[6, :] - idx_r = @view buffer[7, :] - constant = @view buffer[8, :] + degree = @view buffer[IDX_DEGREE, :] + feature = @view buffer[IDX_FEATURE, :] + op = @view buffer[IDX_OP, :] + execution_order = @view buffer[IDX_EXECUTION_ORDER, :] + idx_self = @view buffer[IDX_SELF, :] + idx_l = @view buffer[IDX_L, :] + idx_r = @view buffer[IDX_R, :] + constant = @view buffer[IDX_CONSTANT, :] #! format: on tree_buffers = TreeBuffer(;
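# ---- [Illustrative sketch by the editor, not part of the patch] ----
# The IDX_* constants introduced in PATCH 47 replace eight `@view` SubArrays with
# direct row indexing into a single Int32 matrix, which the CUDA kernel handles more
# gracefully (a plain device array plus integer constants, no SubArray wrappers).
# A condensed sketch of the layout, with hypothetical node values:
const IDX_DEGREE, IDX_FEATURE = 1, 2    # ...and so on for the other six rows
num_nodes = 3
gbuffer = zeros(Int32, 8, num_nodes)
gbuffer[IDX_DEGREE, 1] = 0              # node 1 is a leaf...
gbuffer[IDX_FEATURE, 1] = 2             # ...that reads feature x2
gbuffer[IDX_DEGREE, 1] == 0             # true: same fact the old `degree` view exposed
# ---- [end of editor sketch] ----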